Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions .flake8

This file was deleted.

43 changes: 11 additions & 32 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,29 +1,16 @@
fail_fast: true
repos:
- repo: local
hooks:
- id: black
name: Black
entry: uv run --no-sync black docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: isort
name: isort
entry: uv run --no-sync isort docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: autoflake
name: autoflake
entry: uv run --no-sync autoflake docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.5
hooks:
- id: ruff-format
name: "Ruff formatter"
args: [--config=pyproject.toml]
files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$'
- id: ruff
name: "Ruff linter"
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$'
- repo: local
hooks:
- id: mypy
Expand All @@ -32,14 +19,6 @@ repos:
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: flake8
name: Flake8
entry: uv run --no-sync flake8 docling_core
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: pytest
Expand Down
6 changes: 1 addition & 5 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ uv add [OPTIONS] <PACKAGES|--requirements <REQUIREMENTS>>

We use the following tools to enforce code style:

- isort, to sort imports
- Black, to format code
- Ruff, to format and lint code
- Flake8, to lint code
- autoflake, to remove unused variables and imports
- [MyPy](https://mypy.readthedocs.io), as static type checker
Expand All @@ -65,9 +64,6 @@ To run the checks on-demand, type:
uv run pre-commit run --all-files
```

Note: Checks like `Black` and `isort` will _fail_ if they modify files. This is because `pre-commit` doesn't like to see files modified by their hooks. In these cases, `git add` the modified files and `git commit` again.


### Documentation

We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Docling objects.
Expand Down
51 changes: 12 additions & 39 deletions docling_core/experimental/idoctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def get_special_tokens(

if include_location_tokens:
# Adding dynamically generated location-tokens
for i in range(0, max(page_dimension[0], page_dimension[1])):
for i in range(max(page_dimension[0], page_dimension[1])):
special_tokens.append(f"<{IDocTagsToken._LOC_PREFIX.value}{i}/>")

return special_tokens
Expand Down Expand Up @@ -294,11 +294,7 @@ def serialize(
# as siblings at the same level (not wrapped in <list_item>).
for subref in child.children:
sub = subref.resolve(doc)
if (
isinstance(sub, ListGroup)
and sub.self_ref not in my_visited
and sub.self_ref not in excluded
):
if isinstance(sub, ListGroup) and sub.self_ref not in my_visited and sub.self_ref not in excluded:
my_visited.add(sub.self_ref)
sub_res = doc_serializer.serialize(
item=sub,
Expand Down Expand Up @@ -343,15 +339,9 @@ def serialize(
texts = (
[
tmp
for key in (
list(item.meta.__class__.model_fields)
+ list(item.meta.get_custom_part())
)
for key in (list(item.meta.__class__.model_fields) + list(item.meta.get_custom_part()))
if (
(
params.allowed_meta_names is None
or key in params.allowed_meta_names
)
(params.allowed_meta_names is None or key in params.allowed_meta_names)
and (key not in params.blocked_meta_names)
and (tmp := self._serialize_meta_field(item.meta, key))
)
Expand All @@ -369,28 +359,16 @@ def serialize(

def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
if (field_val := getattr(meta, name)) is not None:
if name == MetaFieldName.SUMMARY and isinstance(
field_val, SummaryMetaField
):
if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField):
txt = f"<summary>{field_val.text}</summary>"
elif name == MetaFieldName.DESCRIPTION and isinstance(
field_val, DescriptionMetaField
):
elif name == MetaFieldName.DESCRIPTION and isinstance(field_val, DescriptionMetaField):
txt = f"<description>{field_val.text}</description>"
elif name == MetaFieldName.CLASSIFICATION and isinstance(
field_val, PictureClassificationMetaField
):
class_name = self._humanize_text(
field_val.get_main_prediction().class_name
)
elif name == MetaFieldName.CLASSIFICATION and isinstance(field_val, PictureClassificationMetaField):
class_name = self._humanize_text(field_val.get_main_prediction().class_name)
txt = f"<classification>{class_name}</classification>"
elif name == MetaFieldName.MOLECULE and isinstance(
field_val, MoleculeMetaField
):
elif name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField):
txt = f"<molecule>{field_val.smi}</molecule>"
elif name == MetaFieldName.TABULAR_CHART and isinstance(
field_val, TabularChartMetaField
):
elif name == MetaFieldName.TABULAR_CHART and isinstance(field_val, TabularChartMetaField):
# suppressing tabular chart serialization
return None
# elif tmp := str(field_val or ""):
Expand Down Expand Up @@ -419,7 +397,6 @@ def serialize(
is_chart = False

if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):

if item.meta:
meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
if meta_res.text:
Expand Down Expand Up @@ -508,12 +485,8 @@ def serialize_doc(

text_res = tmp

if self.params.pretty_indentation and (
my_root := parseString(text_res).documentElement
):
if self.params.pretty_indentation and (my_root := parseString(text_res).documentElement):
text_res = my_root.toprettyxml(indent=self.params.pretty_indentation)
text_res = "\n".join(
[line for line in text_res.split("\n") if line.strip()]
)
text_res = "\n".join([line for line in text_res.split("\n") if line.strip()])

return create_ser_result(text=text_res, span_source=parts)
17 changes: 3 additions & 14 deletions docling_core/search/json_schema_to_search_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,9 +269,7 @@ def __suppress(d_: Any) -> Any:
if suppress_key in d_ and d_[suppress_key] is True:
return {}
else:
return {
k: v for k, v in ((k, __suppress(v)) for k, v in d_.items())
}
return {k: v for k, v in ((k, __suppress(v)) for k, v in d_.items())}
return d_

return __suppress(doc)
Expand Down Expand Up @@ -325,12 +323,7 @@ def __remove(d_: Any) -> Any:
return [v for v in (__remove(v) for v in d_)]

if isinstance(d_, dict):
return {
k: v
for k, v in (
(k, __remove(v)) for k, v in d_.items() if not regx.match(k)
)
}
return {k: v for k, v in ((k, __remove(v)) for k, v in d_.items() if not regx.match(k))}

return d_

Expand Down Expand Up @@ -393,11 +386,7 @@ def _clean(d_: Any) -> Any:
return [v for v in (_clean(v) for v in d_) if not _empty(v)]

if isinstance(d_, dict):
return {
k: v
for k, v in ((k, _clean(v)) for k, v in d_.items())
if not _empty(v)
}
return {k: v for k, v in ((k, _clean(v)) for k, v in d_.items()) if not _empty(v)}

return d_

Expand Down
8 changes: 2 additions & 6 deletions docling_core/search/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,8 @@ def version_has_schema(cls, v):
"""Validate that the docling-core library is always set in version field."""
docling_core = [item for item in v if item.name == "docling-core"]
if not docling_core:
raise ValueError(
"the version should include at least a valid docling-core package"
)
raise ValueError("the version should include at least a valid docling-core package")
elif len(docling_core) > 1:
raise ValueError(
"the version must not include more than 1 docling-core package"
)
raise ValueError("the version must not include more than 1 docling-core package")
else:
return v
4 changes: 2 additions & 2 deletions docling_core/search/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class Package(BaseModel, extra="forbid"):
"""

name: StrictStr = "docling-core"
version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = (
importlib.metadata.version("docling-core")
version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = importlib.metadata.version(
"docling-core"
)

def __hash__(self):
Expand Down
9 changes: 1 addition & 8 deletions docling_core/transforms/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,7 @@ def contextualize(self, chunk: BaseChunk) -> str:
for k in meta:
if k not in chunk.meta.excluded_embed:
if isinstance(meta[k], list):
items.append(
self.delim.join(
[
d if isinstance(d, str) else json.dumps(d)
for d in meta[k]
]
)
)
items.append(self.delim.join([d if isinstance(d, str) else json.dumps(d) for d in meta[k]]))
else:
items.append(json.dumps(meta[k]))
items.append(chunk.text)
Expand Down
Loading