diff --git a/.flake8 b/.flake8 deleted file mode 100644 index ba5db34c..00000000 --- a/.flake8 +++ /dev/null @@ -1,8 +0,0 @@ -[flake8] -per-file-ignores = __init__.py:F401 -max-line-length = 120 -exclude = test/* -max-complexity = 25 -docstring-convention = google -ignore = W503,E203,E741 -classmethod-decorators = classmethod,validator diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eda9dd6c..e8bda570 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,29 +1,16 @@ fail_fast: true repos: - - repo: local - hooks: - - id: black - name: Black - entry: uv run --no-sync black docling_core test - pass_filenames: false - language: system - files: '\.py$' - - repo: local - hooks: - - id: isort - name: isort - entry: uv run --no-sync isort docling_core test - pass_filenames: false - language: system - files: '\.py$' - - repo: local - hooks: - - id: autoflake - name: autoflake - entry: uv run --no-sync autoflake docling_core test - pass_filenames: false - language: system - files: '\.py$' + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.5 + hooks: + - id: ruff-format + name: "Ruff formatter" + args: [--config=pyproject.toml] + files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$' + - id: ruff + name: "Ruff linter" + args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml] + files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$' - repo: local hooks: - id: mypy @@ -32,14 +19,6 @@ repos: pass_filenames: false language: system files: '\.py$' - - repo: local - hooks: - - id: flake8 - name: Flake8 - entry: uv run --no-sync flake8 docling_core - pass_filenames: false - language: system - files: '\.py$' - repo: local hooks: - id: pytest diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1d81415a..f15059d6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -47,8 +47,7 @@ uv add [OPTIONS] > We use the following tools to enforce code style: -- isort, to sort imports -- Black, to format code +- Ruff, to format and lint code - Flake8, to lint code - autoflake, to remove unused variables and imports - [MyPy](https://mypy.readthedocs.io), as static type checker @@ -65,9 +64,6 @@ To run the checks on-demand, type: uv run pre-commit run --all-files ``` -Note: Checks like `Black` and `isort` will _fail_ if they modify files. This is because `pre-commit` doesn't like to see files modified by their hooks. In these cases, `git add` the modified files and `git commit` again. - - ### Documentation We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Docling objects. diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py index 7990f4cd..cb3143d4 100644 --- a/docling_core/experimental/idoctags.py +++ b/docling_core/experimental/idoctags.py @@ -147,7 +147,7 @@ def get_special_tokens( if include_location_tokens: # Adding dynamically generated location-tokens - for i in range(0, max(page_dimension[0], page_dimension[1])): + for i in range(max(page_dimension[0], page_dimension[1])): special_tokens.append(f"<{IDocTagsToken._LOC_PREFIX.value}{i}/>") return special_tokens @@ -294,11 +294,7 @@ def serialize( # as siblings at the same level (not wrapped in ). 
for subref in child.children: sub = subref.resolve(doc) - if ( - isinstance(sub, ListGroup) - and sub.self_ref not in my_visited - and sub.self_ref not in excluded - ): + if isinstance(sub, ListGroup) and sub.self_ref not in my_visited and sub.self_ref not in excluded: my_visited.add(sub.self_ref) sub_res = doc_serializer.serialize( item=sub, @@ -343,15 +339,9 @@ def serialize( texts = ( [ tmp - for key in ( - list(item.meta.__class__.model_fields) - + list(item.meta.get_custom_part()) - ) + for key in (list(item.meta.__class__.model_fields) + list(item.meta.get_custom_part())) if ( - ( - params.allowed_meta_names is None - or key in params.allowed_meta_names - ) + (params.allowed_meta_names is None or key in params.allowed_meta_names) and (key not in params.blocked_meta_names) and (tmp := self._serialize_meta_field(item.meta, key)) ) @@ -369,28 +359,16 @@ def serialize( def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]: if (field_val := getattr(meta, name)) is not None: - if name == MetaFieldName.SUMMARY and isinstance( - field_val, SummaryMetaField - ): + if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField): txt = f"{field_val.text}" - elif name == MetaFieldName.DESCRIPTION and isinstance( - field_val, DescriptionMetaField - ): + elif name == MetaFieldName.DESCRIPTION and isinstance(field_val, DescriptionMetaField): txt = f"{field_val.text}" - elif name == MetaFieldName.CLASSIFICATION and isinstance( - field_val, PictureClassificationMetaField - ): - class_name = self._humanize_text( - field_val.get_main_prediction().class_name - ) + elif name == MetaFieldName.CLASSIFICATION and isinstance(field_val, PictureClassificationMetaField): + class_name = self._humanize_text(field_val.get_main_prediction().class_name) txt = f"{class_name}" - elif name == MetaFieldName.MOLECULE and isinstance( - field_val, MoleculeMetaField - ): + elif name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField): txt = f"{field_val.smi}" - elif name == MetaFieldName.TABULAR_CHART and isinstance( - field_val, TabularChartMetaField - ): + elif name == MetaFieldName.TABULAR_CHART and isinstance(field_val, TabularChartMetaField): # suppressing tabular chart serialization return None # elif tmp := str(field_val or ""): @@ -419,7 +397,6 @@ def serialize( is_chart = False if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs): - if item.meta: meta_res = doc_serializer.serialize_meta(item=item, **kwargs) if meta_res.text: @@ -508,12 +485,8 @@ def serialize_doc( text_res = tmp - if self.params.pretty_indentation and ( - my_root := parseString(text_res).documentElement - ): + if self.params.pretty_indentation and (my_root := parseString(text_res).documentElement): text_res = my_root.toprettyxml(indent=self.params.pretty_indentation) - text_res = "\n".join( - [line for line in text_res.split("\n") if line.strip()] - ) + text_res = "\n".join([line for line in text_res.split("\n") if line.strip()]) return create_ser_result(text=text_res, span_source=parts) diff --git a/docling_core/search/json_schema_to_search_mapper.py b/docling_core/search/json_schema_to_search_mapper.py index f5644835..2d8819a2 100644 --- a/docling_core/search/json_schema_to_search_mapper.py +++ b/docling_core/search/json_schema_to_search_mapper.py @@ -269,9 +269,7 @@ def __suppress(d_: Any) -> Any: if suppress_key in d_ and d_[suppress_key] is True: return {} else: - return { - k: v for k, v in ((k, __suppress(v)) for k, v in d_.items()) - } + return {k: v for k, v in ((k, 
__suppress(v)) for k, v in d_.items())} return d_ return __suppress(doc) @@ -325,12 +323,7 @@ def __remove(d_: Any) -> Any: return [v for v in (__remove(v) for v in d_)] if isinstance(d_, dict): - return { - k: v - for k, v in ( - (k, __remove(v)) for k, v in d_.items() if not regx.match(k) - ) - } + return {k: v for k, v in ((k, __remove(v)) for k, v in d_.items() if not regx.match(k))} return d_ @@ -393,11 +386,7 @@ def _clean(d_: Any) -> Any: return [v for v in (_clean(v) for v in d_) if not _empty(v)] if isinstance(d_, dict): - return { - k: v - for k, v in ((k, _clean(v)) for k, v in d_.items()) - if not _empty(v) - } + return {k: v for k, v in ((k, _clean(v)) for k, v in d_.items()) if not _empty(v)} return d_ diff --git a/docling_core/search/meta.py b/docling_core/search/meta.py index 2b5ff926..20e1e415 100644 --- a/docling_core/search/meta.py +++ b/docling_core/search/meta.py @@ -78,12 +78,8 @@ def version_has_schema(cls, v): """Validate that the docling-core library is always set in version field.""" docling_core = [item for item in v if item.name == "docling-core"] if not docling_core: - raise ValueError( - "the version should include at least a valid docling-core package" - ) + raise ValueError("the version should include at least a valid docling-core package") elif len(docling_core) > 1: - raise ValueError( - "the version must not include more than 1 docling-core package" - ) + raise ValueError("the version must not include more than 1 docling-core package") else: return v diff --git a/docling_core/search/package.py b/docling_core/search/package.py index 1d1bf37a..cdffc3f9 100644 --- a/docling_core/search/package.py +++ b/docling_core/search/package.py @@ -22,8 +22,8 @@ class Package(BaseModel, extra="forbid"): """ name: StrictStr = "docling-core" - version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = ( - importlib.metadata.version("docling-core") + version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = importlib.metadata.version( + "docling-core" ) def __hash__(self): diff --git a/docling_core/transforms/chunker/base.py b/docling_core/transforms/chunker/base.py index 24f32ea4..72f9731c 100644 --- a/docling_core/transforms/chunker/base.py +++ b/docling_core/transforms/chunker/base.py @@ -77,14 +77,7 @@ def contextualize(self, chunk: BaseChunk) -> str: for k in meta: if k not in chunk.meta.excluded_embed: if isinstance(meta[k], list): - items.append( - self.delim.join( - [ - d if isinstance(d, str) else json.dumps(d) - for d in meta[k] - ] - ) - ) + items.append(self.delim.join([d if isinstance(d, str) else json.dumps(d) for d in meta[k]])) else: items.append(json.dumps(meta[k])) items.append(chunk.text) diff --git a/docling_core/transforms/chunker/code_chunking/_language_code_chunkers.py b/docling_core/transforms/chunker/code_chunking/_language_code_chunkers.py index 5c0d3c7c..5b9cb810 100644 --- a/docling_core/transforms/chunker/code_chunking/_language_code_chunkers.py +++ b/docling_core/transforms/chunker/code_chunking/_language_code_chunkers.py @@ -149,9 +149,7 @@ def build_class_metadata( chunk_type=CodeChunkType.CLASS, ) - def build_preamble_metadata( - self, *, item: CodeItem, content: str, start_line: int, end_line: int - ) -> CodeDocMeta: + def build_preamble_metadata(self, *, item: CodeItem, content: str, start_line: int, end_line: int) -> CodeDocMeta: """Build metadata for preamble chunks.""" return CodeDocMeta( doc_items=[item], @@ -162,9 +160,7 @@ def build_preamble_metadata( 
chunk_type=CodeChunkType.PREAMBLE, ) - def calculate_line_numbers( - self, code: str, start_byte: int, end_byte: int - ) -> Tuple[int, int]: + def calculate_line_numbers(self, code: str, start_byte: int, end_byte: int) -> Tuple[int, int]: """Calculate line numbers from byte positions.""" start_line = code[:start_byte].count("\n") + 1 if end_byte > 0 and end_byte <= len(code): @@ -224,9 +220,7 @@ def build_class_chunk( ) return CodeChunk(text=content, meta=metadata, doc_items=[self.item]) - def build_preamble_chunk( - self, content: str, start_line: int, end_line: int - ) -> CodeChunk: + def build_preamble_chunk(self, content: str, start_line: int, end_line: int) -> CodeChunk: """Build a preamble chunk.""" metadata = self.metadata_builder.build_preamble_metadata( item=self.item, @@ -236,15 +230,11 @@ def build_preamble_chunk( ) return CodeChunk(text=content, meta=metadata, doc_items=[self.item]) - def process_orphan_chunks( - self, used_ranges: List[Tuple[int, int]], dl_doc - ) -> Iterator[CodeChunk]: + def process_orphan_chunks(self, used_ranges: List[Tuple[int, int]], dl_doc) -> Iterator[CodeChunk]: """Process orphan chunks (preamble) from unused code ranges.""" from docling_core.types.doc.labels import DocItemLabel - code = next( - (t.text for t in dl_doc.texts if t.label == DocItemLabel.CODE), None - ) + code = next((t.text for t in dl_doc.texts if t.label == DocItemLabel.CODE), None) if not code: return @@ -263,18 +253,14 @@ def process_orphan_chunks( first_start_byte = orphan_pieces[0][1] last_end_byte = orphan_pieces[-1][2] - start_line, end_line = self.metadata_builder.calculate_line_numbers( - code, first_start_byte, last_end_byte - ) + start_line, end_line = self.metadata_builder.calculate_line_numbers(code, first_start_byte, last_end_byte) yield self.build_preamble_chunk(merged_content, start_line, end_line) class _ChunkSizeProcessor: """Processes chunks to split large ones into smaller pieces.""" - def __init__( - self, tokenizer, max_tokens: int, min_chunk_size: int = 300, chunker=None - ): + def __init__(self, tokenizer, max_tokens: int, min_chunk_size: int = 300, chunker=None): """Initialize the chunk size processor with tokenizer and size constraints.""" self.tokenizer = tokenizer self.max_tokens = max_tokens @@ -369,11 +355,7 @@ def _split_function_chunk( continue new_meta = chunk.meta.model_copy() - new_meta.part_name = ( - f"{chunk.meta.part_name}_part_{i + 1}" - if len(chunks) > 1 - else chunk.meta.part_name - ) + new_meta.part_name = f"{chunk.meta.part_name}_part_{i + 1}" if len(chunks) > 1 else chunk.meta.part_name sub_chunk = CodeChunk(text=chunk_text, meta=new_meta) yield sub_chunk, ranges @@ -405,9 +387,10 @@ def _split_generic_chunk( if current_size + line_tokens > self.max_tokens and current_chunk_lines: chunk_text = "\n".join(current_chunk_lines) if self.tokenizer.count_tokens(chunk_text) >= self.min_chunk_size: - yield self._create_split_chunk( - chunk, chunk_text, chunk_number - ), ranges + yield ( + self._create_split_chunk(chunk, chunk_text, chunk_number), + ranges, + ) chunk_number += 1 current_chunk_lines = [line] @@ -421,9 +404,7 @@ def _split_generic_chunk( if self.tokenizer.count_tokens(chunk_text) >= self.min_chunk_size: yield self._create_split_chunk(chunk, chunk_text, chunk_number), ranges - def _create_split_chunk( - self, original_chunk: CodeChunk, text: str, chunk_number: int - ) -> CodeChunk: + def _create_split_chunk(self, original_chunk: CodeChunk, text: str, chunk_number: int) -> CodeChunk: """Create a new chunk from split text.""" new_meta = 
original_chunk.meta.model_copy() new_meta.part_name = f"{original_chunk.meta.part_name}_part_{chunk_number}" @@ -484,9 +465,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[CodeChunk]: module_variables = self._get_module_variables(tree) range_tracker = _RangeTracker() chunk_builder = _ChunkBuilder(item=item, origin=dl_doc.origin) - size_processor = _ChunkSizeProcessor( - self.tokenizer, self.max_tokens, self.min_chunk_size, chunker=self - ) + size_processor = _ChunkSizeProcessor(self.tokenizer, self.max_tokens, self.min_chunk_size, chunker=self) self._mark_copyright_comments(tree.root_node, range_tracker) @@ -508,32 +487,24 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[CodeChunk]: all_chunks.append((chunk, chunk_used_ranges)) if module_variables: - self._track_constructor_variables( - tree.root_node, module_variables, range_tracker - ) + self._track_constructor_variables(tree.root_node, module_variables, range_tracker) empty_classes = self._get_classes_no_methods(tree.root_node, "") for node in empty_classes: for ( chunk, chunk_used_ranges, - ) in self._yield_class_chunk_with_ranges( - node, import_nodes, chunk_builder - ): + ) in self._yield_class_chunk_with_ranges(node, import_nodes, chunk_builder): range_tracker.extend(chunk_used_ranges) all_chunks.append((chunk, chunk_used_ranges)) - for chunk in chunk_builder.process_orphan_chunks( - range_tracker.get_used_ranges(), dl_doc - ): + for chunk in chunk_builder.process_orphan_chunks(range_tracker.get_used_ranges(), dl_doc): all_chunks.append((chunk, [])) for chunk, _ in size_processor.process_chunks(all_chunks): yield chunk - def _mark_copyright_comments( - self, root_node: Node, range_tracker: _RangeTracker - ) -> None: + def _mark_copyright_comments(self, root_node: Node, range_tracker: _RangeTracker) -> None: """Mark copyright comments as used.""" comment_nodes = _get_children(root_node, self.docs_types) for node in comment_nodes: @@ -549,14 +520,9 @@ def _yield_function_chunks_with_ranges( chunk_builder: _ChunkBuilder, module_variables: Optional[Dict[str, Node]] = None, ) -> Iterator[Tuple[CodeChunk, List[Tuple[int, int]]]]: - docstring = self._get_docstring(node) - additional_context, additional_context_no_docstring = ( - self._build_additional_context(node, root_node) - ) - imports = self._build_imports( - import_nodes, node, additional_context_no_docstring - ) + additional_context, additional_context_no_docstring = self._build_additional_context(node, root_node) + imports = self._build_imports(import_nodes, node, additional_context_no_docstring) function_line_start, _ = node.start_point function_line_end, _ = node.end_point signature_line_end, _ = self._get_function_signature_end(node) @@ -583,12 +549,8 @@ def _yield_function_chunks_with_ranges( current_node = node while current_node.parent: if current_node.parent.type in self.class_definition_types: - used_ranges.append( - (current_node.parent.start_byte, current_node.parent.end_byte) - ) - used_ranges.extend( - self._get_class_member_ranges(current_node.parent) - ) + used_ranges.append((current_node.parent.start_byte, current_node.parent.end_byte)) + used_ranges.extend(self._get_class_member_ranges(current_node.parent)) break current_node = current_node.parent @@ -605,23 +567,23 @@ def _yield_function_chunks_with_ranges( module_variable_definitions += var_text + "\n" function_content = self._build_function(node) - function_no_docstring = ( - function_content.replace(docstring, "") if docstring else function_content - ) + function_no_docstring 
= function_content.replace(docstring, "") if docstring else function_content base_content = ( - f"{prefix}{imports}{module_variable_definitions}" - f"{additional_context_no_docstring}{function_no_docstring}" + f"{prefix}{imports}{module_variable_definitions}{additional_context_no_docstring}{function_no_docstring}" ) - yield chunk_builder.build_function_chunk( - base_content, - function_name, - docstring, - function_line_start, - function_line_end, - signature_line_end, - ), used_ranges + yield ( + chunk_builder.build_function_chunk( + base_content, + function_name, + docstring, + function_line_start, + function_line_end, + signature_line_end, + ), + used_ranges, + ) def _yield_class_chunk_with_ranges( self, node: Node, import_nodes: Dict[str, Node], chunk_builder: _ChunkBuilder @@ -643,9 +605,7 @@ def _yield_class_chunk_with_ranges( used_ranges.extend(class_ranges) if imports: - used_imports = self._find_used_imports_in_function( - import_nodes, node, function_content, None - ) + used_imports = self._find_used_imports_in_function(import_nodes, node, function_content, None) for import_name in sorted(used_imports): if import_name in import_nodes: import_node = import_nodes[import_name] @@ -655,27 +615,26 @@ def _yield_class_chunk_with_ranges( if prefix: used_ranges.extend(prefix_range) - function_no_docstring = ( - function_content.replace(docstring, "") if docstring else function_content - ) + function_no_docstring = function_content.replace(docstring, "") if docstring else function_content content_no_docstring = f"{prefix}{imports}{function_no_docstring}" if chunk_builder: - yield chunk_builder.build_class_chunk( - content_no_docstring, - class_name, - docstring, - function_line_start, - function_line_end, - ), used_ranges + yield ( + chunk_builder.build_class_chunk( + content_no_docstring, + class_name, + docstring, + function_line_start, + function_line_end, + ), + used_ranges, + ) def _file_prefix(self, root_node: Node) -> Tuple[str, List]: return "", [] def _get_function_body(self, node: Node) -> Optional[Node]: - return next( - (child for child in node.children if child.type == self.function_body), None - ) + return next((child for child in node.children if child.type == self.function_body), None) def _get_docstring(self, node: Node) -> str: if node.prev_named_sibling and node.prev_named_sibling.type in self.docs_types: @@ -708,10 +667,7 @@ def _get_classes_no_methods(self, node: Node, parent_type: str) -> List[Node]: def has_methods(class_node: Node) -> bool: return any( child.type in self.function_definition_types - or any( - grandchild.type in self.function_definition_types - for grandchild in child.children - ) + or any(grandchild.type in self.function_definition_types for grandchild in child.children) for child in class_node.children ) @@ -781,10 +737,7 @@ def _build_imports( used, set_imports = set(), set() def find_used_imports(node): - if ( - node.type in self.identifiers - and node.text.decode(self.utf8_encoding) in imports - ): + if node.type in self.identifiers and node.text.decode(self.utf8_encoding) in imports: used.add(node.text.decode(self.utf8_encoding)) for child in node.children: find_used_imports(child) @@ -818,10 +771,7 @@ def _find_used_imports_in_function( used = set() def find_used_imports(node): - if ( - node.type in self.identifiers - and node.text.decode(self.utf8_encoding) in imports - ): + if node.type in self.identifiers and node.text.decode(self.utf8_encoding) in imports: used.add(node.text.decode(self.utf8_encoding)) for child in node.children: 
find_used_imports(child) @@ -892,27 +842,19 @@ def _get_node_ranges_with_comments(self, node: Node) -> List[Tuple[int, int]]: return ranges - def _get_variable_ranges_with_comments( - self, var_node: Node - ) -> List[Tuple[int, int]]: + def _get_variable_ranges_with_comments(self, var_node: Node) -> List[Tuple[int, int]]: """Get variable ranges including any preceding comments.""" return self._get_node_ranges_with_comments(var_node) - def _get_import_ranges_with_comments( - self, import_node: Node - ) -> List[Tuple[int, int]]: + def _get_import_ranges_with_comments(self, import_node: Node) -> List[Tuple[int, int]]: """Get import ranges including any preceding comments.""" return self._get_node_ranges_with_comments(import_node) - def _get_class_ranges_with_comments( - self, class_node: Node - ) -> List[Tuple[int, int]]: + def _get_class_ranges_with_comments(self, class_node: Node) -> List[Tuple[int, int]]: """Get class ranges including any preceding comments and docstrings.""" return self._get_node_ranges_with_comments(class_node) - def _build_additional_context( - self, function_node: Node, root_node: Node - ) -> Tuple[str, str]: + def _build_additional_context(self, function_node: Node, root_node: Node) -> Tuple[str, str]: context = "" context_no_docstring = "" node = function_node @@ -938,9 +880,7 @@ def _get_imports(self, tree: Tree) -> Dict[str, Node]: """Get imports from the AST. Must be implemented by language-specific chunkers.""" raise NotImplementedError - def _build_class_context( - self, class_node: Node, root_node: Node - ) -> Tuple[str, str]: + def _build_class_context(self, class_node: Node, root_node: Node) -> Tuple[str, str]: class_indent = class_node.start_point.column start_byte = class_node.start_byte @@ -960,9 +900,7 @@ def _build_class_context( header_text = "" header = f"{' ' * class_indent}{header_text}\n" docstring = self._get_docstring(class_node) - header_with_docstring = ( - f"{header}{' ' * (class_indent + 4)}{docstring}\n" if docstring else header - ) + header_with_docstring = f"{header}{' ' * (class_indent + 4)}{docstring}\n" if docstring else header fields = [ _to_str(child) @@ -975,9 +913,7 @@ def _build_class_context( constructor_doc = self._get_docstring(constructor_node) constructor_text = self._build_function(constructor_node) constructor_text_no_doc = ( - constructor_text.replace(constructor_doc, "") - if constructor_doc - else constructor_text + constructor_text.replace(constructor_doc, "") if constructor_doc else constructor_text ) else: constructor_text = constructor_text_no_doc = "" @@ -991,9 +927,7 @@ def _find_constructor(self, body: Node) -> Optional[Node]: for child in body.children: definition_field = child.child_by_field_name(self.definition_field) if self._is_constructor(child) or ( - child.type == self.decorator_type - and definition_field - and self._is_constructor(definition_field) + child.type == self.decorator_type and definition_field and self._is_constructor(definition_field) ): return child return None @@ -1029,10 +963,7 @@ def _is_only_function_in_class(self, constructor_node: Node) -> bool: function_count = 0 for child in body_node.children: - if ( - child.type in self.function_definition_types - and child != constructor_node - ): + if child.type in self.function_definition_types and child != constructor_node: function_count += 1 return function_count == 0 @@ -1057,7 +988,6 @@ def _track_constructor_variables( class _PythonFunctionChunker(_CodeChunker): - language: CodeLanguageLabel = CodeLanguageLabel.PYTHON ts_language: Any = 
Field(default=None) parser: Any = Field(default=None) @@ -1129,15 +1059,10 @@ def _get_module_variables(self, tree: Tree) -> Dict[str, Node]: if child.type in self.expression_types and child.named_children: expr = child.named_children[0] if expr.type == "assignment": - if ( - expr.named_children - and expr.named_children[0].type in self.identifiers - ): + if expr.named_children and expr.named_children[0].type in self.identifiers: text = expr.named_children[0].text var_name = text.decode(self.utf8_encoding) if text else "" - extended_node = self._get_variable_with_comments( - child, tree.root_node - ) + extended_node = self._get_variable_with_comments(child, tree.root_node) variables[var_name] = extended_node return variables @@ -1183,10 +1108,7 @@ def _is_local_assignment(self, identifier_node: Node) -> bool: current = identifier_node.parent while current: if current.type == "assignment": - if ( - current.named_children - and current.named_children[0] == identifier_node - ): + if current.named_children and current.named_children[0] == identifier_node: return True current = current.parent return False @@ -1247,13 +1169,9 @@ def _get_imports(self, tree: Tree) -> Dict[str, Node]: if sub_child.type == self.named_imports: for spec in sub_child.children: if spec.type == self.import_specifier: - name_node = spec.child_by_field_name( - self.name_field - ) + name_node = spec.child_by_field_name(self.name_field) if name_node: - identifiers.append( - name_node.text.decode("utf8") - ) + identifiers.append(name_node.text.decode("utf8")) elif sub_child.type in self.identifiers: identifiers.append(sub_child.text.decode("utf8")) elif sub_child.type == self.namespace_import: @@ -1312,10 +1230,7 @@ def _is_docstring(self, node: Node) -> bool: def _get_docstring(self, node: Node) -> str: docstring = "" if node.prev_named_sibling and node.prev_named_sibling.type in self.docs_types: - while ( - node.prev_named_sibling - and node.prev_named_sibling.type in self.docs_types - ): + while node.prev_named_sibling and node.prev_named_sibling.type in self.docs_types: text = node.prev_named_sibling.text if text: docstring += text.decode(self.utf8_encoding) @@ -1343,12 +1258,8 @@ def _structs(node): if clean_name: structs[clean_name] = node elif node.type in [self.declaration]: - if _has_child( - node.child_by_field_name(self.declarator), self.declarator - ): - name = node.child_by_field_name( - self.declarator - ).child_by_field_name(self.declarator) + if _has_child(node.child_by_field_name(self.declarator), self.declarator): + name = node.child_by_field_name(self.declarator).child_by_field_name(self.declarator) else: name = node.child_by_field_name(self.declarator) if name: @@ -1356,12 +1267,8 @@ def _structs(node): if clean_name: structs[clean_name] = node elif node.type in self.function_declaration: - if _has_child( - node.child_by_field_name(self.type_field), self.name_field - ): - name = node.child_by_field_name( - self.type_field - ).child_by_field_name(self.name_field) + if _has_child(node.child_by_field_name(self.type_field), self.name_field): + name = node.child_by_field_name(self.type_field).child_by_field_name(self.name_field) else: name = node.child_by_field_name(self.type_field) if name: @@ -1432,7 +1339,6 @@ def collect_identifiers(node, depth=0): class _JavaFunctionChunker(_CodeChunker): - language: CodeLanguageLabel = CodeLanguageLabel.JAVA ts_language: Any = Field(default=None) parser: Any = Field(default=None) @@ -1514,16 +1420,12 @@ def _get_imports(self, tree: Tree) -> Dict[str, Node]: return 
import_dict @override - def _build_additional_context( - self, function_node: Node, root_node: Node - ) -> Tuple[str, str]: + def _build_additional_context(self, function_node: Node, root_node: Node) -> Tuple[str, str]: context: List[str] = [] context_no_doc: List[str] = [] while function_node.parent is not None: if function_node.type in self.object_declarations: - with_doc, without_doc = self._build_java_object_context( - function_node, root_node - ) + with_doc, without_doc = self._build_java_object_context(function_node, root_node) context.insert(0, with_doc) context_no_doc.insert(0, without_doc) function_node = function_node.parent @@ -1534,9 +1436,7 @@ def _build_additional_context( without_doc + ("" if without_doc else ""), ) - def _build_java_object_context( - self, obj_node: Node, root_node: Node - ) -> Tuple[str, str]: + def _build_java_object_context(self, obj_node: Node, root_node: Node) -> Tuple[str, str]: """Build context for Java objects (classes, enums, interfaces).""" obj_type = obj_node.type @@ -1549,9 +1449,7 @@ def _build_java_object_context( return ("", "") - def _build_java_class_like_context( - self, node: Node, root_node: Node, context_type: str - ) -> Tuple[str, str]: + def _build_java_class_like_context(self, node: Node, root_node: Node, context_type: str) -> Tuple[str, str]: """Unified context building for Java classes, enums, and interfaces.""" body = node.child_by_field_name(self.class_body_field) if not body: @@ -1560,56 +1458,30 @@ def _build_java_class_like_context( header = self._get_function_signature(node, root_node) doc = self._get_docstring(node) - header_with_doc = ( - f"{header}{' ' * (node.start_point.column + 4)}{doc}" if doc else header - ) + header_with_doc = f"{header}{' ' * (node.start_point.column + 4)}{doc}" if doc else header inner_parts = [] if context_type == "enum": - constants = [ - _to_str(child) - for child in body.children - if child.type == self.enum_constant - ] + constants = [_to_str(child) for child in body.children if child.type == self.enum_constant] const_block = (",".join(constants) + ";") if constants else "" inner_parts.append(const_block) decl = next( - ( - child - for child in body.children - if child.type == self.enum_body_declarations - ), + (child for child in body.children if child.type == self.enum_body_declarations), None, ) if decl: - decl_parts = [ - _to_str(child) - for child in decl.children - if child.type in self.enum_inner_types - ] + decl_parts = [_to_str(child) for child in decl.children if child.type in self.enum_inner_types] inner_parts.append("".join(decl_parts)) elif context_type == "interface": - constants = [ - _to_str(child) - for child in body.children - if child.type == self.constant_declaration - ] - methods = [ - _to_str(child) - for child in body.children - if child.type in self.function_definition_types - ] + constants = [_to_str(child) for child in body.children if child.type == self.constant_declaration] + methods = [_to_str(child) for child in body.children if child.type in self.function_definition_types] inner_parts.extend(["".join(constants), "".join(methods)]) else: - parts = [ - _to_str(child) - for child in body.children - if child.type in self.class_header_inner_types - ] + parts = [_to_str(child) for child in body.children if child.type in self.class_header_inner_types] inner_parts.extend(parts) ctor = self._find_constructor(body) @@ -1619,9 +1491,7 @@ def _build_java_class_like_context( inner = "".join(part for part in inner_parts if part.strip()) close = (" " * node.start_point.column) 
+ "}" - with_doc = ( - "\n\n".join(x for x in [header_with_doc, inner] if x).rstrip() + close - ) + with_doc = "\n\n".join(x for x in [header_with_doc, inner] if x).rstrip() + close without_doc = "\n\n".join(x for x in [header, inner] if x).rstrip() + close return with_doc, without_doc diff --git a/docling_core/transforms/chunker/code_chunking/_utils.py b/docling_core/transforms/chunker/code_chunking/_utils.py index aa971f3a..9f791585 100644 --- a/docling_core/transforms/chunker/code_chunking/_utils.py +++ b/docling_core/transforms/chunker/code_chunking/_utils.py @@ -98,9 +98,7 @@ def _get_function_name(language: CodeLanguageLabel, node: Node) -> Optional[str] return None -def _is_collectable_function( - language: CodeLanguageLabel, node: Node, constructor_name: str -) -> bool: +def _is_collectable_function(language: CodeLanguageLabel, node: Node, constructor_name: str) -> bool: """Check if a function should be collected for chunking.""" if language == CodeLanguageLabel.C: return True @@ -117,9 +115,7 @@ def _get_default_tokenizer() -> "BaseTokenizer": HuggingFaceTokenizer, ) - return HuggingFaceTokenizer.from_pretrained( - model_name="sentence-transformers/all-MiniLM-L6-v2" - ) + return HuggingFaceTokenizer.from_pretrained(model_name="sentence-transformers/all-MiniLM-L6-v2") def _has_child(node: Node, child_name: str) -> bool: diff --git a/docling_core/transforms/chunker/code_chunking/standard_code_chunking_strategy.py b/docling_core/transforms/chunker/code_chunking/standard_code_chunking_strategy.py index cdddb9ad..c4ebfb00 100644 --- a/docling_core/transforms/chunker/code_chunking/standard_code_chunking_strategy.py +++ b/docling_core/transforms/chunker/code_chunking/standard_code_chunking_strategy.py @@ -77,9 +77,7 @@ def chunk_code_item( if chunker := self._get_chunker(item.code_language): doc = DoclingDocument(name="", origin=doc.origin) - doc.add_code( - text=code_text, code_language=item.code_language, orig=code_text - ) + doc.add_code(text=code_text, code_language=item.code_language, orig=code_text) yield from chunker.chunk(doc, **kwargs) else: # if no inner chunker available for language, fall back to yielding a single code block chunk yield CodeChunk( diff --git a/docling_core/transforms/chunker/doc_chunk.py b/docling_core/transforms/chunker/doc_chunk.py index 74264560..43de0798 100644 --- a/docling_core/transforms/chunker/doc_chunk.py +++ b/docling_core/transforms/chunker/doc_chunk.py @@ -30,11 +30,9 @@ class DocMeta(BaseMeta): default="docling_core.transforms.chunker.DocMeta", alias=_KEY_SCHEMA_NAME, ) - version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = ( - Field( - default=_VERSION, - alias=_KEY_VERSION, - ) + version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = Field( + default=_VERSION, + alias=_KEY_VERSION, ) doc_items: list[DocItem] = Field( alias=_KEY_DOC_ITEMS, diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index 7b855b2d..888a6967 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -3,7 +3,7 @@ from __future__ import annotations import logging -from typing import Any, Iterator, Optional +from typing import Any, Iterator from pydantic import ConfigDict, Field from typing_extensions import Annotated, override @@ -66,7 +66,6 @@ def serialize( if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs): table_df = item.export_to_dataframe(doc) if 
table_df.shape[0] >= 1 and table_df.shape[1] >= 2: - # copy header as first row and shift all rows by one table_df.loc[-1] = table_df.columns # type: ignore[call-overload] table_df.index = table_df.index + 1 @@ -126,7 +125,7 @@ class HierarchicalChunker(BaseChunker): model_config = ConfigDict(arbitrary_types_allowed=True) serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider() - code_chunking_strategy: Optional[BaseCodeChunkingStrategy] = Field(default=None) + code_chunking_strategy: BaseCodeChunkingStrategy | None = Field(default=None) # deprecated: merge_list_items: Annotated[bool, Field(deprecated=True)] = True @@ -161,13 +160,8 @@ def chunk( for k in keys_to_del: heading_by_level.pop(k, None) continue - elif ( - isinstance(item, (ListGroup, InlineGroup, DocItem)) - and item.self_ref not in visited - ): - if self.code_chunking_strategy is not None and isinstance( - item, CodeItem - ): + elif isinstance(item, (ListGroup, InlineGroup, DocItem)) and item.self_ref not in visited: + if self.code_chunking_strategy is not None and isinstance(item, CodeItem): yield from self.code_chunking_strategy.chunk_code_item( item=item, doc=dl_doc, @@ -188,8 +182,7 @@ def chunk( text=ser_res.text, meta=DocMeta( doc_items=doc_items, - headings=[heading_by_level[k] for k in sorted(heading_by_level)] - or None, + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, origin=dl_doc.origin, ), ) diff --git a/docling_core/transforms/chunker/hybrid_chunker.py b/docling_core/transforms/chunker/hybrid_chunker.py index ecffaccd..a016f886 100644 --- a/docling_core/transforms/chunker/hybrid_chunker.py +++ b/docling_core/transforms/chunker/hybrid_chunker.py @@ -41,9 +41,7 @@ def _get_default_tokenizer(): HuggingFaceTokenizer, ) - return HuggingFaceTokenizer.from_pretrained( - model_name="sentence-transformers/all-MiniLM-L6-v2" - ) + return HuggingFaceTokenizer.from_pretrained(model_name="sentence-transformers/all-MiniLM-L6-v2") class HybridChunker(BaseChunker): @@ -72,8 +70,7 @@ def _patch(cls, data: Any) -> Any: max_tokens = data.get("max_tokens") if not isinstance(tokenizer, BaseTokenizer) and ( # some legacy param passed: - tokenizer is not None - or max_tokens is not None + tokenizer is not None or max_tokens is not None ): from docling_core.transforms.chunker.tokenizer.huggingface import ( HuggingFaceTokenizer, @@ -91,12 +88,8 @@ def _patch(cls, data: Any) -> Any: model_name=tokenizer, max_tokens=max_tokens, ) - elif tokenizer is None or isinstance( - tokenizer, PreTrainedTokenizerBase - ): - kwargs = { - "tokenizer": tokenizer or _get_default_tokenizer().tokenizer - } + elif tokenizer is None or isinstance(tokenizer, PreTrainedTokenizerBase): + kwargs = {"tokenizer": tokenizer or _get_default_tokenizer().tokenizer} if max_tokens is not None: kwargs["max_tokens"] = max_tokens data["tokenizer"] = HuggingFaceTokenizer(**kwargs) @@ -158,19 +151,13 @@ def _make_chunk_from_doc_items( if len(doc_chunk.meta.doc_items) == 1 # TODO: merging should ideally be done by the serializer: else self.delim.join( - [ - res_text - for doc_item in doc_items - if (res_text := doc_serializer.serialize(item=doc_item).text) - ] + [res_text for doc_item in doc_items if (res_text := doc_serializer.serialize(item=doc_item).text)] ) ) new_chunk = DocChunk(text=window_text, meta=meta) return new_chunk - def _split_by_doc_items( - self, doc_chunk: DocChunk, doc_serializer: BaseDocSerializer - ) -> list[DocChunk]: + def _split_by_doc_items(self, doc_chunk: DocChunk, doc_serializer: BaseDocSerializer) -> 
list[DocChunk]: chunks = [] window_start = 0 window_end = 0 # an inclusive index @@ -224,9 +211,7 @@ def _split_using_plain_text( # How much room is there for text after subtracting out the headers and # captions: available_length = self.max_tokens - lengths.other_len - sem_chunker = semchunk.chunkerify( - self.tokenizer.get_tokenizer(), chunk_size=available_length - ) + sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length) if available_length <= 0: warnings.warn( "Headers and captions for this chunk are longer than the total " @@ -267,10 +252,7 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]): origin=chunk.meta.origin, ), ) - if ( - headings == current_headings - and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens - ): + if headings == current_headings and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens: # there is room to include the new chunk so add it to the window and # continue window_end += 1 @@ -311,11 +293,7 @@ def chunk( doc_serializer=my_doc_ser, **kwargs, ) # type: ignore - res = [ - x - for c in res - for x in self._split_by_doc_items(c, doc_serializer=my_doc_ser) - ] + res = [x for c in res for x in self._split_by_doc_items(c, doc_serializer=my_doc_ser)] res = [x for c in res for x in self._split_using_plain_text(c)] if self.merge_peers: res = self._merge_chunks_with_matching_metadata(res) diff --git a/docling_core/transforms/chunker/tokenizer/huggingface.py b/docling_core/transforms/chunker/tokenizer/huggingface.py index 5d23f73c..d7cec707 100644 --- a/docling_core/transforms/chunker/tokenizer/huggingface.py +++ b/docling_core/transforms/chunker/tokenizer/huggingface.py @@ -13,10 +13,7 @@ try: from transformers import AutoTokenizer, PreTrainedTokenizerBase except ImportError: - raise RuntimeError( - "Module requires 'chunking' extra; to install, run: " - "`pip install 'docling-core[chunking]'`" - ) + raise RuntimeError("Module requires 'chunking' extra; to install, run: `pip install 'docling-core[chunking]'`") class HuggingFaceTokenizer(BaseTokenizer): @@ -42,10 +39,7 @@ def _patch(self) -> Self: data = json.load(f) self.max_tokens = int(data["max_seq_length"]) except Exception as e: - raise RuntimeError( - "max_tokens could not be determined automatically; please set " - "explicitly." 
- ) from e + raise RuntimeError("max_tokens could not be determined automatically; please set explicitly.") from e return self def count_tokens(self, text: str): @@ -65,9 +59,7 @@ def from_pretrained( ) -> Self: """Create tokenizer from model name.""" my_kwargs = { - "tokenizer": AutoTokenizer.from_pretrained( - pretrained_model_name_or_path=model_name, **kwargs - ), + "tokenizer": AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name, **kwargs), } if max_tokens is not None: my_kwargs["max_tokens"] = max_tokens diff --git a/docling_core/transforms/chunker/tokenizer/openai.py b/docling_core/transforms/chunker/tokenizer/openai.py index 48ce944b..572229ba 100644 --- a/docling_core/transforms/chunker/tokenizer/openai.py +++ b/docling_core/transforms/chunker/tokenizer/openai.py @@ -8,8 +8,7 @@ import tiktoken except ImportError: raise RuntimeError( - "Module requires 'chunking-openai' extra; to install, run: " - "`pip install 'docling-core[chunking-openai]'`" + "Module requires 'chunking-openai' extra; to install, run: `pip install 'docling-core[chunking-openai]'`" ) diff --git a/docling_core/transforms/serializer/azure.py b/docling_core/transforms/serializer/azure.py index 674f90b8..385aca6a 100644 --- a/docling_core/transforms/serializer/azure.py +++ b/docling_core/transforms/serializer/azure.py @@ -74,9 +74,7 @@ def _bbox_to_polygon_coords( return [l, t, r, t, r, b, l, b] -def _bbox_to_polygon_for_item( - doc: DoclingDocument, item: DocItem -) -> Optional[list[float]]: +def _bbox_to_polygon_for_item(doc: DoclingDocument, item: DocItem) -> Optional[list[float]]: """Compute a TOPLEFT-origin polygon for the first provenance of the item.""" if not item.prov: return None @@ -113,7 +111,7 @@ class _AzureBoundingRegion(BaseModel): Matches Azure's schema; field names use camelCase by design. 
""" - pageNumber: int # noqa: N815 + pageNumber: int polygon: list[float] @@ -121,7 +119,7 @@ class _AzureParagraph(BaseModel): """Paragraph content with optional role and regions.""" content: str - boundingRegions: list["_AzureBoundingRegion"] # noqa: N815 + boundingRegions: list["_AzureBoundingRegion"] role: Optional[str] = None @@ -129,34 +127,34 @@ class _AzureTableCell(BaseModel): """Single table cell with position, span, and optional region.""" content: str - rowIndex: int # noqa: N815 - columnIndex: int # noqa: N815 - rowSpan: int = 1 # noqa: N815 - colSpan: int = 1 # noqa: N815 + rowIndex: int + columnIndex: int + rowSpan: int = 1 + colSpan: int = 1 kind: Optional[str] = None - boundingRegions: Optional[list[_AzureBoundingRegion]] = None # noqa: N815 + boundingRegions: Optional[list[_AzureBoundingRegion]] = None class _AzureTable(BaseModel): """Table with dimensions, regions, and cells.""" - rowCount: int # noqa: N815 - columnCount: int # noqa: N815 - boundingRegions: list[_AzureBoundingRegion] # noqa: N815 + rowCount: int + columnCount: int + boundingRegions: list[_AzureBoundingRegion] cells: list[_AzureTableCell] class _AzureImage(BaseModel): """Image/figure with bounding region and optional footnotes.""" - boundingRegions: list[_AzureBoundingRegion] # noqa: N815 + boundingRegions: list[_AzureBoundingRegion] footnotes: Optional[list[_AzureParagraph]] = None class _AzurePage(BaseModel): """Page metadata used in the Azure-like output.""" - pageNumber: int # noqa: N815 + pageNumber: int width: float height: float # Words are not currently emitted; keep as untyped list @@ -215,9 +213,7 @@ def serialize( if content != "" and polygon is not None: para = _AzureParagraph( content=content, - boundingRegions=[ - _AzureBoundingRegion(pageNumber=page_no, polygon=polygon) - ], + boundingRegions=[_AzureBoundingRegion(pageNumber=page_no, polygon=polygon)], role=role, ) @@ -266,9 +262,7 @@ def serialize( # For RichTableCell, get textual content via helper if isinstance(cell, RichTableCell): - content_text = cell._get_text( - doc=doc, doc_serializer=doc_serializer - ) + content_text = cell._get_text(doc=doc, doc_serializer=doc_serializer) else: content_text = cell.text @@ -280,9 +274,7 @@ def serialize( page_h = doc.pages[page_no].size.height if bbox.coord_origin != CoordOrigin.TOPLEFT: bbox = bbox.to_top_left_origin(page_height=page_h) - cell_poly = _bbox_to_polygon_coords( - l=bbox.l, t=bbox.t, r=bbox.r, b=bbox.b - ) + cell_poly = _bbox_to_polygon_coords(l=bbox.l, t=bbox.t, r=bbox.r, b=bbox.b) cell_obj = _AzureTableCell( content=content_text.strip(), @@ -290,15 +282,9 @@ def serialize( columnIndex=cell.start_col_offset_idx, rowSpan=max(cell.row_span, 1), colSpan=max(cell.col_span, 1), - kind=( - "columnHeader" - if cell.column_header - else ("rowHeader" if cell.row_header else None) - ), + kind=("columnHeader" if cell.column_header else ("rowHeader" if cell.row_header else None)), boundingRegions=( - [_AzureBoundingRegion(pageNumber=page_no, polygon=cell_poly)] - if cell_poly is not None - else None + [_AzureBoundingRegion(pageNumber=page_no, polygon=cell_poly)] if cell_poly is not None else None ), ) @@ -331,9 +317,7 @@ def serialize( if poly is None: return create_ser_result() - fig_obj = _AzureImage( - boundingRegions=[_AzureBoundingRegion(pageNumber=page_no, polygon=poly)] - ) + fig_obj = _AzureImage(boundingRegions=[_AzureBoundingRegion(pageNumber=page_no, polygon=poly)]) # Include picture footnotes if present foots = [] @@ -346,11 +330,7 @@ def serialize( foots.append( _AzureParagraph( 
content=tgt.text, - boundingRegions=[ - _AzureBoundingRegion( - pageNumber=tgt.prov[0].page_no, polygon=f_poly - ) - ], + boundingRegions=[_AzureBoundingRegion(pageNumber=tgt.prov[0].page_no, polygon=f_poly)], ) ) diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index b494eb0e..b790b89c 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -96,10 +96,7 @@ def _iterate_items( traverse_pictures=traverse_pictures, ): if add_page_breaks: - if ( - isinstance(item, (ListGroup, InlineGroup)) - and item.self_ref not in my_visited - ): + if isinstance(item, (ListGroup, InlineGroup)) and item.self_ref not in my_visited: # if group starts with new page, yield page break before group node my_visited.add(item.self_ref) for it, _ in _iterate_items( @@ -113,21 +110,27 @@ def _iterate_items( if isinstance(it, DocItem) and it.prov: page_no = it.prov[0].page_no if prev_page_nr is not None and page_no > prev_page_nr: - yield _PageBreakNode( - self_ref=f"#/pb/{page_break_i}", - prev_page=prev_page_nr, - next_page=page_no, - ), lvl + yield ( + _PageBreakNode( + self_ref=f"#/pb/{page_break_i}", + prev_page=prev_page_nr, + next_page=page_no, + ), + lvl, + ) break elif isinstance(item, DocItem) and item.prov: page_no = item.prov[0].page_no if prev_page_nr is None or page_no > prev_page_nr: if prev_page_nr is not None: # close previous range - yield _PageBreakNode( - self_ref=f"#/pb/{page_break_i}", - prev_page=prev_page_nr, - next_page=page_no, - ), lvl + yield ( + _PageBreakNode( + self_ref=f"#/pb/{page_break_i}", + prev_page=prev_page_nr, + next_page=page_no, + ), + lvl, + ) page_break_i += 1 prev_page_nr = page_no yield item, lvl @@ -138,11 +141,7 @@ def _get_annotation_text( ) -> Optional[str]: result = None if isinstance(annotation, PictureClassificationData): - predicted_class = ( - annotation.predicted_classes[0].class_name - if annotation.predicted_classes - else None - ) + predicted_class = annotation.predicted_classes[0].class_name if annotation.predicted_classes else None if predicted_class is not None: result = predicted_class.replace("_", " ") elif isinstance(annotation, DescriptionAnnotation): @@ -286,10 +285,7 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]: or item.content_layer not in params.layers or ( params.pages is not None - and ( - (not item.prov) - or item.prov[0].page_no not in params.pages - ) + and ((not item.prov) or item.prov[0].page_no not in params.pages) ) ) ) @@ -450,9 +446,7 @@ def serialize( ) parts.append(part) - return create_ser_result( - text=delim.join([p.text for p in parts if p.text]), span_source=parts - ) + return create_ser_result(text=delim.join([p.text for p in parts if p.text]), span_source=parts) # making some assumptions about the kwargs it can pass @override @@ -604,13 +598,9 @@ def serialize_meta( **(self.params.model_dump() | kwargs), ) else: - return create_ser_result( - text="", span_source=item if isinstance(item, DocItem) else [] - ) + return create_ser_result(text="", span_source=item if isinstance(item, DocItem) else []) else: - return create_ser_result( - text="", span_source=item if isinstance(item, DocItem) else [] - ) + return create_ser_result(text="", span_source=item if isinstance(item, DocItem) else []) # TODO deprecate @override @@ -639,10 +629,7 @@ def _get_applicable_pages(self) -> Optional[list[int]]: if ( isinstance(item, DocItem) and item.prov - and ( - self.params.pages is None - or item.prov[0].page_no in 
self.params.pages - ) + and (self.params.pages is None or item.prov[0].page_no in self.params.pages) and ix >= self.params.start_idx and ix < self.params.stop_idx ) @@ -672,17 +659,9 @@ def _should_use_legacy_annotations( return False with warnings.catch_warnings(record=True) as caught_warnings: warnings.simplefilter("ignore", DeprecationWarning) - if ( - incl_attr := getattr(params, "include_annotations", None) - ) is not None and not incl_attr: + if (incl_attr := getattr(params, "include_annotations", None)) is not None and not incl_attr: return False - use_legacy = bool( - [ - ann - for ann in item.annotations - if ((ann.kind == kind) if kind is not None else True) - ] - ) + use_legacy = bool([ann for ann in item.annotations if ((ann.kind == kind) if kind is not None else True)]) if use_legacy: for w in caught_warnings: warnings.warn(w.message, w.category) diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py index beff6168..22485f84 100644 --- a/docling_core/transforms/serializer/doctags.py +++ b/docling_core/transforms/serializer/doctags.py @@ -107,13 +107,9 @@ def serialize( my_visited = visited if visited is not None else set() params = DocTagsParams(**kwargs) # Decide wrapping up-front so ListItem never gets wrapped here - wrap_tag_token: Optional[str] = ( - DocumentToken.create_token_name_from_doc_item_label( - label=item.label, - **( - {"level": item.level} if isinstance(item, SectionHeaderItem) else {} - ), - ) + wrap_tag_token: Optional[str] = DocumentToken.create_token_name_from_doc_item_label( + label=item.label, + **({"level": item.level} if isinstance(item, SectionHeaderItem) else {}), ) wrap_tag: Optional[str] = None if isinstance(item, ListItem) else wrap_tag_token parts: list[str] = [] @@ -137,9 +133,7 @@ def serialize( if ( item.text == "" and len(item.children) == 1 - and isinstance( - (child_group := item.children[0].resolve(doc)), InlineGroup - ) + and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup) ): ser_res = doc_serializer.serialize(item=child_group, visited=my_visited) text_part = ser_res.text @@ -259,23 +253,15 @@ def serialize( predicted_class: Optional[str] = None if item.meta: if item.meta.classification: - predicted_class = ( - item.meta.classification.get_main_prediction().class_name - ) + predicted_class = item.meta.classification.get_main_prediction().class_name elif _should_use_legacy_annotations( params=params, item=item, kind=PictureClassificationData.model_fields["kind"].default, ): - if classifications := [ - ann - for ann in item.annotations - if isinstance(ann, PictureClassificationData) - ]: + if classifications := [ann for ann in item.annotations if isinstance(ann, PictureClassificationData)]: if classifications[0].predicted_classes: - predicted_class = ( - classifications[0].predicted_classes[0].class_name - ) + predicted_class = classifications[0].predicted_classes[0].class_name if predicted_class: body += DocumentToken.get_picture_classification_token(predicted_class) if predicted_class in [ @@ -299,11 +285,7 @@ def serialize( item=item, kind=PictureMoleculeData.model_fields["kind"].default, ): - if smiles_annotations := [ - ann - for ann in item.annotations - if isinstance(ann, PictureMoleculeData) - ]: + if smiles_annotations := [ann for ann in item.annotations if isinstance(ann, PictureMoleculeData)]: smi = smiles_annotations[0].smi if smi: body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value) @@ -319,17 +301,13 @@ def serialize( 
kind=PictureTabularChartData.model_fields["kind"].default, ): if tabular_chart_annotations := [ - ann - for ann in item.annotations - if isinstance(ann, PictureTabularChartData) + ann for ann in item.annotations if isinstance(ann, PictureTabularChartData) ]: chart_data = tabular_chart_annotations[0].chart_data if chart_data and chart_data.table_cells: temp_doc = DoclingDocument(name="temp") temp_table = temp_doc.add_table(data=chart_data) - otsl_content = temp_table.export_to_otsl( - temp_doc, add_cell_location=False - ) + otsl_content = temp_table.export_to_otsl(temp_doc, add_cell_location=False) body += otsl_content res_parts.append(create_ser_result(text=body, span_source=item)) @@ -379,9 +357,7 @@ def serialize( # mapping from source_cell_id to a list of target_cell_ids source_to_targets: Dict[int, List[int]] = {} for link in item.graph.links: - source_to_targets.setdefault(link.source_cell_id, []).append( - link.target_cell_id - ) + source_to_targets.setdefault(link.source_cell_id, []).append(link.target_cell_id) for cell in item.graph.cells: cell_txt = "" @@ -468,11 +444,7 @@ def serialize( if parts: text_res = delim.join( - [ - t - for p in parts - if (t := _wrap(text=p.text, wrap_tag=DocumentToken.LIST_ITEM.value)) - ] + [t for p in parts if (t := _wrap(text=p.text, wrap_tag=DocumentToken.LIST_ITEM.value))] ) text_res = f"{text_res}{delim}" wrap_tag = ( @@ -492,7 +464,6 @@ class DocTagsInlineSerializer(BaseInlineSerializer): def _get_inline_location_tags( self, doc: DoclingDocument, item: InlineGroup, params: DocTagsParams ) -> SerializationResult: - prov: Optional[ProvenanceItem] = None boxes: list[BoundingBox] = [] doc_items: list[DocItem] = [] diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py index 4c18ec3c..af66246a 100644 --- a/docling_core/transforms/serializer/html.py +++ b/docling_core/transforms/serializer/html.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any, Optional, Union from urllib.parse import quote -from xml.etree.cElementTree import SubElement, tostring +from xml.etree.ElementTree import SubElement, tostring from xml.sax.saxutils import unescape import latex2mathml.converter @@ -164,12 +164,8 @@ def serialize( # Prepare the HTML based on item type if isinstance(item, (TitleItem, SectionHeaderItem)): - section_level = ( - min(item.level + 1, 6) if isinstance(item, SectionHeaderItem) else 1 - ) - text = get_html_tag_with_text_direction( - html_tag=f"h{section_level}", text=text - ) + section_level = min(item.level + 1, 6) if isinstance(item, SectionHeaderItem) else 1 + text = get_html_tag_with_text_direction(html_tag=f"h{section_level}", text=text) elif isinstance(item, FormulaItem): text = self._process_formula( @@ -183,11 +179,7 @@ def serialize( ) elif isinstance(item, CodeItem): - text = ( - f"{text}" - if is_inline_scope - else f"
{text}
" - ) + text = f"{text}" if is_inline_scope else f"
{text}
" elif isinstance(item, ListItem): # List items are handled by list serializer @@ -271,11 +263,7 @@ def _process_formula( and orig != "" and len(item.prov) > 0 and image_mode == ImageRefMode.EMBEDDED - and ( - img_fallback := self._get_formula_image_fallback( - item=item, orig=orig, doc=doc - ) - ) + and (img_fallback := self._get_formula_image_fallback(item=item, orig=orig, doc=doc)) ): return img_fallback @@ -284,12 +272,8 @@ def _process_formula( try: # Set display mode based on context display_mode = "inline" if is_inline_scope else "block" - mathml_element = latex2mathml.converter.convert_to_element( - text, display=display_mode - ) - annotation = SubElement( - mathml_element, "annotation", dict(encoding="TeX") - ) + mathml_element = latex2mathml.converter.convert_to_element(text, display=display_mode) + annotation = SubElement(mathml_element, "annotation", dict(encoding="TeX")) annotation.text = text mathml = unescape(tostring(mathml_element, encoding="unicode")) @@ -300,14 +284,8 @@ def _process_formula( return f"
{mathml}
" except Exception: - img_fallback = self._get_formula_image_fallback( - item=item, orig=orig, doc=doc - ) - if ( - image_mode == ImageRefMode.EMBEDDED - and len(item.prov) > 0 - and img_fallback - ): + img_fallback = self._get_formula_image_fallback(item=item, orig=orig, doc=doc) + if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0 and img_fallback: return img_fallback elif text: return f"
{text}
" @@ -326,14 +304,12 @@ def _process_formula( return '
<div class="formula-not-decoded">Formula not decoded</div>
' - def _get_formula_image_fallback( - self, *, item: DocItem, orig: str, doc: DoclingDocument - ) -> Optional[str]: + def _get_formula_image_fallback(self, *, item: DocItem, orig: str, doc: DoclingDocument) -> Optional[str]: """Try to get an image fallback for a formula.""" item_image = item.get_image(doc=doc) if item_image is not None: img_ref = ImageRef.from_pil(item_image, dpi=72) - return "
" f'{orig}' "
" + return f'
{orig}
' return None @@ -362,7 +338,6 @@ def serialize( for i, row in enumerate(item.data.grid): body += "" for j, cell in enumerate(row): - rowspan, rowstart = ( cell.row_span, cell.start_row_offset_idx, @@ -378,9 +353,7 @@ def serialize( continue if isinstance(cell, RichTableCell): - ser_res = doc_serializer.serialize( - item=cell.ref.resolve(doc=doc), **kwargs - ) + ser_res = doc_serializer.serialize(item=cell.ref.resolve(doc=doc), **kwargs) content = ser_res.text span_source = [ser_res] else: @@ -448,7 +421,6 @@ def get_img_row(imgb64: str, ind: int) -> str: img_text = "" if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs): - if params.image_mode == ImageRefMode.EMBEDDED: # short-cut: we already have the image in base64 if ( @@ -458,10 +430,7 @@ def get_img_row(imgb64: str, ind: int) -> str: ): img_text = f'' elif len(item.prov) > 1: # more than 1 provenance - - img_text = ( - '\n' - ) + img_text = '
\n' for ind, prov in enumerate(item.prov): img = item.get_image(doc, prov_index=ind) @@ -485,8 +454,7 @@ def get_img_row(imgb64: str, ind: int) -> str: elif params.image_mode == ImageRefMode.REFERENCED: if isinstance(item.image, ImageRef) and not ( - isinstance(item.image.uri, AnyUrl) - and item.image.uri.scheme == "data" + isinstance(item.image.uri, AnyUrl) and item.image.uri.scheme == "data" ): img_text = f'' @@ -499,21 +467,13 @@ def get_img_row(imgb64: str, ind: int) -> str: kind=PictureTabularChartData.model_fields["kind"].default, ): # Check if picture has attached PictureTabularChartData - tabular_chart_annotations = [ - ann - for ann in item.annotations - if isinstance(ann, PictureTabularChartData) - ] + tabular_chart_annotations = [ann for ann in item.annotations if isinstance(ann, PictureTabularChartData)] if len(tabular_chart_annotations) > 0: temp_doc = DoclingDocument(name="temp") - temp_table = temp_doc.add_table( - data=tabular_chart_annotations[0].chart_data - ) + temp_table = temp_doc.add_table(data=tabular_chart_annotations[0].chart_data) html_table_content = temp_table.export_to_html(temp_doc) if len(html_table_content) > 0: - res_parts.append( - create_ser_result(text=html_table_content, span_source=item) - ) + res_parts.append(create_ser_result(text=html_table_content, span_source=item)) text_res = "".join([r.text for r in res_parts]) if text_res: @@ -537,30 +497,19 @@ def serialize( cell_map = {cell.cell_id: cell for cell in graph_data.cells} # Build relationship maps - child_links: dict[int, list[int]] = ( - {} - ) # source_id -> list of child_ids (to_child) + child_links: dict[int, list[int]] = {} # source_id -> list of child_ids (to_child) value_links: dict[int, list[int]] = {} # key_id -> list of value_ids (to_value) - parents: set[int] = ( - set() - ) # Set of all IDs that are targets of to_child (to find roots) + parents: set[int] = set() # Set of all IDs that are targets of to_child (to find roots) for link in graph_data.links: - if ( - link.source_cell_id not in cell_map - or link.target_cell_id not in cell_map - ): + if link.source_cell_id not in cell_map or link.target_cell_id not in cell_map: continue if link.label.value == "to_child": - child_links.setdefault(link.source_cell_id, []).append( - link.target_cell_id - ) + child_links.setdefault(link.source_cell_id, []).append(link.target_cell_id) parents.add(link.target_cell_id) elif link.label.value == "to_value": - value_links.setdefault(link.source_cell_id, []).append( - link.target_cell_id - ) + value_links.setdefault(link.source_cell_id, []).append(link.target_cell_id) # Find root cells (cells with no parent) root_ids = [cell_id for cell_id in cell_map.keys() if cell_id not in parents] @@ -624,7 +573,7 @@ def _render_cell_tree( cell_text = f"{cell_text}: {', '.join(value_texts)}" # If this cell has children, create a nested list - if cell_id in child_links and child_links[cell_id]: + if child_links.get(cell_id): children_html = [] children_html.append(f"
  • {cell_text}
  • ") children_html.append("
    ") @@ -1107,19 +1043,12 @@ def serialize_captions( if DocItemLabel.CAPTION in params.labels: for cap in item.captions: - if ( - isinstance(it := cap.resolve(self.doc), TextItem) - and it.self_ref not in excluded_refs - ): + if isinstance(it := cap.resolve(self.doc), TextItem) and it.self_ref not in excluded_refs: text_cap = it.text text_dir = get_text_direction(text_cap) dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else "" cap_ser_res = create_ser_result( - text=( - f'
    ' - f"{html.escape(text_cap)}" - f"
    " - ), + text=(f'
    {html.escape(text_cap)}
    '), span_source=it, ) results.append(cap_ser_res) @@ -1157,15 +1086,11 @@ def _generate_head(self) -> str: else: head_parts.append("Docling Document") - head_parts.append( - '' - ) + head_parts.append('') # Add default styles or custom CSS if params.css_styles: - if params.css_styles.startswith("" - ): + if params.css_styles.startswith(""): head_parts.append(f"\n{params.css_styles}\n") else: head_parts.append(f"") diff --git a/docling_core/transforms/serializer/latex.py b/docling_core/transforms/serializer/latex.py index 10ff899b..69edc7bc 100644 --- a/docling_core/transforms/serializer/latex.py +++ b/docling_core/transforms/serializer/latex.py @@ -187,9 +187,7 @@ def serialize( ) lvl = item.level if lvl <= 0 or lvl >= 4: - raise ValueError( - "LaTeX serializer: SectionHeaderItem.level must be in [1, 3]" - ) + raise ValueError("LaTeX serializer: SectionHeaderItem.level must be in [1, 3]") cmd = {1: "section", 2: "subsection", 3: "subsubsection"}[lvl] text_part = f"\\{cmd}{{{text}}}" post_process = False @@ -267,9 +265,7 @@ def serialize( if len(lines) <= 1: comment_text = f"% annotation[{ann.kind}]: {ann_text}" else: - prefixed_lines = [f"% annotation[{ann.kind}]: {lines[0]}"] + [ - f"% {ln}" for ln in lines[1:] - ] + prefixed_lines = [f"% annotation[{ann.kind}]: {lines[0]}"] + [f"% {ln}" for ln in lines[1:]] comment_text = "\n".join(prefixed_lines) res_parts.append( create_ser_result( @@ -310,15 +306,9 @@ def serialize( body_row: list[str] = [] for cell in row: if isinstance(cell, RichTableCell): - cell_text = doc_serializer.serialize( - item=cell.ref.resolve(doc=doc), **kwargs - ).text + cell_text = doc_serializer.serialize(item=cell.ref.resolve(doc=doc), **kwargs).text else: - cell_text = ( - _escape_latex(cell.text) - if params.escape_latex - else cell.text - ) + cell_text = _escape_latex(cell.text) if params.escape_latex else cell.text body_row.append(cell_text.replace("\n", " ")) body_rows.append(body_row) @@ -348,9 +338,7 @@ def serialize( if table_text: content.append(table_text) content.append("\\end{table}") - res_parts.append( - create_ser_result(text="\n".join(content), span_source=item) - ) + res_parts.append(create_ser_result(text="\n".join(content), span_source=item)) return create_ser_result( text="\n\n".join([r.text for r in res_parts if r.text]), @@ -401,25 +389,15 @@ def serialize( fig_lines.append(ann_res.text) fig_lines.append("\\end{figure}") - res_parts.append( - create_ser_result(text="\n".join(fig_lines), span_source=item) - ) + res_parts.append(create_ser_result(text="\n".join(fig_lines), span_source=item)) # Optional chart data as a simple table after the figure if params.enable_chart_tables: - tabular_chart_annotations = [ - ann - for ann in item.annotations - if isinstance(ann, PictureTabularChartData) - ] + tabular_chart_annotations = [ann for ann in item.annotations if isinstance(ann, PictureTabularChartData)] if tabular_chart_annotations: temp_doc = DoclingDocument(name="temp") - temp_table = temp_doc.add_table( - data=tabular_chart_annotations[0].chart_data - ) - latex_table_content = ( - LaTeXDocSerializer(doc=temp_doc).serialize(item=temp_table).text - ) + temp_table = temp_doc.add_table(data=tabular_chart_annotations[0].chart_data) + latex_table_content = LaTeXDocSerializer(doc=temp_doc).serialize(item=temp_table).text if latex_table_content: res_parts.append( create_ser_result( @@ -450,7 +428,7 @@ def _serialize_image_part( return create_ser_result(text=image_placeholder, span_source=item) else: return create_ser_result( - 
text=f"\\includegraphics[width=\\linewidth]{{{str(item.image.uri)}}}", + text=f"\\includegraphics[width=\\linewidth]{{{item.image.uri!s}}}", span_source=item, ) else: # EMBEDDED not supported natively @@ -523,11 +501,7 @@ def serialize( env = "enumerate" if item.first_item_is_enumerated(doc) else "itemize" indent_str = " " * (list_level * params.indent) content = "\n".join([p.text for p in parts if p.text]) - text_res = ( - f"{indent_str}\\begin{{{env}}}\n{content}\n{indent_str}\\end{{{env}}}" - if content - else "" - ) + text_res = f"{indent_str}\\begin{{{env}}}\n{content}\n{indent_str}\\end{{{env}}}" if content else "" return create_ser_result(text=text_res, span_source=parts) @@ -682,11 +656,7 @@ def serialize_doc( if title_cmd: preamble_lines.append(title_cmd) - header = ( - "\n".join(preamble_lines + ["", "\\begin{document}"]) - if preamble_lines - else "\\begin{document}" - ) + header = "\n".join(preamble_lines + ["", "\\begin{document}"]) if preamble_lines else "\\begin{document}" footer = "\\end{document}" # Compose final document with optional \maketitle after begin{document} diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index 0702241b..bd371362 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -159,28 +159,22 @@ def serialize( if isinstance(item, ListItem): pieces: list[str] = [] - case_auto = ( - params.orig_list_item_marker_mode == OrigListItemMarkerMode.AUTO - and bool(re.search(r"[a-zA-Z0-9]", item.marker)) + case_auto = params.orig_list_item_marker_mode == OrigListItemMarkerMode.AUTO and bool( + re.search(r"[a-zA-Z0-9]", item.marker) ) case_already_valid = ( params.ensure_valid_list_item_marker - and params.orig_list_item_marker_mode - != OrigListItemMarkerMode.NEVER - and ( - item.marker in ["-", "*", "+"] - or re.fullmatch(r"\d+\.", item.marker) - ) + and params.orig_list_item_marker_mode != OrigListItemMarkerMode.NEVER + and (item.marker in ["-", "*", "+"] or re.fullmatch(r"\d+\.", item.marker)) ) # wrap with outer marker (if applicable) if params.ensure_valid_list_item_marker and not case_already_valid: - assert item.parent and isinstance( - (list_group := item.parent.resolve(doc)), ListGroup - ) + assert item.parent + list_group = item.parent.resolve(doc) + assert isinstance(list_group, ListGroup) if list_group.first_item_is_enumerated(doc) and ( - params.orig_list_item_marker_mode != OrigListItemMarkerMode.AUTO - or not item.marker + params.orig_list_item_marker_mode != OrigListItemMarkerMode.AUTO or not item.marker ): pos = -1 for i, child in enumerate(list_group.children): @@ -267,21 +261,11 @@ def serialize( text="\n\n".join( [ tmp - for key in ( - list(item.meta.__class__.model_fields) - + list(item.meta.get_custom_part()) - ) + for key in (list(item.meta.__class__.model_fields) + list(item.meta.get_custom_part())) if ( - ( - params.allowed_meta_names is None - or key in params.allowed_meta_names - ) + (params.allowed_meta_names is None or key in params.allowed_meta_names) and (key not in params.blocked_meta_names) - and ( - tmp := self._serialize_meta_field( - item.meta, key, params.mark_meta - ) - ) + and (tmp := self._serialize_meta_field(item.meta, key, params.mark_meta)) ) ] if item.meta @@ -291,9 +275,7 @@ def serialize( # NOTE for now using an empty span source for GroupItems ) - def _serialize_meta_field( - self, meta: BaseMeta, name: str, mark_meta: bool - ) -> Optional[str]: + def _serialize_meta_field(self, meta: BaseMeta, 
name: str, mark_meta: bool) -> Optional[str]: if (field_val := getattr(meta, name)) is not None: if isinstance(field_val, SummaryMetaField): txt = field_val.text @@ -315,9 +297,7 @@ def _serialize_meta_field( txt = tmp else: return None - return ( - f"[{self._humanize_text(name, title=True)}] {txt}" if mark_meta else txt - ) + return f"[{self._humanize_text(name, title=True)}] {txt}" if mark_meta else txt else: return None @@ -349,11 +329,7 @@ def serialize( if ann_text := _get_annotation_text(ann): ann_res = create_ser_result( text=( - ( - f'' - f"{ann_text}" - f"" - ) + (f'{ann_text}') if params.mark_annotations else ann_text ), @@ -390,9 +366,7 @@ def serialize( res_parts.append(cap_res) if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs): - if _should_use_legacy_annotations(params=params, item=item): - ann_res = doc_serializer.serialize_annotations( item=item, **kwargs, @@ -405,9 +379,7 @@ def serialize( # make sure that md tables are not broken # due to newline chars in the text ( - doc_serializer.serialize( - item=col.ref.resolve(doc=doc), **kwargs - ).text + doc_serializer.serialize(item=col.ref.resolve(doc=doc), **kwargs).text if isinstance(col, RichTableCell) else col.text ).replace("\n", " ") @@ -483,21 +455,13 @@ def serialize( kind=PictureTabularChartData.model_fields["kind"].default, ): # Check if picture has attached PictureTabularChartData - tabular_chart_annotations = [ - ann - for ann in item.annotations - if isinstance(ann, PictureTabularChartData) - ] + tabular_chart_annotations = [ann for ann in item.annotations if isinstance(ann, PictureTabularChartData)] if len(tabular_chart_annotations) > 0: temp_doc = DoclingDocument(name="temp") - temp_table = temp_doc.add_table( - data=tabular_chart_annotations[0].chart_data - ) + temp_table = temp_doc.add_table(data=tabular_chart_annotations[0].chart_data) md_table_content = temp_table.export_to_markdown(temp_doc) if len(md_table_content) > 0: - res_parts.append( - create_ser_result(text=md_table_content, span_source=item) - ) + res_parts.append(create_ser_result(text=md_table_content, span_source=item)) text_res = "\n\n".join([r.text for r in res_parts if r.text]) return create_ser_result(text=text_res, span_source=res_parts) @@ -511,9 +475,7 @@ def _serialize_image_part( **kwargs: Any, ) -> SerializationResult: error_response = ( - "" + "" ) if image_mode == ImageRefMode.PLACEHOLDER: text_res = image_placeholder @@ -543,7 +505,7 @@ def _serialize_image_part( ): text_res = image_placeholder else: - text_res = f"![Image]({str(item.image.uri)})" + text_res = f"![Image]({item.image.uri!s})" else: text_res = image_placeholder @@ -739,7 +701,7 @@ def serialize_hyperlink( **kwargs: Any, ): """Apply Markdown-specific hyperlink serialization.""" - return f"[{text}]({str(hyperlink)})" + return f"[{text}]({hyperlink!s})" @classmethod def _escape_underscores(cls, text: str): diff --git a/docling_core/transforms/visualizer/key_value_visualizer.py b/docling_core/transforms/visualizer/key_value_visualizer.py index b0198455..7e22c89d 100644 --- a/docling_core/transforms/visualizer/key_value_visualizer.py +++ b/docling_core/transforms/visualizer/key_value_visualizer.py @@ -81,9 +81,7 @@ def _draw_key_value_layer( if cell.prov is None or cell.prov.page_no != page_no: continue # skip cells not on this page or without bbox - tl_bbox = cell.prov.bbox.to_top_left_origin( - page_height=doc.pages[page_no].size.height - ) + tl_bbox = cell.prov.bbox.to_top_left_origin(page_height=doc.pages[page_no].size.height) x0, y0, x1, y1 = 
tl_bbox.as_tuple() x0 *= scale_x x1 *= scale_x @@ -133,9 +131,7 @@ def _draw_key_value_layer( continue # only draw if both ends are on this page def _centre(bbox): - tl = bbox.to_top_left_origin( - page_height=doc.pages[page_no].size.height - ) + tl = bbox.to_top_left_origin(page_height=doc.pages[page_no].size.height) l, t, r, b = tl.as_tuple() return ((l + r) / 2 * scale_x, (t + b) / 2 * scale_y) @@ -162,9 +158,7 @@ def _centre(bbox): tgt_xy[0] - ux * arrow_len + px * arrow_len / 2, tgt_xy[1] - uy * arrow_len + py * arrow_len / 2, ) - draw.polygon( - [tgt_xy, head_base_left, head_base_right], fill=_LINK_COLOUR - ) + draw.polygon([tgt_xy, head_base_left, head_base_right], fill=_LINK_COLOUR) # --------------------------------------------------------------------- # Public API – BaseVisualizer implementation @@ -180,9 +174,7 @@ def get_visualization( ) -> dict[Optional[int], Image]: """Return page‑wise images with key/value overlay (incl. base layer).""" base_images = ( - self.base_visualizer.get_visualization( - doc=doc, included_content_layers=included_content_layers, **kwargs - ) + self.base_visualizer.get_visualization(doc=doc, included_content_layers=included_content_layers, **kwargs) if self.base_visualizer else None ) diff --git a/docling_core/transforms/visualizer/layout_visualizer.py b/docling_core/transforms/visualizer/layout_visualizer.py index 886ad8b4..190f6fd0 100644 --- a/docling_core/transforms/visualizer/layout_visualizer.py +++ b/docling_core/transforms/visualizer/layout_visualizer.py @@ -45,9 +45,7 @@ class Params(BaseModel): base_visualizer: Optional[BaseVisualizer] = None params: Params = Params() - def _draw_clusters( - self, image: Image, clusters: list[_TLCluster], scale_x: float, scale_y: float - ) -> None: + def _draw_clusters(self, image: Image, clusters: list[_TLCluster], scale_x: float, scale_y: float) -> None: """Draw clusters on an image.""" draw = ImageDraw.Draw(image, "RGBA") # Create a smaller font for the labels @@ -148,9 +146,7 @@ def _draw_doc_layout( prev_image = None prev_page_nr = None for idx, (elem, _) in enumerate( - doc.iterate_items( - included_content_layers=included_content_layers, traverse_pictures=True - ) + doc.iterate_items(included_content_layers=included_content_layers, traverse_pictures=True) ): if not isinstance(elem, DocItem): continue @@ -171,16 +167,12 @@ def _draw_doc_layout( self._draw_clusters( image=prev_image, clusters=clusters, - scale_x=prev_image.width - / doc.pages[prev_page_nr].size.width, - scale_y=prev_image.height - / doc.pages[prev_page_nr].size.height, + scale_x=prev_image.width / doc.pages[prev_page_nr].size.width, + scale_y=prev_image.height / doc.pages[prev_page_nr].size.height, ) clusters = [] - tlo_bbox = prov.bbox.to_top_left_origin( - page_height=doc.pages[prov.page_no].size.height - ) + tlo_bbox = prov.bbox.to_top_left_origin(page_height=doc.pages[prov.page_no].size.height) cluster = _TLCluster( id=idx, label=elem.label, @@ -211,11 +203,7 @@ def get_visualization( **kwargs, ) -> dict[Optional[int], Image]: """Get visualization of the document as images by page.""" - base_images = ( - self.base_visualizer.get_visualization(doc=doc, **kwargs) - if self.base_visualizer - else None - ) + base_images = self.base_visualizer.get_visualization(doc=doc, **kwargs) if self.base_visualizer else None return self._draw_doc_layout( doc=doc, images=base_images, diff --git a/docling_core/transforms/visualizer/reading_order_visualizer.py b/docling_core/transforms/visualizer/reading_order_visualizer.py index c012f22b..8f8f0803 
100644 --- a/docling_core/transforms/visualizer/reading_order_visualizer.py +++ b/docling_core/transforms/visualizer/reading_order_visualizer.py @@ -31,16 +31,12 @@ class Params(BaseModel): show_label: bool = True show_branch_numbering: bool = False - content_layers: set[ContentLayer] = { - cl for cl in ContentLayer if cl != ContentLayer.BACKGROUND - } + content_layers: set[ContentLayer] = {cl for cl in ContentLayer if cl != ContentLayer.BACKGROUND} base_visualizer: Optional[BaseVisualizer] = None params: Params = Params() - def _get_picture_context( - self, elem: DocItem, doc: DoclingDocument - ) -> Optional[str]: + def _get_picture_context(self, elem: DocItem, doc: DoclingDocument) -> Optional[str]: """Get the picture self_ref if element is nested inside a PictureItem, None otherwise.""" current = elem while current.parent is not None: @@ -112,14 +108,10 @@ def _draw_doc_reading_order( # Separate reading order paths for outside vs inside pictures # Key: (page_no, picture_ref_or_None) -> (x0, y0, element_index) # picture_ref is None for elements outside any picture, otherwise the picture's self_ref - reading_order_state: dict[ - tuple[int, Optional[str]], tuple[float, float, int] - ] = {} + reading_order_state: dict[tuple[int, Optional[str]], tuple[float, float, int]] = {} number_data_to_draw: dict[int, list[_NumberDrawingData]] = {} # Only int keys are used (from prov.page_no), even if input images has Optional[int] keys - my_images: dict[int, Image] = { - k: v for k, v in (images or {}).items() if k is not None - } + my_images: dict[int, Image] = {k: v for k, v in (images or {}).items() if k is not None} prev_page: Optional[int] = None element_index = 0 @@ -153,23 +145,16 @@ def _draw_doc_reading_order( if image is None: page_image = doc.pages[page_no].image - if ( - page_image is None - or (pil_img := page_image.pil_image) is None - ): - raise RuntimeError( - "Cannot visualize document without images" - ) + if page_image is None or (pil_img := page_image.pil_image) is None: + raise RuntimeError("Cannot visualize document without images") else: image = deepcopy(pil_img) my_images[page_no] = image draw = ImageDraw.Draw(image, "RGBA") - tlo_bbox = prov.bbox.to_top_left_origin( - page_height=doc.pages[prov.page_no].size.height - ) + tlo_bbox = prov.bbox.to_top_left_origin(page_height=doc.pages[prov.page_no].size.height) ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size) - ro_bbox.l = round(ro_bbox.l * image.width) # noqa: E741 + ro_bbox.l = round(ro_bbox.l * image.width) ro_bbox.r = round(ro_bbox.r * image.width) ro_bbox.t = round(ro_bbox.t * image.height) ro_bbox.b = round(ro_bbox.b * image.height) @@ -214,7 +199,6 @@ def _draw_doc_reading_order( draw = ImageDraw.Draw(image, "RGBA") for num_item in number_data_to_draw[page]: - text_bbox = draw.textbbox(num_item.xy, num_item.text, font) text_bg_padding = 5 draw.ellipse( @@ -247,11 +231,7 @@ def get_visualization( **kwargs, ) -> dict[Optional[int], Image]: """Get visualization of the document as images by page.""" - base_images = ( - self.base_visualizer.get_visualization(doc=doc, **kwargs) - if self.base_visualizer - else None - ) + base_images = self.base_visualizer.get_visualization(doc=doc, **kwargs) if self.base_visualizer else None return self._draw_doc_reading_order( doc=doc, images=base_images, diff --git a/docling_core/transforms/visualizer/table_visualizer.py b/docling_core/transforms/visualizer/table_visualizer.py index 0a722959..2b7b6813 100644 --- a/docling_core/transforms/visualizer/table_visualizer.py +++ 
b/docling_core/transforms/visualizer/table_visualizer.py @@ -57,20 +57,15 @@ def _draw_table_cells( for cell in table.data.table_cells: if cell.bbox is not None: - tl_bbox = cell.bbox.to_top_left_origin(page_height=page_height) cell_color = self.params.cell_color # Transparent black for cells cell_outline = self.params.cell_outline if cell.column_header: - cell_color = ( - self.params.col_header_color - ) # Transparent black for cells + cell_color = self.params.col_header_color # Transparent black for cells cell_outline = self.params.col_header_outline if cell.row_header: - cell_color = ( - self.params.row_header_color - ) # Transparent black for cells + cell_color = self.params.row_header_color # Transparent black for cells cell_outline = self.params.row_header_outline if cell.row_section: cell_color = self.params.row_header_color @@ -102,7 +97,6 @@ def _draw_table_rows( rows = table.data.get_row_bounding_boxes() for rid, bbox in rows.items(): - tl_bbox = bbox.to_top_left_origin(page_height=page_height) cx0, cy0, cx1, cy1 = tl_bbox.as_tuple() @@ -131,7 +125,6 @@ def _draw_table_cols( cols = table.data.get_column_bounding_boxes() for cid, bbox in cols.items(): - tl_bbox = bbox.to_top_left_origin(page_height=page_height) cx0, cy0, cx1, cy1 = tl_bbox.as_tuple() @@ -171,16 +164,13 @@ def _draw_doc_tables( image = deepcopy(pil_img) my_images[page_nr] = image - for idx, (elem, _) in enumerate( - doc.iterate_items(included_content_layers=included_content_layers) - ): + for idx, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)): if not isinstance(elem, TableItem): continue if len(elem.prov) == 0: continue # Skip elements without provenances if len(elem.prov) == 1: - page_nr = elem.prov[0].page_no if page_nr in my_images: @@ -229,11 +219,7 @@ def get_visualization( **kwargs, ) -> dict[Optional[int], Image]: """Get visualization of the document as images by page.""" - base_images = ( - self.base_visualizer.get_visualization(doc=doc, **kwargs) - if self.base_visualizer - else None - ) + base_images = self.base_visualizer.get_visualization(doc=doc, **kwargs) if self.base_visualizer else None return self._draw_doc_tables( doc=doc, images=base_images, diff --git a/docling_core/types/base.py b/docling_core/types/base.py index 62460b4e..5eceab70 100644 --- a/docling_core/types/base.py +++ b/docling_core/types/base.py @@ -53,14 +53,10 @@ StrictDateTime = Annotated[ datetime, WrapValidator(validate_datetime), - PlainSerializer( - lambda x: x.astimezone(tz=timezone.utc).isoformat(), return_type=str - ), + PlainSerializer(lambda x: x.astimezone(tz=timezone.utc).isoformat(), return_type=str), ] -ACQUISITION_TYPE = Literal[ - "API", "FTP", "Download", "Link", "Web scraping/Crawling", "Other" -] +ACQUISITION_TYPE = Literal["API", "FTP", "Download", "Link", "Web scraping/Crawling", "Other"] class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"): @@ -68,16 +64,11 @@ class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"): type_: IdentifierTypeT = Field( alias="type", - description=( - "A string representing a collection or database that contains this " - "data object." - ), + description=("A string representing a collection or database that contains this data object."), json_schema_extra=es_field(type="keyword", ignore_above=8191), ) value: StrictStr = Field( - description=( - "The identifier value of the data object within a collection or database." 
- ), + description=("The identifier value of the data object within a collection or database."), json_schema_extra=es_field(type="keyword", ignore_above=8191), ) name: str = Field( @@ -103,8 +94,7 @@ def name_from_type_value(cls, v, info: ValidationInfo): and v != f"{info.data['type_'].lower()}#{info.data['value'].lower()}" ): raise ValueError( - "the _name field must be the concatenation of type and value in lower " - "case, separated by hash (#)" + "the _name field must be the concatenation of type and value in lower case, separated by hash (#)" ) return v @@ -134,9 +124,7 @@ class Log(AliasModel, extra="forbid"): description="A description of the task or any comments in natural language.", ) date: StrictDateTime = Field( - description=( - "A string representation of the task execution datetime in ISO 8601 format." - ) + description=("A string representation of the task execution datetime in ISO 8601 format.") ) @@ -149,18 +137,12 @@ class FileInfoObject(AliasModel): ) fileprov: Optional[StrictStr] = Field( default=None, - description=( - "The provenance of this data object, e.g. an archive file, a URL, or any" - " other repository." - ), + description=("The provenance of this data object, e.g. an archive file, a URL, or any other repository."), alias="filename-prov", json_schema_extra=es_field(type="keyword", ignore_above=8191), ) document_hash: StrictStr = Field( - description=( - "A unique identifier of this data object within a collection of a " - "Docling database" - ), + description=("A unique identifier of this data object within a collection of a Docling database"), alias="document-hash", json_schema_extra=es_field(type="keyword", ignore_above=8191), ) @@ -177,9 +159,7 @@ class CollectionTypeEnum(str, Enum): CollectionTypeT = TypeVar("CollectionTypeT", bound=CollectionTypeEnum) -class CollectionInfo( - BaseModel, Generic[CollectionNameTypeT, CollectionTypeT], extra="forbid" -): +class CollectionInfo(BaseModel, Generic[CollectionNameTypeT, CollectionTypeT], extra="forbid"): """Information of a collection.""" name: Optional[CollectionNameTypeT] = Field( @@ -192,9 +172,7 @@ class CollectionInfo( description="The collection type.", json_schema_extra=es_field(type="keyword", ignore_above=8191), ) - version: Optional[ - Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] - ] = Field( + version: Optional[Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)]] = Field( default=None, description="The version of this collection model.", json_schema_extra=es_field(type="keyword", ignore_above=8191), @@ -231,9 +209,7 @@ class Acquisition(BaseModel, extra="forbid"): ) date: Optional[StrictDateTime] = Field( default=None, - description=( - "A string representation of the acquisition datetime in ISO 8601 format." 
- ), + description=("A string representation of the acquisition datetime in ISO 8601 format."), ) link: Optional[AnyUrl] = Field( default=None, diff --git a/docling_core/types/doc/base.py b/docling_core/types/doc/base.py index f4a020e4..d13a6b81 100644 --- a/docling_core/types/doc/base.py +++ b/docling_core/types/doc/base.py @@ -28,14 +28,10 @@ class PydanticSerCtxKey(str, Enum): CONFID_PREC = "confid_prec" # key for confidence values precision -def round_pydantic_float( - val: float, ctx: Any, precision_ctx_key: PydanticSerCtxKey -) -> float: +def round_pydantic_float(val: float, ctx: Any, precision_ctx_key: PydanticSerCtxKey) -> float: """Round float, provided the precision is available in the context.""" precision = ( - ctx.get(precision_ctx_key.value) - if isinstance(ctx, dict) - else getattr(ctx, precision_ctx_key.value, None) + ctx.get(precision_ctx_key.value) if isinstance(ctx, dict) else getattr(ctx, precision_ctx_key.value, None) ) return round(val, precision) if isinstance(precision, int) else val @@ -104,9 +100,7 @@ def scaled(self, scale: float): # same as before, but using the implementation above def normalized(self, page_size: Size): """normalized.""" - return self.scale_to_size( - old_size=page_size, new_size=Size(height=1.0, width=1.0) - ) + return self.scale_to_size(old_size=page_size, new_size=Size(height=1.0, width=1.0)) def expand_by_scale(self, x_scale: float, y_scale: float) -> "BoundingBox": """expand_to_size.""" @@ -190,9 +184,7 @@ def intersection_area_with(self, other: "BoundingBox") -> float: return width * height - def intersection_over_union( - self, other: "BoundingBox", eps: float = 1.0e-6 - ) -> float: + def intersection_over_union(self, other: "BoundingBox", eps: float = 1.0e-6) -> float: """intersection_over_union.""" intersection_area = self.intersection_area_with(other=other) @@ -204,9 +196,7 @@ def intersection_over_union( return intersection_area / (union_area + eps) - def intersection_over_self( - self, other: "BoundingBox", eps: float = 1.0e-6 - ) -> float: + def intersection_over_self(self, other: "BoundingBox", eps: float = 1.0e-6) -> float: """intersection_over_self.""" intersection_area = self.intersection_area_with(other=other) if self.area() > 0: @@ -244,17 +234,13 @@ def get_intersection_bbox(self, other: "BoundingBox") -> Optional["BoundingBox"] bottom = min(self.b, other.b) if right <= left or bottom <= top: return None - return BoundingBox( - l=left, t=top, r=right, b=bottom, coord_origin=self.coord_origin - ) + return BoundingBox(l=left, t=top, r=right, b=bottom, coord_origin=self.coord_origin) top = min(self.t, other.t) bottom = max(self.b, other.b) if right <= left or top <= bottom: return None - return BoundingBox( - l=left, t=top, r=right, b=bottom, coord_origin=self.coord_origin - ) + return BoundingBox(l=left, t=top, r=right, b=bottom, coord_origin=self.coord_origin) def to_top_left_origin(self, page_height: float) -> "BoundingBox": """to_top_left_origin. 
@@ -275,9 +261,7 @@ def to_top_left_origin(self, page_height: float) -> "BoundingBox": def overlaps(self, other: "BoundingBox") -> bool: """overlaps.""" - return self.overlaps_horizontally(other=other) and self.overlaps_vertically( - other=other - ) + return self.overlaps_horizontally(other=other) and self.overlaps_vertically(other=other) def overlaps_horizontally(self, other: "BoundingBox") -> bool: """Check if two bounding boxes overlap horizontally.""" @@ -296,13 +280,8 @@ def overlaps_vertically(self, other: "BoundingBox") -> bool: def overlaps_vertically_with_iou(self, other: "BoundingBox", iou: float) -> bool: """overlaps_y_with_iou.""" - if ( - self.coord_origin == CoordOrigin.BOTTOMLEFT - and other.coord_origin == CoordOrigin.BOTTOMLEFT - ): - + if self.coord_origin == CoordOrigin.BOTTOMLEFT and other.coord_origin == CoordOrigin.BOTTOMLEFT: if self.overlaps_vertically(other=other): - u0 = min(self.b, other.b) u1 = max(self.t, other.t) @@ -314,10 +293,7 @@ def overlaps_vertically_with_iou(self, other: "BoundingBox", iou: float) -> bool return False - elif ( - self.coord_origin == CoordOrigin.TOPLEFT - and other.coord_origin == CoordOrigin.TOPLEFT - ): + elif self.coord_origin == CoordOrigin.TOPLEFT and other.coord_origin == CoordOrigin.TOPLEFT: if self.overlaps_vertically(other=other): u0 = min(self.t, other.t) u1 = max(self.b, other.b) @@ -344,16 +320,10 @@ def is_strictly_left_of(self, other: "BoundingBox", eps: float = 0.001) -> bool: def is_above(self, other: "BoundingBox") -> bool: """is_above.""" - if ( - self.coord_origin == CoordOrigin.BOTTOMLEFT - and other.coord_origin == CoordOrigin.BOTTOMLEFT - ): + if self.coord_origin == CoordOrigin.BOTTOMLEFT and other.coord_origin == CoordOrigin.BOTTOMLEFT: return self.t > other.t - elif ( - self.coord_origin == CoordOrigin.TOPLEFT - and other.coord_origin == CoordOrigin.TOPLEFT - ): + elif self.coord_origin == CoordOrigin.TOPLEFT and other.coord_origin == CoordOrigin.TOPLEFT: return self.t < other.t else: @@ -363,16 +333,10 @@ def is_above(self, other: "BoundingBox") -> bool: def is_strictly_above(self, other: "BoundingBox", eps: float = 1.0e-3) -> bool: """is_strictly_above.""" - if ( - self.coord_origin == CoordOrigin.BOTTOMLEFT - and other.coord_origin == CoordOrigin.BOTTOMLEFT - ): + if self.coord_origin == CoordOrigin.BOTTOMLEFT and other.coord_origin == CoordOrigin.BOTTOMLEFT: return (self.b + eps) > other.t - elif ( - self.coord_origin == CoordOrigin.TOPLEFT - and other.coord_origin == CoordOrigin.TOPLEFT - ): + elif self.coord_origin == CoordOrigin.TOPLEFT and other.coord_origin == CoordOrigin.TOPLEFT: return (self.b + eps) < other.t else: @@ -380,9 +344,7 @@ def is_strictly_above(self, other: "BoundingBox", eps: float = 1.0e-3) -> bool: return False - def is_horizontally_connected( - self, elem_i: "BoundingBox", elem_j: "BoundingBox" - ) -> bool: + def is_horizontally_connected(self, elem_i: "BoundingBox", elem_j: "BoundingBox") -> bool: """is_horizontally_connected.""" if ( self.coord_origin == CoordOrigin.BOTTOMLEFT diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 414640d6..006d15b2 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -334,8 +334,7 @@ def from_dict_format(cls, data: Any) -> Any: # "bbox" not in data # or data["bbox"] is None # or isinstance(data["bbox"], BoundingBox) - "text" - in data + "text" in data ): return data text = data.get("bbox", {}).get("token", "") @@ -364,9 +363,7 @@ def _get_text(self, doc: 
Optional["DoclingDocument"] = None, **kwargs: Any) -> s from docling_core.transforms.serializer.markdown import MarkdownDocSerializer if doc is not None: - doc_serializer = kwargs.pop( - "doc_serializer", MarkdownDocSerializer(doc=doc) - ) + doc_serializer = kwargs.pop("doc_serializer", MarkdownDocSerializer(doc=doc)) ser_res = doc_serializer.serialize(item=self.ref.resolve(doc=doc), **kwargs) return ser_res.text else: @@ -421,9 +418,7 @@ def grid( return table_data - def remove_rows( - self, indices: List[int], doc: Optional["DoclingDocument"] = None - ) -> List[List[TableCell]]: + def remove_rows(self, indices: List[int], doc: Optional["DoclingDocument"] = None) -> List[List[TableCell]]: """Remove rows from the table by their indices. :param indices: List[int]: A list of indices of the rows to remove. (Starting from 0) @@ -486,9 +481,7 @@ def pop_row(self, doc: Optional["DoclingDocument"] = None) -> List[TableCell]: return self.remove_row(self.num_rows - 1, doc=doc) - def remove_row( - self, row_index: int, doc: Optional["DoclingDocument"] = None - ) -> List[TableCell]: + def remove_row(self, row_index: int, doc: Optional["DoclingDocument"] = None) -> List[TableCell]: """Remove a row from the table by its index. :param row_index: int: The index of the row to remove. (Starting from 0) @@ -497,9 +490,7 @@ def remove_row( """ return self.remove_rows([row_index], doc=doc)[0] - def insert_rows( - self, row_index: int, rows: List[List[str]], after: bool = False - ) -> None: + def insert_rows(self, row_index: int, rows: List[List[str]], after: bool = False) -> None: """Insert multiple new rows from a list of lists of strings before/after a specific index in the table. :param row_index: int: The index at which to insert the new rows. (Starting from 0) @@ -523,16 +514,12 @@ def insert_row(self, row_index: int, row: List[str], after: bool = False) -> Non :returns: None """ if len(row) != self.num_cols: - raise ValueError( - f"Row length {len(row)} does not match the number of columns {self.num_cols}." - ) + raise ValueError(f"Row length {len(row)} does not match the number of columns {self.num_cols}.") effective_index = row_index + (1 if after else 0) if effective_index < 0 or effective_index > self.num_rows: - raise IndexError( - f"Row index {row_index} is out of bounds for the current number of rows {self.num_rows}." 
- ) + raise IndexError(f"Row index {row_index} is out of bounds for the current number of rows {self.num_rows}.") new_row_cells = [ TableCell( @@ -604,12 +591,7 @@ def get_row_bounding_boxes(self) -> dict[int, BoundingBox]: # Collect all cells in this row that have bounding boxes for cell in self.table_cells: - - if ( - cell.bbox is not None - and cell.start_row_offset_idx <= row_idx < cell.end_row_offset_idx - ): - + if cell.bbox is not None and cell.start_row_offset_idx <= row_idx < cell.end_row_offset_idx: row_span = cell.end_row_offset_idx - cell.start_row_offset_idx if row_span in row_cells_with_bbox: row_cells_with_bbox[row_span].append(cell.bbox) @@ -619,9 +601,7 @@ def get_row_bounding_boxes(self) -> dict[int, BoundingBox]: # Calculate the enclosing bounding box for this row if len(row_cells_with_bbox) > 0: min_row_span = min(row_cells_with_bbox.keys()) - row_bbox: BoundingBox = BoundingBox.enclosing_bbox( - row_cells_with_bbox[min_row_span] - ) + row_bbox: BoundingBox = BoundingBox.enclosing_bbox(row_cells_with_bbox[min_row_span]) for rspan, bboxs in row_cells_with_bbox.items(): for bbox in bboxs: @@ -658,12 +638,7 @@ def get_column_bounding_boxes(self) -> dict[int, BoundingBox]: # Collect all cells in this row that have bounding boxes for cell in self.table_cells: - - if ( - cell.bbox is not None - and cell.start_col_offset_idx <= col_idx < cell.end_col_offset_idx - ): - + if cell.bbox is not None and cell.start_col_offset_idx <= col_idx < cell.end_col_offset_idx: col_span = cell.end_col_offset_idx - cell.start_col_offset_idx if col_span in col_cells_with_bbox: col_cells_with_bbox[col_span].append(cell.bbox) @@ -673,9 +648,7 @@ def get_column_bounding_boxes(self) -> dict[int, BoundingBox]: # Calculate the enclosing bounding box for this row if len(col_cells_with_bbox) > 0: min_col_span = min(col_cells_with_bbox.keys()) - col_bbox: BoundingBox = BoundingBox.enclosing_bbox( - col_cells_with_bbox[min_col_span] - ) + col_bbox: BoundingBox = BoundingBox.enclosing_bbox(col_cells_with_bbox[min_col_span]) for rspan, bboxs in col_cells_with_bbox.items(): for bbox in bboxs: @@ -701,10 +674,7 @@ def _dedupe_bboxes( """Return elements whose bounding boxes are unique within ``iou_threshold``.""" deduped: list[BoundingBox] = [] for element in elements: - if all( - element.intersection_over_union(kept) < iou_threshold - for kept in deduped - ): + if all(element.intersection_over_union(kept) < iou_threshold for kept in deduped): deduped.append(element) return deduped @@ -790,12 +760,8 @@ def span_from_merge( # 1) Add merged cells first (and mark their covered simple cells) for m in merges: - rspan = span_from_merge( - m, rows, axis="row", frac_threshold=row_overlap_threshold - ) - cspan = span_from_merge( - m, columns, axis="col", frac_threshold=col_overlap_threshold - ) + rspan = span_from_merge(m, rows, axis="row", frac_threshold=row_overlap_threshold) + cspan = span_from_merge(m, columns, axis="col", frac_threshold=col_overlap_threshold) if rspan is None or cspan is None: # Can't confidently map this merge to grid -> skip it continue @@ -846,10 +812,8 @@ def span_from_merge( if not inter: # In degenerate cases (big gaps), there might be no intersection; skip. 
continue - c_column_header, c_row_header, c_row_section = ( - cls._process_table_headers( - inter, col_headers, row_headers, row_sections - ) + c_column_header, c_row_header, c_row_section = cls._process_table_headers( + inter, col_headers, row_headers, row_sections ) cells.append( TableCell( @@ -886,47 +850,23 @@ def from_regions( default_containment_thresh = 0.5 rows.extend(row_sections) # use row sections to compensate for missing rows rows = cls._dedupe_bboxes( - [ - e - for e in rows - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] + [e for e in rows if e.intersection_over_self(table_bbox) >= default_containment_thresh] ) cols = cls._dedupe_bboxes( - [ - e - for e in cols - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] + [e for e in cols if e.intersection_over_self(table_bbox) >= default_containment_thresh] ) merges = cls._dedupe_bboxes( - [ - e - for e in merges - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] + [e for e in merges if e.intersection_over_self(table_bbox) >= default_containment_thresh] ) col_headers = cls._dedupe_bboxes( - [ - e - for e in col_headers - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] + [e for e in col_headers if e.intersection_over_self(table_bbox) >= default_containment_thresh] ) row_headers = cls._dedupe_bboxes( - [ - e - for e in row_headers - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] + [e for e in row_headers if e.intersection_over_self(table_bbox) >= default_containment_thresh] ) row_sections = cls._dedupe_bboxes( - [ - e - for e in row_sections - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] + [e for e in row_sections if e.intersection_over_self(table_bbox) >= default_containment_thresh] ) # Compute table cells from CVAT elements: rows, cols, merges @@ -1033,9 +973,7 @@ def parse_hex_string(cls, value): # Convert hex string to an integer hash_int = Uint64(value, 16) # Mask to fit within 64 bits (unsigned) - return ( - hash_int & 0xFFFFFFFFFFFFFFFF - ) # TODO be sure it doesn't clip uint64 max + return hash_int & 0xFFFFFFFFFFFFFFFF # TODO be sure it doesn't clip uint64 max except ValueError: raise ValueError(f"Invalid sha256 hexdigest: {value}") return value # If already an int, return it as is. 
@@ -1237,9 +1175,7 @@ def get_custom_part(self) -> dict[str, Any]: def _copy_without_extra(self) -> Self: """Create a copy without the extra fields.""" - return self.model_validate( - self.model_dump(exclude={ex for ex in self.get_custom_part()}) - ) + return self.model_validate(self.model_dump(exclude={ex for ex in self.get_custom_part()})) def _check_custom_field_format(self, key: str) -> None: parts = key.split(MetaUtils._META_FIELD_NAMESPACE_DELIMITER, maxsplit=1) @@ -1255,9 +1191,7 @@ def _validate_field_names(self) -> Self: if key in extra_dict: self._check_custom_field_format(key=key) elif MetaUtils._META_FIELD_NAMESPACE_DELIMITER in key: - raise ValueError( - f"Standard meta field name must not contain '__': {key}" - ) + raise ValueError(f"Standard meta field name must not contain '__': {key}") return self @@ -1332,18 +1266,14 @@ class PictureClassificationPrediction(BasePrediction): class PictureClassificationMetaField(_ExtraAllowingModel): """Picture classification metadata field.""" - predictions: list[PictureClassificationPrediction] = Field( - default_factory=list, min_length=1 - ) + predictions: list[PictureClassificationPrediction] = Field(default_factory=list, min_length=1) def get_main_prediction(self) -> PictureClassificationPrediction: """Get prediction with highest confidence (if confidence not available, first is used by convention).""" max_conf_pos: Optional[int] = None max_conf: Optional[float] = None for i, pred in enumerate(self.predictions): - if pred.confidence is not None and ( - max_conf is None or pred.confidence > max_conf - ): + if pred.confidence is not None and (max_conf is None or pred.confidence > max_conf): max_conf_pos = i max_conf = pred.confidence return self.predictions[max_conf_pos if max_conf_pos is not None else 0] @@ -1393,9 +1323,7 @@ def get_ref(self) -> RefItem: """get_ref.""" return RefItem(cref=self.self_ref) - def _get_parent_ref( - self, doc: "DoclingDocument", stack: list[int] - ) -> Optional[RefItem]: + def _get_parent_ref(self, doc: "DoclingDocument", stack: list[int]) -> Optional[RefItem]: """get_parent_ref.""" if len(stack) == 0: return self.parent @@ -1416,9 +1344,7 @@ def _delete_child(self, doc: "DoclingDocument", stack: list[int]) -> bool: return False - def _update_child( - self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem - ) -> bool: + def _update_child(self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem) -> bool: """Update child node in tree.""" if len(stack) == 1 and stack[0] < len(self.children): # ensure the parent is correct @@ -1433,12 +1359,9 @@ def _update_child( return False - def _add_child( - self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem - ) -> bool: + def _add_child(self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem) -> bool: """Append child to node identified by stack.""" if len(stack) == 0: - # ensure the parent is correct new_item = new_ref.resolve(doc=doc) new_item.parent = self.get_ref() @@ -1475,9 +1398,7 @@ def _add_sibling( return True elif len(stack) > 1 and stack[0] < len(self.children): item = self.children[stack[0]].resolve(doc) - return item._add_sibling( - doc=doc, stack=stack[1:], new_ref=new_ref, after=after - ) + return item._add_sibling(doc=doc, stack=stack[1:], new_ref=new_ref, after=after) return False @@ -1517,9 +1438,7 @@ def first_item_is_enumerated(self, doc: "DoclingDocument"): class OrderedList(GroupItem): """OrderedList.""" - label: typing.Literal[GroupLabel.ORDERED_LIST] = ( - GroupLabel.ORDERED_LIST # type: ignore[assignment] 
- ) + label: typing.Literal[GroupLabel.ORDERED_LIST] = GroupLabel.ORDERED_LIST # type: ignore[assignment] class InlineGroup(GroupItem): @@ -1528,9 +1447,7 @@ class InlineGroup(GroupItem): label: typing.Literal[GroupLabel.INLINE] = GroupLabel.INLINE -class DocItem( - NodeItem -): # Base type for any element that carries content, can be a leaf node +class DocItem(NodeItem): # Base type for any element that carries content, can be a leaf node """DocItem.""" label: DocItemLabel @@ -1564,9 +1481,7 @@ def get_location_tokens( return location - def get_image( - self, doc: "DoclingDocument", prov_index: int = 0 - ) -> Optional[PILImage.Image]: + def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PILImage.Image]: """Returns the image of this DocItem. The function returns None if this DocItem has no valid provenance or @@ -1634,9 +1549,7 @@ class TextItem(DocItem): text: str # sanitized representation formatting: Optional[Formatting] = None - hyperlink: Optional[Union[AnyUrl, Path]] = Field( - union_mode="left_to_right", default=None - ) + hyperlink: Optional[Union[AnyUrl, Path]] = Field(union_mode="left_to_right", default=None) @deprecated("Use export_to_doctags() instead.") def export_to_document_tokens(self, *args, **kwargs): @@ -1683,17 +1596,13 @@ def export_to_doctags( class TitleItem(TextItem): """TitleItem.""" - label: typing.Literal[DocItemLabel.TITLE] = ( - DocItemLabel.TITLE # type: ignore[assignment] - ) + label: typing.Literal[DocItemLabel.TITLE] = DocItemLabel.TITLE # type: ignore[assignment] class SectionHeaderItem(TextItem): """SectionItem.""" - label: typing.Literal[DocItemLabel.SECTION_HEADER] = ( - DocItemLabel.SECTION_HEADER # type: ignore[assignment] - ) + label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER # type: ignore[assignment] level: LevelNumber = 1 @deprecated("Use export_to_doctags() instead.") @@ -1741,9 +1650,7 @@ def export_to_doctags( class ListItem(TextItem): """SectionItem.""" - label: typing.Literal[DocItemLabel.LIST_ITEM] = ( - DocItemLabel.LIST_ITEM # type: ignore[assignment] - ) + label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM # type: ignore[assignment] enumerated: bool = False marker: str = "-" # The bullet or number symbol that prefixes this list item @@ -1765,9 +1672,7 @@ def caption_text(self, doc: "DoclingDocument") -> str: text += cap.resolve(doc).text return text - def get_image( - self, doc: "DoclingDocument", prov_index: int = 0 - ) -> Optional[PILImage.Image]: + def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PILImage.Image]: """Returns the image corresponding to this FloatingItem. This function returns the PIL image from self.image if one is available. 
@@ -1785,9 +1690,7 @@ def get_image( class CodeItem(FloatingItem, TextItem): """CodeItem.""" - label: typing.Literal[DocItemLabel.CODE] = ( - DocItemLabel.CODE # type: ignore[assignment] - ) + label: typing.Literal[DocItemLabel.CODE] = DocItemLabel.CODE # type: ignore[assignment] code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN @deprecated("Use export_to_doctags() instead.") @@ -1835,9 +1738,7 @@ def export_to_doctags( class FormulaItem(TextItem): """FormulaItem.""" - label: typing.Literal[DocItemLabel.FORMULA] = ( - DocItemLabel.FORMULA # type: ignore[assignment] - ) + label: typing.Literal[DocItemLabel.FORMULA] = DocItemLabel.FORMULA # type: ignore[assignment] class MetaUtils: @@ -1862,17 +1763,13 @@ def _create_migrated_meta_field_name( *, name: str, ) -> str: - return cls.create_meta_field_name( - namespace=cls._META_FIELD_LEGACY_NAMESPACE, name=name - ) + return cls.create_meta_field_name(namespace=cls._META_FIELD_LEGACY_NAMESPACE, name=name) class PictureItem(FloatingItem): """PictureItem.""" - label: typing.Literal[DocItemLabel.PICTURE, DocItemLabel.CHART] = ( - DocItemLabel.PICTURE - ) + label: typing.Literal[DocItemLabel.PICTURE, DocItemLabel.CHART] = DocItemLabel.PICTURE meta: Optional[PictureMeta] = None annotations: Annotated[ @@ -1920,12 +1817,8 @@ def _migrate_annotations_to_meta(self) -> Self: confidence=ann.confidence, created_by=ann.provenance, **{ - MetaUtils._create_migrated_meta_field_name( - name="segmentation" - ): ann.segmentation, - MetaUtils._create_migrated_meta_field_name( - name="class_name" - ): ann.class_name, + MetaUtils._create_migrated_meta_field_name(name="segmentation"): ann.segmentation, + MetaUtils._create_migrated_meta_field_name(name="class_name"): ann.class_name, }, ) elif isinstance(ann, PictureTabularChartData): @@ -1937,11 +1830,7 @@ def _migrate_annotations_to_meta(self) -> Self: self.meta.set_custom_field( namespace=MetaUtils._META_FIELD_LEGACY_NAMESPACE, name=ann.kind, - value=( - ann.content - if isinstance(ann, MiscAnnotation) - else ann.model_dump(mode="json") - ), + value=(ann.content if isinstance(ann, MiscAnnotation) else ann.model_dump(mode="json")), ) return self @@ -1952,9 +1841,7 @@ def _image_to_base64(self, pil_image, format="PNG"): buffered = BytesIO() pil_image.save(buffered, format=format) # Save the image to the byte stream img_bytes = buffered.getvalue() # Get the byte data - img_base64 = base64.b64encode(img_bytes).decode( - "utf-8" - ) # Encode to Base64 and decode to string + img_base64 = base64.b64encode(img_bytes).decode("utf-8") # Encode to Base64 and decode to string return img_base64 @staticmethod @@ -2109,7 +1996,6 @@ def _migrate_annotations_to_meta(self) -> Self: "Note that only the first available instance of each annotation type will be migrated." 
) for ann in self.annotations: - # ensure meta field is present if self.meta is None: self.meta = FloatingMeta() @@ -2123,23 +2009,15 @@ def _migrate_annotations_to_meta(self) -> Self: self.meta.set_custom_field( namespace=MetaUtils._META_FIELD_LEGACY_NAMESPACE, name=ann.kind, - value=( - ann.content - if isinstance(ann, MiscAnnotation) - else ann.model_dump(mode="json") - ), + value=(ann.content if isinstance(ann, MiscAnnotation) else ann.model_dump(mode="json")), ) return self - def export_to_dataframe( - self, doc: Optional["DoclingDocument"] = None - ) -> pd.DataFrame: + def export_to_dataframe(self, doc: Optional["DoclingDocument"] = None) -> pd.DataFrame: """Export the table as a Pandas DataFrame.""" if doc is None: - _logger.warning( - "Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated." - ) + _logger.warning("Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated.") if self.data.num_rows == 0 or self.data.num_cols == 0: return pd.DataFrame() @@ -2148,9 +2026,7 @@ def export_to_dataframe( num_headers = 0 for i, row in enumerate(self.data.grid): if len(row) == 0: - raise RuntimeError( - f"Invalid table. {len(row)=} but {self.data.num_cols=}." - ) + raise RuntimeError(f"Invalid table. {len(row)=} but {self.data.num_cols=}.") any_header = False for cell in row: @@ -2175,10 +2051,7 @@ def export_to_dataframe( columns[j] += col_name # Create table data - table_data = [ - [cell._get_text(doc=doc) for cell in row] - for row in self.data.grid[num_headers:] - ] + table_data = [[cell._get_text(doc=doc) for cell in row] for row in self.data.grid[num_headers:]] # Create DataFrame df = pd.DataFrame(table_data, columns=columns) @@ -2197,15 +2070,13 @@ def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str: return text else: _logger.warning( - "Usage of TableItem.export_to_markdown() without `doc` argument is " - "deprecated.", + "Usage of TableItem.export_to_markdown() without `doc` argument is deprecated.", ) table = [] for row in self.data.grid: tmp = [] for col in row: - # make sure that md tables are not broken # due to newline chars in the text text = col._get_text(doc=doc) @@ -2242,8 +2113,7 @@ def export_to_html( return text else: _logger.error( - "Usage of TableItem.export_to_html() without `doc` argument is " - "deprecated.", + "Usage of TableItem.export_to_html() without `doc` argument is deprecated.", ) return "" @@ -2290,9 +2160,7 @@ def export_to_otsl( for i in range(nrows): for j in range(ncols): cell: TableCell = self.data.grid[i][j] - content = cell._get_text( - doc=doc, doc_serializer=doc_serializer, **kwargs - ).strip() + content = cell._get_text(doc=doc, doc_serializer=doc_serializer, **kwargs).strip() rowspan, rowstart = ( cell.row_span, cell.start_row_offset_idx, @@ -2447,13 +2315,9 @@ def validate_links(cls, links, info): for link in links: if link.source_cell_id not in valid_cell_ids: - raise ValueError( - f"Invalid source_cell_id {link.source_cell_id} in GraphLink" - ) + raise ValueError(f"Invalid source_cell_id {link.source_cell_id} in GraphLink") if link.target_cell_id not in valid_cell_ids: - raise ValueError( - f"Invalid target_cell_id {link.target_cell_id} in GraphLink" - ) + raise ValueError(f"Invalid target_cell_id {link.target_cell_id} in GraphLink") return links @@ -2540,9 +2404,7 @@ class DoclingDocument(BaseModel): """DoclingDocument.""" schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument" - version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = ( - 
CURRENT_VERSION - ) + version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = CURRENT_VERSION name: str # The working name of this document, without extensions # (could be taken from originating doc, or just "Untitled 1") origin: Optional[DocumentOrigin] = ( @@ -2559,9 +2421,7 @@ class DoclingDocument(BaseModel): body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = [] groups: List[Union[ListGroup, InlineGroup, GroupItem]] = [] - texts: List[ - Union[TitleItem, SectionHeaderItem, ListItem, CodeItem, FormulaItem, TextItem] - ] = [] + texts: List[Union[TitleItem, SectionHeaderItem, ListItem, CodeItem, FormulaItem, TextItem]] = [] pictures: List[PictureItem] = [] tables: List[TableItem] = [] key_value_items: List[KeyValueItem] = [] @@ -2589,9 +2449,7 @@ def transform_to_content_layer(cls, data: Any) -> Any: # Public Manipulation methods # --------------------------- - def append_child_item( - self, *, child: NodeItem, parent: Optional[NodeItem] = None - ) -> None: + def append_child_item(self, *, child: NodeItem, parent: Optional[NodeItem] = None) -> None: """Adds an item.""" if len(child.children) > 0: raise ValueError("Can not append a child with children") @@ -2601,9 +2459,7 @@ def append_child_item( success, stack = self._get_stack_of_item(item=parent) if not success: - raise ValueError( - f"Could not resolve the parent node in the document tree: {parent}" - ) + raise ValueError(f"Could not resolve the parent node in the document tree: {parent}") # Append the item to the attributes of the doc self._append_item(item=child, parent_ref=parent.get_ref()) @@ -2616,15 +2472,11 @@ def append_child_item( self._pop_item(item=child) raise ValueError(f"Could not append child: {child} to parent: {parent}") - def insert_item_after_sibling( - self, *, new_item: NodeItem, sibling: NodeItem - ) -> None: + def insert_item_after_sibling(self, *, new_item: NodeItem, sibling: NodeItem) -> None: """Inserts an item, given its node_item instance, after other as a sibling.""" self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=True) - def insert_item_before_sibling( - self, *, new_item: NodeItem, sibling: NodeItem - ) -> None: + def insert_item_before_sibling(self, *, new_item: NodeItem, sibling: NodeItem) -> None: """Inserts an item, given its node_item instance, before other as a sibling.""" self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=False) @@ -2672,16 +2524,12 @@ def _get_stack_of_refitem(self, ref: RefItem) -> tuple[bool, list[int]]: return (True, stack) - def _insert_item_at_refitem( - self, item: NodeItem, ref: RefItem, after: bool - ) -> RefItem: + def _insert_item_at_refitem(self, item: NodeItem, ref: RefItem, after: bool) -> RefItem: """Insert node-item using the self-reference.""" success, stack = self._get_stack_of_refitem(ref=ref) if not success: - raise ValueError( - f"Could not insert at {ref.cref}: could not find the stack" - ) + raise ValueError(f"Could not insert at {ref.cref}: could not find the stack") return self._insert_item_at_stack(item=item, stack=stack, after=after) @@ -2780,17 +2628,13 @@ def _pop_item(self, *, item: NodeItem): item_label = path[1] item_index = int(path[2]) - if ( - len(self.__getattribute__(item_label)) == item_index + 1 - ): # we can only pop the last item + if len(self.__getattribute__(item_label)) == item_index + 1: # we can only pop the last item del self.__getattribute__(item_label)[item_index] else: msg = f"index:{item_index}, 
len:{len(self.__getattribute__(item_label))}" raise ValueError(f"Failed to pop: item is not last ({msg})") - def _insert_item_at_stack( - self, item: NodeItem, stack: list[int], after: bool - ) -> RefItem: + def _insert_item_at_stack(self, item: NodeItem, stack: list[int], after: bool) -> RefItem: """Insert node-item using the self-reference.""" parent_ref = self.body._get_parent_ref(doc=self, stack=stack) @@ -2799,16 +2643,12 @@ def _insert_item_at_stack( new_ref = self._append_item(item=item, parent_ref=parent_ref) - success = self.body._add_sibling( - doc=self, stack=stack, new_ref=new_ref, after=after - ) + success = self.body._add_sibling(doc=self, stack=stack, new_ref=new_ref, after=after) if not success: self._pop_item(item=item) - raise ValueError( - f"Could not insert item: {item} under parent: {parent_ref.resolve(doc=self)}" - ) + raise ValueError(f"Could not insert item: {item} under parent: {parent_ref.resolve(doc=self)}") return item.get_ref() @@ -2836,9 +2676,7 @@ def _delete_items(self, refs: list[RefItem]): to_be_deleted_items[tuple(stack)] = ref.cref if len(to_be_deleted_items) < len(refs): - raise ValueError( - f"Cannot find all provided RefItems in doc: {[r.cref for r in refs]}" - ) + raise ValueError(f"Cannot find all provided RefItems in doc: {[r.cref for r in refs]}") # Clean the tree, reverse the order to not have to update for stack_, ref_ in reversed(sorted(to_be_deleted_items.items())): @@ -2857,7 +2695,6 @@ def _delete_items(self, refs: list[RefItem]): for stack_, ref_ in to_be_deleted_items.items(): path = ref_.split("/") if len(path) == 3: - item_label = path[1] item_index = int(path[2]) @@ -2874,22 +2711,16 @@ def _delete_items(self, refs: list[RefItem]): _logger.debug(f"deleting item in doc for {item_label} for {item_index}") del self.__getattribute__(item_label)[item_index] - self._update_breadth_first_with_lookup( - node=self.body, refs_to_be_deleted=refs, lookup=lookup - ) + self._update_breadth_first_with_lookup(node=self.body, refs_to_be_deleted=refs, lookup=lookup) # Update the references - def _update_ref_with_lookup( - self, item_label: str, item_index: int, lookup: dict[str, dict[int, int]] - ) -> RefItem: + def _update_ref_with_lookup(self, item_label: str, item_index: int, lookup: dict[str, dict[int, int]]) -> RefItem: """Update ref with lookup.""" if item_label not in lookup: # Nothing to be done return RefItem(cref=f"#/{item_label}/{item_index}") # Count how many items have been deleted in front of you - delta = sum( - val if item_index >= key else 0 for key, val in lookup[item_label].items() - ) + delta = sum(val if item_index >= key else 0 for key, val in lookup[item_label].items()) new_index = item_index + delta return RefItem(cref=f"#/{item_label}/{new_index}") @@ -2903,10 +2734,7 @@ def _update_refitems_with_lookup( """Update refitems with lookup.""" new_refitems = [] for ref_item in ref_items: - - if ( - ref_item not in refs_to_be_deleted - ): # if ref_item is in ref, then delete/skip them + if ref_item not in refs_to_be_deleted: # if ref_item is in ref, then delete/skip them path = ref_item._split_ref_to_path() if len(path) == 3: new_refitems.append( @@ -2959,17 +2787,13 @@ def _update_breadth_first_with_lookup( if node.parent is not None: path = node.parent._split_ref_to_path() if len(path) == 3: - node.parent = self._update_ref_with_lookup( - item_label=path[1], item_index=int(path[2]), lookup=lookup - ) + node.parent = self._update_ref_with_lookup(item_label=path[1], item_index=int(path[2]), lookup=lookup) # Update the parent reference 
if node.self_ref is not None: path = node.self_ref.split("/") if len(path) == 3: - _ref = self._update_ref_with_lookup( - item_label=path[1], item_index=int(path[2]), lookup=lookup - ) + _ref = self._update_ref_with_lookup(item_label=path[1], item_index=int(path[2]), lookup=lookup) node.self_ref = _ref.cref # Update the child references @@ -2981,9 +2805,7 @@ def _update_breadth_first_with_lookup( for i, child_ref in enumerate(node.children): node = child_ref.resolve(self) - self._update_breadth_first_with_lookup( - node=node, refs_to_be_deleted=refs_to_be_deleted, lookup=lookup - ) + self._update_breadth_first_with_lookup(node=node, refs_to_be_deleted=refs_to_be_deleted, lookup=lookup) ################################### # TODO: refactor add* methods below @@ -3232,7 +3054,6 @@ def add_text( ) else: - if not parent: parent = self.body @@ -3601,9 +3422,7 @@ def add_form( # Node Item Insertion Methods # --------------------------- - def _get_insertion_stack_and_parent( - self, sibling: NodeItem - ) -> tuple[list[int], RefItem]: + def _get_insertion_stack_and_parent(self, sibling: NodeItem) -> tuple[list[int], RefItem]: """Get the stack and parent reference for inserting a new item at a sibling.""" # Get the stack of the sibling sibling_ref = sibling.get_ref() @@ -3611,9 +3430,7 @@ def _get_insertion_stack_and_parent( success, stack = self._get_stack_of_refitem(ref=sibling_ref) if not success: - raise ValueError( - f"Could not insert at {sibling_ref.cref}: could not find the stack" - ) + raise ValueError(f"Could not insert at {sibling_ref.cref}: could not find the stack") # Get the parent RefItem parent_ref = self.body._get_parent_ref(doc=self, stack=stack) @@ -3639,9 +3456,7 @@ def _insert_in_structure( new_ref = item.get_ref() - success = self.body._add_sibling( - doc=self, stack=stack, new_ref=new_ref, after=after - ) + success = self.body._add_sibling(doc=self, stack=stack, new_ref=new_ref, after=after) # Error handling can be determined here if not success: @@ -3650,9 +3465,7 @@ def _insert_in_structure( if created_parent: self.delete_items(node_items=[item.parent.resolve(self)]) - raise ValueError( - f"Could not insert item: {item} under parent: {item.parent.resolve(doc=self)}" - ) + raise ValueError(f"Could not insert item: {item} under parent: {item.parent.resolve(doc=self)}") def insert_list_group( self, @@ -3834,9 +3647,7 @@ def insert_list_item( if content_layer: list_item.content_layer = content_layer - self._insert_in_structure( - item=list_item, stack=stack, after=after, created_parent=set_parent - ) + self._insert_in_structure(item=list_item, stack=stack, after=after, created_parent=set_parent) return list_item @@ -4331,15 +4142,11 @@ def delete_items_range( :returns: None """ - start_parent_ref = ( - start.parent if start.parent is not None else self.body.get_ref() - ) + start_parent_ref = start.parent if start.parent is not None else self.body.get_ref() end_parent_ref = end.parent if end.parent is not None else self.body.get_ref() if start.parent != end.parent: - raise ValueError( - "Start and end NodeItems must have the same parent to delete a range." - ) + raise ValueError("Start and end NodeItems must have the same parent to delete a range.") start_ref = start.get_ref() end_ref = end.get_ref() @@ -4384,24 +4191,18 @@ def extract_items_range( :returns: DoclingDocument: A new document containing the extracted NodeItems and their children """ if not start.parent == end.parent: - raise ValueError( - "Start and end NodeItems must have the same parent to extract a range." 
- ) + raise ValueError("Start and end NodeItems must have the same parent to extract a range.") start_ref = start.get_ref() end_ref = end.get_ref() - start_parent_ref = ( - start.parent if start.parent is not None else self.body.get_ref() - ) + start_parent_ref = start.parent if start.parent is not None else self.body.get_ref() end_parent_ref = end.parent if end.parent is not None else self.body.get_ref() start_parent = start_parent_ref.resolve(doc=self) end_parent = end_parent_ref.resolve(doc=self) - start_index = start_parent.children.index(start_ref) + ( - 0 if start_inclusive else 1 - ) + start_index = start_parent.children.index(start_ref) + (0 if start_inclusive else 1) end_index = end_parent.children.index(end_ref) + (1 if end_inclusive else 0) if start_index > end_index: @@ -4442,9 +4243,7 @@ def insert_document( """ ref_items = doc.body.children node_items = [ref.resolve(doc) for ref in ref_items] - self.insert_node_items( - sibling=sibling, node_items=node_items, doc=doc, after=after - ) + self.insert_node_items(sibling=sibling, node_items=node_items, doc=doc, after=after) def add_document( self, @@ -4488,9 +4287,7 @@ def add_node_items( parent_ref = parent.get_ref() - new_refs = self._append_item_copies( - node_items=node_items, parent_ref=parent_ref, doc=doc - ) + new_refs = self._append_item_copies(node_items=node_items, parent_ref=parent_ref, doc=doc) # Add the new item refs in the document structure @@ -4519,17 +4316,13 @@ def insert_node_items( if not isinstance(parent, ListGroup): for item in node_items: if isinstance(item, ListItem): - raise ValueError( - "Cannot insert ListItem into a non-ListGroup parent." - ) + raise ValueError("Cannot insert ListItem into a non-ListGroup parent.") # Append the NodeItems to the document content parent_ref = parent.get_ref() - new_refs = self._append_item_copies( - node_items=node_items, parent_ref=parent_ref, doc=doc - ) + new_refs = self._append_item_copies(node_items=node_items, parent_ref=parent_ref, doc=doc) # Get the stack of the sibling @@ -4538,23 +4331,17 @@ def insert_node_items( success, stack = self._get_stack_of_refitem(ref=sibling_ref) if not success: - raise ValueError( - f"Could not insert at {sibling_ref.cref}: could not find the stack" - ) + raise ValueError(f"Could not insert at {sibling_ref.cref}: could not find the stack") # Insert the new item refs in the document structure reversed_new_refs = new_refs[::-1] for ref in reversed_new_refs: - success = self.body._add_sibling( - doc=self, stack=stack, new_ref=ref, after=after - ) + success = self.body._add_sibling(doc=self, stack=stack, new_ref=ref, after=after) if not success: - raise ValueError( - f"Could not insert item {ref.cref} at {sibling.get_ref().cref}" - ) + raise ValueError(f"Could not insert item {ref.cref} at {sibling.get_ref().cref}") def _append_item_copies( self, @@ -4605,8 +4392,7 @@ def validate_tree(self, root: NodeItem) -> bool: if isinstance(root, TableItem): for cell in root.data.table_cells: if isinstance(cell, RichTableCell) and ( - (par_ref := cell.ref.resolve(self).parent) is None - or par_ref.resolve(self) != root + (par_ref := cell.ref.resolve(self).parent) is None or par_ref.resolve(self) != root ): return False @@ -4641,11 +4427,7 @@ def _iterate_items_with_stack( _stack: Optional[list[int]] = None, ) -> typing.Iterable[Tuple[NodeItem, list[int]]]: # tuple of node and level """Iterate elements with stack.""" - my_layers = ( - included_content_layers - if included_content_layers is not None - else DEFAULT_CONTENT_LAYERS - ) + my_layers = 
included_content_layers if included_content_layers is not None else DEFAULT_CONTENT_LAYERS my_stack: list[int] = _stack if _stack is not None else [] if not root: @@ -4658,10 +4440,7 @@ def _iterate_items_with_stack( (not isinstance(root, GroupItem) or with_groups) and ( not isinstance(root, DocItem) - or ( - page_nrs is None - or any(prov.page_no in page_nrs for prov in root.prov) - ) + or (page_nrs is None or any(prov.page_no in page_nrs for prov in root.prov)) ) and root.content_layer in my_layers ) @@ -4672,9 +4451,7 @@ def _iterate_items_with_stack( my_stack.append(-1) allowed_pic_refs: set[str] = ( - {r.cref for r in root.captions} - if (root_is_picture := isinstance(root, PictureItem)) - else set() + {r.cref for r in root.captions} if (root_is_picture := isinstance(root, PictureItem)) else set() ) # Traverse children @@ -4737,12 +4514,8 @@ def _with_embedded_pictures(self) -> "DoclingDocument": for ix, (item, level) in enumerate(result.iterate_items(with_groups=True)): if isinstance(item, PictureItem): - if item.image is not None: - if ( - isinstance(item.image.uri, AnyUrl) - and item.image.uri.scheme == "file" - ): + if isinstance(item.image.uri, AnyUrl) and item.image.uri.scheme == "file": assert isinstance(item.image.uri.path, str) tmp_image = PILImage.open(str(unquote(item.image.uri.path))) item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi) @@ -4774,7 +4547,6 @@ def _with_pictures_refs( if isinstance(item, PictureItem): img = item.get_image(doc=self) if img is not None: - hexhash = PictureItem._image_to_hexhash(img) # loc_path = image_dir / f"image_{img_count:06}.png" @@ -4792,9 +4564,7 @@ def _with_pictures_refs( if item.image is None: scale = img.size[0] / item.prov[0].bbox.width - item.image = ImageRef.from_pil( - image=img, dpi=round(72 * scale) - ) + item.image = ImageRef.from_pil(image=img, dpi=round(72 * scale)) item.image.uri = Path(obj_path) # if item.image._pil is not None: @@ -4821,7 +4591,7 @@ def print_element_tree(self): elif isinstance(item, TextItem): print( " " * level, - f"{ix}: {item.label.value}: {item.text[:min(len(item.text), 100)]}", + f"{ix}: {item.label.value}: {item.text[: min(len(item.text), 100)]}", ) elif isinstance(item, DocItem): @@ -4838,14 +4608,9 @@ def export_to_element_tree(self) -> str: ) ): if isinstance(item, GroupItem): - texts.append( - " " * level + f"{ix}: {item.label.value} with name={item.name}" - ) + texts.append(" " * level + f"{ix}: {item.label.value} with name={item.name}") elif isinstance(item, TextItem): - texts.append( - " " * level - + f"{ix}: {item.label.value}: {item.text[:min(len(item.text), 100)]}" - ) + texts.append(" " * level + f"{ix}: {item.label.value}: {item.text[: min(len(item.text), 100)]}") elif isinstance(item, DocItem): texts.append(" " * level + f"{ix}: {item.label.value}") @@ -4868,13 +4633,9 @@ def save_as_json( if image_mode == ImageRefMode.REFERENCED: os.makedirs(artifacts_dir, exist_ok=True) - new_doc = self._make_copy_with_refmode( - artifacts_dir, image_mode, page_no=None, reference_path=reference_path - ) + new_doc = self._make_copy_with_refmode(artifacts_dir, image_mode, page_no=None, reference_path=reference_path) - out = new_doc.export_to_dict( - coord_precision=coord_precision, confid_precision=confid_precision - ) + out = new_doc.export_to_dict(coord_precision=coord_precision, confid_precision=confid_precision) with open(filename, "w", encoding="utf-8") as fw: json.dump(out, fw, indent=indent) @@ -4911,13 +4672,9 @@ def save_as_yaml( if image_mode == ImageRefMode.REFERENCED: 
os.makedirs(artifacts_dir, exist_ok=True) - new_doc = self._make_copy_with_refmode( - artifacts_dir, image_mode, page_no=None, reference_path=reference_path - ) + new_doc = self._make_copy_with_refmode(artifacts_dir, image_mode, page_no=None, reference_path=reference_path) - out = new_doc.export_to_dict( - coord_precision=coord_precision, confid_precision=confid_precision - ) + out = new_doc.export_to_dict(coord_precision=coord_precision, confid_precision=confid_precision) with open(filename, "w", encoding="utf-8") as fw: yaml.dump(out, fw, default_flow_style=default_flow_style) @@ -4951,9 +4708,7 @@ def export_to_dict( context[PydanticSerCtxKey.COORD_PREC.value] = coord_precision if confid_precision is not None: context[PydanticSerCtxKey.CONFID_PREC.value] = confid_precision - out = self.model_dump( - mode=mode, by_alias=by_alias, exclude_none=exclude_none, context=context - ) + out = self.model_dump(mode=mode, by_alias=by_alias, exclude_none=exclude_none, context=context) return out @@ -4988,9 +4743,7 @@ def save_as_markdown( if image_mode == ImageRefMode.REFERENCED: os.makedirs(artifacts_dir, exist_ok=True) - new_doc = self._make_copy_with_refmode( - artifacts_dir, image_mode, page_no, reference_path=reference_path - ) + new_doc = self._make_copy_with_refmode(artifacts_dir, image_mode, page_no, reference_path=reference_path) md_out = new_doc.export_to_markdown( delim=delim, @@ -5015,7 +4768,7 @@ def save_as_markdown( with open(filename, "w", encoding="utf-8") as fw: fw.write(md_out) - def export_to_markdown( # noqa: C901 + def export_to_markdown( self, delim: str = "\n\n", from_element: int = 0, @@ -5101,11 +4854,7 @@ def export_to_markdown( # noqa: C901 ) my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS - my_layers = ( - included_content_layers - if included_content_layers is not None - else DEFAULT_CONTENT_LAYERS - ) + my_layers = included_content_layers if included_content_layers is not None else DEFAULT_CONTENT_LAYERS if use_legacy_annotations is not None: warnings.warn( @@ -5149,7 +4898,7 @@ def export_to_markdown( # noqa: C901 return ser_res.text - def export_to_text( # noqa: C901 + def export_to_text( self, delim: str = "\n\n", from_element: int = 0, @@ -5194,9 +4943,7 @@ def save_as_html( if image_mode == ImageRefMode.REFERENCED: os.makedirs(artifacts_dir, exist_ok=True) - new_doc = self._make_copy_with_refmode( - artifacts_dir, image_mode, page_no, reference_path=reference_path - ) + new_doc = self._make_copy_with_refmode(artifacts_dir, image_mode, page_no, reference_path=reference_path) html_out = new_doc.export_to_html( from_element=from_element, @@ -5243,16 +4990,14 @@ def _make_copy_with_refmode( if image_mode == ImageRefMode.PLACEHOLDER: new_doc = self elif image_mode == ImageRefMode.REFERENCED: - new_doc = self._with_pictures_refs( - image_dir=artifacts_dir, page_no=page_no, reference_path=reference_path - ) + new_doc = self._with_pictures_refs(image_dir=artifacts_dir, page_no=page_no, reference_path=reference_path) elif image_mode == ImageRefMode.EMBEDDED: new_doc = self._with_embedded_pictures() else: raise ValueError("Unsupported ImageRefMode") return new_doc - def export_to_html( # noqa: C901 + def export_to_html( self, from_element: int = 0, to_element: int = sys.maxsize, @@ -5275,11 +5020,7 @@ def export_to_html( # noqa: C901 ) my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS - my_layers = ( - included_content_layers - if included_content_layers is not None - else DEFAULT_CONTENT_LAYERS - ) + my_layers = 
included_content_layers if included_content_layers is not None else DEFAULT_CONTENT_LAYERS output_style = HTMLOutputStyle.SINGLE_COLUMN if split_page_view: @@ -5446,8 +5187,8 @@ def parse_key_value_item( re.DOTALL, ) - cells: List["GraphCell"] = [] - links: List["GraphLink"] = [] + cells: List[GraphCell] = [] + links: List[GraphLink] = [] raw_link_predictions = [] for cell_match in cell_pattern.finditer(tokens): @@ -5680,9 +5421,7 @@ def _add_text( if caption is not None and caption_bbox is not None: caption.prov.append( ProvenanceItem( - bbox=caption_bbox.resize_by_scale( - pg_width, pg_height - ), + bbox=caption_bbox.resize_by_scale(pg_width, pg_height), charspan=(0, len(caption.text)), page_no=page_no, ) @@ -5707,10 +5446,7 @@ def _add_text( chart_data=table_data, ) - if ( - pic_classification is not None - or pic_tabular_chart is not None - ): + if pic_classification is not None or pic_tabular_chart is not None: pic.meta = PictureMeta( classification=pic_classification, tabular_chart=pic_tabular_chart, @@ -5720,17 +5456,13 @@ def _add_text( # In case we don't have access to an binary of an image pic = doc.add_picture( parent=None, - prov=ProvenanceItem( - bbox=bbox, charspan=(0, 0), page_no=page_no - ), + prov=ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=page_no), ) # If there is a caption to an image, add it as well if caption is not None and caption_bbox is not None: caption.prov.append( ProvenanceItem( - bbox=caption_bbox.resize_by_scale( - pg_width, pg_height - ), + bbox=caption_bbox.resize_by_scale(pg_width, pg_height), charspan=(0, len(caption.text)), page_no=page_no, ) @@ -5742,23 +5474,17 @@ def _add_text( provenance="load_from_doctags", predicted_classes=[ # chart_type - PictureClassificationClass( - class_name=chart_type, confidence=1.0 - ) + PictureClassificationClass(class_name=chart_type, confidence=1.0) ], ) ) if table_data is not None: # Add chart data as PictureTabularChartData - pd = PictureTabularChartData( - chart_data=table_data, title=pic_title - ) + pd = PictureTabularChartData(chart_data=table_data, title=pic_title) pic.annotations.append(pd) elif tag_name == DocItemLabel.KEY_VALUE_REGION: - key_value_data, kv_item_prov = parse_key_value_item( - full_chunk, image - ) + key_value_data, kv_item_prov = parse_key_value_item(full_chunk, image) doc.add_key_values(graph=key_value_data, prov=kv_item_prov) elif tag_name in [ DocumentToken.ORDERED_LIST.value, @@ -5770,9 +5496,7 @@ def _add_text( if tag_name == DocumentToken.ORDERED_LIST.value: GroupLabel.ORDERED_LIST - list_item_pattern = ( - rf"<(?P{DocItemLabel.LIST_ITEM})>.*?" - ) + list_item_pattern = rf"<(?P{DocItemLabel.LIST_ITEM})>.*?" 
li_pattern = re.compile(list_item_pattern, re.DOTALL) # Add list group: new_list = doc.add_list_group(name="list") @@ -5865,7 +5589,7 @@ def export_to_document_tokens(self, *args, **kwargs): r"""Export to DocTags format.""" return self.export_to_doctags(*args, **kwargs) - def export_to_doctags( # noqa: C901 + def export_to_doctags( self, delim: str = "", # deprecated from_element: int = 0, @@ -5925,11 +5649,7 @@ def export_to_doctags( # noqa: C901 add_table_cell_location=add_table_cell_location, add_table_cell_text=add_table_cell_text, pages=pages, - mode=( - DocTagsParams.Mode.MINIFIED - if minified - else DocTagsParams.Mode.HUMAN_FRIENDLY - ), + mode=(DocTagsParams.Mode.MINIFIED if minified else DocTagsParams.Mode.HUMAN_FRIENDLY), ), ) ser_res = serializer.serialize() @@ -5945,7 +5665,6 @@ def _export_to_indented_text( result = [] def get_text(text: str, max_text_len: int): - middle = " ... " if max_text_len == -1: @@ -5960,48 +5679,34 @@ def get_text(text: str, max_text_len: int): for i, (item, level) in enumerate(self.iterate_items(with_groups=True)): if isinstance(item, GroupItem): - result.append( - indent * level - + f"item-{i} at level {level}: {item.label}: group {item.name}" - ) + result.append(indent * level + f"item-{i} at level {level}: {item.label}: group {item.name}") elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]: text = get_text(text=item.text, max_text_len=max_text_len) - result.append( - indent * level + f"item-{i} at level {level}: {item.label}: {text}" - ) + result.append(indent * level + f"item-{i} at level {level}: {item.label}: {text}") elif isinstance(item, SectionHeaderItem): text = get_text(text=item.text, max_text_len=max_text_len) - result.append( - indent * level + f"item-{i} at level {level}: {item.label}: {text}" - ) + result.append(indent * level + f"item-{i} at level {level}: {item.label}: {text}") elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]: text = get_text(text=item.text, max_text_len=max_text_len) - result.append( - indent * level + f"item-{i} at level {level}: {item.label}: {text}" - ) + result.append(indent * level + f"item-{i} at level {level}: {item.label}: {text}") elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]: text = get_text(text=item.text, max_text_len=max_text_len) - result.append( - indent * level + f"item-{i} at level {level}: {item.label}: {text}" - ) + result.append(indent * level + f"item-{i} at level {level}: {item.label}: {text}") elif isinstance(item, TextItem): text = get_text(text=item.text, max_text_len=max_text_len) - result.append( - indent * level + f"item-{i} at level {level}: {item.label}: {text}" - ) + result.append(indent * level + f"item-{i} at level {level}: {item.label}: {text}") elif isinstance(item, TableItem): - result.append( indent * level + f"item-{i} at level {level}: {item.label} with " @@ -6011,9 +5716,7 @@ def get_text(text: str, max_text_len: int): for _ in item.captions: caption = _.resolve(self) result.append( - indent * (level + 1) - + f"item-{i} at level {level + 1}: {caption.label}: " - + f"{caption.text}" + indent * (level + 1) + f"item-{i} at level {level + 1}: {caption.label}: " + f"{caption.text}" ) if explicit_tables: @@ -6022,38 +5725,26 @@ def get_text(text: str, max_text_len: int): grid.append([]) for j, cell in enumerate(row): if j < 10: - text = get_text( - cell._get_text(doc=self), max_text_len=16 - ) + text = get_text(cell._get_text(doc=self), max_text_len=16) grid[-1].append(text) result.append("\n" + tabulate(grid) + 
"\n") elif isinstance(item, PictureItem): - - result.append( - indent * level + f"item-{i} at level {level}: {item.label}" - ) + result.append(indent * level + f"item-{i} at level {level}: {item.label}") for _ in item.captions: caption = _.resolve(self) result.append( - indent * (level + 1) - + f"item-{i} at level {level + 1}: {caption.label}: " - + f"{caption.text}" + indent * (level + 1) + f"item-{i} at level {level + 1}: {caption.label}: " + f"{caption.text}" ) elif isinstance(item, DocItem): - result.append( - indent * (level + 1) - + f"item-{i} at level {level}: {item.label}: ignored" - ) + result.append(indent * (level + 1) + f"item-{i} at level {level}: {item.label}: ignored") return "\n".join(result) - def add_page( - self, page_no: int, size: Size, image: Optional[ImageRef] = None - ) -> PageItem: + def add_page(self, page_no: int, size: Size, image: Optional[ImageRef] = None) -> PageItem: """add_page. :param page_no: int: @@ -6139,9 +5830,7 @@ def check_version_is_compatible(cls, v: str) -> str: or doc_match["major"] != sdk_match["major"] or doc_match["minor"] > sdk_match["minor"] ): - raise ValueError( - f"Doc version {v} incompatible with SDK schema version {CURRENT_VERSION}" - ) + raise ValueError(f"Doc version {v} incompatible with SDK schema version {CURRENT_VERSION}") else: return CURRENT_VERSION @@ -6151,9 +5840,7 @@ def validate_document(self) -> Self: with warnings.catch_warnings(): # ignore warning from deprecated furniture warnings.filterwarnings("ignore", category=DeprecationWarning) - if not self.validate_tree(self.body) or not self.validate_tree( - self.furniture - ): + if not self.validate_tree(self.body) or not self.validate_tree(self.furniture): raise ValueError("Document hierachy is inconsistent.") return self @@ -6170,8 +5857,7 @@ def validate_misplaced_list_items(self) -> Self: with_groups=True, # so that we can distinguish neighboring lists ): if isinstance(item, ListItem) and ( - item.parent is None - or not isinstance(item.parent.resolve(doc=self), ListGroup) + item.parent is None or not isinstance(item.parent.resolve(doc=self), ListGroup) ): if isinstance(prev, ListItem) and ( prev.parent is None or prev.parent.resolve(self) == self.body @@ -6182,7 +5868,6 @@ def validate_misplaced_list_items(self) -> Self: prev = item for curr_list_items in reversed(misplaced_list_items): - # add group new_group = ListGroup(self_ref="#") self.insert_item_before_sibling( @@ -6227,16 +5912,9 @@ class _DocIndex(BaseModel): def get_item_list(self, key: str) -> list[NodeItem]: return getattr(self, key) - def index( - self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None - ) -> None: - - if page_nrs is not None and ( - unavailable_page_nrs := page_nrs - set(doc.pages.keys()) - ): - raise ValueError( - f"The following page numbers are not present in the document: {unavailable_page_nrs}" - ) + def index(self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None) -> None: + if page_nrs is not None and (unavailable_page_nrs := page_nrs - set(doc.pages.keys())): + raise ValueError(f"The following page numbers are not present in the document: {unavailable_page_nrs}") orig_ref_to_new_ref: dict[str, str] = {} page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0 @@ -6255,9 +5933,7 @@ def index( ): key = item.self_ref.split("/")[1] is_body = key == "body" - new_cref = ( - "#/body" if is_body else f"#/{key}/{len(self.get_item_list(key))}" - ) + new_cref = "#/body" if is_body else f"#/{key}/{len(self.get_item_list(key))}" # register cref mapping: 
orig_ref_to_new_ref[item.self_ref] = new_cref @@ -6281,15 +5957,10 @@ def index( # set item's parent new_parent_cref = orig_ref_to_new_ref.get(item.parent.cref) if new_parent_cref is None: - parent_ref = item.parent while new_parent_cref is None and parent_ref is not None: - parent_ref = RefItem( - cref=parent_ref.resolve(doc).parent.cref - ) - new_parent_cref = orig_ref_to_new_ref.get( - parent_ref.cref - ) + parent_ref = RefItem(cref=parent_ref.resolve(doc).parent.cref) + new_parent_cref = orig_ref_to_new_ref.get(parent_ref.cref) if new_parent_cref is not None: warnings.warn( @@ -6297,9 +5968,7 @@ def index( f"using ancestor {new_parent_cref} instead" ) else: - warnings.warn( - "No ancestor found in indexed nodes, using body as parent" - ) + warnings.warn("No ancestor found in indexed nodes, using body as parent") new_parent_cref = "#/body" new_item.parent = RefItem(cref=new_parent_cref) @@ -6316,27 +5985,20 @@ def index( if isinstance(parent_item, FloatingItem): for cap_it, cap in enumerate(parent_item.captions): if cap.cref == item.self_ref: - parent_item.captions[cap_it] = RefItem( - cref=new_cref - ) + parent_item.captions[cap_it] = RefItem(cref=new_cref) break # update rich table cells references: if isinstance(parent_item, TableItem): for cell in parent_item.data.table_cells: - if ( - isinstance(cell, RichTableCell) - and cell.ref.cref == item.self_ref - ): + if isinstance(cell, RichTableCell) and cell.ref.cref == item.self_ref: cell.ref.cref = new_cref break elif num_components == 2 and path_components[1] == "body": parent_item = self._body else: - raise RuntimeError( - f"Unsupported ref format: {new_parent_cref}" - ) + raise RuntimeError(f"Unsupported ref format: {new_parent_cref}") parent_item.children.append(RefItem(cref=new_cref)) # update pages @@ -6392,7 +6054,6 @@ def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument": return res_doc def _validate_rules(self, raise_on_error: bool = True): - def _handle(error: Exception): if raise_on_error: raise error @@ -6405,9 +6066,7 @@ def validate_furniture(doc: DoclingDocument): has_furniture_children = len(doc.furniture.children) > 0 if has_furniture_children: _handle( - ValueError( - f"Deprecated furniture node {doc.furniture.self_ref} has children" - ), + ValueError(f"Deprecated furniture node {doc.furniture.self_ref} has children"), ) def validate_list_group(doc: DoclingDocument, item: ListGroup): @@ -6427,15 +6086,11 @@ def validate_list_item(doc: DoclingDocument, item: ListItem): ) elif not isinstance(item.parent.resolve(doc), ListGroup): _handle( - ValueError( - f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}" - ), + ValueError(f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"), ) def validate_group(doc: DoclingDocument, item: GroupItem): - if ( - item.parent and not item.children - ): # tolerate empty body, but not other groups + if item.parent and not item.children: # tolerate empty body, but not other groups _handle( ValueError(f"Group {item.self_ref} has no children"), ) @@ -6460,12 +6115,8 @@ def add_table_cell(self, table_item: TableItem, cell: TableCell) -> None: """Add a table cell to the table.""" if isinstance(cell, RichTableCell): item = cell.ref.resolve(doc=self) - if isinstance(item, NodeItem) and ( - (not item.parent) or item.parent.cref != table_item.self_ref - ): - raise ValueError( - f"Trying to add cell with another parent {item.parent} to {table_item.self_ref}" - ) + if isinstance(item, NodeItem) and ((not item.parent) or item.parent.cref != 
table_item.self_ref): + raise ValueError(f"Trying to add cell with another parent {item.parent} to {table_item.self_ref}") table_item.data.table_cells.append(cell) diff --git a/docling_core/types/doc/labels.py b/docling_core/types/doc/labels.py index 835cea42..76223a5e 100644 --- a/docling_core/types/doc/labels.py +++ b/docling_core/types/doc/labels.py @@ -74,9 +74,7 @@ class GroupLabel(str, Enum): """GroupLabel.""" UNSPECIFIED = "unspecified" - LIST = ( - "list" # group label for list container (not the list-items) (e.g. HTML
<ul>) - ) + LIST = "list" # group label for list container (not the list-items) (e.g. HTML <ul>
        ) ORDERED_LIST = "ordered_list" # deprecated CHAPTER = "chapter" SECTION = "section" diff --git a/docling_core/types/doc/page.py b/docling_core/types/doc/page.py index 7fc67ddf..0c4dfda9 100644 --- a/docling_core/types/doc/page.py +++ b/docling_core/types/doc/page.py @@ -332,9 +332,7 @@ class PdfCellRenderingMode(int, Enum): class PdfTextCell(TextCell): """Specialized text cell for PDF documents with font information.""" - rendering_mode: ( - PdfCellRenderingMode # Turn into enum (PDF32000 Text Rendering Mode) - ) + rendering_mode: PdfCellRenderingMode # Turn into enum (PDF32000 Text Rendering Mode) widget: bool # Determines if this belongs to fillable PDF field. font_key: str @@ -347,9 +345,7 @@ class PdfTextCell(TextCell): def update_ltr_property(cls, data: dict) -> dict: """Update text direction property from left_to_right flag.""" if "left_to_right" in data: - data["text_direction"] = ( - "left_to_right" if data["left_to_right"] else "right_to_left" - ) + data["text_direction"] = "left_to_right" if data["left_to_right"] else "right_to_left" # if "ordering" in data: # data["index"] = data["ordering"] return data @@ -395,7 +391,7 @@ def iterate_segments( self, ) -> Iterator[Tuple[Coord2D, Coord2D]]: """Iterate through line segments defined by consecutive point pairs.""" - for k in range(0, len(self.points) - 1): + for k in range(len(self.points) - 1): yield (self.points[k], self.points[k + 1]) def to_bottom_left_origin(self, page_height: float): @@ -619,9 +615,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "SegmentedPdfPage": with open(filename, "r", encoding="utf-8") as f: return cls.model_validate_json(f.read()) - def crop_text( - self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0 - ) -> str: + def crop_text(self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0) -> str: """Extract text from cells within the specified bounding box. 
Args: @@ -633,16 +627,9 @@ def crop_text( """ selection = [] for page_cell in self.iterate_cells(cell_unit): - cell_bbox = page_cell.rect.to_bottom_left_origin( - page_height=self.dimension.height - ).to_bounding_box() - - if ( - bbox.l <= cell_bbox.l - and cell_bbox.r <= bbox.r - and bbox.b <= cell_bbox.b - and cell_bbox.t <= bbox.t - ): + cell_bbox = page_cell.rect.to_bottom_left_origin(page_height=self.dimension.height).to_bounding_box() + + if bbox.l <= cell_bbox.l and cell_bbox.r <= bbox.r and bbox.b <= cell_bbox.b and cell_bbox.t <= bbox.t: selection.append(page_cell.copy()) selection = sorted(selection, key=lambda x: x.index) @@ -654,10 +641,7 @@ def crop_text( else: prev = selection[i - 1] - if ( - abs(cell.rect.r_x0 - prev.rect.r_x1) < eps - and abs(cell.rect.r_y0 - prev.rect.r_y1) < eps - ): + if abs(cell.rect.r_x0 - prev.rect.r_x1) < eps and abs(cell.rect.r_y0 - prev.rect.r_y1) < eps: text += cell.text else: text += " " @@ -801,9 +785,7 @@ def render_as_image( page_height = page_bbox.height # Create a blank white image with RGBA mode - result = PILImage.new( - "RGBA", (round(page_width), round(page_height)), (255, 255, 255, 255) - ) + result = PILImage.new("RGBA", (round(page_width), round(page_height)), (255, 255, 255, 255)) draw = ImageDraw.Draw(result) # Draw each rectangle by connecting its four points @@ -817,9 +799,7 @@ def render_as_image( ) if draw_cells_text: - result = self._render_cells_text( - cell_unit=cell_unit, img=result, page_height=page_height - ) + result = self._render_cells_text(cell_unit=cell_unit, img=result, page_height=page_height) elif draw_cells_bbox: self._render_cells_bbox( @@ -902,16 +882,10 @@ def _render_bitmap_resources( Updated ImageDraw object """ for bitmap_resource in self.bitmap_resources: - poly = bitmap_resource.rect.to_top_left_origin( - page_height=page_height - ).to_polygon() + poly = bitmap_resource.rect.to_top_left_origin(page_height=page_height).to_polygon() - fill = self._get_rgba( - name=bitmap_resources_fill, alpha=bitmap_resources_alpha - ) - outline = self._get_rgba( - name=bitmap_resources_outline, alpha=bitmap_resources_alpha - ) + fill = self._get_rgba(name=bitmap_resources_fill, alpha=bitmap_resources_alpha) + outline = self._get_rgba(name=bitmap_resources_outline, alpha=bitmap_resources_alpha) draw.polygon(poly, outline=outline, fill=fill) @@ -944,9 +918,7 @@ def _render_cells_bbox( # Draw each rectangle by connecting its four points for page_cell in self.iterate_cells(unit_type=cell_unit): - poly = page_cell.rect.to_top_left_origin( - page_height=page_height - ).to_polygon() + poly = page_cell.rect.to_top_left_origin(page_height=page_height).to_polygon() draw.polygon(poly, outline=outline, fill=fill) return draw @@ -995,9 +967,7 @@ def _draw_text_in_rectangle( _, _, text_width, text_height = tmp_draw.textbbox((0, 0), text=text, font=font) # Create a properly sized temporary image - text_img = PILImage.new( - "RGBA", (round(text_width), round(text_height)), (255, 255, 255, 255) - ) + text_img = PILImage.new("RGBA", (round(text_width), round(text_height)), (255, 255, 255, 255)) text_draw = ImageDraw.Draw(text_img) text_draw.text((0, 0), text, font=font, fill=(0, 0, 0, 255)) @@ -1017,9 +987,7 @@ def _draw_text_in_rectangle( return img - def _render_cells_text( - self, cell_unit: TextCellUnit, img: PILImage.Image, page_height: float - ) -> PILImage.Image: + def _render_cells_text(self, cell_unit: TextCellUnit, img: PILImage.Image, page_height: float) -> PILImage.Image: """Render text content of cells on the image. 
Args: @@ -1070,9 +1038,7 @@ def _draw_cells_bl( # Draw each rectangle by connecting its four points for page_cell in self.iterate_cells(unit_type=cell_unit): - poly = page_cell.rect.to_top_left_origin( - page_height=page_height - ).to_polygon() + poly = page_cell.rect.to_top_left_origin(page_height=page_height).to_polygon() # Define the bounding box for the dot dot_bbox = [ (poly[0][0] - cell_bl_radius, poly[0][1] - cell_bl_radius), @@ -1113,9 +1079,7 @@ def _draw_cells_tr( # Draw each rectangle by connecting its four points for page_cell in self.iterate_cells(unit_type=cell_unit): - poly = page_cell.rect.to_top_left_origin( - page_height=page_height - ).to_polygon() + poly = page_cell.rect.to_top_left_origin(page_height=page_height).to_polygon() # Define the bounding box for the dot dot_bbox = [ (poly[0][0] - cell_tr_radius, poly[0][1] - cell_tr_radius), @@ -1181,9 +1145,7 @@ def initialise(self): for _ in matches: namespace_open, tag_open, content, namespace_close, tag_close = _ if namespace_open == namespace_close and tag_open == tag_close: - _logger.debug( - f"Namespace: {namespace_open}, Tag: {tag_open}, Content: {content}" - ) + _logger.debug(f"Namespace: {namespace_open}, Tag: {tag_open}, Content: {content}") self.data[tag_open] = content @@ -1253,8 +1215,7 @@ def iterate_pages( Returns: Iterator of (page number, page) tuples """ - for page_no, page in self.pages.items(): - yield (page_no, page) + yield from self.pages.items() def export_to_dict( self, diff --git a/docling_core/types/doc/tokens.py b/docling_core/types/doc/tokens.py index 6b4f6919..81d831ba 100644 --- a/docling_core/types/doc/tokens.py +++ b/docling_core/types/doc/tokens.py @@ -206,7 +206,7 @@ def get_special_tokens( special_tokens.extend(TableToken.get_special_tokens()) # Adding dynamically generated location-tokens - for i in range(0, max(page_dimension[0], page_dimension[1])): + for i in range(max(page_dimension[0], page_dimension[1])): special_tokens.append(f"<{_LOC_PREFIX}{i}>") return special_tokens @@ -265,9 +265,7 @@ def get_code_language_token(code_language: str, self_closing: bool = False) -> s return _CodeLanguageToken(f"<_{code_language}_>").value @staticmethod - def get_location_token( - val: float, rnorm: int = 500, self_closing: bool = False - ): # TODO review + def get_location_token(val: float, rnorm: int = 500, self_closing: bool = False): # TODO review """Function to get location tokens.""" val_ = round(rnorm * val) val_ = max(val_, 0) @@ -292,18 +290,10 @@ def get_location( x1 = bbox[2] / page_w y1 = bbox[3] / page_h - x0_tok = DocumentToken.get_location_token( - val=min(x0, x1), rnorm=xsize, self_closing=self_closing - ) - y0_tok = DocumentToken.get_location_token( - val=min(y0, y1), rnorm=ysize, self_closing=self_closing - ) - x1_tok = DocumentToken.get_location_token( - val=max(x0, x1), rnorm=xsize, self_closing=self_closing - ) - y1_tok = DocumentToken.get_location_token( - val=max(y0, y1), rnorm=ysize, self_closing=self_closing - ) + x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize, self_closing=self_closing) + y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize, self_closing=self_closing) + x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize, self_closing=self_closing) + y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize, self_closing=self_closing) loc_str = f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}" diff --git a/docling_core/types/doc/utils.py b/docling_core/types/doc/utils.py index c4e517a5..a105ae14 100644 --- 
a/docling_core/types/doc/utils.py +++ b/docling_core/types/doc/utils.py @@ -53,9 +53,7 @@ def relative_path(src: Path, target: Path) -> Path: return Path(*up_segments, *down_segments) -def get_html_tag_with_text_direction( - html_tag: str, text: str, attrs: Optional[dict] = None -) -> str: +def get_html_tag_with_text_direction(html_tag: str, text: str, attrs: Optional[dict] = None) -> str: """Form the HTML element with tag, text, and optional dir attribute.""" my_attrs = attrs or {} if (dir := my_attrs.get("dir")) is not None and dir != "ltr": @@ -63,10 +61,7 @@ def get_html_tag_with_text_direction( pieces: list[str] = [html_tag] if my_attrs: attrs_str = " ".join( - [ - f'{html.escape(k, quote=False)}="{html.escape(my_attrs[k], quote=False)}"' - for k in my_attrs - ] + [f'{html.escape(k, quote=False)}="{html.escape(my_attrs[k], quote=False)}"' for k in my_attrs] ) pieces.append(attrs_str) return f"<{' '.join(pieces)}>{text}" @@ -80,12 +75,7 @@ def get_text_direction(text: str) -> str: rtl_scripts = {"R", "AL"} rtl_chars = sum(unicodedata.bidirectional(c) in rtl_scripts for c in text) - return ( - "rtl" - if unicodedata.bidirectional(text[0]) in rtl_scripts - or rtl_chars > len(text) / 2 - else "ltr" - ) + return "rtl" if unicodedata.bidirectional(text[0]) in rtl_scripts or rtl_chars > len(text) / 2 else "ltr" def otsl_extract_tokens_and_text(s: str) -> Tuple[List[str], List[str]]: @@ -128,9 +118,7 @@ def otsl_extract_tokens_and_text(s: str) -> Tuple[List[str], List[str]]: return tokens, text_parts -def otsl_parse_texts( - texts: List[str], tokens: List[str] -) -> Tuple[List["TableCell"], List[List[str]]]: +def otsl_parse_texts(texts: List[str], tokens: List[str]) -> Tuple[List["TableCell"], List[List[str]]]: """Parse OTSL texts and tokens into table cells.""" from docling_core.types.doc.document import TableCell @@ -151,19 +139,13 @@ def otsl_parse_texts( ]: clean_tokens.append(t) tokens = clean_tokens - split_row_tokens = [ - list(y) - for x, y in itertools.groupby(tokens, lambda z: z == split_word) - if not x - ] + split_row_tokens = [list(y) for x, y in itertools.groupby(tokens, lambda z: z == split_word) if not x] table_cells = [] r_idx = 0 c_idx = 0 - def count_right( - tokens: List[List[str]], c_idx: int, r_idx: int, which_tokens: List[str] - ) -> int: + def count_right(tokens: List[List[str]], c_idx: int, r_idx: int, which_tokens: List[str]) -> int: span = 0 c_idx_iter = c_idx while tokens[r_idx][c_idx_iter] in which_tokens: @@ -173,9 +155,7 @@ def count_right( return span return span - def count_down( - tokens: List[List[str]], c_idx: int, r_idx: int, which_tokens: List[str] - ) -> int: + def count_down(tokens: List[List[str]], c_idx: int, r_idx: int, which_tokens: List[str]) -> int: span = 0 r_idx_iter = r_idx while tokens[r_idx_iter][c_idx] in which_tokens: diff --git a/docling_core/types/gen/generic.py b/docling_core/types/gen/generic.py index 7596bf4e..3fa5a7a5 100644 --- a/docling_core/types/gen/generic.py +++ b/docling_core/types/gen/generic.py @@ -21,8 +21,6 @@ class Generic(AliasModel): file_info: FileInfoObject = Field( title="Document information", - description=( - "Minimal identification information of the document within a collection." 
- ), + description=("Minimal identification information of the document within a collection."), alias="file-info", ) diff --git a/docling_core/types/legacy_doc/base.py b/docling_core/types/legacy_doc/base.py index 6b9a2ee6..16e08338 100644 --- a/docling_core/types/legacy_doc/base.py +++ b/docling_core/types/legacy_doc/base.py @@ -48,18 +48,14 @@ class S3Data(AliasModel): pdf_images: Optional[list[S3Resource]] = Field(default=None, alias="pdf-images") json_document: Optional[S3Resource] = Field(default=None, alias="json-document") json_meta: Optional[S3Resource] = Field(default=None, alias="json-meta") - glm_json_document: Optional[S3Resource] = Field( - default=None, alias="glm-json-document" - ) + glm_json_document: Optional[S3Resource] = Field(default=None, alias="glm-json-document") figures: Optional[list[S3Resource]] = None class S3Reference(AliasModel): """References an s3 resource.""" - ref_s3_data: StrictStr = Field( - alias="__ref_s3_data", examples=["#/_s3_data/figures/0"] - ) + ref_s3_data: StrictStr = Field(alias="__ref_s3_data", examples=["#/_s3_data/figures/0"]) class Prov(AliasModel): @@ -84,9 +80,7 @@ class BitmapObject(AliasModel): """Bitmap object.""" obj_type: str = Field(alias="type") - bounding_box: BoundingBoxContainer = Field( - json_schema_extra=es_field(suppress=True) - ) + bounding_box: BoundingBoxContainer = Field(json_schema_extra=es_field(suppress=True)) prov: Prov @@ -111,31 +105,19 @@ class GlmTableCell(TableCell): """Glm Table cell.""" col: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True)) - col_header: bool = Field( - default=False, alias="col-header", json_schema_extra=es_field(suppress=True) - ) - col_span: Optional[Span] = Field( - default=None, alias="col-span", json_schema_extra=es_field(suppress=True) - ) + col_header: bool = Field(default=False, alias="col-header", json_schema_extra=es_field(suppress=True)) + col_span: Optional[Span] = Field(default=None, alias="col-span", json_schema_extra=es_field(suppress=True)) row: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True)) - row_header: bool = Field( - default=False, alias="row-header", json_schema_extra=es_field(suppress=True) - ) - row_span: Optional[Span] = Field( - default=None, alias="row-span", json_schema_extra=es_field(suppress=True) - ) + row_header: bool = Field(default=False, alias="row-header", json_schema_extra=es_field(suppress=True)) + row_span: Optional[Span] = Field(default=None, alias="row-span", json_schema_extra=es_field(suppress=True)) class BaseCell(AliasModel): """Base cell.""" prov: Optional[list[Prov]] = None - text: Optional[str] = Field( - default=None, json_schema_extra=es_field(term_vector="with_positions_offsets") - ) - obj_type: str = Field( - alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191) - ) + text: Optional[str] = Field(default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")) + obj_type: str = Field(alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)) payload: Optional[dict] = None def get_location_tokens( @@ -153,7 +135,6 @@ def get_location_tokens( location = "" for prov in self.prov: - page_i = -1 if add_page_index: page_i = prov.page @@ -247,8 +228,8 @@ def export_to_html(self) -> str: for j in range(ncols): cell: TableCell = self.data[i][j] - rowspan, rowstart, rowend = self._get_tablecell_span(cell, 0) - colspan, colstart, colend = self._get_tablecell_span(cell, 1) + rowspan, rowstart, _ = self._get_tablecell_span(cell, 0) + colspan, 
colstart, _ = self._get_tablecell_span(cell, 1) if rowstart is not None and rowstart != i: continue @@ -318,7 +299,6 @@ def export_to_document_tokens( for i, row in enumerate(self.data): body += f"" for j, col in enumerate(row): - text = "" if add_cell_text: text = col.text.strip() @@ -339,11 +319,7 @@ def export_to_document_tokens( ysize=ysize, page_i=self.prov[0].page, ) - elif ( - col.bbox is not None - and add_cell_location - and not add_page_index - ): + elif col.bbox is not None and add_cell_location and not add_page_index: cell_loc = DocumentToken.get_location( bbox=col.bbox, page_w=page_w, @@ -354,11 +330,7 @@ def export_to_document_tokens( ) cell_label = "" - if ( - add_cell_label - and col.obj_type is not None - and len(col.obj_type) > 0 - ): + if add_cell_label and col.obj_type is not None and len(col.obj_type) > 0: cell_label = f"<{col.obj_type}>" body += f"{cell_loc}{cell_label}{text}" @@ -419,9 +391,7 @@ class BaseText(BaseCell): """Base model for text objects.""" # FIXME: do we need these ??? - name: Optional[StrictStr] = Field( - default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191) - ) + name: Optional[StrictStr] = Field(default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)) font: Optional[str] = None def export_to_document_tokens( @@ -438,9 +408,7 @@ def export_to_document_tokens( """Export text element to document tokens format.""" body = f"<{self.obj_type}>" - assert DocumentToken.is_known_token( - body - ), f"failed DocumentToken.is_known_token({body})" + assert DocumentToken.is_known_token(body), f"failed DocumentToken.is_known_token({body})" if add_location: body += self.get_location_tokens( diff --git a/docling_core/types/legacy_doc/doc_raw.py b/docling_core/types/legacy_doc/doc_raw.py index 5b177196..1428341c 100644 --- a/docling_core/types/legacy_doc/doc_raw.py +++ b/docling_core/types/legacy_doc/doc_raw.py @@ -153,9 +153,7 @@ class Page(AliasModel): cells: list[Cell] paths: list[Path] vertical_lines: Optional[list[VerticalLine]] = Field(..., alias="vertical-lines") - horizontal_lines: Optional[list[HorizontalLine]] = Field( - ..., alias="horizontal-lines" - ) + horizontal_lines: Optional[list[HorizontalLine]] = Field(..., alias="horizontal-lines") ignored_cells: list[IgnoredCell] = Field(..., alias="ignored-cells") images: list[Image] fonts: dict[str, FontInfo] diff --git a/docling_core/types/legacy_doc/document.py b/docling_core/types/legacy_doc/document.py index 91b4c2ac..c289d986 100644 --- a/docling_core/types/legacy_doc/document.py +++ b/docling_core/types/legacy_doc/document.py @@ -61,12 +61,8 @@ class CCSFileInfoObject(FileInfoObject, extra="forbid"): alias="collection-name", json_schema_extra=es_field(type="keyword", ignore_above=8191), ) - description: Optional[CCSFileInfoDescription] = Field( - default=None, json_schema_extra=es_field(suppress=True) - ) - page_hashes: Optional[list[PageReference]] = Field( - default=None, alias="page-hashes" - ) + description: Optional[CCSFileInfoDescription] = Field(default=None, json_schema_extra=es_field(suppress=True)) + page_hashes: Optional[list[PageReference]] = Field(default=None, alias="page-hashes") class Affiliation(BaseModel, extra="forbid"): @@ -85,12 +81,8 @@ class Affiliation(BaseModel, extra="forbid"): }, ), ) - id: Optional[str] = Field( - default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191) - ) - source: Optional[str] = Field( - default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191) - ) + id: Optional[str] = 
Field(default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)) + source: Optional[str] = Field(default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)) class Author(BaseModel, extra="forbid"): @@ -110,12 +102,8 @@ class Author(BaseModel, extra="forbid"): }, ), ) - id: Optional[str] = Field( - default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191) - ) - source: Optional[str] = Field( - default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191) - ) + id: Optional[str] = Field(default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)) + source: Optional[str] = Field(default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)) affiliations: Optional[list[Affiliation]] = None @@ -166,9 +154,7 @@ class Publication(BaseModel, Generic[IdentifierTypeT], extra="forbid"): class DescriptionLicense(BaseModel, extra="forbid"): """Licence in document description.""" - code: Optional[StrictStr] = Field( - default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191) - ) + code: Optional[StrictStr] = Field(default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)) text: Optional[StrictStr] = None @@ -190,13 +176,9 @@ class CCSDocumentDescription( affiliations: Optional[list[Affiliation]] = None subjects: Optional[list[str]] = Field( default=None, - json_schema_extra=es_field( - fields={"keyword": {"ignore_above": 8191, "type": "keyword"}} - ), - ) - keywords: Optional[list[str]] = Field( - default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191) + json_schema_extra=es_field(fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}), ) + keywords: Optional[list[str]] = Field(default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)) publication_date: Optional[datetime] = None languages: Optional[list[LanguageT]] = Field( default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191) @@ -205,9 +187,7 @@ class CCSDocumentDescription( publishers: Optional[list[StrictStr]] = Field( default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191) ) - url_refs: Optional[list[str]] = Field( - default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191) - ) + url_refs: Optional[list[str]] = Field(default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)) references: Optional[list[Identifier[IdentifierTypeT]]] = None publication: Optional[list[Publication]] = Field( default=None, description="List of publication journals or venues." @@ -240,10 +220,7 @@ class CCSDocumentDescription( ) acquisition: Optional[Acquisition] = Field( default=None, - description=( - "Information on how the document was obtained, for data governance" - " purposes." 
- ), + description=("Information on how the document was obtained, for data governance purposes."), ) @@ -269,9 +246,7 @@ class MinimalDocument( CollectionNameTypeT, ] file_info: FileInfoObject = Field(alias="file-info") - main_text: Optional[list[Union[Ref, BaseText]]] = Field( - default=None, alias="main-text" - ) + main_text: Optional[list[Union[Ref, BaseText]]] = Field(default=None, alias="main-text") figures: Optional[list[Figure]] = None tables: Optional[list[Table]] = None @@ -297,9 +272,7 @@ class CCSDocument( default=None, alias="main-text", ) - page_dimensions: Optional[list[PageDimensions]] = Field( - default=None, alias="page-dimensions" - ) + page_dimensions: Optional[list[PageDimensions]] = Field(default=None, alias="page-dimensions") page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers") page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers") s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data") @@ -370,12 +343,8 @@ class ExportedCCSDocument( CollectionNameTypeT, ] file_info: CCSFileInfoObject = Field(alias="file-info") - main_text: Optional[list[Union[Ref, BaseText]]] = Field( - default=None, alias="main-text" - ) - page_dimensions: Optional[list[PageDimensions]] = Field( - default=None, alias="page-dimensions" - ) + main_text: Optional[list[Union[Ref, BaseText]]] = Field(default=None, alias="main-text") + page_dimensions: Optional[list[PageDimensions]] = Field(default=None, alias="page-dimensions") page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers") page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers") s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data") @@ -433,7 +402,7 @@ def export_to_dict(self) -> Dict[str, Any]: """export_to_dict.""" return self.model_dump(mode="json", by_alias=True, exclude_none=True) - def export_to_markdown( # noqa: C901 + def export_to_markdown( self, delim: str = "\n\n", main_text_start: int = 0, @@ -480,30 +449,18 @@ def export_to_markdown( # noqa: C901 # to avoid repeating them embedded_captions = set() for orig_item in self.main_text[main_text_start:main_text_stop]: - item = ( - self._resolve_ref(orig_item) - if isinstance(orig_item, Ref) - else orig_item - ) + item = self._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item if item is None: continue - if ( - isinstance(item, (Table, Figure)) - and item.text - and item.obj_type in main_text_labels - ): + if isinstance(item, (Table, Figure)) and item.text and item.obj_type in main_text_labels: embedded_captions.add(item.text) # serialize document to markdown for orig_item in self.main_text[main_text_start:main_text_stop]: markdown_text = "" - item = ( - self._resolve_ref(orig_item) - if isinstance(orig_item, Ref) - else orig_item - ) + item = self._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item if item is None: continue @@ -531,9 +488,7 @@ def export_to_markdown( # noqa: C901 has_title = True # secondary titles - elif item_type in {"title", "subtitle-level-1"} or ( - has_title and item_type == "title" - ): + elif item_type in {"title", "subtitle-level-1"} or (has_title and item_type == "title"): if strict_text: markdown_text = f"{text}" else: @@ -543,12 +498,7 @@ def export_to_markdown( # noqa: C901 else: markdown_text = text - elif ( - isinstance(item, Table) - and (item.data or item.text) - and item_type in main_text_labels - ): - + elif isinstance(item, Table) and (item.data or item.text) and item_type in 
main_text_labels: md_table = "" table = [] if item.data is not None: @@ -560,9 +510,7 @@ def export_to_markdown( # noqa: C901 if len(table) > 1 and len(table[0]) > 0: try: - md_table = tabulate( - table[1:], headers=table[0], tablefmt="github" - ) + md_table = tabulate(table[1:], headers=table[0], tablefmt="github") except ValueError: md_table = tabulate( table[1:], @@ -575,19 +523,14 @@ def export_to_markdown( # noqa: C901 if item.text: markdown_text = item.text if not strict_text: - markdown_text += ( - "\n\n" if len(markdown_text) > 0 else "" - ) + md_table + markdown_text += ("\n\n" if len(markdown_text) > 0 else "") + md_table elif isinstance(item, Figure) and item_type in main_text_labels: - markdown_text = "" if item.text: markdown_text = item.text if not strict_text: - markdown_text += ( - "\n" if len(markdown_text) > 0 else "" - ) + image_placeholder + markdown_text += ("\n" if len(markdown_text) > 0 else "") + image_placeholder if markdown_text: md_texts.append(markdown_text) @@ -636,12 +579,7 @@ def export_to_document_tokens( if self.main_text is not None: for orig_item in self.main_text[main_text_start:main_text_stop]: - - item = ( - self._resolve_ref(orig_item) - if isinstance(orig_item, Ref) - else orig_item - ) + item = self._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item if item is None: continue @@ -652,13 +590,7 @@ def export_to_document_tokens( page_w = 0.0 page_h = 0.0 - if ( - add_location - and self.page_dimensions is not None - and prov is not None - and len(prov) > 0 - ): - + if add_location and self.page_dimensions is not None and prov is not None and len(prov) > 0: page_i = prov[0].page page_dim = self.page_dimensions[page_i - 1] @@ -667,7 +599,6 @@ def export_to_document_tokens( item_type = item.obj_type if isinstance(item, BaseText) and (item_type in main_text_labels): - doctags += item.export_to_document_tokens( new_line=new_line, page_w=page_w, @@ -680,7 +611,6 @@ def export_to_document_tokens( ) elif isinstance(item, Table) and (item_type in main_text_labels): - doctags += item.export_to_document_tokens( new_line=new_line, page_w=page_w, @@ -697,7 +627,6 @@ def export_to_document_tokens( ) elif isinstance(item, Figure) and (item_type in main_text_labels): - doctags += item.export_to_document_tokens( new_line=new_line, page_w=page_w, diff --git a/docling_core/types/legacy_doc/tokens.py b/docling_core/types/legacy_doc/tokens.py index 3936ecad..3ac6f0fc 100644 --- a/docling_core/types/legacy_doc/tokens.py +++ b/docling_core/types/legacy_doc/tokens.py @@ -99,10 +99,10 @@ def get_special_tokens( special_tokens = [token.value for token in cls] # Adding dynamically generated row and col tokens - for i in range(0, max_rows + 1): + for i in range(max_rows + 1): special_tokens += [f"", f""] - for i in range(0, max_cols + 1): + for i in range(max_cols + 1): special_tokens += [f"", f""] for i in range(6): @@ -113,12 +113,12 @@ def get_special_tokens( special_tokens += [f"", f""] # Adding dynamically generated page-tokens - for i in range(0, max_pages + 1): + for i in range(max_pages + 1): special_tokens.append(f"") special_tokens.append(f"") # Adding dynamically generated location-tokens - for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)): + for i in range(max(page_dimension[0] + 1, page_dimension[1] + 1)): special_tokens.append(f"") return special_tokens diff --git a/docling_core/types/nlp/qa.py b/docling_core/types/nlp/qa.py index 6f9159cf..3d7e1e3d 100644 --- a/docling_core/types/nlp/qa.py +++ b/docling_core/types/nlp/qa.py @@ 
-13,18 +13,11 @@ class QAPair(BaseModel, Generic[DescriptionAdvancedT]): """A representation of a question-answering (QA) pair.""" context: StrictStr = Field( - description=( - "A single string containing the context of the question enabling the" - " presentation of the answer." - ) + description=("A single string containing the context of the question enabling the presentation of the answer.") ) question: StrictStr = Field(description="A question on the given context.") - answer: StrictStr = Field( - description="The answer to the question from the context." - ) - short_answer: Optional[StrictStr] = Field( - default=None, description="Alternative and concise answer." - ) + answer: StrictStr = Field(description="The answer to the question from the context.") + short_answer: Optional[StrictStr] = Field(default=None, description="Alternative and concise answer.") retrieved_context: Optional[StrictBool] = Field( default=False, description="Whether the context was retrieved from the question.", @@ -35,14 +28,10 @@ class QAPair(BaseModel, Generic[DescriptionAdvancedT]): generated_answer: Optional[StrictBool] = Field( default=False, description="Whether the answer was generated by an AI model." ) - created: StrictDateTime = Field( - description="Datetime when the QA pair was created ." - ) + created: StrictDateTime = Field(description="Datetime when the QA pair was created .") user: Optional[StrictStr] = Field( default=None, - description=( - "Unique identifier of the user that created or curated this QA pair." - ), + description=("Unique identifier of the user that created or curated this QA pair."), json_schema_extra=es_field(type="keyword", ignore_above=8191), ) model: Optional[StrictStr] = Field( @@ -51,20 +40,12 @@ class QAPair(BaseModel, Generic[DescriptionAdvancedT]): json_schema_extra=es_field(type="keyword", ignore_above=8191), ) paths: UniqueList[StrictStr] = Field( - description=( - "One or more references to a document that identify the provenance of the" - " QA pair context." - ), - examples=[ - "badce7c84d0ba7ba0fb5e94492b0d91e2506a7cb48e4524ad572c546a35f768e#/" - "main-text/4" - ], + description=("One or more references to a document that identify the provenance of the QA pair context."), + examples=["badce7c84d0ba7ba0fb5e94492b0d91e2506a7cb48e4524ad572c546a35f768e#/main-text/4"], json_schema_extra=es_field(type="keyword", ignore_above=8191), ) advanced: Optional[DescriptionAdvancedT] = Field( default=None, description="Document metadata to provide more details on the context.", ) - labels: Optional[QALabelling] = Field( - default=None, description="QApair labelling axes." 
- ) + labels: Optional[QALabelling] = Field(default=None, description="QApair labelling axes.") diff --git a/docling_core/types/rec/attribute.py b/docling_core/types/rec/attribute.py index 95d77fcc..b6b8d74b 100644 --- a/docling_core/types/rec/attribute.py +++ b/docling_core/types/rec/attribute.py @@ -31,13 +31,11 @@ class Attribute( ): """Attribute model that describes a list of characteristics.""" - conf: Annotated[float, Field(strict=True, ge=0.0, le=1.0, allow_inf_nan=False)] = ( - Field( - ..., - title="Confidence", - description="The confidence level of this attribute characteristics.", - json_schema_extra=es_field(type="float"), - ) + conf: Annotated[float, Field(strict=True, ge=0.0, le=1.0, allow_inf_nan=False)] = Field( + ..., + title="Confidence", + description="The confidence level of this attribute characteristics.", + json_schema_extra=es_field(type="float"), ) prov: Optional[list[ProvenanceItem[IdentifierTypeT, ProvenanceTypeT]]] = Field( @@ -46,6 +44,6 @@ class Attribute( description="The sources of this attribute characteristics.", ) - predicates: list[ - Predicate[PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT] - ] = Field(..., description="A list of characteristics (type, value, and name).") + predicates: list[Predicate[PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT]] = Field( + ..., description="A list of characteristics (type, value, and name)." + ) diff --git a/docling_core/types/rec/base.py b/docling_core/types/rec/base.py index 0d1af762..557166e6 100644 --- a/docling_core/types/rec/base.py +++ b/docling_core/types/rec/base.py @@ -10,19 +10,14 @@ from docling_core.utils.alias import AliasModel -class ProvenanceItem( - AliasModel, Generic[IdentifierTypeT, ProvenanceTypeT], extra="forbid" -): +class ProvenanceItem(AliasModel, Generic[IdentifierTypeT, ProvenanceTypeT], extra="forbid"): """A representation of an object provenance.""" type_: Optional[ProvenanceTypeT] = Field( default=None, alias="type", title="The provenance type", - description=( - "Any string representing the type of provenance, e.g. `sentence`, " - "`table`, or `doi`." - ), + description=("Any string representing the type of provenance, e.g. `sentence`, `table`, or `doi`."), json_schema_extra=es_field(type="keyword", ignore_above=8191), ) @@ -30,8 +25,7 @@ class ProvenanceItem( default=None, title="Evidence of the provenance", description=( - "A text representing the evidence of the provenance, e.g. the sentence " - "text or the content of a table cell" + "A text representing the evidence of the provenance, e.g. 
the sentence text or the content of a table cell" ), json_schema_extra=es_field(type="keyword", ignore_above=8191), ) @@ -56,15 +50,10 @@ class ProvenanceItem( json_schema_extra=es_field(type="keyword", ignore_above=8191), ) - span: Optional[Annotated[List[StrictInt], Field(min_length=2, max_length=2)]] = ( - Field( - default=None, - title="The location of the item in the text/table", - description=( - "location of the item in the text/table referenced by the `path`," - " e.g., `[34, 67]`" - ), - ) + span: Optional[Annotated[List[StrictInt], Field(min_length=2, max_length=2)]] = Field( + default=None, + title="The location of the item in the text/table", + description=("location of the item in the text/table referenced by the `path`, e.g., `[34, 67]`"), ) diff --git a/docling_core/types/rec/predicate.py b/docling_core/types/rec/predicate.py index c632c33b..983a8129 100644 --- a/docling_core/types/rec/predicate.py +++ b/docling_core/types/rec/predicate.py @@ -29,17 +29,13 @@ class NumericalValue(BaseModel, extra="forbid"): max: StrictFloat = Field(..., json_schema_extra=es_field(type="float")) val: StrictFloat = Field(..., json_schema_extra=es_field(type="float")) err: StrictFloat = Field(..., json_schema_extra=es_field(type="float")) - unit: StrictStr = Field( - ..., json_schema_extra=es_field(type="keyword", ignore_above=8191) - ) + unit: StrictStr = Field(..., json_schema_extra=es_field(type="keyword", ignore_above=8191)) class NominalValue(BaseModel, extra="forbid"): """Model for nominal (categorical) values.""" - value: StrictStr = Field( - ..., json_schema_extra=es_field(type="keyword", ignore_above=8191) - ) + value: StrictStr = Field(..., json_schema_extra=es_field(type="keyword", ignore_above=8191)) class TextValue(BaseModel, extra="forbid"): @@ -79,9 +75,7 @@ def validate_coordinates(cls, v): return v -class PredicateKey( - AliasModel, Generic[PredicateKeyNameT, PredicateKeyTypeT], extra="forbid" -): +class PredicateKey(AliasModel, Generic[PredicateKeyNameT, PredicateKeyTypeT], extra="forbid"): """Model for the key (unique identifier) of a predicate.""" name: PredicateKeyNameT = Field( diff --git a/docling_core/types/rec/record.py b/docling_core/types/rec/record.py index a63a4373..a78e7e48 100644 --- a/docling_core/types/rec/record.py +++ b/docling_core/types/rec/record.py @@ -28,25 +28,18 @@ class RecordDescription(BaseModel, Generic[CollectionNameTypeT]): """Additional record metadata, including optional collection-specific fields.""" - logs: list[Log] = Field( - description="Logs that describe the ETL tasks applied to this record." - ) + logs: list[Log] = Field(description="Logs that describe the ETL tasks applied to this record.") publication_date: Optional[StrictDateTime] = Field( default=None, title="Publication date", - description=( - "The date that best represents the last publication time of a record." - ), + description=("The date that best represents the last publication time of a record."), ) collection: Optional[CollectionRecordInfo[CollectionNameTypeT]] = Field( default=None, description="The collection information of this record." ) acquisition: Optional[Acquisition] = Field( default=None, - description=( - "Information on how the document was obtained, for data governance" - " purposes." 
- ), + description=("Information on how the document was obtained, for data governance purposes."), ) diff --git a/docling_core/types/rec/subject.py b/docling_core/types/rec/subject.py index 45655308..15f550f5 100644 --- a/docling_core/types/rec/subject.py +++ b/docling_core/types/rec/subject.py @@ -48,16 +48,12 @@ class Subject( type_: SubjectTypeT = Field( alias="type", description=( - "Main subject type. For instance, `material`, `material-class`, " - "`material-device`, `company`, or `person`." + "Main subject type. For instance, `material`, `material-class`, `material-device`, `company`, or `person`." ), json_schema_extra=es_field(type="keyword", ignore_above=8191), ) names: list[SubjectNameIdentifier[SubjectNameTypeT]] = Field( - description=( - "List of given names for this subject. They may not be unique across " - "different subjects." - ) + description=("List of given names for this subject. They may not be unique across different subjects.") ) identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field( default=None, diff --git a/docling_core/utils/file.py b/docling_core/utils/file.py index 78d2eb2a..f98be74d 100644 --- a/docling_core/utils/file.py +++ b/docling_core/utils/file.py @@ -11,7 +11,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError from typing_extensions import deprecated -from docling_core.types.doc.utils import relative_path # noqa +from docling_core.types.doc.utils import relative_path from docling_core.types.io import DocumentStream diff --git a/docling_core/utils/generate_docs.py b/docling_core/utils/generate_docs.py index 61ba4eb8..509491ff 100644 --- a/docling_core/utils/generate_docs.py +++ b/docling_core/utils/generate_docs.py @@ -43,9 +43,7 @@ def generate_collection_jsonschema(folder: str): """ for item in MODELS: json_schema = generate_json_schema(item) - with open( - os.path.join(folder, f"{item}.json"), mode="w", encoding="utf8" - ) as json_file: + with open(os.path.join(folder, f"{item}.json"), mode="w", encoding="utf8") as json_file: json.dump(json_schema, json_file, ensure_ascii=False, indent=2) @@ -54,10 +52,7 @@ def main() -> None: argparser = argparse.ArgumentParser() argparser.add_argument( "directory", - help=( - "Directory to generate files. If it exists, any existing content will be" - " removed." - ), + help=("Directory to generate files. 
If it exists, any existing content will be removed."), ) argparser.add_argument( "--clean", diff --git a/docling_core/utils/generate_jsonschema.py b/docling_core/utils/generate_jsonschema.py index a06328de..af51f71b 100644 --- a/docling_core/utils/generate_jsonschema.py +++ b/docling_core/utils/generate_jsonschema.py @@ -43,15 +43,11 @@ def generate_json_schema(class_reference: str) -> Union[dict, None]: def main() -> None: """Print the JSON Schema of a model.""" argparser = argparse.ArgumentParser() - argparser.add_argument( - "class_ref", help="Class reference, e.g., doc.document.TableCell" - ) + argparser.add_argument("class_ref", help="Class reference, e.g., doc.document.TableCell") args = argparser.parse_args() json_schema = generate_json_schema(args.class_ref) - print( - json.dumps(json_schema, ensure_ascii=False, indent=2).encode("utf-8").decode() - ) + print(json.dumps(json_schema, ensure_ascii=False, indent=2).encode("utf-8").decode()) if __name__ == "__main__": diff --git a/docling_core/utils/legacy.py b/docling_core/utils/legacy.py index 6f8fdf99..41436506 100644 --- a/docling_core/utils/legacy.py +++ b/docling_core/utils/legacy.py @@ -139,7 +139,6 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f embedded_captions = set() for ix, (item, level) in enumerate(doc.iterate_items(doc.body)): - if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0: caption = item.caption_text(doc) if caption: @@ -150,7 +149,6 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f item_type = item.label if isinstance(item, (TextItem, ListItem, SectionHeaderItem)): - if isinstance(item, ListItem) and item.marker: text = f"{item.marker} {item.text}" else: @@ -249,9 +247,7 @@ def _make_spans(cell: TableCell, table_item: TableItem): table_data[i][j] = GlmTableCell( text=cell._get_text(doc=doc), bbox=( - cell.bbox.as_tuple() - if cell.bbox is not None - else None + cell.bbox.as_tuple() if cell.bbox is not None else None ), # check if this is bottom-left spans=spans, obj_type=celltype, @@ -322,8 +318,7 @@ def _make_spans(cell: TableCell, table_item: TableItem): ) page_dimensions = [ - PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width) - for p in doc.pages.values() + PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width) for p in doc.pages.values() ] legacy_doc: DsDocument = DsDocument( @@ -362,9 +357,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]: prov = ProvenanceItem( page_no=int(item.prov[0].page), charspan=tuple(item.prov[0].span), - bbox=BoundingBox.from_tuple( - tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT - ), + bbox=BoundingBox.from_tuple(tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT), ) return prov @@ -415,9 +408,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]: if text_item.text is None: continue prov = _transform_prov(text_item) - doc.add_text( - label=DocItemLabel.FOOTNOTE, text=text_item.text, parent=doc.furniture - ) + doc.add_text(label=DocItemLabel.FOOTNOTE, text=text_item.text, parent=doc.furniture) # main-text content if legacy_doc.main_text is not None: @@ -427,11 +418,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]: # to avoid repeating them embedded_captions: Dict[str, int] = {} for ix, orig_item in enumerate(legacy_doc.main_text): - item = ( - legacy_doc._resolve_ref(orig_item) - if isinstance(orig_item, Ref) - else orig_item - ) + item = legacy_doc._resolve_ref(orig_item) if 
isinstance(orig_item, Ref) else orig_item if item is None: continue @@ -441,21 +428,14 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]: # build lookup from floating objects to their caption item floating_to_caption: Dict[int, BaseText] = {} for ix, orig_item in enumerate(legacy_doc.main_text): - item = ( - legacy_doc._resolve_ref(orig_item) - if isinstance(orig_item, Ref) - else orig_item - ) + item = legacy_doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item if item is None: continue item_type = item.obj_type.lower() if ( isinstance(item, BaseText) - and ( - item_type == "caption" - or (item.name is not None and item.name.lower() == "caption") - ) + and (item_type == "caption" or (item.name is not None and item.name.lower() == "caption")) and item.text in embedded_captions ): floating_ix = embedded_captions[item.text] @@ -464,11 +444,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]: # main loop iteration current_list: Optional[GroupItem] = None for ix, orig_item in enumerate(legacy_doc.main_text): - item = ( - legacy_doc._resolve_ref(orig_item) - if isinstance(orig_item, Ref) - else orig_item - ) + item = legacy_doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item if item is None: continue @@ -476,9 +452,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]: item_type = item.obj_type.lower() # if a group is needed, add it - if isinstance(item, BaseText) and ( - item_type in "list-item-level-1" or item.name in {"list", "list-item"} - ): + if isinstance(item, BaseText) and (item_type in "list-item-level-1" or item.name in {"list", "list-item"}): if current_list is None: current_list = doc.add_list_group(name="list") else: @@ -514,9 +488,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]: "list-item", }: # TODO: Infer if this is a numbered or a bullet list item - doc.add_list_item( - text=text, enumerated=False, prov=prov, parent=current_list - ) + doc.add_list_item(text=text, enumerated=False, prov=prov, parent=current_list) # normal text else: @@ -530,13 +502,11 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]: doc.add_text(label=label, text=text, prov=prov) elif isinstance(item, DsSchemaTable): - table_data = TableData(num_cols=item.num_cols, num_rows=item.num_rows) if item.data is not None: seen_spans = set() for row_ix, row in enumerate(item.data): for col_ix, orig_cell_data in enumerate(row): - cell_bbox: Optional[BoundingBox] = ( BoundingBox.from_tuple( tuple(orig_cell_data.bbox), @@ -559,9 +529,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]: if orig_cell_data.spans is not None: # convert to a tuple of tuples for hashing - spans_tuple = tuple( - tuple(span) for span in orig_cell_data.spans - ) + spans_tuple = tuple(tuple(span) for span in orig_cell_data.spans) # skip repeated spans if spans_tuple in seen_spans: @@ -569,25 +537,13 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]: seen_spans.add(spans_tuple) - cell.start_row_offset_idx = min( - s[0] for s in spans_tuple - ) - cell.end_row_offset_idx = ( - max(s[0] for s in spans_tuple) + 1 - ) - cell.start_col_offset_idx = min( - s[1] for s in spans_tuple - ) - cell.end_col_offset_idx = ( - max(s[1] for s in spans_tuple) + 1 - ) + cell.start_row_offset_idx = min(s[0] for s in spans_tuple) + cell.end_row_offset_idx = max(s[0] for s in spans_tuple) + 1 + cell.start_col_offset_idx = min(s[1] for s in spans_tuple) + cell.end_col_offset_idx = max(s[1] for s in spans_tuple) 
+ 1 - cell.row_span = ( - cell.end_row_offset_idx - cell.start_row_offset_idx - ) - cell.col_span = ( - cell.end_col_offset_idx - cell.start_col_offset_idx - ) + cell.row_span = cell.end_row_offset_idx - cell.start_row_offset_idx + cell.col_span = cell.end_col_offset_idx - cell.start_col_offset_idx table_data.table_cells.append(cell) @@ -617,11 +573,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]: new_item.captions.append(caption.get_ref()) # equations - elif ( - isinstance(item, BaseCell) - and item.text is not None - and item_type in {"formula", "equation"} - ): + elif isinstance(item, BaseCell) and item.text is not None and item_type in {"formula", "equation"}: doc.add_text(label=DocItemLabel.FORMULA, text=item.text, prov=prov) return doc diff --git a/docling_core/utils/validate.py b/docling_core/utils/validate.py index 51f6baf9..cc9668c0 100644 --- a/docling_core/utils/validate.py +++ b/docling_core/utils/validate.py @@ -17,13 +17,9 @@ def parse_arguments(): """Parse the arguments from the command line.""" argparser = argparse.ArgumentParser(description="validate example-file with schema") - argparser.add_argument( - "-f", "--format", required=True, help="format of the file [RAW, ANN, OCR]" - ) + argparser.add_argument("-f", "--format", required=True, help="format of the file [RAW, ANN, OCR]") - argparser.add_argument( - "-i", "--input-file", required=True, help="JSON filename to be validated" - ) + argparser.add_argument("-i", "--input-file", required=True, help="JSON filename to be validated") pargs = argparser.parse_args() @@ -54,7 +50,7 @@ def run(): if result[0]: logger.info("Done!") else: - logger.error("invalid schema: {}".format(result[1])) + logger.error(f"invalid schema: {result[1]}") def main(): diff --git a/docling_core/utils/validators.py b/docling_core/utils/validators.py index 7c7178b7..8a576d99 100644 --- a/docling_core/utils/validators.py +++ b/docling_core/utils/validators.py @@ -32,11 +32,7 @@ def validate_raw_schema(file_: dict) -> tuple[bool, str]: """Validate a RAW file.""" logger.debug("validate RAW schema ... ") - schema_txt = ( - resources.files("docling_core") - .joinpath("resources/schemas/legacy_doc/RAW.json") - .read_text("utf-8") - ) + schema_txt = resources.files("docling_core").joinpath("resources/schemas/legacy_doc/RAW.json").read_text("utf-8") schema = json.loads(schema_txt) return validate_schema(file_, schema) @@ -46,11 +42,7 @@ def validate_ann_schema(file_: dict) -> tuple[bool, str]: """Validate an annotated (ANN) file.""" logger.debug("validate ANN schema ... ") - schema_txt = ( - resources.files("docling_core") - .joinpath("resources/schemas/legacy_doc/ANN.json") - .read_text("utf-8") - ) + schema_txt = resources.files("docling_core").joinpath("resources/schemas/legacy_doc/ANN.json").read_text("utf-8") schema = json.loads(schema_txt) return validate_schema(file_, schema) @@ -61,9 +53,7 @@ def validate_ocr_schema(file_: dict) -> tuple[bool, str]: logger.debug("validate OCR schema ... 
") schema_txt = ( - resources.files("docling_core") - .joinpath("resources/schemas/legacy_doc/OCR-output.json") - .read_text("utf-8") + resources.files("docling_core").joinpath("resources/schemas/legacy_doc/OCR-output.json").read_text("utf-8") ) schema = json.loads(schema_txt) diff --git a/examples/rich_table_cells.ipynb b/examples/rich_table_cells.ipynb index 2ccc7926..3beaebac 100644 --- a/examples/rich_table_cells.ipynb +++ b/examples/rich_table_cells.ipynb @@ -7,7 +7,13 @@ "metadata": {}, "outputs": [], "source": [ - "from docling_core.types.doc import DoclingDocument, TableData, TableCell, RichTableCell, DocItemLabel\n", + "from docling_core.types.doc import (\n", + " DoclingDocument,\n", + " TableData,\n", + " TableCell,\n", + " RichTableCell,\n", + " DocItemLabel,\n", + ")\n", "\n", "doc = DoclingDocument(name=\"\")\n", "doc.add_text(label=DocItemLabel.TITLE, text=\"Rich tables\")\n", @@ -226,7 +232,7 @@ } ], "source": [ - "print(doc.tables[0].export_to_doctags(doc=doc))\n" + "print(doc.tables[0].export_to_doctags(doc=doc))" ] }, { diff --git a/examples/table_annotations.ipynb b/examples/table_annotations.ipynb index f8a9017c..3c4fef91 100644 --- a/examples/table_annotations.ipynb +++ b/examples/table_annotations.ipynb @@ -34,9 +34,14 @@ "from rich.console import Console\n", "from rich.panel import Panel\n", "\n", + "\n", "def print_excerpt(\n", - " txt: str, *, limit: int = 2000, title: Optional[str] = None, min_width: int = 80,\n", - " table_end: str = \"--|\"\n", + " txt: str,\n", + " *,\n", + " limit: int = 2000,\n", + " title: Optional[str] = None,\n", + " min_width: int = 80,\n", + " table_end: str = \"--|\",\n", "):\n", " excerpt = txt[:limit]\n", " width = max(\n", @@ -44,7 +49,7 @@ " min_width,\n", " )\n", " console = Console(width=width)\n", - " console.print(Panel(f\"{excerpt}{'...' if len(txt)>limit else ''}\", title=title))" + " console.print(Panel(f\"{excerpt}{'...' 
if len(txt) > limit else ''}\", title=title))" ] }, { @@ -205,6 +210,7 @@ "from docling_core.transforms.serializer.markdown import MarkdownAnnotationSerializer\n", "from docling_core.types.doc.document import MiscAnnotation, DocItem\n", "\n", + "\n", "class CustomAnnotationSerializer(MarkdownAnnotationSerializer):\n", " def serialize(\n", " self,\n", diff --git a/pyproject.toml b/pyproject.toml index 7c78cd18..abf36ac4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -111,6 +111,7 @@ dev = [ "pytest~=8.3", "pytest-cov>=6.1.1", "python-semantic-release~=7.32", + "ruff>=0.14.8", ] [tool.uv] @@ -124,20 +125,81 @@ namespaces = true [tool.setuptools.package-data] "*" = ["*.json"] -[tool.black] -line-length = 88 -target-version = ["py39", "py310"] -include = '\.pyi?$' -preview = true +[tool.ruff] +target-version = "py39" +line-length = 120 +respect-gitignore = true +exclude = [ + "test/data/**" +] + +[tool.ruff.format] +skip-magic-trailing-comma = false + +[tool.ruff.lint] +select = [ + "C", # flake8-comprehensions + "C9", # mccabe + "E", # pycodestyle errors (default) + "F", # pyflakes (default) + "I", # isort + "PD", # pandas-vet + "PIE", # pie + "Q", # flake8-quotes + "RUF", # Enable all ruff-specific checks + "S307", # eval + "W", # pycodestyle warnings + "ASYNC", # async + "UP", # pyupgrade +] + +ignore = [ + "C403", # Unnecessary `list()` call (rewrite as a literal) + "C408", # Unnecessary `dict()` call (rewrite as a literal) + "C413", # Unnecessary `reversed()` call around `sorted()` + "C416", # Unnecessary set comprehension (rewrite using `set()`) + "E501", # Line too long, handled by ruff formatter + "E203", # whitespace-before-punctuation + "E741", # Ambiguous variable name: `l` + "D107", # "Missing docstring in __init__", + "F401", # imported but unused; consider using `importlib.util.find_spec` to test for " + "F811", # "redefinition of the same function" + "PL", # Pylint + "PD901", # Avoid using the generic variable name `df` for DataFrames + "RUF002", # Docstring contains ambiguous `‑` (NON-BREAKING HYPHEN). + "RUF003", # Comment contains ambiguous `‑` (NON-BREAKING HYPHEN). + "RUF005", # Consider ... 
instead of concatenation + "RUF012", # Mutable Class Attributes + "RUF034", # Useless `if`-`else` condition + "UP006", # List vs list, etc + "UP007", # Option and Union + "UP015", # Unnecessary mode argument + "UP035", # `typing.Set` is deprecated, use `set` instead" + "UP045", # Use `X | None` for type annotations +] + +[tool.ruff.lint.pep8-naming] +classmethod-decorators = [ + "classmethod", + "validator", + "pydantic.validator", +] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] +"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests +"*.ipynb" = ["I"] # Disable import sorting for notebooks + +[tool.ruff.lint.mccabe] +max-complexity = 30 -[tool.isort] -profile = "black" -line_length = 88 -py_version = 39 -multi_line_output = 3 -include_trailing_comma = true +[tool.ruff.lint.isort] +combine-as-imports = false -[tool.autoflake] +[toolruff.lint.autoflake] in-place = true ignore-init-module-imports = true remove-all-unused-imports = true @@ -147,8 +209,6 @@ recursive = true [tool.mypy] pretty = true -# strict = true -# disallow_untyped_defs = true no_implicit_optional = true namespace_packages = true show_error_codes = true diff --git a/test/conftest.py b/test/conftest.py index 7dadfc5c..6af24335 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -35,9 +35,7 @@ def _construct_doc() -> DoclingDocument: leading_list = doc.add_list_group(parent=None) doc.add_list_item(parent=leading_list, text="item of leading list", marker="■") - title = doc.add_title( - text="Title of the Document" - ) # can be done if such information is present, or ommitted. + title = doc.add_title(text="Title of the Document") # can be done if such information is present, or ommitted. # group, heading, paragraph, table, figure, title, list, provenance doc.add_text(parent=title, label=DocItemLabel.TEXT, text="Author 1\nAffiliation 1") @@ -92,9 +90,7 @@ def _construct_doc() -> DoclingDocument: doc.add_list_item(parent=mylist_level_1, text="list item 4", marker="■") - tab_caption = doc.add_text( - label=DocItemLabel.CAPTION, text="This is the caption of table 1." - ) + tab_caption = doc.add_text(label=DocItemLabel.CAPTION, text="This is the caption of table 1.") # Make some table cells table_cells = [] @@ -166,9 +162,7 @@ def _construct_doc() -> DoclingDocument: table_data = TableData(num_rows=3, num_cols=3, table_cells=table_cells) doc.add_table(data=table_data, caption=tab_caption) - fig_caption_1 = doc.add_text( - label=DocItemLabel.CAPTION, text="This is the caption of figure 1." - ) + fig_caption_1 = doc.add_text(label=DocItemLabel.CAPTION, text="This is the caption of figure 1.") doc.add_picture(caption=fig_caption_1) size = (64, 64) @@ -189,12 +183,8 @@ def _construct_doc() -> DoclingDocument: # Draw the red square # draw.rectangle([x1, y1, x2, y2], fill="red") - fig_caption_2 = doc.add_text( - label=DocItemLabel.CAPTION, text="This is the caption of figure 2." 
- ) - doc.add_picture( - image=ImageRef.from_pil(image=fig2_image, dpi=72), caption=fig_caption_2 - ) + fig_caption_2 = doc.add_text(label=DocItemLabel.CAPTION, text="This is the caption of figure 2.") + doc.add_picture(image=ImageRef.from_pil(image=fig2_image, dpi=72), caption=fig_caption_2) g0 = doc.add_list_group(parent=None) doc.add_list_item(text="item 1 of list", parent=g0, marker="■") @@ -225,9 +215,7 @@ def _construct_doc() -> DoclingDocument: parent=inline1, code_language=CodeLanguageLabel.PYTHON, ) - doc.add_text( - label=DocItemLabel.TEXT, text="(to be displayed inline)", parent=inline1 - ) + doc.add_text(label=DocItemLabel.TEXT, text="(to be displayed inline)", parent=inline1) g2_subgroup_li_2 = doc.add_list_item(text="", parent=g2_subgroup, marker="□") inline2 = doc.add_inline_group(parent=g2_subgroup_li_2) @@ -237,14 +225,10 @@ def _construct_doc() -> DoclingDocument: parent=inline2, ) doc.add_text(label=DocItemLabel.FORMULA, text="E=mc^2", parent=inline2) - doc.add_text( - label=DocItemLabel.TEXT, text="(to be displayed inline)", parent=inline2 - ) + doc.add_text(label=DocItemLabel.TEXT, text="(to be displayed inline)", parent=inline2) doc.add_text(label=DocItemLabel.TEXT, text="Here a code block:", parent=None) - doc.add_code( - text='print("Hello world")', parent=None, code_language=CodeLanguageLabel.PYTHON - ) + doc.add_code(text='print("Hello world")', parent=None, code_language=CodeLanguageLabel.PYTHON) doc.add_text(label=DocItemLabel.TEXT, text="Here a formula block:", parent=None) doc.add_text(label=DocItemLabel.FORMULA, text="E=mc^2", parent=None) @@ -279,9 +263,7 @@ def _construct_doc() -> DoclingDocument: doc.add_form(graph=graph) inline_fmt = doc.add_inline_group() - doc.add_text( - label=DocItemLabel.TEXT, text="Some formatting chops:", parent=inline_fmt - ) + doc.add_text(label=DocItemLabel.TEXT, text="Some formatting chops:", parent=inline_fmt) doc.add_text( label=DocItemLabel.TEXT, text="bold", @@ -341,21 +323,13 @@ def _construct_doc() -> DoclingDocument: ) parent_A = doc.add_list_group(name="list A") - doc.add_list_item( - text="Item 1 in A", enumerated=True, marker="(i)", parent=parent_A - ) - doc.add_list_item( - text="Item 2 in A", enumerated=True, marker="(ii)", parent=parent_A - ) - item_A_3 = doc.add_list_item( - text="Item 3 in A", enumerated=True, marker="(iii)", parent=parent_A - ) + doc.add_list_item(text="Item 1 in A", enumerated=True, marker="(i)", parent=parent_A) + doc.add_list_item(text="Item 2 in A", enumerated=True, marker="(ii)", parent=parent_A) + item_A_3 = doc.add_list_item(text="Item 3 in A", enumerated=True, marker="(iii)", parent=parent_A) parent_B = doc.add_list_group(parent=item_A_3, name="list B") doc.add_list_item(text="Item 1 in B", enumerated=True, parent=parent_B) - item_B_2 = doc.add_list_item( - text="Item 2 in B", enumerated=True, marker="42.", parent=parent_B - ) + item_B_2 = doc.add_list_item(text="Item 2 in B", enumerated=True, marker="42.", parent=parent_B) parent_C = doc.add_list_group(parent=item_B_2, name="list C") doc.add_list_item(text="Item 1 in C", enumerated=True, parent=parent_C) @@ -363,9 +337,7 @@ def _construct_doc() -> DoclingDocument: doc.add_list_item(text="Item 3 in B", enumerated=True, parent=parent_B) - doc.add_list_item( - text="Item 4 in A", enumerated=True, marker="(iv)", parent=parent_A - ) + doc.add_list_item(text="Item 4 in A", enumerated=True, marker="(iv)", parent=parent_A) with pytest.warns(DeprecationWarning, match="list group"): doc.add_list_item(text="List item without parent list group") @@ 
-407,9 +379,7 @@ def _rich_table_doc() -> DoclingDocument: doc.add_list_item(parent=rich_item_2, text="list item 1") doc.add_list_item(parent=rich_item_2, text="list item 2") - rich_item_3 = doc.add_table( - data=TableData(num_rows=2, num_cols=3), parent=table_item - ) + rich_item_3 = doc.add_table(data=TableData(num_rows=2, num_cols=3), parent=table_item) rich_item_4 = doc.add_group(parent=table_item, label=GroupLabel.UNSPECIFIED) doc.add_text( @@ -417,9 +387,7 @@ def _rich_table_doc() -> DoclingDocument: text="Some text in a generic group.", label=DocItemLabel.TEXT, ) - doc.add_text( - parent=rich_item_4, text="More text in the group.", label=DocItemLabel.TEXT - ) + doc.add_text(parent=rich_item_4, text="More text in the group.", label=DocItemLabel.TEXT) for i in range(rich_item_3.data.num_rows): for j in range(rich_item_3.data.num_cols): diff --git a/test/test_azure_serializer.py b/test/test_azure_serializer.py index 3b91d968..8d076729 100644 --- a/test/test_azure_serializer.py +++ b/test/test_azure_serializer.py @@ -47,9 +47,7 @@ def _assert_json_like_equal(a: Any, b: Any, eps: float = 1e-3, path: str = "$") # If either is float, compare with tolerance; if both int, exact match if isinstance(a, float) or isinstance(b, float): diff = abs(float(a) - float(b)) - assert ( - diff <= eps - ), f"Float mismatch at {path}: {a} != {b} (diff={diff}, eps={eps})" + assert diff <= eps, f"Float mismatch at {path}: {a} != {b} (diff={diff}, eps={eps})" else: assert a == b, f"Int mismatch at {path}: {a} != {b}" return @@ -116,9 +114,7 @@ def _ensure_prov(item, l=10.0, t=10.0, r=200.0, b=40.0): item.prov = [ ProvenanceItem( page_no=min(sample_doc.pages.keys()), - bbox=BoundingBox( - l=l, t=t, r=r, b=b, coord_origin=CoordOrigin.TOPLEFT - ), + bbox=BoundingBox(l=l, t=t, r=r, b=b, coord_origin=CoordOrigin.TOPLEFT), charspan=(0, 0), ) ] @@ -146,9 +142,7 @@ def _ensure_prov(item, l=10.0, t=10.0, r=200.0, b=40.0): # Basic structure check data = json.loads(actual_json) assert isinstance(data, dict) - assert ( - "pages" in data and isinstance(data["pages"], list) and len(data["pages"]) >= 1 - ) + assert "pages" in data and isinstance(data["pages"], list) and len(data["pages"]) >= 1 assert "paragraphs" in data and isinstance(data["paragraphs"], list) exp_file = Path("./test/data/doc/constructed.gt.azure.json") diff --git a/test/test_base.py b/test/test_base.py index 806e7fa6..d9476604 100644 --- a/test/test_base.py +++ b/test/test_base.py @@ -27,9 +27,7 @@ def test_identifier(): # dict(): important to set by_alias=True, if the model has aliases assert data.model_dump(by_alias=True) == gold_dict - assert data.model_dump_json(by_alias=True, indent=2) == json.dumps( - gold_dict, indent=2 - ) + assert data.model_dump_json(by_alias=True, indent=2) == json.dumps(gold_dict, indent=2) # schema_json(): no need to set by_alias since it is True by the default with open("test/data/json_schemas/base_identifier.json", encoding="utf-8") as tf: @@ -75,9 +73,7 @@ def test_log(): comment="UCMI 3.10", date="2021-11-03T04:42:54.844631+00:00", ) - data = Log( - task=None, agent="CXS", type="parsing", date="2021-11-03T04:42:54.844631+00:00" - ) + data = Log(task=None, agent="CXS", type="parsing", date="2021-11-03T04:42:54.844631+00:00") gold_dict = { "agent": "CXS", @@ -93,20 +89,13 @@ def test_log(): # Models that inherit from AliasModel will generate data with alias field names assert Log(**gold_dict).model_dump(exclude_unset=True) == gold_dict # ***Best practice***: exclude_unset=True, exclude_none=True, by_alias=True - assert ( - 
Log(**gold_dict).model_dump( - exclude_unset=True, exclude_none=True, by_alias=True - ) - == gold_dict - ) + assert Log(**gold_dict).model_dump(exclude_unset=True, exclude_none=True, by_alias=True) == gold_dict with open("test/data/json_schemas/base_log.json", encoding="utf-8") as tf: gold_json_schema = json.load(tf) assert Log.model_json_schema() == gold_json_schema - with pytest.raises( - ValidationError, match="Value type must be a datetime or a non-numeric string" - ): + with pytest.raises(ValidationError, match="Value type must be a datetime or a non-numeric string"): Log(agent="CXS", type="annotation", date=123456789) @@ -124,9 +113,7 @@ def test_file_info_object(): gold_dict.pop("filename-prov") gold_json = json.dumps(gold_dict) - FileInfoObject(**gold_dict).model_dump_json( - exclude_unset=True, exclude_none=True - ) == gold_json + FileInfoObject(**gold_dict).model_dump_json(exclude_unset=True, exclude_none=True) == gold_json # creating an instance with input variables requires the use of field names. Since # document-hash is an invalid function parameter name, 'populate_by_name' needs to @@ -166,19 +153,10 @@ def test_collection_info(): } clean_dict = {"name": "patent USPTO", "type": "Document", "version": "3.2.0"} data = CollectionInfo(**input_dict) - assert ( - data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) - != input_dict - ) - assert ( - data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) - == clean_dict - ) + assert data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) != input_dict + assert data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) == clean_dict data = CollectionInfo(**clean_dict) - assert ( - data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) - == clean_dict - ) + assert data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) == clean_dict def test_collection_document_info(): @@ -190,10 +168,7 @@ def test_collection_document_info(): "alias": ["patent"], } data = CollectionDocumentInfo(**gold_dict) - assert ( - data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) - == gold_dict - ) + assert data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) == gold_dict # within dictionary desc_dict = { @@ -214,7 +189,7 @@ def test_collection_document_info(): CCSDocumentDescription(**desc_dict) desc_dict["collection"]["type"] = "Record" - with pytest.raises(ValidationError, match="collection.type"): + with pytest.raises(ValidationError, match="collection\\.type"): CCSDocumentDescription(**desc_dict) @@ -227,10 +202,7 @@ def test_collection_record_info(): "alias": ["chemical", "Material Sciences"], } data = CollectionRecordInfo(**gold_dict) - assert ( - data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) - == gold_dict - ) + assert data.model_dump(by_alias=True, exclude_unset=True, exclude_none=True) == gold_dict # within dictionary desc_dict = { @@ -251,11 +223,11 @@ def test_collection_record_info(): RecordDescription(**desc_dict) desc_dict["collection"]["type"] = "Document" - with pytest.raises(ValidationError, match="collection.type"): + with pytest.raises(ValidationError, match="collection\\.type"): RecordDescription(**desc_dict) desc_dict["collection"]["type"] = "record" - with pytest.raises(ValidationError, match="collection.type"): + with pytest.raises(ValidationError, match="collection\\.type"): RecordDescription(**desc_dict) diff --git a/test/test_code_chunker.py b/test/test_code_chunker.py index 
e90a3d58..5b89d801 100644 --- a/test/test_code_chunker.py +++ b/test/test_code_chunker.py @@ -53,16 +53,9 @@ def create_documents_from_repository( all_files = [] for extension in all_extensions: - all_files.extend( - [ - f - for f in sorted( - glob.glob(f"{file_dir}/**/*{extension}", recursive=True) - ) - ] - ) + all_files.extend([f for f in sorted(glob.glob(f"{file_dir}/**/*{extension}", recursive=True))]) - all_files = sorted(list(set(all_files))) + all_files = sorted(set(all_files)) for file_path in all_files: with open(file_path, "r", encoding="utf-8") as f: @@ -72,11 +65,7 @@ def create_documents_from_repository( origin = DocumentOrigin( filename=file_relative, - uri=( - f"{repo_url}/blob/{commit_id}/{file_relative}" - if commit_id - else f"{repo_url}/{file_relative}" - ), + uri=(f"{repo_url}/blob/{commit_id}/{file_relative}" if commit_id else f"{repo_url}/{file_relative}"), mimetype="text/plain", binary_hash=_create_hash(file_content), ) @@ -97,41 +86,31 @@ def create_documents_from_repository( "Java", "/test/data/chunker_repo/repos/acmeair", "https://github.com/acmeair/acmeair", - lambda: HierarchicalChunker( - code_chunking_strategy=StandardCodeChunkingStrategy(max_tokens=5000) - ), + lambda: HierarchicalChunker(code_chunking_strategy=StandardCodeChunkingStrategy(max_tokens=5000)), ), ( "TypeScript", "/test/data/chunker_repo/repos/outline", "https://github.com/outline/outline", - lambda: HierarchicalChunker( - code_chunking_strategy=StandardCodeChunkingStrategy(max_tokens=5000) - ), + lambda: HierarchicalChunker(code_chunking_strategy=StandardCodeChunkingStrategy(max_tokens=5000)), ), ( "JavaScript", "/test/data/chunker_repo/repos/jquery", "https://github.com/jquery/jquery", - lambda: HierarchicalChunker( - code_chunking_strategy=StandardCodeChunkingStrategy(max_tokens=5000) - ), + lambda: HierarchicalChunker(code_chunking_strategy=StandardCodeChunkingStrategy(max_tokens=5000)), ), ( "Python", "/test/data/chunker_repo/repos/docling", "https://github.com/docling-project/docling", - lambda: HierarchicalChunker( - code_chunking_strategy=StandardCodeChunkingStrategy(max_tokens=5000) - ), + lambda: HierarchicalChunker(code_chunking_strategy=StandardCodeChunkingStrategy(max_tokens=5000)), ), ( "C", "/test/data/chunker_repo/repos/json-c", "https://github.com/json-c/json-c", - lambda: HierarchicalChunker( - code_chunking_strategy=StandardCodeChunkingStrategy(max_tokens=5000) - ), + lambda: HierarchicalChunker(code_chunking_strategy=StandardCodeChunkingStrategy(max_tokens=5000)), ), ] @@ -150,7 +129,6 @@ def _dump_or_assert(act_data: dict, out_path: pathlib.Path): @pytest.mark.parametrize("name,local_path,repo_url,chunker_factory", REPO_SPECS) def test_function_chunkers_repo(name, local_path, repo_url, chunker_factory): - local_path_full = os.getcwd() + local_path if not os.path.isdir(local_path_full): @@ -162,11 +140,7 @@ def test_function_chunkers_repo(name, local_path, repo_url, chunker_factory): language=CodeLanguageLabel(name), commit_id="abc123def456", ) - docs = [ - doc - for doc in docs - if any(text.label == DocItemLabel.CODE and text.text for text in doc.texts) - ] + docs = [doc for doc in docs if any(text.label == DocItemLabel.CODE and text.text for text in doc.texts)] if not docs: pytest.skip(f"No documents found in {local_path_full} for {name}.") diff --git a/test/test_code_chunking_strategy.py b/test/test_code_chunking_strategy.py index 783746cd..c8234b56 100644 --- a/test/test_code_chunking_strategy.py +++ b/test/test_code_chunking_strategy.py @@ -49,9 +49,7 @@ def 
factorial(n): format_code_blocks=False, ), ) - chunks = list( - strategy.chunk_code_item(item=code_item, doc=doc, doc_serializer=doc_ser) - ) + chunks = list(strategy.chunk_code_item(item=code_item, doc=doc, doc_serializer=doc_ser)) assert len(chunks) > 0 for chunk in chunks: @@ -77,12 +75,8 @@ def fibonacci(n): text="Here's some Python code:", orig="Here's some Python code:", ) - doc.add_code( - text=python_code, code_language=CodeLanguageLabel.PYTHON, orig=python_code - ) - doc.origin = DocumentOrigin( - filename="test.py", mimetype="text/x-python", binary_hash=12345 - ) + doc.add_code(text=python_code, code_language=CodeLanguageLabel.PYTHON, orig=python_code) + doc.origin = DocumentOrigin(filename="test.py", mimetype="text/x-python", binary_hash=12345) strategy = StandardCodeChunkingStrategy(min_chunk_size=50, max_tokens=1000) chunker_with_strategy = HierarchicalChunker(code_chunking_strategy=strategy) @@ -114,9 +108,7 @@ def test_hybrid_chunker_with_code_files(test_data_dir): pytest.skip("Python test file not found") doc = DoclingDocument(name="sample.py") - doc.origin = DocumentOrigin( - filename="sample.py", mimetype="text/x-python", binary_hash=12345 - ) + doc.origin = DocumentOrigin(filename="sample.py", mimetype="text/x-python", binary_hash=12345) with open(python_file, "r", encoding="utf-8") as f: content = f.read() @@ -141,9 +133,7 @@ def test_unsupported_language_fallback(test_data_dir): go_file = test_data_dir / "sample.go" if go_file.exists(): doc = DoclingDocument(name="sample.go") - doc.origin = DocumentOrigin( - filename="sample.go", mimetype="text/plain", binary_hash=12345 - ) + doc.origin = DocumentOrigin(filename="sample.go", mimetype="text/plain", binary_hash=12345) with open(go_file, "r", encoding="utf-8") as f: content = f.read() @@ -162,9 +152,7 @@ def test_unsupported_language_fallback(test_data_dir): md_file = test_data_dir / "sample.md" if md_file.exists(): doc = DoclingDocument(name="sample.md") - doc.origin = DocumentOrigin( - filename="sample.md", mimetype="text/plain", binary_hash=12345 - ) + doc.origin = DocumentOrigin(filename="sample.md", mimetype="text/plain", binary_hash=12345) with open(md_file, "r", encoding="utf-8") as f: content = f.read() @@ -191,9 +179,7 @@ def test_repository_processing(test_data_dir): all_chunks = [] for file_path in test_data_dir.glob("sample.*"): doc = DoclingDocument(name=file_path.name) - doc.origin = DocumentOrigin( - filename=file_path.name, mimetype="text/plain", binary_hash=12345 - ) + doc.origin = DocumentOrigin(filename=file_path.name, mimetype="text/plain", binary_hash=12345) with open(file_path, "r", encoding="utf-8") as f: content = f.read() diff --git a/test/test_collection.py b/test/test_collection.py index cc9d46d4..ab307847 100644 --- a/test/test_collection.py +++ b/test/test_collection.py @@ -56,15 +56,12 @@ def test_table_export_to_tokens(): doc = Document.model_validate_json(file_json) if doc.tables is not None and doc.page_dimensions is not None: - pagedims = doc.get_map_to_page_dimensions() if doc.tables is not None: for i, table in enumerate(doc.tables): page = table.prov[0].page - out = table.export_to_document_tokens( - page_w=pagedims[page][0], page_h=pagedims[page][1] - ) + out = table.export_to_document_tokens(page_w=pagedims[page][0], page_h=pagedims[page][1]) fname = f"{filename}_table_{i}.dt.txt" if GENERATE: @@ -81,13 +78,10 @@ def test_table_export_to_tokens(): break elif doc.tables is not None and doc.page_dimensions is None: - if doc.tables is not None: for i, table in enumerate(doc.tables): 
page = table.prov[0].page - out = table.export_to_document_tokens( - add_table_location=False, add_cell_location=False - ) + out = table.export_to_document_tokens(add_table_location=False, add_cell_location=False) fname = f"{filename}_table_{i}.dt.txt" if GENERATE: @@ -113,9 +107,7 @@ def test_document_export_to_md(): md = doc.export_to_markdown() if GENERATE: - with open( - "test/data/legacy_doc/doc-export.md", "w", encoding="utf-8" - ) as gold_obj: + with open("test/data/legacy_doc/doc-export.md", "w", encoding="utf-8") as gold_obj: gold_obj.write(md) with open("test/data/legacy_doc/doc-export.md", encoding="utf-8") as gold_obj: @@ -133,14 +125,10 @@ def test_document_export_to_tokens(): xml = doc.export_to_document_tokens(delim=True) if GENERATE: - with open( - "test/data/legacy_doc/doc-export.dt.txt", "w", encoding="utf-8" - ) as gold_obj: + with open("test/data/legacy_doc/doc-export.dt.txt", "w", encoding="utf-8") as gold_obj: gold_obj.write(xml) - with open( - "test/data/legacy_doc/doc-export.dt.txt", "r", encoding="utf-8" - ) as gold_obj: + with open("test/data/legacy_doc/doc-export.dt.txt", "r", encoding="utf-8") as gold_obj: gold_data = gold_obj.read().strip() assert xml == gold_data diff --git a/test/test_doc_schema.py b/test/test_doc_schema.py index 9776e791..0efc0f5c 100644 --- a/test/test_doc_schema.py +++ b/test/test_doc_schema.py @@ -44,9 +44,9 @@ def test_ccs_document(): except ValidationError as e: for error in e.errors(): # print(type(error)) - assert all( - item in error["loc"] for item in ("description", "logs") - ), f"Data in file {filename} should fail in logs" + assert all(item in error["loc"] for item in ("description", "logs")), ( + f"Data in file {filename} should fail in logs" + ) # check doc-error-2 is invalid for missing page-hashes with ( @@ -90,9 +90,7 @@ def test_description_advanced_t(): # any dictionary is valid, since it is not parametrized CCSDocumentDescription(**desc, advanced={"serial": "CXS12345"}) CCSDocumentDescription(**desc, advanced={0: "CXS12345"}) - with pytest.raises( - ValidationError, match="should be a valid dictionary or instance of BaseModel" - ): + with pytest.raises(ValidationError, match="should be a valid dictionary or instance of BaseModel"): CCSDocumentDescription(**desc, advanced=False) class MyAdvanced(BaseModel): diff --git a/test/test_doc_schema_extractor.py b/test/test_doc_schema_extractor.py index afa874eb..7871343e 100644 --- a/test/test_doc_schema_extractor.py +++ b/test/test_doc_schema_extractor.py @@ -20,7 +20,7 @@ def test_ccs_document_update(): doc = CCSDocument.model_validate(raw_doc) if doc.description.abstract: - assert False, f"Abstract should not be present" + assert False, "Abstract should not be present" except ValidationError as e: print(f"Validation error in file {filename}:\n{e.json()}") diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index 25ecddff..1250dabe 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -85,7 +85,6 @@ def test_overlaps_horizontally(): def test_overlaps_vertically(): - page_height = 300 # Same CoordOrigin (TOPLEFT) @@ -230,27 +229,19 @@ def test_y_overlap_with(): def test_union_area_with(): # Overlapping (TOPLEFT) - bbox1 = BoundingBox( - l=0, t=0, r=10, b=10, coord_origin=CoordOrigin.TOPLEFT - ) # Area 100 - bbox2 = BoundingBox( - l=5, t=5, r=15, b=15, coord_origin=CoordOrigin.TOPLEFT - ) # Area 100 + bbox1 = BoundingBox(l=0, t=0, r=10, b=10, coord_origin=CoordOrigin.TOPLEFT) # Area 100 + bbox2 = BoundingBox(l=5, t=5, r=15, b=15, 
coord_origin=CoordOrigin.TOPLEFT) # Area 100 # Intersection area 25 # Union area = 100 + 100 - 25 = 175 assert abs(bbox1.union_area_with(bbox2) - 175.0) < 1.0e-3 # Non-overlapping (TOPLEFT) - bbox3 = BoundingBox( - l=20, t=0, r=30, b=10, coord_origin=CoordOrigin.TOPLEFT - ) # Area 100 + bbox3 = BoundingBox(l=20, t=0, r=30, b=10, coord_origin=CoordOrigin.TOPLEFT) # Area 100 # Union area = 100 + 100 - 0 = 200 assert abs(bbox1.union_area_with(bbox3) - 200.0) < 1.0e-3 # Touching edges (TOPLEFT) - bbox4 = BoundingBox( - l=10, t=0, r=20, b=10, coord_origin=CoordOrigin.TOPLEFT - ) # Area 100 + bbox4 = BoundingBox(l=10, t=0, r=20, b=10, coord_origin=CoordOrigin.TOPLEFT) # Area 100 # Union area = 100 + 100 - 0 = 200 assert abs(bbox1.union_area_with(bbox4) - 200.0) < 1.0e-3 @@ -260,12 +251,8 @@ def test_union_area_with(): assert abs(bbox1.union_area_with(bbox5) - 100.0) < 1.0e-3 # Overlapping (BOTTOMLEFT) - bbox6 = BoundingBox( - l=0, b=0, r=10, t=10, coord_origin=CoordOrigin.BOTTOMLEFT - ) # Area 100 - bbox7 = BoundingBox( - l=5, b=5, r=15, t=15, coord_origin=CoordOrigin.BOTTOMLEFT - ) # Area 100 + bbox6 = BoundingBox(l=0, b=0, r=10, t=10, coord_origin=CoordOrigin.BOTTOMLEFT) # Area 100 + bbox7 = BoundingBox(l=5, b=5, r=15, t=15, coord_origin=CoordOrigin.BOTTOMLEFT) # Area 100 # Intersection area 25 # Union area = 100 + 100 - 25 = 175 assert abs(bbox6.union_area_with(bbox7) - 175.0) < 1.0e-3 @@ -307,7 +294,6 @@ def test_x_union_with(): def test_y_union_with(): - bbox1_tl = BoundingBox(l=0, t=0, r=10, b=10, coord_origin=CoordOrigin.TOPLEFT) bbox2_tl = BoundingBox(l=0, t=5, r=10, b=15, coord_origin=CoordOrigin.TOPLEFT) # y_union = max(10, 15) - min(0, 5) = 15 - 0 = 15 @@ -355,7 +341,6 @@ def test_y_union_with(): def test_orientation(): - page_height = 300 # Same CoordOrigin (TOPLEFT) @@ -380,12 +365,9 @@ def test_orientation(): def test_docitems(): - # Iterative function to find all subclasses def find_all_subclasses_iterative(base_class): - subclasses = deque( - [base_class] - ) # Use a deque for efficient popping from the front + subclasses = deque([base_class]) # Use a deque for efficient popping from the front all_subclasses = [] while subclasses: @@ -400,15 +382,11 @@ def serialise(obj): return yaml.safe_dump(obj.model_dump(mode="json", by_alias=True)) def write(name: str, serialisation: str): - with open( - f"./test/data/docling_document/unit/{name}.yaml", "w", encoding="utf-8" - ) as fw: + with open(f"./test/data/docling_document/unit/{name}.yaml", "w", encoding="utf-8") as fw: fw.write(serialisation) def read(name: str): - with open( - f"./test/data/docling_document/unit/{name}.yaml", "r", encoding="utf-8" - ) as fr: + with open(f"./test/data/docling_document/unit/{name}.yaml", "r", encoding="utf-8") as fr: gold = fr.read() return yaml.safe_load(gold) @@ -428,7 +406,6 @@ def verify(dc, obj): # Iterate over the derived classes of the BaseClass derived_classes = find_all_subclasses_iterative(DocItem) for dc in derived_classes: - if dc is TextItem: obj = dc( text="whatever", @@ -454,7 +431,6 @@ def verify(dc, obj): verify(dc, obj) elif dc is KeyValueItem: - graph = GraphData( cells=[ GraphCell( @@ -476,9 +452,7 @@ def verify(dc, obj): source_cell_id=0, target_cell_id=1, ), - GraphLink( - label=GraphLinkLabel.TO_KEY, source_cell_id=1, target_cell_id=0 - ), + GraphLink(label=GraphLinkLabel.TO_KEY, source_cell_id=1, target_cell_id=0), ], ) @@ -490,7 +464,6 @@ def verify(dc, obj): verify(dc, obj) elif dc is FormItem: - graph = GraphData( cells=[ GraphCell( @@ -512,9 +485,7 @@ def verify(dc, obj): 
source_cell_id=0, target_cell_id=1, ), - GraphLink( - label=GraphLinkLabel.TO_KEY, source_cell_id=1, target_cell_id=0 - ), + GraphLink(label=GraphLinkLabel.TO_KEY, source_cell_id=1, target_cell_id=0), ], ) @@ -578,7 +549,6 @@ def verify(dc, obj): def test_reference_doc(): - filename = "test/data/doc/dummy_doc.yaml" # Read YAML file of manual reference doc @@ -598,9 +568,7 @@ def test_reference_doc(): obj = doc.texts[2] # Text item with parent parent = obj.parent.resolve(doc=doc) # it is a figure - obj2 = parent.children[0].resolve( - doc=doc - ) # Child of figure must be the same as obj + obj2 = parent.children[0].resolve(doc=doc) # Child of figure must be the same as obj assert obj == obj2 assert obj is obj2 @@ -619,7 +587,6 @@ def test_reference_doc(): def test_parse_doc(): - filename = "test/data/doc/2206.01062.yaml" with open(filename, "r", encoding="utf-8") as fp: @@ -633,7 +600,6 @@ def test_parse_doc(): def test_construct_doc(sample_doc): - filename = "test/data/doc/constructed_document.yaml" assert sample_doc.validate_tree(sample_doc.body) @@ -647,7 +613,6 @@ def test_construct_doc(sample_doc): def test_construct_bad_doc(): - filename = "test/data/doc/bad_doc.yaml" doc = _construct_bad_doc() @@ -685,17 +650,13 @@ def _verify_regression_test(pred: str, filename: str, ext: str): with open(filename + f".{ext}", "r", encoding="utf-8") as fr: gt_true = fr.read().rstrip() - assert ( - gt_true == pred - ), f"Does not pass regression-test for {filename}.{ext}\n\n{gt_true}\n\n{pred}" + assert gt_true == pred, f"Does not pass regression-test for {filename}.{ext}\n\n{gt_true}\n\n{pred}" else: with open(filename + f".{ext}", "w", encoding="utf-8") as fw: fw.write(f"{pred}\n") -def _test_export_methods( - doc: DoclingDocument, filename: str, page_break_placeholder: Optional[str] = None -): +def _test_export_methods(doc: DoclingDocument, filename: str, page_break_placeholder: Optional[str] = None): # Iterate all elements et_pred = doc.export_to_element_tree() _verify_regression_test(et_pred, filename=filename, ext="et") @@ -777,7 +738,6 @@ def test_pil_image(): def test_image_ref(): - data_uri = { "dpi": 72, "mimetype": "image/png", @@ -816,7 +776,6 @@ class ContentOutput(BaseModel): def test_version_doc(): - # default version doc = DoclingDocument(name="Untitled 1") assert doc.version == CURRENT_VERSION @@ -874,9 +833,7 @@ def test_formula_with_missing_fallback(): prov = ProvenanceItem(page_no=1, bbox=bbox, charspan=(0, 2)) doc.add_text(label=DocItemLabel.FORMULA, text="", orig="(II.24) 2 Imar", prov=prov) - doc.export_to_html( - formula_to_mathml=True, html_head="", image_mode=ImageRefMode.EMBEDDED - ) + doc.export_to_html(formula_to_mathml=True, html_head="", image_mode=ImageRefMode.EMBEDDED) expected = """ @@ -926,17 +883,10 @@ def test_docitem_get_image(): doc_item = DocItem( self_ref="#", label=DocItemLabel.TEXT, - prov=[ - ProvenanceItem( - page_no=1, bbox=BoundingBox(l=2, t=4, r=4, b=8), charspan=(1, 2) - ) - ], + prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=2, t=4, r=4, b=8), charspan=(1, 2))], ) returned_doc_item_image = doc_item.get_image(doc=doc) - assert ( - returned_doc_item_image is not None - and returned_doc_item_image.tobytes() == doc_item_image.tobytes() - ) + assert returned_doc_item_image is not None and returned_doc_item_image.tobytes() == doc_item_image.tobytes() def test_floatingitem_get_image(): @@ -959,20 +909,14 @@ def test_floatingitem_get_image(): floating_item = FloatingItem( self_ref="#", label=DocItemLabel.PICTURE, - prov=[ - ProvenanceItem( - page_no=1, 
bbox=BoundingBox(l=2, t=4, r=6, b=12), charspan=(1, 2) - ) - ], + prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=2, t=4, r=6, b=12), charspan=(1, 2))], image=ImageRef.from_pil(image=new_image, dpi=72), ) retured_image = floating_item.get_image(doc=doc) assert retured_image is not None and retured_image.tobytes() == new_image.tobytes() # FloatingItem without explicit image and no provenance - floating_item = FloatingItem( - self_ref="#", label=DocItemLabel.PICTURE, prov=[], image=None - ) + floating_item = FloatingItem(self_ref="#", label=DocItemLabel.PICTURE, prov=[], image=None) assert floating_item.get_image(doc=doc) is None # FloatingItem without explicit image on invalid page @@ -997,24 +941,15 @@ def test_floatingitem_get_image(): floating_item = FloatingItem( self_ref="#", label=DocItemLabel.PICTURE, - prov=[ - ProvenanceItem( - page_no=1, bbox=BoundingBox(l=2, t=4, r=4, b=8), charspan=(1, 2) - ) - ], + prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=2, t=4, r=4, b=8), charspan=(1, 2))], image=None, ) retured_image = floating_item.get_image(doc=doc) - assert ( - retured_image is not None - and retured_image.tobytes() == floating_item_image.tobytes() - ) + assert retured_image is not None and retured_image.tobytes() == floating_item_image.tobytes() def test_save_pictures(sample_doc): - new_doc = sample_doc._with_pictures_refs( - image_dir=Path("./test/data/constructed_images/"), page_no=None - ) + new_doc = sample_doc._with_pictures_refs(image_dir=Path("./test/data/constructed_images/"), page_no=None) img_paths = new_doc._list_images_on_disk() assert len(img_paths) == 1, "len(img_paths)!=1" @@ -1034,31 +969,24 @@ def test_save_pictures_with_page(): image=ImageRef.from_pil(image=image, dpi=72), prov=ProvenanceItem( page_no=2, - bbox=BoundingBox( - b=0, l=0, r=200, t=400, coord_origin=CoordOrigin.BOTTOMLEFT - ), + bbox=BoundingBox(b=0, l=0, r=200, t=400, coord_origin=CoordOrigin.BOTTOMLEFT), charspan=(1, 2), ), ) # When - with_ref = doc._with_pictures_refs( - image_dir=Path("./test/data/constructed_images/"), page_no=1 - ) + with_ref = doc._with_pictures_refs(image_dir=Path("./test/data/constructed_images/"), page_no=1) # Then n_images = len(with_ref._list_images_on_disk()) assert n_images == 0 # When - with_ref = with_ref._with_pictures_refs( - image_dir=Path("./test/data/constructed_images/"), page_no=2 - ) + with_ref = with_ref._with_pictures_refs(image_dir=Path("./test/data/constructed_images/"), page_no=2) n_images = len(with_ref._list_images_on_disk()) # Then assert n_images == 1 def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]): - for p in paths: instr = instr.replace(str(p), str(p.name)) @@ -1066,7 +994,6 @@ def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]): def _verify_saved_output(filename: Union[str, Path], paths: List[Path]): - pred = "" with open(filename, "r", encoding="utf-8") as fr: pred = fr.read() @@ -1095,14 +1022,11 @@ def _verify_loaded_output(filename: Path, pred=None): pred = pred or DoclingDocument.load_from_json(Path(filename)) assert isinstance(pred, DoclingDocument) - assert ( - pred.export_to_dict() == gt.export_to_dict() - ), f"pred.export_to_dict() != gt.export_to_dict() for {filename}" + assert pred.export_to_dict() == gt.export_to_dict(), f"pred.export_to_dict() != gt.export_to_dict() for {filename}" assert pred == gt, f"pred!=gt for {filename}" def test_save_to_disk(sample_doc): - test_dir = Path("./test/data/doc") image_dir = Path("constructed_images/") # will be relative to test_dir @@ -1118,41 +1042,29 @@ def 
test_save_to_disk(sample_doc): ### MarkDown filename: Path = test_dir / "constructed_doc.placeholder.md" - sample_doc.save_as_markdown( - filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.PLACEHOLDER - ) + sample_doc.save_as_markdown(filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.PLACEHOLDER) _verify_saved_output(filename=filename, paths=paths) filename = test_dir / "constructed_doc.embedded.md" - sample_doc.save_as_markdown( - filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.EMBEDDED - ) + sample_doc.save_as_markdown(filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.EMBEDDED) _verify_saved_output(filename=filename, paths=paths) filename = test_dir / "constructed_doc.referenced.md" - sample_doc.save_as_markdown( - filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.REFERENCED - ) + sample_doc.save_as_markdown(filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.REFERENCED) _verify_saved_output(filename=filename, paths=paths) ### HTML filename = test_dir / "constructed_doc.placeholder.html" - sample_doc.save_as_html( - filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.PLACEHOLDER - ) + sample_doc.save_as_html(filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.PLACEHOLDER) _verify_saved_output(filename=filename, paths=paths) filename = test_dir / "constructed_doc.embedded.html" - sample_doc.save_as_html( - filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.EMBEDDED - ) + sample_doc.save_as_html(filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.EMBEDDED) _verify_saved_output(filename=filename, paths=paths) filename = test_dir / "constructed_doc.referenced.html" - sample_doc.save_as_html( - filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.REFERENCED - ) + sample_doc.save_as_html(filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.REFERENCED) _verify_saved_output(filename=filename, paths=paths) ### Document Tokens @@ -1207,7 +1119,6 @@ def test_save_to_disk(sample_doc): def test_document_stack_operations(sample_doc): - # _print(document=doc) ref = RefItem(cref="#/texts/12") @@ -1226,7 +1137,6 @@ def test_document_stack_operations(sample_doc): def test_document_manipulation(sample_doc: DoclingDocument) -> None: - def _resolve(document: DoclingDocument, cref: str) -> NodeItem: ref = RefItem(cref=cref) return ref.resolve(doc=document) @@ -1246,9 +1156,7 @@ def _verify( DoclingDocument.load_from_json(filename=_gt_filename(filename=filename)) # test if the document is the same as the stored GT - _verify_loaded_output( - filename=filename, pred=DoclingDocument.model_validate(document) - ) + _verify_loaded_output(filename=filename, pred=DoclingDocument.model_validate(document)) image_dir = Path("./test/data/doc/constructed_images/") @@ -1326,11 +1234,7 @@ def _verify( label=DocItemLabel.TEXT, text="foo", orig="foo", - children=[ - _resolve( - document=deepcopy(sample_doc), cref=text_item_4.self_ref - ).get_ref() - ], + children=[_resolve(document=deepcopy(sample_doc), cref=text_item_4.self_ref).get_ref()], ), parent=sample_doc.body, ) @@ -1353,12 +1257,8 @@ def _verify( node = _resolve(document=sample_doc, cref="#/texts/45") - last_node = sample_doc.insert_list_group( - sibling=node, name="Inserted List Group", after=True - ) - group_node = sample_doc.insert_inline_group( - sibling=node, name="Inserted Inline Group", after=False - ) + last_node = sample_doc.insert_list_group(sibling=node, 
name="Inserted List Group", after=True) + group_node = sample_doc.insert_inline_group(sibling=node, name="Inserted Inline Group", after=False) sample_doc.insert_group( sibling=node, label=GroupLabel.LIST, @@ -1430,16 +1330,12 @@ def _verify( ) ) - table_data = TableData( - table_cells=table_cells, num_rows=num_rows, num_cols=num_cols - ) + table_data = TableData(table_cells=table_cells, num_rows=num_rows, num_cols=num_cols) sample_doc.insert_table(sibling=node, data=table_data, after=False) size = (64, 64) img = PILImage.new("RGB", size, "black") - sample_doc.insert_picture( - sibling=node, image=ImageRef.from_pil(image=img, dpi=72), after=True - ) + sample_doc.insert_picture(sibling=node, image=ImageRef.from_pil(image=img, dpi=72), after=True) sample_doc.insert_title(sibling=node, text="Inserted Title", after=False) sample_doc.insert_code(sibling=node, text="Inserted Code", after=True) @@ -1480,12 +1376,8 @@ def _verify( # Test the handling of list items in insert_* methods, both with and without parent groups with pytest.warns(DeprecationWarning, match="ListItem parent must be a ListGroup"): - li_sibling = sample_doc.insert_list_item( - sibling=node, text="Inserted List Item, Incorrect Parent", after=False - ) - sample_doc.insert_list_item( - sibling=li_sibling, text="Inserted List Item, Correct Parent", after=True - ) + li_sibling = sample_doc.insert_list_item(sibling=node, text="Inserted List Item, Incorrect Parent", after=False) + sample_doc.insert_list_item(sibling=li_sibling, text="Inserted List Item, Correct Parent", after=True) sample_doc.insert_text( sibling=li_sibling, label=DocItemLabel.LIST_ITEM, @@ -1518,9 +1410,7 @@ def _verify( label=DocItemLabel.TEXT, ) - sample_doc.add_node_items( - node_items=[text_item_6, text_item_7], doc=sample_doc, parent=group_node - ) + sample_doc.add_node_items(node_items=[text_item_6, text_item_7], doc=sample_doc, parent=group_node) filename = Path("test/data/doc/constructed_doc.bulk_item_addition.json") _verify(filename=filename, document=sample_doc, generate=GEN_TEST_DATA) @@ -1540,9 +1430,7 @@ def _verify( label=DocItemLabel.TEXT, ) - sample_doc.insert_node_items( - sibling=node, node_items=[text_item_8, text_item_9], doc=sample_doc, after=False - ) + sample_doc.insert_node_items(sibling=node, node_items=[text_item_8, text_item_9], doc=sample_doc, after=False) filename = Path("test/data/doc/constructed_doc.bulk_item_insertion.json") _verify(filename=filename, document=sample_doc, generate=GEN_TEST_DATA) @@ -1574,9 +1462,7 @@ def _verify( with pytest.raises(ValueError): extracted_doc = sample_doc.extract_items_range(start=li_sibling, end=node) - extracted_doc = sample_doc.extract_items_range( - start=group_node, end=node, end_inclusive=False, delete=True - ) + extracted_doc = sample_doc.extract_items_range(start=group_node, end=node, end_inclusive=False, delete=True) filename = Path("test/data/doc/constructed_doc.extracted_with_deletion.json") _verify(filename=filename, document=sample_doc, generate=GEN_TEST_DATA) @@ -1642,9 +1528,7 @@ def test_concatenate(): docs = [DoclingDocument.load_from_json(filename=f) for f in files] doc = DoclingDocument.concatenate(docs=docs) - html_data = doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, split_page_view=True - ) + html_data = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED, split_page_view=True) exp_json_file = Path("test/data/doc/concatenated.json") exp_html_file = exp_json_file.with_suffix(".html") @@ -1675,9 +1559,7 @@ def test_list_group_with_non_list_items(): bad_doc = 
DoclingDocument(name="") l1 = bad_doc.add_list_group() bad_doc.add_list_item(text="ListItem 1", parent=l1) - bad_doc.add_text( - text="non-ListItem in ListGroup", label=DocItemLabel.TEXT, parent=l1 - ) + bad_doc.add_text(text="non-ListItem in ListGroup", label=DocItemLabel.TEXT, parent=l1) with pytest.raises(ValueError): bad_doc._validate_rules() @@ -1811,7 +1693,6 @@ def test_invalid_rich_table_doc(): def test_rich_table_item_insertion_normalization(): - doc = DoclingDocument(name="") doc.add_text(label=DocItemLabel.TITLE, text="Rich tables") @@ -1888,9 +1769,7 @@ def test_filter_pages(): orig_doc = DoclingDocument.load_from_json(src) doc = orig_doc.filter(page_nrs={2, 3, 5}) - html_data = doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, split_page_view=True - ) + html_data = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED, split_page_view=True) exp_json_file = src.with_name(f"{src.stem}_p2_p3_p5.gt.json") exp_html_file = exp_json_file.with_suffix(".html") @@ -1911,27 +1790,20 @@ def test_filter_pages(): def _create_doc_for_filtering(): doc = DoclingDocument( name="", - pages={ - i: PageItem(page_no=i, size=Size(width=100, height=100), image=None) - for i in range(1, 3) - }, + pages={i: PageItem(page_no=i, size=Size(width=100, height=100), image=None) for i in range(1, 3)}, ) p1_text = doc.add_text( text="Text 1", parent=doc.body, label=DocItemLabel.TEXT, - prov=ProvenanceItem( - page_no=1, bbox=BoundingBox(l=0, t=0, r=100, b=100), charspan=(0, 1) - ), + prov=ProvenanceItem(page_no=1, bbox=BoundingBox(l=0, t=0, r=100, b=100), charspan=(0, 1)), ) doc.add_group(parent=p1_text) doc.add_text( text="Text 2", parent=doc.body, label=DocItemLabel.TEXT, - prov=ProvenanceItem( - page_no=2, bbox=BoundingBox(l=0, t=0, r=100, b=100), charspan=(0, 1) - ), + prov=ProvenanceItem(page_no=2, bbox=BoundingBox(l=0, t=0, r=100, b=100), charspan=(0, 1)), ) return doc @@ -1950,9 +1822,7 @@ def test_filter_invalid_pages(): doc = _create_doc_for_filtering() with pytest.raises( ValueError, - match=re.escape( - "The following page numbers are not present in the document: {3}" - ), + match=re.escape("The following page numbers are not present in the document: {3}"), ): doc.filter(page_nrs={3}) diff --git a/test/test_doctags_load.py b/test/test_doctags_load.py index 5355c2d1..37cfe190 100644 --- a/test/test_doctags_load.py +++ b/test/test_doctags_load.py @@ -22,11 +22,7 @@ def verify(exp_file: Path, actual: dict): # as the test was flaky due to URIs def strip_image_uris(d): if isinstance(d, dict): - return { - k: strip_image_uris(v) - for k, v in d.items() - if k not in {"uri", "image_uri"} - } + return {k: strip_image_uris(v) for k, v in d.items() if k not in {"uri", "image_uri"}} elif isinstance(d, list): return [strip_image_uris(x) for x in d] else: @@ -34,9 +30,7 @@ def strip_image_uris(d): expected_stripped = strip_image_uris(expected) actual_stripped = strip_image_uris(actual) - assert ( - expected_stripped == actual_stripped - ), "Dicts differ (ignoring image URIs)" + assert expected_stripped == actual_stripped, "Dicts differ (ignoring image URIs)" if "data:image/png;base64" in str(expected): # check if the image URIs are the same @@ -44,7 +38,6 @@ def strip_image_uris(d): def test_doctags_load_from_files(): - doctags_doc = DocTagsDocument.from_doctags_and_image_pairs( [Path("test/data/doc/page_with_pic.dt")], [Path("test/data/doc/page_with_pic.png")], @@ -59,7 +52,6 @@ def test_doctags_load_from_files(): def test_doctags_load_from_memory(): - with Path("test/data/doc/page_with_pic.dt").open() as 
file: doctags = file.read() image = PILImage.open(Path("test/data/doc/page_with_pic.png")) @@ -155,11 +147,7 @@ def test_doctags_inline(): doctags_doc = DocTagsDocument.from_multipage_doctags_and_images( doctags=doctags, - images=[ - pil_img - for p in doc.pages - if (img_ref := doc.pages[p].image) and (pil_img := img_ref.pil_image) - ], + images=[pil_img for p in doc.pages if (img_ref := doc.pages[p].image) and (pil_img := img_ref.pil_image)], ) deser_doc = DoclingDocument.load_from_doctags(doctags_doc) diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py index 8e09a9f9..46c65d39 100644 --- a/test/test_hierarchical_chunker.py +++ b/test/test_hierarchical_chunker.py @@ -32,9 +32,7 @@ def test_chunk(): merge_list_items=True, ) chunks = chunker.chunk(dl_doc=dl_doc) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str="test/data/chunker/0_out_chunks.json", @@ -59,9 +57,7 @@ def get_serializer(self, doc: DoclingDocument): ) chunks = chunker.chunk(dl_doc=dl_doc) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str="test/data/chunker/0b_out_chunks.json", diff --git a/test/test_hybrid_chunker.py b/test/test_hybrid_chunker.py index 41075c75..21818c8c 100644 --- a/test/test_hybrid_chunker.py +++ b/test/test_hybrid_chunker.py @@ -53,9 +53,7 @@ def test_chunk_merge_peers(): chunk_iter = chunker.chunk(dl_doc=dl_doc) chunks = list(chunk_iter) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=EXPECTED_OUT_FILE, @@ -79,9 +77,7 @@ def test_chunk_with_model_name(): chunk_iter = chunker.chunk(dl_doc=dl_doc) chunks = list(chunk_iter) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=EXPECTED_OUT_FILE, @@ -102,9 +98,7 @@ def test_chunk_deprecated_max_tokens(): chunk_iter = chunker.chunk(dl_doc=dl_doc) chunks = list(chunk_iter) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=EXPECTED_OUT_FILE, @@ -160,9 +154,7 @@ def test_chunk_no_merge_peers(): ) chunks = chunker.chunk(dl_doc=dl_doc) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=EXPECTED_OUT_FILE, @@ -183,9 +175,7 @@ def test_chunk_deprecated_explicit_hf_obj(): chunk_iter = chunker.chunk(dl_doc=dl_doc) chunks = list(chunk_iter) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=EXPECTED_OUT_FILE, @@ -208,9 +198,7 @@ def test_ignore_deprecated_param_if_new_tokenizer_passed(): chunk_iter = chunker.chunk(dl_doc=dl_doc) chunks = 
list(chunk_iter) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=EXPECTED_OUT_FILE, @@ -232,9 +220,7 @@ def test_deprecated_no_max_tokens(): chunk_iter = chunker.chunk(dl_doc=dl_doc) chunks = list(chunk_iter) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=EXPECTED_OUT_FILE, @@ -300,9 +286,7 @@ def get_serializer(self, doc: DoclingDocument): chunk_iter = chunker.chunk(dl_doc=dl_doc) chunks = list(chunk_iter) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=EXPECTED_OUT_FILE, @@ -325,9 +309,7 @@ def test_chunk_openai(): chunk_iter = chunker.chunk(dl_doc=dl_doc) chunks = list(chunk_iter) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=EXPECTED_OUT_FILE, @@ -345,9 +327,7 @@ def test_chunk_default(): chunk_iter = chunker.chunk(dl_doc=dl_doc) chunks = list(chunk_iter) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=EXPECTED_OUT_FILE, @@ -370,9 +350,7 @@ def test_chunk_explicit(): chunk_iter = chunker.chunk(dl_doc=dl_doc) chunks = list(chunk_iter) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=EXPECTED_OUT_FILE, diff --git a/test/test_json_schema_to_search_mapper.py b/test/test_json_schema_to_search_mapper.py index 9a6acbe4..6061ebef 100644 --- a/test/test_json_schema_to_search_mapper.py +++ b/test/test_json_schema_to_search_mapper.py @@ -41,16 +41,14 @@ def test_json_schema_to_search_mapper_0(): assert index_def is not None - filename = os.path.abspath( - os.path.join(os.path.dirname(__file__), "data/json_schemas/document-ref.json") - ) + filename = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/json_schemas/document-ref.json")) index_ref = _load(filename) diff = jsondiff.diff(index_ref, index_def) # print(json.dumps(index_def, indent=2)) - assert ( - index_def == index_ref - ), f"Error in search mappings of ExportedCCSDocument. Difference:\n{json.dumps(diff, indent=2)}" + assert index_def == index_ref, ( + f"Error in search mappings of ExportedCCSDocument. Difference:\n{json.dumps(diff, indent=2)}" + ) def test_json_schema_to_search_mapper_1(): @@ -90,13 +88,9 @@ def test_json_schema_to_search_mapper_1(): assert index_def is not None - filename = os.path.abspath( - os.path.join(os.path.dirname(__file__), "data/json_schemas/dbrecord-ref.json") - ) + filename = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/json_schemas/dbrecord-ref.json")) index_ref = _load(filename) diff = jsondiff.diff(index_ref, index_def) # print(json.dumps(index_def, indent=2)) - assert ( - index_def == index_ref - ), f"Error in search mappings of Record. 
Difference:\n{json.dumps(diff, indent=2)}" + assert index_def == index_ref, f"Error in search mappings of Record. Difference:\n{json.dumps(diff, indent=2)}" diff --git a/test/test_metadata.py b/test/test_metadata.py index e73144c6..e739d497 100644 --- a/test/test_metadata.py +++ b/test/test_metadata.py @@ -41,9 +41,7 @@ def test_metadata_usage() -> None: # add a custom metadata object to the item value = CustomCoordinates(longitude=47.3769, latitude=8.5417) - target_name = example_item.meta.set_custom_field( - namespace="my_corp", name="coords", value=value - ) + target_name = example_item.meta.set_custom_field(namespace="my_corp", name="coords", value=value) assert target_name == "my_corp__coords" # save the document @@ -77,42 +75,24 @@ def test_namespace_absence_raises(): def _create_doc_with_group_with_metadata() -> DoclingDocument: doc = DoclingDocument(name="") - doc.body.meta = BaseMeta( - summary=SummaryMetaField(text="This document talks about various topics.") - ) + doc.body.meta = BaseMeta(summary=SummaryMetaField(text="This document talks about various topics.")) grp1 = doc.add_group(name="1", label=GroupLabel.CHAPTER) - grp1.meta = BaseMeta( - summary=SummaryMetaField(text="This chapter discusses foo and bar.") - ) - doc.add_text( - text="This is some introductory text.", label=DocItemLabel.TEXT, parent=grp1 - ) + grp1.meta = BaseMeta(summary=SummaryMetaField(text="This chapter discusses foo and bar.")) + doc.add_text(text="This is some introductory text.", label=DocItemLabel.TEXT, parent=grp1) grp1a = doc.add_group(parent=grp1, name="1a", label=GroupLabel.SECTION) - grp1a.meta = BaseMeta( - summary=SummaryMetaField(text="This section talks about foo.") - ) - grp1a.meta.set_custom_field( - namespace="my_corp", name="test_1", value="custom field value 1" - ) + grp1a.meta = BaseMeta(summary=SummaryMetaField(text="This section talks about foo.")) + grp1a.meta.set_custom_field(namespace="my_corp", name="test_1", value="custom field value 1") txt1 = doc.add_text(text="Regarding foo...", label=DocItemLabel.TEXT, parent=grp1a) - txt1.meta = BaseMeta( - summary=SummaryMetaField(text="This paragraph provides more details about foo.") - ) + txt1.meta = BaseMeta(summary=SummaryMetaField(text="This paragraph provides more details about foo.")) lst1a = doc.add_list_group(parent=grp1a) - lst1a.meta = BaseMeta( - summary=SummaryMetaField(text="Here some foo specifics are listed.") - ) + lst1a.meta = BaseMeta(summary=SummaryMetaField(text="Here some foo specifics are listed.")) doc.add_list_item(text="lorem", parent=lst1a, enumerated=True) doc.add_list_item(text="ipsum", parent=lst1a, enumerated=True) grp1b = doc.add_group(parent=grp1, name="1b", label=GroupLabel.SECTION) - grp1b.meta = BaseMeta( - summary=SummaryMetaField(text="This section talks about bar.") - ) - grp1b.meta.set_custom_field( - namespace="my_corp", name="test_2", value="custom field value 2" - ) + grp1b.meta = BaseMeta(summary=SummaryMetaField(text="This section talks about bar.")) + grp1b.meta.set_custom_field(namespace="my_corp", name="test_2", value="custom field value 2") doc.add_text(text="Regarding bar...", label=DocItemLabel.TEXT, parent=grp1b) return doc @@ -231,9 +211,7 @@ def test_md_ser_without_non_meta(): def test_ser_custom_meta_serializer(): - class SummaryMarkdownMetaSerializer(MarkdownMetaSerializer): - @override def serialize( self, @@ -249,15 +227,8 @@ def serialize( text="\n\n".join( [ f"{' ' * (level or 0)}[{item.self_ref}] [{item.__class__.__name__}:{item.label.value}] {tmp}" # type:ignore[attr-defined] 
- for key in ( - list(item.meta.__class__.model_fields) - + list(item.meta.get_custom_part()) - ) - if ( - tmp := self._serialize_meta_field( - item.meta, key, params.mark_meta - ) - ) + for key in (list(item.meta.__class__.model_fields) + list(item.meta.get_custom_part())) + if (tmp := self._serialize_meta_field(item.meta, key, params.mark_meta)) ] if item.meta else [] @@ -265,18 +236,10 @@ def serialize( span_source=item if isinstance(item, DocItem) else [], ) - def _serialize_meta_field( - self, meta: BaseMeta, name: str, mark_meta: bool - ) -> Optional[str]: - if (field_val := getattr(meta, name)) is not None and isinstance( - field_val, SummaryMetaField - ): + def _serialize_meta_field(self, meta: BaseMeta, name: str, mark_meta: bool) -> Optional[str]: + if (field_val := getattr(meta, name)) is not None and isinstance(field_val, SummaryMetaField): txt = field_val.text - return ( - f"[{self._humanize_text(name, title=True)}] {txt}" - if mark_meta - else txt - ) + return f"[{self._humanize_text(name, title=True)}] {txt}" if mark_meta else txt else: return None @@ -286,9 +249,7 @@ def _serialize_meta_field( params = MarkdownParams( include_non_meta=False, ) - ser = MarkdownDocSerializer( - doc=doc, params=params, meta_serializer=SummaryMarkdownMetaSerializer() - ) + ser = MarkdownDocSerializer(doc=doc, params=params, meta_serializer=SummaryMarkdownMetaSerializer()) ser_res = ser.serialize() actual = ser_res.text exp_file = Path("test/data/doc/group_with_metadata_summaries.md") diff --git a/test/test_otsl_table_export.py b/test/test_otsl_table_export.py index 84dd5005..54bbb520 100644 --- a/test/test_otsl_table_export.py +++ b/test/test_otsl_table_export.py @@ -2,7 +2,6 @@ def test_table_export_to_otsl(): - data_table_cells = [] num_cols = 6 num_rows = 5 @@ -271,9 +270,7 @@ def test_table_export_to_otsl(): data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=data_table_cells) doc.add_table(data=data) - otsl_string = doc.tables[0].export_to_otsl( - add_cell_location=False, add_cell_text=False, doc=doc - ) + otsl_string = doc.tables[0].export_to_otsl(add_cell_location=False, add_cell_text=False, doc=doc) otsl_string.split("") # print("OTSL out:") diff --git a/test/test_page.py b/test/test_page.py index 14b141da..72e50e00 100644 --- a/test/test_page.py +++ b/test/test_page.py @@ -207,8 +207,6 @@ (R_315_TL, 7 * np.pi / 4, 315), ], ) -def test_bounding_rectangle_angle( - rectangle: BoundingRectangle, expected_angle: float, expected_angle_360: int -): +def test_bounding_rectangle_angle(rectangle: BoundingRectangle, expected_angle: float, expected_angle_360: int): assert pytest.approx(rectangle.angle, abs=1e-6) == expected_angle assert pytest.approx(rectangle.angle_360, abs=1e-6) == expected_angle_360 diff --git a/test/test_page_chunker.py b/test/test_page_chunker.py index de280493..46f16f26 100644 --- a/test/test_page_chunker.py +++ b/test/test_page_chunker.py @@ -27,9 +27,7 @@ def test_page_chunks(): chunk_iter = chunker.chunk(dl_doc=doc) chunks = list(chunk_iter) - act_data = dict( - root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] - ) + act_data = dict(root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]) _process( act_data=act_data, exp_path_str=src.parent / f"{src.stem}_chunks.json", diff --git a/test/test_rec_schema.py b/test/test_rec_schema.py index 3d62b825..83ac93ce 100644 --- a/test/test_rec_schema.py +++ b/test/test_rec_schema.py @@ -37,7 +37,7 @@ def test_predicates_wrong(self): filename = "test/data/rec/error-predicate-02.json" with 
( - pytest.raises(ValidationError, match="geopoint_value.conf"), + pytest.raises(ValidationError, match="geopoint_value\\.conf"), open(filename, encoding="utf-8") as file_obj, ): file_json = file_obj.read() @@ -78,9 +78,7 @@ def test_subjects(self): def test_subjects2(self): """Validate data with Subject schema.""" # IdentifierTypeT, SubjectTypeT, SubjectNameTypeT - subject = Subject[ - Literal["db"], Literal["material"], Literal["chemical_name", "sum_formula"] - ] + subject = Subject[Literal["db"], Literal["material"], Literal["chemical_name", "sum_formula"]] for filename in glob.glob("test/data/rec/subject-*.json"): try: with open(filename, encoding="utf-8") as file_obj: @@ -93,9 +91,7 @@ def test_subjects2(self): def test_subjects_wrong(self): """Validate data with Subject schema.""" # IdentifierTypeT, SubjectTypeT, SubjectNameTypeT - subject = Subject[ - Literal["db_"], Literal["material"], Literal["chemical_name", "sum_formula"] - ] + subject = Subject[Literal["db_"], Literal["material"], Literal["chemical_name", "sum_formula"]] for filename in glob.glob("test/data/rec/subject-*.json"): with ( self.assertRaises(ValidationError), @@ -103,9 +99,7 @@ def test_subjects_wrong(self): ): file_json = file_obj.read() subject.model_validate_json(file_json) - subject = Subject[ - Literal["db"], Literal["material_"], Literal["chemical_name", "sum_formula"] - ] + subject = Subject[Literal["db"], Literal["material_"], Literal["chemical_name", "sum_formula"]] for filename in glob.glob("test/data/rec/subject-*.json"): with ( self.assertRaises(ValidationError), diff --git a/test/test_regions_to_table.py b/test/test_regions_to_table.py index 309e39fd..28611bd3 100644 --- a/test/test_regions_to_table.py +++ b/test/test_regions_to_table.py @@ -67,11 +67,11 @@ def test_regions_to_table_convert(): assert table_data.table_cells[0].bbox.b == 25.0 assert table_data.table_cells[0].col_span == 2 - assert table_data.table_cells[0].column_header == True - assert table_data.table_cells[1].column_header == True + assert table_data.table_cells[0].column_header + assert table_data.table_cells[1].column_header - assert table_data.table_cells[10].row_header == True - assert table_data.table_cells[12].row_section == True + assert table_data.table_cells[10].row_header + assert table_data.table_cells[12].row_section assert table_data.table_cells[17].bbox.l == 75.0 assert table_data.table_cells[17].bbox.t == 100.0 diff --git a/test/test_search_meta.py b/test/test_search_meta.py index d5b39daa..5904452e 100644 --- a/test/test_search_meta.py +++ b/test/test_search_meta.py @@ -12,9 +12,7 @@ def test_meta(): """Validate data with Meta schema.""" taxonomy = Literal["Public", "PI"] - domain = Literal[ - "Science", "Technology", "History", "Art", "Literature", "Geography" - ] + domain = Literal["Science", "Technology", "History", "Art", "Literature", "Geography"] for filename in glob.glob("test/data/search/meta-*.json"): try: diff --git a/test/test_serialization.py b/test/test_serialization.py index a783c410..a2ebb4f4 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -43,7 +43,7 @@ def verify(exp_file: Path, actual: str): # Normalize platform-dependent quote escaping for DocTags outputs name = exp_file.name - if name.endswith(".dt") or name.endswith(".idt.xml"): + if name.endswith((".dt", ".idt.xml")): def _normalize_quotes(s: str) -> str: return s.replace(""", '"').replace(""", '"') @@ -209,7 +209,6 @@ def test_md_list_item_markers(sample_doc): root_dir = Path("./test/data/doc") for mode in 
OrigListItemMarkerMode: for valid in [False, True]: - ser = MarkdownDocSerializer( doc=sample_doc, params=MarkdownParams( @@ -219,8 +218,7 @@ def test_md_list_item_markers(sample_doc): ) actual = ser.serialize().text verify( - root_dir - / f"constructed_mode_{str(mode.value).lower()}_valid_{str(valid).lower()}.gt.md", + root_dir / f"constructed_mode_{str(mode.value).lower()}_valid_{str(valid).lower()}.gt.md", actual=actual, ) @@ -265,9 +263,7 @@ def test_md_legacy_annotations_mark_true(sample_doc): exp_file = Path("./test/data/doc/constructed_legacy_annot_mark_true.gt.md") with pytest.warns(DeprecationWarning): sample_doc.tables[0].annotations.append( - DescriptionAnnotation( - text="This is a description of table 1.", provenance="foo" - ) + DescriptionAnnotation(text="This is a description of table 1.", provenance="foo") ) ser = MarkdownDocSerializer( doc=sample_doc, @@ -286,9 +282,7 @@ def test_md_legacy_annotations_mark_false(sample_doc): exp_file = Path("./test/data/doc/constructed_legacy_annot_mark_false.gt.md") with pytest.warns(DeprecationWarning): sample_doc.tables[0].annotations.append( - DescriptionAnnotation( - text="This is a description of table 1.", provenance="foo" - ) + DescriptionAnnotation(text="This is a description of table 1.", provenance="foo") ) ser = MarkdownDocSerializer( doc=sample_doc, @@ -521,7 +515,6 @@ def test_html_include_annotations_true(): def test_html_list_item_markers(sample_doc): root_dir = Path("./test/data/doc") for orig in [False, True]: - ser = HTMLDocSerializer( doc=sample_doc, params=HTMLParams( diff --git a/test/test_visualization.py b/test/test_visualization.py index ae74fa04..12e0ae8b 100644 --- a/test/test_visualization.py +++ b/test/test_visualization.py @@ -12,7 +12,7 @@ def verify(exp_file: Path, actual: PIL.Image.Image): if GEN_TEST_DATA: - with open(exp_file, "w", encoding="utf-8") as f: + with open(exp_file, "w", encoding="utf-8"): actual.save(exp_file) else: with PIL.Image.open(exp_file) as expected: @@ -72,14 +72,11 @@ def test_table_visualization_for_rows_and_cols(): src = Path("./test/data/doc/2408.09869v3_enriched.json") doc = DoclingDocument.load_from_json(src) - visualizer = TableVisualizer( - params=TableVisualizer.Params(show_cells=False, show_rows=True, show_cols=True) - ) + visualizer = TableVisualizer(params=TableVisualizer.Params(show_cells=False, show_rows=True, show_cols=True)) viz_pages = visualizer.get_visualization(doc=doc) verify( - exp_file=VIZ_TEST_DATA_PATH - / f"{src.stem}_table_viz_wout_lbl_p5_rows_and_cols.png", + exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_table_viz_wout_lbl_p5_rows_and_cols.png", actual=viz_pages[5], ) @@ -92,6 +89,6 @@ def test_cross_page_lists_with_branch_nums(): for i in range(2): verify( - exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_p{i+1}.png", + exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_p{i + 1}.png", actual=viz_pages[i + 1], ) diff --git a/uv.lock b/uv.lock index 30f766a2..04452c23 100644 --- a/uv.lock +++ b/uv.lock @@ -783,6 +783,7 @@ dev = [ { name = "pytest" }, { name = "pytest-cov" }, { name = "python-semantic-release" }, + { name = "ruff" }, { name = "types-setuptools" }, ] @@ -835,6 +836,7 @@ dev = [ { name = "pytest", specifier = "~=8.3" }, { name = "pytest-cov", specifier = ">=6.1.1" }, { name = "python-semantic-release", specifier = "~=7.32" }, + { name = "ruff", specifier = ">=0.14.8" }, { name = "types-setuptools", specifier = "~=70.3" }, ] @@ -3281,6 +3283,32 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ed/d2/4a73b18821fd4669762c855fd1f4e80ceb66fb72d71162d14da58444a763/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5d0145edba8abd3db0ab22b5300c99dc152f5c9021fab861be0f0544dc3cbc5f", size = 552199, upload-time = "2025-10-22T22:24:26.54Z" }, ] +[[package]] +name = "ruff" +version = "0.14.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ed/d9/f7a0c4b3a2bf2556cd5d99b05372c29980249ef71e8e32669ba77428c82c/ruff-0.14.8.tar.gz", hash = "sha256:774ed0dd87d6ce925e3b8496feb3a00ac564bea52b9feb551ecd17e0a23d1eed", size = 5765385, upload-time = "2025-12-04T15:06:17.669Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/b8/9537b52010134b1d2b72870cc3f92d5fb759394094741b09ceccae183fbe/ruff-0.14.8-py3-none-linux_armv6l.whl", hash = "sha256:ec071e9c82eca417f6111fd39f7043acb53cd3fde9b1f95bbed745962e345afb", size = 13441540, upload-time = "2025-12-04T15:06:14.896Z" }, + { url = "https://files.pythonhosted.org/packages/24/00/99031684efb025829713682012b6dd37279b1f695ed1b01725f85fd94b38/ruff-0.14.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:8cdb162a7159f4ca36ce980a18c43d8f036966e7f73f866ac8f493b75e0c27e9", size = 13669384, upload-time = "2025-12-04T15:06:51.809Z" }, + { url = "https://files.pythonhosted.org/packages/72/64/3eb5949169fc19c50c04f28ece2c189d3b6edd57e5b533649dae6ca484fe/ruff-0.14.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:2e2fcbefe91f9fad0916850edf0854530c15bd1926b6b779de47e9ab619ea38f", size = 12806917, upload-time = "2025-12-04T15:06:08.925Z" }, + { url = "https://files.pythonhosted.org/packages/c4/08/5250babb0b1b11910f470370ec0cbc67470231f7cdc033cee57d4976f941/ruff-0.14.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9d70721066a296f45786ec31916dc287b44040f553da21564de0ab4d45a869b", size = 13256112, upload-time = "2025-12-04T15:06:23.498Z" }, + { url = "https://files.pythonhosted.org/packages/78/4c/6c588e97a8e8c2d4b522c31a579e1df2b4d003eddfbe23d1f262b1a431ff/ruff-0.14.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2c87e09b3cd9d126fc67a9ecd3b5b1d3ded2b9c7fce3f16e315346b9d05cfb52", size = 13227559, upload-time = "2025-12-04T15:06:33.432Z" }, + { url = "https://files.pythonhosted.org/packages/23/ce/5f78cea13eda8eceac71b5f6fa6e9223df9b87bb2c1891c166d1f0dce9f1/ruff-0.14.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d62cb310c4fbcb9ee4ac023fe17f984ae1e12b8a4a02e3d21489f9a2a5f730c", size = 13896379, upload-time = "2025-12-04T15:06:02.687Z" }, + { url = "https://files.pythonhosted.org/packages/cf/79/13de4517c4dadce9218a20035b21212a4c180e009507731f0d3b3f5df85a/ruff-0.14.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1af35c2d62633d4da0521178e8a2641c636d2a7153da0bac1b30cfd4ccd91344", size = 15372786, upload-time = "2025-12-04T15:06:29.828Z" }, + { url = "https://files.pythonhosted.org/packages/00/06/33df72b3bb42be8a1c3815fd4fae83fa2945fc725a25d87ba3e42d1cc108/ruff-0.14.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:25add4575ffecc53d60eed3f24b1e934493631b48ebbc6ebaf9d8517924aca4b", size = 14990029, upload-time = "2025-12-04T15:06:36.812Z" }, + { url = "https://files.pythonhosted.org/packages/64/61/0f34927bd90925880394de0e081ce1afab66d7b3525336f5771dcf0cb46c/ruff-0.14.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4c943d847b7f02f7db4201a0600ea7d244d8a404fbb639b439e987edcf2baf9a", size = 
14407037, upload-time = "2025-12-04T15:06:39.979Z" }, + { url = "https://files.pythonhosted.org/packages/96/bc/058fe0aefc0fbf0d19614cb6d1a3e2c048f7dc77ca64957f33b12cfdc5ef/ruff-0.14.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb6e8bf7b4f627548daa1b69283dac5a296bfe9ce856703b03130732e20ddfe2", size = 14102390, upload-time = "2025-12-04T15:06:46.372Z" }, + { url = "https://files.pythonhosted.org/packages/af/a4/e4f77b02b804546f4c17e8b37a524c27012dd6ff05855d2243b49a7d3cb9/ruff-0.14.8-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:7aaf2974f378e6b01d1e257c6948207aec6a9b5ba53fab23d0182efb887a0e4a", size = 14230793, upload-time = "2025-12-04T15:06:20.497Z" }, + { url = "https://files.pythonhosted.org/packages/3f/52/bb8c02373f79552e8d087cedaffad76b8892033d2876c2498a2582f09dcf/ruff-0.14.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e5758ca513c43ad8a4ef13f0f081f80f08008f410790f3611a21a92421ab045b", size = 13160039, upload-time = "2025-12-04T15:06:49.06Z" }, + { url = "https://files.pythonhosted.org/packages/1f/ad/b69d6962e477842e25c0b11622548df746290cc6d76f9e0f4ed7456c2c31/ruff-0.14.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:f74f7ba163b6e85a8d81a590363bf71618847e5078d90827749bfda1d88c9cdf", size = 13205158, upload-time = "2025-12-04T15:06:54.574Z" }, + { url = "https://files.pythonhosted.org/packages/06/63/54f23da1315c0b3dfc1bc03fbc34e10378918a20c0b0f086418734e57e74/ruff-0.14.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:eed28f6fafcc9591994c42254f5a5c5ca40e69a30721d2ab18bb0bb3baac3ab6", size = 13469550, upload-time = "2025-12-04T15:05:59.209Z" }, + { url = "https://files.pythonhosted.org/packages/70/7d/a4d7b1961e4903bc37fffb7ddcfaa7beb250f67d97cfd1ee1d5cddb1ec90/ruff-0.14.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:21d48fa744c9d1cb8d71eb0a740c4dd02751a5de9db9a730a8ef75ca34cf138e", size = 14211332, upload-time = "2025-12-04T15:06:06.027Z" }, + { url = "https://files.pythonhosted.org/packages/5d/93/2a5063341fa17054e5c86582136e9895db773e3c2ffb770dde50a09f35f0/ruff-0.14.8-py3-none-win32.whl", hash = "sha256:15f04cb45c051159baebb0f0037f404f1dc2f15a927418f29730f411a79bc4e7", size = 13151890, upload-time = "2025-12-04T15:06:11.668Z" }, + { url = "https://files.pythonhosted.org/packages/02/1c/65c61a0859c0add13a3e1cbb6024b42de587456a43006ca2d4fd3d1618fe/ruff-0.14.8-py3-none-win_amd64.whl", hash = "sha256:9eeb0b24242b5bbff3011409a739929f497f3fb5fe3b5698aba5e77e8c833097", size = 14537826, upload-time = "2025-12-04T15:06:26.409Z" }, + { url = "https://files.pythonhosted.org/packages/6d/63/8b41cea3afd7f58eb64ac9251668ee0073789a3bc9ac6f816c8c6fef986d/ruff-0.14.8-py3-none-win_arm64.whl", hash = "sha256:965a582c93c63fe715fd3e3f8aa37c4b776777203d8e1d8aa3cc0c14424a4b99", size = 13634522, upload-time = "2025-12-04T15:06:43.212Z" }, +] + [[package]] name = "safetensors" version = "0.6.2"