diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 414640d6..56e1230d 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -5351,6 +5351,10 @@ def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]: coords = coords[:4] if len(coords) == 4: l, t, r, b = map(float, coords) + eps = 1 / 500 + # Ignore bounding boxes with width or height of <1e-3, including cases where l>r or t>b. + if r - l < eps or b - t < eps: + return None return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500) return None @@ -5366,12 +5370,13 @@ def extract_caption( if caption is not None: caption_content = caption.group(1) bbox = extract_bounding_box(caption_content) - caption_text = extract_inner_text(caption_content) - caption_item = doc.add_text( - label=DocItemLabel.CAPTION, - text=caption_text, - parent=None, - ) + if bbox is not None: + caption_text = extract_inner_text(caption_content) + caption_item = doc.add_text( + label=DocItemLabel.CAPTION, + text=caption_text, + parent=None, + ) else: caption_item = None bbox = None @@ -5634,17 +5639,20 @@ def _add_text( common_bbox = extract_bounding_box(content) for item_match in pattern.finditer(content): item_tag = item_match.group("tag") - _add_text( - full_chunk=item_match.group(0), - bbox=common_bbox, - pg_width=pg_width, - pg_height=pg_height, - page_no=page_no, - tag_name=item_tag, - doc_label=tag_to_doclabel.get(item_tag, DocItemLabel.TEXT), - doc=doc, - parent=inline_group, - ) + if common_bbox is not None: + _add_text( + full_chunk=item_match.group(0), + bbox=common_bbox, + pg_width=pg_width, + pg_height=pg_height, + page_no=page_no, + tag_name=item_tag, + doc_label=tag_to_doclabel.get( + item_tag, DocItemLabel.TEXT + ), + doc=doc, + parent=inline_group, + ) elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]: caption, caption_bbox = extract_caption(full_chunk) diff --git a/test/data/doc/2408.09869v3_enriched.dt.json b/test/data/doc/2408.09869v3_enriched.dt.json index 5b3c9eaf..53aa324e 100644 --- a/test/data/doc/2408.09869v3_enriched.dt.json +++ b/test/data/doc/2408.09869v3_enriched.dt.json @@ -4960,22 +4960,7 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 670.7520000000001, - "t": 887.0400000000001, - "r": 670.7520000000001, - "b": 899.7119999999999, - "coord_origin": "TOPLEFT" - }, - "charspan": [ - 0, - 1 - ] - } - ], + "prov": [], "orig": ",", "text": "," }, @@ -5041,22 +5026,7 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 692.784, - "t": 899.7119999999999, - "r": 692.784, - "b": 912.3839999999999, - "coord_origin": "TOPLEFT" - }, - "charspan": [ - 0, - 1 - ] - } - ], + "prov": [], "orig": ".", "text": "." }, @@ -15297,4 +15267,4 @@ "page_no": 9 } } -} +} \ No newline at end of file diff --git a/test/data/doc/2408.09869v3_enriched.out.dt.json b/test/data/doc/2408.09869v3_enriched.out.dt.json index fec32692..651cb82c 100644 --- a/test/data/doc/2408.09869v3_enriched.out.dt.json +++ b/test/data/doc/2408.09869v3_enriched.out.dt.json @@ -4960,22 +4960,7 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 670.7520000000001, - "t": 887.0400000000001, - "r": 670.7520000000001, - "b": 899.7119999999999, - "coord_origin": "TOPLEFT" - }, - "charspan": [ - 0, - 1 - ] - } - ], + "prov": [], "orig": ",", "text": "," }, @@ -5041,22 +5026,7 @@ "children": [], "content_layer": "body", "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 692.784, - "t": 899.7119999999999, - "r": 692.784, - "b": 912.3839999999999, - "coord_origin": "TOPLEFT" - }, - "charspan": [ - 0, - 1 - ] - } - ], + "prov": [], "orig": ".", "text": "." }, @@ -16044,4 +16014,4 @@ "page_no": 9 } } -} +} \ No newline at end of file diff --git a/test/data/doc/defect_bbox_page.dt b/test/data/doc/defect_bbox_page.dt new file mode 100644 index 00000000..c93c7a8e --- /dev/null +++ b/test/data/doc/defect_bbox_page.dt @@ -0,0 +1,4 @@ +Assistant: +This is valid text with a zero-height bounding box. +This is valid text with a negative-width bounding box. + \ No newline at end of file diff --git a/test/data/doc/defect_bbox_page.dt.json b/test/data/doc/defect_bbox_page.dt.json new file mode 100644 index 00000000..e176211f --- /dev/null +++ b/test/data/doc/defect_bbox_page.dt.json @@ -0,0 +1,66 @@ +{ + "body": { + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + } + ], + "content_layer": "body", + "label": "unspecified", + "name": "_root_", + "self_ref": "#/body" + }, + "form_items": [], + "furniture": { + "children": [], + "content_layer": "furniture", + "label": "unspecified", + "name": "_root_", + "self_ref": "#/furniture" + }, + "groups": [], + "key_value_items": [], + "name": "Document", + "pages": { + "1": { + "page_no": 1, + "size": { + "height": 1.0, + "width": 1.0 + } + } + }, + "pictures": [], + "schema_name": "DoclingDocument", + "tables": [], + "texts": [ + { + "children": [], + "content_layer": "body", + "label": "text", + "orig": "This is valid text with a zero-height bounding box.", + "parent": { + "$ref": "#/body" + }, + "prov": [], + "self_ref": "#/texts/0", + "text": "This is valid text with a zero-height bounding box." + }, + { + "children": [], + "content_layer": "body", + "label": "text", + "orig": "This is valid text with a negative-width bounding box.", + "parent": { + "$ref": "#/body" + }, + "prov": [], + "self_ref": "#/texts/1", + "text": "This is valid text with a negative-width bounding box." + } + ], + "version": "1.8.0" +} \ No newline at end of file diff --git a/test/test_doctags_load.py b/test/test_doctags_load.py index 5355c2d1..c24989d4 100644 --- a/test/test_doctags_load.py +++ b/test/test_doctags_load.py @@ -168,3 +168,17 @@ def test_doctags_inline(): exp_file=exp, actual=deser_doc.export_to_dict(), ) + + +def test_doctags_handle_defect_bbox(): + + doctags_doc = DocTagsDocument.from_doctags_and_image_pairs( + [Path("test/data/doc/defect_bbox_page.dt")], None + ) + + doc = DoclingDocument.load_from_doctags(doctags_doc) + exp = "test/data/doc/defect_bbox_page.dt.json" + verify( + exp_file=exp, + actual=doc.export_to_dict(), + )