diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 414640d6..56e1230d 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -5351,6 +5351,10 @@ def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
coords = coords[:4]
if len(coords) == 4:
l, t, r, b = map(float, coords)
+ eps = 1 / 500
+ # Ignore boxes whose raw width or height is below eps (= 1/500), including cases where l>r or t>b.
+ if r - l < eps or b - t < eps:
+ return None
return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
return None
@@ -5366,12 +5370,13 @@ def extract_caption(
if caption is not None:
caption_content = caption.group(1)
bbox = extract_bounding_box(caption_content)
- caption_text = extract_inner_text(caption_content)
- caption_item = doc.add_text(
- label=DocItemLabel.CAPTION,
- text=caption_text,
- parent=None,
- )
+ if bbox is not None:
+ caption_text = extract_inner_text(caption_content)
+ caption_item = doc.add_text(
+ label=DocItemLabel.CAPTION,
+ text=caption_text,
+ parent=None,
+ )
else:
caption_item = None
bbox = None
@@ -5634,17 +5639,20 @@ def _add_text(
common_bbox = extract_bounding_box(content)
for item_match in pattern.finditer(content):
item_tag = item_match.group("tag")
- _add_text(
- full_chunk=item_match.group(0),
- bbox=common_bbox,
- pg_width=pg_width,
- pg_height=pg_height,
- page_no=page_no,
- tag_name=item_tag,
- doc_label=tag_to_doclabel.get(item_tag, DocItemLabel.TEXT),
- doc=doc,
- parent=inline_group,
- )
+ if common_bbox is not None:
+ _add_text(
+ full_chunk=item_match.group(0),
+ bbox=common_bbox,
+ pg_width=pg_width,
+ pg_height=pg_height,
+ page_no=page_no,
+ tag_name=item_tag,
+ doc_label=tag_to_doclabel.get(
+ item_tag, DocItemLabel.TEXT
+ ),
+ doc=doc,
+ parent=inline_group,
+ )
elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
caption, caption_bbox = extract_caption(full_chunk)
diff --git a/test/data/doc/2408.09869v3_enriched.dt.json b/test/data/doc/2408.09869v3_enriched.dt.json
index 5b3c9eaf..53aa324e 100644
--- a/test/data/doc/2408.09869v3_enriched.dt.json
+++ b/test/data/doc/2408.09869v3_enriched.dt.json
@@ -4960,22 +4960,7 @@
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [
- {
- "page_no": 9,
- "bbox": {
- "l": 670.7520000000001,
- "t": 887.0400000000001,
- "r": 670.7520000000001,
- "b": 899.7119999999999,
- "coord_origin": "TOPLEFT"
- },
- "charspan": [
- 0,
- 1
- ]
- }
- ],
+ "prov": [],
"orig": ",",
"text": ","
},
@@ -5041,22 +5026,7 @@
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [
- {
- "page_no": 9,
- "bbox": {
- "l": 692.784,
- "t": 899.7119999999999,
- "r": 692.784,
- "b": 912.3839999999999,
- "coord_origin": "TOPLEFT"
- },
- "charspan": [
- 0,
- 1
- ]
- }
- ],
+ "prov": [],
"orig": ".",
"text": "."
},
@@ -15297,4 +15267,4 @@
"page_no": 9
}
}
-}
+}
\ No newline at end of file
diff --git a/test/data/doc/2408.09869v3_enriched.out.dt.json b/test/data/doc/2408.09869v3_enriched.out.dt.json
index fec32692..651cb82c 100644
--- a/test/data/doc/2408.09869v3_enriched.out.dt.json
+++ b/test/data/doc/2408.09869v3_enriched.out.dt.json
@@ -4960,22 +4960,7 @@
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [
- {
- "page_no": 9,
- "bbox": {
- "l": 670.7520000000001,
- "t": 887.0400000000001,
- "r": 670.7520000000001,
- "b": 899.7119999999999,
- "coord_origin": "TOPLEFT"
- },
- "charspan": [
- 0,
- 1
- ]
- }
- ],
+ "prov": [],
"orig": ",",
"text": ","
},
@@ -5041,22 +5026,7 @@
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [
- {
- "page_no": 9,
- "bbox": {
- "l": 692.784,
- "t": 899.7119999999999,
- "r": 692.784,
- "b": 912.3839999999999,
- "coord_origin": "TOPLEFT"
- },
- "charspan": [
- 0,
- 1
- ]
- }
- ],
+ "prov": [],
"orig": ".",
"text": "."
},
@@ -16044,4 +16014,4 @@
"page_no": 9
}
}
-}
+}
\ No newline at end of file
diff --git a/test/data/doc/defect_bbox_page.dt b/test/data/doc/defect_bbox_page.dt
new file mode 100644
index 00000000..c93c7a8e
--- /dev/null
+++ b/test/data/doc/defect_bbox_page.dt
@@ -0,0 +1,4 @@
+Assistant:
+This is valid text with a zero-height bounding box.
+This is valid text with a negative-width bounding box.
+
\ No newline at end of file
diff --git a/test/data/doc/defect_bbox_page.dt.json b/test/data/doc/defect_bbox_page.dt.json
new file mode 100644
index 00000000..e176211f
--- /dev/null
+++ b/test/data/doc/defect_bbox_page.dt.json
@@ -0,0 +1,66 @@
+{
+ "body": {
+ "children": [
+ {
+ "$ref": "#/texts/0"
+ },
+ {
+ "$ref": "#/texts/1"
+ }
+ ],
+ "content_layer": "body",
+ "label": "unspecified",
+ "name": "_root_",
+ "self_ref": "#/body"
+ },
+ "form_items": [],
+ "furniture": {
+ "children": [],
+ "content_layer": "furniture",
+ "label": "unspecified",
+ "name": "_root_",
+ "self_ref": "#/furniture"
+ },
+ "groups": [],
+ "key_value_items": [],
+ "name": "Document",
+ "pages": {
+ "1": {
+ "page_no": 1,
+ "size": {
+ "height": 1.0,
+ "width": 1.0
+ }
+ }
+ },
+ "pictures": [],
+ "schema_name": "DoclingDocument",
+ "tables": [],
+ "texts": [
+ {
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "orig": "This is valid text with a zero-height bounding box.",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "prov": [],
+ "self_ref": "#/texts/0",
+ "text": "This is valid text with a zero-height bounding box."
+ },
+ {
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "orig": "This is valid text with a negative-width bounding box.",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "prov": [],
+ "self_ref": "#/texts/1",
+ "text": "This is valid text with a negative-width bounding box."
+ }
+ ],
+ "version": "1.8.0"
+}
\ No newline at end of file
diff --git a/test/test_doctags_load.py b/test/test_doctags_load.py
index 5355c2d1..c24989d4 100644
--- a/test/test_doctags_load.py
+++ b/test/test_doctags_load.py
@@ -168,3 +168,17 @@ def test_doctags_inline():
exp_file=exp,
actual=deser_doc.export_to_dict(),
)
+
+
+def test_doctags_handle_defect_bbox():
+ # Zero-height and negative-width boxes must load as text items with empty prov.
+ doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+ [Path("test/data/doc/defect_bbox_page.dt")], None
+ )
+
+ doc = DoclingDocument.load_from_doctags(doctags_doc)
+ exp = "test/data/doc/defect_bbox_page.dt.json"
+ verify(
+ exp_file=exp,
+ actual=doc.export_to_dict(),
+ )