diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 30d4e50c4b..1d8ebddf12 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -10,7 +10,7 @@
     ConfigDict,
     Field,
 )
-from typing_extensions import deprecated
+from typing_extensions import Annotated, deprecated
 
 from docling.datamodel import asr_model_specs, vlm_model_specs
 
@@ -79,18 +79,42 @@ class TableStructureOptions(BaseTableStructureOptions):
 class OcrOptions(BaseOptions):
     """OCR options."""
 
-    lang: List[str]
-    force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
-    bitmap_area_threshold: float = (
-        0.05  # percentage of the area for a bitmap to processed with OCR
-    )
+    lang: Annotated[
+        List[str],
+        Field(
+            description="List of OCR languages to use, in ISO codes (e.g., 'deu', 'eng').",
+            examples=[["deu", "eng"]],
+        ),
+    ]
+
+    force_full_page_ocr: Annotated[
+        bool,
+        Field(
+            description="If enabled, a full-page OCR is always applied.",
+            examples=[False],
+        ),
+    ] = False
+
+    bitmap_area_threshold: Annotated[
+        float,
+        Field(
+            description="Percentage of the page area for a bitmap to be processed with OCR.",
+            examples=[0.05, 0.1],
+        ),
+    ] = 0.05
 
 
 class OcrAutoOptions(OcrOptions):
     """Options for pick OCR engine automatically."""
 
     kind: ClassVar[Literal["auto"]] = "auto"
-    lang: List[str] = []
+    lang: Annotated[
+        List[str],
+        Field(
+            description="Default value is an empty list, i.e. no language selected",
+            examples=[["eng", "deu"]],
+        ),
+    ] = []
 
 
 class RapidOcrOptions(OcrOptions):