docling-project · akanshajain231999 · Nov 13, 2025 · Nov 13, 2025
diff --git a/docling/cli/main.py b/docling/cli/main.py
@@ -357,6 +357,13 @@ def convert(  # noqa: C901
         AsrModelType,
         typer.Option(..., help="Choose the ASR model to use with audio/video files."),
     ] = AsrModelType.WHISPER_TINY,
+    asr_no_timing: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="Disable printing timing metadata in ASR transcription output.",
+        ),
+    ] = False,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -837,6 +844,10 @@ def convert(  # noqa: C901
             _log.error(f"{asr_model} is not known")
             raise ValueError(f"{asr_model} is not known")
 
+        # Apply timing metadata preference
+        if asr_pipeline_options.asr_options is not None:
+            asr_pipeline_options.asr_options.include_time_metadata = not asr_no_timing
+
         _log.debug(f"ASR pipeline_options: {asr_pipeline_options}")
 
         audio_format_option = AudioFormatOption(

diff --git a/docling/datamodel/pipeline_options_asr_model.py b/docling/datamodel/pipeline_options_asr_model.py
@@ -29,6 +29,7 @@ class InlineAsrOptions(BaseAsrOptions):
 
     verbose: bool = False
     timestamps: bool = True
+    include_time_metadata: bool = True  # Include timing metadata in output
 
     temperature: float = 0.0
     max_new_tokens: int = 256

diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py
@@ -85,10 +85,18 @@ def __eq__(self, other):
             return NotImplemented
         return self.start_time == other.start_time
 
-    def to_string(self) -> str:
-        """Format the conversation entry as a string"""
+    def to_string(self, include_time_metadata: bool = True) -> str:
+        """Format the conversation entry as a string
+
+        Args:
+            include_time_metadata: If True, include timing information in the output
+        """
         result = ""
-        if (self.start_time is not None) and (self.end_time is not None):
+        if (
+            include_time_metadata
+            and (self.start_time is not None)
+            and (self.end_time is not None)
+        ):
             result += f"[time: {self.start_time}-{self.end_time}] "
 
         if self.speaker is not None:
@@ -154,6 +162,7 @@ def __init__(
             self.verbose = asr_options.verbose
             self.timestamps = asr_options.timestamps
             self.word_timestamps = asr_options.word_timestamps
+            self.include_time_metadata = asr_options.include_time_metadata
 
     def run(self, conv_res: ConversionResult) -> ConversionResult:
         # Access the file path from the backend, similar to how other pipelines handle it
@@ -191,7 +200,10 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
 
             for citem in conversation:
                 conv_res.document.add_text(
-                    label=DocItemLabel.TEXT, text=citem.to_string()
+                    label=DocItemLabel.TEXT,
+                    text=citem.to_string(
+                        include_time_metadata=self.include_time_metadata
+                    ),
                 )
 
             return conv_res
@@ -281,6 +293,7 @@ def __init__(
             self.no_speech_threshold = asr_options.no_speech_threshold
             self.logprob_threshold = asr_options.logprob_threshold
             self.compression_ratio_threshold = asr_options.compression_ratio_threshold
+            self.include_time_metadata = asr_options.include_time_metadata
 
     def run(self, conv_res: ConversionResult) -> ConversionResult:
         audio_path: Path = Path(conv_res.input.file).resolve()
@@ -300,7 +313,10 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
 
             for citem in conversation:
                 conv_res.document.add_text(
-                    label=DocItemLabel.TEXT, text=citem.to_string()
+                    label=DocItemLabel.TEXT,
+                    text=citem.to_string(
+                        include_time_metadata=self.include_time_metadata
+                    ),
                 )
 
             conv_res.status = ConversionStatus.SUCCESS

diff --git a/tests/test_asr_pipeline.py b/tests/test_asr_pipeline.py
@@ -402,3 +402,86 @@ def test_mlx_run_success_and_failure(tmp_path):
         model2.mlx_whisper.transcribe.side_effect = RuntimeError("fail")
         out2 = model2.run(conv_res2)
         assert out2.status.name == "FAILURE"
+
+
+def test_asr_pipeline_without_time_metadata(test_audio_path):
+    """Check that include_time_metadata=False keeps "[time:" out of the text."""
+    # Build pipeline options; include_time_metadata is switched off below
+    pipeline_options = AsrPipelineOptions()
+    # Deep-copy the shared asr_model_specs.WHISPER_TINY instance so it is not mutated
+    pipeline_options.asr_options = asr_model_specs.WHISPER_TINY.model_copy(deep=True)
+    pipeline_options.asr_options.include_time_metadata = False
+
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.AUDIO: AudioFormatOption(
+                pipeline_cls=AsrPipeline,
+                pipeline_options=pipeline_options,
+            )
+        }
+    )
+
+    # Convert the file at test_audio_path
+    doc_result: ConversionResult = converter.convert(test_audio_path)
+
+    # The conversion must report SUCCESS
+    assert doc_result.status == ConversionStatus.SUCCESS
+
+    # The document must contain at least one text item
+    texts = doc_result.document.texts
+    assert len(texts) > 0
+
+    # No text item may contain the "[time:" marker
+    for text_item in texts:
+        assert "[time:" not in text_item.text, (
+            f"Time metadata found in text when it should be disabled: {text_item.text}"
+        )
+        # ...and each item must still hold non-whitespace text
+        assert text_item.text.strip() != "", "Text content should not be empty"
+
+
+def test_asr_pipeline_with_time_metadata_default(test_audio_path):
+    """Check that the default converter emits "[time:" in at least one item."""
+    # get_asr_converter() is assumed to keep include_time_metadata=True -- confirm
+    converter = get_asr_converter()
+
+    # Convert the file at test_audio_path
+    doc_result: ConversionResult = converter.convert(test_audio_path)
+
+    # The conversion must report SUCCESS
+    assert doc_result.status == ConversionStatus.SUCCESS
+
+    # The document must contain at least one text item
+    texts = doc_result.document.texts
+    assert len(texts) > 0
+
+    # At least one text item must contain the "[time:" marker (default behavior)
+    has_time_metadata = any("[time:" in text_item.text for text_item in texts)
+    assert has_time_metadata, "Time metadata should be present by default"
+
+
+def test_conversation_item_to_string_with_and_without_time():
+    """Check _ConversationItem.to_string() with include_time_metadata on, off, unset."""
+    from docling.pipeline.asr_pipeline import _ConversationItem
+
+    item = _ConversationItem(
+        text="Hello world", start_time=0.5, end_time=2.5, speaker="Alice"
+    )
+
+    # With time metadata explicitly enabled
+    result_with_time = item.to_string(include_time_metadata=True)
+    assert "[time: 0.5-2.5]" in result_with_time
+    assert "[speaker:Alice]" in result_with_time
+    assert "Hello world" in result_with_time
+
+    # With time metadata disabled: no "[time:" marker expected
+    result_without_time = item.to_string(include_time_metadata=False)
+    assert "[time:" not in result_without_time
+    assert (
+        "[speaker:Alice]" in result_without_time
+    )  # The speaker tag must not be affected by the flag
+    assert "Hello world" in result_without_time
+
+    # Omitting the argument must behave like include_time_metadata=True
+    result_default = item.to_string()
+    assert "[time: 0.5-2.5]" in result_default