Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,13 @@ def convert( # noqa: C901
AsrModelType,
typer.Option(..., help="Choose the ASR model to use with audio/video files."),
] = AsrModelType.WHISPER_TINY,
asr_no_timing: Annotated[
bool,
typer.Option(
...,
help="Disable printing timing metadata in ASR transcription output.",
),
] = False,
ocr: Annotated[
bool,
typer.Option(
Expand Down Expand Up @@ -837,6 +844,10 @@ def convert( # noqa: C901
_log.error(f"{asr_model} is not known")
raise ValueError(f"{asr_model} is not known")

# Apply timing metadata preference
if asr_pipeline_options.asr_options is not None:
asr_pipeline_options.asr_options.include_time_metadata = not asr_no_timing

_log.debug(f"ASR pipeline_options: {asr_pipeline_options}")

audio_format_option = AudioFormatOption(
Expand Down
1 change: 1 addition & 0 deletions docling/datamodel/pipeline_options_asr_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class InlineAsrOptions(BaseAsrOptions):

verbose: bool = False
timestamps: bool = True
include_time_metadata: bool = True # Include timing metadata in output

temperature: float = 0.0
max_new_tokens: int = 256
Expand Down
26 changes: 21 additions & 5 deletions docling/pipeline/asr_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,18 @@ def __eq__(self, other):
return NotImplemented
return self.start_time == other.start_time

def to_string(self) -> str:
"""Format the conversation entry as a string"""
def to_string(self, include_time_metadata: bool = True) -> str:
"""Format the conversation entry as a string

Args:
include_time_metadata: If True, include timing information in the output
"""
result = ""
if (self.start_time is not None) and (self.end_time is not None):
if (
include_time_metadata
and (self.start_time is not None)
and (self.end_time is not None)
):
result += f"[time: {self.start_time}-{self.end_time}] "

if self.speaker is not None:
Expand Down Expand Up @@ -154,6 +162,7 @@ def __init__(
self.verbose = asr_options.verbose
self.timestamps = asr_options.timestamps
self.word_timestamps = asr_options.word_timestamps
self.include_time_metadata = asr_options.include_time_metadata

def run(self, conv_res: ConversionResult) -> ConversionResult:
# Access the file path from the backend, similar to how other pipelines handle it
Expand Down Expand Up @@ -191,7 +200,10 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:

for citem in conversation:
conv_res.document.add_text(
label=DocItemLabel.TEXT, text=citem.to_string()
label=DocItemLabel.TEXT,
text=citem.to_string(
include_time_metadata=self.include_time_metadata
),
)

return conv_res
Expand Down Expand Up @@ -281,6 +293,7 @@ def __init__(
self.no_speech_threshold = asr_options.no_speech_threshold
self.logprob_threshold = asr_options.logprob_threshold
self.compression_ratio_threshold = asr_options.compression_ratio_threshold
self.include_time_metadata = asr_options.include_time_metadata

def run(self, conv_res: ConversionResult) -> ConversionResult:
audio_path: Path = Path(conv_res.input.file).resolve()
Expand All @@ -300,7 +313,10 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:

for citem in conversation:
conv_res.document.add_text(
label=DocItemLabel.TEXT, text=citem.to_string()
label=DocItemLabel.TEXT,
text=citem.to_string(
include_time_metadata=self.include_time_metadata
),
)

conv_res.status = ConversionStatus.SUCCESS
Expand Down
83 changes: 83 additions & 0 deletions tests/test_asr_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,3 +402,86 @@ def test_mlx_run_success_and_failure(tmp_path):
model2.mlx_whisper.transcribe.side_effect = RuntimeError("fail")
out2 = model2.run(conv_res2)
assert out2.status.name == "FAILURE"


def test_asr_pipeline_without_time_metadata(test_audio_path):
    """Run the ASR pipeline with ``include_time_metadata`` switched off.

    The conversion must succeed and yield at least one text item; no item
    may contain the ``[time:`` marker, and no item may be blank.
    """
    # Work on a deep copy so the shared WHISPER_TINY spec is never mutated.
    options = AsrPipelineOptions()
    options.asr_options = asr_model_specs.WHISPER_TINY.model_copy(deep=True)
    options.asr_options.include_time_metadata = False

    audio_option = AudioFormatOption(
        pipeline_cls=AsrPipeline,
        pipeline_options=options,
    )
    converter = DocumentConverter(format_options={InputFormat.AUDIO: audio_option})

    doc_result: ConversionResult = converter.convert(test_audio_path)
    assert doc_result.status == ConversionStatus.SUCCESS

    # There has to be some transcribed content to inspect.
    texts = doc_result.document.texts
    assert len(texts) > 0

    for text_item in texts:
        # The timing prefix must be gone ...
        assert "[time:" not in text_item.text, (
            f"Time metadata found in text when it should be disabled: {text_item.text}"
        )
        # ... while the transcription itself is still there.
        assert text_item.text.strip() != "", "Text content should not be empty"


def test_asr_pipeline_with_time_metadata_default(test_audio_path):
    """Run the ASR pipeline with its default converter settings.

    The conversion must succeed, produce at least one text item, and at
    least one of those items must carry the ``[time:`` marker.
    """
    # No explicit options: the converter comes straight from the shared helper.
    doc_result: ConversionResult = get_asr_converter().convert(test_audio_path)
    assert doc_result.status == ConversionStatus.SUCCESS

    produced = doc_result.document.texts
    assert len(produced) > 0

    # One flag per text item: does it contain the timing prefix?
    carries_marker = ["[time:" in entry.text for entry in produced]
    assert any(carries_marker), "Time metadata should be present by default"


def test_conversation_item_to_string_with_and_without_time():
    """Exercise ``_ConversationItem.to_string()`` for each ``include_time_metadata`` mode.

    Checks the explicit ``True`` and ``False`` cases and the call with no
    argument, which is expected to behave like ``True``.
    """
    from docling.pipeline.asr_pipeline import _ConversationItem

    entry = _ConversationItem(
        text="Hello world",
        start_time=0.5,
        end_time=2.5,
        speaker="Alice",
    )

    # Explicitly enabled: time prefix, speaker tag and text are all emitted.
    rendered_timed = entry.to_string(include_time_metadata=True)
    for fragment in ("[time: 0.5-2.5]", "[speaker:Alice]", "Hello world"):
        assert fragment in rendered_timed

    # Disabled: only the time prefix disappears; speaker and text remain.
    rendered_plain = entry.to_string(include_time_metadata=False)
    assert "[time:" not in rendered_plain
    for fragment in ("[speaker:Alice]", "Hello world"):
        assert fragment in rendered_plain

    # Backward compatibility: omitting the argument keeps the time prefix.
    rendered_default = entry.to_string()
    assert "[time: 0.5-2.5]" in rendered_default