
Commit 076c3ad

feat: Add DeepSeek-OCR integration with Apple Silicon MPS support
- Add DeepSeekOcrModel with automatic device detection (CUDA/MPS)
- CUDA uses bfloat16 precision and flash_attention_2 (optimal)
- MPS uses float16 precision and eager attention (Apple Silicon fallback)
- Auto-switch to MPS-compatible model (Dogacel/DeepSeek-OCR-Metal-MPS)
- Add PyTorch 2.7.0+ version validation for MPS support
- Add clear error messages for device/version incompatibilities
- Update test_e2e_ocr_conversion.py with CUDA/MPS device support
- Update documentation with MPS support information

Note: MPS support requires PyTorch 2.7.0+ and is currently blocked by a transformers version incompatibility in the community MPS model.
See: https://huggingface.co/Dogacel/DeepSeek-OCR-Metal-MPS/discussions

Signed-off-by: Rafael T. C. Soares <[email protected]>
1 parent f80c903 commit 076c3ad
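
The commit message describes a CUDA/MPS selection strategy (bfloat16 + flash_attention_2 on CUDA, float16 + eager attention on MPS, a PyTorch 2.7.0+ check, and an auto-switch to the community MPS model). The sketch below only illustrates how that logic could look; the helper name _resolve_deepseek_runtime and its structure are assumptions for illustration, not the committed DeepSeekOcrModel code. The torch and packaging calls used are standard APIs.

# Illustrative sketch, not the committed DeepSeekOcrModel implementation.
import torch
from packaging.version import Version

CUDA_REPO = "deepseek-ai/DeepSeek-OCR"        # official model (CUDA)
MPS_REPO = "Dogacel/DeepSeek-OCR-Metal-MPS"   # community Apple Silicon fork

def _resolve_deepseek_runtime(repo_id: str = CUDA_REPO):
    """Return (device, dtype, attention implementation, model repo)."""
    if torch.cuda.is_available():
        # CUDA path: bfloat16 + flash_attention_2 for optimal performance.
        return "cuda", torch.bfloat16, "flash_attention_2", repo_id
    if torch.backends.mps.is_available():
        # MPS needs PyTorch 2.7.0+ (aten::_upsample_bicubic2d_aa operator).
        if Version(torch.__version__.split("+")[0]) < Version("2.7.0"):
            raise RuntimeError(
                f"DeepSeek-OCR on Apple Silicon requires PyTorch 2.7.0+, "
                f"found {torch.__version__}."
            )
        # MPS path: float16 + eager attention, MPS-compatible model fork.
        return "mps", torch.float16, "eager", MPS_REPO
    raise RuntimeError(
        "DeepSeek-OCR requires a CUDA or MPS device; use EasyOCR, Tesseract, "
        "or RapidOCR for CPU-only environments."
    )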

File tree: 7 files changed, +1397 −4 lines changed


docling/datamodel/pipeline_options.py

Lines changed: 64 additions & 0 deletions
@@ -203,6 +203,69 @@ class OcrMacOptions(OcrOptions):
     )
 
 
+class DeepSeekOcrOptions(OcrOptions):
+    """Options for the DeepSeek OCR engine.
+
+    DeepSeek-OCR is a Vision-Language Model (VLM) based OCR engine that uses
+    transformer models for document understanding and text extraction.
+    See: https://github.com/deepseek-ai/DeepSeek-OCR
+
+    Device Support:
+    - CUDA (NVIDIA GPU): Optimal performance with flash_attention_2 and bfloat16.
+      Uses the official 'deepseek-ai/DeepSeek-OCR' model.
+    - MPS (Apple Silicon M1/M2/M3/M4): Supported via MPS-compatible model fork.
+      Requires PyTorch 2.7.0+ for the aten::_upsample_bicubic2d_aa operator.
+      Automatically switches to 'Dogacel/DeepSeek-OCR-Metal-MPS' model with
+      float16 precision and eager attention.
+    - CPU: Not supported. Use EasyOcrOptions, TesseractOcrOptions, or
+      RapidOcrOptions for CPU-only environments.
+
+    Example:
+        >>> from docling.datamodel.pipeline_options import DeepSeekOcrOptions
+        >>> # Basic usage - auto-detects CUDA or MPS
+        >>> options = DeepSeekOcrOptions()
+        >>>
+        >>> # Custom prompt for specific OCR tasks
+        >>> options = DeepSeekOcrOptions(prompt="<image>\\nConvert to markdown.")
+    """
+
+    kind: ClassVar[Literal["deepseecocr"]] = "deepseecocr"
+
+    # DeepSeek-OCR is multilingual by default, no specific language configuration needed
+    lang: List[str] = []
+
+    # Model configuration
+    # Default is the official CUDA model; MPS users will auto-switch to MPS-compatible fork
+    repo_id: str = "deepseek-ai/DeepSeek-OCR"
+
+    # Prompt for OCR extraction
+    prompt: str = "<image>\nFree OCR."
+
+    # Image processing parameters
+    base_size: int = 1024
+    image_size: int = 640
+    crop_mode: bool = True
+
+    # Generation parameters
+    max_new_tokens: int = 4096
+    temperature: float = 0.0
+    do_sample: bool = False
+
+    # Trust remote code for loading the model
+    trust_remote_code: bool = True
+
+    # Attention implementation:
+    # - "flash_attention_2": Default for CUDA, optimal performance
+    # - "eager": Required for MPS (auto-selected), also works on CUDA
+    # - "sdpa": NOT supported by DeepSeek-OCR
+    attn_implementation: Optional[str] = None
+
+    model_config = ConfigDict(
+        extra="forbid",
+        protected_namespaces=(),
+    )
+
+
 class PictureDescriptionBaseOptions(BaseOptions):
     batch_size: int = 8
     scale: float = 2
@@ -273,6 +336,7 @@ class OcrEngine(str, Enum):
     TESSERACT = "tesseract"
     OCRMAC = "ocrmac"
     RAPIDOCR = "rapidocr"
+    DEEPSEECOCR = "deepseecocr"
 
 
 class PipelineOptions(BaseOptions):
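
For context, a usage sketch showing how the new options class would typically be wired into docling's PDF pipeline. The DocumentConverter / PdfPipelineOptions wiring follows docling's existing pattern for the other OcrOptions classes; the exact integration with DeepSeekOcrOptions is an assumption based on this diff, and "scanned_document.pdf" is a placeholder input path.

# Sketch: plugging DeepSeekOcrOptions into docling's standard PDF pipeline.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import DeepSeekOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
# Requires CUDA or MPS (PyTorch 2.7.0+); device, model repo, and attention
# backend are selected automatically per the docstring above.
pipeline_options.ocr_options = DeepSeekOcrOptions(prompt="<image>\nConvert to markdown.")

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("scanned_document.pdf")  # placeholder path
print(result.document.export_to_markdown())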
