Description
Hi,
I'm observing a consistent performance difference between invoking Docling's conversion pipeline directly in memory versus through the docling-serve
HTTP API (using the official CUDA-enabled container).
Direct Integration (In-Memory, FastAPI App)
In this setup, I initialize the DocumentConverter once and reuse it across requests within a FastAPI app. I'm explicitly using GPU acceleration (AcceleratorDevice.CUDA) and setting TableFormerMode.FAST.
Minimal Code:
ocr_options = TesseractOcrOptions()
device = (
AcceleratorDevice.MPS if sys.platform == "darwin"
else AcceleratorDevice.CUDA if self.cuda_available
else AcceleratorDevice.CPU
)
accelerator_options = AcceleratorOptions(
num_threads=2,
device=device,
)
pipeline_options = PdfPipelineOptions(
do_ocr=False,
ocr_options=ocr_options,
table_structure_options=TableStructureOptions(
mode=TableFormerMode.FAST, do_cell_matching=True
),
do_table_structure=True,
accelerator_options=accelerator_options,
artifacts_path=self.docling_artifacts_path or None,
)
converter = CustomDocumentConverter(
allowed_formats=DOCLING_ALLOWED_FORMATS,
format_options={
InputFormat.PDF: PdfFormatOption(
backend=PyPdfiumDocumentBackend,
pipeline_options=pipeline_options,
),
},
)
# Then invoked like this:
md_text = converter.convert()...
Performance: ~0.5s per document (GPU enabled, warm)
docling-serve
In contrast, calling the Docker container via HTTP (on the same machine with CUDA support) takes ~5s per similar document.
Code:
def _docling_serve_convert(self, path: str | Path, ocr: bool = False) -> list[str]:
with open(path, "rb") as f:
files = {
"files": (os.path.basename(path), f),
}
data = {
"pipeline": "standard",
"images_scale": "0",
"from_format": DOCLING_FILE_FORMATS,
"pdf_backend": "pypdfium2",
"image_export_mode": "placeholder",
"do_table_structure": "true",
"include_images": "false",
"table_mode": "fast",
"abort_on_error": "false",
"to_formats": "md",
"return_as_file": "false",
"picture_description_area_threshold": "0",
"document_timeout": "604800",
"md_page_break_placeholder": PAGE_BREAK_MARKER,
}
if ocr:
data.update({
"ocr_engine": "tessaract",
"force_ocr": "true",
"do_ocr": "true",
"ocr_lang": "en,fr,de,es",
})
logger.debug(
f"Sending request to Docling Serve at {self.docling_serve_url}/v1alpha/convert/file"
)
send_req_time = time.time()
response = requests.post(
f"{self.docling_serve_url}/v1alpha/convert/file", files=files, data=data
)
logger.debug(
f"Request to Docling Serve took {time.time() - send_req_time:.2f} seconds"
)
if not response.ok:
logger.error(
f"Docling response status: {response.status_code} "
f"Docling response content: {response.text}"
)
raise requests.HTTPError(response.text, response=response)
result = response.json()
md_text = result["document"]["md_content"]
return md_text.split(PAGE_BREAK_MARKER)
Performance: ~5s per document
Is docling-serve reloading models or doing extra setup per request? Or is there any known overhead in the way requests are handled that could explain this latency?