refactor: make Azure Document Intelligence an internal LLAMACLOUD accelerator instead of a standalone ETL service

This commit is contained in:
Anish Sarkar 2026-04-08 03:26:24 +05:30
parent 1fa8d1220b
commit 20fa93f0ba
9 changed files with 200 additions and 85 deletions

View file

@@ -124,16 +124,27 @@ _SERVICE_MAP: dict[str, frozenset[str]] = {
"DOCLING": DOCLING_DOCUMENT_EXTENSIONS,
"LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS,
"UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS,
"AZURE_DI": AZURE_DI_DOCUMENT_EXTENSIONS,
}
def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]:
    """Return the document extensions supported by *etl_service*.

    When *etl_service* is ``LLAMACLOUD`` and Azure Document Intelligence
    credentials are configured, the set is dynamically expanded to include
    Azure DI's supported extensions (e.g. ``.heif``).

    Falls back to the full union when the service is ``None`` or unknown.

    Args:
        etl_service: Service identifier (e.g. ``"DOCLING"``, ``"LLAMACLOUD"``,
            ``"UNSTRUCTURED"``, ``"AZURE_DI"``) or ``None``.

    Returns:
        The frozenset of supported file extensions for the service; the full
        ``DOCUMENT_EXTENSIONS`` union when the service is ``None`` or not in
        ``_SERVICE_MAP``.
    """
    extensions = _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS)
    if etl_service == "LLAMACLOUD":
        # Imported lazily — presumably to avoid a circular import at module
        # load time (config may import this module); TODO confirm.
        from app.config import config as app_config

        # Only expand the set when BOTH Azure DI credentials are configured;
        # a partial configuration leaves the base LLAMACLOUD set unchanged.
        if getattr(app_config, "AZURE_DI_ENDPOINT", None) and getattr(
            app_config, "AZURE_DI_KEY", None
        ):
            extensions = extensions | AZURE_DI_DOCUMENT_EXTENSIONS
    return extensions
def is_supported_document_extension(filename: str) -> bool: