refactor: make Azure Document Intelligence an internal LLAMACLOUD accelerator instead of a standalone ETL service

2026-05-09 07:42:39 +02:00 · 2026-04-08 03:26:24 +05:30 · 2026-04-08 03:26:24 +05:30 · 20fa93f0ba
commit 20fa93f0ba
parent 1fa8d1220b
9 changed files with 200 additions and 85 deletions
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@ -1,3 +1,5 @@
+import logging
+
 from app.config import config as app_config
 from app.etl_pipeline.etl_document import EtlRequest, EtlResult
 from app.etl_pipeline.exceptions import (
@ -56,7 +58,7 @@ class EtlPipelineService:
        if not etl_service:
            raise EtlServiceUnavailableError(
                "No ETL_SERVICE configured. "
-                "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, DOCLING, or AZURE_DI in your .env"
+                "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
            )

        ext = PurePosixPath(request.filename).suffix.lower()
@ -75,17 +77,7 @@ class EtlPipelineService:

            content = await parse_with_unstructured(request.file_path)
        elif etl_service == "LLAMACLOUD":
-            from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud
-
-            content = await parse_with_llamacloud(
-                request.file_path, request.estimated_pages
-            )
-        elif etl_service == "AZURE_DI":
-            from app.etl_pipeline.parsers.azure_doc_intelligence import (
-                parse_with_azure_doc_intelligence,
-            )
-
-            content = await parse_with_azure_doc_intelligence(request.file_path)
+            content = await self._extract_with_llamacloud(request)
        else:
            raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")

@ -94,3 +86,42 @@ class EtlPipelineService:
            etl_service=etl_service,
            content_type="document",
        )
+
+    async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
+        """Parse a document with LlamaCloud, trying Azure DI first when usable.
+
+        Azure Document Intelligence runs only when ``AZURE_DI_ENDPOINT`` and
+        ``AZURE_DI_KEY`` are set and the file extension is in
+        ``AZURE_DI_DOCUMENT_EXTENSIONS``; if it raises, the failure is logged
+        and LlamaCloud parses the file.  LlamaCloud errors propagate.
+        """
+        from pathlib import PurePosixPath
+
+        from app.utils.file_extensions import AZURE_DI_DOCUMENT_EXTENSIONS
+
+        ext = PurePosixPath(request.filename).suffix.lower()
+        azure_configured = bool(
+            getattr(app_config, "AZURE_DI_ENDPOINT", None)
+            and getattr(app_config, "AZURE_DI_KEY", None)
+        )
+
+        if azure_configured and ext in AZURE_DI_DOCUMENT_EXTENSIONS:
+            try:
+                from app.etl_pipeline.parsers.azure_doc_intelligence import (
+                    parse_with_azure_doc_intelligence,
+                )
+
+                return await parse_with_azure_doc_intelligence(request.file_path)
+            except Exception:
+                # Best-effort accelerator: any Azure DI failure (including a
+                # failed import) must not block parsing, so log and fall through.
+                logging.getLogger(__name__).warning(
+                    "Azure Document Intelligence failed for %s, "
+                    "falling back to LlamaCloud",
+                    request.filename,
+                    exc_info=True,
+                )
+
+        from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud
+
+        return await parse_with_llamacloud(request.file_path, request.estimated_pages)