feat: add processing mode support for document uploads and ETL pipeline, improved error-handling UX
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions

- Introduced a `ProcessingMode` enum to differentiate between basic and premium processing modes.
- Updated `EtlRequest` to include a `processing_mode` field, defaulting to basic.
- Enhanced ETL pipeline services to utilize the selected processing mode for Azure Document Intelligence and LlamaCloud parsing.
- Modified various routes and services to handle processing mode, affecting document upload and indexing tasks.
- Improved error handling and logging to include processing mode details.
- Added tests to validate processing mode functionality and its impact on ETL operations.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-04-14 21:26:00 -07:00
parent b659f41bab
commit 656e061f84
104 changed files with 1900 additions and 909 deletions

View file

@@ -10,7 +10,15 @@ BASE_DELAY = 10
MAX_DELAY = 120
async def parse_with_azure_doc_intelligence(file_path: str) -> str:
AZURE_MODEL_BY_MODE = {
"basic": "prebuilt-read",
"premium": "prebuilt-layout",
}
async def parse_with_azure_doc_intelligence(
file_path: str, processing_mode: str = "basic"
) -> str:
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential
@@ -21,9 +29,15 @@ async def parse_with_azure_doc_intelligence(file_path: str) -> str:
ServiceResponseError,
)
model_id = AZURE_MODEL_BY_MODE.get(processing_mode, "prebuilt-read")
file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
retryable_exceptions = (ServiceRequestError, ServiceResponseError)
logging.info(
f"Azure Document Intelligence using model={model_id} "
f"(mode={processing_mode}, file={file_size_mb:.1f}MB)"
)
last_exception = None
attempt_errors: list[str] = []
@@ -36,7 +50,7 @@ async def parse_with_azure_doc_intelligence(file_path: str) -> str:
async with client:
with open(file_path, "rb") as f:
poller = await client.begin_analyze_document(
"prebuilt-layout",
model_id,
body=f,
output_content_format=DocumentContentFormat.MARKDOWN,
)

View file

@@ -16,8 +16,15 @@ from app.etl_pipeline.constants import (
calculate_upload_timeout,
)
LLAMA_TIER_BY_MODE = {
"basic": "cost_effective",
"premium": "agentic_plus",
}
async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
async def parse_with_llamacloud(
file_path: str, estimated_pages: int, processing_mode: str = "basic"
) -> str:
from llama_cloud_services import LlamaParse
from llama_cloud_services.parse.utils import ResultType
@@ -34,10 +41,12 @@ async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
pool=120.0,
)
tier = LLAMA_TIER_BY_MODE.get(processing_mode, "cost_effective")
logging.info(
f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
f"job_timeout={job_timeout:.0f}s"
f"job_timeout={job_timeout:.0f}s, tier={tier} (mode={processing_mode})"
)
last_exception = None
@@ -56,6 +65,7 @@ async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
job_timeout_in_seconds=job_timeout,
job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
custom_client=custom_client,
tier=tier,
)
result = await parser.aparse(file_path)