feat: add processing mode support for document uploads and ETL pipeline, improved error-handling UX
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions

- Introduced a `ProcessingMode` enum to differentiate between basic and premium processing modes.
- Updated `EtlRequest` to include a `processing_mode` field, defaulting to basic.
- Enhanced ETL pipeline services to utilize the selected processing mode for Azure Document Intelligence and LlamaCloud parsing.
- Modified various routes and services to handle processing mode, affecting document upload and indexing tasks.
- Improved error handling and logging to include processing mode details.
- Added tests to validate processing mode functionality and its impact on ETL operations.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-04-14 21:26:00 -07:00
parent b659f41bab
commit 656e061f84
104 changed files with 1900 additions and 909 deletions

View file

@@ -10,7 +10,15 @@ BASE_DELAY = 10
MAX_DELAY = 120
async def parse_with_azure_doc_intelligence(file_path: str) -> str:
AZURE_MODEL_BY_MODE = {
"basic": "prebuilt-read",
"premium": "prebuilt-layout",
}
async def parse_with_azure_doc_intelligence(
file_path: str, processing_mode: str = "basic"
) -> str:
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential
@@ -21,9 +29,15 @@ async def parse_with_azure_doc_intelligence(file_path: str) -> str:
ServiceResponseError,
)
model_id = AZURE_MODEL_BY_MODE.get(processing_mode, "prebuilt-read")
file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
retryable_exceptions = (ServiceRequestError, ServiceResponseError)
logging.info(
f"Azure Document Intelligence using model={model_id} "
f"(mode={processing_mode}, file={file_size_mb:.1f}MB)"
)
last_exception = None
attempt_errors: list[str] = []
@@ -36,7 +50,7 @@ async def parse_with_azure_doc_intelligence(file_path: str) -> str:
async with client:
with open(file_path, "rb") as f:
poller = await client.begin_analyze_document(
"prebuilt-layout",
model_id,
body=f,
output_content_format=DocumentContentFormat.MARKDOWN,
)

View file

@@ -16,8 +16,15 @@ from app.etl_pipeline.constants import (
calculate_upload_timeout,
)
LLAMA_TIER_BY_MODE = {
"basic": "cost_effective",
"premium": "agentic_plus",
}
async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
async def parse_with_llamacloud(
file_path: str, estimated_pages: int, processing_mode: str = "basic"
) -> str:
from llama_cloud_services import LlamaParse
from llama_cloud_services.parse.utils import ResultType
@@ -34,10 +41,12 @@ async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
pool=120.0,
)
tier = LLAMA_TIER_BY_MODE.get(processing_mode, "cost_effective")
logging.info(
f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
f"job_timeout={job_timeout:.0f}s"
f"job_timeout={job_timeout:.0f}s, tier={tier} (mode={processing_mode})"
)
last_exception = None
@@ -56,6 +65,7 @@ async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
job_timeout_in_seconds=job_timeout,
job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
custom_client=custom_client,
tier=tier,
)
result = await parser.aparse(file_path)