mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-09 07:42:39 +02:00
refactor: make Azure Document Intelligence an internal LLAMACLOUD accelerator instead of a standalone ETL service
This commit is contained in:
parent
1fa8d1220b
commit
20fa93f0ba
9 changed files with 200 additions and 85 deletions
|
|
@ -1,3 +1,5 @@
|
|||
import logging
|
||||
|
||||
from app.config import config as app_config
|
||||
from app.etl_pipeline.etl_document import EtlRequest, EtlResult
|
||||
from app.etl_pipeline.exceptions import (
|
||||
|
|
@ -56,7 +58,7 @@ class EtlPipelineService:
|
|||
if not etl_service:
|
||||
raise EtlServiceUnavailableError(
|
||||
"No ETL_SERVICE configured. "
|
||||
"Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, DOCLING, or AZURE_DI in your .env"
|
||||
"Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
|
||||
)
|
||||
|
||||
ext = PurePosixPath(request.filename).suffix.lower()
|
||||
|
|
@ -75,17 +77,7 @@ class EtlPipelineService:
|
|||
|
||||
content = await parse_with_unstructured(request.file_path)
|
||||
elif etl_service == "LLAMACLOUD":
|
||||
from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud
|
||||
|
||||
content = await parse_with_llamacloud(
|
||||
request.file_path, request.estimated_pages
|
||||
)
|
||||
elif etl_service == "AZURE_DI":
|
||||
from app.etl_pipeline.parsers.azure_doc_intelligence import (
|
||||
parse_with_azure_doc_intelligence,
|
||||
)
|
||||
|
||||
content = await parse_with_azure_doc_intelligence(request.file_path)
|
||||
content = await self._extract_with_llamacloud(request)
|
||||
else:
|
||||
raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
|
||||
|
||||
|
|
@ -94,3 +86,42 @@ class EtlPipelineService:
|
|||
etl_service=etl_service,
|
||||
content_type="document",
|
||||
)
|
||||
|
||||
async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
    """Extract document text, trying Azure Document Intelligence before LlamaCloud.

    Azure DI is an internal accelerator: cheaper and faster for its supported
    file types. It is attempted only when both ``AZURE_DI_ENDPOINT`` and
    ``AZURE_DI_KEY`` are set to truthy values on the app config AND the
    lower-cased extension of ``request.filename`` is in
    ``AZURE_DI_DOCUMENT_EXTENSIONS``. In every other case LlamaCloud is
    called directly. If the Azure DI attempt raises any ``Exception``, the
    failure is logged as a warning (with traceback) and LlamaCloud is used
    as the fallback.

    Args:
        request: The ETL request. This method reads ``filename``,
            ``file_path`` and ``estimated_pages`` from it.

    Returns:
        Whatever the parser that handled the file returned.

    Raises:
        Any exception raised by the LlamaCloud parser; Azure DI failures
        are swallowed in favour of the fallback.
    """
    from pathlib import PurePosixPath

    from app.utils.file_extensions import AZURE_DI_DOCUMENT_EXTENSIONS

    ext = PurePosixPath(request.filename).suffix.lower()
    # getattr with a default keeps this safe when the settings are not
    # defined on the config object at all.
    azure_configured = bool(
        getattr(app_config, "AZURE_DI_ENDPOINT", None)
        and getattr(app_config, "AZURE_DI_KEY", None)
    )

    if azure_configured and ext in AZURE_DI_DOCUMENT_EXTENSIONS:
        try:
            # Imported inside the try so that an import failure of the
            # Azure parser also falls back to LlamaCloud.
            from app.etl_pipeline.parsers.azure_doc_intelligence import (
                parse_with_azure_doc_intelligence,
            )

            return await parse_with_azure_doc_intelligence(request.file_path)
        except Exception:
            # Deliberate best-effort: Azure DI is only an accelerator, so
            # any failure is reported and LlamaCloud takes over. A
            # module-named logger is used (rather than the root logger) so
            # the record carries this module's name and honours per-module
            # logging configuration.
            logging.getLogger(__name__).warning(
                "Azure Document Intelligence failed for %s, "
                "falling back to LlamaCloud",
                request.filename,
                exc_info=True,
            )

    from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud

    return await parse_with_llamacloud(
        request.file_path, request.estimated_pages
    )
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue