mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 17:26:23 +02:00
refactor: make Azure Document Intelligence an internal LLAMACLOUD accelerator instead of a standalone ETL service
This commit is contained in:
parent
1fa8d1220b
commit
20fa93f0ba
9 changed files with 200 additions and 85 deletions
|
|
@ -124,16 +124,27 @@ _SERVICE_MAP: dict[str, frozenset[str]] = {
|
|||
"DOCLING": DOCLING_DOCUMENT_EXTENSIONS,
|
||||
"LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS,
|
||||
"UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS,
|
||||
"AZURE_DI": AZURE_DI_DOCUMENT_EXTENSIONS,
|
||||
}
|
||||
|
||||
|
||||
def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]:
    """Return the document extensions supported by *etl_service*.

    When *etl_service* is ``LLAMACLOUD`` and Azure Document Intelligence
    credentials are configured, the set is dynamically expanded to include
    Azure DI's supported extensions (e.g. ``.heif``).

    Falls back to the full union when the service is ``None`` or unknown.

    Args:
        etl_service: Name of the ETL service as keyed in ``_SERVICE_MAP``
            (e.g. ``"DOCLING"``, ``"LLAMACLOUD"``), or ``None``.

    Returns:
        The set of supported document extensions. The mapped set is never
        mutated; the LLAMACLOUD expansion builds a new set via union.
    """
    # ``None`` is normalised to "" so the lookup misses and yields the union.
    extensions = _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS)

    if etl_service == "LLAMACLOUD":
        # Function-scope import: config is only read on the LLAMACLOUD path.
        # NOTE(review): presumably also avoids an import cycle — confirm.
        from app.config import config as app_config

        # Both settings must be truthy; getattr with a default tolerates
        # config objects that do not define the Azure DI attributes at all.
        if getattr(app_config, "AZURE_DI_ENDPOINT", None) and getattr(
            app_config, "AZURE_DI_KEY", None
        ):
            extensions = extensions | AZURE_DI_DOCUMENT_EXTENSIONS

    return extensions
|
||||
|
||||
|
||||
def is_supported_document_extension(filename: str) -> bool:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue