mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-28 10:26:33 +02:00
feat: add support for Azure Document Intelligence in ETL pipeline
This commit is contained in:
parent
73a9c5fbd1
commit
1fa8d1220b
10 changed files with 248 additions and 5 deletions
|
|
@ -93,6 +93,22 @@ UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
|
|||
}
|
||||
)
|
||||
|
||||
AZURE_DI_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
|
||||
{
|
||||
".pdf",
|
||||
".docx",
|
||||
".xlsx",
|
||||
".pptx",
|
||||
".png",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".bmp",
|
||||
".tiff",
|
||||
".tif",
|
||||
".heif",
|
||||
}
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Union (used by classify_file for routing) + service lookup
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -101,12 +117,14 @@ DOCUMENT_EXTENSIONS: frozenset[str] = (
|
|||
DOCLING_DOCUMENT_EXTENSIONS
|
||||
| LLAMAPARSE_DOCUMENT_EXTENSIONS
|
||||
| UNSTRUCTURED_DOCUMENT_EXTENSIONS
|
||||
| AZURE_DI_DOCUMENT_EXTENSIONS
|
||||
)
|
||||
|
||||
_SERVICE_MAP: dict[str, frozenset[str]] = {
|
||||
"DOCLING": DOCLING_DOCUMENT_EXTENSIONS,
|
||||
"LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS,
|
||||
"UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS,
|
||||
"AZURE_DI": AZURE_DI_DOCUMENT_EXTENSIONS,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue