diff --git a/docker/.env.example b/docker/.env.example index 816dbff12..1a3869773 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -20,7 +20,7 @@ AUTH_TYPE=LOCAL # Allow new user registrations (TRUE or FALSE) # REGISTRATION_ENABLED=TRUE -# Document parsing service: DOCLING, UNSTRUCTURED, LLAMACLOUD, or AZURE_DI +# Document parsing service: DOCLING, UNSTRUCTURED, or LLAMACLOUD ETL_SERVICE=DOCLING # Embedding model for vector search @@ -282,8 +282,7 @@ STT_SERVICE=local/base # LlamaCloud (if ETL_SERVICE=LLAMACLOUD) # LLAMA_CLOUD_API_KEY= - -# Azure Document Intelligence (if ETL_SERVICE=AZURE_DI) +# Optional: Azure Document Intelligence accelerator (used with LLAMACLOUD) # AZURE_DI_ENDPOINT=https://your-resource.cognitiveservices.azure.com/ # AZURE_DI_KEY= diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 83c527cae..8c8587cea 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -190,11 +190,12 @@ PAGES_LIMIT=500 FIRECRAWL_API_KEY=fcr-01J0000000000000000000000 # File Parser Service -ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING or AZURE_DI +ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING UNSTRUCTURED_API_KEY=Tpu3P0U8iy LLAMA_CLOUD_API_KEY=llx-nnn -AZURE_DI_ENDPOINT=https://your-resource.cognitiveservices.azure.com/ -AZURE_DI_KEY=your-key +# Optional: Azure Document Intelligence accelerator (used when ETL_SERVICE=LLAMACLOUD) +# AZURE_DI_ENDPOINT=https://your-resource.cognitiveservices.azure.com/ +# AZURE_DI_KEY=your-key # OPTIONAL: Add these for LangSmith Observability LANGSMITH_TRACING=true diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 912f4618f..0011ea289 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -394,10 +394,8 @@ class Config: UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY") elif ETL_SERVICE == "LLAMACLOUD": - # LlamaCloud API Key LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") - - elif ETL_SERVICE == "AZURE_DI": + # Optional: Azure Document Intelligence accelerator for supported file types AZURE_DI_ENDPOINT = os.getenv("AZURE_DI_ENDPOINT") AZURE_DI_KEY = os.getenv("AZURE_DI_KEY") diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index 5f74268ed..167e43765 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -1,3 +1,5 @@ +import logging + from app.config import config as app_config from app.etl_pipeline.etl_document import EtlRequest, EtlResult from app.etl_pipeline.exceptions import ( @@ -56,7 +58,7 @@ class EtlPipelineService: if not etl_service: raise EtlServiceUnavailableError( "No ETL_SERVICE configured. " - "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, DOCLING, or AZURE_DI in your .env" + "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env" ) ext = PurePosixPath(request.filename).suffix.lower() @@ -75,17 +77,7 @@ class EtlPipelineService: content = await parse_with_unstructured(request.file_path) elif etl_service == "LLAMACLOUD": - from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud - - content = await parse_with_llamacloud( - request.file_path, request.estimated_pages - ) - elif etl_service == "AZURE_DI": - from app.etl_pipeline.parsers.azure_doc_intelligence import ( - parse_with_azure_doc_intelligence, - ) - - content = await parse_with_azure_doc_intelligence(request.file_path) + content = await self._extract_with_llamacloud(request) else: raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}") @@ -94,3 +86,42 @@ class EtlPipelineService: etl_service=etl_service, content_type="document", ) + + async def _extract_with_llamacloud(self, request: EtlRequest) -> str: + """Try Azure Document Intelligence first (when configured) then LlamaCloud. + + Azure DI is an internal accelerator: cheaper and faster for its supported + file types. If it is not configured, or the file extension is not in + Azure DI's supported set, LlamaCloud is used directly. If Azure DI + fails for any reason, LlamaCloud is used as a fallback. + """ + from pathlib import PurePosixPath + + from app.utils.file_extensions import AZURE_DI_DOCUMENT_EXTENSIONS + + ext = PurePosixPath(request.filename).suffix.lower() + azure_configured = bool( + getattr(app_config, "AZURE_DI_ENDPOINT", None) + and getattr(app_config, "AZURE_DI_KEY", None) + ) + + if azure_configured and ext in AZURE_DI_DOCUMENT_EXTENSIONS: + try: + from app.etl_pipeline.parsers.azure_doc_intelligence import ( + parse_with_azure_doc_intelligence, + ) + + return await parse_with_azure_doc_intelligence(request.file_path) + except Exception: + logging.warning( + "Azure Document Intelligence failed for %s, " + "falling back to LlamaCloud", + request.filename, + exc_info=True, + ) + + from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud + + return await parse_with_llamacloud( + request.file_path, request.estimated_pages + ) diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py index 6be43f8a0..16ac585b7 100644 --- a/surfsense_backend/app/utils/file_extensions.py +++ b/surfsense_backend/app/utils/file_extensions.py @@ -124,16 +124,27 @@ _SERVICE_MAP: dict[str, frozenset[str]] = { "DOCLING": DOCLING_DOCUMENT_EXTENSIONS, "LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS, "UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS, - "AZURE_DI": AZURE_DI_DOCUMENT_EXTENSIONS, } def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]: """Return the document extensions supported by *etl_service*. + When *etl_service* is ``LLAMACLOUD`` and Azure Document Intelligence + credentials are configured, the set is dynamically expanded to include + Azure DI's supported extensions (e.g. ``.heif``). + Falls back to the full union when the service is ``None`` or unknown. """ - return _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS) + extensions = _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS) + if etl_service == "LLAMACLOUD": + from app.config import config as app_config + + if getattr(app_config, "AZURE_DI_ENDPOINT", None) and getattr( + app_config, "AZURE_DI_KEY", None + ): + extensions = extensions | AZURE_DI_DOCUMENT_EXTENSIONS + return extensions def is_supported_document_extension(filename: str) -> bool: diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py index ed0d315b1..2ca0cb1a3 100644 --- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py +++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py @@ -250,21 +250,17 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker): # --------------------------------------------------------------------------- -# Slice 9b - AZURE_DI document parsing +# Slice 9b - LLAMACLOUD + Azure DI accelerator # --------------------------------------------------------------------------- -async def test_extract_pdf_with_azure_di(tmp_path, mocker): - """A .pdf file with ETL_SERVICE=AZURE_DI returns parsed markdown.""" - pdf_file = tmp_path / "report.pdf" - pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10) - - mocker.patch("app.config.config.ETL_SERVICE", "AZURE_DI") - mocker.patch("app.config.config.AZURE_DI_ENDPOINT", "https://fake.cognitiveservices.azure.com/", create=True) - mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True) +def _mock_azure_di(mocker, content="# Azure DI parsed"): + """Wire up Azure DI mocks and return the fake client for assertions.""" class FakeResult: - content = "# Azure DI parsed" + pass + + FakeResult.content = content fake_poller = mocker.AsyncMock() fake_poller.result.return_value = FakeResult() @@ -286,72 +282,160 @@ async def test_extract_pdf_with_azure_di(tmp_path, mocker): "azure.core.credentials.AzureKeyCredential", return_value=mocker.MagicMock(), ) + return fake_client + + +def _mock_llamacloud(mocker, content="# LlamaCloud parsed"): + """Wire up LlamaCloud mocks and return the fake parser for assertions.""" + + class FakeDoc: + pass + + FakeDoc.text = content + + class FakeJobResult: + pages = [] + + def get_markdown_documents(self, split_by_page=True): + return [FakeDoc()] + + fake_parser = mocker.AsyncMock() + fake_parser.aparse.return_value = FakeJobResult() + mocker.patch( + "llama_cloud_services.LlamaParse", + return_value=fake_parser, + ) + mocker.patch( + "llama_cloud_services.parse.utils.ResultType", + mocker.MagicMock(MD="md"), + ) + return fake_parser + + +async def test_llamacloud_with_azure_di_uses_azure_for_pdf(tmp_path, mocker): + """When Azure DI is configured, a supported extension (.pdf) is parsed by Azure DI.""" + pdf_file = tmp_path / "report.pdf" + pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10) + + mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD") + mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True) + mocker.patch("app.config.config.AZURE_DI_ENDPOINT", "https://fake.cognitiveservices.azure.com/", create=True) + mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True) + + fake_client = _mock_azure_di(mocker, "# Azure DI parsed") + fake_parser = _mock_llamacloud(mocker) result = await EtlPipelineService().extract( EtlRequest(file_path=str(pdf_file), filename="report.pdf") ) assert result.markdown_content == "# Azure DI parsed" - assert result.etl_service == "AZURE_DI" + assert result.etl_service == "LLAMACLOUD" assert result.content_type == "document" + fake_client.begin_analyze_document.assert_called_once() + fake_parser.aparse.assert_not_called() -async def test_extract_docx_with_azure_di(tmp_path, mocker): - """A .docx file with ETL_SERVICE=AZURE_DI routes correctly.""" - docx_file = tmp_path / "doc.docx" - docx_file.write_bytes(b"PK fake docx") +async def test_llamacloud_azure_di_fallback_on_failure(tmp_path, mocker): + """When Azure DI fails, LlamaCloud is used as a fallback.""" + pdf_file = tmp_path / "report.pdf" + pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10) - mocker.patch("app.config.config.ETL_SERVICE", "AZURE_DI") + mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD") + mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True) mocker.patch("app.config.config.AZURE_DI_ENDPOINT", "https://fake.cognitiveservices.azure.com/", create=True) mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True) - class FakeResult: - content = "Docx content from Azure" - - fake_poller = mocker.AsyncMock() - fake_poller.result.return_value = FakeResult() - - fake_client = mocker.AsyncMock() - fake_client.begin_analyze_document.return_value = fake_poller - fake_client.__aenter__ = mocker.AsyncMock(return_value=fake_client) - fake_client.__aexit__ = mocker.AsyncMock(return_value=False) - mocker.patch( - "azure.ai.documentintelligence.aio.DocumentIntelligenceClient", - return_value=fake_client, - ) - mocker.patch( - "azure.ai.documentintelligence.models.DocumentContentFormat", - mocker.MagicMock(MARKDOWN="markdown"), - ) - mocker.patch( - "azure.core.credentials.AzureKeyCredential", - return_value=mocker.MagicMock(), + "app.etl_pipeline.parsers.azure_doc_intelligence.parse_with_azure_doc_intelligence", + side_effect=RuntimeError("Azure DI unavailable"), ) + fake_parser = _mock_llamacloud(mocker, "# LlamaCloud fallback") result = await EtlPipelineService().extract( - EtlRequest(file_path=str(docx_file), filename="doc.docx") + EtlRequest(file_path=str(pdf_file), filename="report.pdf", estimated_pages=5) ) - assert result.markdown_content == "Docx content from Azure" - assert result.etl_service == "AZURE_DI" + assert result.markdown_content == "# LlamaCloud fallback" + assert result.etl_service == "LLAMACLOUD" assert result.content_type == "document" + fake_parser.aparse.assert_called_once() -async def test_extract_unsupported_ext_with_azure_di_raises(tmp_path, mocker): - """AZURE_DI rejects extensions it doesn't support (e.g. .epub).""" - from app.etl_pipeline.exceptions import EtlUnsupportedFileError - - mocker.patch("app.config.config.ETL_SERVICE", "AZURE_DI") - +async def test_llamacloud_skips_azure_di_for_unsupported_ext(tmp_path, mocker): + """Azure DI is skipped for extensions it doesn't support (e.g. .epub).""" epub_file = tmp_path / "book.epub" epub_file.write_bytes(b"\x00" * 10) - with pytest.raises(EtlUnsupportedFileError, match="not supported by AZURE_DI"): + mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD") + mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True) + mocker.patch("app.config.config.AZURE_DI_ENDPOINT", "https://fake.cognitiveservices.azure.com/", create=True) + mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True) + + fake_client = _mock_azure_di(mocker) + fake_parser = _mock_llamacloud(mocker, "# Epub from LlamaCloud") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(epub_file), filename="book.epub", estimated_pages=50) + ) + + assert result.markdown_content == "# Epub from LlamaCloud" + assert result.etl_service == "LLAMACLOUD" + fake_client.begin_analyze_document.assert_not_called() + fake_parser.aparse.assert_called_once() + + +async def test_llamacloud_without_azure_di_uses_llamacloud_directly(tmp_path, mocker): + """When Azure DI is not configured, LlamaCloud handles all file types directly.""" + pdf_file = tmp_path / "report.pdf" + pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10) + + mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD") + mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True) + mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True) + mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True) + + fake_parser = _mock_llamacloud(mocker, "# Direct LlamaCloud") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(pdf_file), filename="report.pdf", estimated_pages=5) + ) + + assert result.markdown_content == "# Direct LlamaCloud" + assert result.etl_service == "LLAMACLOUD" + assert result.content_type == "document" + fake_parser.aparse.assert_called_once() + + +async def test_llamacloud_heif_accepted_only_with_azure_di(tmp_path, mocker): + """.heif is accepted by LLAMACLOUD only when Azure DI credentials are set.""" + from app.etl_pipeline.exceptions import EtlUnsupportedFileError + + heif_file = tmp_path / "photo.heif" + heif_file.write_bytes(b"\x00" * 100) + + mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD") + mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True) + mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True) + mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True) + + with pytest.raises(EtlUnsupportedFileError, match="not supported by LLAMACLOUD"): await EtlPipelineService().extract( - EtlRequest(file_path=str(epub_file), filename="book.epub") + EtlRequest(file_path=str(heif_file), filename="photo.heif") ) + mocker.patch("app.config.config.AZURE_DI_ENDPOINT", "https://fake.cognitiveservices.azure.com/") + mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key") + + fake_client = _mock_azure_di(mocker, "# HEIF from Azure DI") + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(heif_file), filename="photo.heif") + ) + + assert result.markdown_content == "# HEIF from Azure DI" + assert result.etl_service == "LLAMACLOUD" + fake_client.begin_analyze_document.assert_called_once() + # --------------------------------------------------------------------------- # Slice 10 - unknown extension falls through to document ETL @@ -520,13 +604,9 @@ async def test_extract_zip_raises_unsupported_error(tmp_path): ("file.svg", "DOCLING", True), ("file.p7s", "UNSTRUCTURED", False), ("file.p7s", "LLAMACLOUD", True), - ("file.pdf", "AZURE_DI", False), - ("file.docx", "AZURE_DI", False), - ("file.heif", "AZURE_DI", False), - ("file.epub", "AZURE_DI", True), - ("file.doc", "AZURE_DI", True), - ("file.rtf", "AZURE_DI", True), - ("file.svg", "AZURE_DI", True), + ("file.heif", "LLAMACLOUD", True), + ("file.heif", "DOCLING", True), + ("file.heif", "UNSTRUCTURED", True), ], ) def test_should_skip_for_service(filename, etl_service, expected_skip): diff --git a/surfsense_web/.env.example b/surfsense_web/.env.example index 0e97607f1..b674d8e9b 100644 --- a/surfsense_web/.env.example +++ b/surfsense_web/.env.example @@ -1,6 +1,6 @@ NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000 NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE -NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING or AZURE_DI +NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING NEXT_PUBLIC_ZERO_CACHE_URL=http://localhost:4848 # Contact Form Vars (optional) diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 579944bfb..36a24e299 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -96,11 +96,6 @@ const FILE_TYPE_CONFIG: Record> = { "image/tiff": [".tiff", ".tif"], ...audioFileTypes, }, - AZURE_DI: { - ...commonTypes, - "image/heic": [".heic"], - ...audioFileTypes, - }, default: { ...commonTypes, "application/msword": [".doc"], diff --git a/surfsense_web/lib/env-config.ts b/surfsense_web/lib/env-config.ts index 8dd49c250..80db395c6 100644 --- a/surfsense_web/lib/env-config.ts +++ b/surfsense_web/lib/env-config.ts @@ -19,7 +19,7 @@ export const AUTH_TYPE = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE || "G // Placeholder: __NEXT_PUBLIC_FASTAPI_BACKEND_URL__ export const BACKEND_URL = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000"; -// ETL Service: "DOCLING", "UNSTRUCTURED", "LLAMACLOUD", or "AZURE_DI" +// ETL Service: "DOCLING", "UNSTRUCTURED", or "LLAMACLOUD" // Placeholder: __NEXT_PUBLIC_ETL_SERVICE__ export const ETL_SERVICE = process.env.NEXT_PUBLIC_ETL_SERVICE || "DOCLING";