refactor: make Azure Document Intelligence an internal LLAMACLOUD accelerator instead of a standalone ETL service

This commit is contained in:
Anish Sarkar 2026-04-08 03:26:24 +05:30
parent 1fa8d1220b
commit 20fa93f0ba
9 changed files with 200 additions and 85 deletions

View file

@ -250,21 +250,17 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
# ---------------------------------------------------------------------------
# Slice 9b - AZURE_DI document parsing
# Slice 9b - LLAMACLOUD + Azure DI accelerator
# ---------------------------------------------------------------------------
async def test_extract_pdf_with_azure_di(tmp_path, mocker):
"""A .pdf file with ETL_SERVICE=AZURE_DI returns parsed markdown."""
pdf_file = tmp_path / "report.pdf"
pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)
mocker.patch("app.config.config.ETL_SERVICE", "AZURE_DI")
mocker.patch("app.config.config.AZURE_DI_ENDPOINT", "https://fake.cognitiveservices.azure.com/", create=True)
mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True)
def _mock_azure_di(mocker, content="# Azure DI parsed"):
"""Wire up Azure DI mocks and return the fake client for assertions."""
class FakeResult:
content = "# Azure DI parsed"
pass
FakeResult.content = content
fake_poller = mocker.AsyncMock()
fake_poller.result.return_value = FakeResult()
@ -286,72 +282,160 @@ async def test_extract_pdf_with_azure_di(tmp_path, mocker):
"azure.core.credentials.AzureKeyCredential",
return_value=mocker.MagicMock(),
)
return fake_client
def _mock_llamacloud(mocker, content="# LlamaCloud parsed"):
"""Wire up LlamaCloud mocks and return the fake parser for assertions."""
class FakeDoc:
pass
FakeDoc.text = content
class FakeJobResult:
pages = []
def get_markdown_documents(self, split_by_page=True):
return [FakeDoc()]
fake_parser = mocker.AsyncMock()
fake_parser.aparse.return_value = FakeJobResult()
mocker.patch(
"llama_cloud_services.LlamaParse",
return_value=fake_parser,
)
mocker.patch(
"llama_cloud_services.parse.utils.ResultType",
mocker.MagicMock(MD="md"),
)
return fake_parser
async def test_llamacloud_with_azure_di_uses_azure_for_pdf(tmp_path, mocker):
    """When Azure DI is configured, a supported extension (.pdf) is parsed by Azure DI."""
    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)

    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
    mocker.patch("app.config.config.AZURE_DI_ENDPOINT", "https://fake.cognitiveservices.azure.com/", create=True)
    mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True)
    fake_client = _mock_azure_di(mocker, "# Azure DI parsed")
    fake_parser = _mock_llamacloud(mocker)

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
    )

    # The accelerator handled the file, but the reported service stays
    # LLAMACLOUD — Azure DI is an internal detail now.
    assert result.markdown_content == "# Azure DI parsed"
    assert result.etl_service == "LLAMACLOUD"
    assert result.content_type == "document"
    fake_client.begin_analyze_document.assert_called_once()
    fake_parser.aparse.assert_not_called()
async def test_llamacloud_azure_di_fallback_on_failure(tmp_path, mocker):
    """When Azure DI fails, LlamaCloud is used as a fallback."""
    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)

    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
    mocker.patch("app.config.config.AZURE_DI_ENDPOINT", "https://fake.cognitiveservices.azure.com/", create=True)
    mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True)
    # Simulate an Azure DI outage: the parser entry point raises.
    mocker.patch(
        "app.etl_pipeline.parsers.azure_doc_intelligence.parse_with_azure_doc_intelligence",
        side_effect=RuntimeError("Azure DI unavailable"),
    )
    fake_parser = _mock_llamacloud(mocker, "# LlamaCloud fallback")

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(pdf_file), filename="report.pdf", estimated_pages=5)
    )

    assert result.markdown_content == "# LlamaCloud fallback"
    assert result.etl_service == "LLAMACLOUD"
    assert result.content_type == "document"
    fake_parser.aparse.assert_called_once()
async def test_llamacloud_skips_azure_di_for_unsupported_ext(tmp_path, mocker):
    """Azure DI is skipped for extensions it doesn't support (e.g. .epub)."""
    epub_file = tmp_path / "book.epub"
    epub_file.write_bytes(b"\x00" * 10)

    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
    mocker.patch("app.config.config.AZURE_DI_ENDPOINT", "https://fake.cognitiveservices.azure.com/", create=True)
    mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True)
    fake_client = _mock_azure_di(mocker)
    fake_parser = _mock_llamacloud(mocker, "# Epub from LlamaCloud")

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(epub_file), filename="book.epub", estimated_pages=50)
    )

    # .epub is outside Azure DI's supported set, so the pipeline must go
    # straight to LlamaCloud even though Azure DI is configured.
    assert result.markdown_content == "# Epub from LlamaCloud"
    assert result.etl_service == "LLAMACLOUD"
    fake_client.begin_analyze_document.assert_not_called()
    fake_parser.aparse.assert_called_once()
async def test_llamacloud_without_azure_di_uses_llamacloud_directly(tmp_path, mocker):
    """When Azure DI is not configured, LlamaCloud handles all file types directly."""
    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)

    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
    # No Azure DI credentials: the accelerator must not be attempted.
    mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
    mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)
    fake_parser = _mock_llamacloud(mocker, "# Direct LlamaCloud")

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(pdf_file), filename="report.pdf", estimated_pages=5)
    )

    assert result.markdown_content == "# Direct LlamaCloud"
    assert result.etl_service == "LLAMACLOUD"
    assert result.content_type == "document"
    fake_parser.aparse.assert_called_once()
async def test_llamacloud_heif_accepted_only_with_azure_di(tmp_path, mocker):
    """.heif is accepted by LLAMACLOUD only when Azure DI credentials are set."""
    from app.etl_pipeline.exceptions import EtlUnsupportedFileError

    heif_file = tmp_path / "photo.heif"
    heif_file.write_bytes(b"\x00" * 100)

    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
    mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
    mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)

    # Without the Azure DI accelerator, .heif is not a LLAMACLOUD type.
    with pytest.raises(EtlUnsupportedFileError, match="not supported by LLAMACLOUD"):
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(heif_file), filename="photo.heif")
        )

    # With credentials present, the same file is accepted and routed to
    # Azure DI, while the reported service remains LLAMACLOUD.
    mocker.patch("app.config.config.AZURE_DI_ENDPOINT", "https://fake.cognitiveservices.azure.com/")
    mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key")
    fake_client = _mock_azure_di(mocker, "# HEIF from Azure DI")

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(heif_file), filename="photo.heif")
    )

    assert result.markdown_content == "# HEIF from Azure DI"
    assert result.etl_service == "LLAMACLOUD"
    fake_client.begin_analyze_document.assert_called_once()
# ---------------------------------------------------------------------------
# Slice 10 - unknown extension falls through to document ETL
@ -520,13 +604,9 @@ async def test_extract_zip_raises_unsupported_error(tmp_path):
("file.svg", "DOCLING", True),
("file.p7s", "UNSTRUCTURED", False),
("file.p7s", "LLAMACLOUD", True),
("file.pdf", "AZURE_DI", False),
("file.docx", "AZURE_DI", False),
("file.heif", "AZURE_DI", False),
("file.epub", "AZURE_DI", True),
("file.doc", "AZURE_DI", True),
("file.rtf", "AZURE_DI", True),
("file.svg", "AZURE_DI", True),
("file.heif", "LLAMACLOUD", True),
("file.heif", "DOCLING", True),
("file.heif", "UNSTRUCTURED", True),
],
)
def test_should_skip_for_service(filename, etl_service, expected_skip):