mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-27 09:46:25 +02:00
feat: add processing mode support for document uploads and ETL pipeline, improved error handling UX
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions
- Introduced a `ProcessingMode` enum to differentiate between basic and premium processing modes. - Updated `EtlRequest` to include a `processing_mode` field, defaulting to basic. - Enhanced ETL pipeline services to utilize the selected processing mode for Azure Document Intelligence and LlamaCloud parsing. - Modified various routes and services to handle processing mode, affecting document upload and indexing tasks. - Improved error handling and logging to include processing mode details. - Added tests to validate processing mode functionality and its impact on ETL operations.
This commit is contained in:
parent
b659f41bab
commit
656e061f84
104 changed files with 1900 additions and 909 deletions
|
|
@ -739,3 +739,187 @@ async def test_extract_image_falls_back_to_document_without_vision_llm(
|
|||
assert result.markdown_content == "# OCR text from image"
|
||||
assert result.etl_service == "DOCLING"
|
||||
assert result.content_type == "document"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Processing Mode enum tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_processing_mode_coerce_basic():
    """Lowercase, uppercase, None, and unknown strings all coerce to BASIC."""
    from app.etl_pipeline.etl_document import ProcessingMode

    # BASIC is the safe fallback for every non-premium input.
    for raw_value in ("basic", "BASIC", None, "invalid"):
        assert ProcessingMode.coerce(raw_value) == ProcessingMode.BASIC
||||
|
||||
|
||||
def test_processing_mode_coerce_premium():
    """Both lowercase and uppercase spellings coerce to PREMIUM."""
    from app.etl_pipeline.etl_document import ProcessingMode

    for raw_value in ("premium", "PREMIUM"):
        assert ProcessingMode.coerce(raw_value) == ProcessingMode.PREMIUM
||||
|
||||
|
||||
def test_processing_mode_page_multiplier():
    """BASIC counts each page once; PREMIUM applies a 10x page multiplier."""
    from app.etl_pipeline.etl_document import ProcessingMode

    # Table-driven check of the per-mode billing multiplier.
    expected_multipliers = {
        ProcessingMode.BASIC: 1,
        ProcessingMode.PREMIUM: 10,
    }
    for mode, multiplier in expected_multipliers.items():
        assert mode.page_multiplier == multiplier
||||
|
||||
|
||||
def test_etl_request_default_processing_mode():
    """Omitting processing_mode on EtlRequest defaults it to BASIC."""
    from app.etl_pipeline.etl_document import ProcessingMode

    request = EtlRequest(file_path="/tmp/test.pdf", filename="test.pdf")
    assert request.processing_mode == ProcessingMode.BASIC
||||
|
||||
|
||||
def test_etl_request_premium_processing_mode():
    """An explicitly supplied PREMIUM mode is stored as-is on EtlRequest."""
    from app.etl_pipeline.etl_document import ProcessingMode

    request = EtlRequest(
        file_path="/tmp/test.pdf",
        filename="test.pdf",
        processing_mode=ProcessingMode.PREMIUM,
    )
    assert request.processing_mode == ProcessingMode.PREMIUM
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Azure DI model selection by processing mode
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_azure_di_basic_uses_prebuilt_read(tmp_path, mocker):
    """Basic mode should use prebuilt-read model for Azure DI."""
    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)

    # Configure both services. NOTE(review): ETL_SERVICE stays "LLAMACLOUD"
    # even though this is an Azure DI test — presumably Azure DI takes
    # precedence once its endpoint/key are set; confirm against the pipeline.
    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
    mocker.patch(
        "app.config.config.AZURE_DI_ENDPOINT",
        "https://fake.cognitiveservices.azure.com/",
        create=True,
    )
    mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True)

    fake_client = _mock_azure_di(mocker, "# Azure basic")
    _mock_llamacloud(mocker)

    from app.etl_pipeline.etl_document import ProcessingMode

    request = EtlRequest(
        file_path=str(pdf_file),
        filename="report.pdf",
        processing_mode=ProcessingMode.BASIC,
    )
    result = await EtlPipelineService().extract(request)

    assert result.markdown_content == "# Azure basic"
    # First positional argument to begin_analyze_document is the model id.
    model_id = fake_client.begin_analyze_document.call_args[0][0]
    assert model_id == "prebuilt-read"
||||
|
||||
|
||||
async def test_azure_di_premium_uses_prebuilt_layout(tmp_path, mocker):
    """Premium mode should use prebuilt-layout model for Azure DI."""
    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)

    # Configure both services. NOTE(review): ETL_SERVICE stays "LLAMACLOUD"
    # even though this is an Azure DI test — presumably Azure DI takes
    # precedence once its endpoint/key are set; confirm against the pipeline.
    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
    mocker.patch(
        "app.config.config.AZURE_DI_ENDPOINT",
        "https://fake.cognitiveservices.azure.com/",
        create=True,
    )
    mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True)

    fake_client = _mock_azure_di(mocker, "# Azure premium")
    _mock_llamacloud(mocker)

    from app.etl_pipeline.etl_document import ProcessingMode

    request = EtlRequest(
        file_path=str(pdf_file),
        filename="report.pdf",
        processing_mode=ProcessingMode.PREMIUM,
    )
    result = await EtlPipelineService().extract(request)

    assert result.markdown_content == "# Azure premium"
    # First positional argument to begin_analyze_document is the model id.
    model_id = fake_client.begin_analyze_document.call_args[0][0]
    assert model_id == "prebuilt-layout"
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LlamaCloud tier selection by processing mode
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_llamacloud_basic_uses_cost_effective_tier(tmp_path, mocker):
    """Basic mode should use cost_effective tier for LlamaCloud."""
    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)

    # Azure DI is explicitly unset so the pipeline must route to LlamaCloud.
    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
    mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
    mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)

    fake_parser = _mock_llamacloud(mocker, "# Llama basic")
    llama_parse_cls = mocker.patch(
        "llama_cloud_services.LlamaParse", return_value=fake_parser
    )

    from app.etl_pipeline.etl_document import ProcessingMode

    request = EtlRequest(
        file_path=str(pdf_file),
        filename="report.pdf",
        estimated_pages=5,
        processing_mode=ProcessingMode.BASIC,
    )
    result = await EtlPipelineService().extract(request)

    assert result.markdown_content == "# Llama basic"
    # The tier is passed to the LlamaParse constructor as a keyword argument.
    assert llama_parse_cls.call_args[1]["tier"] == "cost_effective"
||||
|
||||
|
||||
async def test_llamacloud_premium_uses_agentic_plus_tier(tmp_path, mocker):
    """Premium mode should use agentic_plus tier for LlamaCloud."""
    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)

    # Azure DI is explicitly unset so the pipeline must route to LlamaCloud.
    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
    mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
    mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)

    fake_parser = _mock_llamacloud(mocker, "# Llama premium")
    llama_parse_cls = mocker.patch(
        "llama_cloud_services.LlamaParse", return_value=fake_parser
    )

    from app.etl_pipeline.etl_document import ProcessingMode

    request = EtlRequest(
        file_path=str(pdf_file),
        filename="report.pdf",
        estimated_pages=5,
        processing_mode=ProcessingMode.PREMIUM,
    )
    result = await EtlPipelineService().extract(request)

    assert result.markdown_content == "# Llama premium"
    # The tier is passed to the LlamaParse constructor as a keyword argument.
    assert llama_parse_cls.call_args[1]["tier"] == "agentic_plus"
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue