diff --git a/surfsense_backend/tests/integration/document_upload/test_document_upload.py b/surfsense_backend/tests/integration/document_upload/test_document_upload.py index 1a9433a8f..49ba9ce0a 100644 --- a/surfsense_backend/tests/integration/document_upload/test_document_upload.py +++ b/surfsense_backend/tests/integration/document_upload/test_document_upload.py @@ -1,8 +1,10 @@ """ -Integration tests for manual document upload. +Integration tests for manual document upload - HTTP API layer. -These tests exercise the full pipeline via the HTTP API: - API upload → inline task dispatch → ETL extraction → chunking → embedding → DB storage +Each test verifies a distinct user-facing behavior through the public HTTP +endpoints. Pipeline internals (indexing, chunking, embedding) are covered by +the ``indexing_pipeline`` test suite; this module focuses on the API contract, +error handling, auth, and cross-cutting concerns like duplicate detection. External boundaries mocked: LLM summarization, text embedding, text chunking, Redis heartbeat. Task dispatch is swapped via DI (InlineTaskDispatcher). @@ -21,8 +23,6 @@ import pytest from tests.utils.helpers import ( FIXTURES_DIR, - delete_document, - get_document, poll_document_status, upload_file, upload_multiple_files, @@ -30,27 +30,14 @@ from tests.utils.helpers import ( pytestmark = pytest.mark.integration -# --------------------------------------------------------------------------- -# Helpers local to this module -# --------------------------------------------------------------------------- - - -def _assert_document_ready(doc: dict, *, expected_filename: str) -> None: - """Common assertions for a successfully processed document.""" - assert doc["title"] == expected_filename - assert doc["document_type"] == "FILE" - assert doc["content"], "Document content (summary) should not be empty" - assert doc["content_hash"], "content_hash should be set" - assert doc["document_metadata"].get("FILE_NAME") == expected_filename - # --------------------------------------------------------------------------- -# Test A: Upload a .txt file (direct read path) +# Upload smoke tests (one per distinct code-path: direct-read & ETL) # --------------------------------------------------------------------------- class TestTxtFileUpload: - """Upload a plain-text file and verify the full pipeline.""" + """Upload a plain-text file (direct-read path) via the HTTP API.""" async def test_upload_txt_returns_document_id( self, @@ -89,83 +76,9 @@ class TestTxtFileUpload: for did in doc_ids: assert statuses[did]["status"]["state"] == "ready" - async def test_txt_document_fields_populated( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - cleanup_doc_ids: list[int], - ): - resp = await upload_file( - client, headers, "sample.txt", search_space_id=search_space_id - ) - doc_ids = resp.json()["document_ids"] - cleanup_doc_ids.extend(doc_ids) - - await poll_document_status( - client, headers, doc_ids, search_space_id=search_space_id - ) - - doc = await get_document(client, headers, doc_ids[0]) - _assert_document_ready(doc, expected_filename="sample.txt") - - -# --------------------------------------------------------------------------- -# Test B: Upload a .md file (markdown direct-read path) -# --------------------------------------------------------------------------- - - -class TestMarkdownFileUpload: - """Upload a Markdown file and verify the full pipeline.""" - - async def test_md_processing_reaches_ready( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - cleanup_doc_ids: list[int], - ): - resp = await upload_file( - client, headers, "sample.md", search_space_id=search_space_id - ) - assert resp.status_code == 200 - doc_ids = resp.json()["document_ids"] - cleanup_doc_ids.extend(doc_ids) - - statuses = await poll_document_status( - client, headers, doc_ids, search_space_id=search_space_id - ) - for did in doc_ids: - assert statuses[did]["status"]["state"] == "ready" - - async def test_md_document_fields_populated( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - cleanup_doc_ids: list[int], - ): - resp = await upload_file( - client, headers, "sample.md", search_space_id=search_space_id - ) - doc_ids = resp.json()["document_ids"] - cleanup_doc_ids.extend(doc_ids) - - await poll_document_status( - client, headers, doc_ids, search_space_id=search_space_id - ) - - doc = await get_document(client, headers, doc_ids[0]) - _assert_document_ready(doc, expected_filename="sample.md") - - -# --------------------------------------------------------------------------- -# Test C: Upload a .pdf file (ETL path) -# --------------------------------------------------------------------------- - class TestPdfFileUpload: - """Upload a PDF and verify it goes through the ETL extraction pipeline.""" + """Upload a PDF (ETL extraction path) via the HTTP API.""" async def test_pdf_processing_reaches_ready( self, @@ -187,26 +100,6 @@ class TestPdfFileUpload: for did in doc_ids: assert statuses[did]["status"]["state"] == "ready" - async def test_pdf_document_fields_populated( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - cleanup_doc_ids: list[int], - ): - resp = await upload_file( - client, headers, "sample.pdf", search_space_id=search_space_id - ) - doc_ids = resp.json()["document_ids"] - cleanup_doc_ids.extend(doc_ids) - - await poll_document_status( - client, headers, doc_ids, search_space_id=search_space_id, timeout=300.0 - ) - - doc = await get_document(client, headers, doc_ids[0]) - _assert_document_ready(doc, expected_filename="sample.pdf") - # --------------------------------------------------------------------------- # Test D: Upload multiple files in a single request @@ -214,7 +107,7 @@ class TestPdfFileUpload: class TestMultiFileUpload: - """Upload several files at once and verify all are processed.""" + """Upload several files at once and verify the API response contract.""" async def test_multi_upload_returns_all_ids( self, @@ -236,28 +129,6 @@ class TestMultiFileUpload: assert len(body["document_ids"]) == 2 cleanup_doc_ids.extend(body["document_ids"]) - async def test_multi_upload_all_reach_ready( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - cleanup_doc_ids: list[int], - ): - resp = await upload_multiple_files( - client, - headers, - ["sample.txt", "sample.md"], - search_space_id=search_space_id, - ) - doc_ids = resp.json()["document_ids"] - cleanup_doc_ids.extend(doc_ids) - - statuses = await poll_document_status( - client, headers, doc_ids, search_space_id=search_space_id - ) - for did in doc_ids: - assert statuses[did]["status"]["state"] == "ready" - # --------------------------------------------------------------------------- # Test E: Duplicate file upload (same file uploaded twice) @@ -433,38 +304,6 @@ class TestNoFilesUpload: assert resp.status_code in {400, 422} -# --------------------------------------------------------------------------- -# Test J: Document deletion after successful upload -# --------------------------------------------------------------------------- - - -class TestDocumentDeletion: - """Upload, wait for ready, delete, then verify it's gone.""" - - async def test_delete_processed_document( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - ): - resp = await upload_file( - client, headers, "sample.txt", search_space_id=search_space_id - ) - doc_ids = resp.json()["document_ids"] - await poll_document_status( - client, headers, doc_ids, search_space_id=search_space_id - ) - - del_resp = await delete_document(client, headers, doc_ids[0]) - assert del_resp.status_code == 200 - - get_resp = await client.get( - f"/api/v1/documents/{doc_ids[0]}", - headers=headers, - ) - assert get_resp.status_code == 404 - - # --------------------------------------------------------------------------- # Test K: Searchability after upload # ---------------------------------------------------------------------------