refactor: enhance document upload integration tests for API contract validation

2026-04-25 08:46:22 +02:00 · 2026-02-27 01:24:20 +05:30 · 2026-02-27 01:24:20 +05:30 · 7c09958ddc
commit 7c09958ddc
parent 1068ea25a7
1 changed files with 9 additions and 170 deletions
--- a/surfsense_backend/tests/integration/document_upload/test_document_upload.py
+++ b/surfsense_backend/tests/integration/document_upload/test_document_upload.py
@ -1,8 +1,10 @@
 """
-Integration tests for manual document upload.
+Integration tests for manual document upload - HTTP API layer.

-These tests exercise the full pipeline via the HTTP API:
-  API upload → inline task dispatch → ETL extraction → chunking → embedding → DB storage
+Each test verifies a distinct user-facing behavior through the public HTTP
+endpoints.  Pipeline internals (indexing, chunking, embedding) are covered by
+the ``indexing_pipeline`` test suite; this module focuses on the API contract,
+error handling, auth, and cross-cutting concerns like duplicate detection.

 External boundaries mocked: LLM summarization, text embedding, text chunking,
 Redis heartbeat. Task dispatch is swapped via DI (InlineTaskDispatcher).
@ -21,8 +23,6 @@ import pytest

 from tests.utils.helpers import (
    FIXTURES_DIR,
-    delete_document,
-    get_document,
    poll_document_status,
    upload_file,
    upload_multiple_files,
@ -30,27 +30,14 @@ from tests.utils.helpers import (

 pytestmark = pytest.mark.integration

-# ---------------------------------------------------------------------------
-# Helpers local to this module
-# ---------------------------------------------------------------------------
-
-
-def _assert_document_ready(doc: dict, *, expected_filename: str) -> None:
-    """Common assertions for a successfully processed document."""
-    assert doc["title"] == expected_filename
-    assert doc["document_type"] == "FILE"
-    assert doc["content"], "Document content (summary) should not be empty"
-    assert doc["content_hash"], "content_hash should be set"
-    assert doc["document_metadata"].get("FILE_NAME") == expected_filename
-

 # ---------------------------------------------------------------------------
-# Test A: Upload a .txt file (direct read path)
+# Upload smoke tests (one per distinct code-path: direct-read & ETL)
 # ---------------------------------------------------------------------------


 class TestTxtFileUpload:
-    """Upload a plain-text file and verify the full pipeline."""
+    """Upload a plain-text file (direct-read path) via the HTTP API."""

    async def test_upload_txt_returns_document_id(
        self,
@ -89,83 +76,9 @@ class TestTxtFileUpload:
        for did in doc_ids:
            assert statuses[did]["status"]["state"] == "ready"

-    async def test_txt_document_fields_populated(
-        self,
-        client: httpx.AsyncClient,
-        headers: dict[str, str],
-        search_space_id: int,
-        cleanup_doc_ids: list[int],
-    ):
-        resp = await upload_file(
-            client, headers, "sample.txt", search_space_id=search_space_id
-        )
-        doc_ids = resp.json()["document_ids"]
-        cleanup_doc_ids.extend(doc_ids)
-
-        await poll_document_status(
-            client, headers, doc_ids, search_space_id=search_space_id
-        )
-
-        doc = await get_document(client, headers, doc_ids[0])
-        _assert_document_ready(doc, expected_filename="sample.txt")
-
-
-# ---------------------------------------------------------------------------
-# Test B: Upload a .md file (markdown direct-read path)
-# ---------------------------------------------------------------------------
-
-
-class TestMarkdownFileUpload:
-    """Upload a Markdown file and verify the full pipeline."""
-
-    async def test_md_processing_reaches_ready(
-        self,
-        client: httpx.AsyncClient,
-        headers: dict[str, str],
-        search_space_id: int,
-        cleanup_doc_ids: list[int],
-    ):
-        resp = await upload_file(
-            client, headers, "sample.md", search_space_id=search_space_id
-        )
-        assert resp.status_code == 200
-        doc_ids = resp.json()["document_ids"]
-        cleanup_doc_ids.extend(doc_ids)
-
-        statuses = await poll_document_status(
-            client, headers, doc_ids, search_space_id=search_space_id
-        )
-        for did in doc_ids:
-            assert statuses[did]["status"]["state"] == "ready"
-
-    async def test_md_document_fields_populated(
-        self,
-        client: httpx.AsyncClient,
-        headers: dict[str, str],
-        search_space_id: int,
-        cleanup_doc_ids: list[int],
-    ):
-        resp = await upload_file(
-            client, headers, "sample.md", search_space_id=search_space_id
-        )
-        doc_ids = resp.json()["document_ids"]
-        cleanup_doc_ids.extend(doc_ids)
-
-        await poll_document_status(
-            client, headers, doc_ids, search_space_id=search_space_id
-        )
-
-        doc = await get_document(client, headers, doc_ids[0])
-        _assert_document_ready(doc, expected_filename="sample.md")
-
-
-# ---------------------------------------------------------------------------
-# Test C: Upload a .pdf file (ETL path)
-# ---------------------------------------------------------------------------
-

 class TestPdfFileUpload:
-    """Upload a PDF and verify it goes through the ETL extraction pipeline."""
+    """Upload a PDF (ETL extraction path) via the HTTP API."""

    async def test_pdf_processing_reaches_ready(
        self,
@ -187,26 +100,6 @@ class TestPdfFileUpload:
        for did in doc_ids:
            assert statuses[did]["status"]["state"] == "ready"

-    async def test_pdf_document_fields_populated(
-        self,
-        client: httpx.AsyncClient,
-        headers: dict[str, str],
-        search_space_id: int,
-        cleanup_doc_ids: list[int],
-    ):
-        resp = await upload_file(
-            client, headers, "sample.pdf", search_space_id=search_space_id
-        )
-        doc_ids = resp.json()["document_ids"]
-        cleanup_doc_ids.extend(doc_ids)
-
-        await poll_document_status(
-            client, headers, doc_ids, search_space_id=search_space_id, timeout=300.0
-        )
-
-        doc = await get_document(client, headers, doc_ids[0])
-        _assert_document_ready(doc, expected_filename="sample.pdf")
-

 # ---------------------------------------------------------------------------
 # Test D: Upload multiple files in a single request
@ -214,7 +107,7 @@ class TestPdfFileUpload:


 class TestMultiFileUpload:
-    """Upload several files at once and verify all are processed."""
+    """Upload several files at once and verify the API response contract."""

    async def test_multi_upload_returns_all_ids(
        self,
@ -236,28 +129,6 @@ class TestMultiFileUpload:
        assert len(body["document_ids"]) == 2
        cleanup_doc_ids.extend(body["document_ids"])

-    async def test_multi_upload_all_reach_ready(
-        self,
-        client: httpx.AsyncClient,
-        headers: dict[str, str],
-        search_space_id: int,
-        cleanup_doc_ids: list[int],
-    ):
-        resp = await upload_multiple_files(
-            client,
-            headers,
-            ["sample.txt", "sample.md"],
-            search_space_id=search_space_id,
-        )
-        doc_ids = resp.json()["document_ids"]
-        cleanup_doc_ids.extend(doc_ids)
-
-        statuses = await poll_document_status(
-            client, headers, doc_ids, search_space_id=search_space_id
-        )
-        for did in doc_ids:
-            assert statuses[did]["status"]["state"] == "ready"
-

 # ---------------------------------------------------------------------------
 # Test E: Duplicate file upload (same file uploaded twice)
@ -433,38 +304,6 @@ class TestNoFilesUpload:
        assert resp.status_code in {400, 422}


-# ---------------------------------------------------------------------------
-# Test J: Document deletion after successful upload
-# ---------------------------------------------------------------------------
-
-
-class TestDocumentDeletion:
-    """Upload, wait for ready, delete, then verify it's gone."""
-
-    async def test_delete_processed_document(
-        self,
-        client: httpx.AsyncClient,
-        headers: dict[str, str],
-        search_space_id: int,
-    ):
-        resp = await upload_file(
-            client, headers, "sample.txt", search_space_id=search_space_id
-        )
-        doc_ids = resp.json()["document_ids"]
-        await poll_document_status(
-            client, headers, doc_ids, search_space_id=search_space_id
-        )
-
-        del_resp = await delete_document(client, headers, doc_ids[0])
-        assert del_resp.status_code == 200
-
-        get_resp = await client.get(
-            f"/api/v1/documents/{doc_ids[0]}",
-            headers=headers,
-        )
-        assert get_resp.status_code == 404
-
-
 # ---------------------------------------------------------------------------
 # Test K: Searchability after upload
 # ---------------------------------------------------------------------------