From 41eb68663a1ae85cf2da42526646c9e55c40e569 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 25 Feb 2026 16:39:45 +0530 Subject: [PATCH] feat: Add end-to-end tests for document upload pipeline and shared test utilities - Introduced new test files for end-to-end testing of document uploads, including support for .txt, .md, and .pdf formats. - Created shared fixtures and helper functions for authentication, document management, and cleanup. - Added sample documents for testing purposes. - Established a conftest.py file to provide reusable fixtures across test modules. --- surfsense_backend/tests/__init__.py | 0 surfsense_backend/tests/conftest.py | 73 +++ surfsense_backend/tests/e2e/__init__.py | 0 .../tests/e2e/test_document_upload.py | 487 ++++++++++++++++++ surfsense_backend/tests/fixtures/empty.pdf | Bin surfsense_backend/tests/fixtures/sample.md | 51 ++ surfsense_backend/tests/fixtures/sample.pdf | Bin 0 -> 18921 bytes surfsense_backend/tests/fixtures/sample.txt | 34 ++ surfsense_backend/tests/utils/__init__.py | 0 surfsense_backend/tests/utils/helpers.py | 157 ++++++ 10 files changed, 802 insertions(+) create mode 100644 surfsense_backend/tests/__init__.py create mode 100644 surfsense_backend/tests/conftest.py create mode 100644 surfsense_backend/tests/e2e/__init__.py create mode 100644 surfsense_backend/tests/e2e/test_document_upload.py create mode 100644 surfsense_backend/tests/fixtures/empty.pdf create mode 100644 surfsense_backend/tests/fixtures/sample.md create mode 100644 surfsense_backend/tests/fixtures/sample.pdf create mode 100644 surfsense_backend/tests/fixtures/sample.txt create mode 100644 surfsense_backend/tests/utils/__init__.py create mode 100644 surfsense_backend/tests/utils/helpers.py diff --git a/surfsense_backend/tests/__init__.py b/surfsense_backend/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/surfsense_backend/tests/conftest.py 
b/surfsense_backend/tests/conftest.py new file mode 100644 index 000000000..0eb36016f --- /dev/null +++ b/surfsense_backend/tests/conftest.py @@ -0,0 +1,73 @@ +"""Root conftest — shared fixtures available to all test modules.""" + +from __future__ import annotations + +import contextlib +from collections.abc import AsyncGenerator + +import httpx +import pytest + +from tests.utils.helpers import ( + BACKEND_URL, + TEST_SEARCH_SPACE_ID, + auth_headers, + delete_document, + get_auth_token, +) + + +@pytest.fixture(scope="session") +def backend_url() -> str: + return BACKEND_URL + + +@pytest.fixture(scope="session") +def search_space_id() -> int: + return TEST_SEARCH_SPACE_ID + + +@pytest.fixture(scope="session") +async def auth_token(backend_url: str) -> str: + """Authenticate once per session and return the JWT token.""" + async with httpx.AsyncClient( + base_url=backend_url, timeout=30.0 + ) as client: + return await get_auth_token(client) + + +@pytest.fixture(scope="session") +def headers(auth_token: str) -> dict[str, str]: + """Authorization headers reused across all tests in the session.""" + return auth_headers(auth_token) + + +@pytest.fixture +async def client(backend_url: str) -> AsyncGenerator[httpx.AsyncClient]: + """Per-test async HTTP client pointing at the running backend.""" + async with httpx.AsyncClient( + base_url=backend_url, timeout=180.0 + ) as c: + yield c + + +@pytest.fixture +def cleanup_doc_ids() -> list[int]: + """Accumulator for document IDs that should be deleted after the test.""" + return [] + + +@pytest.fixture(autouse=True) +async def _cleanup_documents( + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], +): + """ + Runs after every test. Deletes any document IDs that were appended to + the ``cleanup_doc_ids`` list during the test body. 
+ """ + yield + for doc_id in cleanup_doc_ids: + with contextlib.suppress(Exception): + await delete_document(client, headers, doc_id) diff --git a/surfsense_backend/tests/e2e/__init__.py b/surfsense_backend/tests/e2e/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/surfsense_backend/tests/e2e/test_document_upload.py b/surfsense_backend/tests/e2e/test_document_upload.py new file mode 100644 index 000000000..d4540f0b6 --- /dev/null +++ b/surfsense_backend/tests/e2e/test_document_upload.py @@ -0,0 +1,487 @@ +""" +End-to-end tests for manual document upload. + +These tests exercise the full pipeline: + API upload → Celery task → ETL extraction → chunking → embedding → DB storage + +Prerequisites (must be running): + - FastAPI backend + - PostgreSQL + pgvector + - Redis + - Celery worker +""" + +from __future__ import annotations + +import shutil +from pathlib import Path + +import httpx + +from tests.utils.helpers import ( + FIXTURES_DIR, + TEST_SEARCH_SPACE_ID, + delete_document, + get_document, + poll_document_status, + upload_file, + upload_multiple_files, +) + +# --------------------------------------------------------------------------- +# Helpers local to this module +# --------------------------------------------------------------------------- + +def _assert_document_ready(doc: dict, *, expected_filename: str) -> None: + """Common assertions for a successfully processed document.""" + assert doc["title"] == expected_filename + assert doc["document_type"] == "FILE" + assert doc["content"], "Document content (summary) should not be empty" + assert doc["content_hash"], "content_hash should be set" + assert doc["document_metadata"].get("FILE_NAME") == expected_filename + assert doc["document_metadata"].get("ETL_SERVICE"), "ETL_SERVICE should be set" + if doc.get("status"): + assert doc["status"]["state"] == "ready" + + +# --------------------------------------------------------------------------- +# Test A: Upload a .txt file (direct read 
path — no ETL service needed) +# --------------------------------------------------------------------------- + + +class TestTxtFileUpload: + """Upload a plain-text file and verify the full pipeline.""" + + async def test_upload_txt_returns_document_id( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + resp = await upload_file(client, headers, "sample.txt") + assert resp.status_code == 200 + + body = resp.json() + assert body["pending_files"] >= 1 + assert len(body["document_ids"]) >= 1 + cleanup_doc_ids.extend(body["document_ids"]) + + async def test_txt_processing_reaches_ready( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + resp = await upload_file(client, headers, "sample.txt") + assert resp.status_code == 200 + doc_ids = resp.json()["document_ids"] + cleanup_doc_ids.extend(doc_ids) + + statuses = await poll_document_status(client, headers, doc_ids) + for did in doc_ids: + assert statuses[did]["status"]["state"] == "ready" + + async def test_txt_document_fields_populated( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + resp = await upload_file(client, headers, "sample.txt") + doc_ids = resp.json()["document_ids"] + cleanup_doc_ids.extend(doc_ids) + + await poll_document_status(client, headers, doc_ids) + + doc = await get_document(client, headers, doc_ids[0]) + _assert_document_ready(doc, expected_filename="sample.txt") + assert doc["document_metadata"]["ETL_SERVICE"] == "MARKDOWN" + + +# --------------------------------------------------------------------------- +# Test B: Upload a .md file (markdown direct-read path) +# --------------------------------------------------------------------------- + + +class TestMarkdownFileUpload: + """Upload a Markdown file and verify the full pipeline.""" + + async def test_md_processing_reaches_ready( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + 
cleanup_doc_ids: list[int], + ): + resp = await upload_file(client, headers, "sample.md") + assert resp.status_code == 200 + doc_ids = resp.json()["document_ids"] + cleanup_doc_ids.extend(doc_ids) + + statuses = await poll_document_status(client, headers, doc_ids) + for did in doc_ids: + assert statuses[did]["status"]["state"] == "ready" + + async def test_md_document_fields_populated( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + resp = await upload_file(client, headers, "sample.md") + doc_ids = resp.json()["document_ids"] + cleanup_doc_ids.extend(doc_ids) + + await poll_document_status(client, headers, doc_ids) + + doc = await get_document(client, headers, doc_ids[0]) + _assert_document_ready(doc, expected_filename="sample.md") + assert doc["document_metadata"]["ETL_SERVICE"] == "MARKDOWN" + + +# --------------------------------------------------------------------------- +# Test C: Upload a .pdf file (ETL path — Docling / Unstructured) +# --------------------------------------------------------------------------- + + +class TestPdfFileUpload: + """Upload a PDF and verify it goes through the ETL extraction pipeline.""" + + async def test_pdf_processing_reaches_ready( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + resp = await upload_file(client, headers, "sample.pdf") + assert resp.status_code == 200 + doc_ids = resp.json()["document_ids"] + cleanup_doc_ids.extend(doc_ids) + + statuses = await poll_document_status( + client, headers, doc_ids, timeout=300.0 + ) + for did in doc_ids: + assert statuses[did]["status"]["state"] == "ready" + + async def test_pdf_document_fields_populated( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + resp = await upload_file(client, headers, "sample.pdf") + doc_ids = resp.json()["document_ids"] + cleanup_doc_ids.extend(doc_ids) + + await poll_document_status( + client, 
headers, doc_ids, timeout=300.0 + ) + + doc = await get_document(client, headers, doc_ids[0]) + _assert_document_ready(doc, expected_filename="sample.pdf") + assert doc["document_metadata"]["ETL_SERVICE"] in { + "DOCLING", + "UNSTRUCTURED", + "LLAMACLOUD", + } + + +# --------------------------------------------------------------------------- +# Test D: Upload multiple files in a single request +# --------------------------------------------------------------------------- + + +class TestMultiFileUpload: + """Upload several files at once and verify all are processed.""" + + async def test_multi_upload_returns_all_ids( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + resp = await upload_multiple_files( + client, headers, ["sample.txt", "sample.md"] + ) + assert resp.status_code == 200 + + body = resp.json() + assert body["pending_files"] == 2 + assert len(body["document_ids"]) == 2 + cleanup_doc_ids.extend(body["document_ids"]) + + async def test_multi_upload_all_reach_ready( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + resp = await upload_multiple_files( + client, headers, ["sample.txt", "sample.md"] + ) + doc_ids = resp.json()["document_ids"] + cleanup_doc_ids.extend(doc_ids) + + statuses = await poll_document_status(client, headers, doc_ids) + for did in doc_ids: + assert statuses[did]["status"]["state"] == "ready" + + +# --------------------------------------------------------------------------- +# Test E: Duplicate file upload (same file uploaded twice) +# --------------------------------------------------------------------------- + + +class TestDuplicateFileUpload: + """ + Uploading the exact same file a second time should be detected as a + duplicate via ``unique_identifier_hash``. 
+ """ + + async def test_duplicate_file_is_skipped( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + # First upload + resp1 = await upload_file(client, headers, "sample.txt") + assert resp1.status_code == 200 + first_ids = resp1.json()["document_ids"] + cleanup_doc_ids.extend(first_ids) + + await poll_document_status(client, headers, first_ids) + + # Second upload of the same file + resp2 = await upload_file(client, headers, "sample.txt") + assert resp2.status_code == 200 + + body2 = resp2.json() + assert body2["skipped_duplicates"] >= 1 + assert len(body2["duplicate_document_ids"]) >= 1 + cleanup_doc_ids.extend(body2.get("document_ids", [])) + + +# --------------------------------------------------------------------------- +# Test F: Duplicate content detection (different name, same content) +# --------------------------------------------------------------------------- + + +class TestDuplicateContentDetection: + """ + Uploading a file with a different name but identical content should be + detected as duplicate content via ``content_hash``. 
+ """ + + async def test_same_content_different_name_detected( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + tmp_path: Path, + ): + # First upload + resp1 = await upload_file(client, headers, "sample.txt") + assert resp1.status_code == 200 + first_ids = resp1.json()["document_ids"] + cleanup_doc_ids.extend(first_ids) + await poll_document_status(client, headers, first_ids) + + # Copy fixture content to a differently named temp file + src = FIXTURES_DIR / "sample.txt" + dest = tmp_path / "renamed_sample.txt" + shutil.copy2(src, dest) + + with open(dest, "rb") as f: + resp2 = await client.post( + "/api/v1/documents/fileupload", + headers=headers, + files={"files": ("renamed_sample.txt", f)}, + data={"search_space_id": str(TEST_SEARCH_SPACE_ID)}, + ) + assert resp2.status_code == 200 + second_ids = resp2.json()["document_ids"] + cleanup_doc_ids.extend(second_ids) + + if second_ids: + statuses = await poll_document_status(client, headers, second_ids) + for did in second_ids: + assert statuses[did]["status"]["state"] == "failed" + assert "duplicate" in ( + statuses[did]["status"].get("reason", "").lower() + ) + + +# --------------------------------------------------------------------------- +# Test G: Empty / corrupt file handling +# --------------------------------------------------------------------------- + + +class TestEmptyFileUpload: + """An empty file should be processed but ultimately fail gracefully.""" + + async def test_empty_pdf_fails( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + resp = await upload_file(client, headers, "empty.pdf") + assert resp.status_code == 200 + + doc_ids = resp.json()["document_ids"] + cleanup_doc_ids.extend(doc_ids) + + if doc_ids: + statuses = await poll_document_status( + client, headers, doc_ids, timeout=120.0 + ) + for did in doc_ids: + assert statuses[did]["status"]["state"] == "failed" + assert 
statuses[did]["status"].get("reason"), ( + "Failed document should include a reason" + ) + + +# --------------------------------------------------------------------------- +# Test H: Upload without authentication +# --------------------------------------------------------------------------- + + +class TestUnauthenticatedUpload: + """Requests without a valid JWT should be rejected.""" + + async def test_upload_without_auth_returns_401( + self, + client: httpx.AsyncClient, + ): + file_path = FIXTURES_DIR / "sample.txt" + with open(file_path, "rb") as f: + resp = await client.post( + "/api/v1/documents/fileupload", + files={"files": ("sample.txt", f)}, + data={"search_space_id": str(TEST_SEARCH_SPACE_ID)}, + ) + assert resp.status_code == 401 + + +# --------------------------------------------------------------------------- +# Test I: Upload with no files attached +# --------------------------------------------------------------------------- + + +class TestNoFilesUpload: + """Submitting the form with zero files should return a validation error.""" + + async def test_no_files_returns_error( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + ): + resp = await client.post( + "/api/v1/documents/fileupload", + headers=headers, + data={"search_space_id": str(TEST_SEARCH_SPACE_ID)}, + ) + assert resp.status_code in {400, 422} + + +# --------------------------------------------------------------------------- +# Test J: Document deletion after successful upload +# --------------------------------------------------------------------------- + + +class TestDocumentDeletion: + """Upload, wait for ready, delete, then verify it's gone.""" + + async def test_delete_processed_document( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + ): + resp = await upload_file(client, headers, "sample.txt") + doc_ids = resp.json()["document_ids"] + await poll_document_status(client, headers, doc_ids) + + del_resp = await delete_document(client, headers, doc_ids[0]) + 
assert del_resp.status_code == 200 + + get_resp = await client.get( + f"/api/v1/documents/{doc_ids[0]}", + headers=headers, + ) + assert get_resp.status_code == 404 + + +# --------------------------------------------------------------------------- +# Test K: Cannot delete a document while it is still processing +# --------------------------------------------------------------------------- + + +class TestDeleteWhileProcessing: + """Attempting to delete a pending/processing document should be rejected.""" + + async def test_delete_pending_document_returns_409( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + resp = await upload_file(client, headers, "sample.pdf") + assert resp.status_code == 200 + doc_ids = resp.json()["document_ids"] + cleanup_doc_ids.extend(doc_ids) + + # Immediately try to delete before processing finishes + del_resp = await delete_document(client, headers, doc_ids[0]) + assert del_resp.status_code == 409 + + # Let it finish so cleanup can work + await poll_document_status( + client, headers, doc_ids, timeout=300.0 + ) + + +# --------------------------------------------------------------------------- +# Test L: Status polling returns correct structure +# --------------------------------------------------------------------------- + + +class TestStatusPolling: + """Verify the status endpoint returns well-formed responses.""" + + async def test_status_endpoint_returns_items( + self, + client: httpx.AsyncClient, + headers: dict[str, str], + cleanup_doc_ids: list[int], + ): + resp = await upload_file(client, headers, "sample.txt") + doc_ids = resp.json()["document_ids"] + cleanup_doc_ids.extend(doc_ids) + + status_resp = await client.get( + "/api/v1/documents/status", + headers=headers, + params={ + "search_space_id": TEST_SEARCH_SPACE_ID, + "document_ids": ",".join(str(d) for d in doc_ids), + }, + ) + assert status_resp.status_code == 200 + + body = status_resp.json() + assert "items" in body + 
assert len(body["items"]) == len(doc_ids) + for item in body["items"]: + assert "id" in item + assert "status" in item + assert "state" in item["status"] + assert item["status"]["state"] in { + "pending", + "processing", + "ready", + "failed", + } + + await poll_document_status(client, headers, doc_ids) diff --git a/surfsense_backend/tests/fixtures/empty.pdf b/surfsense_backend/tests/fixtures/empty.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/surfsense_backend/tests/fixtures/sample.md b/surfsense_backend/tests/fixtures/sample.md new file mode 100644 index 000000000..7217540d8 --- /dev/null +++ b/surfsense_backend/tests/fixtures/sample.md @@ -0,0 +1,51 @@ +# SurfSense Test Document + +## Overview + +This is a **sample markdown document** used for end-to-end testing of the manual +document upload pipeline. It includes various markdown formatting elements. + +## Key Features + +- Document upload and processing +- Automatic chunking of content +- Embedding generation for semantic search +- Real-time status tracking via ElectricSQL + +## Technical Architecture + +### Backend Stack + +The SurfSense backend is built with: + +1. **FastAPI** for the REST API +2. **PostgreSQL** with pgvector for vector storage +3. **Celery** with Redis for background task processing +4. 
**Docling/Unstructured** for document parsing (ETL) + +### Processing Pipeline + +Documents go through a multi-stage pipeline: + +| Stage | Description | +|-------|-------------| +| Upload | File received via API endpoint | +| Parsing | Content extracted using ETL service | +| Chunking | Text split into semantic chunks | +| Embedding | Vector representations generated | +| Storage | Chunks stored with embeddings in pgvector | + +## Code Example + +```python +async def process_document(file_path: str) -> Document: + content = extract_content(file_path) + chunks = create_chunks(content) + embeddings = generate_embeddings(chunks) + return store_document(chunks, embeddings) +``` + +## Conclusion + +This document serves as a test fixture to validate the complete document processing +pipeline from upload through to chunk creation and embedding storage. diff --git a/surfsense_backend/tests/fixtures/sample.pdf b/surfsense_backend/tests/fixtures/sample.pdf new file mode 100644 index 0000000000000000000000000000000000000000..228071def423a70a69e28994be4e4248e8de12b9 GIT binary patch literal 18921 zcmeHvc_5VE_kU%JlCnf9jeR#8Gh@%bOLmGHj3vxq$dZs&Te3u=RHPz8c9ETgB$X{G zMOj1ALVov|8Kvd@{(OGlKfZr`$5S)+KIfi$&bjxVd!N_6=Mgs1)RsodU|EF2z=wqe z2}8h04u@EjlvrRexOt!#0R}h0yAa4wQiJ42^o1dzp9XGDWY|7529TTDv8br95Qt7t zITqAk80_)$^>8Cnd1$-&kbPljs2s${5WhOLQo$;I5amySOz=JgAOchgr8t$F>`Ict zjUv-eai!7FZ*wo6&OpoYG@bGYRArKu2t7@Stv8on`#eb;fhi)c5 zgu}2OdegMCBslt#d|;Y*U%UgJOdzkSj4JG^${8_a*qcd4g_d=7lE$ z3dp1cA5SvW4}G8_KWq#)p{4*+647OK0WHrTDIgFiU;x`E55xgo6(p#g2omB)t;T+! 
zf_(@WZiR?i>?bAE#s-8y(D_$s2vI{e+=qm8HH9H5%>zSH3njrs$=RZ4})g6j5b*<{CphNwq83E|>ZykBX1+1PgwMj&ZYcun6 z@TE+kkctSXFRKHu4BZ98R^7tJ!bH;0&4B%ICGK((G3(Mz)*k8G8D4kd+i@0Yv^f0 zdH;WsL*jyhW`Sg1f~OwQne@ZIni5>x$i6;-Fflc-OrXqyfOe0M6VMu{Cbps)G@pPQ zW`15?z{NpRJ!ChkGlt}Wl=gA+f@V1s#c4rRYFbgK;8u_aN5JF|2pAIl$suIHLIDMn zlgEMe1{ek?9GEd>VQ6`95B$mE)Cpf&a(BP;v<1%D_F~z=8h*7ZweE z0hXYgtQ_RK!G)0nUuft;%i$m=4=y=5Km-T?Z#fJ!8-Pn50Wz@Ag_Q-&04b0Lv7ohlVaW=oU~)7KQ~nLIgk$SuoUA%}IdO zYNupKAiDUv0)ZhzX}fv&5`4h72bg3v2~Y=tih_nA;5}Ia(!MP(FRxw0TuE0A%|W#6 z9}By``RrfYgi;4+C1X4^fB#Dtrx1GJUC6+GNCXzQQczu;6aY1%Gztw`LLLq2Ng4Ed zzIYEeM>V1gSj16Ak{Z99V}yF*CHbTy4=`Cl8uMgdO&OR2R4)q{6i#lAzO)o2&(Rau>0dpca1B(Ki{>4fByt&#@P;x{P}6c~I?||VI%-g(I0)+i{u5Br~IRI`jznm4S_w{e{+PtF8gTy0F9KTxT3%L zSjYjbtOZt2tQxBvML^nh%Rpa_(c)x8660-&2qt+akv3cdKfh3#k}02v$$H_pCyFri z(Hu2A9yW!9S6usMBc=Ooon`xOw=+(EBlgBeM@tX)>G2B*ZDLgpl=j~GmAxc{ntq-s(fZy(bH~Lo=VZu-zV- zAr5lpZuDHT;pF(br1y~r9{q^>OlY;Vw5UZ1+9=8HikR5o<%E4{cXS_#ovBvbx6l@v zdnD)udOT#urE~mI)luR>{vpO$hPIola`Rz=qRTzDe55V0kM`N`ZLT&sV8s^KarG3j zGAw6|lih%O+hJW|WD<|omci?rxz_GAejM9#MJT#?bS6Ma%IPVp$<;|+`@{!jF5V8! 
z+5FD)-uE7TKBZ;AZBs{&w~Kl%WnYQospelaQ!6)-Qj*%C#+W1|6X@u6w4rak#OsQ8 z)2gOX+@ie4CyHS&d7K=>@lB1lUo}lJ93S!z6&1>7n8-hQ_cVNc4?4axdPD9N`iatx zI_^U!_p=)sly$DlV`*TL{jT2ATKz8SV=Y}im%!sSl{4BW8ZKWsWE&+qEn}5=ZuWMX z1`{G>?%=+yNIvWxo`>p9=hwcspQzcQwHT%7tYX38y7yMD`GG7Q*Na&*qmvOb^2ZqM z(K*Lz9vNYRg5P)@SpK}dC#CM{?X?1)9;5@dXaUcVS8eV+6GKDYIxqGe#<0(S-IVe0 znBn+QxjP)M!$#PnZhpCcTFjblFWy61N$8ko9h(u?r1kN^>&06di!REv21e*{G`bcy zsi$sD>AG~gH_Y2xJ!Wsc>zT!F!)%^pXPcM8_$U6t7QQn%sF9Y^_+j%XXSyb3!Qh7b z)5EV8g1#OQHT;q#AZS_J<2PT{99R<4fiK+Kp2*tklB@KES7e0G)wEX2x@mdGwYn?o zZH4OdIQ&G{-*~b2$U6VBY}vzw6(v_ISf}4l&HFpgXSuW#9_AC6Uzcr#b!*t+TU@nc zPqwbqOB)S0BP|hWR0i2{vbM~wu18V$4bP;OKKT*ivvve@)KO++5bALdW70U5`^#quGi| zqy2U&FX?s-#+z>wPnP&}r?xxMBrZEggnUwNtH$;%Z}x}BT(_7xRS8?y*4B3Gh}s2% zox2=fr@O11a;XaYDySSr#%a>66$^@av58$zf2>-6+u4w%gyY8^J+N(ZXuP7`N%XwA zEH)>xfQzu-_aNe#&z}1O?5auH651jL@Ym$&jaA%2n`2I-Dh19%~q+y@-yxE-=iO({tcczYoZb_Y)IwE6K;7V*1 zF-5n^%*;K~_^cCl!l*_f;%%7R0~w7XBT`bw>*?hO%MT9vS*bClJW22JzPT+9len;O z(9_6BS>t%?=S|lh4>_b2x8#RpW~bk@8d2o<(6Dsqdc2g_&9(=Pg#*nO%{9{_^oX;yGYjULx5_WdJQZ{xS_bFoF)W4PCWnu&xi&f(8YYMT&=_-`UEFG~ zV56L^<5jb$HlB+4f}T3Zx}F_J!;h8S9Vf^j?Wga!-4SbYLzf)#&<&SwM#UE7oS5z| zF}(l%6ViCm_0y{uf<%+KF=ub9r*Qg1Jg?XkojFXh-M zyy#n(M_)+Cj@52XDifP#O60yjt2Mrz`vLQE&7u`ggSS9HWJW{-r){Kx{ILhR-(e<# z>kaK4HVFt#*uFA!nL|vF-iFlWzOsM)*?V7T(r8HPW-GJfol2Wz?rZJ*!likjIOF+l zZvBhtc3V5-xP(*nQcE7W-sSH;S5V68?#P(Jr|zzJsQXYH{qgKEtws5B%zfJx`XZLO z7PIDB&RTXEJxxioq_6V8SuDwW>KMv8!aChg#UJ|0)4{;dq?5ALhE18xyQ;<=CEXC5 z9;N#EY!o3W&~TAKuBGG4#F(z@30L2O1}6qM^VbB3_r7o}M!o<~?2|*|wwNuZdmk7IkNrWD^K-p_^=aT>et-n}WU!8;WRYIB3Xx9*&eh58T*TyJQw(ljEeg?^lexF+MDxx#D_Eq)+P5aw zXF~!nzu*^#Ke6Hyy?avORLN!%^GyrcQV-RnS0+WA99o|B7L%Xb%BQBByK;*yaLny9 z!AG~)TlZ`hI{xv%9mfMKY4cl|dPR$P3>DTbDVJ*2Y&?}Zh>6{fOs%~w*qI|Y+j`No zFjHjTKC_eE=I_Z^yo3I*3qwdX(>1)R=IOxQ60i6c=YncvzGwFi3udijoo;jZFjV0_ zFxFEx@kUntaGBF{;>NSJlJWjc>BE*@*MtaPwS@=#-``D0ekszp=THc%l?L}~`~*2& zcfj0Cm_Jz}vT$b9f0aR?9(}&#;RJgpX2Dl zx#;fwKB?sj@0^b(xbkizr#Do+ShjkW!1(+jR`)!d!;#Q+`9;9j!zn&&5yrehte)B% 
zs}`4C)Oxg7C5W?@mb1j!4y##_p7nm2+b+MoToo6^cyO2`{ABm3=Qf^6hj)Bh-x+R0 zJg%smnzWbA&D~Y{EaQ~e2Slio_R+gVquUwWk_;;XA`J+mNml14TU%<%Z!_rF?@yY_ ze|42a;7!^4vdwN=iu{A%uG4GS&g!zbw;68;WMHXtXV~R+(FGe)_(o5gfuCpLWiuy>T=X`g;3N5;vyjb7sq33)g3)?9_(7O+j7H@u~O`ySwXhFKc z+qxKcK!dYEgEMHG1d+=-zyy~^HlM1zo)O*`8haM+JbT2qMvY$~kXt&a+%VNhT-gfm(RJK=lDT=)Ble0&&?pIOeI}84sHgEKZ^Llu4Y;m#6VNus+ zPkQr!e?bf=yLI9~SJpc~tsb@xQ(x6J-0iO&TU z7k!-VF%gVeml`F^P$5EEW*79bM&4#sMecm3m)0WKIWrnN!e?mis5QdkvhCRAvxgHa zb5e_2;_lTS(MObj)3w?Xrx>KBc#MwuQ8iPP?#Iw;`(z!vWUirOjual=ZlC5pd)fEd zz89HU*H2E>?|u2P`+dcrTH%emfns6pBLSQ*PCQZReAlqE*fuvsiRfnTH}sB_QC?HW z_~6@z6a2_3S-HCxq*T622-o|I>@jBL?cd*?IcTF(Jmh+i_k@>Pp~wvZc;45p?(0o? zkDs)=r7|Oxrc7R}-}UBh{fA-1C$ndgZmRtUjKnh zqw=0?u58T%R*R(Sox7OHk#HZJB?9Dk0bpf+_**1woZ4ERqIQ0$>pQ zL69T_Rm#diKqdH-hqkJ4FgXArLVqaC3REd83$R8SR0+N)P$d=teo>%GSsVZ(sZiw) zxRDB2LSQKsu!LYzDqtyx2IVM#B>)*|fF&9bQ2|Q;+^s;BSPD)FL6ty$8dM1ZloY5E z1(8#rN{l@C0>v>fISj}HDuWAvv!Fugf@}a(fi7SL1a!#)2oMMXE-cUs4JrUO0!pDl zHh@Ec=m41ommC6sYv2QcvCsv9vtZW=T(W>GnsPyMK;{K@19hM&7Y=*@R2I7A0noS_ z^!s&_Vg*M1$4C6%VbtYi)-{|y48eaJj(^*$_%|T+Klfw+=>7wiDvhLpsQ^00BB)zr z|8p=^j)K%u!BlxU8HzQaQ3PSBl)X=ASV5}LuvF@f;A*q}x?i$lRwNYy{y#9)kw61Q z{|`*1fvsR{{sWjw11V`>BaQ+QB4}VE6+T2zfkrG1BBcV2D-bCSOr?U52pZ5xg)14$!L?K!QB(^f zX*$vXTB?pHT8gG4O#-TpSSol6%uWG})hX?SqUlI2hoS-E)V4rTb)><(C|Vm*t%#zv zA+;?~v^Iof{V6Yss;ewjb09WUFBVAuzYG)qdVGWC>SX0H|2T*IcQEluYvuqmLWQxk z%?aLSK2*{UB_^J-$mor*LvdSbPg-Q}R-7<-cd@I=txMS;A??i@CwZHF6+$mkF?#)( zyRD=Md({{?H%Fnu9wKmK>z{vN`(E|@TV`icilV}!gi#^+2MfzDXTQJv79tugM_`eZ zl9C!@ef&Lhe}NcUm1JO=C-xHcIbJ?<5N~EvW*<7uTYKqC zvZ-t8wv*p|lb7hD8hL|1GH;di7<1f{Z>_q#11ah|ez;+vvp@Gt;O!6jh)tpO*$uas z^v2l}P>llKuX1d9PnKUxG@rTsV2{C8Y5Uo>crAmZjz}Y;i*oOu-7fXH+ZE5=aM$u} zl-XUMxvhoPYz1}Jds@@lvLks#FAx{30w#9c^z|yZ_3CW8Am1|`!|@9WD9;A=ne}Az z%^fU`1=hE}Ts@Oz7NpF@0(;YMcJ-l`kb8)0dCcwY_)BckA@901rca6*%h%Y~Sl+$= z_6?cnofB9YFsbyxtKR%^_oExa#@}9^d8r((T(}s=_Qm=87oB7!iSO;C(78s(X>I*; zDZ2B%9znOSd@W_-+Y|MK4Hc@)o3+bnWYekf8;HJ2WQJ|W`Q;SxHIiWrx5Ku65<9(( 
z^Mp!mC)R{s%JF`IPW%3{4fmwOB-gYvV&xa>72tE;H4-uOa?j6^hcH7MF6d5eJ7UFT z*<9Y8eD@l!zqzT%_Org(L2^0|`1r1)&++&eONPI?sCw;@_qh%(uk79nC&qmh*hN2y zm9+ZLuCGk+z1C7LnKzil*16XBhO#*0E-tT(DA|X_7i@ErI}Z)6@k?Ymyi=G-aHOkf zFgGjkVT{&Dj(Ui9?unEcOwaJGlJVUy+Y;LXACCEm4*`y zlcpMnPi_8wxzbS0pp0b{Ut$M~`7V&eBv}LMDEMv$GcyouA=jD;-uQS`)m0!JJITx*6 zwMF@w-@9e89d^wzNn29S%(&h&7v-9mjhE(l?12_HvN{>uvsV)T%;ux%hn|HXw>MTV zRTaaFj`@Yb*C>{^yjOd&P1(7N=V5_VV7O1Q=?x6$_U?Q57`rN)^`$-;Z~J)K--xW0 z;F(})^b6(4lx`Bj+HncIh`O*t%~-=&!$9M1rCL{f4FBN0+xL{x*fO~6FP#YN%SqPW zgylR+?;IMQIZtPJun#{_yvH@aUmP7-WaC=Z5uS4~HNyDg=gE=jj;Zx|ZVhoG*l~_c zn>JJ`jve(AeVBfy#fQ5d zHr7L(DiR*C!f^(KR4tyVh&!Em>-{>@+>x1A%-^3xPAx69agv;xpRy*`lG6oVTq!+p z&RDUK>2coKL&@89y8MjmL~aBgvWUC3v0aF+3KtN+VdP%-Vr${Mvi3k@o1^YAm38X2 zz9P}G@kJUvhg%-M9hVIEIG@xR-$Q1{bXCDf^XbplC+~bDKfPdmR0mi9+-1G}1IHa?xxY;CU8buww%XqJG`@7} z={sJ@=iOhQ@rGefnQ(oKkKx!oNHo2u${p@fe*UX|R|M<)f;oL>Ud3WDE;y#r@M`Z@ zc**z80@M9XgLm!fmI~i2-tEV1Z6G!l*%oWN4EsrP^p{>sEr~olSs%AHMmtyFl2V8S zr(Rf3bN-Q-t`^DkdrIrxJdfLwe@{IhAx*;Dslnr<64s^bVg8n`PXeaGDW zoejl@i*2-LhSc))*7OvQda}A&#lB$>z8W{Yh`PjmA^2d=jX3SWa>iMbKxk!s8GXi_ zdhW;h2;HEYYqRS^$BtQkjTeU}_H*sGusi-NHhjMRU|+FW#B$j7naomiX0chDS<6WM znrV30;^e&&TMp@a&qQ36-z|S0kVk65+{XuRkU2JL%QrFY84GT^QJuf*4sw80S8L|) zvflLj$FL>k@lZd>-K=}R7DD>ARy3?Iw9b{c`Dp`}{2XUvfKL3j<&v_@a=9!+#l8)QhD8igd z$h(Hphu|}8X&n~teI=rM4qQ5w*Poi67}nf8daqc2{jQ@VyR4gk%wE}sVnk6Y>>HE*mF~#d$v?^u`T0!{?vF)fbzZV z6C7=d8+s3Rq#O&n!!|DeT45>gLx=N+^})_|`ws`BlGqRAzh~v|*>spCkFAP+w610T zePxK~y0~Og+5JV9(R$s+H5^l{i;k@6n?`QevEu&Xjm9Fd+BQ5BPQ=^T%xV>*af~GO!&vJjVEE9 z+>u^NcgrmwJ2`LoZse>c_1v-IN^`Vr2IG@+!cGs4aq4p-{rAVZN;-|7&}EArKI#%Cm=Dun`s$c6uAI%}OIqWP($vDKJ+$r_?`xN_WkZ z-ls+3ul2~NR&u9D@u>2SPXDq``U$N5-bS{^8INyS+rg9Pmb+I&j5&rqA&2WCN_vmB zSdm1^<8}2XgKy@ZN1*9;k8JqnEVSbuuc_Wt>i+)Y8w!lQYU&0V@7&N&vW?)f-Fcl@ zR8c#bo%CEc`@_Yw9ggf@>*}nA1eEaGOHI)4XXW3GXmWFSa|}$}SaeL34NN?My45V( z|J9cH^GJWp^tS79FGFW%*PE`jr&DG2Rr87d=yBh&%ynDA%(0#7;kgU7&iJHlcUabT zXz`||&3EwLZrwZWcMX$jOl*wfV2QeWo7eLu$)h3eSzq0=qm`egjW`{3Puk`cTs#?k 
zVbkm&^M-XgIM+3S;gPLdZ(Pck^t#X(cRpeRS5eogx#25?93DYfyLBHh=Z=c!8Np*b zR7;oVEHn-_pTCHIQe>SLW=$EPzyt;Mbb#n2D`$EcGU*C%u=Ue zUH@Q9d+&R?7hbm>lBhoy74>?P)8tuhnI)xfsNJ=pRczLi!Z9hV)<<5Co1-FwIS*XO zc{1|KDI&c4O3JPCr|pXRZDNu!MK{;I<4$J{?YHVndtW=b)r!DysKE8S?AiT?lT#%) zH*uDit*=&9*muRx{JlhDQGUNmH*s<-vI-gD3U`Z_!NE%E)`hMy z#tc#vi;JP{`g!sFPA~*!`%PB*Q^6t+TZGK^mMzhbCQXm^ z)~kF8kt}&JCF7`uEfPZyxU2T^&1iW%2uE zrp!l-2?9bq0|!}4+a+F{s-NT++B0UY`c%#DGhG6HW{QXOVv%hL~nHBvo} z;f=5@d5sf{IgPGwulv=WUdxz$;tGo)gY^-ntl}IgpGc+~?fcdz*Yoi2>T?(NPd|5R zDL{IM2Q%xHxiYJq_R*N0rK6wXD_a_hc6R4Q9R4;P5ok#7_c1|vS-maTwX#y1m(jkO zcav0&)I0v&NJDye{YH21(n1v3kr^8&)t%&^Yf#>6h}(Oy zr2jUiGU1tG;=S;o*uL2BYqDO@KU(+wwSmEKLS$i=w?V~~>IaVQS}%9*pO<|z5i+&# zKE6U*d{gxP(g*ghYCgj{MDSaK>ZK08@}ywOaD2CJp&55!$aO))!CI5Z*O`xEj4C3siede#J2Jr zm|(x!B4u=Rt%$a`5A)ITZ}R83EIM8C*-}JG_Q23SPei1@u6J>6=c~Yt`F#$uxp-E} zJ-uMZ1pM8N?5{EC&)IFDFq%$jxW%*j=w4~X- z5PNl&u{7zn(4?(SxzT&6YJ_UG++&;F4+3oUT^jbA?|gg6UC?e=Tfi(&yq_tfCTi~q z8+X(OqoCstP8^*nzVU*@da-GEYl4lFz9~wWFZMMqE;&0f0{uiwp5aSlZ{a#=w#6E) z>cBNj{vpgvi<;IxDce43E7YP73I}ZSS<79{{Q8xtnuUQ@)Dtp8!~8(Vc0=!-*|*0s znwyvI8|WcZ&c~^w==1s9wd>E5F%D$es(mZ+e4o#A_H#TI0lH>Kf|ZsZIIT+&XwAwT z)>bn*St!n!kGDuP3i2uyFh~-4D4{p1_x5`0fN7nW4%z_KDwOxy*}pR=i*A&(eLF)e z=QG6q^u4_A?{AvbIcG4W$ipYm5_;j5`Xf;lC*Jb&Cp7X*lOH}rzA?4ozk1##Fpn;x z%h! 
zE3HhwbwJ*ChUM&zPqGhv#=X9mVjkF?#YV4#hfmjVyfT!UOBlMQoh}`5+(3I&Ys-$Z zL>pMh={%hKHu#}+<);nx^saZQ-?}PP6lCoveaN1*B)0Ebq+3+$Sp}&QeO@N!=<6kS zB)o#^P`+6Wua&#MN9cdR#_p<9U9%+sw)hc^*_y;8CqQX)$Y>Hw`9S>n!OWYFUl&&#Dz+BW>#Zu@XvSl{mFv^lVW;K5 zSBmn-=%d+eo7`@eUGLfc<>^~4K}nh8c7iZJdU<8vg{c_ko2{N2SHi}TX?_7~xFk;g&B~n+gM_ePk6a>_lpl5blCguso{8?~!y!{*eIHM%75W3?IQSbKup zoVYIN`=L<|vHN1T75V~UZv&G2w)fi&Zk4>K z@LL%oCE=ay3g$QD_7&p_*G@UMd0l=rXp8RHQe<2lf7-*4$1XF(EUD54ou{K-c>gLt zPG0dg+P$x>IZ#)F8y)L86n5gVL`kb|@)I%7LTe)&JJT%i8OQ2hUYQHGIk0A6yA7)V zUdWm;ooP-{C7TIe$8b=bGralS$oYfHpY>p$Y;N++E$w$sIPxGmI$E!A#%*7u?WdPv zx`{T|pR@DwbydtVwCMLF-7$I^^KIK$E8m_Cy|Q0C=1Pg^R>ED-=CRUw+*C{NU7uEAWPT7P-gK z(YN*30$Y+L6`4W zOaLoF;S?;C^v}Qw8sLYb?wJ@G18%nrWPxyLM+DwAU}-Dv!ZAoPac07m@*`_r z<8w{dxpR2c4fPZ;&Z$-Nd0MS~!nrq5&vkh?nl+7GIH}g-dzF@)rBz7wBCpfs;_udB z6XG8q6@6EDUJ<(X6UWE14smCWxzh>8h}=6+m$?DGfGAjC?cvj#_*+Y?oW`dGpTgkQ zHnw1EO9qF50yeN{c^NDQg#l-De+JTly+T^+QA5#wD}%ww$;hFh4Xjl~R`$&QUJV9~ zltCkYE3&d-{r4hR98v~>f&^Gq%dfkP)cr3g+!oqLTRDhK{t-p9a`=899Ex5AVUZy2 z6t3q9=rw5H>d>}2w6#t0Ao+j*O0b_q3##vXVh5MfneIL~wG$(*icYHZ|A}L}5YL3r-d(gTu*Fw;HK2cGNIRGBt98ngFNo zsVsr0AUqZdU;^8;P!?eRV?_PBQ%W0A@~HnRn5EcSnbeQi{9<16LWs7;J3;Yy_RUT$ z&7JGEMFzPm$THq*bltfe%IU3nF<4=*`Uxo!sW_d%O8@Dzwo8lOTtsEylKhqn*q1~G z3oEPFUq5AiwGvkgZ#c<**=aoO&=_-*9Kb@}-C>XTLk!B_k z1fTUk`I?MhGG9^%g4OiHbb(B~}=dNpb6^vB~i@RS!Iydhdw7AmHgp zz6qzznBdZ-X4biO)#Uw?`Z=#Ji)h?h3*Q*Hue|E*;LR$h|dQ#Yo zOv;h4l;HTy$rqct)v@c0iC+V@eZxc`_?11R`;Hwv74}L%*~q~KnH;k!HcL)-!qTH>JlD2O^ko&r_Xzb z+XAh`Kl4=jU>gS}&qsVucsrk2b<)hnU_*z9$9y@65PJ`SPy9(bM;scEnarHo!Gks< zRrmrup^=STUlU(2C#&Y4Xmg?aCcE$Lp3~2#B_oPWIEB3mTl)m+FC}O85wxC}NcRN@ z6dLxIyUZ!NTOAoZ$-Yxa*_Gcp*y(NJa^Ac97F`#8mJb-u#70b~s5jYfPTkl!bB5Ve z^%1e<2J(8%)I`y^*rgkr5H~ND3F~hgl+6!g{3i7vV?J&$dR#%JAVOJ?zPU=pWopP9 z7jvnrCL;!Zw-x5vL++^`PwEH=- zPL_PWP+H-7(MM0k`RiAt&PNePBAm2&vz8=CoH^KZ<4jS%BFU?cig4D);Ze)ayQ(UZ zH&w%rKC1j8cmyeSLC1#Z%&a}wwNdZU!wzfa7H8o;9$oLstdvjSpPt!@vE4&I}biiBWeht@s*FMXK z#?lpc^Iwdnc5@FkTMrBS+zV7IVEKF&TerMckZrT+-+cP7rvfxUI2sh;CkFf>8AJfe 
zAZ2Axlt^Kp{~GTgq)pK@h<7k2I@&%U`$WAq?83*eSN(Y;BbF`e;I$Y49Uj@js#JJa0CjDLP-M( zX>uUZ7at%^Bn#3Qti(_PL%|@1cn6Z7ud)EBlOl|h;}51@em)))mQIdv0yQ3!3`fc! zfqW~HdwTt#CX!_+TF5w(JmCR&FF=X-g%TVUUlI4Ow4R>7kei#h{5!ob+{@)>39nVw z_=P>V0YpIqN?>_UF9llnGAEHdR95#eCHU`mm7E+EJiv<056rMsR zIAlNYllp^l6aGsXh$HBf`D%EfIlz{9CU`0d=>{9`+Gf{%{S)E61p;cX<)@EQdfnPrc zq{`0$4eD1G@Bzd1FAfVFN?9FbOLZ3-z*TsVTp({jG5yaa&Wa;}PMy(KJ%*rL!Ndqe zir=EHlAsq-KySqW=lf_xMtC3hU&5%N4DjRuUZMf55Gihn>;+!>L54ZwJ;+cVb!lel zM(`*2fR}hUgFrXXd_R|vnr=RT9vb>^Llm?Sq!w6tLkl>A_}dyR05D+z=i`tp&;dK@ z2b`~y1E=hqS%2h#Q-PFQFyhZV99ZH(pH+EC%%6EEd9c*}jSemUCqED%{RbTeocQ~_ zEOff>_dI!U6!Et_Btq^Fen=!Z=lDAvO773HXs81~pP#yc@V`ImMM75koel$n#DCAj z{)G`IP3UZSug|sVFPGp{DThnXT4|~2!;QR9~SwiZdfQ_{dYPL0R4L&4)JFl zIP{-wBrk{gogY#bJW~C}59EQ+_h0Kp$|0ch!_eo4?Li*)PdX$xQ~Em{68n1_A>~le zv;A-UKyp^}&Ovau5Sdd7YA(TMz*nNnqeo20F!! tK!XrY8Rg(;DsABFX~Oo2{gcBLhyC*{${AMRvVD9h;}+;Eho%Oy{y$Hv=Vkx^ literal 0 HcmV?d00001 diff --git a/surfsense_backend/tests/fixtures/sample.txt b/surfsense_backend/tests/fixtures/sample.txt new file mode 100644 index 000000000..0ee6513b9 --- /dev/null +++ b/surfsense_backend/tests/fixtures/sample.txt @@ -0,0 +1,34 @@ +SurfSense Document Upload Test + +This is a sample text document used for end-to-end testing of the manual document +upload pipeline in SurfSense. The document contains multiple paragraphs to ensure +that the chunking system has enough content to work with. + +Artificial Intelligence and Machine Learning + +Artificial intelligence (AI) is a broad field of computer science concerned with +building smart machines capable of performing tasks that typically require human +intelligence. Machine learning is a subset of AI that enables systems to learn and +improve from experience without being explicitly programmed. + +Natural Language Processing + +Natural language processing (NLP) is a subfield of linguistics, computer science, +and artificial intelligence concerned with the interactions between computers and +human language. 
Key applications include machine translation, sentiment analysis,
+text summarization, and question answering systems.
+
+Vector Databases and Semantic Search
+
+Vector databases store data as high-dimensional vectors, enabling efficient
+similarity search operations. When combined with embedding models, they power
+semantic search systems that understand the meaning behind queries rather than
+relying on exact keyword matches. This technology is fundamental to modern
+retrieval-augmented generation (RAG) systems.
+
+Document Processing Pipelines
+
+Modern document processing pipelines involve several stages: extraction, transformation,
+chunking, embedding generation, and storage. Each stage plays a critical role in
+converting raw documents into searchable, structured knowledge that can be retrieved
+and used by AI systems for accurate information retrieval and generation.
diff --git a/surfsense_backend/tests/utils/__init__.py b/surfsense_backend/tests/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/surfsense_backend/tests/utils/helpers.py b/surfsense_backend/tests/utils/helpers.py
new file mode 100644
index 000000000..7c68bfac5
--- /dev/null
+++ b/surfsense_backend/tests/utils/helpers.py
@@ -0,0 +1,179 @@
+"""Shared test helpers for authentication, polling, and cleanup."""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import os
+from pathlib import Path
+
+import httpx
+
+FIXTURES_DIR = Path(__file__).resolve().parent.parent / "fixtures"
+
+# Connection settings and credentials; each is overridable via the environment.
+BACKEND_URL = os.environ.get("TEST_BACKEND_URL", "http://localhost:8000")
+TEST_EMAIL = os.environ.get("TEST_USER_EMAIL", "testuser@surfsense.com")
+TEST_PASSWORD = os.environ.get("TEST_USER_PASSWORD", "testpassword123")
+TEST_SEARCH_SPACE_ID = int(os.environ.get("TEST_SEARCH_SPACE_ID", "1"))
+
+
+async def get_auth_token(client: httpx.AsyncClient) -> str:
+    """Log in with the test credentials and return the JWT access token.
+
+    Raises ``AssertionError`` when the login endpoint does not answer 200.
+    """
+    response = await client.post(
+        "/auth/jwt/login",
+        data={"username": TEST_EMAIL, "password": TEST_PASSWORD},
+        headers={"Content-Type": "application/x-www-form-urlencoded"},
+    )
+    assert response.status_code == 200, (
+        f"Login failed ({response.status_code}): {response.text}"
+    )
+    return response.json()["access_token"]
+
+
+def auth_headers(token: str) -> dict[str, str]:
+    """Return Authorization header dict for a Bearer token."""
+    return {"Authorization": f"Bearer {token}"}
+
+
+async def upload_file(
+    client: httpx.AsyncClient,
+    headers: dict[str, str],
+    fixture_name: str,
+    *,
+    search_space_id: int = TEST_SEARCH_SPACE_ID,
+    filename_override: str | None = None,
+) -> httpx.Response:
+    """Upload a single fixture file and return the raw response.
+
+    *fixture_name* is resolved relative to ``FIXTURES_DIR``. The file is sent
+    under *filename_override* when that is a non-empty string, otherwise under
+    *fixture_name*. The response status is not checked here.
+    """
+    file_path = FIXTURES_DIR / fixture_name
+    upload_name = filename_override or fixture_name
+    with file_path.open("rb") as f:
+        return await client.post(
+            "/api/v1/documents/fileupload",
+            headers=headers,
+            files={"files": (upload_name, f)},
+            data={"search_space_id": str(search_space_id)},
+        )
+
+
+async def upload_multiple_files(
+    client: httpx.AsyncClient,
+    headers: dict[str, str],
+    fixture_names: list[str],
+    *,
+    search_space_id: int = TEST_SEARCH_SPACE_ID,
+) -> httpx.Response:
+    """Upload multiple fixture files in a single request.
+
+    Every name in *fixture_names* is resolved relative to ``FIXTURES_DIR``.
+    The response status is not checked here.
+    """
+    # ExitStack closes every handle opened so far, even if a later open()
+    # or the request itself raises.
+    with contextlib.ExitStack() as stack:
+        files = []
+        for name in fixture_names:
+            fh = stack.enter_context((FIXTURES_DIR / name).open("rb"))
+            files.append(("files", (name, fh)))
+
+        return await client.post(
+            "/api/v1/documents/fileupload",
+            headers=headers,
+            files=files,
+            data={"search_space_id": str(search_space_id)},
+        )
+
+
+async def poll_document_status(
+    client: httpx.AsyncClient,
+    headers: dict[str, str],
+    document_ids: list[int],
+    *,
+    search_space_id: int = TEST_SEARCH_SPACE_ID,
+    timeout: float = 180.0,
+    interval: float = 3.0,
+) -> dict[int, dict]:
+    """
+    Poll ``GET /api/v1/documents/status`` until every document reaches a
+    terminal state (``ready`` or ``failed``) or *timeout* seconds elapse.
+
+    Returns a mapping of ``{document_id: status_item_dict}``.
+
+    Raises ``AssertionError`` when a poll does not answer 200 and
+    ``TimeoutError`` when the deadline passes first.
+    """
+    ids_param = ",".join(str(d) for d in document_ids)
+    terminal_states = {"ready", "failed"}
+    # Bound up front so the timeout message works even if no poll ever ran
+    # (e.g. timeout <= 0).
+    items: dict[int, dict] = {}
+    # Measure wall-clock time: summing sleep intervals would ignore the time
+    # spent inside each request and let the loop overrun *timeout*.
+    loop = asyncio.get_running_loop()
+    deadline = loop.time() + timeout
+
+    while loop.time() < deadline:
+        resp = await client.get(
+            "/api/v1/documents/status",
+            headers=headers,
+            params={
+                "search_space_id": search_space_id,
+                "document_ids": ids_param,
+            },
+        )
+        assert resp.status_code == 200, (
+            f"Status poll failed ({resp.status_code}): {resp.text}"
+        )
+
+        items = {item["id"]: item for item in resp.json()["items"]}
+        if all(
+            items.get(did, {}).get("status", {}).get("state") in terminal_states
+            for did in document_ids
+        ):
+            return items
+
+        await asyncio.sleep(interval)
+
+    raise TimeoutError(
+        f"Documents {document_ids} did not reach terminal state within {timeout}s. "
+        f"Last status: {items}"
+    )
+
+
+async def get_document(
+    client: httpx.AsyncClient,
+    headers: dict[str, str],
+    document_id: int,
+) -> dict:
+    """Fetch a single document by ID.
+
+    Raises ``AssertionError`` when the endpoint does not answer 200.
+    """
+    resp = await client.get(
+        f"/api/v1/documents/{document_id}",
+        headers=headers,
+    )
+    assert resp.status_code == 200, (
+        f"GET document {document_id} failed ({resp.status_code}): {resp.text}"
+    )
+    return resp.json()
+
+
+async def delete_document(
+    client: httpx.AsyncClient,
+    headers: dict[str, str],
+    document_id: int,
+) -> httpx.Response:
+    """Delete a document by ID, returning the raw response."""
+    return await client.delete(
+        f"/api/v1/documents/{document_id}",
+        headers=headers,
+    )