add docstrings to all indexing pipeline tests

This commit is contained in:
CREDO23 2026-02-25 20:30:31 +02:00
parent 4293910e8e
commit 0de74f4bf7
7 changed files with 48 additions and 0 deletions

View file

@@ -9,6 +9,7 @@ pytestmark = pytest.mark.integration
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
"""Document status is READY after successful indexing."""
await index_uploaded_file(
markdown_content="## Hello\n\nSome content.",
filename="test.pdf",
@@ -29,6 +30,7 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
"""Document content is set to the LLM-generated summary."""
await index_uploaded_file(
markdown_content="## Hello\n\nSome content.",
filename="test.pdf",
@@ -49,6 +51,7 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker):
"""Chunks derived from the source markdown are persisted in the DB."""
await index_uploaded_file(
markdown_content="## Hello\n\nSome content.",
filename="test.pdf",
@@ -75,6 +78,7 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker
@pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text")
async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, mocker):
"""RuntimeError is raised when the indexing step fails so the caller can fire a failure notification."""
with pytest.raises(RuntimeError):
await index_uploaded_file(
markdown_content="## Hello\n\nSome content.",

View file

@@ -11,6 +11,7 @@ pytestmark = pytest.mark.integration
async def test_sets_status_ready(
db_session, db_search_space, make_connector_document, mocker,
):
"""Document status is READY after successful indexing."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -30,6 +31,7 @@ async def test_sets_status_ready(
async def test_content_is_summary_when_should_summarize_true(
db_session, db_search_space, make_connector_document, mocker,
):
"""Document content is set to the LLM-generated summary when should_summarize=True."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -49,6 +51,7 @@ async def test_content_is_summary_when_should_summarize_true(
async def test_content_is_source_markdown_when_should_summarize_false(
db_session, db_search_space, make_connector_document,
):
"""Document content is set to source_markdown verbatim when should_summarize=False."""
connector_doc = make_connector_document(
search_space_id=db_search_space.id,
should_summarize=False,
@@ -72,6 +75,7 @@ async def test_content_is_source_markdown_when_should_summarize_false(
async def test_chunks_written_to_db(
db_session, db_search_space, make_connector_document, mocker,
):
"""Chunks derived from source_markdown are persisted in the DB."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -94,6 +98,7 @@ async def test_chunks_written_to_db(
async def test_embedding_written_to_db(
db_session, db_search_space, make_connector_document, mocker,
):
"""Document embedding vector is persisted in the DB after indexing."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -114,6 +119,7 @@ async def test_embedding_written_to_db(
async def test_updated_at_advances_after_indexing(
db_session, db_search_space, make_connector_document, mocker,
):
"""updated_at timestamp is later after indexing than it was at prepare time."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -136,6 +142,7 @@ async def test_updated_at_advances_after_indexing(
async def test_no_llm_falls_back_to_source_markdown(
db_session, db_search_space, make_connector_document,
):
"""When llm=None and no fallback_summary, content falls back to source_markdown."""
connector_doc = make_connector_document(
search_space_id=db_search_space.id,
should_summarize=True,
@@ -160,6 +167,7 @@ async def test_no_llm_falls_back_to_source_markdown(
async def test_fallback_summary_used_when_llm_unavailable(
db_session, db_search_space, make_connector_document,
):
"""fallback_summary is used as content when llm=None and should_summarize=True."""
connector_doc = make_connector_document(
search_space_id=db_search_space.id,
should_summarize=True,
@@ -184,6 +192,7 @@ async def test_fallback_summary_used_when_llm_unavailable(
async def test_reindex_replaces_old_chunks(
db_session, db_search_space, make_connector_document, mocker,
):
"""Re-indexing a document replaces its old chunks rather than appending."""
connector_doc = make_connector_document(
search_space_id=db_search_space.id,
source_markdown="## v1",
@@ -215,6 +224,7 @@ async def test_reindex_replaces_old_chunks(
async def test_llm_error_sets_status_failed(
db_session, db_search_space, make_connector_document, mocker,
):
"""Document status is FAILED when the LLM raises during indexing."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -234,6 +244,7 @@ async def test_llm_error_sets_status_failed(
async def test_llm_error_leaves_no_partial_data(
db_session, db_search_space, make_connector_document, mocker,
):
"""A failed indexing attempt leaves no partial embedding or chunks in the DB."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)

View file

@@ -11,6 +11,7 @@ pytestmark = pytest.mark.integration
async def test_new_document_is_persisted_with_pending_status(
db_session, db_search_space, make_connector_document
):
"""A new document is created in the DB with PENDING status and correct markdown."""
doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -31,6 +32,7 @@ async def test_new_document_is_persisted_with_pending_status(
async def test_unchanged_ready_document_is_skipped(
db_session, db_search_space, make_connector_document, mocker,
):
"""A READY document with unchanged content is not returned for re-indexing."""
doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -48,6 +50,7 @@ async def test_unchanged_ready_document_is_skipped(
async def test_title_only_change_updates_title_in_db(
db_session, db_search_space, make_connector_document, mocker,
):
"""A title-only change updates the DB title without re-queuing the document."""
original = make_connector_document(search_space_id=db_search_space.id, title="Original Title")
service = IndexingPipelineService(session=db_session)
@@ -69,6 +72,7 @@ async def test_title_only_change_updates_title_in_db(
async def test_changed_content_is_returned_for_reprocessing(
db_session, db_search_space, make_connector_document
):
"""A document with changed content is returned for re-indexing with updated markdown."""
original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1")
service = IndexingPipelineService(session=db_session)
@@ -91,6 +95,7 @@ async def test_changed_content_is_returned_for_reprocessing(
async def test_all_documents_in_batch_are_persisted(
db_session, db_search_space, make_connector_document
):
"""All documents in a batch are persisted and returned."""
docs = [
make_connector_document(search_space_id=db_search_space.id, unique_id="id-1", title="Doc 1", source_markdown="## Content 1"),
make_connector_document(search_space_id=db_search_space.id, unique_id="id-2", title="Doc 2", source_markdown="## Content 2"),
@@ -111,6 +116,7 @@ async def test_all_documents_in_batch_are_persisted(
async def test_duplicate_in_batch_is_persisted_once(
db_session, db_search_space, make_connector_document
):
"""The same document passed twice in a batch is only persisted once."""
doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -127,6 +133,7 @@ async def test_duplicate_in_batch_is_persisted_once(
async def test_created_by_id_is_persisted(
db_session, db_user, db_search_space, make_connector_document
):
"""created_by_id from the connector document is persisted on the DB row."""
doc = make_connector_document(
search_space_id=db_search_space.id,
created_by_id=str(db_user.id),
@@ -145,6 +152,7 @@ async def test_created_by_id_is_persisted(
async def test_metadata_is_updated_when_content_changes(
db_session, db_search_space, make_connector_document
):
"""document_metadata is overwritten with the latest metadata when content changes."""
original = make_connector_document(
search_space_id=db_search_space.id,
source_markdown="## v1",
@@ -171,6 +179,7 @@ async def test_metadata_is_updated_when_content_changes(
async def test_updated_at_advances_when_title_only_changes(
db_session, db_search_space, make_connector_document
):
"""updated_at advances even when only the title changes."""
original = make_connector_document(search_space_id=db_search_space.id, title="Old Title")
service = IndexingPipelineService(session=db_session)
@@ -192,6 +201,7 @@ async def test_updated_at_advances_when_title_only_changes(
async def test_updated_at_advances_when_content_changes(
db_session, db_search_space, make_connector_document
):
"""updated_at advances when document content changes."""
original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1")
service = IndexingPipelineService(session=db_session)
@@ -213,6 +223,7 @@ async def test_updated_at_advances_when_content_changes(
async def test_same_content_from_different_source_skipped_in_single_batch(
db_session, db_search_space, make_connector_document
):
"""Two documents with identical content in the same batch result in only one being persisted."""
first = make_connector_document(
search_space_id=db_search_space.id,
unique_id="source-a",
@@ -238,6 +249,7 @@ async def test_same_content_from_different_source_skipped_in_single_batch(
async def test_same_content_from_different_source_is_skipped(
db_session, db_search_space, make_connector_document
):
"""A document with content identical to an already-indexed document is skipped."""
first = make_connector_document(
search_space_id=db_search_space.id,
unique_id="source-a",
@@ -265,6 +277,7 @@ async def test_same_content_from_different_source_is_skipped(
async def test_failed_document_with_unchanged_content_is_requeued(
db_session, db_search_space, make_connector_document, mocker,
):
"""A FAILED document with unchanged content is re-queued as PENDING on the next run."""
doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -289,6 +302,7 @@ async def test_failed_document_with_unchanged_content_is_requeued(
async def test_title_and_content_change_updates_both_and_returns_document(
db_session, db_search_space, make_connector_document
):
"""When both title and content change, both are updated and the document is returned for re-indexing."""
original = make_connector_document(
search_space_id=db_search_space.id,
title="Original Title",