add docstrings to all indexing pipeline tests

This commit is contained in:
CREDO23 2026-02-25 20:30:31 +02:00
parent 4293910e8e
commit 0de74f4bf7
7 changed files with 48 additions and 0 deletions

View file

@@ -9,6 +9,7 @@ pytestmark = pytest.mark.integration
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
"""Document status is READY after successful indexing."""
await index_uploaded_file(
markdown_content="## Hello\n\nSome content.",
filename="test.pdf",
@@ -29,6 +30,7 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
"""Document content is set to the LLM-generated summary."""
await index_uploaded_file(
markdown_content="## Hello\n\nSome content.",
filename="test.pdf",
@@ -49,6 +51,7 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker):
"""Chunks derived from the source markdown are persisted in the DB."""
await index_uploaded_file(
markdown_content="## Hello\n\nSome content.",
filename="test.pdf",
@@ -75,6 +78,7 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker
@pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text")
async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, mocker):
"""RuntimeError is raised when the indexing step fails so the caller can fire a failure notification."""
with pytest.raises(RuntimeError):
await index_uploaded_file(
markdown_content="## Hello\n\nSome content.",

View file

@@ -11,6 +11,7 @@ pytestmark = pytest.mark.integration
async def test_sets_status_ready(
db_session, db_search_space, make_connector_document, mocker,
):
"""Document status is READY after successful indexing."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -30,6 +31,7 @@ async def test_sets_status_ready(
async def test_content_is_summary_when_should_summarize_true(
db_session, db_search_space, make_connector_document, mocker,
):
"""Document content is set to the LLM-generated summary when should_summarize=True."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -49,6 +51,7 @@ async def test_content_is_summary_when_should_summarize_true(
async def test_content_is_source_markdown_when_should_summarize_false(
db_session, db_search_space, make_connector_document,
):
"""Document content is set to source_markdown verbatim when should_summarize=False."""
connector_doc = make_connector_document(
search_space_id=db_search_space.id,
should_summarize=False,
@@ -72,6 +75,7 @@ async def test_content_is_source_markdown_when_should_summarize_false(
async def test_chunks_written_to_db(
db_session, db_search_space, make_connector_document, mocker,
):
"""Chunks derived from source_markdown are persisted in the DB."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -94,6 +98,7 @@ async def test_chunks_written_to_db(
async def test_embedding_written_to_db(
db_session, db_search_space, make_connector_document, mocker,
):
"""Document embedding vector is persisted in the DB after indexing."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -114,6 +119,7 @@ async def test_embedding_written_to_db(
async def test_updated_at_advances_after_indexing(
db_session, db_search_space, make_connector_document, mocker,
):
"""updated_at timestamp is later after indexing than it was at prepare time."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -136,6 +142,7 @@ async def test_updated_at_advances_after_indexing(
async def test_no_llm_falls_back_to_source_markdown(
db_session, db_search_space, make_connector_document,
):
"""When llm=None and no fallback_summary, content falls back to source_markdown."""
connector_doc = make_connector_document(
search_space_id=db_search_space.id,
should_summarize=True,
@@ -160,6 +167,7 @@ async def test_no_llm_falls_back_to_source_markdown(
async def test_fallback_summary_used_when_llm_unavailable(
db_session, db_search_space, make_connector_document,
):
"""fallback_summary is used as content when llm=None and should_summarize=True."""
connector_doc = make_connector_document(
search_space_id=db_search_space.id,
should_summarize=True,
@@ -184,6 +192,7 @@ async def test_fallback_summary_used_when_llm_unavailable(
async def test_reindex_replaces_old_chunks(
db_session, db_search_space, make_connector_document, mocker,
):
"""Re-indexing a document replaces its old chunks rather than appending."""
connector_doc = make_connector_document(
search_space_id=db_search_space.id,
source_markdown="## v1",
@@ -215,6 +224,7 @@ async def test_reindex_replaces_old_chunks(
async def test_llm_error_sets_status_failed(
db_session, db_search_space, make_connector_document, mocker,
):
"""Document status is FAILED when the LLM raises during indexing."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -234,6 +244,7 @@ async def test_llm_error_sets_status_failed(
async def test_llm_error_leaves_no_partial_data(
db_session, db_search_space, make_connector_document, mocker,
):
"""A failed indexing attempt leaves no partial embedding or chunks in the DB."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)

View file

@@ -11,6 +11,7 @@ pytestmark = pytest.mark.integration
async def test_new_document_is_persisted_with_pending_status(
db_session, db_search_space, make_connector_document
):
"""A new document is created in the DB with PENDING status and correct markdown."""
doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -31,6 +32,7 @@ async def test_new_document_is_persisted_with_pending_status(
async def test_unchanged_ready_document_is_skipped(
db_session, db_search_space, make_connector_document, mocker,
):
"""A READY document with unchanged content is not returned for re-indexing."""
doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -48,6 +50,7 @@ async def test_unchanged_ready_document_is_skipped(
async def test_title_only_change_updates_title_in_db(
db_session, db_search_space, make_connector_document, mocker,
):
"""A title-only change updates the DB title without re-queuing the document."""
original = make_connector_document(search_space_id=db_search_space.id, title="Original Title")
service = IndexingPipelineService(session=db_session)
@@ -69,6 +72,7 @@ async def test_title_only_change_updates_title_in_db(
async def test_changed_content_is_returned_for_reprocessing(
db_session, db_search_space, make_connector_document
):
"""A document with changed content is returned for re-indexing with updated markdown."""
original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1")
service = IndexingPipelineService(session=db_session)
@@ -91,6 +95,7 @@ async def test_changed_content_is_returned_for_reprocessing(
async def test_all_documents_in_batch_are_persisted(
db_session, db_search_space, make_connector_document
):
"""All documents in a batch are persisted and returned."""
docs = [
make_connector_document(search_space_id=db_search_space.id, unique_id="id-1", title="Doc 1", source_markdown="## Content 1"),
make_connector_document(search_space_id=db_search_space.id, unique_id="id-2", title="Doc 2", source_markdown="## Content 2"),
@@ -111,6 +116,7 @@ async def test_all_documents_in_batch_are_persisted(
async def test_duplicate_in_batch_is_persisted_once(
db_session, db_search_space, make_connector_document
):
"""The same document passed twice in a batch is only persisted once."""
doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -127,6 +133,7 @@ async def test_duplicate_in_batch_is_persisted_once(
async def test_created_by_id_is_persisted(
db_session, db_user, db_search_space, make_connector_document
):
"""created_by_id from the connector document is persisted on the DB row."""
doc = make_connector_document(
search_space_id=db_search_space.id,
created_by_id=str(db_user.id),
@@ -145,6 +152,7 @@ async def test_created_by_id_is_persisted(
async def test_metadata_is_updated_when_content_changes(
db_session, db_search_space, make_connector_document
):
"""document_metadata is overwritten with the latest metadata when content changes."""
original = make_connector_document(
search_space_id=db_search_space.id,
source_markdown="## v1",
@@ -171,6 +179,7 @@ async def test_metadata_is_updated_when_content_changes(
async def test_updated_at_advances_when_title_only_changes(
db_session, db_search_space, make_connector_document
):
"""updated_at advances even when only the title changes."""
original = make_connector_document(search_space_id=db_search_space.id, title="Old Title")
service = IndexingPipelineService(session=db_session)
@@ -192,6 +201,7 @@ async def test_updated_at_advances_when_title_only_changes(
async def test_updated_at_advances_when_content_changes(
db_session, db_search_space, make_connector_document
):
"""updated_at advances when document content changes."""
original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1")
service = IndexingPipelineService(session=db_session)
@@ -213,6 +223,7 @@ async def test_updated_at_advances_when_content_changes(
async def test_same_content_from_different_source_skipped_in_single_batch(
db_session, db_search_space, make_connector_document
):
"""Two documents with identical content in the same batch result in only one being persisted."""
first = make_connector_document(
search_space_id=db_search_space.id,
unique_id="source-a",
@@ -238,6 +249,7 @@ async def test_same_content_from_different_source_skipped_in_single_batch(
async def test_same_content_from_different_source_is_skipped(
db_session, db_search_space, make_connector_document
):
"""A document with content identical to an already-indexed document is skipped."""
first = make_connector_document(
search_space_id=db_search_space.id,
unique_id="source-a",
@@ -265,6 +277,7 @@ async def test_same_content_from_different_source_is_skipped(
async def test_failed_document_with_unchanged_content_is_requeued(
db_session, db_search_space, make_connector_document, mocker,
):
"""A FAILED document with unchanged content is re-queued as PENDING on the next run."""
doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@@ -289,6 +302,7 @@ async def test_failed_document_with_unchanged_content_is_requeued(
async def test_title_and_content_change_updates_both_and_returns_document(
db_session, db_search_space, make_connector_document
):
"""When both title and content change, both are updated and the document is returned for re-indexing."""
original = make_connector_document(
search_space_id=db_search_space.id,
title="Original Title",