Mirror of https://github.com/MODSetter/SurfSense.git (synced 2026-05-02 12:22:40 +02:00)
add docstrings to all indexing pipeline tests
This commit is contained in:
parent 4293910e8e
commit 0de74f4bf7
7 changed files with 48 additions and 0 deletions
|
|
@@ -9,6 +9,7 @@ pytestmark = pytest.mark.integration
|
|||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
|
||||
"""Document status is READY after successful indexing."""
|
||||
await index_uploaded_file(
|
||||
markdown_content="## Hello\n\nSome content.",
|
||||
filename="test.pdf",
|
||||
|
|
@@ -29,6 +30,7 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
|
|||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
|
||||
"""Document content is set to the LLM-generated summary."""
|
||||
await index_uploaded_file(
|
||||
markdown_content="## Hello\n\nSome content.",
|
||||
filename="test.pdf",
|
||||
|
|
@@ -49,6 +51,7 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
|
|||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker):
|
||||
"""Chunks derived from the source markdown are persisted in the DB."""
|
||||
await index_uploaded_file(
|
||||
markdown_content="## Hello\n\nSome content.",
|
||||
filename="test.pdf",
|
||||
|
|
@@ -75,6 +78,7 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker
|
|||
|
||||
@pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text")
|
||||
async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, mocker):
|
||||
"""RuntimeError is raised when the indexing step fails so the caller can fire a failure notification."""
|
||||
with pytest.raises(RuntimeError):
|
||||
await index_uploaded_file(
|
||||
markdown_content="## Hello\n\nSome content.",
|
||||
|
|
|
|||
|
|
@@ -11,6 +11,7 @@ pytestmark = pytest.mark.integration
|
|||
async def test_sets_status_ready(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
):
|
||||
"""Document status is READY after successful indexing."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -30,6 +31,7 @@ async def test_sets_status_ready(
|
|||
async def test_content_is_summary_when_should_summarize_true(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
):
|
||||
"""Document content is set to the LLM-generated summary when should_summarize=True."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -49,6 +51,7 @@ async def test_content_is_summary_when_should_summarize_true(
|
|||
async def test_content_is_source_markdown_when_should_summarize_false(
|
||||
db_session, db_search_space, make_connector_document,
|
||||
):
|
||||
"""Document content is set to source_markdown verbatim when should_summarize=False."""
|
||||
connector_doc = make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
should_summarize=False,
|
||||
|
|
@@ -72,6 +75,7 @@ async def test_content_is_source_markdown_when_should_summarize_false(
|
|||
async def test_chunks_written_to_db(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
):
|
||||
"""Chunks derived from source_markdown are persisted in the DB."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -94,6 +98,7 @@ async def test_chunks_written_to_db(
|
|||
async def test_embedding_written_to_db(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
):
|
||||
"""Document embedding vector is persisted in the DB after indexing."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -114,6 +119,7 @@ async def test_embedding_written_to_db(
|
|||
async def test_updated_at_advances_after_indexing(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
):
|
||||
"""updated_at timestamp is later after indexing than it was at prepare time."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -136,6 +142,7 @@ async def test_updated_at_advances_after_indexing(
|
|||
async def test_no_llm_falls_back_to_source_markdown(
|
||||
db_session, db_search_space, make_connector_document,
|
||||
):
|
||||
"""When llm=None and no fallback_summary, content falls back to source_markdown."""
|
||||
connector_doc = make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
should_summarize=True,
|
||||
|
|
@@ -160,6 +167,7 @@ async def test_no_llm_falls_back_to_source_markdown(
|
|||
async def test_fallback_summary_used_when_llm_unavailable(
|
||||
db_session, db_search_space, make_connector_document,
|
||||
):
|
||||
"""fallback_summary is used as content when llm=None and should_summarize=True."""
|
||||
connector_doc = make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
should_summarize=True,
|
||||
|
|
@@ -184,6 +192,7 @@ async def test_fallback_summary_used_when_llm_unavailable(
|
|||
async def test_reindex_replaces_old_chunks(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
):
|
||||
"""Re-indexing a document replaces its old chunks rather than appending."""
|
||||
connector_doc = make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
source_markdown="## v1",
|
||||
|
|
@@ -215,6 +224,7 @@ async def test_reindex_replaces_old_chunks(
|
|||
async def test_llm_error_sets_status_failed(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
):
|
||||
"""Document status is FAILED when the LLM raises during indexing."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -234,6 +244,7 @@ async def test_llm_error_sets_status_failed(
|
|||
async def test_llm_error_leaves_no_partial_data(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
):
|
||||
"""A failed indexing attempt leaves no partial embedding or chunks in the DB."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
|
|||
|
|
@@ -11,6 +11,7 @@ pytestmark = pytest.mark.integration
|
|||
async def test_new_document_is_persisted_with_pending_status(
|
||||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""A new document is created in the DB with PENDING status and correct markdown."""
|
||||
doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -31,6 +32,7 @@ async def test_new_document_is_persisted_with_pending_status(
|
|||
async def test_unchanged_ready_document_is_skipped(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
):
|
||||
"""A READY document with unchanged content is not returned for re-indexing."""
|
||||
doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -48,6 +50,7 @@ async def test_unchanged_ready_document_is_skipped(
|
|||
async def test_title_only_change_updates_title_in_db(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
):
|
||||
"""A title-only change updates the DB title without re-queuing the document."""
|
||||
original = make_connector_document(search_space_id=db_search_space.id, title="Original Title")
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -69,6 +72,7 @@ async def test_title_only_change_updates_title_in_db(
|
|||
async def test_changed_content_is_returned_for_reprocessing(
|
||||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""A document with changed content is returned for re-indexing with updated markdown."""
|
||||
original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1")
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -91,6 +95,7 @@ async def test_changed_content_is_returned_for_reprocessing(
|
|||
async def test_all_documents_in_batch_are_persisted(
|
||||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""All documents in a batch are persisted and returned."""
|
||||
docs = [
|
||||
make_connector_document(search_space_id=db_search_space.id, unique_id="id-1", title="Doc 1", source_markdown="## Content 1"),
|
||||
make_connector_document(search_space_id=db_search_space.id, unique_id="id-2", title="Doc 2", source_markdown="## Content 2"),
|
||||
|
|
@@ -111,6 +116,7 @@ async def test_all_documents_in_batch_are_persisted(
|
|||
async def test_duplicate_in_batch_is_persisted_once(
|
||||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""The same document passed twice in a batch is only persisted once."""
|
||||
doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -127,6 +133,7 @@ async def test_duplicate_in_batch_is_persisted_once(
|
|||
async def test_created_by_id_is_persisted(
|
||||
db_session, db_user, db_search_space, make_connector_document
|
||||
):
|
||||
"""created_by_id from the connector document is persisted on the DB row."""
|
||||
doc = make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
created_by_id=str(db_user.id),
|
||||
|
|
@@ -145,6 +152,7 @@ async def test_created_by_id_is_persisted(
|
|||
async def test_metadata_is_updated_when_content_changes(
|
||||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""document_metadata is overwritten with the latest metadata when content changes."""
|
||||
original = make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
source_markdown="## v1",
|
||||
|
|
@@ -171,6 +179,7 @@ async def test_metadata_is_updated_when_content_changes(
|
|||
async def test_updated_at_advances_when_title_only_changes(
|
||||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""updated_at advances even when only the title changes."""
|
||||
original = make_connector_document(search_space_id=db_search_space.id, title="Old Title")
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -192,6 +201,7 @@ async def test_updated_at_advances_when_title_only_changes(
|
|||
async def test_updated_at_advances_when_content_changes(
|
||||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""updated_at advances when document content changes."""
|
||||
original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1")
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -213,6 +223,7 @@ async def test_updated_at_advances_when_content_changes(
|
|||
async def test_same_content_from_different_source_skipped_in_single_batch(
|
||||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""Two documents with identical content in the same batch result in only one being persisted."""
|
||||
first = make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
unique_id="source-a",
|
||||
|
|
@@ -238,6 +249,7 @@ async def test_same_content_from_different_source_skipped_in_single_batch(
|
|||
async def test_same_content_from_different_source_is_skipped(
|
||||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""A document with content identical to an already-indexed document is skipped."""
|
||||
first = make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
unique_id="source-a",
|
||||
|
|
@@ -265,6 +277,7 @@ async def test_same_content_from_different_source_is_skipped(
|
|||
async def test_failed_document_with_unchanged_content_is_requeued(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
):
|
||||
"""A FAILED document with unchanged content is re-queued as PENDING on the next run."""
|
||||
doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@@ -289,6 +302,7 @@ async def test_failed_document_with_unchanged_content_is_requeued(
|
|||
async def test_title_and_content_change_updates_both_and_returns_document(
|
||||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""When both title and content change, both are updated and the document is returned for re-indexing."""
|
||||
original = make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
title="Original Title",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue