diff --git a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py index c471110fc..723c0e13b 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py @@ -9,6 +9,7 @@ pytestmark = pytest.mark.integration @pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text") async def test_sets_status_ready(db_session, db_search_space, db_user, mocker): + """Document status is READY after successful indexing.""" await index_uploaded_file( markdown_content="## Hello\n\nSome content.", filename="test.pdf", @@ -29,6 +30,7 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker): @pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text") async def test_content_is_summary(db_session, db_search_space, db_user, mocker): + """Document content is set to the LLM-generated summary.""" await index_uploaded_file( markdown_content="## Hello\n\nSome content.", filename="test.pdf", @@ -49,6 +51,7 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker): @pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text") async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker): + """Chunks derived from the source markdown are persisted in the DB.""" await index_uploaded_file( markdown_content="## Hello\n\nSome content.", filename="test.pdf", @@ -75,6 +78,7 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker @pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text") async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, mocker): + """RuntimeError is raised when the indexing step fails so the caller can fire a failure notification.""" with pytest.raises(RuntimeError): await index_uploaded_file( markdown_content="## Hello\n\nSome content.", diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py index 89bd722ee..7c5e1e4f4 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_document.py @@ -11,6 +11,7 @@ pytestmark = pytest.mark.integration async def test_sets_status_ready( db_session, db_search_space, make_connector_document, mocker, ): + """Document status is READY after successful indexing.""" connector_doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) @@ -30,6 +31,7 @@ async def test_sets_status_ready( async def test_content_is_summary_when_should_summarize_true( db_session, db_search_space, make_connector_document, mocker, ): + """Document content is set to the LLM-generated summary when should_summarize=True.""" connector_doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) @@ -49,6 +51,7 @@ async def test_content_is_summary_when_should_summarize_true( async def test_content_is_source_markdown_when_should_summarize_false( db_session, db_search_space, make_connector_document, ): + """Document content is set to source_markdown verbatim when should_summarize=False.""" connector_doc = make_connector_document( search_space_id=db_search_space.id, should_summarize=False, @@ -72,6 +75,7 @@ async def test_content_is_source_markdown_when_should_summarize_false( async def test_chunks_written_to_db( db_session, db_search_space, make_connector_document, mocker, ): + """Chunks derived from source_markdown are persisted in the DB.""" connector_doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) @@ -94,6 +98,7 @@ async def test_chunks_written_to_db( async def test_embedding_written_to_db( db_session, db_search_space, make_connector_document, mocker, ): + """Document embedding vector is persisted in the DB after indexing.""" connector_doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) @@ -114,6 +119,7 @@ async def test_embedding_written_to_db( async def test_updated_at_advances_after_indexing( db_session, db_search_space, make_connector_document, mocker, ): + """updated_at timestamp is later after indexing than it was at prepare time.""" connector_doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) @@ -136,6 +142,7 @@ async def test_updated_at_advances_after_indexing( async def test_no_llm_falls_back_to_source_markdown( db_session, db_search_space, make_connector_document, ): + """When llm=None and no fallback_summary, content falls back to source_markdown.""" connector_doc = make_connector_document( search_space_id=db_search_space.id, should_summarize=True, @@ -160,6 +167,7 @@ async def test_no_llm_falls_back_to_source_markdown( async def test_fallback_summary_used_when_llm_unavailable( db_session, db_search_space, make_connector_document, ): + """fallback_summary is used as content when llm=None and should_summarize=True.""" connector_doc = make_connector_document( search_space_id=db_search_space.id, should_summarize=True, @@ -184,6 +192,7 @@ async def test_fallback_summary_used_when_llm_unavailable( async def test_reindex_replaces_old_chunks( db_session, db_search_space, make_connector_document, mocker, ): + """Re-indexing a document replaces its old chunks rather than appending.""" connector_doc = make_connector_document( search_space_id=db_search_space.id, source_markdown="## v1", @@ -215,6 +224,7 @@ async def test_reindex_replaces_old_chunks( async def test_llm_error_sets_status_failed( db_session, db_search_space, make_connector_document, mocker, ): + """Document status is FAILED when the LLM raises during indexing.""" connector_doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) @@ -234,6 +244,7 @@ async def test_llm_error_sets_status_failed( async def test_llm_error_leaves_no_partial_data( db_session, db_search_space, make_connector_document, mocker, ): + """A failed indexing attempt leaves no partial embedding or chunks in the DB.""" connector_doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py b/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py index 8b66b8323..b6d257f7a 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_prepare_for_indexing.py @@ -11,6 +11,7 @@ pytestmark = pytest.mark.integration async def test_new_document_is_persisted_with_pending_status( db_session, db_search_space, make_connector_document ): + """A new document is created in the DB with PENDING status and correct markdown.""" doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) @@ -31,6 +32,7 @@ async def test_new_document_is_persisted_with_pending_status( async def test_unchanged_ready_document_is_skipped( db_session, db_search_space, make_connector_document, mocker, ): + """A READY document with unchanged content is not returned for re-indexing.""" doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) @@ -48,6 +50,7 @@ async def test_unchanged_ready_document_is_skipped( async def test_title_only_change_updates_title_in_db( db_session, db_search_space, make_connector_document, mocker, ): + """A title-only change updates the DB title without re-queuing the document.""" original = make_connector_document(search_space_id=db_search_space.id, title="Original Title") service = IndexingPipelineService(session=db_session) @@ -69,6 +72,7 @@ async def test_title_only_change_updates_title_in_db( async def test_changed_content_is_returned_for_reprocessing( db_session, db_search_space, make_connector_document ): + """A document with changed content is returned for re-indexing with updated markdown.""" original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1") service = IndexingPipelineService(session=db_session) @@ -91,6 +95,7 @@ async def test_changed_content_is_returned_for_reprocessing( async def test_all_documents_in_batch_are_persisted( db_session, db_search_space, make_connector_document ): + """All documents in a batch are persisted and returned.""" docs = [ make_connector_document(search_space_id=db_search_space.id, unique_id="id-1", title="Doc 1", source_markdown="## Content 1"), make_connector_document(search_space_id=db_search_space.id, unique_id="id-2", title="Doc 2", source_markdown="## Content 2"), @@ -111,6 +116,7 @@ async def test_all_documents_in_batch_are_persisted( async def test_duplicate_in_batch_is_persisted_once( db_session, db_search_space, make_connector_document ): + """The same document passed twice in a batch is only persisted once.""" doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) @@ -127,6 +133,7 @@ async def test_duplicate_in_batch_is_persisted_once( async def test_created_by_id_is_persisted( db_session, db_user, db_search_space, make_connector_document ): + """created_by_id from the connector document is persisted on the DB row.""" doc = make_connector_document( search_space_id=db_search_space.id, created_by_id=str(db_user.id), @@ -145,6 +152,7 @@ async def test_created_by_id_is_persisted( async def test_metadata_is_updated_when_content_changes( db_session, db_search_space, make_connector_document ): + """document_metadata is overwritten with the latest metadata when content changes.""" original = make_connector_document( search_space_id=db_search_space.id, source_markdown="## v1", @@ -171,6 +179,7 @@ async def test_metadata_is_updated_when_content_changes( async def test_updated_at_advances_when_title_only_changes( db_session, db_search_space, make_connector_document ): + """updated_at advances even when only the title changes.""" original = make_connector_document(search_space_id=db_search_space.id, title="Old Title") service = IndexingPipelineService(session=db_session) @@ -192,6 +201,7 @@ async def test_updated_at_advances_when_title_only_changes( async def test_updated_at_advances_when_content_changes( db_session, db_search_space, make_connector_document ): + """updated_at advances when document content changes.""" original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1") service = IndexingPipelineService(session=db_session) @@ -213,6 +223,7 @@ async def test_updated_at_advances_when_content_changes( async def test_same_content_from_different_source_skipped_in_single_batch( db_session, db_search_space, make_connector_document ): + """Two documents with identical content in the same batch result in only one being persisted.""" first = make_connector_document( search_space_id=db_search_space.id, unique_id="source-a", @@ -238,6 +249,7 @@ async def test_same_content_from_different_source_skipped_in_single_batch( async def test_same_content_from_different_source_is_skipped( db_session, db_search_space, make_connector_document ): + """A document with content identical to an already-indexed document is skipped.""" first = make_connector_document( search_space_id=db_search_space.id, unique_id="source-a", @@ -265,6 +277,7 @@ async def test_same_content_from_different_source_is_skipped( async def test_failed_document_with_unchanged_content_is_requeued( db_session, db_search_space, make_connector_document, mocker, ): + """A FAILED document with unchanged content is re-queued as PENDING on the next run.""" doc = make_connector_document(search_space_id=db_search_space.id) service = IndexingPipelineService(session=db_session) @@ -289,6 +302,7 @@ async def test_failed_document_with_unchanged_content_is_requeued( async def test_title_and_content_change_updates_both_and_returns_document( db_session, db_search_space, make_connector_document ): + """When both title and content change, both are updated and the document is returned for re-indexing.""" original = make_connector_document( search_space_id=db_search_space.id, title="Original Title", diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_connector_document.py b/surfsense_backend/tests/unit/indexing_pipeline/test_connector_document.py index fef691964..228777626 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_connector_document.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_connector_document.py @@ -6,6 +6,7 @@ from app.indexing_pipeline.connector_document import ConnectorDocument def test_valid_document_created_with_required_fields(): + """All optional fields default correctly when only required fields are supplied.""" doc = ConnectorDocument( title="Task", source_markdown="## Task\n\nSome content.", @@ -23,6 +24,7 @@ def test_valid_document_created_with_required_fields(): def test_omitting_created_by_id_raises(): + """Omitting created_by_id raises a validation error.""" with pytest.raises(ValidationError): ConnectorDocument( title="Task", @@ -35,6 +37,7 @@ def test_omitting_created_by_id_raises(): def test_empty_source_markdown_raises(): + """Empty source_markdown raises a validation error.""" with pytest.raises(ValidationError): ConnectorDocument( title="Task", @@ -46,6 +49,7 @@ def test_empty_source_markdown_raises(): def test_whitespace_only_source_markdown_raises(): + """Whitespace-only source_markdown raises a validation error.""" with pytest.raises(ValidationError): ConnectorDocument( title="Task", @@ -57,6 +61,7 @@ def test_whitespace_only_source_markdown_raises(): def test_empty_title_raises(): + """Empty title raises a validation error.""" with pytest.raises(ValidationError): ConnectorDocument( title="", @@ -68,6 +73,7 @@ def test_empty_title_raises(): def test_empty_created_by_id_raises(): + """Empty created_by_id raises a validation error.""" with pytest.raises(ValidationError): ConnectorDocument( title="Task", @@ -81,6 +87,7 @@ def test_empty_created_by_id_raises(): def test_zero_search_space_id_raises(): + """search_space_id of zero raises a validation error.""" with pytest.raises(ValidationError): ConnectorDocument( title="Task", @@ -94,6 +101,7 @@ def test_zero_search_space_id_raises(): def test_empty_unique_id_raises(): + """Empty unique_id raises a validation error.""" with pytest.raises(ValidationError): ConnectorDocument( title="Task", diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_document_chunker.py b/surfsense_backend/tests/unit/indexing_pipeline/test_document_chunker.py index 78d0641c1..9c52d503d 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_document_chunker.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_document_chunker.py @@ -7,6 +7,7 @@ pytestmark = pytest.mark.unit @pytest.mark.usefixtures("patched_chunker_instance", "patched_code_chunker_instance") def test_uses_code_chunker_when_flag_is_true(): + """Code chunker is selected when use_code_chunker=True.""" result = chunk_text("def foo(): pass", use_code_chunker=True) assert result == ["code chunk"] @@ -14,6 +15,7 @@ def test_uses_code_chunker_when_flag_is_true(): @pytest.mark.usefixtures("patched_chunker_instance", "patched_code_chunker_instance") def test_uses_default_chunker_when_flag_is_false(): + """Default prose chunker is selected when use_code_chunker=False.""" result = chunk_text("Some prose text.", use_code_chunker=False) assert result == ["prose chunk"] diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py b/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py index c8e2e97e9..6b7a47f51 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py @@ -7,36 +7,42 @@ pytestmark = pytest.mark.unit def test_different_unique_id_produces_different_hash(make_connector_document): + """Two documents with different unique_ids produce different identifier hashes.""" doc_a = make_connector_document(unique_id="id-001") doc_b = make_connector_document(unique_id="id-002") assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b) def test_different_search_space_produces_different_identifier_hash(make_connector_document): + """Same document in different search spaces produces different identifier hashes.""" doc_a = make_connector_document(search_space_id=1) doc_b = make_connector_document(search_space_id=2) assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b) def test_different_document_type_produces_different_identifier_hash(make_connector_document): + """Same unique_id with different document types produces different identifier hashes.""" doc_a = make_connector_document(document_type=DocumentType.CLICKUP_CONNECTOR) doc_b = make_connector_document(document_type=DocumentType.NOTION_CONNECTOR) assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b) def test_same_content_same_space_produces_same_content_hash(make_connector_document): + """Identical content in the same search space always produces the same content hash.""" doc_a = make_connector_document(source_markdown="Hello world", search_space_id=1) doc_b = make_connector_document(source_markdown="Hello world", search_space_id=1) assert compute_content_hash(doc_a) == compute_content_hash(doc_b) def test_same_content_different_space_produces_different_content_hash(make_connector_document): + """Identical content in different search spaces produces different content hashes.""" doc_a = make_connector_document(source_markdown="Hello world", search_space_id=1) doc_b = make_connector_document(source_markdown="Hello world", search_space_id=2) assert compute_content_hash(doc_a) != compute_content_hash(doc_b) def test_different_content_produces_different_content_hash(make_connector_document): + """Different source markdown produces different content hashes.""" doc_a = make_connector_document(source_markdown="Original content") doc_b = make_connector_document(source_markdown="Updated content") assert compute_content_hash(doc_a) != compute_content_hash(doc_b) diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_document_summarizer.py b/surfsense_backend/tests/unit/indexing_pipeline/test_document_summarizer.py index 2f713d13d..a3a8ecfc2 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_document_summarizer.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_document_summarizer.py @@ -8,6 +8,7 @@ pytestmark = pytest.mark.unit @pytest.mark.usefixtures("patched_summarizer_chain") async def test_without_metadata_returns_raw_summary(): + """Summarizer returns the LLM output directly when no metadata is provided.""" result = await summarize_document("# Content", llm=MagicMock(model="gpt-4")) assert result == "The summary." @@ -15,6 +16,7 @@ async def test_without_metadata_returns_raw_summary(): @pytest.mark.usefixtures("patched_summarizer_chain") async def test_with_metadata_includes_metadata_values_in_output(): + """Non-empty metadata values are prepended to the summary output.""" result = await summarize_document( "# Content", llm=MagicMock(model="gpt-4"), @@ -27,6 +29,7 @@ async def test_with_metadata_includes_metadata_values_in_output(): @pytest.mark.usefixtures("patched_summarizer_chain") async def test_with_metadata_omits_empty_fields_from_output(): + """Empty metadata fields are omitted from the summary output.""" result = await summarize_document( "# Content", llm=MagicMock(model="gpt-4"),