Add docstrings to all indexing pipeline tests

This commit is contained in:
CREDO23 2026-02-25 20:30:31 +02:00
parent 4293910e8e
commit 0de74f4bf7
7 changed files with 48 additions and 0 deletions

View file

@ -6,6 +6,7 @@ from app.indexing_pipeline.connector_document import ConnectorDocument
def test_valid_document_created_with_required_fields():
"""All optional fields default correctly when only required fields are supplied."""
doc = ConnectorDocument(
title="Task",
source_markdown="## Task\n\nSome content.",
@ -23,6 +24,7 @@ def test_valid_document_created_with_required_fields():
def test_omitting_created_by_id_raises():
"""Omitting created_by_id raises a validation error."""
with pytest.raises(ValidationError):
ConnectorDocument(
title="Task",
@ -35,6 +37,7 @@ def test_omitting_created_by_id_raises():
def test_empty_source_markdown_raises():
"""Empty source_markdown raises a validation error."""
with pytest.raises(ValidationError):
ConnectorDocument(
title="Task",
@ -46,6 +49,7 @@ def test_empty_source_markdown_raises():
def test_whitespace_only_source_markdown_raises():
"""Whitespace-only source_markdown raises a validation error."""
with pytest.raises(ValidationError):
ConnectorDocument(
title="Task",
@ -57,6 +61,7 @@ def test_whitespace_only_source_markdown_raises():
def test_empty_title_raises():
"""Empty title raises a validation error."""
with pytest.raises(ValidationError):
ConnectorDocument(
title="",
@ -68,6 +73,7 @@ def test_empty_title_raises():
def test_empty_created_by_id_raises():
"""Empty created_by_id raises a validation error."""
with pytest.raises(ValidationError):
ConnectorDocument(
title="Task",
@ -81,6 +87,7 @@ def test_empty_created_by_id_raises():
def test_zero_search_space_id_raises():
"""search_space_id of zero raises a validation error."""
with pytest.raises(ValidationError):
ConnectorDocument(
title="Task",
@ -94,6 +101,7 @@ def test_zero_search_space_id_raises():
def test_empty_unique_id_raises():
"""Empty unique_id raises a validation error."""
with pytest.raises(ValidationError):
ConnectorDocument(
title="Task",

View file

@ -7,6 +7,7 @@ pytestmark = pytest.mark.unit
@pytest.mark.usefixtures("patched_chunker_instance", "patched_code_chunker_instance")
def test_uses_code_chunker_when_flag_is_true():
    """chunk_text routes input through the code chunker when use_code_chunker is True."""
    chunks = chunk_text("def foo(): pass", use_code_chunker=True)
    assert chunks == ["code chunk"]
@ -14,6 +15,7 @@ def test_uses_code_chunker_when_flag_is_true():
@pytest.mark.usefixtures("patched_chunker_instance", "patched_code_chunker_instance")
def test_uses_default_chunker_when_flag_is_false():
    """chunk_text routes input through the default prose chunker when use_code_chunker is False."""
    chunks = chunk_text("Some prose text.", use_code_chunker=False)
    assert chunks == ["prose chunk"]

View file

@ -7,36 +7,42 @@ pytestmark = pytest.mark.unit
def test_different_unique_id_produces_different_hash(make_connector_document):
    """Documents that differ only in unique_id must yield distinct identifier hashes."""
    first = make_connector_document(unique_id="id-001")
    second = make_connector_document(unique_id="id-002")
    hash_first = compute_unique_identifier_hash(first)
    hash_second = compute_unique_identifier_hash(second)
    assert hash_first != hash_second
def test_different_search_space_produces_different_identifier_hash(make_connector_document):
    """The same document placed in two search spaces must yield distinct identifier hashes."""
    in_space_one = make_connector_document(search_space_id=1)
    in_space_two = make_connector_document(search_space_id=2)
    hash_one = compute_unique_identifier_hash(in_space_one)
    hash_two = compute_unique_identifier_hash(in_space_two)
    assert hash_one != hash_two
def test_different_document_type_produces_different_identifier_hash(make_connector_document):
    """Identical unique_ids under different document types must yield distinct identifier hashes."""
    clickup_doc = make_connector_document(document_type=DocumentType.CLICKUP_CONNECTOR)
    notion_doc = make_connector_document(document_type=DocumentType.NOTION_CONNECTOR)
    clickup_hash = compute_unique_identifier_hash(clickup_doc)
    notion_hash = compute_unique_identifier_hash(notion_doc)
    assert clickup_hash != notion_hash
def test_same_content_same_space_produces_same_content_hash(make_connector_document):
    """Content hashing is deterministic: same markdown + same search space gives the same hash."""
    first = make_connector_document(source_markdown="Hello world", search_space_id=1)
    second = make_connector_document(source_markdown="Hello world", search_space_id=1)
    assert compute_content_hash(first) == compute_content_hash(second)
def test_same_content_different_space_produces_different_content_hash(make_connector_document):
    """Content hashes are scoped per search space, so identical markdown hashes differently across spaces."""
    in_space_one = make_connector_document(source_markdown="Hello world", search_space_id=1)
    in_space_two = make_connector_document(source_markdown="Hello world", search_space_id=2)
    assert compute_content_hash(in_space_one) != compute_content_hash(in_space_two)
def test_different_content_produces_different_content_hash(make_connector_document):
    """Changing the source markdown must change the content hash."""
    original = make_connector_document(source_markdown="Original content")
    updated = make_connector_document(source_markdown="Updated content")
    original_hash = compute_content_hash(original)
    updated_hash = compute_content_hash(updated)
    assert original_hash != updated_hash

View file

@ -8,6 +8,7 @@ pytestmark = pytest.mark.unit
@pytest.mark.usefixtures("patched_summarizer_chain")
async def test_without_metadata_returns_raw_summary():
    """With no metadata supplied, the summarizer's LLM output is returned unchanged."""
    fake_llm = MagicMock(model="gpt-4")
    summary = await summarize_document("# Content", llm=fake_llm)
    assert summary == "The summary."
@ -15,6 +16,7 @@ async def test_without_metadata_returns_raw_summary():
@pytest.mark.usefixtures("patched_summarizer_chain")
async def test_with_metadata_includes_metadata_values_in_output():
"""Non-empty metadata values are prepended to the summary output."""
result = await summarize_document(
"# Content",
llm=MagicMock(model="gpt-4"),
@ -27,6 +29,7 @@ async def test_with_metadata_includes_metadata_values_in_output():
@pytest.mark.usefixtures("patched_summarizer_chain")
async def test_with_metadata_omits_empty_fields_from_output():
"""Empty metadata fields are omitted from the summary output."""
result = await summarize_document(
"# Content",
llm=MagicMock(model="gpt-4"),