feat(tests): Update tests for summary-free indexing

This commit is contained in:
Anish Sarkar 2026-06-04 00:53:51 +05:30
parent dc6a17930b
commit ddfe60c2f0
26 changed files with 123 additions and 294 deletions

View file

@ -101,7 +101,7 @@ async def test_generate_resume_defaults_to_one_page_target(monkeypatch) -> None:
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=_llm_invoke))
monkeypatch.setattr(
resume_tool,
"get_document_summary_llm",
"get_agent_llm",
AsyncMock(return_value=llm),
)
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
@ -130,7 +130,7 @@ async def test_generate_resume_compresses_when_over_limit(monkeypatch) -> None:
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
monkeypatch.setattr(
resume_tool,
"get_document_summary_llm",
"get_agent_llm",
AsyncMock(return_value=llm),
)
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
@ -165,7 +165,7 @@ async def test_generate_resume_returns_ready_when_target_not_met(monkeypatch) ->
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
monkeypatch.setattr(
resume_tool,
"get_document_summary_llm",
"get_agent_llm",
AsyncMock(return_value=llm),
)
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
@ -198,7 +198,7 @@ async def test_generate_resume_fails_when_hard_limit_exceeded(monkeypatch) -> No
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
monkeypatch.setattr(
resume_tool,
"get_document_summary_llm",
"get_agent_llm",
AsyncMock(return_value=llm),
)
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")

View file

@ -71,7 +71,6 @@ async def test_build_connector_doc_produces_correct_fields():
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert doc.title == "Engineering Handbook"
@ -81,7 +80,6 @@ async def test_build_connector_doc_produces_correct_fields():
assert doc.search_space_id == _SEARCH_SPACE_ID
assert doc.connector_id == _CONNECTOR_ID
assert doc.created_by_id == _USER_ID
assert doc.should_summarize is True
assert doc.metadata["page_id"] == "abc-123"
assert doc.metadata["page_title"] == "Engineering Handbook"
assert doc.metadata["space_id"] == "ENG"
@ -89,9 +87,8 @@ async def test_build_connector_doc_produces_correct_fields():
assert doc.metadata["connector_id"] == _CONNECTOR_ID
assert doc.metadata["document_type"] == "Confluence Page"
assert doc.metadata["connector_type"] == "Confluence"
assert doc.fallback_summary is not None
assert "Engineering Handbook" in doc.fallback_summary
assert markdown in doc.fallback_summary
assert "Engineering Handbook" in doc.deterministic_preview
assert markdown in doc.deterministic_preview
async def test_build_connector_doc_summary_disabled():
@ -101,9 +98,7 @@ async def test_build_connector_doc_summary_disabled():
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=False,
)
assert doc.should_summarize is False
# ---------------------------------------------------------------------------
@ -111,10 +106,9 @@ async def test_build_connector_doc_summary_disabled():
# ---------------------------------------------------------------------------
def _mock_connector(enable_summary: bool = True):
def _mock_connector():
c = MagicMock()
c.config = {"access_token": "tok"}
c.enable_summary = enable_summary
c.last_indexed_at = None
return c

View file

@ -71,7 +71,6 @@ async def test_single_file_returns_one_connector_document(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -97,7 +96,6 @@ async def test_multiple_files_all_produce_documents(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 3
@ -125,7 +123,6 @@ async def test_one_download_exception_does_not_block_others(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 2
@ -152,7 +149,6 @@ async def test_etl_error_counts_as_download_failure(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -191,7 +187,6 @@ async def test_concurrency_bounded_by_semaphore(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
max_concurrency=2,
)
@ -231,7 +226,6 @@ async def test_heartbeat_fires_during_parallel_downloads(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
on_heartbeat=_on_heartbeat,
)
@ -324,7 +318,6 @@ async def _run_full_scan(mocks, monkeypatch, page_files, *, max_files=500):
mocks["task_logger"],
mocks["log_entry"],
max_files,
enable_summary=True,
)
@ -434,7 +427,6 @@ async def _run_selected(mocks, file_tuples):
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
@ -569,7 +561,6 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
assert sorted(remove_calls) == ["id:del1", "id:del2"]
@ -608,7 +599,6 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
assert indexed == 2
@ -670,7 +660,6 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
assert sorted(remove_calls) == ["id:del1", "id:del2"]
@ -704,7 +693,6 @@ async def test_delta_sync_returns_new_cursor(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
assert cursor == "brand-new-cursor-xyz"
@ -725,7 +713,7 @@ def orchestrator_mocks(monkeypatch):
mock_connector = MagicMock()
mock_connector.config = {"_token_encrypted": False}
mock_connector.last_indexed_at = None
mock_connector.enable_summary = True
mock_connector.enable_vision_llm = True
monkeypatch.setattr(
_mod,

View file

@ -66,7 +66,6 @@ async def test_single_file_returns_one_connector_document(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -91,7 +90,6 @@ async def test_multiple_files_all_produce_documents(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 3
@ -119,7 +117,6 @@ async def test_one_download_exception_does_not_block_others(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 2
@ -146,7 +143,6 @@ async def test_etl_error_counts_as_download_failure(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -186,7 +182,6 @@ async def test_concurrency_bounded_by_semaphore(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
max_concurrency=2,
)
@ -226,7 +221,6 @@ async def test_heartbeat_fires_during_parallel_downloads(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
on_heartbeat=_on_heartbeat,
)
@ -302,7 +296,7 @@ def full_scan_mocks(mock_drive_client, monkeypatch):
monkeypatch.setattr(
_mod,
"get_user_long_context_llm",
"get_agent_llm",
AsyncMock(return_value=MagicMock()),
)
@ -333,7 +327,6 @@ async def _run_full_scan(mocks, *, max_files=500, include_subfolders=False):
mocks["log_entry"],
max_files,
include_subfolders=include_subfolders,
enable_summary=True,
)
@ -489,7 +482,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
)
monkeypatch.setattr(
_mod,
"get_user_long_context_llm",
"get_agent_llm",
AsyncMock(return_value=MagicMock()),
)
@ -509,7 +502,6 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
assert sorted(remove_calls) == ["del1", "del2", "trash1"]
@ -577,7 +569,6 @@ async def _run_selected(mocks, file_ids):
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)

View file

@ -70,7 +70,6 @@ async def test_build_connector_doc_produces_correct_fields():
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert doc.title == "ENG-42: Fix login bug"
@ -80,7 +79,6 @@ async def test_build_connector_doc_produces_correct_fields():
assert doc.search_space_id == _SEARCH_SPACE_ID
assert doc.connector_id == _CONNECTOR_ID
assert doc.created_by_id == _USER_ID
assert doc.should_summarize is True
assert doc.metadata["issue_id"] == "abc-123"
assert doc.metadata["issue_identifier"] == "ENG-42"
assert doc.metadata["issue_title"] == "Fix login bug"
@ -90,13 +88,12 @@ async def test_build_connector_doc_produces_correct_fields():
assert doc.metadata["connector_id"] == _CONNECTOR_ID
assert doc.metadata["document_type"] == "Linear Issue"
assert doc.metadata["connector_type"] == "Linear"
assert doc.fallback_summary is not None
assert "ENG-42" in doc.fallback_summary
assert markdown in doc.fallback_summary
assert "ENG-42" in doc.deterministic_preview
assert markdown in doc.deterministic_preview
async def test_build_connector_doc_summary_disabled():
"""When enable_summary is False, should_summarize is False."""
"""Smoke test: _build_connector_doc succeeds when called without enable_summary."""
doc = _build_connector_doc(
_make_issue(),
_make_formatted_issue(),
@ -104,21 +101,17 @@ async def test_build_connector_doc_summary_disabled():
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=False,
)
assert doc.should_summarize is False
# ---------------------------------------------------------------------------
# Shared fixtures for Slices 2-6
# ---------------------------------------------------------------------------
def _mock_connector(enable_summary: bool = True):
def _mock_connector():
c = MagicMock()
c.config = {"access_token": "tok"}
c.enable_summary = enable_summary
c.last_indexed_at = None
return c

View file

@ -41,7 +41,6 @@ async def test_build_connector_doc_produces_correct_fields():
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert doc.title == "My Notion Page"
@ -51,40 +50,34 @@ async def test_build_connector_doc_produces_correct_fields():
assert doc.search_space_id == _SEARCH_SPACE_ID
assert doc.connector_id == _CONNECTOR_ID
assert doc.created_by_id == _USER_ID
assert doc.should_summarize is True
assert doc.metadata["page_title"] == "My Notion Page"
assert doc.metadata["page_id"] == "abc-123"
assert doc.metadata["connector_id"] == _CONNECTOR_ID
assert doc.metadata["document_type"] == "Notion Page"
assert doc.metadata["connector_type"] == "Notion"
assert doc.fallback_summary is not None
assert "My Notion Page" in doc.fallback_summary
assert markdown in doc.fallback_summary
assert "My Notion Page" in doc.deterministic_preview
assert markdown in doc.deterministic_preview
async def test_build_connector_doc_summary_disabled():
"""When enable_summary is False, should_summarize is False."""
"""Smoke test: _build_connector_doc succeeds when called without enable_summary."""
doc = _build_connector_doc(
_make_page(),
"# content",
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=False,
)
assert doc.should_summarize is False
# ---------------------------------------------------------------------------
# Shared fixtures for Slices 2-7 (full index_notion_pages tests)
# ---------------------------------------------------------------------------
def _mock_connector(enable_summary: bool = True):
def _mock_connector():
c = MagicMock()
c.config = {"access_token": "tok"}
c.enable_summary = enable_summary
c.last_indexed_at = None
return c

View file

@ -65,7 +65,6 @@ async def test_single_file_returns_one_connector_document(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -91,7 +90,6 @@ async def test_multiple_files_all_produce_documents(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 3
@ -119,7 +117,6 @@ async def test_one_download_exception_does_not_block_others(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 2
@ -146,7 +143,6 @@ async def test_etl_error_counts_as_download_failure(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -185,7 +181,6 @@ async def test_concurrency_bounded_by_semaphore(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
max_concurrency=2,
)
@ -225,7 +220,6 @@ async def test_heartbeat_fires_during_parallel_downloads(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
on_heartbeat=_on_heartbeat,
)

View file

@ -180,7 +180,6 @@ async def _run_gdrive_selected(mocks, file_ids):
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
@ -337,7 +336,7 @@ def gdrive_full_scan_mocks(monkeypatch):
_mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock)
)
monkeypatch.setattr(
_mod, "get_user_long_context_llm", AsyncMock(return_value=MagicMock())
_mod, "get_agent_llm", AsyncMock(return_value=MagicMock())
)
return {
@ -366,7 +365,6 @@ async def _run_gdrive_full_scan(mocks, max_files=500):
MagicMock(),
max_files,
include_subfolders=False,
enable_summary=True,
)
@ -455,7 +453,7 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
_mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock)
)
monkeypatch.setattr(
_mod, "get_user_long_context_llm", AsyncMock(return_value=MagicMock())
_mod, "get_agent_llm", AsyncMock(return_value=MagicMock())
)
mock_task_logger = MagicMock()
@ -473,7 +471,6 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
call_files = download_mock.call_args[0][1]
@ -539,7 +536,6 @@ async def _run_onedrive_selected(mocks, file_ids):
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
@ -641,7 +637,6 @@ async def _run_dropbox_selected(mocks, file_paths):
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)

View file

@ -18,7 +18,6 @@ def test_valid_document_created_with_required_fields():
connector_id=42,
created_by_id="00000000-0000-0000-0000-000000000001",
)
assert doc.should_summarize is True
assert doc.should_use_code_chunker is False
assert doc.metadata == {}
assert doc.connector_id == 42

View file

@ -1,41 +0,0 @@
from unittest.mock import MagicMock
import pytest
from app.indexing_pipeline.document_summarizer import summarize_document
pytestmark = pytest.mark.unit
@pytest.mark.usefixtures("patched_summarizer_chain")
async def test_without_metadata_returns_raw_summary():
"""Summarizer returns the LLM output directly when no metadata is provided."""
result = await summarize_document("# Content", llm=MagicMock(model="gpt-4"))
assert result == "The summary."
@pytest.mark.usefixtures("patched_summarizer_chain")
async def test_with_metadata_includes_metadata_values_in_output():
"""Non-empty metadata values are prepended to the summary output."""
result = await summarize_document(
"# Content",
llm=MagicMock(model="gpt-4"),
metadata={"author": "Alice", "source": "Notion"},
)
assert "Alice" in result
assert "Notion" in result
@pytest.mark.usefixtures("patched_summarizer_chain")
async def test_with_metadata_omits_empty_fields_from_output():
"""Empty metadata fields are omitted from the summary output."""
result = await summarize_document(
"# Content",
llm=MagicMock(model="gpt-4"),
metadata={"author": "Alice", "description": ""},
)
assert "Alice" in result
assert "description" not in result.lower()

View file

@ -51,11 +51,6 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
return await original_to_thread(func, *args, **kwargs)
monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
AsyncMock(return_value="Summary."),
)
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
monkeypatch.setattr(
@ -85,7 +80,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
document.id = 1
document.status = DocumentStatus.pending()
await pipeline.index(document, connector_doc, llm=MagicMock())
await pipeline.index(document, connector_doc)
# Either chunker entry point satisfies the "chunking runs off the event
# loop" contract this test guards. Routing between the two is verified
@ -104,10 +99,6 @@ async def test_non_code_documents_use_hybrid_chunker(
mid-row. Only documents flagged with ``should_use_code_chunker=True``
should take the ``chunk_text`` path.
"""
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
AsyncMock(return_value="Summary."),
)
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
monkeypatch.setattr(
@ -139,7 +130,7 @@ async def test_non_code_documents_use_hybrid_chunker(
document.id = 1
document.status = DocumentStatus.pending()
await pipeline.index(document, connector_doc, llm=MagicMock())
await pipeline.index(document, connector_doc)
mock_chunk_hybrid.assert_called_once()
mock_chunk_code.assert_not_called()