chore: ran linting

Anish Sarkar 2026-02-26 03:05:20 +05:30
parent 7332be956e
commit 9ccee054a5
24 changed files with 368 additions and 151 deletions

View file

@@ -147,7 +147,9 @@ async def delete_sandbox(thread_id: int | str) -> None:
     try:
         sandbox = client.find_one(labels=labels)
     except DaytonaError:
-        logger.debug("No sandbox to delete for thread %s (already removed)", thread_id)
+        logger.debug(
+            "No sandbox to delete for thread %s (already removed)", thread_id
+        )
         return
     try:
         client.delete(sandbox)
@@ -166,6 +168,7 @@ async def delete_sandbox(thread_id: int | str) -> None:
 # Local file persistence
 # ---------------------------------------------------------------------------
 def _get_sandbox_files_dir() -> Path:
     return Path(os.environ.get("SANDBOX_FILES_DIR", "sandbox_files"))
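For reference, a minimal sketch of redirecting local sandbox-file persistence through the environment variable read above; the path used here is illustrative, not a project default:

import os
from pathlib import Path

# Resolution mirrors _get_sandbox_files_dir(): the env var wins, otherwise "sandbox_files".
os.environ.setdefault("SANDBOX_FILES_DIR", "/tmp/surfsense_sandbox_files")
files_dir = Path(os.environ.get("SANDBOX_FILES_DIR", "sandbox_files"))
files_dir.mkdir(parents=True, exist_ok=True)  # ensure the directory exists before files are persisted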

View file

@@ -5,6 +5,7 @@ from app.db import DocumentType
 class ConnectorDocument(BaseModel):
     """Canonical data transfer object produced by connector adapters and consumed by the indexing pipeline."""
     title: str
     source_markdown: str
     unique_id: str

View file

@@ -3,5 +3,7 @@ from app.config import config
 def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
     """Chunk a text string using the configured chunker and return the chunk texts."""
-    chunker = config.code_chunker_instance if use_code_chunker else config.chunker_instance
+    chunker = (
+        config.code_chunker_instance if use_code_chunker else config.chunker_instance
+    )
     return [c.text for c in chunker.chunk(text)]
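A minimal usage sketch of the helper above, assuming chunk_text lives in app.indexing_pipeline.document_chunker (as the chunker fixtures later in this commit suggest) and that the configured chunker instances are initialized; the sample strings are illustrative:

from app.indexing_pipeline.document_chunker import chunk_text

prose_chunks = chunk_text("# Title\n\nSome prose to split into chunks.\n")
code_chunks = chunk_text("def f():\n    return 1\n", use_code_chunker=True)  # code-aware chunker
print(len(prose_chunks), len(code_chunks))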

View file

@@ -2,7 +2,9 @@ from app.prompts import SUMMARY_PROMPT_TEMPLATE
 from app.utils.document_converters import optimize_content_for_context_window
-async def summarize_document(source_markdown: str, llm, metadata: dict | None = None) -> str:
+async def summarize_document(
+    source_markdown: str, llm, metadata: dict | None = None
+) -> str:
     """Generate a text summary of a document using an LLM, prefixed with metadata when provided."""
     model_name = getattr(llm, "model", "gpt-3.5-turbo")
     optimized_content = optimize_content_for_context_window(

View file

@@ -12,7 +12,6 @@ from litellm.exceptions import (
     Timeout,
     UnprocessableEntityError,
 )
-from sqlalchemy.exc import IntegrityError
 # Tuples for use directly in except clauses.
 RETRYABLE_LLM_ERRORS = (
@@ -53,10 +52,14 @@ class PipelineMessages:
     LLM_PERMISSION = "LLM request denied. Check your account permissions."
     LLM_NOT_FOUND = "LLM model not found. Check your model configuration."
     LLM_BAD_REQUEST = "LLM rejected the request. Document content may be invalid."
-    LLM_UNPROCESSABLE = "Document exceeds the LLM context window even after optimization."
+    LLM_UNPROCESSABLE = (
+        "Document exceeds the LLM context window even after optimization."
+    )
     LLM_RESPONSE = "LLM returned an invalid response."
-    EMBEDDING_FAILED = "Embedding failed. Check your embedding model configuration or service."
+    EMBEDDING_FAILED = (
+        "Embedding failed. Check your embedding model configuration or service."
+    )
     EMBEDDING_MODEL = "Embedding model files are missing or corrupted."
     EMBEDDING_MEMORY = "Not enough memory to embed this document."
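A sketch of how these tuples are meant to be consumed, per the "Tuples for use directly in except clauses." comment; the wrapper function, its arguments, and the assumption that llm_permanent_message maps an exception to a user-facing string are illustrative, not the pipeline's actual code:

from app.indexing_pipeline.exceptions import (
    PERMANENT_LLM_ERRORS,
    RETRYABLE_LLM_ERRORS,
    llm_permanent_message,
)

async def summarize_with_error_mapping(doc, llm, summarize):
    # doc, llm, and summarize are stand-ins for the pipeline's real objects.
    try:
        return await summarize(doc.source_markdown, llm)
    except RETRYABLE_LLM_ERRORS:
        # Transient provider failure: the caller marks the document FAILED and retries on the next sync.
        raise
    except PERMANENT_LLM_ERRORS as exc:
        # Permanent failure: translate to a user-facing message and do not retry.
        raise RuntimeError(llm_permanent_message(exc)) from exc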

View file

@@ -2,6 +2,7 @@ import contextlib
 from datetime import UTC, datetime
 from sqlalchemy import delete, select
+from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.db import Chunk, Document, DocumentStatus
@@ -21,7 +22,6 @@ from app.indexing_pipeline.exceptions import (
     EMBEDDING_ERRORS,
     PERMANENT_LLM_ERRORS,
     RETRYABLE_LLM_ERRORS,
-    IntegrityError,
     PipelineMessages,
     embedding_message,
     llm_permanent_message,

View file

@@ -24,7 +24,9 @@ class LogMessages:
     # index
     INDEX_STARTED = "Document indexing started."
     INDEX_SUCCESS = "Document indexed successfully."
-    LLM_RETRYABLE = "Retryable LLM error — document marked failed, will retry on next sync."
+    LLM_RETRYABLE = (
+        "Retryable LLM error — document marked failed, will retry on next sync."
+    )
     LLM_PERMANENT = "Permanent LLM error — document marked failed."
     EMBEDDING_FAILED = "Embedding error — document marked failed."
     CHUNKING_OVERFLOW = "Chunking overflow — document marked failed."
@@ -52,7 +54,9 @@ def _build_message(msg: str, ctx: PipelineLogContext, **extra) -> str:
     return msg
-def _safe_log(level_fn, msg: str, ctx: PipelineLogContext, exc_info=None, **extra) -> None:
+def _safe_log(
+    level_fn, msg: str, ctx: PipelineLogContext, exc_info=None, **extra
+) -> None:
     # Logging must never raise — a broken log call inside an except block would
     # chain with the original exception and mask it entirely.
     try:
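A generic sketch of the never-raise logging pattern the comment above describes; this is not the project's _safe_log body, only the shape of it:

import logging

logger = logging.getLogger(__name__)

def never_raise_log(level_fn, msg: str, **extra) -> None:
    # Swallow any error from message building or the handler itself, so a broken
    # log call can never chain with (and mask) an exception already being handled.
    try:
        level_fn(msg, extra=extra or None)
    except Exception:
        pass

never_raise_log(logger.info, "Document indexing started.", document_id=42)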
@@ -64,6 +68,7 @@ def _safe_log(level_fn, msg: str, ctx: PipelineLogContext, exc_info=None, **extr
 # ── prepare_for_indexing ──────────────────────────────────────────────────────
 def log_document_queued(ctx: PipelineLogContext) -> None:
     _safe_log(logger.info, LogMessages.DOCUMENT_QUEUED, ctx)
@@ -77,7 +82,9 @@ def log_document_requeued(ctx: PipelineLogContext) -> None:
 def log_doc_skipped_unknown(ctx: PipelineLogContext, exc: Exception) -> None:
-    _safe_log(logger.warning, LogMessages.DOC_SKIPPED_UNKNOWN, ctx, exc_info=exc, error=exc)
+    _safe_log(
+        logger.warning, LogMessages.DOC_SKIPPED_UNKNOWN, ctx, exc_info=exc, error=exc
+    )
 def log_race_condition(ctx: PipelineLogContext) -> None:
@@ -90,6 +97,7 @@ def log_batch_aborted(ctx: PipelineLogContext, exc: Exception) -> None:
 # ── index ─────────────────────────────────────────────────────────────────────
 def log_index_started(ctx: PipelineLogContext) -> None:
     _safe_log(logger.info, LogMessages.INDEX_STARTED, ctx)

View file

@@ -10,6 +10,8 @@ These endpoints support the ThreadHistoryAdapter pattern from assistant-ui:
 - POST /threads/{thread_id}/messages - Append message
 """
+import asyncio
+import logging
 from datetime import UTC, datetime
 from fastapi import APIRouter, Depends, HTTPException, Request
@@ -52,9 +54,6 @@ from app.tasks.chat.stream_new_chat import stream_new_chat, stream_resume_chat
 from app.users import current_active_user
 from app.utils.rbac import check_permission
-import asyncio
-import logging
 _logger = logging.getLogger(__name__)
 router = APIRouter()
@@ -75,11 +74,19 @@ def _try_delete_sandbox(thread_id: int) -> None:
     try:
         await delete_sandbox(thread_id)
     except Exception:
-        _logger.warning("Background sandbox delete failed for thread %s", thread_id, exc_info=True)
+        _logger.warning(
+            "Background sandbox delete failed for thread %s",
+            thread_id,
+            exc_info=True,
+        )
     try:
         delete_local_sandbox_files(thread_id)
     except Exception:
-        _logger.warning("Local sandbox file cleanup failed for thread %s", thread_id, exc_info=True)
+        _logger.warning(
+            "Local sandbox file cleanup failed for thread %s",
+            thread_id,
+            exc_info=True,
+        )
     try:
         loop = asyncio.get_running_loop()

View file

@@ -87,7 +87,7 @@ async def download_sandbox_file(
     # Fall back to live sandbox download
     try:
         sandbox = await get_or_create_sandbox(thread_id)
-        raw_sandbox = sandbox._sandbox  # noqa: SLF001
+        raw_sandbox = sandbox._sandbox
         content: bytes = await asyncio.to_thread(raw_sandbox.fs.download_file, path)
     except Exception as exc:
         logger.warning("Sandbox file download failed for %s: %s", path, exc)

View file

@@ -877,7 +877,9 @@ async def _stream_agent_events(
     output_text = om.group(1) if om else ""
     thread_id_str = config.get("configurable", {}).get("thread_id", "")
-    for sf_match in re.finditer(r"^SANDBOX_FILE:\s*(.+)$", output_text, re.MULTILINE):
+    for sf_match in re.finditer(
+        r"^SANDBOX_FILE:\s*(.+)$", output_text, re.MULTILINE
+    ):
         fpath = sf_match.group(1).strip()
         if fpath and fpath not in result.sandbox_files:
             result.sandbox_files.append(fpath)
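A standalone sketch of what this scan extracts from tool output; the sample text is made up, the regex is the one used above:

import re

sample_output = "Done.\nSANDBOX_FILE: /workspace/report.csv\nSANDBOX_FILE: /workspace/report.csv\nSANDBOX_FILE: /workspace/plot.png\n"
files: list[str] = []
for match in re.finditer(r"^SANDBOX_FILE:\s*(.+)$", sample_output, re.MULTILINE):
    path = match.group(1).strip()
    if path and path not in files:  # de-duplicate while preserving order
        files.append(path)
print(files)  # ['/workspace/report.csv', '/workspace/plot.png']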
@@ -963,7 +965,10 @@ def _try_persist_and_delete_sandbox(
     sandbox_files: list[str],
 ) -> None:
     """Fire-and-forget: persist sandbox files locally then delete the sandbox."""
-    from app.agents.new_chat.sandbox import is_sandbox_enabled, persist_and_delete_sandbox
+    from app.agents.new_chat.sandbox import (
+        is_sandbox_enabled,
+        persist_and_delete_sandbox,
+    )
     if not is_sandbox_enabled():
         return

View file

@@ -179,11 +179,7 @@ addopts = "-v --tb=short -x --strict-markers -ra --durations=5"
 markers = [
     "unit: pure logic tests, no DB or external services",
     "integration: tests that require a real PostgreSQL database",
-    "document: document upload and processing tests",
-    "connector: connector indexing tests",
-    "chat: chat and agent tests",
-    "page_limit: page limit enforcement tests",
-    "upload_limit: file upload limit validation tests",
+    "e2e: tests requiring a running backend and real HTTP calls"
 ]
 filterwarnings = [
     "ignore::UserWarning:chonkie",

View file

@@ -46,6 +46,7 @@ def make_connector_document():
     Generic factory for unit tests. Overridden in tests/integration/conftest.py
     with real DB-backed IDs for integration tests.
     """
     def _make(**overrides):
         defaults = {
             "title": "Test Document",
@@ -58,4 +59,5 @@ def make_connector_document():
         }
         defaults.update(overrides)
         return ConnectorDocument(**defaults)
     return _make
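A small usage sketch of the factory fixture above; the test name and the overridden fields are illustrative:

import pytest

pytestmark = pytest.mark.unit

def test_factory_overrides_fields(make_connector_document):
    # Any ConnectorDocument field can be overridden; unspecified fields keep the defaults.
    doc = make_connector_document(title="Custom Title", unique_id="custom-id")
    assert doc.title == "Custom Title"
    assert doc.unique_id == "custom-id"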

View file

@@ -18,7 +18,6 @@ from tests.utils.helpers import (
     get_search_space_id,
 )
 # ---------------------------------------------------------------------------
 # Backend connectivity fixtures
 # ---------------------------------------------------------------------------

View file

@@ -28,7 +28,7 @@ from tests.utils.helpers import (
     upload_multiple_files,
 )
-pytestmark = pytest.mark.document
+pytestmark = pytest.mark.e2e
 # ---------------------------------------------------------------------------
 # Helpers local to this module

View file

@@ -31,7 +31,7 @@ from tests.utils.helpers import (
     upload_file,
 )
-pytestmark = pytest.mark.page_limit
+pytestmark = pytest.mark.e2e
 # ---------------------------------------------------------------------------

View file

@@ -21,7 +21,7 @@ import io
 import httpx
 import pytest
-pytestmark = pytest.mark.upload_limit
+pytestmark = pytest.mark.e2e
 # ---------------------------------------------------------------------------

View file

@@ -1,4 +1,3 @@
 import os
 import uuid
 from unittest.mock import AsyncMock, MagicMock
@@ -9,14 +8,21 @@ from sqlalchemy import text
 from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
 from sqlalchemy.pool import NullPool
-from app.db import Base, SearchSpace, SearchSourceConnector, SearchSourceConnectorType
-from app.db import User
-from app.db import DocumentType
+from app.db import (
+    Base,
+    DocumentType,
+    SearchSourceConnector,
+    SearchSourceConnectorType,
+    SearchSpace,
+    User,
+)
 from app.indexing_pipeline.connector_document import ConnectorDocument
 _EMBEDDING_DIM = 1024  # must match the Vector() dimension used in DB column creation
-_DEFAULT_TEST_DB = "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test"
+_DEFAULT_TEST_DB = (
+    "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test"
+)
 TEST_DATABASE_URL = os.environ.get("TEST_DATABASE_URL", _DEFAULT_TEST_DB)
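A minimal sketch of pointing the integration suite at a different database; the URL is illustrative, and the variable must be set before this conftest is imported (typically exported in the shell that invokes pytest):

import os

# Falls back to the default shown above when TEST_DATABASE_URL is unset.
os.environ.setdefault(
    "TEST_DATABASE_URL",
    "postgresql+asyncpg://postgres:postgres@127.0.0.1:5432/surfsense_test",
)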
@@ -80,7 +86,9 @@ async def db_user(db_session: AsyncSession) -> User:
 @pytest_asyncio.fixture
-async def db_connector(db_session: AsyncSession, db_user: User, db_search_space: "SearchSpace") -> SearchSourceConnector:
+async def db_connector(
+    db_session: AsyncSession, db_user: User, db_search_space: "SearchSpace"
+) -> SearchSourceConnector:
     connector = SearchSourceConnector(
         name="Test Connector",
         connector_type=SearchSourceConnectorType.CLICKUP_CONNECTOR,
@@ -147,6 +155,7 @@ def patched_chunk_text(monkeypatch) -> MagicMock:
 @pytest.fixture
 def make_connector_document(db_connector, db_user):
     """Integration-scoped override: uses real DB connector and user IDs."""
     def _make(**overrides):
         defaults = {
             "title": "Test Document",
@@ -159,6 +168,5 @@ def make_connector_document(db_connector, db_user):
         }
         defaults.update(overrides)
         return ConnectorDocument(**defaults)
     return _make

View file

@@ -7,7 +7,9 @@ from app.indexing_pipeline.adapters.file_upload_adapter import index_uploaded_fi
 pytestmark = pytest.mark.integration
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
     """Document status is READY after successful indexing."""
     await index_uploaded_file(
@@ -28,7 +30,9 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
     assert DocumentStatus.is_state(document.status, DocumentStatus.READY)
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
     """Document content is set to the LLM-generated summary."""
     await index_uploaded_file(
@@ -49,7 +53,9 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
     assert document.content == "Mocked summary."
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker):
     """Chunks derived from the source markdown are persisted in the DB."""
     await index_uploaded_file(
@@ -76,7 +82,9 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker
     assert chunks[0].content == "Test chunk content."
-@pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize_raises", "patched_embed_text", "patched_chunk_text"
+)
 async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, mocker):
     """RuntimeError is raised when the indexing step fails so the caller can fire a failure notification."""
     with pytest.raises(RuntimeError):

View file

@@ -7,9 +7,14 @@ from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineServ
 pytestmark = pytest.mark.integration
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_sets_status_ready(
-    db_session, db_search_space, make_connector_document, mocker,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    mocker,
 ):
     """Document status is READY after successful indexing."""
     connector_doc = make_connector_document(search_space_id=db_search_space.id)
@@ -21,15 +26,22 @@
     await service.index(document, connector_doc, llm=mocker.Mock())
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_content_is_summary_when_should_summarize_true(
-    db_session, db_search_space, make_connector_document, mocker,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    mocker,
 ):
     """Document content is set to the LLM-generated summary when should_summarize=True."""
     connector_doc = make_connector_document(search_space_id=db_search_space.id)
@@ -41,15 +53,21 @@
     await service.index(document, connector_doc, llm=mocker.Mock())
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert reloaded.content == "Mocked summary."
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_content_is_source_markdown_when_should_summarize_false(
-    db_session, db_search_space, make_connector_document,
+    db_session,
+    db_search_space,
+    make_connector_document,
 ):
     """Document content is set to source_markdown verbatim when should_summarize=False."""
     connector_doc = make_connector_document(
@@ -65,15 +83,22 @@
     await service.index(document, connector_doc, llm=None)
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert reloaded.content == "## Raw content"
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_chunks_written_to_db(
-    db_session, db_search_space, make_connector_document, mocker,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    mocker,
 ):
     """Chunks derived from source_markdown are persisted in the DB."""
     connector_doc = make_connector_document(search_space_id=db_search_space.id)
@@ -94,9 +119,14 @@
     assert chunks[0].content == "Test chunk content."
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_embedding_written_to_db(
-    db_session, db_search_space, make_connector_document, mocker,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    mocker,
 ):
     """Document embedding vector is persisted in the DB after indexing."""
     connector_doc = make_connector_document(search_space_id=db_search_space.id)
@@ -108,16 +138,23 @@
     await service.index(document, connector_doc, llm=mocker.Mock())
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert reloaded.embedding is not None
     assert len(reloaded.embedding) == 1024
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_updated_at_advances_after_indexing(
-    db_session, db_search_space, make_connector_document, mocker,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    mocker,
 ):
     """updated_at timestamp is later after indexing than it was at prepare time."""
     connector_doc = make_connector_document(search_space_id=db_search_space.id)
@@ -127,20 +164,28 @@
     document = prepared[0]
     document_id = document.id
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     updated_at_pending = result.scalars().first().updated_at
     await service.index(document, connector_doc, llm=mocker.Mock())
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     updated_at_ready = result.scalars().first().updated_at
     assert updated_at_ready > updated_at_pending
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_no_llm_falls_back_to_source_markdown(
-    db_session, db_search_space, make_connector_document,
+    db_session,
+    db_search_space,
+    make_connector_document,
 ):
     """When llm=None and no fallback_summary, content falls back to source_markdown."""
     connector_doc = make_connector_document(
@@ -156,16 +201,22 @@
     await service.index(document, connector_doc, llm=None)
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
     assert reloaded.content == "## Fallback content"
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_fallback_summary_used_when_llm_unavailable(
-    db_session, db_search_space, make_connector_document,
+    db_session,
+    db_search_space,
+    make_connector_document,
 ):
     """fallback_summary is used as content when llm=None and should_summarize=True."""
     connector_doc = make_connector_document(
@@ -181,16 +232,23 @@
     await service.index(prepared[0], connector_doc, llm=None)
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
     assert reloaded.content == "Short pre-built summary."
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_reindex_replaces_old_chunks(
-    db_session, db_search_space, make_connector_document, mocker,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    mocker,
 ):
     """Re-indexing a document replaces its old chunks rather than appending."""
     connector_doc = make_connector_document(
@@ -220,9 +278,14 @@
     assert len(chunks) == 1
-@pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize_raises", "patched_embed_text", "patched_chunk_text"
+)
 async def test_llm_error_sets_status_failed(
-    db_session, db_search_space, make_connector_document, mocker,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    mocker,
 ):
     """Document status is FAILED when the LLM raises during indexing."""
     connector_doc = make_connector_document(search_space_id=db_search_space.id)
@@ -234,15 +297,22 @@
     await service.index(document, connector_doc, llm=mocker.Mock())
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert DocumentStatus.is_state(reloaded.status, DocumentStatus.FAILED)
-@pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize_raises", "patched_embed_text", "patched_chunk_text"
+)
 async def test_llm_error_leaves_no_partial_data(
-    db_session, db_search_space, make_connector_document, mocker,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    mocker,
 ):
     """A failed indexing attempt leaves no partial embedding or chunks in the DB."""
     connector_doc = make_connector_document(search_space_id=db_search_space.id)
@@ -254,7 +324,9 @@
     await service.index(document, connector_doc, llm=mocker.Mock())
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert reloaded.embedding is None

View file

@@ -2,7 +2,9 @@ import pytest
 from sqlalchemy import select
 from app.db import Document, DocumentStatus
-from app.indexing_pipeline.document_hashing import compute_content_hash as real_compute_content_hash
+from app.indexing_pipeline.document_hashing import (
+    compute_content_hash as real_compute_content_hash,
+)
 from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
 pytestmark = pytest.mark.integration
@@ -20,7 +22,9 @@ async def test_new_document_is_persisted_with_pending_status(
     assert len(results) == 1
     document_id = results[0].id
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert reloaded is not None
@@ -28,9 +32,14 @@
     assert reloaded.source_markdown == doc.source_markdown
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_unchanged_ready_document_is_skipped(
-    db_session, db_search_space, make_connector_document, mocker,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    mocker,
 ):
     """A READY document with unchanged content is not returned for re-indexing."""
     doc = make_connector_document(search_space_id=db_search_space.id)
@@ -46,24 +55,35 @@ async def test_unchanged_ready_document_is_skipped(
     assert results == []
-@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize", "patched_embed_text", "patched_chunk_text"
+)
 async def test_title_only_change_updates_title_in_db(
-    db_session, db_search_space, make_connector_document, mocker,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    mocker,
 ):
     """A title-only change updates the DB title without re-queuing the document."""
-    original = make_connector_document(search_space_id=db_search_space.id, title="Original Title")
+    original = make_connector_document(
+        search_space_id=db_search_space.id, title="Original Title"
+    )
     service = IndexingPipelineService(session=db_session)
     prepared = await service.prepare_for_indexing([original])
     document_id = prepared[0].id
     await service.index(prepared[0], original, llm=mocker.Mock())
-    renamed = make_connector_document(search_space_id=db_search_space.id, title="Updated Title")
+    renamed = make_connector_document(
+        search_space_id=db_search_space.id, title="Updated Title"
+    )
     results = await service.prepare_for_indexing([renamed])
     assert results == []
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert reloaded.title == "Updated Title"
@@ -73,19 +93,25 @@ async def test_changed_content_is_returned_for_reprocessing(
     db_session, db_search_space, make_connector_document
 ):
     """A document with changed content is returned for re-indexing with updated markdown."""
-    original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1")
+    original = make_connector_document(
+        search_space_id=db_search_space.id, source_markdown="## v1"
+    )
     service = IndexingPipelineService(session=db_session)
     first = await service.prepare_for_indexing([original])
     original_id = first[0].id
-    updated = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v2")
+    updated = make_connector_document(
+        search_space_id=db_search_space.id, source_markdown="## v2"
+    )
     results = await service.prepare_for_indexing([updated])
     assert len(results) == 1
     assert results[0].id == original_id
-    result = await db_session.execute(select(Document).filter(Document.id == original_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == original_id)
+    )
     reloaded = result.scalars().first()
     assert reloaded.source_markdown == "## v2"
@@ -97,9 +123,24 @@ async def test_all_documents_in_batch_are_persisted(
 ):
     """All documents in a batch are persisted and returned."""
     docs = [
-        make_connector_document(search_space_id=db_search_space.id, unique_id="id-1", title="Doc 1", source_markdown="## Content 1"),
-        make_connector_document(search_space_id=db_search_space.id, unique_id="id-2", title="Doc 2", source_markdown="## Content 2"),
-        make_connector_document(search_space_id=db_search_space.id, unique_id="id-3", title="Doc 3", source_markdown="## Content 3"),
+        make_connector_document(
+            search_space_id=db_search_space.id,
+            unique_id="id-1",
+            title="Doc 1",
+            source_markdown="## Content 1",
+        ),
+        make_connector_document(
+            search_space_id=db_search_space.id,
+            unique_id="id-2",
+            title="Doc 2",
+            source_markdown="## Content 2",
+        ),
+        make_connector_document(
+            search_space_id=db_search_space.id,
+            unique_id="id-3",
+            title="Doc 3",
+            source_markdown="## Content 3",
+        ),
     ]
     service = IndexingPipelineService(session=db_session)
@@ -107,7 +148,9 @@
     assert len(results) == 3
-    result = await db_session.execute(select(Document).filter(Document.search_space_id == db_search_space.id))
+    result = await db_session.execute(
+        select(Document).filter(Document.search_space_id == db_search_space.id)
+    )
     rows = result.scalars().all()
     assert len(rows) == 3
@@ -124,7 +167,9 @@ async def test_duplicate_in_batch_is_persisted_once(
     assert len(results) == 1
-    result = await db_session.execute(select(Document).filter(Document.search_space_id == db_search_space.id))
+    result = await db_session.execute(
+        select(Document).filter(Document.search_space_id == db_search_space.id)
+    )
     rows = result.scalars().all()
     assert len(rows) == 1
@@ -143,7 +188,9 @@ async def test_created_by_id_is_persisted(
     results = await service.prepare_for_indexing([doc])
     document_id = results[0].id
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert str(reloaded.created_by_id) == str(db_user.id)
@@ -170,7 +217,9 @@ async def test_metadata_is_updated_when_content_changes(
     )
     await service.prepare_for_indexing([updated])
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     reloaded = result.scalars().first()
     assert reloaded.document_metadata == {"status": "done"}
@@ -180,19 +229,27 @@ async def test_updated_at_advances_when_title_only_changes(
     db_session, db_search_space, make_connector_document
 ):
     """updated_at advances even when only the title changes."""
-    original = make_connector_document(search_space_id=db_search_space.id, title="Old Title")
+    original = make_connector_document(
+        search_space_id=db_search_space.id, title="Old Title"
+    )
     service = IndexingPipelineService(session=db_session)
     first = await service.prepare_for_indexing([original])
     document_id = first[0].id
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     updated_at_v1 = result.scalars().first().updated_at
-    renamed = make_connector_document(search_space_id=db_search_space.id, title="New Title")
+    renamed = make_connector_document(
+        search_space_id=db_search_space.id, title="New Title"
+    )
     await service.prepare_for_indexing([renamed])
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     updated_at_v2 = result.scalars().first().updated_at
     assert updated_at_v2 > updated_at_v1
@@ -202,19 +259,27 @@ async def test_updated_at_advances_when_content_changes(
     db_session, db_search_space, make_connector_document
 ):
     """updated_at advances when document content changes."""
-    original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1")
+    original = make_connector_document(
+        search_space_id=db_search_space.id, source_markdown="## v1"
+    )
     service = IndexingPipelineService(session=db_session)
     first = await service.prepare_for_indexing([original])
    document_id = first[0].id
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     updated_at_v1 = result.scalars().first().updated_at
-    updated = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v2")
+    updated = make_connector_document(
+        search_space_id=db_search_space.id, source_markdown="## v2"
+    )
     await service.prepare_for_indexing([updated])
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
     updated_at_v2 = result.scalars().first().updated_at
     assert updated_at_v2 > updated_at_v1
@@ -273,9 +338,14 @@ async def test_same_content_from_different_source_is_skipped(
     assert len(result.scalars().all()) == 1
-@pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text")
+@pytest.mark.usefixtures(
+    "patched_summarize_raises", "patched_embed_text", "patched_chunk_text"
+)
 async def test_failed_document_with_unchanged_content_is_requeued(
-    db_session, db_search_space, make_connector_document, mocker,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    mocker,
 ):
     """A FAILED document with unchanged content is re-queued as PENDING on the next run."""
     doc = make_connector_document(search_space_id=db_search_space.id)
@@ -286,8 +356,12 @@ async def test_failed_document_with_unchanged_content_is_requeued(
     document_id = prepared[0].id
     await service.index(prepared[0], doc, llm=mocker.Mock())
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
-    assert DocumentStatus.is_state(result.scalars().first().status, DocumentStatus.FAILED)
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
+    assert DocumentStatus.is_state(
+        result.scalars().first().status, DocumentStatus.FAILED
+    )
     # Next run: same content, pipeline must re-queue the failed document
     results = await service.prepare_for_indexing([doc])
@@ -295,8 +369,12 @@
     assert len(results) == 1
     assert results[0].id == document_id
-    result = await db_session.execute(select(Document).filter(Document.id == document_id))
-    assert DocumentStatus.is_state(result.scalars().first().status, DocumentStatus.PENDING)
+    result = await db_session.execute(
+        select(Document).filter(Document.id == document_id)
+    )
+    assert DocumentStatus.is_state(
+        result.scalars().first().status, DocumentStatus.PENDING
+    )
 async def test_title_and_content_change_updates_both_and_returns_document(
@@ -323,16 +401,20 @@
     assert len(results) == 1
     assert results[0].id == original_id
-    result = await db_session.execute(select(Document).filter(Document.id == original_id))
+    result = await db_session.execute(
+        select(Document).filter(Document.id == original_id)
+    )
     reloaded = result.scalars().first()
     assert reloaded.title == "Updated Title"
     assert reloaded.source_markdown == "## v2"
 async def test_one_bad_document_in_batch_does_not_prevent_others_from_being_persisted(
-    db_session, db_search_space, make_connector_document, monkeypatch,
+    db_session,
+    db_search_space,
+    make_connector_document,
+    monkeypatch,
 ):
     """
     A per-document error during prepare_for_indexing must be isolated.

View file

@@ -1,6 +1,7 @@
-import pytest
 from unittest.mock import AsyncMock, MagicMock
+import pytest
 @pytest.fixture
 def patched_summarizer_chain(monkeypatch):
@@ -21,7 +22,9 @@ def patched_summarizer_chain(monkeypatch):
 def patched_chunker_instance(monkeypatch):
     mock = MagicMock()
     mock.chunk.return_value = [MagicMock(text="prose chunk")]
-    monkeypatch.setattr("app.indexing_pipeline.document_chunker.config.chunker_instance", mock)
+    monkeypatch.setattr(
+        "app.indexing_pipeline.document_chunker.config.chunker_instance", mock
+    )
     return mock
@@ -29,5 +32,7 @@ def patched_chunker_instance(monkeypatch):
 def patched_code_chunker_instance(monkeypatch):
     mock = MagicMock()
     mock.chunk.return_value = [MagicMock(text="code chunk")]
-    monkeypatch.setattr("app.indexing_pipeline.document_chunker.config.code_chunker_instance", mock)
+    monkeypatch.setattr(
+        "app.indexing_pipeline.document_chunker.config.code_chunker_instance", mock
+    )
     return mock

View file

@@ -1,7 +1,10 @@
 import pytest
 from app.db import DocumentType
-from app.indexing_pipeline.document_hashing import compute_content_hash, compute_unique_identifier_hash
+from app.indexing_pipeline.document_hashing import (
+    compute_content_hash,
+    compute_unique_identifier_hash,
+)
 pytestmark = pytest.mark.unit
@@ -10,21 +13,31 @@ def test_different_unique_id_produces_different_hash(make_connector_document):
     """Two documents with different unique_ids produce different identifier hashes."""
     doc_a = make_connector_document(unique_id="id-001")
     doc_b = make_connector_document(unique_id="id-002")
-    assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b)
+    assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(
+        doc_b
+    )
-def test_different_search_space_produces_different_identifier_hash(make_connector_document):
+def test_different_search_space_produces_different_identifier_hash(
+    make_connector_document,
+):
     """Same document in different search spaces produces different identifier hashes."""
     doc_a = make_connector_document(search_space_id=1)
     doc_b = make_connector_document(search_space_id=2)
-    assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b)
+    assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(
+        doc_b
+    )
-def test_different_document_type_produces_different_identifier_hash(make_connector_document):
+def test_different_document_type_produces_different_identifier_hash(
+    make_connector_document,
+):
     """Same unique_id with different document types produces different identifier hashes."""
     doc_a = make_connector_document(document_type=DocumentType.CLICKUP_CONNECTOR)
     doc_b = make_connector_document(document_type=DocumentType.NOTION_CONNECTOR)
-    assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b)
+    assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(
+        doc_b
+    )
 def test_same_content_same_space_produces_same_content_hash(make_connector_document):
@@ -34,7 +47,9 @@ def test_same_content_same_space_produces_same_content_hash(make_connector_docum
     assert compute_content_hash(doc_a) == compute_content_hash(doc_b)
-def test_same_content_different_space_produces_different_content_hash(make_connector_document):
+def test_same_content_different_space_produces_different_content_hash(
+    make_connector_document,
+):
     """Identical content in different search spaces produces different content hashes."""
     doc_a = make_connector_document(source_markdown="Hello world", search_space_id=1)
     doc_b = make_connector_document(source_markdown="Hello world", search_space_id=2)

View file

@@ -1,6 +1,7 @@
-import pytest
 from unittest.mock import MagicMock
+import pytest
 from app.indexing_pipeline.document_summarizer import summarize_document
 pytestmark = pytest.mark.unit
@@ -38,5 +39,3 @@ async def test_with_metadata_omits_empty_fields_from_output():
     assert "Alice" in result
     assert "description" not in result.lower()