mirror of https://github.com/MODSetter/SurfSense.git (synced 2026-04-25 00:36:31 +02:00)

Commit 9ccee054a5 (parent 7332be956e)
chore: ran linting

24 changed files with 368 additions and 151 deletions
@@ -147,7 +147,9 @@ async def delete_sandbox(thread_id: int | str) -> None:
    try:
        sandbox = client.find_one(labels=labels)
    except DaytonaError:
        logger.debug("No sandbox to delete for thread %s (already removed)", thread_id)
        logger.debug(
            "No sandbox to delete for thread %s (already removed)", thread_id
        )
        return
    try:
        client.delete(sandbox)

@@ -166,6 +168,7 @@ async def delete_sandbox(thread_id: int | str) -> None:
# Local file persistence
# ---------------------------------------------------------------------------


def _get_sandbox_files_dir() -> Path:
    return Path(os.environ.get("SANDBOX_FILES_DIR", "sandbox_files"))
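Beyond the re-wrapped logger.debug call, this hunk shows the idempotent-delete shape of delete_sandbox: a failed lookup is treated as "already removed", and only a found sandbox is passed to client.delete. A minimal sketch of that pattern, assuming only the names visible in the hunk (the wrapper function itself is hypothetical):

import logging

logger = logging.getLogger(__name__)


def delete_sandbox_if_present(client, labels: dict, thread_id) -> None:
    # A missing sandbox is not an error: someone else already cleaned it up.
    try:
        sandbox = client.find_one(labels=labels)
    except Exception:  # DaytonaError in the real module
        logger.debug(
            "No sandbox to delete for thread %s (already removed)", thread_id
        )
        return
    client.delete(sandbox)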
@@ -5,6 +5,7 @@ from app.db import DocumentType

class ConnectorDocument(BaseModel):
    """Canonical data transfer object produced by connector adapters and consumed by the indexing pipeline."""

    title: str
    source_markdown: str
    unique_id: str
@@ -3,5 +3,7 @@ from app.config import config

def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
    """Chunk a text string using the configured chunker and return the chunk texts."""
    chunker = config.code_chunker_instance if use_code_chunker else config.chunker_instance
    chunker = (
        config.code_chunker_instance if use_code_chunker else config.chunker_instance
    )
    return [c.text for c in chunker.chunk(text)]
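chunk_text only selects between the prose and code chunkers configured on config and returns the raw chunk texts. A hedged usage sketch; the module path is taken from the monkeypatch targets in the test hunks later in this commit, and the inputs are made up:

from app.indexing_pipeline.document_chunker import chunk_text

prose_chunks = chunk_text("# Title\n\nA long markdown document body...")
code_chunks = chunk_text("def add(a, b):\n    return a + b\n", use_code_chunker=True)
print(len(prose_chunks), len(code_chunks))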
@@ -2,7 +2,9 @@ from app.prompts import SUMMARY_PROMPT_TEMPLATE
from app.utils.document_converters import optimize_content_for_context_window


async def summarize_document(source_markdown: str, llm, metadata: dict | None = None) -> str:
async def summarize_document(
    source_markdown: str, llm, metadata: dict | None = None
) -> str:
    """Generate a text summary of a document using an LLM, prefixed with metadata when provided."""
    model_name = getattr(llm, "model", "gpt-3.5-turbo")
    optimized_content = optimize_content_for_context_window(
@@ -12,7 +12,6 @@ from litellm.exceptions import (
    Timeout,
    UnprocessableEntityError,
)
from sqlalchemy.exc import IntegrityError

# Tuples for use directly in except clauses.
RETRYABLE_LLM_ERRORS = (

@@ -36,29 +35,33 @@ PERMANENT_LLM_ERRORS = (
# (LiteLLMEmbeddings, CohereEmbeddings, GeminiEmbeddings all normalize to RuntimeError).
EMBEDDING_ERRORS = (
    RuntimeError,  # local device failure or API backend normalization
    OSError,  # model files missing or corrupted (local backends)
    MemoryError,  # document too large for available RAM
    OSError,  # model files missing or corrupted (local backends)
    MemoryError,  # document too large for available RAM
)


class PipelineMessages:
    RATE_LIMIT = "LLM rate limit exceeded. Will retry on next sync."
    LLM_TIMEOUT = "LLM request timed out. Will retry on next sync."
    LLM_UNAVAILABLE = "LLM service temporarily unavailable. Will retry on next sync."
    LLM_BAD_GATEWAY = "LLM gateway error. Will retry on next sync."
    LLM_SERVER_ERROR = "LLM internal server error. Will retry on next sync."
    LLM_CONNECTION = "Could not reach the LLM service. Check network connectivity."
    RATE_LIMIT = "LLM rate limit exceeded. Will retry on next sync."
    LLM_TIMEOUT = "LLM request timed out. Will retry on next sync."
    LLM_UNAVAILABLE = "LLM service temporarily unavailable. Will retry on next sync."
    LLM_BAD_GATEWAY = "LLM gateway error. Will retry on next sync."
    LLM_SERVER_ERROR = "LLM internal server error. Will retry on next sync."
    LLM_CONNECTION = "Could not reach the LLM service. Check network connectivity."

    LLM_AUTH = "LLM authentication failed. Check your API key."
    LLM_PERMISSION = "LLM request denied. Check your account permissions."
    LLM_NOT_FOUND = "LLM model not found. Check your model configuration."
    LLM_BAD_REQUEST = "LLM rejected the request. Document content may be invalid."
    LLM_UNPROCESSABLE = "Document exceeds the LLM context window even after optimization."
    LLM_RESPONSE = "LLM returned an invalid response."
    LLM_AUTH = "LLM authentication failed. Check your API key."
    LLM_PERMISSION = "LLM request denied. Check your account permissions."
    LLM_NOT_FOUND = "LLM model not found. Check your model configuration."
    LLM_BAD_REQUEST = "LLM rejected the request. Document content may be invalid."
    LLM_UNPROCESSABLE = (
        "Document exceeds the LLM context window even after optimization."
    )
    LLM_RESPONSE = "LLM returned an invalid response."

    EMBEDDING_FAILED = "Embedding failed. Check your embedding model configuration or service."
    EMBEDDING_MODEL = "Embedding model files are missing or corrupted."
    EMBEDDING_MEMORY = "Not enough memory to embed this document."
    EMBEDDING_FAILED = (
        "Embedding failed. Check your embedding model configuration or service."
    )
    EMBEDDING_MODEL = "Embedding model files are missing or corrupted."
    EMBEDDING_MEMORY = "Not enough memory to embed this document."

    CHUNKING_OVERFLOW = "Document structure is too deeply nested to chunk."
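The comment above the tuples says they are meant to be used directly in except clauses, so one except block can cover a whole error family and map it to a stable PipelineMessages string. A hypothetical sketch of that usage; mark_failed and the wrapper function are made up, while summarize_document and the imported names come from hunks in this commit. The real pipeline picks the message via helpers such as llm_permanent_message and embedding_message (imported in a later hunk); the fixed attributes here are only for illustration.

from app.indexing_pipeline.document_summarizer import summarize_document
from app.indexing_pipeline.exceptions import (
    EMBEDDING_ERRORS,
    PERMANENT_LLM_ERRORS,
    RETRYABLE_LLM_ERRORS,
    PipelineMessages,
)


def mark_failed(document, message: str) -> None:
    # Hypothetical helper: record a user-facing failure message on the row.
    document.status = "FAILED"
    document.error_message = message


async def summarize_or_fail(document, llm) -> str | None:
    try:
        return await summarize_document(document.source_markdown, llm)
    except RETRYABLE_LLM_ERRORS:
        mark_failed(document, PipelineMessages.LLM_UNAVAILABLE)  # retried on next sync
    except PERMANENT_LLM_ERRORS:
        mark_failed(document, PipelineMessages.LLM_AUTH)
    except EMBEDDING_ERRORS:
        mark_failed(document, PipelineMessages.EMBEDDING_FAILED)
    return None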
@@ -2,6 +2,7 @@ import contextlib
from datetime import UTC, datetime

from sqlalchemy import delete, select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Chunk, Document, DocumentStatus

@@ -21,7 +22,6 @@ from app.indexing_pipeline.exceptions import (
    EMBEDDING_ERRORS,
    PERMANENT_LLM_ERRORS,
    RETRYABLE_LLM_ERRORS,
    IntegrityError,
    PipelineMessages,
    embedding_message,
    llm_permanent_message,
@@ -8,27 +8,29 @@ logger = logging.getLogger(__name__)
class PipelineLogContext:
    connector_id: int | None
    search_space_id: int
    unique_id: str  # always available from ConnectorDocument
    doc_id: int | None = None  # set once the DB row exists (index phase only)
    unique_id: str  # always available from ConnectorDocument
    doc_id: int | None = None  # set once the DB row exists (index phase only)


class LogMessages:
    # prepare_for_indexing
    DOCUMENT_QUEUED = "New document queued for indexing."
    DOCUMENT_UPDATED = "Document content changed, re-queued for indexing."
    DOCUMENT_REQUEUED = "Stuck document re-queued for indexing."
    DOCUMENT_QUEUED = "New document queued for indexing."
    DOCUMENT_UPDATED = "Document content changed, re-queued for indexing."
    DOCUMENT_REQUEUED = "Stuck document re-queued for indexing."
    DOC_SKIPPED_UNKNOWN = "Unexpected error — document skipped."
    BATCH_ABORTED = "Fatal DB error — aborting prepare batch."
    RACE_CONDITION = "Concurrent worker beat us to the commit — rolling back batch."
    BATCH_ABORTED = "Fatal DB error — aborting prepare batch."
    RACE_CONDITION = "Concurrent worker beat us to the commit — rolling back batch."

    # index
    INDEX_STARTED = "Document indexing started."
    INDEX_SUCCESS = "Document indexed successfully."
    LLM_RETRYABLE = "Retryable LLM error — document marked failed, will retry on next sync."
    LLM_PERMANENT = "Permanent LLM error — document marked failed."
    EMBEDDING_FAILED = "Embedding error — document marked failed."
    CHUNKING_OVERFLOW = "Chunking overflow — document marked failed."
    UNEXPECTED = "Unexpected error — document marked failed."
    INDEX_STARTED = "Document indexing started."
    INDEX_SUCCESS = "Document indexed successfully."
    LLM_RETRYABLE = (
        "Retryable LLM error — document marked failed, will retry on next sync."
    )
    LLM_PERMANENT = "Permanent LLM error — document marked failed."
    EMBEDDING_FAILED = "Embedding error — document marked failed."
    CHUNKING_OVERFLOW = "Chunking overflow — document marked failed."
    UNEXPECTED = "Unexpected error — document marked failed."


def _format_context(ctx: PipelineLogContext) -> str:

@@ -52,7 +54,9 @@ def _build_message(msg: str, ctx: PipelineLogContext, **extra) -> str:
    return msg


def _safe_log(level_fn, msg: str, ctx: PipelineLogContext, exc_info=None, **extra) -> None:
def _safe_log(
    level_fn, msg: str, ctx: PipelineLogContext, exc_info=None, **extra
) -> None:
    # Logging must never raise — a broken log call inside an except block would
    # chain with the original exception and mask it entirely.
    try:

@@ -64,6 +68,7 @@ def _safe_log(level_fn, msg: str, ctx: PipelineLogContext, exc_info=None, **extr

# ── prepare_for_indexing ──────────────────────────────────────────────────────


def log_document_queued(ctx: PipelineLogContext) -> None:
    _safe_log(logger.info, LogMessages.DOCUMENT_QUEUED, ctx)


@@ -77,7 +82,9 @@ def log_document_requeued(ctx: PipelineLogContext) -> None:


def log_doc_skipped_unknown(ctx: PipelineLogContext, exc: Exception) -> None:
    _safe_log(logger.warning, LogMessages.DOC_SKIPPED_UNKNOWN, ctx, exc_info=exc, error=exc)
    _safe_log(
        logger.warning, LogMessages.DOC_SKIPPED_UNKNOWN, ctx, exc_info=exc, error=exc
    )


def log_race_condition(ctx: PipelineLogContext) -> None:

@@ -90,6 +97,7 @@ def log_batch_aborted(ctx: PipelineLogContext, exc: Exception) -> None:

# ── index ─────────────────────────────────────────────────────────────────────


def log_index_started(ctx: PipelineLogContext) -> None:
    _safe_log(logger.info, LogMessages.INDEX_STARTED, ctx)
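The comment added to _safe_log states the key constraint in this module: the log helpers are called from inside except blocks, so a logging failure must be swallowed, otherwise it would chain with and mask the exception being handled. A self-contained sketch of that guard, assuming only what the hunk shows (the real code formats the message via _build_message):

import logging

logger = logging.getLogger(__name__)


def safe_log(level_fn, msg: str, context: str, exc_info=None) -> None:
    # Never let a broken log call escape: the caller may be inside an
    # except block, and a new exception here would mask the original one.
    try:
        level_fn("%s [%s]", msg, context, exc_info=exc_info)
    except Exception:
        pass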
@@ -10,6 +10,8 @@ These endpoints support the ThreadHistoryAdapter pattern from assistant-ui:
- POST /threads/{thread_id}/messages - Append message
"""

import asyncio
import logging
from datetime import UTC, datetime

from fastapi import APIRouter, Depends, HTTPException, Request

@@ -52,9 +54,6 @@ from app.tasks.chat.stream_new_chat import stream_new_chat, stream_resume_chat
from app.users import current_active_user
from app.utils.rbac import check_permission

import asyncio
import logging

_logger = logging.getLogger(__name__)

router = APIRouter()

@@ -75,11 +74,19 @@ def _try_delete_sandbox(thread_id: int) -> None:
    try:
        await delete_sandbox(thread_id)
    except Exception:
        _logger.warning("Background sandbox delete failed for thread %s", thread_id, exc_info=True)
        _logger.warning(
            "Background sandbox delete failed for thread %s",
            thread_id,
            exc_info=True,
        )
    try:
        delete_local_sandbox_files(thread_id)
    except Exception:
        _logger.warning("Local sandbox file cleanup failed for thread %s", thread_id, exc_info=True)
        _logger.warning(
            "Local sandbox file cleanup failed for thread %s",
            thread_id,
            exc_info=True,
        )

    try:
        loop = asyncio.get_running_loop()
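The hunk above only re-wraps the warning calls, but the shape of _try_delete_sandbox is worth noting: each cleanup step gets its own try/except, so a failed remote delete cannot stop the local file cleanup, and failures are logged rather than raised. A hedged, self-contained sketch of that best-effort pattern (the cleanup callables are stand-ins, not the project's functions):

import logging

logger = logging.getLogger(__name__)


def best_effort_cleanup(thread_id: int, steps) -> None:
    # Mirror of the _try_delete_sandbox shape: every step gets its own
    # try/except, so one failed step cannot block the remaining cleanup.
    for name, step in steps:
        try:
            step(thread_id)
        except Exception:
            logger.warning("%s failed for thread %s", name, thread_id, exc_info=True)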
@@ -87,7 +87,7 @@ async def download_sandbox_file(
    # Fall back to live sandbox download
    try:
        sandbox = await get_or_create_sandbox(thread_id)
        raw_sandbox = sandbox._sandbox  # noqa: SLF001
        raw_sandbox = sandbox._sandbox
        content: bytes = await asyncio.to_thread(raw_sandbox.fs.download_file, path)
    except Exception as exc:
        logger.warning("Sandbox file download failed for %s: %s", path, exc)
@@ -877,7 +877,9 @@ async def _stream_agent_events(
    output_text = om.group(1) if om else ""
    thread_id_str = config.get("configurable", {}).get("thread_id", "")

    for sf_match in re.finditer(r"^SANDBOX_FILE:\s*(.+)$", output_text, re.MULTILINE):
    for sf_match in re.finditer(
        r"^SANDBOX_FILE:\s*(.+)$", output_text, re.MULTILINE
    ):
        fpath = sf_match.group(1).strip()
        if fpath and fpath not in result.sandbox_files:
            result.sandbox_files.append(fpath)

@@ -963,7 +965,10 @@ def _try_persist_and_delete_sandbox(
    sandbox_files: list[str],
) -> None:
    """Fire-and-forget: persist sandbox files locally then delete the sandbox."""
    from app.agents.new_chat.sandbox import is_sandbox_enabled, persist_and_delete_sandbox
    from app.agents.new_chat.sandbox import (
        is_sandbox_enabled,
        persist_and_delete_sandbox,
    )

    if not is_sandbox_enabled():
        return
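The re-wrapped re.finditer call scans the agent's output for SANDBOX_FILE: lines and collects each path once. A small standalone demonstration of that regex (the sample output text is invented):

import re

output_text = "done\nSANDBOX_FILE: /workspace/report.csv\nSANDBOX_FILE: /workspace/plot.png\n"

sandbox_files: list[str] = []
for sf_match in re.finditer(r"^SANDBOX_FILE:\s*(.+)$", output_text, re.MULTILINE):
    fpath = sf_match.group(1).strip()
    if fpath and fpath not in sandbox_files:
        sandbox_files.append(fpath)

print(sandbox_files)  # ['/workspace/report.csv', '/workspace/plot.png']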
@@ -1886,11 +1886,11 @@ async def process_file_in_background_with_document(
    await task_logger.log_task_success(
        log_entry,
        f"Successfully processed file: {filename}",
        {
        {
            "document_id": doc_id,
            "content_hash": content_hash,
            "file_type": etl_service,
        },
        },
    )

    return document
@@ -179,11 +179,7 @@ addopts = "-v --tb=short -x --strict-markers -ra --durations=5"
markers = [
    "unit: pure logic tests, no DB or external services",
    "integration: tests that require a real PostgreSQL database",
    "document: document upload and processing tests",
    "connector: connector indexing tests",
    "chat: chat and agent tests",
    "page_limit: page limit enforcement tests",
    "upload_limit: file upload limit validation tests",
    "e2e: tests requiring a running backend and real HTTP calls"
]
filterwarnings = [
    "ignore::UserWarning:chonkie",
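The module-level pytestmark hunks later in this commit appear to swap marks such as pytest.mark.document for pytest.mark.e2e, which lines up with this change to the registered marker list. Markers registered here are selected on the command line with -m, for example:

pytest -m unit
pytest -m integration
pytest -m "not e2e"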
@@ -46,6 +46,7 @@ def make_connector_document():
    Generic factory for unit tests. Overridden in tests/integration/conftest.py
    with real DB-backed IDs for integration tests.
    """

    def _make(**overrides):
        defaults = {
            "title": "Test Document",

@@ -58,4 +59,5 @@ def make_connector_document():
        }
        defaults.update(overrides)
        return ConnectorDocument(**defaults)

    return _make
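In a test, the fixture yields the inner _make callable, so field overrides are passed per call. A hedged usage sketch; the field names are limited to those visible in the ConnectorDocument hunk of this commit, and the test itself is invented:

import pytest

pytestmark = pytest.mark.unit


def test_title_override(make_connector_document):
    # Any keyword argument replaces the corresponding default before the
    # ConnectorDocument model is constructed.
    doc = make_connector_document(title="Custom Title", unique_id="id-123")
    assert doc.title == "Custom Title"
    assert doc.unique_id == "id-123"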
|
@ -18,7 +18,6 @@ from tests.utils.helpers import (
|
|||
get_search_space_id,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Backend connectivity fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ from tests.utils.helpers import (
|
|||
upload_multiple_files,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.document
|
||||
pytestmark = pytest.mark.e2e
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers local to this module
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ from tests.utils.helpers import (
|
|||
upload_file,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.page_limit
|
||||
pytestmark = pytest.mark.e2e
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ import io
|
|||
import httpx
|
||||
import pytest
|
||||
|
||||
pytestmark = pytest.mark.upload_limit
|
||||
pytestmark = pytest.mark.e2e
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
import os
|
||||
import uuid
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
|
@ -9,14 +8,21 @@ from sqlalchemy import text
|
|||
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
|
||||
from sqlalchemy.pool import NullPool
|
||||
|
||||
from app.db import Base, SearchSpace, SearchSourceConnector, SearchSourceConnectorType
|
||||
from app.db import User
|
||||
from app.db import DocumentType
|
||||
from app.db import (
|
||||
Base,
|
||||
DocumentType,
|
||||
SearchSourceConnector,
|
||||
SearchSourceConnectorType,
|
||||
SearchSpace,
|
||||
User,
|
||||
)
|
||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||
|
||||
_EMBEDDING_DIM = 1024 # must match the Vector() dimension used in DB column creation
|
||||
|
||||
_DEFAULT_TEST_DB = "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test"
|
||||
_DEFAULT_TEST_DB = (
|
||||
"postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test"
|
||||
)
|
||||
TEST_DATABASE_URL = os.environ.get("TEST_DATABASE_URL", _DEFAULT_TEST_DB)
|
||||
|
||||
|
||||
|
|
@ -80,7 +86,9 @@ async def db_user(db_session: AsyncSession) -> User:
|
|||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def db_connector(db_session: AsyncSession, db_user: User, db_search_space: "SearchSpace") -> SearchSourceConnector:
|
||||
async def db_connector(
|
||||
db_session: AsyncSession, db_user: User, db_search_space: "SearchSpace"
|
||||
) -> SearchSourceConnector:
|
||||
connector = SearchSourceConnector(
|
||||
name="Test Connector",
|
||||
connector_type=SearchSourceConnectorType.CLICKUP_CONNECTOR,
|
||||
|
|
@ -147,6 +155,7 @@ def patched_chunk_text(monkeypatch) -> MagicMock:
|
|||
@pytest.fixture
|
||||
def make_connector_document(db_connector, db_user):
|
||||
"""Integration-scoped override: uses real DB connector and user IDs."""
|
||||
|
||||
def _make(**overrides):
|
||||
defaults = {
|
||||
"title": "Test Document",
|
||||
|
|
@ -159,6 +168,5 @@ def make_connector_document(db_connector, db_user):
|
|||
}
|
||||
defaults.update(overrides)
|
||||
return ConnectorDocument(**defaults)
|
||||
|
||||
return _make
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,9 @@ from app.indexing_pipeline.adapters.file_upload_adapter import index_uploaded_fi
|
|||
pytestmark = pytest.mark.integration
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
|
||||
"""Document status is READY after successful indexing."""
|
||||
await index_uploaded_file(
|
||||
|
|
@ -28,7 +30,9 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
|
|||
assert DocumentStatus.is_state(document.status, DocumentStatus.READY)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
|
||||
"""Document content is set to the LLM-generated summary."""
|
||||
await index_uploaded_file(
|
||||
|
|
@ -49,7 +53,9 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
|
|||
assert document.content == "Mocked summary."
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker):
|
||||
"""Chunks derived from the source markdown are persisted in the DB."""
|
||||
await index_uploaded_file(
|
||||
|
|
@ -76,7 +82,9 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker
|
|||
assert chunks[0].content == "Test chunk content."
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize_raises", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, mocker):
|
||||
"""RuntimeError is raised when the indexing step fails so the caller can fire a failure notification."""
|
||||
with pytest.raises(RuntimeError):
|
||||
|
|
|
|||
|
|
@ -7,9 +7,14 @@ from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineServ
|
|||
pytestmark = pytest.mark.integration
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_sets_status_ready(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
mocker,
|
||||
):
|
||||
"""Document status is READY after successful indexing."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
|
|
@ -21,15 +26,22 @@ async def test_sets_status_ready(
|
|||
|
||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_content_is_summary_when_should_summarize_true(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
mocker,
|
||||
):
|
||||
"""Document content is set to the LLM-generated summary when should_summarize=True."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
|
|
@ -41,15 +53,21 @@ async def test_content_is_summary_when_should_summarize_true(
|
|||
|
||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert reloaded.content == "Mocked summary."
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_content_is_source_markdown_when_should_summarize_false(
|
||||
db_session, db_search_space, make_connector_document,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
):
|
||||
"""Document content is set to source_markdown verbatim when should_summarize=False."""
|
||||
connector_doc = make_connector_document(
|
||||
|
|
@ -65,15 +83,22 @@ async def test_content_is_source_markdown_when_should_summarize_false(
|
|||
|
||||
await service.index(document, connector_doc, llm=None)
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert reloaded.content == "## Raw content"
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_chunks_written_to_db(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
mocker,
|
||||
):
|
||||
"""Chunks derived from source_markdown are persisted in the DB."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
|
|
@ -94,9 +119,14 @@ async def test_chunks_written_to_db(
|
|||
assert chunks[0].content == "Test chunk content."
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_embedding_written_to_db(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
mocker,
|
||||
):
|
||||
"""Document embedding vector is persisted in the DB after indexing."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
|
|
@ -108,16 +138,23 @@ async def test_embedding_written_to_db(
|
|||
|
||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert reloaded.embedding is not None
|
||||
assert len(reloaded.embedding) == 1024
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_updated_at_advances_after_indexing(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
mocker,
|
||||
):
|
||||
"""updated_at timestamp is later after indexing than it was at prepare time."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
|
|
@ -127,20 +164,28 @@ async def test_updated_at_advances_after_indexing(
|
|||
document = prepared[0]
|
||||
document_id = document.id
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
updated_at_pending = result.scalars().first().updated_at
|
||||
|
||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
updated_at_ready = result.scalars().first().updated_at
|
||||
|
||||
assert updated_at_ready > updated_at_pending
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_no_llm_falls_back_to_source_markdown(
|
||||
db_session, db_search_space, make_connector_document,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
):
|
||||
"""When llm=None and no fallback_summary, content falls back to source_markdown."""
|
||||
connector_doc = make_connector_document(
|
||||
|
|
@ -156,16 +201,22 @@ async def test_no_llm_falls_back_to_source_markdown(
|
|||
|
||||
await service.index(document, connector_doc, llm=None)
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
|
||||
assert reloaded.content == "## Fallback content"
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_fallback_summary_used_when_llm_unavailable(
|
||||
db_session, db_search_space, make_connector_document,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
):
|
||||
"""fallback_summary is used as content when llm=None and should_summarize=True."""
|
||||
connector_doc = make_connector_document(
|
||||
|
|
@ -181,16 +232,23 @@ async def test_fallback_summary_used_when_llm_unavailable(
|
|||
|
||||
await service.index(prepared[0], connector_doc, llm=None)
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
|
||||
assert reloaded.content == "Short pre-built summary."
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_reindex_replaces_old_chunks(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
mocker,
|
||||
):
|
||||
"""Re-indexing a document replaces its old chunks rather than appending."""
|
||||
connector_doc = make_connector_document(
|
||||
|
|
@ -220,9 +278,14 @@ async def test_reindex_replaces_old_chunks(
|
|||
assert len(chunks) == 1
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize_raises", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_llm_error_sets_status_failed(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
mocker,
|
||||
):
|
||||
"""Document status is FAILED when the LLM raises during indexing."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
|
|
@ -234,15 +297,22 @@ async def test_llm_error_sets_status_failed(
|
|||
|
||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.FAILED)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize_raises", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_llm_error_leaves_no_partial_data(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
mocker,
|
||||
):
|
||||
"""A failed indexing attempt leaves no partial embedding or chunks in the DB."""
|
||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
|
|
@ -254,7 +324,9 @@ async def test_llm_error_leaves_no_partial_data(
|
|||
|
||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert reloaded.embedding is None
|
||||
|
|
|
|||
|
|
@ -2,7 +2,9 @@ import pytest
|
|||
from sqlalchemy import select
|
||||
|
||||
from app.db import Document, DocumentStatus
|
||||
from app.indexing_pipeline.document_hashing import compute_content_hash as real_compute_content_hash
|
||||
from app.indexing_pipeline.document_hashing import (
|
||||
compute_content_hash as real_compute_content_hash,
|
||||
)
|
||||
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
||||
|
||||
pytestmark = pytest.mark.integration
|
||||
|
|
@ -20,7 +22,9 @@ async def test_new_document_is_persisted_with_pending_status(
|
|||
assert len(results) == 1
|
||||
document_id = results[0].id
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert reloaded is not None
|
||||
|
|
@ -28,9 +32,14 @@ async def test_new_document_is_persisted_with_pending_status(
|
|||
assert reloaded.source_markdown == doc.source_markdown
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_unchanged_ready_document_is_skipped(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
mocker,
|
||||
):
|
||||
"""A READY document with unchanged content is not returned for re-indexing."""
|
||||
doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
|
|
@ -46,24 +55,35 @@ async def test_unchanged_ready_document_is_skipped(
|
|||
assert results == []
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_title_only_change_updates_title_in_db(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
mocker,
|
||||
):
|
||||
"""A title-only change updates the DB title without re-queuing the document."""
|
||||
original = make_connector_document(search_space_id=db_search_space.id, title="Original Title")
|
||||
original = make_connector_document(
|
||||
search_space_id=db_search_space.id, title="Original Title"
|
||||
)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
prepared = await service.prepare_for_indexing([original])
|
||||
document_id = prepared[0].id
|
||||
await service.index(prepared[0], original, llm=mocker.Mock())
|
||||
|
||||
renamed = make_connector_document(search_space_id=db_search_space.id, title="Updated Title")
|
||||
renamed = make_connector_document(
|
||||
search_space_id=db_search_space.id, title="Updated Title"
|
||||
)
|
||||
results = await service.prepare_for_indexing([renamed])
|
||||
|
||||
assert results == []
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert reloaded.title == "Updated Title"
|
||||
|
|
@ -73,19 +93,25 @@ async def test_changed_content_is_returned_for_reprocessing(
|
|||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""A document with changed content is returned for re-indexing with updated markdown."""
|
||||
original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1")
|
||||
original = make_connector_document(
|
||||
search_space_id=db_search_space.id, source_markdown="## v1"
|
||||
)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
first = await service.prepare_for_indexing([original])
|
||||
original_id = first[0].id
|
||||
|
||||
updated = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v2")
|
||||
updated = make_connector_document(
|
||||
search_space_id=db_search_space.id, source_markdown="## v2"
|
||||
)
|
||||
results = await service.prepare_for_indexing([updated])
|
||||
|
||||
assert len(results) == 1
|
||||
assert results[0].id == original_id
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == original_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == original_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert reloaded.source_markdown == "## v2"
|
||||
|
|
@ -97,9 +123,24 @@ async def test_all_documents_in_batch_are_persisted(
|
|||
):
|
||||
"""All documents in a batch are persisted and returned."""
|
||||
docs = [
|
||||
make_connector_document(search_space_id=db_search_space.id, unique_id="id-1", title="Doc 1", source_markdown="## Content 1"),
|
||||
make_connector_document(search_space_id=db_search_space.id, unique_id="id-2", title="Doc 2", source_markdown="## Content 2"),
|
||||
make_connector_document(search_space_id=db_search_space.id, unique_id="id-3", title="Doc 3", source_markdown="## Content 3"),
|
||||
make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
unique_id="id-1",
|
||||
title="Doc 1",
|
||||
source_markdown="## Content 1",
|
||||
),
|
||||
make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
unique_id="id-2",
|
||||
title="Doc 2",
|
||||
source_markdown="## Content 2",
|
||||
),
|
||||
make_connector_document(
|
||||
search_space_id=db_search_space.id,
|
||||
unique_id="id-3",
|
||||
title="Doc 3",
|
||||
source_markdown="## Content 3",
|
||||
),
|
||||
]
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
|
|
@ -107,7 +148,9 @@ async def test_all_documents_in_batch_are_persisted(
|
|||
|
||||
assert len(results) == 3
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.search_space_id == db_search_space.id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.search_space_id == db_search_space.id)
|
||||
)
|
||||
rows = result.scalars().all()
|
||||
|
||||
assert len(rows) == 3
|
||||
|
|
@ -124,7 +167,9 @@ async def test_duplicate_in_batch_is_persisted_once(
|
|||
|
||||
assert len(results) == 1
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.search_space_id == db_search_space.id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.search_space_id == db_search_space.id)
|
||||
)
|
||||
rows = result.scalars().all()
|
||||
|
||||
assert len(rows) == 1
|
||||
|
|
@ -143,7 +188,9 @@ async def test_created_by_id_is_persisted(
|
|||
results = await service.prepare_for_indexing([doc])
|
||||
document_id = results[0].id
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert str(reloaded.created_by_id) == str(db_user.id)
|
||||
|
|
@ -170,7 +217,9 @@ async def test_metadata_is_updated_when_content_changes(
|
|||
)
|
||||
await service.prepare_for_indexing([updated])
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert reloaded.document_metadata == {"status": "done"}
|
||||
|
|
@ -180,19 +229,27 @@ async def test_updated_at_advances_when_title_only_changes(
|
|||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""updated_at advances even when only the title changes."""
|
||||
original = make_connector_document(search_space_id=db_search_space.id, title="Old Title")
|
||||
original = make_connector_document(
|
||||
search_space_id=db_search_space.id, title="Old Title"
|
||||
)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
first = await service.prepare_for_indexing([original])
|
||||
document_id = first[0].id
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
updated_at_v1 = result.scalars().first().updated_at
|
||||
|
||||
renamed = make_connector_document(search_space_id=db_search_space.id, title="New Title")
|
||||
renamed = make_connector_document(
|
||||
search_space_id=db_search_space.id, title="New Title"
|
||||
)
|
||||
await service.prepare_for_indexing([renamed])
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
updated_at_v2 = result.scalars().first().updated_at
|
||||
|
||||
assert updated_at_v2 > updated_at_v1
|
||||
|
|
@ -202,19 +259,27 @@ async def test_updated_at_advances_when_content_changes(
|
|||
db_session, db_search_space, make_connector_document
|
||||
):
|
||||
"""updated_at advances when document content changes."""
|
||||
original = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v1")
|
||||
original = make_connector_document(
|
||||
search_space_id=db_search_space.id, source_markdown="## v1"
|
||||
)
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
||||
first = await service.prepare_for_indexing([original])
|
||||
document_id = first[0].id
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
updated_at_v1 = result.scalars().first().updated_at
|
||||
|
||||
updated = make_connector_document(search_space_id=db_search_space.id, source_markdown="## v2")
|
||||
updated = make_connector_document(
|
||||
search_space_id=db_search_space.id, source_markdown="## v2"
|
||||
)
|
||||
await service.prepare_for_indexing([updated])
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
updated_at_v2 = result.scalars().first().updated_at
|
||||
|
||||
assert updated_at_v2 > updated_at_v1
|
||||
|
|
@ -273,9 +338,14 @@ async def test_same_content_from_different_source_is_skipped(
|
|||
assert len(result.scalars().all()) == 1
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize_raises", "patched_embed_text", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize_raises", "patched_embed_text", "patched_chunk_text"
|
||||
)
|
||||
async def test_failed_document_with_unchanged_content_is_requeued(
|
||||
db_session, db_search_space, make_connector_document, mocker,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
mocker,
|
||||
):
|
||||
"""A FAILED document with unchanged content is re-queued as PENDING on the next run."""
|
||||
doc = make_connector_document(search_space_id=db_search_space.id)
|
||||
|
|
@ -286,8 +356,12 @@ async def test_failed_document_with_unchanged_content_is_requeued(
|
|||
document_id = prepared[0].id
|
||||
await service.index(prepared[0], doc, llm=mocker.Mock())
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
assert DocumentStatus.is_state(result.scalars().first().status, DocumentStatus.FAILED)
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
assert DocumentStatus.is_state(
|
||||
result.scalars().first().status, DocumentStatus.FAILED
|
||||
)
|
||||
|
||||
# Next run: same content, pipeline must re-queue the failed document
|
||||
results = await service.prepare_for_indexing([doc])
|
||||
|
|
@ -295,8 +369,12 @@ async def test_failed_document_with_unchanged_content_is_requeued(
|
|||
assert len(results) == 1
|
||||
assert results[0].id == document_id
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == document_id))
|
||||
assert DocumentStatus.is_state(result.scalars().first().status, DocumentStatus.PENDING)
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == document_id)
|
||||
)
|
||||
assert DocumentStatus.is_state(
|
||||
result.scalars().first().status, DocumentStatus.PENDING
|
||||
)
|
||||
|
||||
|
||||
async def test_title_and_content_change_updates_both_and_returns_document(
|
||||
|
|
@ -323,16 +401,20 @@ async def test_title_and_content_change_updates_both_and_returns_document(
|
|||
assert len(results) == 1
|
||||
assert results[0].id == original_id
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == original_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == original_id)
|
||||
)
|
||||
reloaded = result.scalars().first()
|
||||
|
||||
assert reloaded.title == "Updated Title"
|
||||
assert reloaded.source_markdown == "## v2"
|
||||
|
||||
|
||||
|
||||
async def test_one_bad_document_in_batch_does_not_prevent_others_from_being_persisted(
|
||||
db_session, db_search_space, make_connector_document, monkeypatch,
|
||||
db_session,
|
||||
db_search_space,
|
||||
make_connector_document,
|
||||
monkeypatch,
|
||||
):
|
||||
"""
|
||||
A per-document error during prepare_for_indexing must be isolated.
|
||||
|
|
@ -374,4 +456,4 @@ async def test_one_bad_document_in_batch_does_not_prevent_others_from_being_pers
|
|||
result = await db_session.execute(
|
||||
select(Document).filter(Document.search_space_id == db_search_space.id)
|
||||
)
|
||||
assert len(result.scalars().all()) == 2
|
||||
assert len(result.scalars().all()) == 2
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import pytest
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def patched_summarizer_chain(monkeypatch):
|
||||
|
|
@ -21,7 +22,9 @@ def patched_summarizer_chain(monkeypatch):
|
|||
def patched_chunker_instance(monkeypatch):
|
||||
mock = MagicMock()
|
||||
mock.chunk.return_value = [MagicMock(text="prose chunk")]
|
||||
monkeypatch.setattr("app.indexing_pipeline.document_chunker.config.chunker_instance", mock)
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.document_chunker.config.chunker_instance", mock
|
||||
)
|
||||
return mock
|
||||
|
||||
|
||||
|
|
@ -29,5 +32,7 @@ def patched_chunker_instance(monkeypatch):
|
|||
def patched_code_chunker_instance(monkeypatch):
|
||||
mock = MagicMock()
|
||||
mock.chunk.return_value = [MagicMock(text="code chunk")]
|
||||
monkeypatch.setattr("app.indexing_pipeline.document_chunker.config.code_chunker_instance", mock)
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.document_chunker.config.code_chunker_instance", mock
|
||||
)
|
||||
return mock
|
||||
|
|
|
|||
|
|
@ -1,7 +1,10 @@
|
|||
import pytest
|
||||
|
||||
from app.db import DocumentType
|
||||
from app.indexing_pipeline.document_hashing import compute_content_hash, compute_unique_identifier_hash
|
||||
from app.indexing_pipeline.document_hashing import (
|
||||
compute_content_hash,
|
||||
compute_unique_identifier_hash,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
|
@ -10,21 +13,31 @@ def test_different_unique_id_produces_different_hash(make_connector_document):
|
|||
"""Two documents with different unique_ids produce different identifier hashes."""
|
||||
doc_a = make_connector_document(unique_id="id-001")
|
||||
doc_b = make_connector_document(unique_id="id-002")
|
||||
assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b)
|
||||
assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(
|
||||
doc_b
|
||||
)
|
||||
|
||||
|
||||
def test_different_search_space_produces_different_identifier_hash(make_connector_document):
|
||||
def test_different_search_space_produces_different_identifier_hash(
|
||||
make_connector_document,
|
||||
):
|
||||
"""Same document in different search spaces produces different identifier hashes."""
|
||||
doc_a = make_connector_document(search_space_id=1)
|
||||
doc_b = make_connector_document(search_space_id=2)
|
||||
assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b)
|
||||
assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(
|
||||
doc_b
|
||||
)
|
||||
|
||||
|
||||
def test_different_document_type_produces_different_identifier_hash(make_connector_document):
|
||||
def test_different_document_type_produces_different_identifier_hash(
|
||||
make_connector_document,
|
||||
):
|
||||
"""Same unique_id with different document types produces different identifier hashes."""
|
||||
doc_a = make_connector_document(document_type=DocumentType.CLICKUP_CONNECTOR)
|
||||
doc_b = make_connector_document(document_type=DocumentType.NOTION_CONNECTOR)
|
||||
assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b)
|
||||
assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(
|
||||
doc_b
|
||||
)
|
||||
|
||||
|
||||
def test_same_content_same_space_produces_same_content_hash(make_connector_document):
|
||||
|
|
@ -34,7 +47,9 @@ def test_same_content_same_space_produces_same_content_hash(make_connector_docum
|
|||
assert compute_content_hash(doc_a) == compute_content_hash(doc_b)
|
||||
|
||||
|
||||
def test_same_content_different_space_produces_different_content_hash(make_connector_document):
|
||||
def test_same_content_different_space_produces_different_content_hash(
|
||||
make_connector_document,
|
||||
):
|
||||
"""Identical content in different search spaces produces different content hashes."""
|
||||
doc_a = make_connector_document(source_markdown="Hello world", search_space_id=1)
|
||||
doc_b = make_connector_document(source_markdown="Hello world", search_space_id=2)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from app.indexing_pipeline.document_summarizer import summarize_document
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
|
@ -38,5 +39,3 @@ async def test_with_metadata_omits_empty_fields_from_output():
|
|||
|
||||
assert "Alice" in result
|
||||
assert "description" not in result.lower()
|
||||
|
||||
|
||||
|
|
|
|||