Merge remote-tracking branch 'upstream/dev' into feat/api-key

This commit is contained in:
Anish Sarkar 2026-06-23 13:09:53 +05:30
commit 3695e1d5c5
64 changed files with 1043 additions and 1852 deletions

View file

@ -1,80 +0,0 @@
"""NOTE writes must carry the same char spans as the indexing pipeline.
``_create_document`` / ``_update_document`` are the cloud agent's KB write
paths. They must chunk through the shared span chunker so every persisted
chunk resolves back to an exact slice of ``source_markdown`` for citations.
"""
from __future__ import annotations
import pytest
from sqlalchemy import select
from app.agents.chat.multi_agent_chat.main_agent.middleware.kb_persistence import (
middleware as kb,
)
from app.db import Chunk
# Apply the integration and asyncio marks to every test in this module.
pytestmark = [pytest.mark.integration, pytest.mark.asyncio]
# Three-paragraph note body used for the initial write.
_BODY = "Intro paragraph.\n\nBody paragraph with detail.\n\nOutro paragraph."
# Replacement three-paragraph body written by the update test.
_NEW_BODY = "Rewritten intro.\n\nFresh body content.\n\nNew closing line."
async def _ordered_chunks(session, doc_id: int) -> list[Chunk]:
    """Load every chunk of document ``doc_id``, sorted by ``Chunk.position``."""
    query = select(Chunk).where(Chunk.document_id == doc_id)
    query = query.order_by(Chunk.position)
    result = await session.execute(query)
    return list(result.scalars().all())
def _assert_spans_resolve(source_markdown: str, chunks: list[Chunk]) -> None:
    """Assert that every chunk's char span slices ``source_markdown`` to its content.

    Fails when ``chunks`` is empty, when a chunk is missing either span
    bound, or when the addressed slice differs from ``chunk.content``.
    """
    assert chunks
    for piece in chunks:
        begin = piece.start_char
        assert begin is not None
        finish = piece.end_char
        assert finish is not None
        assert source_markdown[begin:finish] == piece.content
@pytest.mark.usefixtures("patched_embed_texts")
async def test_note_create_populates_chunk_spans(
    db_session, db_search_space, db_user
) -> None:
    """A newly created note persists chunks whose spans slice its stored body."""
    create_kwargs = {
        "virtual_path": "/documents/note.md",
        "content": _BODY,
        "search_space_id": db_search_space.id,
        "created_by_id": str(db_user.id),
    }
    note = await kb._create_document(db_session, **create_kwargs)
    await db_session.flush()

    persisted = await _ordered_chunks(db_session, note.id)
    _assert_spans_resolve(note.source_markdown, persisted)
@pytest.mark.usefixtures("patched_embed_texts")
async def test_note_update_refreshes_chunk_spans(
    db_session, db_search_space, db_user
) -> None:
    """After an update, the note's chunks carry spans that slice the new body."""
    note_path = "/documents/note.md"

    original = await kb._create_document(
        db_session,
        virtual_path=note_path,
        content=_BODY,
        search_space_id=db_search_space.id,
        created_by_id=str(db_user.id),
    )
    await db_session.flush()

    updated = await kb._update_document(
        db_session,
        doc_id=original.id,
        content=_NEW_BODY,
        virtual_path=note_path,
        search_space_id=db_search_space.id,
    )
    await db_session.flush()
    assert updated is not None

    refreshed = await _ordered_chunks(db_session, updated.id)
    _assert_spans_resolve(updated.source_markdown, refreshed)

View file

@ -158,12 +158,13 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock:
@pytest.fixture
def patched_chunk_text(monkeypatch) -> MagicMock:
from app.indexing_pipeline.document_chunker import ChunkSlice
text = "Test chunk content."
mock = MagicMock(return_value=[ChunkSlice(text, 0, len(text))])
mock = MagicMock(return_value=["Test chunk content."])
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
mock,
)
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
mock,
)
return mock

View file

@ -286,12 +286,9 @@ def _mock_external_apis(monkeypatch):
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
)
from app.indexing_pipeline.document_chunker import ChunkSlice
chunk = "Test chunk content."
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
MagicMock(return_value=[ChunkSlice(chunk, 0, len(chunk))]),
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
MagicMock(return_value=["Test chunk content."]),
)

View file

@ -176,14 +176,9 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
@pytest.mark.usefixtures("patched_embed_texts")
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
"""Reindexing replaces old chunks with new content rather than appending."""
from app.indexing_pipeline.document_chunker import ChunkSlice
mocker.patch(
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
side_effect=[
[ChunkSlice("Original chunk.", 0, len("Original chunk."))],
[ChunkSlice("Updated chunk.", 0, len("Updated chunk."))],
],
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
side_effect=[["Original chunk."], ["Updated chunk."]],
)
adapter = UploadDocumentAdapter(db_session)

View file

@ -18,22 +18,16 @@ _V1 = "Intro paragraph.\n\nBody paragraph.\n\nOutro paragraph."
@pytest.fixture
def paragraph_chunker(monkeypatch):
"""One slice per markdown paragraph, so edits map to chunk-level diffs."""
from app.indexing_pipeline.document_chunker import ChunkSlice
"""One chunk per markdown paragraph, so edits map to chunk-level diffs."""
def _split(markdown, *_args, **_kwargs):
slices = []
cursor = 0
for para in markdown.split("\n\n"):
start = markdown.index(para, cursor)
cursor = start + len(para)
if para.strip():
slices.append(ChunkSlice(para, start, cursor))
return slices
def _split(markdown, **_kwargs):
return [p for p in markdown.split("\n\n") if p.strip()]
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
_split,
"app.indexing_pipeline.cache.cached_indexing.chunk_text", _split
)
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", _split
)

View file

@ -1,96 +0,0 @@
"""Indexing records char spans so a chunk addresses its exact slice of the body.
Uses the real chunker (only embeddings are faked) so the span/partition
invariants are exercised end to end.
"""
import pytest
from sqlalchemy import select
from app.db import Chunk, Document
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
# Mark every test in this module as an integration test.
pytestmark = pytest.mark.integration
# Markdown body made of a heading, a repeated intro sentence, a small table and
# a repeated closing sentence; the scratch-index test expects it to produce
# more than one chunk.
_BODY = (
"# Report\n\n"
+ "Intro paragraph that is reasonably long and descriptive. " * 8
+ "\n\n| col a | col b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |\n\n"
+ "Closing paragraph with a different shape and more words to chunk. " * 8
)
async def _ordered_chunks(session, document_id) -> list[Chunk]:
    """Load the chunks of ``document_id`` ordered by position, then by id."""
    query = select(Chunk).filter(Chunk.document_id == document_id)
    query = query.order_by(Chunk.position, Chunk.id)
    rows = await session.execute(query)
    return list(rows.scalars().all())
def _assert_spans_address_body(chunks: list[Chunk], body: str) -> None:
    """Assert each chunk's span slices ``body`` to its content and that the
    chunk contents, concatenated in the given order, rebuild ``body`` exactly."""
    for piece in chunks:
        begin, finish = piece.start_char, piece.end_char
        assert not (begin is None or finish is None)
        assert body[begin:finish] == piece.content
    reassembled = "".join(piece.content for piece in chunks)
    assert reassembled == body
async def _index(session, connector_doc) -> int:
    """Prepare and index ``connector_doc``; return the id of the first prepared document."""
    pipeline = IndexingPipelineService(session=session)
    queued = await pipeline.prepare_for_indexing([connector_doc])
    target = queued[0]
    await pipeline.index(target, connector_doc)
    return target.id
async def _reload_body(session, document_id) -> str:
    """Return the stored ``source_markdown`` of the document with ``document_id``.

    Raises:
        AssertionError: if no document row with that id is found, so the
            test fails with a readable message instead of an
            ``AttributeError`` on ``None``.
    """
    result = await session.execute(select(Document).filter(Document.id == document_id))
    document = result.scalars().first()
    # ``first()`` yields None when the query matches no row.
    assert document is not None, f"document {document_id} not found"
    return document.source_markdown
@pytest.mark.usefixtures("patched_embed_texts")
async def test_scratch_index_records_spans_addressing_body(
    db_session, db_search_space, make_connector_document
):
    """Indexing a new document stores several chunks whose spans address its body."""
    document_id = await _index(
        db_session,
        make_connector_document(
            search_space_id=db_search_space.id, source_markdown=_BODY
        ),
    )

    stored_body = await _reload_body(db_session, document_id)
    stored_chunks = await _ordered_chunks(db_session, document_id)

    assert len(stored_chunks) > 1
    _assert_spans_address_body(stored_chunks, stored_body)
@pytest.mark.usefixtures("patched_embed_texts")
async def test_incremental_reindex_refreshes_shifted_spans(
    db_session, db_search_space, make_connector_document
):
    """Prepending text moves every later chunk's offsets; after the reindex all
    stored spans must address the edited body rather than the old offsets."""
    pipeline = IndexingPipelineService(session=db_session)

    # First pass: index the unmodified body.
    first_version = make_connector_document(
        search_space_id=db_search_space.id, source_markdown=_BODY
    )
    first_pass = await pipeline.prepare_for_indexing([first_version])
    document_id = first_pass[0].id
    await pipeline.index(first_pass[0], first_version)

    # Second pass: same document with new content inserted at the top.
    edited_body = "# Prepended heading\n\nA brand new opening paragraph.\n\n" + _BODY
    second_version = make_connector_document(
        search_space_id=db_search_space.id, source_markdown=edited_body
    )
    second_pass = await pipeline.prepare_for_indexing([second_version])
    assert second_pass, "edited content should requeue the document"
    await pipeline.index(second_pass[0], second_version)

    stored_body = await _reload_body(db_session, document_id)
    stored_chunks = await _ordered_chunks(db_session, document_id)
    assert stored_body == edited_body
    _assert_spans_address_body(stored_chunks, stored_body)

View file

@ -40,19 +40,11 @@ def _make_document(
)
def _make_chunk(
*,
content: str,
document_id: int,
start_char: int | None = None,
end_char: int | None = None,
) -> Chunk:
def _make_chunk(*, content: str, document_id: int) -> Chunk:
return Chunk(
content=content,
document_id=document_id,
embedding=DUMMY_EMBEDDING,
start_char=start_char,
end_char=end_char,
)
@ -99,8 +91,6 @@ async def seed_large_doc(
_make_chunk(
content="quarterly performance review summary note content",
document_id=small_doc.id,
start_char=0,
end_char=10,
),
]

View file

@ -98,32 +98,6 @@ async def test_chunks_ordered_by_id(db_session, seed_large_doc):
assert chunk_ids == sorted(chunk_ids), "Chunks not ordered by ID"
async def test_chunk_spans_returned(db_session, seed_large_doc):
    """Every returned chunk dict exposes start_char/end_char, and the seeded
    small document's first chunk reports the span it was created with."""
    space_id = seed_large_doc["search_space"].id
    small_doc_id = seed_large_doc["small_doc"].id

    retriever = ChucksHybridSearchRetriever(db_session)
    hits = await retriever.hybrid_search(
        query_text="quarterly performance review summary",
        top_k=10,
        search_space_id=space_id,
        query_embedding=DUMMY_EMBEDDING,
    )

    found_small_doc = False
    for hit in hits:
        for chunk in hit["chunks"]:
            assert "start_char" in chunk
            assert "end_char" in chunk
        if hit["document"].get("id") != small_doc_id:
            continue
        first_chunk = hit["chunks"][0]
        assert first_chunk["start_char"] == 0
        assert first_chunk["end_char"] == 10
        found_small_doc = True
        break

    if not found_small_doc:
        pytest.fail("Small doc not found in search results")
async def test_score_is_positive_float(db_session, seed_large_doc):
"""Each result should have a positive float score from RRF."""
space_id = seed_large_doc["search_space"].id

View file

@ -1,127 +0,0 @@
"""Phase E.1 contract: the by-chunk resolve API exposes chunk char spans and
derives the cited chunk's line range from source_markdown."""
import pytest
import pytest_asyncio
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Chunk, Document, DocumentStatus, DocumentType, SearchSpace, User
# Mark every test in this module as an integration test.
pytestmark = pytest.mark.integration
# Four one-word lines; the tests below place chunk char spans over this body
# and check the line range reported for the cited chunk.
_BODY = "alpha\nbravo\ncharlie\ndelta"
async def _make_document(
    session: AsyncSession,
    search_space: SearchSpace,
    user: User,
    *,
    source_markdown: str = _BODY,
) -> Document:
    """Add a ready FILE document whose content and source_markdown are both
    ``source_markdown``, flush the session, and return the document."""
    fields = {
        "title": "Doc",
        "document_type": DocumentType.FILE,
        "document_metadata": {},
        "content": source_markdown,
        "content_hash": "hash-by-chunk",
        "source_markdown": source_markdown,
        "search_space_id": search_space.id,
        "created_by_id": user.id,
        "status": DocumentStatus.ready(),
    }
    document = Document(**fields)
    session.add(document)
    await session.flush()
    return document
async def _add_chunk(
    session: AsyncSession,
    document: Document,
    *,
    content: str,
    position: int,
    start_char: int | None,
    end_char: int | None,
) -> Chunk:
    """Add one chunk with the given content, position and char span to
    ``document``, flush the session, and return the chunk."""
    piece = Chunk(
        document_id=document.id,
        position=position,
        content=content,
        start_char=start_char,
        end_char=end_char,
    )
    session.add(piece)
    await session.flush()
    return piece
@pytest_asyncio.fixture
async def make_document(db_session, db_search_space, db_user):
    """Provide an async factory that builds documents bound to the test
    session, search space and user; keyword overrides go to ``_make_document``."""

    async def _factory(**overrides):
        document = await _make_document(
            db_session, db_search_space, db_user, **overrides
        )
        return document

    return _factory
async def test_cited_line_range_derived_from_spans(
    db_session, db_search_space, db_user, make_document
):
    """A cited chunk spanning char 12 to the end of the body reports lines 3-4."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()
    # Leading chunk covers the first two lines, including their newlines.
    await _add_chunk(
        db_session,
        document,
        content="alpha\nbravo\n",
        position=0,
        start_char=0,
        end_char=12,
    )
    cited_chunk = await _add_chunk(
        db_session,
        document,
        content="charlie\ndelta",
        position=1,
        start_char=12,
        end_char=len(_BODY),
    )

    response = await get_document_by_chunk_id(
        cited_chunk.id, chunk_window=5, session=db_session, user=db_user
    )

    assert response.cited_start_line == 3
    assert response.cited_end_line == 4
async def test_chunk_spans_exposed_in_response(
    db_session, db_search_space, db_user, make_document
):
    """The response's entry for the cited chunk carries its stored char span."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()
    cited_chunk = await _add_chunk(
        db_session,
        document,
        content="alpha\nbravo\n",
        position=0,
        start_char=0,
        end_char=12,
    )

    response = await get_document_by_chunk_id(
        cited_chunk.id, chunk_window=5, session=db_session, user=db_user
    )

    matching = (entry for entry in response.chunks if entry.id == cited_chunk.id)
    returned = next(matching)
    assert returned.start_char == 0
    assert returned.end_char == 12
async def test_cited_line_range_null_without_spans(
    db_session, db_search_space, db_user, make_document
):
    """A cited chunk stored without a char span yields no cited line range."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()
    spanless_chunk = await _add_chunk(
        db_session,
        document,
        content="alpha",
        position=0,
        start_char=None,
        end_char=None,
    )

    response = await get_document_by_chunk_id(
        spanless_chunk.id, chunk_window=5, session=db_session, user=db_user
    )

    assert response.cited_start_line is None
    assert response.cited_end_line is None

View file

@ -1,175 +0,0 @@
"""Phase A contract: editor read paths serve source_markdown and never
reconstruct or mutate the body from chunks."""
import pytest
import pytest_asyncio
from fastapi import HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import (
Chunk,
Document,
DocumentStatus,
DocumentType,
SearchSpace,
User,
)
# Mark every test in this module as an integration test.
pytestmark = pytest.mark.integration
async def _make_document(
    session: AsyncSession,
    search_space: SearchSpace,
    user: User,
    *,
    document_type: DocumentType = DocumentType.FILE,
    source_markdown: str | None = "# Title\n\nBody line.",
    content: str = "Body line.",
    status: dict | None = None,
) -> Document:
    """Add a document with the given type, body and status, flush the session,
    and return it. A falsy ``status`` falls back to ``DocumentStatus.ready()``."""
    effective_status = status if status else DocumentStatus.ready()
    document = Document(
        title="Doc",
        document_type=document_type,
        document_metadata={},
        content=content,
        content_hash="hash-001",
        source_markdown=source_markdown,
        search_space_id=search_space.id,
        created_by_id=user.id,
        status=effective_status,
    )
    session.add(document)
    await session.flush()
    return document
async def _add_chunks(session: AsyncSession, document: Document, texts: list[str]):
    """Attach one chunk per entry of ``texts`` to ``document``, using the list
    index as position, then flush the session once."""
    for index, body in enumerate(texts):
        piece = Chunk(content=body, position=index, document_id=document.id)
        session.add(piece)
    await session.flush()
@pytest_asyncio.fixture
async def make_document(db_session, db_search_space, db_user):
    """Provide an async factory that builds documents bound to the test
    session, search space and user; keyword overrides go to ``_make_document``."""

    async def _factory(**overrides):
        document = await _make_document(
            db_session, db_search_space, db_user, **overrides
        )
        return document

    return _factory
class TestGetEditorContent:
    """Checks on ``get_editor_content``: what it returns and which HTTP errors it raises."""

    async def test_returns_source_markdown_verbatim(
        self, db_session, db_search_space, db_user, make_document
    ):
        """The stored source_markdown comes back unchanged."""
        from app.routes.editor_routes import get_editor_content

        canonical = "# Real\n\nCanonical body."
        document = await make_document(source_markdown=canonical)

        payload = await get_editor_content(
            db_search_space.id, document.id, session=db_session, user=db_user
        )

        assert payload["source_markdown"] == canonical

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A ready document lacking source_markdown is rejected with 400 and
        its body is left unset even though chunks exist."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )
        assert raised.value.status_code == 400

        # The failed read must not have written a body back onto the row.
        await db_session.refresh(document)
        assert document.source_markdown is None

    async def test_processing_document_without_body_returns_409(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A still-processing document with no body yields 409."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(
            source_markdown=None, status=DocumentStatus.processing()
        )

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )
        assert raised.value.status_code == 409

    async def test_failed_document_without_body_returns_422(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A failed document with no body yields 422."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(
            source_markdown=None, status=DocumentStatus.failed("boom")
        )

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )
        assert raised.value.status_code == 422

    async def test_empty_note_initializes_to_empty_markdown(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A NOTE without source_markdown is served as an empty string."""
        from app.routes.editor_routes import get_editor_content

        note = await make_document(
            document_type=DocumentType.NOTE, source_markdown=None
        )

        payload = await get_editor_content(
            db_search_space.id, note.id, session=db_session, user=db_user
        )

        assert payload["source_markdown"] == ""
class TestDownloadMarkdown:
    """Checks on ``download_document_markdown`` for documents lacking a body."""

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """Downloading a document without source_markdown fails with 400 even
        though chunks exist."""
        from app.routes.editor_routes import download_document_markdown

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        with pytest.raises(HTTPException) as raised:
            await download_document_markdown(
                db_search_space.id, document.id, session=db_session, user=db_user
            )
        assert raised.value.status_code == 400
class TestExportDocument:
    """Checks on ``export_document`` for documents lacking a body."""

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A plain-format export of a document without source_markdown fails
        with 400 even though chunks exist."""
        from app.routes.editor_routes import export_document
        from app.routes.reports_routes import ExportFormat

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        with pytest.raises(HTTPException) as raised:
            await export_document(
                db_search_space.id,
                document.id,
                format=ExportFormat.PLAIN,
                session=db_session,
                user=db_user,
            )
        assert raised.value.status_code == 400