mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-07-04 22:02:16 +02:00
Merge remote-tracking branch 'upstream/dev' into feat/api-key
This commit is contained in:
commit
fd31ac34fd
61 changed files with 1984 additions and 435 deletions
|
|
@ -0,0 +1,80 @@
|
|||
"""NOTE writes must carry the same char spans as the indexing pipeline.
|
||||
|
||||
``_create_document`` / ``_update_document`` are the cloud agent's KB write
|
||||
paths. They must chunk through the shared span chunker so every persisted
|
||||
chunk resolves back to an exact slice of ``source_markdown`` for citations.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.agents.chat.multi_agent_chat.main_agent.middleware.kb_persistence import (
|
||||
middleware as kb,
|
||||
)
|
||||
from app.db import Chunk
|
||||
|
||||
pytestmark = [pytest.mark.integration, pytest.mark.asyncio]
|
||||
|
||||
_BODY = "Intro paragraph.\n\nBody paragraph with detail.\n\nOutro paragraph."
|
||||
_NEW_BODY = "Rewritten intro.\n\nFresh body content.\n\nNew closing line."
|
||||
|
||||
|
||||
async def _ordered_chunks(session, doc_id: int) -> list[Chunk]:
    """Return the chunks of document ``doc_id`` ordered by ``Chunk.position``."""
    stmt = select(Chunk).where(Chunk.document_id == doc_id)
    stmt = stmt.order_by(Chunk.position)
    result = await session.execute(stmt)
    return list(result.scalars().all())
|
||||
|
||||
|
||||
def _assert_spans_resolve(source_markdown: str, chunks: list[Chunk]) -> None:
|
||||
assert chunks
|
||||
for chunk in chunks:
|
||||
assert chunk.start_char is not None
|
||||
assert chunk.end_char is not None
|
||||
assert source_markdown[chunk.start_char : chunk.end_char] == chunk.content
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_embed_texts")
async def test_note_create_populates_chunk_spans(
    db_session, db_search_space, db_user
) -> None:
    """Creating a NOTE persists chunks whose spans slice ``source_markdown``."""
    create_kwargs = {
        "virtual_path": "/documents/note.md",
        "content": _BODY,
        "search_space_id": db_search_space.id,
        "created_by_id": str(db_user.id),
    }
    created = await kb._create_document(db_session, **create_kwargs)
    await db_session.flush()

    persisted = await _ordered_chunks(db_session, created.id)
    _assert_spans_resolve(created.source_markdown, persisted)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_embed_texts")
async def test_note_update_refreshes_chunk_spans(
    db_session, db_search_space, db_user
) -> None:
    """Updating a NOTE body leaves chunks whose spans slice the updated body."""
    original = await kb._create_document(
        db_session,
        virtual_path="/documents/note.md",
        content=_BODY,
        search_space_id=db_search_space.id,
        created_by_id=str(db_user.id),
    )
    await db_session.flush()

    rewritten = await kb._update_document(
        db_session,
        doc_id=original.id,
        content=_NEW_BODY,
        virtual_path="/documents/note.md",
        search_space_id=db_search_space.id,
    )
    await db_session.flush()

    # The update helper's result must exist before its chunks are inspected.
    assert rewritten is not None
    persisted = await _ordered_chunks(db_session, rewritten.id)
    _assert_spans_resolve(rewritten.source_markdown, persisted)
|
||||
|
|
@ -158,13 +158,12 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock:
|
|||
|
||||
@pytest.fixture
|
||||
def patched_chunk_text(monkeypatch) -> MagicMock:
|
||||
mock = MagicMock(return_value=["Test chunk content."])
|
||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
||||
|
||||
text = "Test chunk content."
|
||||
mock = MagicMock(return_value=[ChunkSlice(text, 0, len(text))])
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
|
||||
mock,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
||||
mock,
|
||||
)
|
||||
return mock
|
||||
|
|
|
|||
|
|
@ -286,9 +286,12 @@ def _mock_external_apis(monkeypatch):
|
|||
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
||||
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
|
||||
)
|
||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
||||
|
||||
chunk = "Test chunk content."
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
|
||||
MagicMock(return_value=["Test chunk content."]),
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
||||
MagicMock(return_value=[ChunkSlice(chunk, 0, len(chunk))]),
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -176,9 +176,14 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
|
|||
@pytest.mark.usefixtures("patched_embed_texts")
|
||||
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
|
||||
"""Reindexing replaces old chunks with new content rather than appending."""
|
||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
||||
|
||||
mocker.patch(
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||
side_effect=[["Original chunk."], ["Updated chunk."]],
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
||||
side_effect=[
|
||||
[ChunkSlice("Original chunk.", 0, len("Original chunk."))],
|
||||
[ChunkSlice("Updated chunk.", 0, len("Updated chunk."))],
|
||||
],
|
||||
)
|
||||
|
||||
adapter = UploadDocumentAdapter(db_session)
|
||||
|
|
|
|||
|
|
@ -18,16 +18,22 @@ _V1 = "Intro paragraph.\n\nBody paragraph.\n\nOutro paragraph."
|
|||
|
||||
@pytest.fixture
|
||||
def paragraph_chunker(monkeypatch):
|
||||
"""One chunk per markdown paragraph, so edits map to chunk-level diffs."""
|
||||
"""One slice per markdown paragraph, so edits map to chunk-level diffs."""
|
||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
||||
|
||||
def _split(markdown, **_kwargs):
|
||||
return [p for p in markdown.split("\n\n") if p.strip()]
|
||||
def _split(markdown, *_args, **_kwargs):
|
||||
slices = []
|
||||
cursor = 0
|
||||
for para in markdown.split("\n\n"):
|
||||
start = markdown.index(para, cursor)
|
||||
cursor = start + len(para)
|
||||
if para.strip():
|
||||
slices.append(ChunkSlice(para, start, cursor))
|
||||
return slices
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text", _split
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", _split
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
||||
_split,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,96 @@
|
|||
"""Indexing records char spans so a chunk addresses its exact slice of the body.
|
||||
|
||||
Uses the real chunker (only embeddings are faked) so the span/partition
|
||||
invariants are exercised end to end.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.db import Chunk, Document
|
||||
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
||||
|
||||
pytestmark = pytest.mark.integration
|
||||
|
||||
_BODY = (
|
||||
"# Report\n\n"
|
||||
+ "Intro paragraph that is reasonably long and descriptive. " * 8
|
||||
+ "\n\n| col a | col b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |\n\n"
|
||||
+ "Closing paragraph with a different shape and more words to chunk. " * 8
|
||||
)
|
||||
|
||||
|
||||
async def _ordered_chunks(session, document_id) -> list[Chunk]:
    """Return a document's chunks ordered by ``position`` and then ``id``."""
    stmt = select(Chunk).filter(Chunk.document_id == document_id)
    stmt = stmt.order_by(Chunk.position, Chunk.id)
    rows = await session.execute(stmt)
    return list(rows.scalars().all())
|
||||
|
||||
|
||||
def _assert_spans_address_body(chunks: list[Chunk], body: str) -> None:
|
||||
for chunk in chunks:
|
||||
assert chunk.start_char is not None and chunk.end_char is not None
|
||||
assert body[chunk.start_char : chunk.end_char] == chunk.content
|
||||
assert "".join(c.content for c in chunks) == body
|
||||
|
||||
|
||||
async def _index(session, connector_doc) -> int:
    """Prepare and index ``connector_doc``, returning the prepared document's id."""
    pipeline = IndexingPipelineService(session=session)
    queued = await pipeline.prepare_for_indexing([connector_doc])
    target = queued[0]
    await pipeline.index(target, connector_doc)
    return target.id
|
||||
|
||||
|
||||
async def _reload_body(session, document_id) -> str:
    """Query the ``Document`` by id and return its ``source_markdown``."""
    stmt = select(Document).filter(Document.id == document_id)
    rows = await session.execute(stmt)
    document = rows.scalars().first()
    return document.source_markdown
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_embed_texts")
async def test_scratch_index_records_spans_addressing_body(
    db_session, db_search_space, make_connector_document
):
    """A first-time index yields multiple chunks whose spans tile the stored body."""
    source = make_connector_document(
        search_space_id=db_search_space.id, source_markdown=_BODY
    )
    doc_id = await _index(db_session, source)

    stored_body = await _reload_body(db_session, doc_id)
    stored_chunks = await _ordered_chunks(db_session, doc_id)

    assert len(stored_chunks) > 1
    _assert_spans_address_body(stored_chunks, stored_body)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_embed_texts")
async def test_incremental_reindex_refreshes_shifted_spans(
    db_session, db_search_space, make_connector_document
):
    """Prepending text shifts later spans; reindexing must refresh them.

    Rows kept across the reindex must address the edited body, not the
    offsets they had before the insertion.
    """
    pipeline = IndexingPipelineService(session=db_session)

    # First pass: index the unmodified body.
    first_version = make_connector_document(
        search_space_id=db_search_space.id, source_markdown=_BODY
    )
    first_batch = await pipeline.prepare_for_indexing([first_version])
    document_id = first_batch[0].id
    await pipeline.index(first_batch[0], first_version)

    # Second pass: same document with new text inserted at the very top.
    edited_body = "# Prepended heading\n\nA brand new opening paragraph.\n\n" + _BODY
    second_version = make_connector_document(
        search_space_id=db_search_space.id, source_markdown=edited_body
    )
    second_batch = await pipeline.prepare_for_indexing([second_version])
    assert second_batch, "edited content should requeue the document"
    await pipeline.index(second_batch[0], second_version)

    stored_body = await _reload_body(db_session, document_id)
    stored_chunks = await _ordered_chunks(db_session, document_id)

    assert stored_body == edited_body
    _assert_spans_address_body(stored_chunks, stored_body)
|
||||
|
|
@ -40,11 +40,19 @@ def _make_document(
|
|||
)
|
||||
|
||||
|
||||
def _make_chunk(*, content: str, document_id: int) -> Chunk:
|
||||
def _make_chunk(
|
||||
*,
|
||||
content: str,
|
||||
document_id: int,
|
||||
start_char: int | None = None,
|
||||
end_char: int | None = None,
|
||||
) -> Chunk:
|
||||
return Chunk(
|
||||
content=content,
|
||||
document_id=document_id,
|
||||
embedding=DUMMY_EMBEDDING,
|
||||
start_char=start_char,
|
||||
end_char=end_char,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -91,6 +99,8 @@ async def seed_large_doc(
|
|||
_make_chunk(
|
||||
content="quarterly performance review summary note content",
|
||||
document_id=small_doc.id,
|
||||
start_char=0,
|
||||
end_char=10,
|
||||
),
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@ -98,6 +98,32 @@ async def test_chunks_ordered_by_id(db_session, seed_large_doc):
|
|||
assert chunk_ids == sorted(chunk_ids), "Chunks not ordered by ID"
|
||||
|
||||
|
||||
async def test_chunk_spans_returned(db_session, seed_large_doc):
    """Each chunk dict carries start_char/end_char (the citation span)."""
    space_id = seed_large_doc["search_space"].id
    small_doc_id = seed_large_doc["small_doc"].id

    retriever = ChucksHybridSearchRetriever(db_session)
    hits = await retriever.hybrid_search(
        query_text="quarterly performance review summary",
        top_k=10,
        search_space_id=space_id,
        query_embedding=DUMMY_EMBEDDING,
    )

    # Scan hits in order, checking span keys, and stop at the small document.
    small_hit = None
    for hit in hits:
        for entry in hit["chunks"]:
            assert "start_char" in entry
            assert "end_char" in entry
        if hit["document"].get("id") == small_doc_id:
            small_hit = hit
            break

    if small_hit is None:
        pytest.fail("Small doc not found in search results")

    seeded = small_hit["chunks"][0]
    assert seeded["start_char"] == 0
    assert seeded["end_char"] == 10
|
||||
|
||||
|
||||
async def test_score_is_positive_float(db_session, seed_large_doc):
|
||||
"""Each result should have a positive float score from RRF."""
|
||||
space_id = seed_large_doc["search_space"].id
|
||||
|
|
|
|||
|
|
@ -0,0 +1,127 @@
|
|||
"""Phase E.1 contract: the by-chunk resolve API exposes chunk char spans and
|
||||
derives the cited chunk's line range from source_markdown."""
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import Chunk, Document, DocumentStatus, DocumentType, SearchSpace, User
|
||||
|
||||
pytestmark = pytest.mark.integration
|
||||
|
||||
_BODY = "alpha\nbravo\ncharlie\ndelta"
|
||||
|
||||
|
||||
async def _make_document(
    session: AsyncSession,
    search_space: SearchSpace,
    user: User,
    *,
    source_markdown: str = _BODY,
) -> Document:
    """Add and flush a ready FILE document using ``source_markdown`` as its body.

    The same text is stored as both ``content`` and ``source_markdown``.
    """
    fields = {
        "title": "Doc",
        "document_type": DocumentType.FILE,
        "document_metadata": {},
        "content": source_markdown,
        "content_hash": "hash-by-chunk",
        "source_markdown": source_markdown,
        "search_space_id": search_space.id,
        "created_by_id": user.id,
        "status": DocumentStatus.ready(),
    }
    document = Document(**fields)
    session.add(document)
    await session.flush()
    return document
|
||||
|
||||
|
||||
async def _add_chunk(
    session: AsyncSession,
    document: Document,
    *,
    content: str,
    position: int,
    start_char: int | None,
    end_char: int | None,
) -> Chunk:
    """Add and flush one chunk for ``document`` with the given position and span."""
    row = Chunk(
        content=content,
        position=position,
        document_id=document.id,
        start_char=start_char,
        end_char=end_char,
    )
    session.add(row)
    await session.flush()
    return row
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def make_document(db_session, db_search_space, db_user):
    """Provide an async factory that forwards keyword overrides to ``_make_document``."""

    async def _factory(**overrides):
        document = await _make_document(
            db_session, db_search_space, db_user, **overrides
        )
        return document

    return _factory
|
||||
|
||||
|
||||
async def test_cited_line_range_derived_from_spans(
    db_session, db_search_space, db_user, make_document
):
    """A cited chunk spanning chars 12..end of the body reports lines 3 to 4."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()
    await _add_chunk(
        db_session,
        document,
        content="alpha\nbravo\n",
        position=0,
        start_char=0,
        end_char=12,
    )
    target = await _add_chunk(
        db_session,
        document,
        content="charlie\ndelta",
        position=1,
        start_char=12,
        end_char=len(_BODY),
    )

    response = await get_document_by_chunk_id(
        target.id, chunk_window=5, session=db_session, user=db_user
    )

    assert response.cited_start_line == 3
    assert response.cited_end_line == 4
|
||||
|
||||
|
||||
async def test_chunk_spans_exposed_in_response(
    db_session, db_search_space, db_user, make_document
):
    """The response entry for the cited chunk carries its stored char span."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()
    target = await _add_chunk(
        db_session,
        document,
        content="alpha\nbravo\n",
        position=0,
        start_char=0,
        end_char=12,
    )

    response = await get_document_by_chunk_id(
        target.id, chunk_window=5, session=db_session, user=db_user
    )

    # ``next`` without a default raises if the cited chunk is absent.
    matching = (entry for entry in response.chunks if entry.id == target.id)
    returned = next(matching)
    assert returned.start_char == 0
    assert returned.end_char == 12
|
||||
|
||||
|
||||
async def test_cited_line_range_null_without_spans(
    db_session, db_search_space, db_user, make_document
):
    """A cited chunk stored without a span yields ``None`` for both cited lines."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()
    target = await _add_chunk(
        db_session,
        document,
        content="alpha",
        position=0,
        start_char=None,
        end_char=None,
    )

    response = await get_document_by_chunk_id(
        target.id, chunk_window=5, session=db_session, user=db_user
    )

    assert response.cited_start_line is None
    assert response.cited_end_line is None
|
||||
175
surfsense_backend/tests/integration/test_editor_routes.py
Normal file
175
surfsense_backend/tests/integration/test_editor_routes.py
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
"""Phase A contract: editor read paths serve source_markdown and never
|
||||
reconstruct or mutate the body from chunks."""
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import (
|
||||
Chunk,
|
||||
Document,
|
||||
DocumentStatus,
|
||||
DocumentType,
|
||||
SearchSpace,
|
||||
User,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.integration
|
||||
|
||||
|
||||
async def _make_document(
    session: AsyncSession,
    search_space: SearchSpace,
    user: User,
    *,
    document_type: DocumentType = DocumentType.FILE,
    source_markdown: str | None = "# Title\n\nBody line.",
    content: str = "Body line.",
    status: dict | None = None,
) -> Document:
    """Add and flush a document; a falsy ``status`` falls back to ``ready()``."""
    effective_status = status or DocumentStatus.ready()
    fields = {
        "title": "Doc",
        "document_type": document_type,
        "document_metadata": {},
        "content": content,
        "content_hash": "hash-001",
        "source_markdown": source_markdown,
        "search_space_id": search_space.id,
        "created_by_id": user.id,
        "status": effective_status,
    }
    document = Document(**fields)
    session.add(document)
    await session.flush()
    return document
|
||||
|
||||
|
||||
async def _add_chunks(session: AsyncSession, document: Document, texts: list[str]):
    """Add one chunk per text, positioned by list index, then flush once."""
    position = 0
    for body in texts:
        row = Chunk(content=body, position=position, document_id=document.id)
        session.add(row)
        position += 1
    await session.flush()
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def make_document(db_session, db_search_space, db_user):
    """Provide an async factory that forwards keyword overrides to ``_make_document``."""

    async def _factory(**overrides):
        document = await _make_document(
            db_session, db_search_space, db_user, **overrides
        )
        return document

    return _factory
|
||||
|
||||
|
||||
class TestGetEditorContent:
    """Contract tests for ``get_editor_content``."""

    async def test_returns_source_markdown_verbatim(
        self, db_session, db_search_space, db_user, make_document
    ):
        """The stored ``source_markdown`` is returned unchanged."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(source_markdown="# Real\n\nCanonical body.")

        payload = await get_editor_content(
            db_search_space.id, document.id, session=db_session, user=db_user
        )

        assert payload["source_markdown"] == "# Real\n\nCanonical body."

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A ready document without source_markdown must not be rebuilt from chunks."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 400
        # The read path must not have written a body back onto the row.
        await db_session.refresh(document)
        assert document.source_markdown is None

    async def test_processing_document_without_body_returns_409(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A processing document with no body is rejected with 409."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(
            source_markdown=None, status=DocumentStatus.processing()
        )

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 409

    async def test_failed_document_without_body_returns_422(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A failed document with no body is rejected with 422."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(
            source_markdown=None, status=DocumentStatus.failed("boom")
        )

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 422

    async def test_empty_note_initializes_to_empty_markdown(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A NOTE with no body is served as the empty string."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(
            document_type=DocumentType.NOTE, source_markdown=None
        )

        payload = await get_editor_content(
            db_search_space.id, document.id, session=db_session, user=db_user
        )

        assert payload["source_markdown"] == ""
|
||||
|
||||
|
||||
class TestDownloadMarkdown:
    """Contract tests for ``download_document_markdown``."""

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A document with chunks but no source_markdown is rejected with 400."""
        from app.routes.editor_routes import download_document_markdown

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        with pytest.raises(HTTPException) as raised:
            await download_document_markdown(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 400
|
||||
|
||||
|
||||
class TestExportDocument:
    """Contract tests for ``export_document``."""

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A plain export of a document with no source_markdown is rejected with 400."""
        from app.routes.editor_routes import export_document
        from app.routes.reports_routes import ExportFormat

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        export_kwargs = {
            "format": ExportFormat.PLAIN,
            "session": db_session,
            "user": db_user,
        }
        with pytest.raises(HTTPException) as raised:
            await export_document(db_search_space.id, document.id, **export_kwargs)

        assert raised.value.status_code == 400
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
"""Unit tests for search_knowledge_base hit rendering.
|
||||
|
||||
The tool must surface the passage that actually matched (the RRF-ranked
|
||||
chunk), not the top of the document, and annotate it with its line range
|
||||
when the chunk carries a char span.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import (
|
||||
_format_hits,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
_BODY = "Intro paragraph.\n\nMatched passage here.\n\nClosing paragraph."
|
||||
|
||||
|
||||
def _hit() -> dict:
    """Build a search hit for document 7 whose only matched chunk is the middle one.

    The hit lists two chunks (the intro, id 101, and the matched passage,
    id 102) with char spans computed against ``_BODY``; ``matched_chunk_ids``
    names only chunk 102.
    """
    intro = "Intro paragraph."
    matched = "Matched passage here."
    matched_start = _BODY.index(matched)
    return {
        "document": {"id": 7, "title": "note.md", "document_type": "NOTE"},
        "score": 0.42,
        # The original wrapped this in a replace("\n\n", "\n\n"), which maps a
        # string onto itself; the body is used directly instead.
        "content": _BODY,
        "matched_chunk_ids": [102],
        "chunks": [
            {
                "chunk_id": 101,
                "content": intro,
                "start_char": 0,
                "end_char": len(intro),
            },
            {
                "chunk_id": 102,
                "content": matched,
                "start_char": matched_start,
                "end_char": matched_start + len(matched),
            },
        ],
    }
|
||||
|
||||
|
||||
def test_renders_matched_passage_not_top_of_doc() -> None:
    """The rendered hit shows the matched chunk and leaves out the intro chunk."""
    rendered = _format_hits(
        [_hit()],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )
    assert "Matched passage here." in rendered
    # The intro chunk was not matched, so it must not be shown as the snippet.
    assert "Intro paragraph." not in rendered
|
||||
|
||||
|
||||
def test_emits_copyable_line_citation_token_when_spans_present() -> None:
    """A hit whose matched chunk has a span renders a line-range citation token."""
    rendered = _format_hits(
        [_hit()],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )
    # "Matched passage here." sits on line 3 of the body; the hit must surface
    # a ready-to-copy token so the agent can cite without a separate read.
    assert "[citation:d7#L3-3]" in rendered
|
||||
|
||||
|
||||
def test_header_includes_document_id() -> None:
    """The rendered output contains the document id as ``id=7``."""
    rendered = _format_hits(
        [_hit()],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )
    assert "id=7" in rendered
|
||||
|
||||
|
||||
def test_omits_citation_token_when_spans_absent() -> None:
    """With every span set to ``None`` the passage renders without a citation token."""
    spanless = _hit()
    for entry in spanless["chunks"]:
        entry.update(start_char=None, end_char=None)
    rendered = _format_hits(
        [spanless],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )
    assert "Matched passage here." in rendered
    # No concrete, copyable token for this document without spans (the closing
    # instruction's placeholder template doesn't count).
    assert "[citation:d7#L" not in rendered
|
||||
|
||||
|
||||
def test_falls_back_to_content_when_no_matched_ids() -> None:
    """With no matched chunk ids, the intro text appears in the rendered output."""
    unmatched = _hit()
    unmatched["matched_chunk_ids"] = []
    rendered = _format_hits(
        [unmatched],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )
    assert "Intro paragraph." in rendered
|
||||
|
||||
|
||||
def test_no_results_message() -> None:
    """An empty hit list renders the no-matches message."""
    rendered = _format_hits([], paths={}, bodies={}, query="missing")
    assert "No knowledge-base matches" in rendered
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
"""Span-aware chunking contract: slices form a lossless, contiguous partition
|
||||
of the markdown, and every slice's char span addresses its own text."""
|
||||
|
||||
import pytest
|
||||
|
||||
from app.indexing_pipeline.document_chunker import chunk_markdown_with_spans
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def _assert_lossless_partition(md: str, slices) -> None:
|
||||
assert "".join(s.text for s in slices) == md
|
||||
|
||||
cursor = 0
|
||||
for s in slices:
|
||||
assert s.start_char == cursor, "slices must be contiguous"
|
||||
assert s.end_char >= s.start_char
|
||||
assert md[s.start_char : s.end_char] == s.text, "span must address slice text"
|
||||
cursor = s.end_char
|
||||
assert cursor == len(md)
|
||||
|
||||
|
||||
def test_prose_partition_and_spans():
    """Long prose splits into more than one slice that still tiles the input."""
    heading = "# Title\n\n"
    first_section = "First paragraph with several words here. " * 20
    second_section = (
        "\n\nSecond section with more prose to force multiple chunks. " * 20
    )
    md = heading + first_section + second_section

    pieces = chunk_markdown_with_spans(md)

    assert len(pieces) > 1
    _assert_lossless_partition(md, pieces)
|
||||
|
||||
|
||||
def test_table_kept_whole_with_exact_span():
    """Every slice that starts with a table row holds both the header and data rows."""
    table = "| a | b |\n| - | - |\n| 1 | 2 |\n"
    md = f"Intro prose before the table.\n{table}\nClosing prose after."

    pieces = chunk_markdown_with_spans(md)

    _assert_lossless_partition(md, pieces)
    table_pieces = []
    for piece in pieces:
        if piece.text.lstrip().startswith("|"):
            table_pieces.append(piece)
    # At least one table slice must hold the data row ...
    assert any("| 1 | 2 |" in piece.text for piece in table_pieces)
    # ... and no table slice may hold only part of the table.
    for piece in table_pieces:
        assert "| a | b |" in piece.text and "| 1 | 2 |" in piece.text
|
||||
|
||||
|
||||
def test_table_at_eof_without_trailing_newline_stays_whole():
    """A table ending the input with no final newline lands in exactly one slice."""
    md = "Intro.\n| a | b |\n| - | - |\n| 1 | 2 |"

    pieces = chunk_markdown_with_spans(md)

    _assert_lossless_partition(md, pieces)
    holding_data_row = [piece for piece in pieces if "| 1 | 2 |" in piece.text]
    assert len(holding_data_row) == 1
    (only_piece,) = holding_data_row
    assert "| a | b |" in only_piece.text
|
||||
|
||||
|
||||
def test_code_chunker_partition_and_spans():
    """With ``use_code_chunker=True`` the slices still tile the source exactly."""
    functions = [
        f"def func_{i}(x):\n    total = x + {i}\n    return total" for i in range(40)
    ]
    code = "\n\n".join(functions)

    pieces = chunk_markdown_with_spans(code, use_code_chunker=True)

    assert len(pieces) >= 1
    _assert_lossless_partition(code, pieces)
|
||||
|
||||
|
||||
def test_empty_markdown_yields_no_slices():
    """Chunking the empty string returns an empty list."""
    pieces = chunk_markdown_with_spans("")
    assert pieces == []
|
||||
|
|
@ -37,12 +37,9 @@ def _make_orm_doc(connector_doc, doc_id):
|
|||
async def test_index_calls_embed_and_chunk_via_to_thread(
|
||||
pipeline, make_connector_document, monkeypatch
|
||||
):
|
||||
"""index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop.
|
||||
"""index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop."""
|
||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
||||
|
||||
Routing between ``chunk_text`` (code path) and ``chunk_text_hybrid`` (default
|
||||
path, see issue #1334) is verified separately in
|
||||
``test_non_code_documents_use_hybrid_chunker``.
|
||||
"""
|
||||
to_thread_calls = []
|
||||
original_to_thread = asyncio.to_thread
|
||||
|
||||
|
|
@ -51,11 +48,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
|||
return await original_to_thread(func, *args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
|
||||
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
|
||||
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
|
||||
mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
|
||||
mock_chunker.__name__ = "chunk_markdown_with_spans"
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||
mock_chunk_hybrid,
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
||||
mock_chunker,
|
||||
)
|
||||
mock_embed = MagicMock(
|
||||
side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
|
||||
|
|
@ -90,34 +87,25 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
|||
|
||||
await pipeline.index(document, connector_doc)
|
||||
|
||||
# Either chunker entry point satisfies the "chunking runs off the event
|
||||
# loop" contract this test guards. Routing between the two is verified
|
||||
# in test_non_code_documents_use_hybrid_chunker.
|
||||
assert {"chunk_text", "chunk_text_hybrid"} & set(to_thread_calls)
|
||||
assert "chunk_markdown_with_spans" in to_thread_calls
|
||||
assert "embed_texts" in to_thread_calls
|
||||
assert document.status == DocumentStatus.ready()
|
||||
|
||||
|
||||
async def test_non_code_documents_use_hybrid_chunker(
|
||||
async def test_non_code_documents_use_prose_chunker(
|
||||
pipeline, make_connector_document, monkeypatch
|
||||
):
|
||||
"""Non-code documents route through ``chunk_text_hybrid`` (issue #1334).
|
||||
"""Non-code documents chunk with use_code_chunker=False (issue #1334).
|
||||
|
||||
The hybrid chunker preserves Markdown table integrity by avoiding splits
|
||||
mid-row. Only documents flagged with ``should_use_code_chunker=True``
|
||||
should take the ``chunk_text`` path.
|
||||
The table-aware prose path keeps Markdown tables intact; only documents
|
||||
flagged with ``should_use_code_chunker=True`` request the code chunker.
|
||||
"""
|
||||
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
|
||||
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
|
||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
||||
|
||||
mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||
mock_chunk_hybrid,
|
||||
)
|
||||
mock_chunk_code = MagicMock(return_value=["chunk1"])
|
||||
mock_chunk_code.__name__ = "chunk_text"
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
|
||||
mock_chunk_code,
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
||||
mock_chunker,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
||||
|
|
@ -149,8 +137,49 @@ async def test_non_code_documents_use_hybrid_chunker(
|
|||
|
||||
await pipeline.index(document, connector_doc)
|
||||
|
||||
mock_chunk_hybrid.assert_called_once()
|
||||
mock_chunk_code.assert_not_called()
|
||||
mock_chunker.assert_called_once()
|
||||
assert mock_chunker.call_args.args[1] is False
|
||||
|
||||
|
||||
async def test_code_documents_request_code_chunker(
    pipeline, make_connector_document, monkeypatch
):
    """A code-flagged document reaches the chunker with use_code_chunker=True.

    The span chunker, the embedder, the existing-chunk loader and the
    persistence step are all replaced with stubs, so the only thing observed
    is the second positional argument passed to ``chunk_markdown_with_spans``.
    """
    from app.indexing_pipeline.document_chunker import ChunkSlice

    def _fake_embed(texts):
        # One constant vector per input text.
        return [[0.1] * _EMBEDDING_DIM for _ in texts]

    async def _noop_persist(_session, doc, *_args, **_kwargs):
        # Stand-in for persistence: only flips the document to "ready".
        doc.status = DocumentStatus.ready()

    span_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
    monkeypatch.setattr(
        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
        span_chunker,
    )
    monkeypatch.setattr(
        "app.indexing_pipeline.cache.cached_indexing.embed_texts",
        MagicMock(side_effect=_fake_embed),
    )
    monkeypatch.setattr(pipeline, "_load_existing_chunks", AsyncMock(return_value=[]))
    monkeypatch.setattr(
        "app.indexing_pipeline.indexing_pipeline_service.persist_scratch_index",
        _noop_persist,
    )

    orm_document = MagicMock(spec=Document)
    orm_document.id = 1
    orm_document.status = DocumentStatus.pending()
    code_connector_doc = make_connector_document(
        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
        unique_id="repo-1",
        search_space_id=1,
        should_use_code_chunker=True,
    )

    await pipeline.index(orm_document, code_connector_doc)

    span_chunker.assert_called_once()
    positional_args = span_chunker.call_args.args
    assert positional_args[1] is True
|
||||
|
||||
|
||||
def _mock_session_factory(orm_docs_by_id):
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ class _KBBackendStub(KBPostgresBackend):
|
|||
def __init__(self, *, children=None, file_data=None) -> None:
|
||||
self.als_info = AsyncMock(return_value=children or [])
|
||||
self._load_file_data = AsyncMock(
|
||||
return_value=(file_data, 17) if file_data is not None else None
|
||||
return_value=(file_data, 17, None) if file_data is not None else None
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -69,13 +69,25 @@ class _FakeSession:
|
|||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
"""Avoid loading the embedding model in unit tests."""
|
||||
"""Avoid loading the embedding model in unit tests.
|
||||
|
||||
Mirrors the legacy stub: one chunk spanning the whole content, with a
|
||||
zero summary/chunk vector, routed through the shared span builder.
|
||||
"""
|
||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
||||
|
||||
async def _fake_build_chunk_embeddings(content: str, *, use_code_chunker: bool):
|
||||
summary = np.zeros(8, dtype=np.float32)
|
||||
pairs = (
|
||||
[(ChunkSlice(content, 0, len(content)), np.zeros(8, dtype=np.float32))]
|
||||
if content
|
||||
else []
|
||||
)
|
||||
return summary, pairs
|
||||
|
||||
monkeypatch.setattr(
|
||||
kb_persistence,
|
||||
"embed_texts",
|
||||
lambda texts: [np.zeros(8, dtype=np.float32) for _ in texts],
|
||||
kb_persistence, "build_chunk_embeddings", _fake_build_chunk_embeddings
|
||||
)
|
||||
monkeypatch.setattr(kb_persistence, "chunk_text", lambda content: [content])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
|
|
|||
|
|
@ -0,0 +1,92 @@
|
|||
"""Unit tests for the numbered-document read preamble."""
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
|
||||
build_read_preamble,
|
||||
compute_matched_line_ranges,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
_BODY = "alpha\nbravo\ncharlie\ndelta"
|
||||
|
||||
|
||||
class TestComputeMatchedLineRanges:
    """Checks for ``compute_matched_line_ranges`` against the four-line ``_BODY``.

    Chunks are passed as 3-tuples; presumably (chunk_id, start_char, end_char)
    -- confirm against the helper's signature.
    """

    def test_maps_matched_chunk_spans_to_line_ranges(self):
        """Only the matched chunk contributes, reported as a line range."""
        # Offset 12 is where "charlie" starts, so chunk 2 covers lines 3-4.
        first_half = (1, 0, 12)
        second_half = (2, 12, len(_BODY))
        result = compute_matched_line_ranges(_BODY, [first_half, second_half], {2})
        assert result == [(3, 4)]

    def test_includes_only_matched_chunks(self):
        """A chunk whose id is not in the matched set is ignored."""
        result = compute_matched_line_ranges(_BODY, [(1, 0, 5), (2, 6, 11)], {1})
        assert result == [(1, 1)]

    def test_skips_chunks_without_spans(self):
        """A matched chunk with ``None`` offsets yields no range."""
        spanless = [(1, None, None)]
        assert compute_matched_line_ranges(_BODY, spanless, {1}) == []

    def test_sorted_and_deduplicated(self):
        """Ranges come back ordered, with identical ranges collapsed."""
        tail = (1, 12, len(_BODY))
        head = (2, 0, 5)
        head_duplicate = (3, 0, 5)
        result = compute_matched_line_ranges(
            _BODY, [tail, head, head_duplicate], {1, 2, 3}
        )
        assert result == [(1, 1), (3, 4)]
|
||||
|
||||
|
||||
class TestBuildReadPreamble:
    """Checks on the text returned by ``build_read_preamble``."""

    def test_contains_document_metadata(self):
        """Id, type, title and URL all appear in the rendered preamble."""
        rendered = build_read_preamble(
            document_id=42,
            document_type="FILE",
            title="Test Doc",
            url="https://example.com",
            matched_line_ranges=[],
        )
        expected_fragments = (
            "<document_id>42</document_id>",
            "<document_type>FILE</document_type>",
            "Test Doc",
            "https://example.com",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_citation_hint_uses_document_id(self):
        """The citation hint embeds the document id."""
        rendered = build_read_preamble(
            document_id=42,
            document_type="FILE",
            title="Test Doc",
            url="",
            matched_line_ranges=[],
        )
        assert "[citation:d42#L" in rendered

    def test_lists_matched_line_ranges(self):
        """Matched ranges are listed inside a ``<matched_lines>`` block."""
        rendered = build_read_preamble(
            document_id=7,
            document_type="NOTE",
            title="Notes",
            url="",
            matched_line_ranges=[(12, 18), (40, 40)],
        )
        for fragment in ("<matched_lines>", "12-18", "40"):
            assert fragment in rendered

    def test_omits_matched_lines_block_when_empty(self):
        """With no matched ranges the ``<matched_lines>`` block is absent."""
        rendered = build_read_preamble(
            document_id=7,
            document_type="NOTE",
            title="Notes",
            url="",
            matched_line_ranges=[],
        )
        assert "<matched_lines>" not in rendered

    def test_ends_with_trailing_newline_so_body_follows_cleanly(self):
        """The preamble ends with a newline."""
        rendered = build_read_preamble(
            document_id=1,
            document_type="FILE",
            title="t",
            url="",
            matched_line_ranges=[],
        )
        assert rendered.endswith("\n")
|
||||
|
|
@ -0,0 +1,154 @@
|
|||
"""Contracts for chat LLM construction in streaming flows.
|
||||
|
||||
``stream_new_chat`` / ``stream_resume_chat`` depend on LangChain receiving
|
||||
token chunks from ``ChatLiteLLM``. ``langchain-litellm`` defaults
|
||||
``streaming`` to ``False``, so the shared bundle loader must opt in
|
||||
explicitly for both DB-backed and global model paths.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from types import SimpleNamespace
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
import app.tasks.chat.streaming.flows.shared.llm_bundle as llm_bundle
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
class _CapturedChatLiteLLM:
|
||||
calls: list[dict[str, Any]] = []
|
||||
|
||||
def __init__(self, **kwargs: Any) -> None:
|
||||
self.kwargs = kwargs
|
||||
self.__class__.calls.append(kwargs)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _patch_common_bundle_dependencies(monkeypatch: pytest.MonkeyPatch):
    """Stub the bundle loader's collaborators so only LLM construction is observed.

    Clears the constructor-call log, then replaces the search-space loader,
    the chat LLM class, the usage-metadata registration and the capability
    check on ``llm_bundle``.
    """

    async def _fake_search_space(_session: Any, _search_space_id: int) -> SimpleNamespace:
        return SimpleNamespace(id=42, user_id="user-1")

    def _skip_usage_metadata(**_kw: Any) -> None:
        return None

    def _chat_and_vision_only(_model, capability):
        return capability in {"chat", "vision"}

    # Start every test with an empty record of constructor calls.
    _CapturedChatLiteLLM.calls = []

    replacements = {
        "_load_search_space": _fake_search_space,
        "SanitizedChatLiteLLM": _CapturedChatLiteLLM,
        "register_model_usage_metadata": _skip_usage_metadata,
        "has_capability": _chat_and_vision_only,
    }
    for attribute, stub in replacements.items():
        monkeypatch.setattr(llm_bundle, attribute, stub)
|
||||
|
||||
|
||||
async def test_load_llm_bundle_enables_streaming_for_db_models(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A DB-backed model (positive ``config_id``) is built with ``streaming=True``."""
    db_connection = SimpleNamespace(
        provider="openai",
        api_key="sk-test",
        base_url=None,
        extra={"litellm_params": {"temperature": 0.1}},
    )
    db_model = SimpleNamespace(
        id=7,
        model_id="gpt-4o-mini",
        display_name="GPT 4o Mini",
        connection=db_connection,
    )

    async def _fake_db_model(_session: Any, *, model_id: int, search_space: Any) -> Any:
        # The loader must be asked for the requested model within the stubbed space.
        assert model_id == 7
        assert search_space.id == 42
        return db_model

    def _fake_to_litellm(_conn, _model_id):
        return "openai/gpt-4o-mini", {"api_key": "sk-test", "temperature": 0.1}

    monkeypatch.setattr(llm_bundle, "_load_db_model", _fake_db_model)
    monkeypatch.setattr(llm_bundle, "to_litellm", _fake_to_litellm)

    bundle = await llm_bundle.load_llm_bundle(
        object(),
        config_id=7,
        search_space_id=42,
    )
    llm, agent_config, error = bundle

    assert error is None
    assert llm is not None
    assert agent_config is not None

    expected_constructor_kwargs = {
        "model": "openai/gpt-4o-mini",
        "api_key": "sk-test",
        "temperature": 0.1,
        "streaming": True,
    }
    assert _CapturedChatLiteLLM.calls == [expected_constructor_kwargs]
|
||||
|
||||
|
||||
async def test_load_llm_bundle_enables_streaming_for_global_models(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A global model (negative ``config_id``) is built with ``streaming=True``."""
    connection_entry = {
        "id": -101,
        "provider": "anthropic",
        "api_key": "sk-ant-test",
        "base_url": None,
        "extra": {"litellm_params": {"temperature": 0.2}},
    }
    model_entry = {
        "id": -11,
        "connection_id": -101,
        "model_id": "claude-sonnet-4-5",
        "display_name": "Claude Sonnet",
        "billing_tier": "premium",
    }

    def _fake_to_litellm(_conn, _model_id):
        return (
            "anthropic/claude-sonnet-4-5",
            {"api_key": "sk-ant-test", "temperature": 0.2},
        )

    # raising=False: the config attributes are set even if they do not exist yet.
    monkeypatch.setattr(
        llm_bundle.config,
        "GLOBAL_MODELS",
        [model_entry],
        raising=False,
    )
    monkeypatch.setattr(
        llm_bundle.config,
        "GLOBAL_CONNECTIONS",
        [connection_entry],
        raising=False,
    )
    monkeypatch.setattr(llm_bundle, "to_litellm", _fake_to_litellm)

    bundle = await llm_bundle.load_llm_bundle(
        object(),
        config_id=-11,
        search_space_id=42,
    )
    llm, agent_config, error = bundle

    assert error is None
    assert llm is not None
    assert agent_config is not None

    expected_constructor_kwargs = {
        "model": "anthropic/claude-sonnet-4-5",
        "api_key": "sk-ant-test",
        "temperature": 0.2,
        "streaming": True,
    }
    assert _CapturedChatLiteLLM.calls == [expected_constructor_kwargs]
|
||||
39
surfsense_backend/tests/unit/utils/test_text_spans.py
Normal file
39
surfsense_backend/tests/unit/utils/test_text_spans.py
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
"""Unit tests for char-span -> line-range conversion."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.utils.text_spans import char_span_to_line_range
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
_TEXT = "line1\nline2\nline3"
|
||||
|
||||
|
||||
def test_single_line_span() -> None:
    """A span covering exactly the text of line 2 maps to (2, 2)."""
    needle = "line2"
    begin = _TEXT.index(needle)
    end = begin + len(needle)
    assert char_span_to_line_range(_TEXT, begin, end) == (2, 2)
|
||||
|
||||
|
||||
def test_first_line_span() -> None:
    """A span over the first line's text maps to (1, 1)."""
    end = len("line1")
    line_range = char_span_to_line_range(_TEXT, 0, end)
    assert line_range == (1, 1)
|
||||
|
||||
|
||||
def test_last_line_span() -> None:
    """A span running from the start of line 3 to the end of the text maps to (3, 3)."""
    begin = _TEXT.index("line3")
    end = len(_TEXT)
    line_range = char_span_to_line_range(_TEXT, begin, end)
    assert line_range == (3, 3)
|
||||
|
||||
|
||||
def test_multi_line_span() -> None:
    """A span over "line1\\nline2" maps to lines 1 through 2."""
    # End offset sits just past the five characters of "line2".
    end = _TEXT.index("line2") + 5
    line_range = char_span_to_line_range(_TEXT, 0, end)
    assert line_range == (1, 2)
|
||||
|
||||
|
||||
def test_empty_span_resolves_to_its_line() -> None:
    """A zero-length span at the start of line 2 still maps to (2, 2)."""
    offset = _TEXT.index("line2")
    line_range = char_span_to_line_range(_TEXT, offset, offset)
    assert line_range == (2, 2)
|
||||
|
||||
|
||||
def test_offsets_clamped_to_text_bounds() -> None:
    """Offsets outside the text on both sides map to the full range (1, 3)."""
    before_start = -5
    past_end = 10_000
    line_range = char_span_to_line_range(_TEXT, before_start, past_end)
    assert line_range == (1, 3)
|
||||
Loading…
Add table
Add a link
Reference in a new issue