mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-24 21:38:09 +02:00
Merge remote-tracking branch 'upstream/dev' into feat/api-key
This commit is contained in:
commit
3695e1d5c5
64 changed files with 1043 additions and 1852 deletions
|
|
@ -1,31 +0,0 @@
|
||||||
"""add chunks.start_char/end_char for citation offsets
|
|
||||||
|
|
||||||
Char offsets into the document's source_markdown (half-open span) let citations
|
|
||||||
resolve the exact passage a chunk came from. Nullable because historical rows
|
|
||||||
have no span; they populate on the next connector sync or user edit/reindex.
|
|
||||||
|
|
||||||
No backfill: a bulk UPDATE of every chunk on a large HNSW-indexed table rewrites
|
|
||||||
every secondary index per row (see migration 165 for the same reasoning).
|
|
||||||
|
|
||||||
Revision ID: 166
|
|
||||||
Revises: 165
|
|
||||||
"""
|
|
||||||
|
|
||||||
from collections.abc import Sequence
|
|
||||||
|
|
||||||
from alembic import op
|
|
||||||
|
|
||||||
revision: str = "166"
|
|
||||||
down_revision: str | None = "165"
|
|
||||||
branch_labels: str | Sequence[str] | None = None
|
|
||||||
depends_on: str | Sequence[str] | None = None
|
|
||||||
|
|
||||||
|
|
||||||
def upgrade() -> None:
|
|
||||||
op.execute("ALTER TABLE chunks ADD COLUMN IF NOT EXISTS start_char INTEGER;")
|
|
||||||
op.execute("ALTER TABLE chunks ADD COLUMN IF NOT EXISTS end_char INTEGER;")
|
|
||||||
|
|
||||||
|
|
||||||
def downgrade() -> None:
|
|
||||||
op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS end_char;")
|
|
||||||
op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS start_char;")
|
|
||||||
|
|
@ -18,6 +18,7 @@ skipped (e.g. client disconnect).
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
from datetime import UTC, datetime
|
from datetime import UTC, datetime
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
@ -57,8 +58,9 @@ from app.db import (
|
||||||
FolderRevision,
|
FolderRevision,
|
||||||
shielded_async_session,
|
shielded_async_session,
|
||||||
)
|
)
|
||||||
from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings
|
from app.indexing_pipeline.document_chunker import chunk_text
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
|
embed_texts,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
@ -232,23 +234,24 @@ async def _create_document(
|
||||||
session.add(doc)
|
session.add(doc)
|
||||||
await session.flush()
|
await session.flush()
|
||||||
|
|
||||||
summary_embedding, chunk_embeddings = await build_chunk_embeddings(
|
summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
|
||||||
content, use_code_chunker=False
|
|
||||||
)
|
|
||||||
doc.embedding = summary_embedding
|
doc.embedding = summary_embedding
|
||||||
session.add_all(
|
chunks = chunk_text(content)
|
||||||
[
|
if chunks:
|
||||||
Chunk(
|
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
|
||||||
document_id=doc.id,
|
session.add_all(
|
||||||
content=sl.text,
|
[
|
||||||
embedding=embedding,
|
Chunk(
|
||||||
position=i,
|
document_id=doc.id,
|
||||||
start_char=sl.start_char,
|
content=text,
|
||||||
end_char=sl.end_char,
|
embedding=embedding,
|
||||||
)
|
position=i,
|
||||||
for i, (sl, embedding) in enumerate(chunk_embeddings)
|
)
|
||||||
]
|
for i, (text, embedding) in enumerate(
|
||||||
)
|
zip(chunks, chunk_embeddings, strict=True)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -284,25 +287,26 @@ async def _update_document(
|
||||||
search_space_id,
|
search_space_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
summary_embedding, chunk_embeddings = await build_chunk_embeddings(
|
summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
|
||||||
content, use_code_chunker=False
|
|
||||||
)
|
|
||||||
document.embedding = summary_embedding
|
document.embedding = summary_embedding
|
||||||
|
|
||||||
await session.execute(delete(Chunk).where(Chunk.document_id == document.id))
|
await session.execute(delete(Chunk).where(Chunk.document_id == document.id))
|
||||||
session.add_all(
|
chunks = chunk_text(content)
|
||||||
[
|
if chunks:
|
||||||
Chunk(
|
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
|
||||||
document_id=document.id,
|
session.add_all(
|
||||||
content=sl.text,
|
[
|
||||||
embedding=embedding,
|
Chunk(
|
||||||
position=i,
|
document_id=document.id,
|
||||||
start_char=sl.start_char,
|
content=text,
|
||||||
end_char=sl.end_char,
|
embedding=embedding,
|
||||||
)
|
position=i,
|
||||||
for i, (sl, embedding) in enumerate(chunk_embeddings)
|
)
|
||||||
]
|
for i, (text, embedding) in enumerate(
|
||||||
)
|
zip(chunks, chunk_embeddings, strict=True)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
return document
|
return document
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,58 +1,42 @@
|
||||||
<citations>
|
<citations>
|
||||||
Citations reach the answer through three channels. Use whichever applies, and
|
Citations reach the answer through two channels. Use whichever applies — and
|
||||||
never invent ids you didn't see: ids are matched exactly, so a wrong one
|
never invent ids you didn't see. Citation ids are resolved by exact-match
|
||||||
silently breaks the link — when in doubt, omit. Always write a citation as
|
lookup; a wrong id silently breaks the link, so when in doubt, omit.
|
||||||
plain `[citation:…]` brackets — no markdown links, no footnote numbers, no
|
|
||||||
parentheses.
|
|
||||||
|
|
||||||
### Channel A — web_search chunk blocks injected this turn
|
### Channel A — chunk blocks injected this turn
|
||||||
When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this
|
When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this
|
||||||
turn, the chunk `id` is the result's URL:
|
turn:
|
||||||
|
|
||||||
1. For each factual statement taken from a chunk, add `[citation:<url>]`
|
1. For each factual statement taken from those chunks, add
|
||||||
using the **exact** id from a visible `<chunk id='…'>` tag. Copy the
|
`[citation:chunk_id]` using the **exact** id from a visible
|
||||||
URL verbatim; do not retype it from memory.
|
`<chunk id='…'>` tag. Copy digit-for-digit (or the URL verbatim);
|
||||||
2. Multiple chunks → `[citation:url1], [citation:url2]` (comma-separated,
|
do not retype from memory.
|
||||||
|
2. `<document_id>` is the parent doc id, **not** a citation source —
|
||||||
|
only ids inside `<chunk id='…'>` count.
|
||||||
|
3. Multiple chunks → `[citation:id1], [citation:id2]` (comma-separated,
|
||||||
each id copied individually).
|
each id copied individually).
|
||||||
3. Never invent, normalise, or guess at a URL; if unsure, omit.
|
4. Never invent, normalise, or guess at adjacent ids; if unsure, omit.
|
||||||
|
5. Plain brackets only — no markdown links, no footnote numbering.
|
||||||
|
|
||||||
### Channel B — citations relayed by a `task` specialist
|
### Channel B — citations relayed by a `task` specialist
|
||||||
A `task(...)` tool message may contain `[citation:…]` markers the
|
A `task(...)` tool message may contain `[citation:<chunk_id>]` markers
|
||||||
specialist already attached to its prose — line citations
|
the specialist already attached to its prose. The specialist saw the
|
||||||
(`[citation:d<id>#L<a>-<b>]`) or chunk ids (`[citation:N]`). The
|
underlying `<chunk id='…'>` blocks; you didn't. So:
|
||||||
specialist read the underlying document and tied each marker to a
|
|
||||||
passage; you didn't. So:
|
|
||||||
|
|
||||||
1. **Preserve those markers verbatim** in your final answer — do not
|
1. **Preserve those markers verbatim** in your final answer — do not
|
||||||
reformat, renumber, drop, or wrap them in markdown links. When you
|
reformat, renumber, drop, or wrap them in markdown links. When you
|
||||||
paraphrase a specialist sentence, copy the marker character-for-
|
paraphrase a specialist sentence, copy the marker character-for-
|
||||||
character; do not regenerate it from memory (LLMs reliably corrupt
|
character; do not regenerate the id from memory (LLMs reliably
|
||||||
nearby digits).
|
corrupt nearby digits).
|
||||||
2. Keep each marker attached to the sentence the specialist attached
|
2. Keep each marker attached to the sentence the specialist attached
|
||||||
it to.
|
it to.
|
||||||
3. Do **not** add new `[citation:…]` markers of your own to a
|
3. Do **not** add new `[citation:…]` markers of your own to a
|
||||||
specialist's prose; if a fact has no marker, the specialist
|
specialist's prose; if a fact has no marker, the specialist
|
||||||
couldn't tie it to a source and neither can you.
|
couldn't tie it to a chunk and neither can you.
|
||||||
4. When a specialist returns JSON, the citation markers live inside
|
4. When a specialist returns JSON, the citation markers live inside
|
||||||
the prose-bearing fields (e.g. a summary or excerpt). Pull them
|
the prose-bearing fields (e.g. a summary or excerpt). Pull them
|
||||||
along with the surrounding sentence when you quote.
|
along with the surrounding sentence when you quote.
|
||||||
|
|
||||||
### Channel C — your knowledge base (search hits and `read_file`)
|
If neither channel surfaces citation markers this turn, do not fabricate
|
||||||
Knowledge-base facts are cited by line range using the document id:
|
them.
|
||||||
`[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
|
|
||||||
|
|
||||||
1. `search_knowledge_base` prints a ready `[citation:d…#L…-…]` token above each
|
|
||||||
matched passage. When that passage supports your point, copy the token
|
|
||||||
verbatim — that is the entire citation.
|
|
||||||
2. When you `read_file` a `/documents/...` path, its header gives the
|
|
||||||
`<document_id>` and an optional `<matched_lines>` pointer, and the body is
|
|
||||||
shown with line numbers; cite the lines you actually used. Use `read_file`
|
|
||||||
when you need more context than a search passage shows.
|
|
||||||
3. Copy document ids and line numbers exactly as shown — never estimate,
|
|
||||||
shift, or invent them.
|
|
||||||
4. Older documents without a numbered body instead show `<chunk id='N'>`
|
|
||||||
blocks; cite those with `[citation:N]`, copying the id exactly.
|
|
||||||
|
|
||||||
If none of these channels surfaces a citable source this turn, do not
|
|
||||||
fabricate citations.
|
|
||||||
</citations>
|
</citations>
|
||||||
|
|
|
||||||
|
|
@ -33,7 +33,6 @@ from app.agents.chat.runtime.path_resolver import (
|
||||||
)
|
)
|
||||||
from app.db import Document, shielded_async_session
|
from app.db import Document, shielded_async_session
|
||||||
from app.utils.perf import get_perf_logger
|
from app.utils.perf import get_perf_logger
|
||||||
from app.utils.text_spans import char_span_to_line_range
|
|
||||||
|
|
||||||
_perf_log = get_perf_logger()
|
_perf_log = get_perf_logger()
|
||||||
|
|
||||||
|
|
@ -57,16 +56,12 @@ _TOOL_DESCRIPTION = (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
async def _resolve_doc_context(
|
async def _resolve_virtual_paths(
|
||||||
results: list[dict[str, Any]],
|
results: list[dict[str, Any]],
|
||||||
*,
|
*,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
) -> tuple[dict[int, str], dict[int, str]]:
|
) -> dict[int, str]:
|
||||||
"""Resolve ``Document.id`` -> (canonical virtual path, source_markdown).
|
"""Resolve ``Document.id`` -> canonical virtual path for the search hits."""
|
||||||
|
|
||||||
``source_markdown`` is the canonical body the chunk spans index into; the
|
|
||||||
renderer uses it to turn a chunk's char span into a line range.
|
|
||||||
"""
|
|
||||||
doc_ids = [
|
doc_ids = [
|
||||||
doc_id
|
doc_id
|
||||||
for doc_id in (
|
for doc_id in (
|
||||||
|
|
@ -77,24 +72,17 @@ async def _resolve_doc_context(
|
||||||
if isinstance(doc_id, int)
|
if isinstance(doc_id, int)
|
||||||
]
|
]
|
||||||
if not doc_ids:
|
if not doc_ids:
|
||||||
return {}, {}
|
return {}
|
||||||
|
|
||||||
async with shielded_async_session() as session:
|
async with shielded_async_session() as session:
|
||||||
index: PathIndex = await build_path_index(session, search_space_id)
|
index: PathIndex = await build_path_index(session, search_space_id)
|
||||||
rows = await session.execute(
|
folder_rows = await session.execute(
|
||||||
select(
|
select(Document.id, Document.folder_id).where(
|
||||||
Document.id, Document.folder_id, Document.source_markdown
|
|
||||||
).where(
|
|
||||||
Document.search_space_id == search_space_id,
|
Document.search_space_id == search_space_id,
|
||||||
Document.id.in_(doc_ids),
|
Document.id.in_(doc_ids),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
folder_by_doc_id: dict[int, int | None] = {}
|
folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()}
|
||||||
bodies: dict[int, str] = {}
|
|
||||||
for row in rows.all():
|
|
||||||
folder_by_doc_id[row.id] = row.folder_id
|
|
||||||
if row.source_markdown:
|
|
||||||
bodies[row.id] = row.source_markdown
|
|
||||||
|
|
||||||
paths: dict[int, str] = {}
|
paths: dict[int, str] = {}
|
||||||
for doc in results:
|
for doc in results:
|
||||||
|
|
@ -109,76 +97,13 @@ async def _resolve_doc_context(
|
||||||
folder_id=folder_id if isinstance(folder_id, int) else None,
|
folder_id=folder_id if isinstance(folder_id, int) else None,
|
||||||
index=index,
|
index=index,
|
||||||
)
|
)
|
||||||
return paths, bodies
|
return paths
|
||||||
|
|
||||||
|
|
||||||
def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str:
|
|
||||||
"""Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without spans."""
|
|
||||||
start = chunk.get("start_char")
|
|
||||||
end = chunk.get("end_char")
|
|
||||||
if (
|
|
||||||
not body
|
|
||||||
or not isinstance(doc_id, int)
|
|
||||||
or not isinstance(start, int)
|
|
||||||
or not isinstance(end, int)
|
|
||||||
):
|
|
||||||
return ""
|
|
||||||
start_line, end_line = char_span_to_line_range(body, start, end)
|
|
||||||
return f"[citation:d{doc_id}#L{start_line}-{end_line}]"
|
|
||||||
|
|
||||||
|
|
||||||
def _render_passage(
|
|
||||||
chunk: dict[str, Any], body: str | None, doc_id: int | None
|
|
||||||
) -> str | None:
|
|
||||||
"""Render one matched chunk as an indented passage tagged with its token."""
|
|
||||||
content = (chunk.get("content") or "").strip()
|
|
||||||
if not content:
|
|
||||||
return None
|
|
||||||
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
|
|
||||||
if len(content) > _PER_DOC_SNIPPET_CHARS:
|
|
||||||
snippet += " ..."
|
|
||||||
indented = snippet.replace("\n", "\n ")
|
|
||||||
token = _citation_token(chunk, body, doc_id)
|
|
||||||
head = f"\n {token}" if token else ""
|
|
||||||
return f"{head}\n {indented}"
|
|
||||||
|
|
||||||
|
|
||||||
def _matched_passages(
|
|
||||||
doc: dict[str, Any], body: str | None, doc_id: int | None
|
|
||||||
) -> str:
|
|
||||||
"""Render the RRF-matched chunks; '' when none can be rendered."""
|
|
||||||
by_id = {
|
|
||||||
c.get("chunk_id"): c
|
|
||||||
for c in (doc.get("chunks") or [])
|
|
||||||
if isinstance(c, dict)
|
|
||||||
}
|
|
||||||
rendered: list[str] = []
|
|
||||||
for chunk_id in doc.get("matched_chunk_ids") or []:
|
|
||||||
chunk = by_id.get(chunk_id)
|
|
||||||
if chunk is None:
|
|
||||||
continue
|
|
||||||
passage = _render_passage(chunk, body, doc_id)
|
|
||||||
if passage:
|
|
||||||
rendered.append(passage)
|
|
||||||
return "".join(rendered)
|
|
||||||
|
|
||||||
|
|
||||||
def _fallback_snippet(doc: dict[str, Any]) -> str:
|
|
||||||
"""Top-of-document preview, used only when no matched chunk is available."""
|
|
||||||
content = (doc.get("content") or "").strip()
|
|
||||||
if not content:
|
|
||||||
return "\n (no preview available; read the document for details)"
|
|
||||||
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
|
|
||||||
if len(content) > _PER_DOC_SNIPPET_CHARS:
|
|
||||||
snippet += " ..."
|
|
||||||
return "\n " + snippet.replace("\n", "\n ")
|
|
||||||
|
|
||||||
|
|
||||||
def _format_hits(
|
def _format_hits(
|
||||||
results: list[dict[str, Any]],
|
results: list[dict[str, Any]],
|
||||||
*,
|
*,
|
||||||
paths: dict[int, str],
|
paths: dict[int, str],
|
||||||
bodies: dict[int, str],
|
|
||||||
query: str,
|
query: str,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Render search hits as a compact, model-readable block."""
|
"""Render search hits as a compact, model-readable block."""
|
||||||
|
|
@ -199,15 +124,21 @@ def _format_hits(
|
||||||
score = doc.get("score")
|
score = doc.get("score")
|
||||||
score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a"
|
score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a"
|
||||||
path = paths.get(doc_id) if isinstance(doc_id, int) else None
|
path = paths.get(doc_id) if isinstance(doc_id, int) else None
|
||||||
body = bodies.get(doc_id) if isinstance(doc_id, int) else None
|
|
||||||
|
|
||||||
id_str = f"id={doc_id}, " if isinstance(doc_id, int) else ""
|
header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
|
||||||
header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + (
|
|
||||||
f"\n path: {path}" if path else ""
|
f"\n path: {path}" if path else ""
|
||||||
)
|
)
|
||||||
|
|
||||||
passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None)
|
content = (doc.get("content") or "").strip()
|
||||||
entry = header + (passages or _fallback_snippet(doc))
|
if content:
|
||||||
|
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
|
||||||
|
if len(content) > _PER_DOC_SNIPPET_CHARS:
|
||||||
|
snippet += " ..."
|
||||||
|
body = "\n " + snippet.replace("\n", "\n ")
|
||||||
|
else:
|
||||||
|
body = "\n (no preview available; read the document for details)"
|
||||||
|
|
||||||
|
entry = header + body
|
||||||
if total + len(entry) > _MAX_TOTAL_CHARS:
|
if total + len(entry) > _MAX_TOTAL_CHARS:
|
||||||
lines.append("\n<!-- additional matches truncated to fit context -->")
|
lines.append("\n<!-- additional matches truncated to fit context -->")
|
||||||
break
|
break
|
||||||
|
|
@ -215,9 +146,8 @@ def _format_hits(
|
||||||
total += len(entry)
|
total += len(entry)
|
||||||
|
|
||||||
lines.append(
|
lines.append(
|
||||||
"\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token "
|
"\n\nTo read a full document, delegate to the knowledge_base specialist "
|
||||||
"verbatim. To quote more context or read the full document, delegate to "
|
"with `task`, referencing the path above."
|
||||||
"the knowledge_base specialist with `task` using the path above."
|
|
||||||
)
|
)
|
||||||
lines.append("\n</knowledge_base_results>")
|
lines.append("\n</knowledge_base_results>")
|
||||||
return "".join(lines)
|
return "".join(lines)
|
||||||
|
|
@ -274,10 +204,8 @@ def create_search_knowledge_base_tool(
|
||||||
top_k=clamped_top_k,
|
top_k=clamped_top_k,
|
||||||
)
|
)
|
||||||
|
|
||||||
paths, bodies = await _resolve_doc_context(results, search_space_id=_space_id)
|
paths = await _resolve_virtual_paths(results, search_space_id=_space_id)
|
||||||
rendered = _format_hits(
|
rendered = _format_hits(results, paths=paths, query=cleaned_query)
|
||||||
results, paths=paths, bodies=bodies, query=cleaned_query
|
|
||||||
)
|
|
||||||
matched = _matched_chunk_ids(results)
|
matched = _matched_chunk_ids(results)
|
||||||
|
|
||||||
_perf_log.info(
|
_perf_log.info(
|
||||||
|
|
|
||||||
|
|
@ -45,10 +45,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
|
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
|
||||||
build_document_xml,
|
build_document_xml,
|
||||||
)
|
)
|
||||||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
|
|
||||||
build_read_preamble,
|
|
||||||
compute_matched_line_ranges,
|
|
||||||
)
|
|
||||||
from app.agents.chat.runtime.path_resolver import (
|
from app.agents.chat.runtime.path_resolver import (
|
||||||
DOCUMENTS_ROOT,
|
DOCUMENTS_ROOT,
|
||||||
build_path_index,
|
build_path_index,
|
||||||
|
|
@ -68,12 +64,6 @@ def _basename(path: str) -> str:
|
||||||
return path.rsplit("/", 1)[-1]
|
return path.rsplit("/", 1)[-1]
|
||||||
|
|
||||||
|
|
||||||
def _metadata_url(metadata: dict[str, Any]) -> str:
|
|
||||||
return (
|
|
||||||
metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _is_under(child: str, parent: str) -> bool:
|
def _is_under(child: str, parent: str) -> bool:
|
||||||
"""Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
|
"""Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
|
||||||
if parent == "/":
|
if parent == "/":
|
||||||
|
|
@ -470,11 +460,8 @@ class KBPostgresBackend(BackendProtocol):
|
||||||
loaded = await self._load_file_data(file_path)
|
loaded = await self._load_file_data(file_path)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return f"Error: File '{file_path}' not found"
|
return f"Error: File '{file_path}' not found"
|
||||||
file_data, _, preamble = loaded
|
file_data, _ = loaded
|
||||||
body = format_read_response(file_data, offset, limit)
|
return format_read_response(file_data, offset, limit)
|
||||||
if preamble and offset == 0:
|
|
||||||
return preamble + body
|
|
||||||
return body
|
|
||||||
|
|
||||||
def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override]
|
def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override]
|
||||||
return asyncio.run(self.aread(file_path, offset, limit))
|
return asyncio.run(self.aread(file_path, offset, limit))
|
||||||
|
|
@ -482,14 +469,12 @@ class KBPostgresBackend(BackendProtocol):
|
||||||
async def _load_file_data(
|
async def _load_file_data(
|
||||||
self,
|
self,
|
||||||
path: str,
|
path: str,
|
||||||
) -> tuple[dict[str, Any], int | None, str | None] | None:
|
) -> tuple[dict[str, Any], int | None] | None:
|
||||||
"""Lazy-load a virtual KB document into a deepagents ``FileData``.
|
"""Lazy-load a virtual KB document into a deepagents ``FileData``.
|
||||||
|
|
||||||
Returns ``(file_data, doc_id, preamble)`` or ``None`` if the path
|
Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map
|
||||||
doesn't map to any known document. ``doc_id`` is ``None`` for the
|
to any known document. ``doc_id`` is ``None`` for the synthetic
|
||||||
synthetic anonymous document. ``preamble`` is the metadata header to
|
anonymous document so the caller doesn't track it as a DB-backed file.
|
||||||
show above a numbered ``source_markdown`` body (``None`` for the legacy
|
|
||||||
chunk-reconstructed XML reads used when a document has no body).
|
|
||||||
"""
|
"""
|
||||||
anon = self._kb_anon_doc()
|
anon = self._kb_anon_doc()
|
||||||
if anon and str(anon.get("path") or "") == path:
|
if anon and str(anon.get("path") or "") == path:
|
||||||
|
|
@ -507,7 +492,7 @@ class KBPostgresBackend(BackendProtocol):
|
||||||
}
|
}
|
||||||
xml = build_document_xml(doc_payload, matched_chunk_ids=set())
|
xml = build_document_xml(doc_payload, matched_chunk_ids=set())
|
||||||
file_data = create_file_data(xml)
|
file_data = create_file_data(xml)
|
||||||
return file_data, None, None
|
return file_data, None
|
||||||
|
|
||||||
if not path.startswith(DOCUMENTS_ROOT):
|
if not path.startswith(DOCUMENTS_ROOT):
|
||||||
return None
|
return None
|
||||||
|
|
@ -520,58 +505,41 @@ class KBPostgresBackend(BackendProtocol):
|
||||||
)
|
)
|
||||||
if document is None:
|
if document is None:
|
||||||
return None
|
return None
|
||||||
source_markdown = document.source_markdown or ""
|
|
||||||
document_type = (
|
|
||||||
document.document_type.value
|
|
||||||
if getattr(document, "document_type", None) is not None
|
|
||||||
else "UNKNOWN"
|
|
||||||
)
|
|
||||||
metadata = dict(document.document_metadata or {})
|
|
||||||
chunk_rows = await session.execute(
|
chunk_rows = await session.execute(
|
||||||
select(Chunk.id, Chunk.content, Chunk.start_char, Chunk.end_char)
|
select(Chunk.id, Chunk.content)
|
||||||
.where(Chunk.document_id == document.id)
|
.where(Chunk.document_id == document.id)
|
||||||
.order_by(Chunk.position, Chunk.id)
|
.order_by(Chunk.position, Chunk.id)
|
||||||
)
|
)
|
||||||
chunk_records = chunk_rows.all()
|
chunks = [
|
||||||
document_id = document.id
|
{"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
|
||||||
document_title = document.title
|
]
|
||||||
|
|
||||||
matched = self._matched_chunk_ids(document_id)
|
|
||||||
|
|
||||||
# Canonical read: serve the verbatim body with cat -n line numbers that
|
|
||||||
# line up with chunk char spans, so the agent cites real source lines.
|
|
||||||
if source_markdown:
|
|
||||||
ranges = compute_matched_line_ranges(
|
|
||||||
source_markdown,
|
|
||||||
[(r.id, r.start_char, r.end_char) for r in chunk_records],
|
|
||||||
matched,
|
|
||||||
)
|
|
||||||
preamble = build_read_preamble(
|
|
||||||
document_id=document_id,
|
|
||||||
document_type=document_type,
|
|
||||||
title=document_title,
|
|
||||||
url=_metadata_url(metadata),
|
|
||||||
matched_line_ranges=ranges,
|
|
||||||
)
|
|
||||||
return create_file_data(source_markdown), document_id, preamble
|
|
||||||
|
|
||||||
# Legacy fallback: no canonical body, reconstruct from chunks as XML.
|
|
||||||
doc_payload = {
|
doc_payload = {
|
||||||
"document_id": document_id,
|
"document_id": document.id,
|
||||||
"chunks": [
|
"chunks": chunks,
|
||||||
{"chunk_id": r.id, "content": r.content} for r in chunk_records
|
"matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
|
||||||
],
|
|
||||||
"matched_chunk_ids": list(matched),
|
|
||||||
"document": {
|
"document": {
|
||||||
"id": document_id,
|
"id": document.id,
|
||||||
"title": document_title,
|
"title": document.title,
|
||||||
"document_type": document_type,
|
"document_type": (
|
||||||
"metadata": metadata,
|
document.document_type.value
|
||||||
|
if getattr(document, "document_type", None) is not None
|
||||||
|
else "UNKNOWN"
|
||||||
|
),
|
||||||
|
"metadata": dict(document.document_metadata or {}),
|
||||||
},
|
},
|
||||||
"source": document_type,
|
"source": (
|
||||||
|
document.document_type.value
|
||||||
|
if getattr(document, "document_type", None) is not None
|
||||||
|
else "UNKNOWN"
|
||||||
|
),
|
||||||
}
|
}
|
||||||
xml = build_document_xml(doc_payload, matched_chunk_ids=matched)
|
xml = build_document_xml(
|
||||||
return create_file_data(xml), document_id, None
|
doc_payload,
|
||||||
|
matched_chunk_ids=self._matched_chunk_ids(document.id),
|
||||||
|
)
|
||||||
|
file_data = create_file_data(xml)
|
||||||
|
return file_data, document.id
|
||||||
|
|
||||||
# ------------------------------------------------------------------ writes
|
# ------------------------------------------------------------------ writes
|
||||||
|
|
||||||
|
|
@ -603,7 +571,7 @@ class KBPostgresBackend(BackendProtocol):
|
||||||
loaded = await self._load_file_data(file_path)
|
loaded = await self._load_file_data(file_path)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return EditResult(error=f"Error: File '{file_path}' not found")
|
return EditResult(error=f"Error: File '{file_path}' not found")
|
||||||
file_data, _, _ = loaded
|
file_data, _ = loaded
|
||||||
|
|
||||||
content = file_data_to_string(file_data)
|
content = file_data_to_string(file_data)
|
||||||
result = perform_string_replacement(
|
result = perform_string_replacement(
|
||||||
|
|
|
||||||
|
|
@ -1,73 +0,0 @@
|
||||||
"""Read preamble for canonical (numbered ``source_markdown``) KB reads.
|
|
||||||
|
|
||||||
The KB read tool numbers the body lines ``cat -n`` style, so serving the raw
|
|
||||||
``source_markdown`` makes those line numbers line up exactly with the chunk
|
|
||||||
char spans and the editor highlight. This module renders the small header the
|
|
||||||
agent sees above that body: document identity plus the matched line ranges to
|
|
||||||
seek to, and a concrete reminder of the line-citation token shape.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from collections.abc import Iterable
|
|
||||||
|
|
||||||
from app.utils.text_spans import char_span_to_line_range
|
|
||||||
|
|
||||||
|
|
||||||
def _format_range(start: int, end: int) -> str:
|
|
||||||
return f"{start}" if start == end else f"{start}-{end}"
|
|
||||||
|
|
||||||
|
|
||||||
def compute_matched_line_ranges(
|
|
||||||
source_markdown: str,
|
|
||||||
chunks: Iterable[tuple[int, int | None, int | None]],
|
|
||||||
matched_chunk_ids: set[int],
|
|
||||||
) -> list[tuple[int, int]]:
|
|
||||||
"""Map matched chunks to sorted, de-duplicated 1-based line ranges.
|
|
||||||
|
|
||||||
``chunks`` are ``(chunk_id, start_char, end_char)`` triples. Chunks without
|
|
||||||
spans (legacy rows) are skipped — they have no resolvable location.
|
|
||||||
"""
|
|
||||||
ranges: set[tuple[int, int]] = set()
|
|
||||||
for chunk_id, start_char, end_char in chunks:
|
|
||||||
if chunk_id not in matched_chunk_ids:
|
|
||||||
continue
|
|
||||||
if start_char is None or end_char is None:
|
|
||||||
continue
|
|
||||||
ranges.add(char_span_to_line_range(source_markdown, start_char, end_char))
|
|
||||||
return sorted(ranges)
|
|
||||||
|
|
||||||
|
|
||||||
def build_read_preamble(
|
|
||||||
*,
|
|
||||||
document_id: int,
|
|
||||||
document_type: str,
|
|
||||||
title: str,
|
|
||||||
url: str,
|
|
||||||
matched_line_ranges: list[tuple[int, int]],
|
|
||||||
) -> str:
|
|
||||||
"""Render the metadata header shown above a numbered ``source_markdown`` body.
|
|
||||||
|
|
||||||
``matched_line_ranges`` are 1-based inclusive line ranges (already derived
|
|
||||||
from chunk char spans) to point the agent at the relevant lines.
|
|
||||||
"""
|
|
||||||
lines = [
|
|
||||||
"<document_metadata>",
|
|
||||||
f" <document_id>{document_id}</document_id>",
|
|
||||||
f" <document_type>{document_type}</document_type>",
|
|
||||||
f" <title><![CDATA[{title}]]></title>",
|
|
||||||
f" <url><![CDATA[{url}]]></url>",
|
|
||||||
]
|
|
||||||
if matched_line_ranges:
|
|
||||||
ranges = ", ".join(_format_range(s, e) for s, e in matched_line_ranges)
|
|
||||||
lines.append(f" <matched_lines>{ranges}</matched_lines>")
|
|
||||||
lines.append("</document_metadata>")
|
|
||||||
lines.append(
|
|
||||||
f"Cite lines from this document as [citation:d{document_id}#L<start>-<end>] "
|
|
||||||
"using the line numbers shown below."
|
|
||||||
)
|
|
||||||
lines.append("")
|
|
||||||
return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["build_read_preamble", "compute_matched_line_ranges"]
|
|
||||||
|
|
@ -73,7 +73,7 @@ def create_edit_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
|
||||||
loaded = await backend._load_file_data(validated)
|
loaded = await backend._load_file_data(validated)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return f"Error: File '{validated}' not found"
|
return f"Error: File '{validated}' not found"
|
||||||
_, doc_id_to_attach, _ = loaded
|
_, doc_id_to_attach = loaded
|
||||||
|
|
||||||
res: EditResult = await backend.aedit(
|
res: EditResult = await backend.aedit(
|
||||||
validated, old_string, new_string, replace_all=replace_all
|
validated, old_string, new_string, replace_all=replace_all
|
||||||
|
|
|
||||||
|
|
@ -75,7 +75,7 @@ async def cloud_move_file(
|
||||||
loaded = await backend._load_file_data(source)
|
loaded = await backend._load_file_data(source)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return f"Error: source '{source}' not found."
|
return f"Error: source '{source}' not found."
|
||||||
source_file_data, loaded_doc_id, _ = loaded
|
source_file_data, loaded_doc_id = loaded
|
||||||
if source_doc_id is None:
|
if source_doc_id is None:
|
||||||
source_doc_id = loaded_doc_id
|
source_doc_id = loaded_doc_id
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -58,10 +58,8 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
|
||||||
loaded = await backend._load_file_data(validated)
|
loaded = await backend._load_file_data(validated)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return f"Error: File '{validated}' not found"
|
return f"Error: File '{validated}' not found"
|
||||||
file_data, doc_id, preamble = loaded
|
file_data, doc_id = loaded
|
||||||
rendered = format_read_response(file_data, offset, limit)
|
rendered = format_read_response(file_data, offset, limit)
|
||||||
if preamble and offset == 0:
|
|
||||||
rendered = preamble + rendered
|
|
||||||
update: dict[str, Any] = {
|
update: dict[str, Any] = {
|
||||||
"files": {validated: file_data},
|
"files": {validated: file_data},
|
||||||
"messages": [
|
"messages": [
|
||||||
|
|
|
||||||
|
|
@ -74,7 +74,7 @@ async def cloud_rm(
|
||||||
loaded = await backend._load_file_data(validated)
|
loaded = await backend._load_file_data(validated)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return f"Error: file '{validated}' not found."
|
return f"Error: file '{validated}' not found."
|
||||||
_, resolved_doc_id, _ = loaded
|
_, resolved_doc_id = loaded
|
||||||
|
|
||||||
files_update: dict[str, Any] = {validated: None}
|
files_update: dict[str, Any] = {validated: None}
|
||||||
update: dict[str, Any] = {
|
update: dict[str, Any] = {
|
||||||
|
|
|
||||||
|
|
@ -240,24 +240,23 @@ def create_generate_image_tool(
|
||||||
error="No images were generated",
|
error="No images were generated",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Update all image URLs in response_dict to be absolute (for the serving endpoint)
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
for image in images:
|
||||||
|
if image.get("url"):
|
||||||
|
raw_url: str = image["url"]
|
||||||
|
if raw_url.startswith("/") and provider_base_url:
|
||||||
|
parsed = urlparse(provider_base_url)
|
||||||
|
origin = f"{parsed.scheme}://{parsed.netloc}"
|
||||||
|
image["url"] = f"{origin}{raw_url}" # Update the stored dict!
|
||||||
|
|
||||||
first_image = images[0]
|
first_image = images[0]
|
||||||
revised_prompt = first_image.get("revised_prompt", prompt)
|
revised_prompt = first_image.get("revised_prompt", prompt)
|
||||||
|
|
||||||
# b64_json (e.g. gpt-image-1) is served via our backend endpoint so
|
# b64_json (e.g. gpt-image-1) is served via our backend endpoint so
|
||||||
# megabytes of base64 don't bloat the LLM context.
|
# megabytes of base64 don't bloat the LLM context.
|
||||||
# Some OpenAI-compatible backends (e.g. Xinference) return a relative
|
|
||||||
# URL like /files/image.png. Browsers can't resolve these, so we
|
|
||||||
# prepend the provider's base origin when the URL starts with "/".
|
|
||||||
if first_image.get("url"):
|
if first_image.get("url"):
|
||||||
raw_url: str = first_image["url"]
|
image_url = first_image["url"]
|
||||||
if raw_url.startswith("/") and provider_base_url:
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
parsed = urlparse(provider_base_url)
|
|
||||||
origin = f"{parsed.scheme}://{parsed.netloc}"
|
|
||||||
image_url = f"{origin}{raw_url}"
|
|
||||||
else:
|
|
||||||
image_url = raw_url
|
|
||||||
elif first_image.get("b64_json"):
|
elif first_image.get("b64_json"):
|
||||||
backend_url = config.BACKEND_URL or "http://localhost:8000"
|
backend_url = config.BACKEND_URL or "http://localhost:8000"
|
||||||
image_url = (
|
image_url = (
|
||||||
|
|
|
||||||
|
|
@ -35,24 +35,42 @@ Map outcomes to your `status`:
|
||||||
|
|
||||||
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
|
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
|
||||||
|
|
||||||
## Citations in your prose
|
## Chunk citations in your prose
|
||||||
|
|
||||||
`read_file` on a KB document under `/documents/` serves it in one of two forms. Cite from whichever you actually see, attach the marker to the sentence in `action_summary` or `evidence.content_excerpt` stating that fact, and list every marker you emit in `evidence.citations`. The caller relays these markers to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation.
|
When `read_file` returns a KB-indexed document under `/documents/`, the response includes `<chunk id='…'>` blocks. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific chunk, append `[citation:<chunk_id>]` to the sentence stating that fact, using the **exact** id from the `<chunk id='…'>` tag. The caller relays these markers to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
|
||||||
|
|
||||||
**Numbered body (default).** A `<document_metadata>` header gives the `<document_id>` and an optional `<matched_lines>` pointer, then the body is shown with line numbers. Cite the lines a fact came from as `[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
|
### Where chunk ids live in `read_file` output
|
||||||
|
|
||||||
**Legacy chunk blocks (older docs without a stored body).** The response is XML with `<chunk id='N'>` blocks. Cite the chunk a fact came from as `[citation:N]`, using the **exact** id from a `<chunk id='…'>` tag.
|
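A KB document's XML shows numeric ids in three places (the `<document_id>` element, the `chunk_id` attribute of each index entry, and the `id` attribute of each `<chunk>`) — only **one** is a citation source: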
|
||||||
|
|
||||||
|
```
|
||||||
|
<document>
|
||||||
|
<document_metadata>
|
||||||
|
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
|
||||||
|
...
|
||||||
|
</document_metadata>
|
||||||
|
<chunk_index>
|
||||||
|
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
|
||||||
|
<entry chunk_id="129" lines="23-30" matched="true"/>
|
||||||
|
</chunk_index>
|
||||||
|
<document_content>
|
||||||
|
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
|
||||||
|
<chunk id='129'><![CDATA[…]]></chunk>
|
||||||
|
</document_content>
|
||||||
|
</document>
|
||||||
|
```
|
||||||
|
|
||||||
### Rules
|
### Rules
|
||||||
|
|
||||||
- Cite only from a passage you actually quoted or paraphrased this turn. Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory.
|
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
|
||||||
- Never cite `<document_id>` on its own — it identifies the document, not a passage. In the numbered form it is only the `d<document_id>` prefix of a line citation.
|
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
|
||||||
- Never invent, normalise, shorten, shift, or guess at ids or line numbers. If unsure, omit rather than pick.
|
- Never cite `<document_id>` — that's the parent doc, not a chunk.
|
||||||
|
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
|
||||||
- Prefer **fewer accurate citations** over many speculative ones.
|
- Prefer **fewer accurate citations** over many speculative ones.
|
||||||
- Multiple passages supporting the same point → comma-separated and copied individually: `[citation:d42#L14-22], [citation:d42#L31-39]`.
|
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
|
||||||
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
|
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
|
||||||
- Tool results with no body passage (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry nothing to cite.
|
- Tool results without `<chunk id='…'>` (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no chunk id and need none.
|
||||||
- Populate `evidence.citations` with **only** the markers you actually emitted — same set, same characters.
|
- Populate `evidence.chunk_ids` with **only** ids you actually emitted in `[citation:…]` markers — same set, same digits.
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
|
|
@ -71,7 +89,7 @@ You construct the structured `evidence` fields from your own knowledge of what y
|
||||||
"path": "/documents/meetings/2026-05-11-meeting.md",
|
"path": "/documents/meetings/2026-05-11-meeting.md",
|
||||||
"matched_candidates": null,
|
"matched_candidates": null,
|
||||||
"content_excerpt": null,
|
"content_excerpt": null,
|
||||||
"citations": null
|
"chunk_ids": null
|
||||||
},
|
},
|
||||||
"next_step": null,
|
"next_step": null,
|
||||||
"missing_fields": null,
|
"missing_fields": null,
|
||||||
|
|
@ -103,7 +121,7 @@ You construct the structured `evidence` fields from your own knowledge of what y
|
||||||
{ "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
|
{ "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
|
||||||
],
|
],
|
||||||
"content_excerpt": null,
|
"content_excerpt": null,
|
||||||
"citations": null
|
"chunk_ids": null
|
||||||
},
|
},
|
||||||
"next_step": "Ask the user which design doc to update.",
|
"next_step": "Ask the user which design doc to update.",
|
||||||
"missing_fields": ["path"],
|
"missing_fields": ["path"],
|
||||||
|
|
@ -124,7 +142,7 @@ Return **only** one JSON object (no markdown or prose outside it):
|
||||||
"path": string | null,
|
"path": string | null,
|
||||||
"matched_candidates": [ { "id": string, "label": string } ] | null,
|
"matched_candidates": [ { "id": string, "label": string } ] | null,
|
||||||
"content_excerpt": string | null,
|
"content_excerpt": string | null,
|
||||||
"citations": string[] | null
|
"chunk_ids": string[] | null
|
||||||
},
|
},
|
||||||
"next_step": string | null,
|
"next_step": string | null,
|
||||||
"missing_fields": string[] | null,
|
"missing_fields": string[] | null,
|
||||||
|
|
|
||||||
|
|
@ -33,11 +33,11 @@ Map outcomes to your `status`:
|
||||||
- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
|
- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
|
||||||
- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
|
- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
|
||||||
|
|
||||||
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`citations` is always `null` in desktop mode — see "Citations in your prose" below.)
|
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`chunk_ids` is always `null` in desktop mode — see "Chunk citations in your prose" below.)
|
||||||
|
|
||||||
## Citations in your prose
|
## Chunk citations in your prose
|
||||||
|
|
||||||
In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry chunk ids or numbered KB bodies. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.citations` `null` — the absolute path is the only reference for local-file work.
|
In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry `<chunk id='…'>` tags. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.chunk_ids` `null` — the absolute path is the only reference for local-file work.
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
|
|
@ -56,7 +56,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
|
||||||
"path": "/notes/meetings/2026-05-11-meeting.md",
|
"path": "/notes/meetings/2026-05-11-meeting.md",
|
||||||
"matched_candidates": null,
|
"matched_candidates": null,
|
||||||
"content_excerpt": null,
|
"content_excerpt": null,
|
||||||
"citations": null
|
"chunk_ids": null
|
||||||
},
|
},
|
||||||
"next_step": null,
|
"next_step": null,
|
||||||
"missing_fields": null,
|
"missing_fields": null,
|
||||||
|
|
@ -88,7 +88,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
|
||||||
{ "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
|
{ "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
|
||||||
],
|
],
|
||||||
"content_excerpt": null,
|
"content_excerpt": null,
|
||||||
"citations": null
|
"chunk_ids": null
|
||||||
},
|
},
|
||||||
"next_step": "Ask the user which design doc to update.",
|
"next_step": "Ask the user which design doc to update.",
|
||||||
"missing_fields": ["path"],
|
"missing_fields": ["path"],
|
||||||
|
|
@ -109,7 +109,7 @@ Return **only** one JSON object (no markdown or prose outside it):
|
||||||
"path": string | null,
|
"path": string | null,
|
||||||
"matched_candidates": [ { "id": string, "label": string } ] | null,
|
"matched_candidates": [ { "id": string, "label": string } ] | null,
|
||||||
"content_excerpt": string | null,
|
"content_excerpt": string | null,
|
||||||
"citations": string[] | null
|
"chunk_ids": string[] | null
|
||||||
},
|
},
|
||||||
"next_step": string | null,
|
"next_step": string | null,
|
||||||
"missing_fields": string[] | null,
|
"missing_fields": string[] | null,
|
||||||
|
|
|
||||||
|
|
@ -28,21 +28,41 @@ Reply in plain prose:
|
||||||
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
|
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
|
||||||
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
|
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
|
||||||
|
|
||||||
## Citations
|
## Chunk citations
|
||||||
|
|
||||||
`read_file` on a KB document under `/documents/` serves it in one of two forms; cite a claim from whichever you actually see, alongside the path. The caller passes these markers through to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation.
|
When the evidence for a claim came from a `read_file` response that included `<chunk id='…'>` blocks (i.e. a KB-indexed document under `/documents/`), append `[citation:<chunk_id>]` to the sentence stating that claim. The caller passes these markers through to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
|
||||||
|
|
||||||
- **Numbered body (default).** A `<document_metadata>` header gives the `<document_id>`, and the body is shown with line numbers. Cite the lines a claim came from as `[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
|
### Where chunk ids live in `read_file` output
|
||||||
- **Legacy chunk blocks (older docs).** XML with `<chunk id='N'>` blocks. Cite the chunk a claim came from as `[citation:N]`.
|
|
||||||
|
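A KB document's XML shows numeric ids in three places (the `<document_id>` element, the `chunk_id` attribute of each index entry, and the `id` attribute of each `<chunk>`) — only **one** is a citation source: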
|
||||||
|
|
||||||
|
```
|
||||||
|
<document>
|
||||||
|
<document_metadata>
|
||||||
|
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
|
||||||
|
...
|
||||||
|
</document_metadata>
|
||||||
|
<chunk_index>
|
||||||
|
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
|
||||||
|
<entry chunk_id="129" lines="23-30" matched="true"/>
|
||||||
|
</chunk_index>
|
||||||
|
<document_content>
|
||||||
|
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
|
||||||
|
<chunk id='129'><![CDATA[…]]></chunk>
|
||||||
|
</document_content>
|
||||||
|
</document>
|
||||||
|
```
|
||||||
|
|
||||||
### Rules
|
### Rules
|
||||||
|
|
||||||
- Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory. If you cannot see the id/lines for a claim, omit the citation.
|
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
|
||||||
- Never cite `<document_id>` on its own — in the numbered form it is only the `d<document_id>` prefix of a line citation.
|
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
|
||||||
- Never invent, normalise, shorten, shift, or guess. Prefer **fewer accurate citations** over many speculative ones.
|
- Never cite `<document_id>` — that's the parent doc, not a chunk.
|
||||||
- Multiple passages supporting the same point → comma-separated and copied individually.
|
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
|
||||||
|
- Prefer **fewer accurate citations** over many speculative ones. One correct `[citation:128]` is more useful than a string of wrong ids.
|
||||||
|
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
|
||||||
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
|
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
|
||||||
- Listings (`ls` / `glob` / `grep`), error strings, and files without either form carry nothing to cite.
|
- If a claim came from a tool result that did **not** carry a chunk id (`ls`, `glob`, `grep` listings, error strings, or files without `<chunk id='…'>`), skip the citation.
|
||||||
- The absolute path under `/documents/` is always required; citations are additive, they do not replace the path reference.
|
- The absolute path under `/documents/` is always required; chunk citations are additive, they do not replace the path reference.
|
||||||
|
|
||||||
Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:d42#L3-9].`
|
Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:128], [citation:129].`
|
||||||
|
|
|
||||||
|
|
@ -957,9 +957,8 @@ class Config:
|
||||||
os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true"
|
os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true"
|
||||||
)
|
)
|
||||||
# Bump to invalidate every cached embedding set after a chunker change.
|
# Bump to invalidate every cached embedding set after a chunker change.
|
||||||
# v2: chunks became exact (raw) slices of source_markdown for citation spans.
|
|
||||||
EMBEDDING_CACHE_CHUNKER_VERSION = int(
|
EMBEDDING_CACHE_CHUNKER_VERSION = int(
|
||||||
os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "2")
|
os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1")
|
||||||
)
|
)
|
||||||
EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90"))
|
EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90"))
|
||||||
EMBEDDING_CACHE_MAX_TOTAL_MB = int(
|
EMBEDDING_CACHE_MAX_TOTAL_MB = int(
|
||||||
|
|
|
||||||
|
|
@ -1470,11 +1470,6 @@ class Chunk(BaseModel, TimestampMixin):
|
||||||
# ordering reads are document-scoped (covered by ix_chunks_document_id) and
|
# ordering reads are document-scoped (covered by ix_chunks_document_id) and
|
||||||
# building a position index on the large chunks table is not worth it.
|
# building a position index on the large chunks table is not worth it.
|
||||||
position = Column(Integer, nullable=False, server_default="0")
|
position = Column(Integer, nullable=False, server_default="0")
|
||||||
# Half-open char span into the document's source_markdown the chunk was cut
|
|
||||||
# from. Nullable: historical rows predate spans and populate on reindex.
|
|
||||||
# Invariant for span-aware rows: source_markdown[start_char:end_char] == content.
|
|
||||||
start_char = Column(Integer, nullable=True)
|
|
||||||
end_char = Column(Integer, nullable=True)
|
|
||||||
|
|
||||||
document_id = Column(
|
document_id = Column(
|
||||||
Integer,
|
Integer,
|
||||||
|
|
|
||||||
|
|
@ -18,26 +18,23 @@ from app.indexing_pipeline.cache.eligibility import is_embedding_cacheable
|
||||||
from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet
|
from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet
|
||||||
from app.indexing_pipeline.cache.service import EmbeddingCacheService
|
from app.indexing_pipeline.cache.service import EmbeddingCacheService
|
||||||
from app.indexing_pipeline.cache.settings import load_embedding_cache_settings
|
from app.indexing_pipeline.cache.settings import load_embedding_cache_settings
|
||||||
from app.indexing_pipeline.document_chunker import ChunkSlice, chunk_markdown_with_spans
|
from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
|
||||||
from app.indexing_pipeline.document_embedder import embed_texts
|
from app.indexing_pipeline.document_embedder import embed_texts
|
||||||
from app.observability import metrics
|
from app.observability import metrics
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
SliceEmbedding = tuple[ChunkSlice, np.ndarray]
|
ChunkPair = tuple[str, np.ndarray]
|
||||||
|
|
||||||
|
|
||||||
async def build_chunk_embeddings(
|
async def build_chunk_embeddings(
|
||||||
markdown: str, *, use_code_chunker: bool
|
markdown: str, *, use_code_chunker: bool
|
||||||
) -> tuple[np.ndarray, list[SliceEmbedding]]:
|
) -> tuple[np.ndarray, list[ChunkPair]]:
|
||||||
"""Return the document-level vector and ordered ``(ChunkSlice, vector)`` pairs.
|
"""Return the document-level vector and ordered ``(chunk_text, vector)`` pairs.
|
||||||
|
|
||||||
Slices are always recomputed (cheap) so their char spans are exact; only the
|
Drop-in for the inline chunk+embed step; reuses prior output when the same
|
||||||
embeddings are cached, reused when the same markdown was embedded with the
|
markdown has already been embedded with the current model and chunker.
|
||||||
current model and chunker.
|
|
||||||
"""
|
"""
|
||||||
slices = await chunk_slices(markdown, use_code_chunker=use_code_chunker)
|
|
||||||
|
|
||||||
settings = load_embedding_cache_settings()
|
settings = load_embedding_cache_settings()
|
||||||
chunker_kind = "code" if use_code_chunker else "hybrid"
|
chunker_kind = "code" if use_code_chunker else "hybrid"
|
||||||
embedding_dim = getattr(config.embedding_model_instance, "dimension", None)
|
embedding_dim = getattr(config.embedding_model_instance, "dimension", None)
|
||||||
|
|
@ -48,7 +45,7 @@ async def build_chunk_embeddings(
|
||||||
embedding_dim=embedding_dim,
|
embedding_dim=embedding_dim,
|
||||||
)
|
)
|
||||||
if not cacheable:
|
if not cacheable:
|
||||||
return await _compute(markdown, slices)
|
return await _compute(markdown, use_code_chunker=use_code_chunker)
|
||||||
|
|
||||||
key = EmbeddingKey(
|
key = EmbeddingKey(
|
||||||
markdown_sha256=_hash_text(markdown),
|
markdown_sha256=_hash_text(markdown),
|
||||||
|
|
@ -59,30 +56,31 @@ async def build_chunk_embeddings(
|
||||||
)
|
)
|
||||||
|
|
||||||
cached = await _recall(key)
|
cached = await _recall(key)
|
||||||
if cached is not None and _aligns(cached, slices):
|
if cached is not None:
|
||||||
metrics.record_embedding_cache_lookup(
|
metrics.record_embedding_cache_lookup(
|
||||||
embedding_model=key.embedding_model,
|
embedding_model=key.embedding_model,
|
||||||
chunker_kind=chunker_kind,
|
chunker_kind=chunker_kind,
|
||||||
outcome="hit",
|
outcome="hit",
|
||||||
)
|
)
|
||||||
logger.debug("Embedding cache hit for %s", key.markdown_sha256)
|
logger.debug("Embedding cache hit for %s", key.markdown_sha256)
|
||||||
return cached.summary_embedding, list(
|
return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks]
|
||||||
zip(slices, (c.embedding for c in cached.chunks), strict=True)
|
|
||||||
)
|
|
||||||
|
|
||||||
metrics.record_embedding_cache_lookup(
|
metrics.record_embedding_cache_lookup(
|
||||||
embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss"
|
embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss"
|
||||||
)
|
)
|
||||||
summary_embedding, pairs = await _compute(markdown, slices)
|
summary_embedding, chunk_pairs = await _compute(
|
||||||
await _remember(key, summary_embedding, pairs)
|
markdown, use_code_chunker=use_code_chunker
|
||||||
return summary_embedding, pairs
|
|
||||||
|
|
||||||
|
|
||||||
async def chunk_slices(markdown: str, *, use_code_chunker: bool) -> list[ChunkSlice]:
|
|
||||||
"""Chunk markdown into ordered, char-addressed slices off the event loop."""
|
|
||||||
return await asyncio.to_thread(
|
|
||||||
chunk_markdown_with_spans, markdown, use_code_chunker
|
|
||||||
)
|
)
|
||||||
|
await _remember(key, summary_embedding, chunk_pairs)
|
||||||
|
return summary_embedding, chunk_pairs
|
||||||
|
|
||||||
|
|
||||||
|
async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]:
|
||||||
|
"""Chunk markdown into ordered texts with the pipeline's chunker selection."""
|
||||||
|
if use_code_chunker:
|
||||||
|
return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True)
|
||||||
|
# Table-aware hybrid chunker keeps Markdown tables intact (issue #1334).
|
||||||
|
return await asyncio.to_thread(chunk_text_hybrid, markdown)
|
||||||
|
|
||||||
|
|
||||||
async def embed_batch(texts: list[str]) -> list[np.ndarray]:
|
async def embed_batch(texts: list[str]) -> list[np.ndarray]:
|
||||||
|
|
@ -90,19 +88,13 @@ async def embed_batch(texts: list[str]) -> list[np.ndarray]:
|
||||||
return await asyncio.to_thread(embed_texts, texts)
|
return await asyncio.to_thread(embed_texts, texts)
|
||||||
|
|
||||||
|
|
||||||
def _aligns(cached: EmbeddingSet, slices: list[ChunkSlice]) -> bool:
|
|
||||||
"""A hit is only usable if its texts still match the current chunking."""
|
|
||||||
return len(cached.chunks) == len(slices) and all(
|
|
||||||
c.text == s.text for c, s in zip(cached.chunks, slices, strict=True)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
async def _compute(
|
async def _compute(
|
||||||
markdown: str, slices: list[ChunkSlice]
|
markdown: str, *, use_code_chunker: bool
|
||||||
) -> tuple[np.ndarray, list[SliceEmbedding]]:
|
) -> tuple[np.ndarray, list[ChunkPair]]:
|
||||||
embeddings = await embed_batch([markdown, *(s.text for s in slices)])
|
chunk_texts = await chunk_markdown(markdown, use_code_chunker=use_code_chunker)
|
||||||
|
embeddings = await embed_batch([markdown, *chunk_texts])
|
||||||
summary_embedding, *chunk_embeddings = embeddings
|
summary_embedding, *chunk_embeddings = embeddings
|
||||||
return summary_embedding, list(zip(slices, chunk_embeddings, strict=True))
|
return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False))
|
||||||
|
|
||||||
|
|
||||||
async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
|
async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
|
||||||
|
|
@ -118,14 +110,14 @@ async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
|
||||||
|
|
||||||
|
|
||||||
async def _remember(
|
async def _remember(
|
||||||
key: EmbeddingKey, summary_embedding: np.ndarray, pairs: list[SliceEmbedding]
|
key: EmbeddingKey, summary_embedding: np.ndarray, chunk_pairs: list[ChunkPair]
|
||||||
) -> None:
|
) -> None:
|
||||||
try:
|
try:
|
||||||
from app.tasks.celery_tasks import get_celery_session_maker
|
from app.tasks.celery_tasks import get_celery_session_maker
|
||||||
|
|
||||||
embedding_set = EmbeddingSet(
|
embedding_set = EmbeddingSet(
|
||||||
summary_embedding=summary_embedding,
|
summary_embedding=summary_embedding,
|
||||||
chunks=[CachedChunk(text=s.text, embedding=vec) for s, vec in pairs],
|
chunks=[CachedChunk(text=text, embedding=vec) for text, vec in chunk_pairs],
|
||||||
)
|
)
|
||||||
async with get_celery_session_maker()() as session:
|
async with get_celery_session_maker()() as session:
|
||||||
await EmbeddingCacheService(session).remember(key, embedding_set)
|
await EmbeddingCacheService(session).remember(key, embedding_set)
|
||||||
|
|
|
||||||
|
|
@ -19,9 +19,6 @@ class ExistingChunk:
|
||||||
id: int
|
id: int
|
||||||
content: str
|
content: str
|
||||||
position: int
|
position: int
|
||||||
# Stored char span; None for legacy rows indexed before spans existed.
|
|
||||||
start_char: int | None = None
|
|
||||||
end_char: int | None = None
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True, slots=True)
|
@dataclass(frozen=True, slots=True)
|
||||||
|
|
|
||||||
|
|
@ -1,30 +1,16 @@
|
||||||
import re
|
import re
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
from app.config import config
|
from app.config import config
|
||||||
|
|
||||||
# Regex that matches a Markdown table block (header + separator + one or more rows)
|
# Regex that matches a Markdown table block (header + separator + one or more rows)
|
||||||
# A table block starts with a | at the beginning of a line and ends when a
|
# A table block starts with a | at the beginning of a line and ends when a
|
||||||
# non-table line (or end of string) is encountered. The final row may end at EOF
|
# non-table line is encountered. Every row must end with a newline, so a final row at end of string without one is not part of the block.
|
||||||
# without a trailing newline, so the whole table stays one slice.
|
|
||||||
_TABLE_BLOCK_RE = re.compile(
|
_TABLE_BLOCK_RE = re.compile(
|
||||||
r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*(?:\n|$))+)",
|
r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)",
|
||||||
re.MULTILINE,
|
re.MULTILINE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True, slots=True)
|
|
||||||
class ChunkSlice:
|
|
||||||
"""A chunk paired with its half-open char span into the source markdown.
|
|
||||||
|
|
||||||
Invariant: ``markdown[start_char:end_char] == text``.
|
|
||||||
"""
|
|
||||||
|
|
||||||
text: str
|
|
||||||
start_char: int
|
|
||||||
end_char: int
|
|
||||||
|
|
||||||
|
|
||||||
def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
|
def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
|
||||||
"""Chunk a text string using the configured chunker and return the chunk texts."""
|
"""Chunk a text string using the configured chunker and return the chunk texts."""
|
||||||
chunker = (
|
chunker = (
|
||||||
|
|
@ -33,63 +19,41 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
|
||||||
return [c.text for c in chunker.chunk(text)]
|
return [c.text for c in chunker.chunk(text)]
|
||||||
|
|
||||||
|
|
||||||
def chunk_markdown_with_spans(
|
def chunk_text_hybrid(text: str) -> list[str]:
|
||||||
text: str, use_code_chunker: bool = False
|
"""Table-aware chunker that prevents Markdown tables from being split mid-row.
|
||||||
) -> list[ChunkSlice]:
|
|
||||||
"""Chunk markdown into a lossless, contiguous partition of char-addressed slices.
|
|
||||||
|
|
||||||
Tables stay whole (issue #1334) and every slice is an exact substring of
|
Algorithm:
|
||||||
``text``, so ``"".join(s.text) == text`` and ``text[s:e] == s.text``. This is
|
1. Scan the document for Markdown table blocks.
|
||||||
the offset record citations resolve against.
|
2. Each table block is emitted as a single, unmodified chunk so that its
|
||||||
|
header, separator row, and data rows always stay together.
|
||||||
|
3. The non-table prose segments between (and around) tables are passed through
|
||||||
|
the normal ``chunk_text`` chunker and their sub-chunks are interleaved in
|
||||||
|
document order.
|
||||||
|
|
||||||
|
This ensures that table data is never sliced in the middle by the token-based
|
||||||
|
chunker, which would otherwise produce garbled rows that are useless for RAG.
|
||||||
|
|
||||||
|
Fixes #1334.
|
||||||
"""
|
"""
|
||||||
if not text:
|
chunks: list[str] = []
|
||||||
return []
|
|
||||||
|
|
||||||
slices: list[ChunkSlice] = []
|
|
||||||
cursor = 0
|
cursor = 0
|
||||||
|
|
||||||
for match in _TABLE_BLOCK_RE.finditer(text):
|
for match in _TABLE_BLOCK_RE.finditer(text):
|
||||||
if match.start() > cursor:
|
# Prose before this table
|
||||||
slices.extend(
|
prose = text[cursor : match.start()].strip()
|
||||||
_segment_slices(text, cursor, match.start(), use_code_chunker)
|
if prose:
|
||||||
)
|
chunks.extend(chunk_text(prose))
|
||||||
slices.append(ChunkSlice(match.group(0), match.start(), match.end()))
|
|
||||||
|
# The table itself is kept as one indivisible chunk
|
||||||
|
table_block = match.group(0).strip()
|
||||||
|
if table_block:
|
||||||
|
chunks.append(table_block)
|
||||||
|
|
||||||
cursor = match.end()
|
cursor = match.end()
|
||||||
|
|
||||||
if len(text) > cursor:
|
# Remaining prose after the last table (or entire text if no tables)
|
||||||
slices.extend(_segment_slices(text, cursor, len(text), use_code_chunker))
|
trailing = text[cursor:].strip()
|
||||||
|
if trailing:
|
||||||
|
chunks.extend(chunk_text(trailing))
|
||||||
|
|
||||||
return slices
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
def _segment_slices(
|
|
||||||
text: str, start: int, end: int, use_code_chunker: bool
|
|
||||||
) -> list[ChunkSlice]:
|
|
||||||
"""Sub-chunk one non-table segment into contiguous, char-addressed slices."""
|
|
||||||
chunker = (
|
|
||||||
config.code_chunker_instance if use_code_chunker else config.chunker_instance
|
|
||||||
)
|
|
||||||
segment = text[start:end]
|
|
||||||
chunks = chunker.chunk(segment)
|
|
||||||
|
|
||||||
slices: list[ChunkSlice] = []
|
|
||||||
local = 0
|
|
||||||
for chunk in chunks:
|
|
||||||
# Use the chunker's end offset only as a cut point, then re-slice the
|
|
||||||
# segment ourselves so the result is an exact, gap-free substring.
|
|
||||||
local_end = min(max(chunk.end_index, local), len(segment))
|
|
||||||
if local_end <= local:
|
|
||||||
continue
|
|
||||||
slices.append(
|
|
||||||
ChunkSlice(segment[local:local_end], start + local, start + local_end)
|
|
||||||
)
|
|
||||||
local = local_end
|
|
||||||
|
|
||||||
if local < len(segment):
|
|
||||||
if slices:
|
|
||||||
last = slices[-1]
|
|
||||||
slices[-1] = ChunkSlice(text[last.start_char : end], last.start_char, end)
|
|
||||||
else:
|
|
||||||
slices.append(ChunkSlice(segment[local:], start + local, end))
|
|
||||||
|
|
||||||
return slices
|
|
||||||
|
|
|
||||||
|
|
@ -20,10 +20,9 @@ from app.db import (
|
||||||
DocumentType,
|
DocumentType,
|
||||||
)
|
)
|
||||||
from app.indexing_pipeline.cache import build_chunk_embeddings
|
from app.indexing_pipeline.cache import build_chunk_embeddings
|
||||||
from app.indexing_pipeline.cache.cached_indexing import chunk_slices, embed_batch
|
from app.indexing_pipeline.cache.cached_indexing import chunk_markdown, embed_batch
|
||||||
from app.indexing_pipeline.chunk_reconciler import ChunkPlan, ExistingChunk, reconcile
|
from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile
|
||||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
|
||||||
from app.indexing_pipeline.document_hashing import (
|
from app.indexing_pipeline.document_hashing import (
|
||||||
compute_content_hash,
|
compute_content_hash,
|
||||||
compute_identifier_hash,
|
compute_identifier_hash,
|
||||||
|
|
@ -490,22 +489,12 @@ class IndexingPipelineService:
|
||||||
|
|
||||||
async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]:
|
async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]:
|
||||||
result = await self.session.execute(
|
result = await self.session.execute(
|
||||||
select(
|
select(Chunk.id, Chunk.content, Chunk.position).where(
|
||||||
Chunk.id,
|
Chunk.document_id == document_id
|
||||||
Chunk.content,
|
)
|
||||||
Chunk.position,
|
|
||||||
Chunk.start_char,
|
|
||||||
Chunk.end_char,
|
|
||||||
).where(Chunk.document_id == document_id)
|
|
||||||
)
|
)
|
||||||
return [
|
return [
|
||||||
ExistingChunk(
|
ExistingChunk(id=row.id, content=row.content, position=row.position)
|
||||||
id=row.id,
|
|
||||||
content=row.content,
|
|
||||||
position=row.position,
|
|
||||||
start_char=row.start_char,
|
|
||||||
end_char=row.end_char,
|
|
||||||
)
|
|
||||||
for row in result
|
for row in result
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -516,21 +505,15 @@ class IndexingPipelineService:
|
||||||
delete(Chunk).where(Chunk.document_id == document.id)
|
delete(Chunk).where(Chunk.document_id == document.id)
|
||||||
)
|
)
|
||||||
|
|
||||||
summary_embedding, slice_pairs = await build_chunk_embeddings(
|
summary_embedding, chunk_pairs = await build_chunk_embeddings(
|
||||||
content,
|
content,
|
||||||
use_code_chunker=connector_doc.should_use_code_chunker,
|
use_code_chunker=connector_doc.should_use_code_chunker,
|
||||||
)
|
)
|
||||||
|
|
||||||
document.embedding = summary_embedding
|
document.embedding = summary_embedding
|
||||||
return [
|
return [
|
||||||
Chunk(
|
Chunk(content=text, embedding=emb, position=i)
|
||||||
content=chunk_slice.text,
|
for i, (text, emb) in enumerate(chunk_pairs)
|
||||||
embedding=emb,
|
|
||||||
position=i,
|
|
||||||
start_char=chunk_slice.start_char,
|
|
||||||
end_char=chunk_slice.end_char,
|
|
||||||
)
|
|
||||||
for i, (chunk_slice, emb) in enumerate(slice_pairs)
|
|
||||||
]
|
]
|
||||||
|
|
||||||
async def _reindex_incrementally(
|
async def _reindex_incrementally(
|
||||||
|
|
@ -542,39 +525,35 @@ class IndexingPipelineService:
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Edit path: keep rows whose text survived, embed only new texts.
|
"""Edit path: keep rows whose text survived, embed only new texts.
|
||||||
|
|
||||||
Unchanged rows keep their embedding and their HNSW/GIN index entries. An
|
Unchanged rows keep their embedding and their HNSW/GIN index entries;
|
||||||
edit can shift a kept chunk's char span without changing its text, so
|
moved rows get a position-only UPDATE, which touches neither index.
|
||||||
every kept row's position and span are refreshed whenever they drift.
|
|
||||||
"""
|
"""
|
||||||
slices = await chunk_slices(
|
new_texts = await chunk_markdown(
|
||||||
content, use_code_chunker=connector_doc.should_use_code_chunker
|
content, use_code_chunker=connector_doc.should_use_code_chunker
|
||||||
)
|
)
|
||||||
new_texts = [s.text for s in slices]
|
|
||||||
plan = reconcile(existing, new_texts)
|
plan = reconcile(existing, new_texts)
|
||||||
|
|
||||||
# One batch: the document-level summary vector plus the missing chunks.
|
# One batch: the document-level summary vector plus the missing chunks.
|
||||||
embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]])
|
embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]])
|
||||||
summary_embedding, *new_embeddings = embeddings
|
summary_embedding, *new_embeddings = embeddings
|
||||||
|
|
||||||
|
if plan.reused:
|
||||||
|
await self.session.execute(
|
||||||
|
update(Chunk),
|
||||||
|
[{"id": cid, "position": pos} for cid, pos in plan.reused],
|
||||||
|
)
|
||||||
if plan.to_delete:
|
if plan.to_delete:
|
||||||
await self.session.execute(
|
await self.session.execute(
|
||||||
delete(Chunk).where(Chunk.id.in_(plan.to_delete))
|
delete(Chunk).where(Chunk.id.in_(plan.to_delete))
|
||||||
)
|
)
|
||||||
|
|
||||||
span_updates = self._kept_row_span_updates(existing, slices, plan)
|
|
||||||
if span_updates:
|
|
||||||
await self.session.execute(update(Chunk), span_updates)
|
|
||||||
|
|
||||||
self.session.add_all(
|
self.session.add_all(
|
||||||
Chunk(
|
Chunk(
|
||||||
content=slices[pos].text,
|
content=text,
|
||||||
embedding=emb,
|
embedding=emb,
|
||||||
position=pos,
|
position=pos,
|
||||||
start_char=slices[pos].start_char,
|
|
||||||
end_char=slices[pos].end_char,
|
|
||||||
document_id=document.id,
|
document_id=document.id,
|
||||||
)
|
)
|
||||||
for (pos, _text), emb in zip(plan.to_embed, new_embeddings, strict=True)
|
for (pos, text), emb in zip(plan.to_embed, new_embeddings, strict=True)
|
||||||
)
|
)
|
||||||
document.embedding = summary_embedding
|
document.embedding = summary_embedding
|
||||||
|
|
||||||
|
|
@ -585,36 +564,6 @@ class IndexingPipelineService:
|
||||||
)
|
)
|
||||||
return len(new_texts)
|
return len(new_texts)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _kept_row_span_updates(
|
|
||||||
existing: list[ExistingChunk],
|
|
||||||
slices: list[ChunkSlice],
|
|
||||||
plan: ChunkPlan,
|
|
||||||
) -> list[dict]:
|
|
||||||
"""Position/span writes for kept rows, emitted only where a value drifts."""
|
|
||||||
deleted = set(plan.to_delete)
|
|
||||||
moved = dict(plan.reused)
|
|
||||||
updates: list[dict] = []
|
|
||||||
for chunk in existing:
|
|
||||||
if chunk.id in deleted:
|
|
||||||
continue
|
|
||||||
new_position = moved.get(chunk.id, chunk.position)
|
|
||||||
target = slices[new_position]
|
|
||||||
if (
|
|
||||||
chunk.position != new_position
|
|
||||||
or chunk.start_char != target.start_char
|
|
||||||
or chunk.end_char != target.end_char
|
|
||||||
):
|
|
||||||
updates.append(
|
|
||||||
{
|
|
||||||
"id": chunk.id,
|
|
||||||
"position": new_position,
|
|
||||||
"start_char": target.start_char,
|
|
||||||
"end_char": target.end_char,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return updates
|
|
||||||
|
|
||||||
async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
|
async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
|
||||||
"""Fire-and-forget: enqueue incremental AI sort if the search space has it enabled."""
|
"""Fire-and-forget: enqueue incremental AI sort if the search space has it enabled."""
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -440,15 +440,8 @@ class ChucksHybridSearchRetriever:
|
||||||
chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
|
chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
|
||||||
|
|
||||||
# Select only the columns we need (skip Chunk.embedding ~12KB/row).
|
# Select only the columns we need (skip Chunk.embedding ~12KB/row).
|
||||||
# start_char/end_char carry the citation span; None for legacy rows.
|
|
||||||
chunk_query = (
|
chunk_query = (
|
||||||
select(
|
select(Chunk.id, Chunk.content, Chunk.document_id)
|
||||||
Chunk.id,
|
|
||||||
Chunk.content,
|
|
||||||
Chunk.document_id,
|
|
||||||
Chunk.start_char,
|
|
||||||
Chunk.end_char,
|
|
||||||
)
|
|
||||||
.join(numbered, Chunk.id == numbered.c.chunk_id)
|
.join(numbered, Chunk.id == numbered.c.chunk_id)
|
||||||
.where(chunk_filter)
|
.where(chunk_filter)
|
||||||
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
|
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
|
||||||
|
|
@ -483,14 +476,7 @@ class ChucksHybridSearchRetriever:
|
||||||
if doc_id not in doc_map:
|
if doc_id not in doc_map:
|
||||||
continue
|
continue
|
||||||
doc_entry = doc_map[doc_id]
|
doc_entry = doc_map[doc_id]
|
||||||
doc_entry["chunks"].append(
|
doc_entry["chunks"].append({"chunk_id": row.id, "content": row.content})
|
||||||
{
|
|
||||||
"chunk_id": row.id,
|
|
||||||
"content": row.content,
|
|
||||||
"start_char": row.start_char,
|
|
||||||
"end_char": row.end_char,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
if row.id in matched_chunk_ids:
|
if row.id in matched_chunk_ids:
|
||||||
doc_entry["matched_chunk_ids"].append(row.id)
|
doc_entry["matched_chunk_ids"].append(row.id)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -38,7 +38,6 @@ from app.schemas import (
|
||||||
from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher
|
from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher
|
||||||
from app.users import get_auth_context
|
from app.users import get_auth_context
|
||||||
from app.utils.rbac import check_permission
|
from app.utils.rbac import check_permission
|
||||||
from app.utils.text_spans import char_span_to_line_range
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
|
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
|
||||||
|
|
@ -977,12 +976,9 @@ async def get_document_by_chunk_id(
|
||||||
session: AsyncSession = Depends(get_async_session),
|
session: AsyncSession = Depends(get_async_session),
|
||||||
auth: AuthContext = Depends(get_auth_context),
|
auth: AuthContext = Depends(get_auth_context),
|
||||||
):
|
):
|
||||||
"""Resolve a chunk id to its document plus a window of surrounding chunks.
|
"""
|
||||||
|
Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
|
||||||
Returns the cited chunk's 1-based line range (cited_start_line/
|
Uses SQL-level pagination to avoid loading all chunks into memory.
|
||||||
cited_end_line) when char spans exist, so callers can anchor the citation
|
|
||||||
to exact source lines. Uses SQL-level pagination to avoid loading all
|
|
||||||
chunks into memory.
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
from sqlalchemy import and_, func, or_
|
from sqlalchemy import and_, func, or_
|
||||||
|
|
@ -1046,17 +1042,6 @@ async def get_document_by_chunk_id(
|
||||||
)
|
)
|
||||||
windowed_chunks = windowed_result.scalars().all()
|
windowed_chunks = windowed_result.scalars().all()
|
||||||
|
|
||||||
cited_start_line: int | None = None
|
|
||||||
cited_end_line: int | None = None
|
|
||||||
if (
|
|
||||||
chunk.start_char is not None
|
|
||||||
and chunk.end_char is not None
|
|
||||||
and document.source_markdown
|
|
||||||
):
|
|
||||||
cited_start_line, cited_end_line = char_span_to_line_range(
|
|
||||||
document.source_markdown, chunk.start_char, chunk.end_char
|
|
||||||
)
|
|
||||||
|
|
||||||
return DocumentWithChunksRead(
|
return DocumentWithChunksRead(
|
||||||
id=document.id,
|
id=document.id,
|
||||||
title=document.title,
|
title=document.title,
|
||||||
|
|
@ -1071,8 +1056,6 @@ async def get_document_by_chunk_id(
|
||||||
chunks=windowed_chunks,
|
chunks=windowed_chunks,
|
||||||
total_chunks=total_chunks,
|
total_chunks=total_chunks,
|
||||||
chunk_start_index=start,
|
chunk_start_index=start,
|
||||||
cited_start_line=cited_start_line,
|
|
||||||
cited_end_line=cited_end_line,
|
|
||||||
)
|
)
|
||||||
except HTTPException:
|
except HTTPException:
|
||||||
raise
|
raise
|
||||||
|
|
|
||||||
|
|
@ -43,34 +43,6 @@ EDITOR_PLATE_MAX_BYTES = 1 * 1024 * 1024
|
||||||
EDITOR_PLATE_MAX_LINES = 5000
|
EDITOR_PLATE_MAX_LINES = 5000
|
||||||
|
|
||||||
|
|
||||||
def _raise_no_canonical_body(document: Document) -> None:
|
|
||||||
"""Translate a missing source_markdown into a status-aware HTTP error."""
|
|
||||||
doc_status = document.status or {}
|
|
||||||
state = (
|
|
||||||
doc_status.get("state", "ready") if isinstance(doc_status, dict) else "ready"
|
|
||||||
)
|
|
||||||
|
|
||||||
if state in ("pending", "processing"):
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=409,
|
|
||||||
detail="This document is still being processed. Please wait a moment and try again.",
|
|
||||||
)
|
|
||||||
if state == "failed":
|
|
||||||
reason = (
|
|
||||||
doc_status.get("reason", "Unknown error")
|
|
||||||
if isinstance(doc_status, dict)
|
|
||||||
else "Unknown error"
|
|
||||||
)
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=422,
|
|
||||||
detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
|
|
||||||
)
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=400,
|
|
||||||
detail="This document has no editable content. It may not have been processed correctly. Try re-indexing or re-uploading it.",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content")
|
@router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content")
|
||||||
async def get_editor_content(
|
async def get_editor_content(
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
|
|
@ -82,9 +54,8 @@ async def get_editor_content(
|
||||||
"""
|
"""
|
||||||
Get document content for editing.
|
Get document content for editing.
|
||||||
|
|
||||||
Returns source_markdown (the canonical body) for the Plate.js editor, with a
|
Returns source_markdown for the Plate.js editor.
|
||||||
one-time migration from legacy blocknote_document. Never reconstructs the
|
Falls back to blocknote_document → markdown conversion, then chunk reconstruction.
|
||||||
body from chunks.
|
|
||||||
|
|
||||||
Requires DOCUMENTS_READ permission.
|
Requires DOCUMENTS_READ permission.
|
||||||
"""
|
"""
|
||||||
|
|
@ -154,9 +125,52 @@ async def get_editor_content(
|
||||||
await session.commit()
|
await session.commit()
|
||||||
return _build_response(empty_markdown)
|
return _build_response(empty_markdown)
|
||||||
|
|
||||||
# No canonical body. Chunks are an index artifact, never the source of
|
chunk_contents_result = await session.execute(
|
||||||
# truth, so surface the processing state instead of rebuilding from them.
|
select(Chunk.content)
|
||||||
_raise_no_canonical_body(document)
|
.filter(Chunk.document_id == document_id)
|
||||||
|
.order_by(Chunk.position, Chunk.id)
|
||||||
|
)
|
||||||
|
chunk_contents = chunk_contents_result.scalars().all()
|
||||||
|
|
||||||
|
if not chunk_contents:
|
||||||
|
doc_status = document.status or {}
|
||||||
|
state = (
|
||||||
|
doc_status.get("state", "ready")
|
||||||
|
if isinstance(doc_status, dict)
|
||||||
|
else "ready"
|
||||||
|
)
|
||||||
|
if state in ("pending", "processing"):
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=409,
|
||||||
|
detail="This document is still being processed. Please wait a moment and try again.",
|
||||||
|
)
|
||||||
|
if state == "failed":
|
||||||
|
reason = (
|
||||||
|
doc_status.get("reason", "Unknown error")
|
||||||
|
if isinstance(doc_status, dict)
|
||||||
|
else "Unknown error"
|
||||||
|
)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=422,
|
||||||
|
detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
|
||||||
|
)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail="This document has no content. It may not have been processed correctly. Try deleting and re-uploading it.",
|
||||||
|
)
|
||||||
|
|
||||||
|
markdown_content = "\n\n".join(chunk_contents)
|
||||||
|
|
||||||
|
if not markdown_content.strip():
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail="This document appears to be empty. Try re-uploading or editing it to add content.",
|
||||||
|
)
|
||||||
|
|
||||||
|
document.source_markdown = markdown_content
|
||||||
|
await session.commit()
|
||||||
|
|
||||||
|
return _build_response(markdown_content)
|
||||||
|
|
||||||
|
|
||||||
@router.get(
|
@router.get(
|
||||||
|
|
@ -170,9 +184,8 @@ async def download_document_markdown(
|
||||||
):
|
):
|
||||||
user = auth.user
|
user = auth.user
|
||||||
"""
|
"""
|
||||||
Download the canonical document body as a .md file.
|
Download the full document content as a .md file.
|
||||||
|
Reconstructs markdown from source_markdown or chunks.
|
||||||
Serves source_markdown, migrating legacy blocknote_document when present.
|
|
||||||
"""
|
"""
|
||||||
await check_permission(
|
await check_permission(
|
||||||
session,
|
session,
|
||||||
|
|
@ -198,6 +211,15 @@ async def download_document_markdown(
|
||||||
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
||||||
|
|
||||||
markdown = blocknote_to_markdown(document.blocknote_document)
|
markdown = blocknote_to_markdown(document.blocknote_document)
|
||||||
|
if markdown is None:
|
||||||
|
chunk_contents_result = await session.execute(
|
||||||
|
select(Chunk.content)
|
||||||
|
.filter(Chunk.document_id == document_id)
|
||||||
|
.order_by(Chunk.position, Chunk.id)
|
||||||
|
)
|
||||||
|
chunk_contents = chunk_contents_result.scalars().all()
|
||||||
|
if chunk_contents:
|
||||||
|
markdown = "\n\n".join(chunk_contents)
|
||||||
|
|
||||||
if not markdown or not markdown.strip():
|
if not markdown or not markdown.strip():
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
|
|
@ -340,6 +362,15 @@ async def export_document(
|
||||||
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
||||||
|
|
||||||
markdown_content = blocknote_to_markdown(document.blocknote_document)
|
markdown_content = blocknote_to_markdown(document.blocknote_document)
|
||||||
|
if markdown_content is None:
|
||||||
|
chunk_contents_result = await session.execute(
|
||||||
|
select(Chunk.content)
|
||||||
|
.filter(Chunk.document_id == document_id)
|
||||||
|
.order_by(Chunk.position, Chunk.id)
|
||||||
|
)
|
||||||
|
chunk_contents = chunk_contents_result.scalars().all()
|
||||||
|
if chunk_contents:
|
||||||
|
markdown_content = "\n\n".join(chunk_contents)
|
||||||
|
|
||||||
if not markdown_content or not markdown_content.strip():
|
if not markdown_content or not markdown_content.strip():
|
||||||
raise HTTPException(status_code=400, detail="Document has no content to export")
|
raise HTTPException(status_code=400, detail="Document has no content to export")
|
||||||
|
|
|
||||||
|
|
@ -214,7 +214,7 @@ async def _execute_image_generation(
|
||||||
)
|
)
|
||||||
|
|
||||||
# Store response
|
# Store response
|
||||||
image_gen.response_data = (
|
response_dict = (
|
||||||
response.model_dump() if hasattr(response, "model_dump") else dict(response)
|
response.model_dump() if hasattr(response, "model_dump") else dict(response)
|
||||||
)
|
)
|
||||||
if not image_gen.model and hasattr(response, "_hidden_params"):
|
if not image_gen.model and hasattr(response, "_hidden_params"):
|
||||||
|
|
@ -222,6 +222,20 @@ async def _execute_image_generation(
|
||||||
if isinstance(hidden, dict) and hidden.get("model"):
|
if isinstance(hidden, dict) and hidden.get("model"):
|
||||||
image_gen.model = hidden["model"]
|
image_gen.model = hidden["model"]
|
||||||
|
|
||||||
|
# Fix relative URLs in response data (for the serving endpoint)
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
images = response_dict.get("data", [])
|
||||||
|
provider_base_url = resolved_kwargs.get("api_base")
|
||||||
|
for image in images:
|
||||||
|
if image.get("url"):
|
||||||
|
raw_url: str = image["url"]
|
||||||
|
if raw_url.startswith("/") and provider_base_url:
|
||||||
|
parsed = urlparse(provider_base_url)
|
||||||
|
origin = f"{parsed.scheme}://{parsed.netloc}"
|
||||||
|
image["url"] = f"{origin}{raw_url}"
|
||||||
|
|
||||||
|
image_gen.response_data = response_dict
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Image Generation Execution + Results CRUD
|
# Image Generation Execution + Results CRUD
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,4 @@ class ChunkUpdate(ChunkBase):
|
||||||
|
|
||||||
|
|
||||||
class ChunkRead(ChunkBase, IDModel, TimestampModel):
|
class ChunkRead(ChunkBase, IDModel, TimestampModel):
|
||||||
start_char: int | None = None
|
|
||||||
end_char: int | None = None
|
|
||||||
|
|
||||||
model_config = ConfigDict(from_attributes=True)
|
model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
|
||||||
|
|
@ -73,10 +73,6 @@ class DocumentWithChunksRead(DocumentRead):
|
||||||
chunks: list[ChunkRead] = []
|
chunks: list[ChunkRead] = []
|
||||||
total_chunks: int = 0
|
total_chunks: int = 0
|
||||||
chunk_start_index: int = 0
|
chunk_start_index: int = 0
|
||||||
# 1-based inclusive line range of the cited chunk within source_markdown;
|
|
||||||
# None when the chunk predates char spans or the body is unavailable.
|
|
||||||
cited_start_line: int | None = None
|
|
||||||
cited_end_line: int | None = None
|
|
||||||
|
|
||||||
model_config = ConfigDict(from_attributes=True)
|
model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,23 +0,0 @@
|
||||||
"""Convert char spans into document-relative line ranges.
|
|
||||||
|
|
||||||
Chunks store half-open char spans into ``source_markdown``; citations and the
|
|
||||||
editor speak in line numbers. This is the single shared conversion so search,
|
|
||||||
the resolve API, and highlighting all agree on what "lines X-Y" means.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
|
|
||||||
def char_span_to_line_range(text: str, start_char: int, end_char: int) -> tuple[int, int]:
|
|
||||||
"""Return the 1-based inclusive line range covering ``[start_char, end_char)``.
|
|
||||||
|
|
||||||
Offsets are clamped to ``text`` bounds. An empty span resolves to the single
|
|
||||||
line containing it.
|
|
||||||
"""
|
|
||||||
n = len(text)
|
|
||||||
start = max(0, min(start_char, n))
|
|
||||||
end = max(start, min(end_char, n))
|
|
||||||
start_line = text.count("\n", 0, start) + 1
|
|
||||||
last_char_index = max(start, end - 1)
|
|
||||||
end_line = text.count("\n", 0, last_char_index) + 1
|
|
||||||
return start_line, end_line
|
|
||||||
|
|
@ -1,80 +0,0 @@
|
||||||
"""NOTE writes must carry the same char spans as the indexing pipeline.
|
|
||||||
|
|
||||||
``_create_document`` / ``_update_document`` are the cloud agent's KB write
|
|
||||||
paths. They must chunk through the shared span chunker so every persisted
|
|
||||||
chunk resolves back to an exact slice of ``source_markdown`` for citations.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from sqlalchemy import select
|
|
||||||
|
|
||||||
from app.agents.chat.multi_agent_chat.main_agent.middleware.kb_persistence import (
|
|
||||||
middleware as kb,
|
|
||||||
)
|
|
||||||
from app.db import Chunk
|
|
||||||
|
|
||||||
pytestmark = [pytest.mark.integration, pytest.mark.asyncio]
|
|
||||||
|
|
||||||
_BODY = "Intro paragraph.\n\nBody paragraph with detail.\n\nOutro paragraph."
|
|
||||||
_NEW_BODY = "Rewritten intro.\n\nFresh body content.\n\nNew closing line."
|
|
||||||
|
|
||||||
|
|
||||||
async def _ordered_chunks(session, doc_id: int) -> list[Chunk]:
|
|
||||||
rows = await session.execute(
|
|
||||||
select(Chunk).where(Chunk.document_id == doc_id).order_by(Chunk.position)
|
|
||||||
)
|
|
||||||
return list(rows.scalars().all())
|
|
||||||
|
|
||||||
|
|
||||||
def _assert_spans_resolve(source_markdown: str, chunks: list[Chunk]) -> None:
|
|
||||||
assert chunks
|
|
||||||
for chunk in chunks:
|
|
||||||
assert chunk.start_char is not None
|
|
||||||
assert chunk.end_char is not None
|
|
||||||
assert source_markdown[chunk.start_char : chunk.end_char] == chunk.content
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("patched_embed_texts")
|
|
||||||
async def test_note_create_populates_chunk_spans(
|
|
||||||
db_session, db_search_space, db_user
|
|
||||||
) -> None:
|
|
||||||
doc = await kb._create_document(
|
|
||||||
db_session,
|
|
||||||
virtual_path="/documents/note.md",
|
|
||||||
content=_BODY,
|
|
||||||
search_space_id=db_search_space.id,
|
|
||||||
created_by_id=str(db_user.id),
|
|
||||||
)
|
|
||||||
await db_session.flush()
|
|
||||||
|
|
||||||
chunks = await _ordered_chunks(db_session, doc.id)
|
|
||||||
_assert_spans_resolve(doc.source_markdown, chunks)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("patched_embed_texts")
|
|
||||||
async def test_note_update_refreshes_chunk_spans(
|
|
||||||
db_session, db_search_space, db_user
|
|
||||||
) -> None:
|
|
||||||
doc = await kb._create_document(
|
|
||||||
db_session,
|
|
||||||
virtual_path="/documents/note.md",
|
|
||||||
content=_BODY,
|
|
||||||
search_space_id=db_search_space.id,
|
|
||||||
created_by_id=str(db_user.id),
|
|
||||||
)
|
|
||||||
await db_session.flush()
|
|
||||||
|
|
||||||
updated = await kb._update_document(
|
|
||||||
db_session,
|
|
||||||
doc_id=doc.id,
|
|
||||||
content=_NEW_BODY,
|
|
||||||
virtual_path="/documents/note.md",
|
|
||||||
search_space_id=db_search_space.id,
|
|
||||||
)
|
|
||||||
await db_session.flush()
|
|
||||||
|
|
||||||
assert updated is not None
|
|
||||||
chunks = await _ordered_chunks(db_session, updated.id)
|
|
||||||
_assert_spans_resolve(updated.source_markdown, chunks)
|
|
||||||
|
|
@ -158,12 +158,13 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock:
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def patched_chunk_text(monkeypatch) -> MagicMock:
|
def patched_chunk_text(monkeypatch) -> MagicMock:
|
||||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
mock = MagicMock(return_value=["Test chunk content."])
|
||||||
|
|
||||||
text = "Test chunk content."
|
|
||||||
mock = MagicMock(return_value=[ChunkSlice(text, 0, len(text))])
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
|
||||||
|
mock,
|
||||||
|
)
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||||
mock,
|
mock,
|
||||||
)
|
)
|
||||||
return mock
|
return mock
|
||||||
|
|
|
||||||
|
|
@ -286,12 +286,9 @@ def _mock_external_apis(monkeypatch):
|
||||||
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
||||||
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
|
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
|
||||||
)
|
)
|
||||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
|
||||||
|
|
||||||
chunk = "Test chunk content."
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
|
||||||
MagicMock(return_value=[ChunkSlice(chunk, 0, len(chunk))]),
|
MagicMock(return_value=["Test chunk content."]),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -176,14 +176,9 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
|
||||||
@pytest.mark.usefixtures("patched_embed_texts")
|
@pytest.mark.usefixtures("patched_embed_texts")
|
||||||
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
|
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
|
||||||
"""Reindexing replaces old chunks with new content rather than appending."""
|
"""Reindexing replaces old chunks with new content rather than appending."""
|
||||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
|
||||||
|
|
||||||
mocker.patch(
|
mocker.patch(
|
||||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||||
side_effect=[
|
side_effect=[["Original chunk."], ["Updated chunk."]],
|
||||||
[ChunkSlice("Original chunk.", 0, len("Original chunk."))],
|
|
||||||
[ChunkSlice("Updated chunk.", 0, len("Updated chunk."))],
|
|
||||||
],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
adapter = UploadDocumentAdapter(db_session)
|
adapter = UploadDocumentAdapter(db_session)
|
||||||
|
|
|
||||||
|
|
@ -18,22 +18,16 @@ _V1 = "Intro paragraph.\n\nBody paragraph.\n\nOutro paragraph."
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def paragraph_chunker(monkeypatch):
|
def paragraph_chunker(monkeypatch):
|
||||||
"""One slice per markdown paragraph, so edits map to chunk-level diffs."""
|
"""One chunk per markdown paragraph, so edits map to chunk-level diffs."""
|
||||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
|
||||||
|
|
||||||
def _split(markdown, *_args, **_kwargs):
|
def _split(markdown, **_kwargs):
|
||||||
slices = []
|
return [p for p in markdown.split("\n\n") if p.strip()]
|
||||||
cursor = 0
|
|
||||||
for para in markdown.split("\n\n"):
|
|
||||||
start = markdown.index(para, cursor)
|
|
||||||
cursor = start + len(para)
|
|
||||||
if para.strip():
|
|
||||||
slices.append(ChunkSlice(para, start, cursor))
|
|
||||||
return slices
|
|
||||||
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
"app.indexing_pipeline.cache.cached_indexing.chunk_text", _split
|
||||||
_split,
|
)
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", _split
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,96 +0,0 @@
|
||||||
"""Indexing records char spans so a chunk addresses its exact slice of the body.
|
|
||||||
|
|
||||||
Uses the real chunker (only embeddings are faked) so the span/partition
|
|
||||||
invariants are exercised end to end.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from sqlalchemy import select
|
|
||||||
|
|
||||||
from app.db import Chunk, Document
|
|
||||||
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.integration
|
|
||||||
|
|
||||||
_BODY = (
|
|
||||||
"# Report\n\n"
|
|
||||||
+ "Intro paragraph that is reasonably long and descriptive. " * 8
|
|
||||||
+ "\n\n| col a | col b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |\n\n"
|
|
||||||
+ "Closing paragraph with a different shape and more words to chunk. " * 8
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
async def _ordered_chunks(session, document_id) -> list[Chunk]:
|
|
||||||
result = await session.execute(
|
|
||||||
select(Chunk)
|
|
||||||
.filter(Chunk.document_id == document_id)
|
|
||||||
.order_by(Chunk.position, Chunk.id)
|
|
||||||
)
|
|
||||||
return list(result.scalars().all())
|
|
||||||
|
|
||||||
|
|
||||||
def _assert_spans_address_body(chunks: list[Chunk], body: str) -> None:
|
|
||||||
for chunk in chunks:
|
|
||||||
assert chunk.start_char is not None and chunk.end_char is not None
|
|
||||||
assert body[chunk.start_char : chunk.end_char] == chunk.content
|
|
||||||
assert "".join(c.content for c in chunks) == body
|
|
||||||
|
|
||||||
|
|
||||||
async def _index(session, connector_doc) -> int:
|
|
||||||
service = IndexingPipelineService(session=session)
|
|
||||||
prepared = await service.prepare_for_indexing([connector_doc])
|
|
||||||
document = prepared[0]
|
|
||||||
await service.index(document, connector_doc)
|
|
||||||
return document.id
|
|
||||||
|
|
||||||
|
|
||||||
async def _reload_body(session, document_id) -> str:
|
|
||||||
result = await session.execute(select(Document).filter(Document.id == document_id))
|
|
||||||
return result.scalars().first().source_markdown
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("patched_embed_texts")
|
|
||||||
async def test_scratch_index_records_spans_addressing_body(
|
|
||||||
db_session, db_search_space, make_connector_document
|
|
||||||
):
|
|
||||||
connector_doc = make_connector_document(
|
|
||||||
search_space_id=db_search_space.id, source_markdown=_BODY
|
|
||||||
)
|
|
||||||
|
|
||||||
document_id = await _index(db_session, connector_doc)
|
|
||||||
|
|
||||||
body = await _reload_body(db_session, document_id)
|
|
||||||
chunks = await _ordered_chunks(db_session, document_id)
|
|
||||||
|
|
||||||
assert len(chunks) > 1
|
|
||||||
_assert_spans_address_body(chunks, body)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("patched_embed_texts")
|
|
||||||
async def test_incremental_reindex_refreshes_shifted_spans(
|
|
||||||
db_session, db_search_space, make_connector_document
|
|
||||||
):
|
|
||||||
"""Inserting text at the top shifts every later chunk's span; kept rows must
|
|
||||||
have their spans refreshed, not left pointing at the old offsets."""
|
|
||||||
service = IndexingPipelineService(session=db_session)
|
|
||||||
|
|
||||||
original = make_connector_document(
|
|
||||||
search_space_id=db_search_space.id, source_markdown=_BODY
|
|
||||||
)
|
|
||||||
prepared = await service.prepare_for_indexing([original])
|
|
||||||
document_id = prepared[0].id
|
|
||||||
await service.index(prepared[0], original)
|
|
||||||
|
|
||||||
edited_body = "# Prepended heading\n\nA brand new opening paragraph.\n\n" + _BODY
|
|
||||||
edited = make_connector_document(
|
|
||||||
search_space_id=db_search_space.id, source_markdown=edited_body
|
|
||||||
)
|
|
||||||
prepared_again = await service.prepare_for_indexing([edited])
|
|
||||||
assert prepared_again, "edited content should requeue the document"
|
|
||||||
await service.index(prepared_again[0], edited)
|
|
||||||
|
|
||||||
body = await _reload_body(db_session, document_id)
|
|
||||||
chunks = await _ordered_chunks(db_session, document_id)
|
|
||||||
|
|
||||||
assert body == edited_body
|
|
||||||
_assert_spans_address_body(chunks, body)
|
|
||||||
|
|
@ -40,19 +40,11 @@ def _make_document(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _make_chunk(
|
def _make_chunk(*, content: str, document_id: int) -> Chunk:
|
||||||
*,
|
|
||||||
content: str,
|
|
||||||
document_id: int,
|
|
||||||
start_char: int | None = None,
|
|
||||||
end_char: int | None = None,
|
|
||||||
) -> Chunk:
|
|
||||||
return Chunk(
|
return Chunk(
|
||||||
content=content,
|
content=content,
|
||||||
document_id=document_id,
|
document_id=document_id,
|
||||||
embedding=DUMMY_EMBEDDING,
|
embedding=DUMMY_EMBEDDING,
|
||||||
start_char=start_char,
|
|
||||||
end_char=end_char,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -99,8 +91,6 @@ async def seed_large_doc(
|
||||||
_make_chunk(
|
_make_chunk(
|
||||||
content="quarterly performance review summary note content",
|
content="quarterly performance review summary note content",
|
||||||
document_id=small_doc.id,
|
document_id=small_doc.id,
|
||||||
start_char=0,
|
|
||||||
end_char=10,
|
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -98,32 +98,6 @@ async def test_chunks_ordered_by_id(db_session, seed_large_doc):
|
||||||
assert chunk_ids == sorted(chunk_ids), "Chunks not ordered by ID"
|
assert chunk_ids == sorted(chunk_ids), "Chunks not ordered by ID"
|
||||||
|
|
||||||
|
|
||||||
async def test_chunk_spans_returned(db_session, seed_large_doc):
|
|
||||||
"""Each chunk dict carries start_char/end_char (the citation span)."""
|
|
||||||
space_id = seed_large_doc["search_space"].id
|
|
||||||
small_doc_id = seed_large_doc["small_doc"].id
|
|
||||||
|
|
||||||
retriever = ChucksHybridSearchRetriever(db_session)
|
|
||||||
results = await retriever.hybrid_search(
|
|
||||||
query_text="quarterly performance review summary",
|
|
||||||
top_k=10,
|
|
||||||
search_space_id=space_id,
|
|
||||||
query_embedding=DUMMY_EMBEDDING,
|
|
||||||
)
|
|
||||||
|
|
||||||
for result in results:
|
|
||||||
for chunk in result["chunks"]:
|
|
||||||
assert "start_char" in chunk
|
|
||||||
assert "end_char" in chunk
|
|
||||||
if result["document"].get("id") == small_doc_id:
|
|
||||||
seeded = result["chunks"][0]
|
|
||||||
assert seeded["start_char"] == 0
|
|
||||||
assert seeded["end_char"] == 10
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
pytest.fail("Small doc not found in search results")
|
|
||||||
|
|
||||||
|
|
||||||
async def test_score_is_positive_float(db_session, seed_large_doc):
|
async def test_score_is_positive_float(db_session, seed_large_doc):
|
||||||
"""Each result should have a positive float score from RRF."""
|
"""Each result should have a positive float score from RRF."""
|
||||||
space_id = seed_large_doc["search_space"].id
|
space_id = seed_large_doc["search_space"].id
|
||||||
|
|
|
||||||
|
|
@ -1,127 +0,0 @@
|
||||||
"""Phase E.1 contract: the by-chunk resolve API exposes chunk char spans and
|
|
||||||
derives the cited chunk's line range from source_markdown."""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import pytest_asyncio
|
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
|
||||||
|
|
||||||
from app.db import Chunk, Document, DocumentStatus, DocumentType, SearchSpace, User
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.integration
|
|
||||||
|
|
||||||
_BODY = "alpha\nbravo\ncharlie\ndelta"
|
|
||||||
|
|
||||||
|
|
||||||
async def _make_document(
|
|
||||||
session: AsyncSession,
|
|
||||||
search_space: SearchSpace,
|
|
||||||
user: User,
|
|
||||||
*,
|
|
||||||
source_markdown: str = _BODY,
|
|
||||||
) -> Document:
|
|
||||||
doc = Document(
|
|
||||||
title="Doc",
|
|
||||||
document_type=DocumentType.FILE,
|
|
||||||
document_metadata={},
|
|
||||||
content=source_markdown,
|
|
||||||
content_hash="hash-by-chunk",
|
|
||||||
source_markdown=source_markdown,
|
|
||||||
search_space_id=search_space.id,
|
|
||||||
created_by_id=user.id,
|
|
||||||
status=DocumentStatus.ready(),
|
|
||||||
)
|
|
||||||
session.add(doc)
|
|
||||||
await session.flush()
|
|
||||||
return doc
|
|
||||||
|
|
||||||
|
|
||||||
async def _add_chunk(
|
|
||||||
session: AsyncSession,
|
|
||||||
document: Document,
|
|
||||||
*,
|
|
||||||
content: str,
|
|
||||||
position: int,
|
|
||||||
start_char: int | None,
|
|
||||||
end_char: int | None,
|
|
||||||
) -> Chunk:
|
|
||||||
chunk = Chunk(
|
|
||||||
content=content,
|
|
||||||
position=position,
|
|
||||||
document_id=document.id,
|
|
||||||
start_char=start_char,
|
|
||||||
end_char=end_char,
|
|
||||||
)
|
|
||||||
session.add(chunk)
|
|
||||||
await session.flush()
|
|
||||||
return chunk
|
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
|
||||||
async def make_document(db_session, db_search_space, db_user):
|
|
||||||
async def _make(**overrides):
|
|
||||||
return await _make_document(db_session, db_search_space, db_user, **overrides)
|
|
||||||
|
|
||||||
return _make
|
|
||||||
|
|
||||||
|
|
||||||
async def test_cited_line_range_derived_from_spans(
|
|
||||||
db_session, db_search_space, db_user, make_document
|
|
||||||
):
|
|
||||||
from app.routes.documents_routes import get_document_by_chunk_id
|
|
||||||
|
|
||||||
doc = await make_document()
|
|
||||||
await _add_chunk(
|
|
||||||
db_session, doc, content="alpha\nbravo\n", position=0, start_char=0, end_char=12
|
|
||||||
)
|
|
||||||
cited = await _add_chunk(
|
|
||||||
db_session,
|
|
||||||
doc,
|
|
||||||
content="charlie\ndelta",
|
|
||||||
position=1,
|
|
||||||
start_char=12,
|
|
||||||
end_char=len(_BODY),
|
|
||||||
)
|
|
||||||
|
|
||||||
result = await get_document_by_chunk_id(
|
|
||||||
cited.id, chunk_window=5, session=db_session, user=db_user
|
|
||||||
)
|
|
||||||
|
|
||||||
assert result.cited_start_line == 3
|
|
||||||
assert result.cited_end_line == 4
|
|
||||||
|
|
||||||
|
|
||||||
async def test_chunk_spans_exposed_in_response(
|
|
||||||
db_session, db_search_space, db_user, make_document
|
|
||||||
):
|
|
||||||
from app.routes.documents_routes import get_document_by_chunk_id
|
|
||||||
|
|
||||||
doc = await make_document()
|
|
||||||
cited = await _add_chunk(
|
|
||||||
db_session, doc, content="alpha\nbravo\n", position=0, start_char=0, end_char=12
|
|
||||||
)
|
|
||||||
|
|
||||||
result = await get_document_by_chunk_id(
|
|
||||||
cited.id, chunk_window=5, session=db_session, user=db_user
|
|
||||||
)
|
|
||||||
|
|
||||||
chunk = next(c for c in result.chunks if c.id == cited.id)
|
|
||||||
assert chunk.start_char == 0
|
|
||||||
assert chunk.end_char == 12
|
|
||||||
|
|
||||||
|
|
||||||
async def test_cited_line_range_null_without_spans(
|
|
||||||
db_session, db_search_space, db_user, make_document
|
|
||||||
):
|
|
||||||
from app.routes.documents_routes import get_document_by_chunk_id
|
|
||||||
|
|
||||||
doc = await make_document()
|
|
||||||
cited = await _add_chunk(
|
|
||||||
db_session, doc, content="alpha", position=0, start_char=None, end_char=None
|
|
||||||
)
|
|
||||||
|
|
||||||
result = await get_document_by_chunk_id(
|
|
||||||
cited.id, chunk_window=5, session=db_session, user=db_user
|
|
||||||
)
|
|
||||||
|
|
||||||
assert result.cited_start_line is None
|
|
||||||
assert result.cited_end_line is None
|
|
||||||
|
|
@ -1,175 +0,0 @@
|
||||||
"""Phase A contract: editor read paths serve source_markdown and never
|
|
||||||
reconstruct or mutate the body from chunks."""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import pytest_asyncio
|
|
||||||
from fastapi import HTTPException
|
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
|
||||||
|
|
||||||
from app.db import (
|
|
||||||
Chunk,
|
|
||||||
Document,
|
|
||||||
DocumentStatus,
|
|
||||||
DocumentType,
|
|
||||||
SearchSpace,
|
|
||||||
User,
|
|
||||||
)
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.integration
|
|
||||||
|
|
||||||
|
|
||||||
async def _make_document(
|
|
||||||
session: AsyncSession,
|
|
||||||
search_space: SearchSpace,
|
|
||||||
user: User,
|
|
||||||
*,
|
|
||||||
document_type: DocumentType = DocumentType.FILE,
|
|
||||||
source_markdown: str | None = "# Title\n\nBody line.",
|
|
||||||
content: str = "Body line.",
|
|
||||||
status: dict | None = None,
|
|
||||||
) -> Document:
|
|
||||||
doc = Document(
|
|
||||||
title="Doc",
|
|
||||||
document_type=document_type,
|
|
||||||
document_metadata={},
|
|
||||||
content=content,
|
|
||||||
content_hash="hash-001",
|
|
||||||
source_markdown=source_markdown,
|
|
||||||
search_space_id=search_space.id,
|
|
||||||
created_by_id=user.id,
|
|
||||||
status=status or DocumentStatus.ready(),
|
|
||||||
)
|
|
||||||
session.add(doc)
|
|
||||||
await session.flush()
|
|
||||||
return doc
|
|
||||||
|
|
||||||
|
|
||||||
async def _add_chunks(session: AsyncSession, document: Document, texts: list[str]):
|
|
||||||
for position, text in enumerate(texts):
|
|
||||||
session.add(Chunk(content=text, position=position, document_id=document.id))
|
|
||||||
await session.flush()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
|
||||||
async def make_document(db_session, db_search_space, db_user):
|
|
||||||
async def _make(**overrides):
|
|
||||||
return await _make_document(db_session, db_search_space, db_user, **overrides)
|
|
||||||
|
|
||||||
return _make
|
|
||||||
|
|
||||||
|
|
||||||
class TestGetEditorContent:
|
|
||||||
async def test_returns_source_markdown_verbatim(
|
|
||||||
self, db_session, db_search_space, db_user, make_document
|
|
||||||
):
|
|
||||||
from app.routes.editor_routes import get_editor_content
|
|
||||||
|
|
||||||
doc = await make_document(source_markdown="# Real\n\nCanonical body.")
|
|
||||||
|
|
||||||
result = await get_editor_content(
|
|
||||||
db_search_space.id, doc.id, session=db_session, user=db_user
|
|
||||||
)
|
|
||||||
|
|
||||||
assert result["source_markdown"] == "# Real\n\nCanonical body."
|
|
||||||
|
|
||||||
async def test_does_not_reconstruct_body_from_chunks(
|
|
||||||
self, db_session, db_search_space, db_user, make_document
|
|
||||||
):
|
|
||||||
"""A ready document without source_markdown must not be rebuilt from chunks."""
|
|
||||||
from app.routes.editor_routes import get_editor_content
|
|
||||||
|
|
||||||
doc = await make_document(source_markdown=None)
|
|
||||||
await _add_chunks(db_session, doc, ["chunk one", "chunk two"])
|
|
||||||
|
|
||||||
with pytest.raises(HTTPException) as exc:
|
|
||||||
await get_editor_content(
|
|
||||||
db_search_space.id, doc.id, session=db_session, user=db_user
|
|
||||||
)
|
|
||||||
|
|
||||||
assert exc.value.status_code == 400
|
|
||||||
await db_session.refresh(doc)
|
|
||||||
assert doc.source_markdown is None
|
|
||||||
|
|
||||||
async def test_processing_document_without_body_returns_409(
|
|
||||||
self, db_session, db_search_space, db_user, make_document
|
|
||||||
):
|
|
||||||
from app.routes.editor_routes import get_editor_content
|
|
||||||
|
|
||||||
doc = await make_document(
|
|
||||||
source_markdown=None, status=DocumentStatus.processing()
|
|
||||||
)
|
|
||||||
|
|
||||||
with pytest.raises(HTTPException) as exc:
|
|
||||||
await get_editor_content(
|
|
||||||
db_search_space.id, doc.id, session=db_session, user=db_user
|
|
||||||
)
|
|
||||||
|
|
||||||
assert exc.value.status_code == 409
|
|
||||||
|
|
||||||
async def test_failed_document_without_body_returns_422(
|
|
||||||
self, db_session, db_search_space, db_user, make_document
|
|
||||||
):
|
|
||||||
from app.routes.editor_routes import get_editor_content
|
|
||||||
|
|
||||||
doc = await make_document(
|
|
||||||
source_markdown=None, status=DocumentStatus.failed("boom")
|
|
||||||
)
|
|
||||||
|
|
||||||
with pytest.raises(HTTPException) as exc:
|
|
||||||
await get_editor_content(
|
|
||||||
db_search_space.id, doc.id, session=db_session, user=db_user
|
|
||||||
)
|
|
||||||
|
|
||||||
assert exc.value.status_code == 422
|
|
||||||
|
|
||||||
async def test_empty_note_initializes_to_empty_markdown(
|
|
||||||
self, db_session, db_search_space, db_user, make_document
|
|
||||||
):
|
|
||||||
from app.routes.editor_routes import get_editor_content
|
|
||||||
|
|
||||||
doc = await make_document(document_type=DocumentType.NOTE, source_markdown=None)
|
|
||||||
|
|
||||||
result = await get_editor_content(
|
|
||||||
db_search_space.id, doc.id, session=db_session, user=db_user
|
|
||||||
)
|
|
||||||
|
|
||||||
assert result["source_markdown"] == ""
|
|
||||||
|
|
||||||
|
|
||||||
class TestDownloadMarkdown:
|
|
||||||
async def test_does_not_reconstruct_body_from_chunks(
|
|
||||||
self, db_session, db_search_space, db_user, make_document
|
|
||||||
):
|
|
||||||
from app.routes.editor_routes import download_document_markdown
|
|
||||||
|
|
||||||
doc = await make_document(source_markdown=None)
|
|
||||||
await _add_chunks(db_session, doc, ["chunk one", "chunk two"])
|
|
||||||
|
|
||||||
with pytest.raises(HTTPException) as exc:
|
|
||||||
await download_document_markdown(
|
|
||||||
db_search_space.id, doc.id, session=db_session, user=db_user
|
|
||||||
)
|
|
||||||
|
|
||||||
assert exc.value.status_code == 400
|
|
||||||
|
|
||||||
|
|
||||||
class TestExportDocument:
|
|
||||||
async def test_does_not_reconstruct_body_from_chunks(
|
|
||||||
self, db_session, db_search_space, db_user, make_document
|
|
||||||
):
|
|
||||||
from app.routes.editor_routes import export_document
|
|
||||||
from app.routes.reports_routes import ExportFormat
|
|
||||||
|
|
||||||
doc = await make_document(source_markdown=None)
|
|
||||||
await _add_chunks(db_session, doc, ["chunk one", "chunk two"])
|
|
||||||
|
|
||||||
with pytest.raises(HTTPException) as exc:
|
|
||||||
await export_document(
|
|
||||||
db_search_space.id,
|
|
||||||
doc.id,
|
|
||||||
format=ExportFormat.PLAIN,
|
|
||||||
session=db_session,
|
|
||||||
user=db_user,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert exc.value.status_code == 400
|
|
||||||
|
|
@ -1,87 +0,0 @@
|
||||||
"""Unit tests for search_knowledge_base hit rendering.
|
|
||||||
|
|
||||||
The tool must surface the passage that actually matched (the RRF-ranked
|
|
||||||
chunk), not the top of the document, and annotate it with its line range
|
|
||||||
when the chunk carries a char span.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import (
|
|
||||||
_format_hits,
|
|
||||||
)
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.unit
|
|
||||||
|
|
||||||
_BODY = "Intro paragraph.\n\nMatched passage here.\n\nClosing paragraph."
|
|
||||||
|
|
||||||
|
|
||||||
def _hit() -> dict:
|
|
||||||
intro = "Intro paragraph."
|
|
||||||
matched = "Matched passage here."
|
|
||||||
matched_start = _BODY.index(matched)
|
|
||||||
return {
|
|
||||||
"document": {"id": 7, "title": "note.md", "document_type": "NOTE"},
|
|
||||||
"score": 0.42,
|
|
||||||
"content": _BODY.replace("\n\n", "\n\n"),
|
|
||||||
"matched_chunk_ids": [102],
|
|
||||||
"chunks": [
|
|
||||||
{
|
|
||||||
"chunk_id": 101,
|
|
||||||
"content": intro,
|
|
||||||
"start_char": 0,
|
|
||||||
"end_char": len(intro),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"chunk_id": 102,
|
|
||||||
"content": matched,
|
|
||||||
"start_char": matched_start,
|
|
||||||
"end_char": matched_start + len(matched),
|
|
||||||
},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def test_renders_matched_passage_not_top_of_doc() -> None:
|
|
||||||
out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
|
|
||||||
assert "Matched passage here." in out
|
|
||||||
# The intro chunk was not matched, so it must not be shown as the snippet.
|
|
||||||
assert "Intro paragraph." not in out
|
|
||||||
|
|
||||||
|
|
||||||
def test_emits_copyable_line_citation_token_when_spans_present() -> None:
|
|
||||||
out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
|
|
||||||
# "Matched passage here." sits on line 3 of the body; the hit must surface
|
|
||||||
# a ready-to-copy token so the agent can cite without a separate read.
|
|
||||||
assert "[citation:d7#L3-3]" in out
|
|
||||||
|
|
||||||
|
|
||||||
def test_header_includes_document_id() -> None:
|
|
||||||
out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
|
|
||||||
assert "id=7" in out
|
|
||||||
|
|
||||||
|
|
||||||
def test_omits_citation_token_when_spans_absent() -> None:
|
|
||||||
hit = _hit()
|
|
||||||
for chunk in hit["chunks"]:
|
|
||||||
chunk["start_char"] = None
|
|
||||||
chunk["end_char"] = None
|
|
||||||
out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
|
|
||||||
assert "Matched passage here." in out
|
|
||||||
# No concrete, copyable token for this document without spans (the closing
|
|
||||||
# instruction's placeholder template doesn't count).
|
|
||||||
assert "[citation:d7#L" not in out
|
|
||||||
|
|
||||||
|
|
||||||
def test_falls_back_to_content_when_no_matched_ids() -> None:
|
|
||||||
hit = _hit()
|
|
||||||
hit["matched_chunk_ids"] = []
|
|
||||||
out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
|
|
||||||
assert "Intro paragraph." in out
|
|
||||||
|
|
||||||
|
|
||||||
def test_no_results_message() -> None:
|
|
||||||
out = _format_hits([], paths={}, bodies={}, query="missing")
|
|
||||||
assert "No knowledge-base matches" in out
|
|
||||||
|
|
@ -1,72 +0,0 @@
|
||||||
"""Span-aware chunking contract: slices form a lossless, contiguous partition
|
|
||||||
of the markdown, and every slice's char span addresses its own text."""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from app.indexing_pipeline.document_chunker import chunk_markdown_with_spans
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.unit
|
|
||||||
|
|
||||||
|
|
||||||
def _assert_lossless_partition(md: str, slices) -> None:
|
|
||||||
assert "".join(s.text for s in slices) == md
|
|
||||||
|
|
||||||
cursor = 0
|
|
||||||
for s in slices:
|
|
||||||
assert s.start_char == cursor, "slices must be contiguous"
|
|
||||||
assert s.end_char >= s.start_char
|
|
||||||
assert md[s.start_char : s.end_char] == s.text, "span must address slice text"
|
|
||||||
cursor = s.end_char
|
|
||||||
assert cursor == len(md)
|
|
||||||
|
|
||||||
|
|
||||||
def test_prose_partition_and_spans():
|
|
||||||
md = (
|
|
||||||
"# Title\n\n"
|
|
||||||
+ "First paragraph with several words here. " * 20
|
|
||||||
+ "\n\nSecond section with more prose to force multiple chunks. " * 20
|
|
||||||
)
|
|
||||||
|
|
||||||
slices = chunk_markdown_with_spans(md)
|
|
||||||
|
|
||||||
assert len(slices) > 1
|
|
||||||
_assert_lossless_partition(md, slices)
|
|
||||||
|
|
||||||
|
|
||||||
def test_table_kept_whole_with_exact_span():
|
|
||||||
table = "| a | b |\n| - | - |\n| 1 | 2 |\n"
|
|
||||||
md = f"Intro prose before the table.\n{table}\nClosing prose after."
|
|
||||||
|
|
||||||
slices = chunk_markdown_with_spans(md)
|
|
||||||
|
|
||||||
_assert_lossless_partition(md, slices)
|
|
||||||
table_slices = [s for s in slices if s.text.lstrip().startswith("|")]
|
|
||||||
assert any("| 1 | 2 |" in s.text for s in table_slices)
|
|
||||||
for s in table_slices:
|
|
||||||
assert "| a | b |" in s.text and "| 1 | 2 |" in s.text
|
|
||||||
|
|
||||||
|
|
||||||
def test_table_at_eof_without_trailing_newline_stays_whole():
|
|
||||||
md = "Intro.\n| a | b |\n| - | - |\n| 1 | 2 |"
|
|
||||||
|
|
||||||
slices = chunk_markdown_with_spans(md)
|
|
||||||
|
|
||||||
_assert_lossless_partition(md, slices)
|
|
||||||
table_slices = [s for s in slices if "| 1 | 2 |" in s.text]
|
|
||||||
assert len(table_slices) == 1
|
|
||||||
assert "| a | b |" in table_slices[0].text
|
|
||||||
|
|
||||||
|
|
||||||
def test_code_chunker_partition_and_spans():
|
|
||||||
code = "\n\n".join(
|
|
||||||
f"def func_{i}(x):\n total = x + {i}\n return total" for i in range(40)
|
|
||||||
)
|
|
||||||
|
|
||||||
slices = chunk_markdown_with_spans(code, use_code_chunker=True)
|
|
||||||
|
|
||||||
assert len(slices) >= 1
|
|
||||||
_assert_lossless_partition(code, slices)
|
|
||||||
|
|
||||||
|
|
||||||
def test_empty_markdown_yields_no_slices():
|
|
||||||
assert chunk_markdown_with_spans("") == []
|
|
||||||
|
|
@ -37,9 +37,12 @@ def _make_orm_doc(connector_doc, doc_id):
|
||||||
async def test_index_calls_embed_and_chunk_via_to_thread(
|
async def test_index_calls_embed_and_chunk_via_to_thread(
|
||||||
pipeline, make_connector_document, monkeypatch
|
pipeline, make_connector_document, monkeypatch
|
||||||
):
|
):
|
||||||
"""index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop."""
|
"""index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop.
|
||||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
|
||||||
|
|
||||||
|
Routing between ``chunk_text`` (code path) and ``chunk_text_hybrid`` (default
|
||||||
|
path, see issue #1334) is verified separately in
|
||||||
|
``test_non_code_documents_use_hybrid_chunker``.
|
||||||
|
"""
|
||||||
to_thread_calls = []
|
to_thread_calls = []
|
||||||
original_to_thread = asyncio.to_thread
|
original_to_thread = asyncio.to_thread
|
||||||
|
|
||||||
|
|
@ -48,11 +51,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
||||||
return await original_to_thread(func, *args, **kwargs)
|
return await original_to_thread(func, *args, **kwargs)
|
||||||
|
|
||||||
monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
|
monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
|
||||||
mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
|
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
|
||||||
mock_chunker.__name__ = "chunk_markdown_with_spans"
|
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||||
mock_chunker,
|
mock_chunk_hybrid,
|
||||||
)
|
)
|
||||||
mock_embed = MagicMock(
|
mock_embed = MagicMock(
|
||||||
side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
|
side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
|
||||||
|
|
@ -87,25 +90,34 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
||||||
|
|
||||||
await pipeline.index(document, connector_doc)
|
await pipeline.index(document, connector_doc)
|
||||||
|
|
||||||
assert "chunk_markdown_with_spans" in to_thread_calls
|
# Either chunker entry point satisfies the "chunking runs off the event
|
||||||
|
# loop" contract this test guards. Routing between the two is verified
|
||||||
|
# in test_non_code_documents_use_hybrid_chunker.
|
||||||
|
assert {"chunk_text", "chunk_text_hybrid"} & set(to_thread_calls)
|
||||||
assert "embed_texts" in to_thread_calls
|
assert "embed_texts" in to_thread_calls
|
||||||
assert document.status == DocumentStatus.ready()
|
assert document.status == DocumentStatus.ready()
|
||||||
|
|
||||||
|
|
||||||
async def test_non_code_documents_use_prose_chunker(
|
async def test_non_code_documents_use_hybrid_chunker(
|
||||||
pipeline, make_connector_document, monkeypatch
|
pipeline, make_connector_document, monkeypatch
|
||||||
):
|
):
|
||||||
"""Non-code documents chunk with use_code_chunker=False (issue #1334).
|
"""Non-code documents route through ``chunk_text_hybrid`` (issue #1334).
|
||||||
|
|
||||||
The table-aware prose path keeps Markdown tables intact; only documents
|
The hybrid chunker preserves Markdown table integrity by avoiding splits
|
||||||
flagged with ``should_use_code_chunker=True`` request the code chunker.
|
mid-row. Only documents flagged with ``should_use_code_chunker=True``
|
||||||
|
should take the ``chunk_text`` path.
|
||||||
"""
|
"""
|
||||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
|
||||||
|
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
|
||||||
mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||||
mock_chunker,
|
mock_chunk_hybrid,
|
||||||
|
)
|
||||||
|
mock_chunk_code = MagicMock(return_value=["chunk1"])
|
||||||
|
mock_chunk_code.__name__ = "chunk_text"
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
|
||||||
|
mock_chunk_code,
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
||||||
|
|
@ -137,49 +149,8 @@ async def test_non_code_documents_use_prose_chunker(
|
||||||
|
|
||||||
await pipeline.index(document, connector_doc)
|
await pipeline.index(document, connector_doc)
|
||||||
|
|
||||||
mock_chunker.assert_called_once()
|
mock_chunk_hybrid.assert_called_once()
|
||||||
assert mock_chunker.call_args.args[1] is False
|
mock_chunk_code.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
async def test_code_documents_request_code_chunker(
|
|
||||||
pipeline, make_connector_document, monkeypatch
|
|
||||||
):
|
|
||||||
"""Code-flagged documents forward use_code_chunker=True to the chunker."""
|
|
||||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
|
||||||
|
|
||||||
mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
|
|
||||||
mock_chunker,
|
|
||||||
)
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
|
||||||
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
|
|
||||||
)
|
|
||||||
monkeypatch.setattr(pipeline, "_load_existing_chunks", AsyncMock(return_value=[]))
|
|
||||||
|
|
||||||
async def _noop_persist(_session, doc, *_args, **_kwargs):
|
|
||||||
doc.status = DocumentStatus.ready()
|
|
||||||
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.persist_scratch_index",
|
|
||||||
_noop_persist,
|
|
||||||
)
|
|
||||||
|
|
||||||
connector_doc = make_connector_document(
|
|
||||||
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
|
|
||||||
unique_id="repo-1",
|
|
||||||
search_space_id=1,
|
|
||||||
should_use_code_chunker=True,
|
|
||||||
)
|
|
||||||
document = MagicMock(spec=Document)
|
|
||||||
document.id = 1
|
|
||||||
document.status = DocumentStatus.pending()
|
|
||||||
|
|
||||||
await pipeline.index(document, connector_doc)
|
|
||||||
|
|
||||||
mock_chunker.assert_called_once()
|
|
||||||
assert mock_chunker.call_args.args[1] is True
|
|
||||||
|
|
||||||
|
|
||||||
def _mock_session_factory(orm_docs_by_id):
|
def _mock_session_factory(orm_docs_by_id):
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,7 @@ class _KBBackendStub(KBPostgresBackend):
|
||||||
def __init__(self, *, children=None, file_data=None) -> None:
|
def __init__(self, *, children=None, file_data=None) -> None:
|
||||||
self.als_info = AsyncMock(return_value=children or [])
|
self.als_info = AsyncMock(return_value=children or [])
|
||||||
self._load_file_data = AsyncMock(
|
self._load_file_data = AsyncMock(
|
||||||
return_value=(file_data, 17, None) if file_data is not None else None
|
return_value=(file_data, 17) if file_data is not None else None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -69,25 +69,13 @@ class _FakeSession:
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None:
|
def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
"""Avoid loading the embedding model in unit tests.
|
"""Avoid loading the embedding model in unit tests."""
|
||||||
|
|
||||||
Mirrors the legacy stub: one chunk spanning the whole content, with a
|
|
||||||
zero summary/chunk vector, routed through the shared span builder.
|
|
||||||
"""
|
|
||||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
|
||||||
|
|
||||||
async def _fake_build_chunk_embeddings(content: str, *, use_code_chunker: bool):
|
|
||||||
summary = np.zeros(8, dtype=np.float32)
|
|
||||||
pairs = (
|
|
||||||
[(ChunkSlice(content, 0, len(content)), np.zeros(8, dtype=np.float32))]
|
|
||||||
if content
|
|
||||||
else []
|
|
||||||
)
|
|
||||||
return summary, pairs
|
|
||||||
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
kb_persistence, "build_chunk_embeddings", _fake_build_chunk_embeddings
|
kb_persistence,
|
||||||
|
"embed_texts",
|
||||||
|
lambda texts: [np.zeros(8, dtype=np.float32) for _ in texts],
|
||||||
)
|
)
|
||||||
|
monkeypatch.setattr(kb_persistence, "chunk_text", lambda content: [content])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
|
|
||||||
|
|
@ -1,92 +0,0 @@
|
||||||
"""Unit tests for the numbered-document read preamble."""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
|
|
||||||
build_read_preamble,
|
|
||||||
compute_matched_line_ranges,
|
|
||||||
)
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.unit
|
|
||||||
|
|
||||||
|
|
||||||
_BODY = "alpha\nbravo\ncharlie\ndelta"
|
|
||||||
|
|
||||||
|
|
||||||
class TestComputeMatchedLineRanges:
|
|
||||||
def test_maps_matched_chunk_spans_to_line_ranges(self):
|
|
||||||
chunks = [(1, 0, 12), (2, 12, len(_BODY))]
|
|
||||||
ranges = compute_matched_line_ranges(_BODY, chunks, {2})
|
|
||||||
assert ranges == [(3, 4)]
|
|
||||||
|
|
||||||
def test_includes_only_matched_chunks(self):
|
|
||||||
chunks = [(1, 0, 5), (2, 6, 11)]
|
|
||||||
ranges = compute_matched_line_ranges(_BODY, chunks, {1})
|
|
||||||
assert ranges == [(1, 1)]
|
|
||||||
|
|
||||||
def test_skips_chunks_without_spans(self):
|
|
||||||
chunks = [(1, None, None)]
|
|
||||||
ranges = compute_matched_line_ranges(_BODY, chunks, {1})
|
|
||||||
assert ranges == []
|
|
||||||
|
|
||||||
def test_sorted_and_deduplicated(self):
|
|
||||||
chunks = [(1, 12, len(_BODY)), (2, 0, 5), (3, 0, 5)]
|
|
||||||
ranges = compute_matched_line_ranges(_BODY, chunks, {1, 2, 3})
|
|
||||||
assert ranges == [(1, 1), (3, 4)]
|
|
||||||
|
|
||||||
|
|
||||||
class TestBuildReadPreamble:
|
|
||||||
def test_contains_document_metadata(self):
|
|
||||||
preamble = build_read_preamble(
|
|
||||||
document_id=42,
|
|
||||||
document_type="FILE",
|
|
||||||
title="Test Doc",
|
|
||||||
url="https://example.com",
|
|
||||||
matched_line_ranges=[],
|
|
||||||
)
|
|
||||||
assert "<document_id>42</document_id>" in preamble
|
|
||||||
assert "<document_type>FILE</document_type>" in preamble
|
|
||||||
assert "Test Doc" in preamble
|
|
||||||
assert "https://example.com" in preamble
|
|
||||||
|
|
||||||
def test_citation_hint_uses_document_id(self):
|
|
||||||
preamble = build_read_preamble(
|
|
||||||
document_id=42,
|
|
||||||
document_type="FILE",
|
|
||||||
title="Test Doc",
|
|
||||||
url="",
|
|
||||||
matched_line_ranges=[],
|
|
||||||
)
|
|
||||||
assert "[citation:d42#L" in preamble
|
|
||||||
|
|
||||||
def test_lists_matched_line_ranges(self):
|
|
||||||
preamble = build_read_preamble(
|
|
||||||
document_id=7,
|
|
||||||
document_type="NOTE",
|
|
||||||
title="Notes",
|
|
||||||
url="",
|
|
||||||
matched_line_ranges=[(12, 18), (40, 40)],
|
|
||||||
)
|
|
||||||
assert "<matched_lines>" in preamble
|
|
||||||
assert "12-18" in preamble
|
|
||||||
assert "40" in preamble
|
|
||||||
|
|
||||||
def test_omits_matched_lines_block_when_empty(self):
|
|
||||||
preamble = build_read_preamble(
|
|
||||||
document_id=7,
|
|
||||||
document_type="NOTE",
|
|
||||||
title="Notes",
|
|
||||||
url="",
|
|
||||||
matched_line_ranges=[],
|
|
||||||
)
|
|
||||||
assert "<matched_lines>" not in preamble
|
|
||||||
|
|
||||||
def test_ends_with_trailing_newline_so_body_follows_cleanly(self):
|
|
||||||
preamble = build_read_preamble(
|
|
||||||
document_id=1,
|
|
||||||
document_type="FILE",
|
|
||||||
title="t",
|
|
||||||
url="",
|
|
||||||
matched_line_ranges=[],
|
|
||||||
)
|
|
||||||
assert preamble.endswith("\n")
|
|
||||||
162
surfsense_backend/tests/unit/utils/test_async_retry.py
Normal file
162
surfsense_backend/tests/unit/utils/test_async_retry.py
Normal file
|
|
@ -0,0 +1,162 @@
|
||||||
|
"""Tests for async_retry utilities."""
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.connectors.exceptions import (
|
||||||
|
ConnectorAPIError,
|
||||||
|
ConnectorAuthError,
|
||||||
|
ConnectorError,
|
||||||
|
ConnectorRateLimitError,
|
||||||
|
ConnectorTimeoutError,
|
||||||
|
)
|
||||||
|
from app.utils.async_retry import _is_retryable, raise_for_status
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
|
||||||
|
def make_response(
|
||||||
|
status_code: int,
|
||||||
|
*,
|
||||||
|
headers: dict[str, str] | None = None,
|
||||||
|
json_body=None,
|
||||||
|
text_body: str = "",
|
||||||
|
):
|
||||||
|
kwargs = {
|
||||||
|
"status_code": status_code,
|
||||||
|
"headers": headers,
|
||||||
|
"request": httpx.Request("GET", "https://x"),
|
||||||
|
}
|
||||||
|
|
||||||
|
if json_body is not None:
|
||||||
|
kwargs["json"] = json_body
|
||||||
|
else:
|
||||||
|
kwargs["text"] = text_body
|
||||||
|
|
||||||
|
return httpx.Response(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def test_raise_for_status_does_not_raise_for_success():
|
||||||
|
response = make_response(200)
|
||||||
|
|
||||||
|
raise_for_status(response)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("retry_after_header", "expected"),
|
||||||
|
[
|
||||||
|
("5", 5.0),
|
||||||
|
(None, None),
|
||||||
|
("abc", None),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_raise_for_status_429(retry_after_header, expected):
|
||||||
|
headers = {}
|
||||||
|
if retry_after_header is not None:
|
||||||
|
headers["Retry-After"] = retry_after_header
|
||||||
|
|
||||||
|
response = make_response(
|
||||||
|
429,
|
||||||
|
headers=headers,
|
||||||
|
json_body={"detail": "rate limited"},
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(ConnectorRateLimitError) as exc_info:
|
||||||
|
raise_for_status(response)
|
||||||
|
|
||||||
|
exc = exc_info.value
|
||||||
|
assert exc.retry_after == expected
|
||||||
|
assert exc.response_body == {"detail": "rate limited"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("status_code", [401, 403])
|
||||||
|
def test_raise_for_status_auth_errors(status_code):
|
||||||
|
response = make_response(
|
||||||
|
status_code,
|
||||||
|
json_body={"error": "unauthorized"},
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(ConnectorAuthError) as exc_info:
|
||||||
|
raise_for_status(response)
|
||||||
|
|
||||||
|
exc = exc_info.value
|
||||||
|
assert exc.status_code == status_code
|
||||||
|
assert exc.response_body == {"error": "unauthorized"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_raise_for_status_gateway_timeout():
|
||||||
|
response = make_response(
|
||||||
|
504,
|
||||||
|
json_body={"error": "timeout"},
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(ConnectorTimeoutError):
|
||||||
|
raise_for_status(response)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("status_code", [500, 502])
|
||||||
|
def test_raise_for_status_server_errors(status_code):
|
||||||
|
response = make_response(
|
||||||
|
status_code,
|
||||||
|
json_body={"error": "server"},
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(ConnectorAPIError) as exc_info:
|
||||||
|
raise_for_status(response)
|
||||||
|
|
||||||
|
assert exc_info.value.status_code == status_code
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("status_code", [400, 404])
|
||||||
|
def test_raise_for_status_client_errors(status_code):
|
||||||
|
response = make_response(
|
||||||
|
status_code,
|
||||||
|
json_body={"error": "client"},
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(ConnectorAPIError) as exc_info:
|
||||||
|
raise_for_status(response)
|
||||||
|
|
||||||
|
assert exc_info.value.status_code == status_code
|
||||||
|
|
||||||
|
|
||||||
|
def test_raise_for_status_uses_text_when_json_parsing_fails():
|
||||||
|
response = make_response(
|
||||||
|
500,
|
||||||
|
text_body="Internal server error",
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(ConnectorAPIError) as exc_info:
|
||||||
|
raise_for_status(response)
|
||||||
|
|
||||||
|
assert exc_info.value.response_body == "Internal server error"
|
||||||
|
|
||||||
|
|
||||||
|
def test_connector_error_retryable_false():
|
||||||
|
exc = ConnectorError("boom")
|
||||||
|
|
||||||
|
assert _is_retryable(exc) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_rate_limit_error_is_retryable():
|
||||||
|
exc = ConnectorRateLimitError()
|
||||||
|
|
||||||
|
assert _is_retryable(exc) is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_timeout_exception_is_retryable():
|
||||||
|
exc = httpx.TimeoutException("timeout")
|
||||||
|
|
||||||
|
assert _is_retryable(exc) is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_connect_error_is_retryable():
|
||||||
|
exc = httpx.ConnectError("connection failed")
|
||||||
|
|
||||||
|
assert _is_retryable(exc) is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_unrelated_exception_is_not_retryable():
|
||||||
|
exc = ValueError("boom")
|
||||||
|
|
||||||
|
assert _is_retryable(exc) is False
|
||||||
293
surfsense_backend/tests/unit/utils/test_content_utils.py
Normal file
293
surfsense_backend/tests/unit/utils/test_content_utils.py
Normal file
|
|
@ -0,0 +1,293 @@
|
||||||
|
"""Tests for strip_markdown_fences() and extract_text_content() in
|
||||||
|
app/utils/content_utils.py.
|
||||||
|
|
||||||
|
Out of scope: bootstrap_history_from_db() — async + DB, belongs in
|
||||||
|
integration tests.
|
||||||
|
|
||||||
|
Run:
|
||||||
|
uv run pytest -m unit tests/unit/utils/test_content_utils.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# strip_markdown_fences()
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestStripMarkdownFences:
|
||||||
|
"""Tests for strip_markdown_fences(text: str) -> str.
|
||||||
|
|
||||||
|
Regex: r"^```(?:\\w+)?\\s*\\n(.*?)```\\s*$" (re.DOTALL)
|
||||||
|
Called on text.strip() — so surrounding whitespace is handled before
|
||||||
|
the regex runs. The captured group is also .strip()-ped before return.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Fenced with a language tag
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_json_fence_returns_inner_content(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = '```json\n{"key": "value"}\n```'
|
||||||
|
assert strip_markdown_fences(text) == '{"key": "value"}'
|
||||||
|
|
||||||
|
def test_python_fence_returns_inner_content(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = "```python\ndef hello():\n return 'hi'\n```"
|
||||||
|
assert strip_markdown_fences(text) == "def hello():\n return 'hi'"
|
||||||
|
|
||||||
|
def test_yaml_fence_returns_inner_content(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = "```yaml\nkey: value\n```"
|
||||||
|
assert strip_markdown_fences(text) == "key: value"
|
||||||
|
|
||||||
|
def test_sql_multiline_fence_returns_inner_content(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = "```sql\nSELECT *\nFROM users\nWHERE id = 1;\n```"
|
||||||
|
assert strip_markdown_fences(text) == "SELECT *\nFROM users\nWHERE id = 1;"
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Fenced without a language tag
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_no_lang_tag_single_line_returns_inner_content(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = "```\nhello world\n```"
|
||||||
|
assert strip_markdown_fences(text) == "hello world"
|
||||||
|
|
||||||
|
def test_no_lang_tag_multiline_returns_inner_content(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = "```\nline one\nline two\n```"
|
||||||
|
assert strip_markdown_fences(text) == "line one\nline two"
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Plain text — no fences → returned unchanged
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_plain_text_returned_unchanged(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = "just plain text with no fences"
|
||||||
|
assert strip_markdown_fences(text) == text
|
||||||
|
|
||||||
|
def test_plain_text_with_newlines_returned_unchanged(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = "line one\nline two\nline three"
|
||||||
|
assert strip_markdown_fences(text) == text
|
||||||
|
|
||||||
|
def test_empty_string_returned_unchanged(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
assert strip_markdown_fences("") == ""
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Surrounding whitespace handling
|
||||||
|
# The function calls text.strip() before matching, so leading/trailing
|
||||||
|
# whitespace outside the fence is consumed. The captured group is also
|
||||||
|
# .strip()-ped, so whitespace between the fence markers and content is
|
||||||
|
# removed too.
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_leading_whitespace_around_fence_stripped(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = " ```json\n{}\n```"
|
||||||
|
assert strip_markdown_fences(text) == "{}"
|
||||||
|
|
||||||
|
def test_trailing_whitespace_around_fence_stripped(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = "```json\n{}\n``` "
|
||||||
|
assert strip_markdown_fences(text) == "{}"
|
||||||
|
|
||||||
|
def test_surrounding_newlines_stripped(self):
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = '\n\n```json\n{"a": 1}\n```\n\n'
|
||||||
|
assert strip_markdown_fences(text) == '{"a": 1}'
|
||||||
|
|
||||||
|
def test_inner_indentation_preserved(self):
|
||||||
|
"""The captured group is .strip()-ped, so leading whitespace on the
|
||||||
|
*first* line is removed, but indentation on subsequent lines is kept."""
|
||||||
|
from app.utils.content_utils import strip_markdown_fences
|
||||||
|
|
||||||
|
text = "```\n indented line\n deeper indent\n```"
|
||||||
|
result = strip_markdown_fences(text)
|
||||||
|
# .strip() removes the leading spaces from the first captured line
|
||||||
|
assert "indented line" in result
|
||||||
|
# indentation on the second line is preserved
|
||||||
|
assert " deeper indent" in result
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# extract_text_content()
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractTextContent:
|
||||||
|
"""Tests for extract_text_content(content: str | dict | list) -> str."""
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# str input → returned as-is
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_str_input_returned_as_is(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
assert extract_text_content("hello world") == "hello world"
|
||||||
|
|
||||||
|
def test_str_empty_returned_as_is(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
assert extract_text_content("") == ""
|
||||||
|
|
||||||
|
def test_str_with_internal_whitespace_returned_as_is(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
assert extract_text_content(" spaced ") == " spaced "
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# dict with "text" key → return content["text"]
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_dict_with_text_key_returns_its_value(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
assert extract_text_content({"text": "from dict"}) == "from dict"
|
||||||
|
|
||||||
|
def test_dict_with_text_key_empty_value(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
assert extract_text_content({"text": ""}) == ""
|
||||||
|
|
||||||
|
def test_dict_with_text_key_ignores_other_keys(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
d = {"text": "important", "role": "assistant", "extra": 99}
|
||||||
|
assert extract_text_content(d) == "important"
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# dict without "text" key → str(dict)
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_dict_without_text_key_returns_str_repr(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
d = {"role": "assistant", "value": 42}
|
||||||
|
assert extract_text_content(d) == str(d)
|
||||||
|
|
||||||
|
def test_empty_dict_returns_str_repr(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
assert extract_text_content({}) == str({})
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# list of parts — text dicts and plain strings
|
||||||
|
# Parts are joined with "\n" (per implementation: "\n".join(texts))
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_list_text_type_parts_joined_with_newline(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
parts = [
|
||||||
|
{"type": "text", "text": "Hello"},
|
||||||
|
{"type": "text", "text": "world"},
|
||||||
|
]
|
||||||
|
assert extract_text_content(parts) == "Hello\nworld"
|
||||||
|
|
||||||
|
def test_list_plain_strings_joined_with_newline(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
parts = ["foo", "bar"]
|
||||||
|
assert extract_text_content(parts) == "foo\nbar"
|
||||||
|
|
||||||
|
def test_list_mixed_text_dicts_and_plain_strings(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
parts = [
|
||||||
|
{"type": "text", "text": "Hello"},
|
||||||
|
"plain",
|
||||||
|
{"type": "text", "text": "world"},
|
||||||
|
]
|
||||||
|
result = extract_text_content(parts)
|
||||||
|
assert "Hello" in result
|
||||||
|
assert "plain" in result
|
||||||
|
assert "world" in result
|
||||||
|
|
||||||
|
def test_list_non_text_type_parts_ignored(self):
|
||||||
|
"""tool_use, image, and other non-text blocks must not leak into output."""
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
parts = [
|
||||||
|
{"type": "tool_use", "id": "abc", "name": "search_kb"},
|
||||||
|
{"type": "text", "text": "visible text"},
|
||||||
|
{"type": "image", "source": {"url": "https://example.com/img.png"}},
|
||||||
|
]
|
||||||
|
result = extract_text_content(parts)
|
||||||
|
assert result == "visible text"
|
||||||
|
assert "tool_use" not in result
|
||||||
|
assert "search_kb" not in result
|
||||||
|
assert "image" not in result
|
||||||
|
|
||||||
|
def test_list_only_non_text_parts_returns_empty_string(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
parts = [
|
||||||
|
{"type": "tool_use", "id": "x"},
|
||||||
|
{"type": "image", "source": {}},
|
||||||
|
]
|
||||||
|
assert extract_text_content(parts) == ""
|
||||||
|
|
||||||
|
def test_list_single_text_part(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
parts = [{"type": "text", "text": "only me"}]
|
||||||
|
assert extract_text_content(parts) == "only me"
|
||||||
|
|
||||||
|
def test_list_text_part_missing_text_key_contributes_empty_string(self):
|
||||||
|
"""part.get("text", "") — a text-typed dict with no "text" key gives ""."""
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
parts = [{"type": "text"}, {"type": "text", "text": "after"}]
|
||||||
|
result = extract_text_content(parts)
|
||||||
|
# both parts collected; joined → "\nafter" or "after" depending on strip
|
||||||
|
assert "after" in result
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Empty list → empty string
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_empty_list_returns_empty_string(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
assert extract_text_content([]) == ""
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Unsupported types → empty string (the final bare `return ""`)
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_none_returns_empty_string(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
assert extract_text_content(None) == ""
|
||||||
|
|
||||||
|
def test_integer_returns_empty_string(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
assert extract_text_content(42) == ""
|
||||||
|
|
||||||
|
def test_boolean_returns_empty_string(self):
|
||||||
|
from app.utils.content_utils import extract_text_content
|
||||||
|
|
||||||
|
assert extract_text_content(True) == ""
|
||||||
|
|
@ -1,39 +0,0 @@
|
||||||
"""Unit tests for char-span -> line-range conversion."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from app.utils.text_spans import char_span_to_line_range
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.unit
|
|
||||||
|
|
||||||
_TEXT = "line1\nline2\nline3"
|
|
||||||
|
|
||||||
|
|
||||||
def test_single_line_span() -> None:
|
|
||||||
start = _TEXT.index("line2")
|
|
||||||
assert char_span_to_line_range(_TEXT, start, start + len("line2")) == (2, 2)
|
|
||||||
|
|
||||||
|
|
||||||
def test_first_line_span() -> None:
|
|
||||||
assert char_span_to_line_range(_TEXT, 0, len("line1")) == (1, 1)
|
|
||||||
|
|
||||||
|
|
||||||
def test_last_line_span() -> None:
|
|
||||||
start = _TEXT.index("line3")
|
|
||||||
assert char_span_to_line_range(_TEXT, start, len(_TEXT)) == (3, 3)
|
|
||||||
|
|
||||||
|
|
||||||
def test_multi_line_span() -> None:
|
|
||||||
# "line1\nline2" spans lines 1-2.
|
|
||||||
assert char_span_to_line_range(_TEXT, 0, _TEXT.index("line2") + 5) == (1, 2)
|
|
||||||
|
|
||||||
|
|
||||||
def test_empty_span_resolves_to_its_line() -> None:
|
|
||||||
start = _TEXT.index("line2")
|
|
||||||
assert char_span_to_line_range(_TEXT, start, start) == (2, 2)
|
|
||||||
|
|
||||||
|
|
||||||
def test_offsets_clamped_to_text_bounds() -> None:
|
|
||||||
assert char_span_to_line_range(_TEXT, -5, 10_000) == (1, 3)
|
|
||||||
|
|
@ -14,7 +14,10 @@ SURFSENSE_BACKEND_INTERNAL_URL=http://backend:8000
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
# Runtime configuration (read at runtime by the server, no rebuild needed)
|
# Runtime configuration (read at runtime by the server, no rebuild needed)
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Configure these plain variables for runtime behavior. They are read by server
|
||||||
|
# code when the app starts/serves requests, so changing them requires restarting
|
||||||
|
# the web process but not rebuilding the frontend bundle.
|
||||||
|
#
|
||||||
# Authentication method: LOCAL (email/password) or GOOGLE (OAuth).
|
# Authentication method: LOCAL (email/password) or GOOGLE (OAuth).
|
||||||
AUTH_TYPE=LOCAL
|
AUTH_TYPE=LOCAL
|
||||||
# Document parsing backend: DOCLING, LLAMACLOUD, etc.
|
# Document parsing backend: DOCLING, LLAMACLOUD, etc.
|
||||||
|
|
@ -22,16 +25,6 @@ ETL_SERVICE=DOCLING
|
||||||
# Deployment mode: self-hosted or cloud.
|
# Deployment mode: self-hosted or cloud.
|
||||||
DEPLOYMENT_MODE=self-hosted
|
DEPLOYMENT_MODE=self-hosted
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
|
||||||
# Build-time fallbacks for packaged clients (e.g. Electron) without a runtime
|
|
||||||
# config provider. Optional; Docker reads the plain runtime vars above first.
|
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
|
||||||
# NEXT_PUBLIC_AUTH_TYPE=GOOGLE
|
|
||||||
# NEXT_PUBLIC_ETL_SERVICE=DOCLING
|
|
||||||
# NEXT_PUBLIC_DEPLOYMENT_MODE=self-hosted
|
|
||||||
# Overrides the app version shown in the UI (defaults to package.json version).
|
|
||||||
# NEXT_PUBLIC_APP_VERSION=
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
# Database (Contact Form, optional)
|
# Database (Contact Form, optional)
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
@ -72,3 +65,20 @@ NEXT_PUBLIC_GOOGLE_ADSENSE_SLOT_FREE_HUB_BEFORE_FAQ=
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
NEXT_PUBLIC_GLOBAL_ANNOUNCEMENT_ENABLED=false
|
NEXT_PUBLIC_GLOBAL_ANNOUNCEMENT_ENABLED=false
|
||||||
NEXT_PUBLIC_GLOBAL_ANNOUNCEMENT_MESSAGE=
|
NEXT_PUBLIC_GLOBAL_ANNOUNCEMENT_MESSAGE=
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Internal build-time fallbacks
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
#
|
||||||
|
# Most deployments should leave these unset.
|
||||||
|
#
|
||||||
|
# These are only for SurfSense-managed production/cloud builds or packaged
|
||||||
|
# clients that do not have the normal server runtime config available.
|
||||||
|
#
|
||||||
|
# NEXT_PUBLIC_* values are embedded into the browser bundle during `next build`.
|
||||||
|
# Changing them after the bundle is built has no effect.
|
||||||
|
|
||||||
|
# NEXT_PUBLIC_AUTH_TYPE=GOOGLE
|
||||||
|
# NEXT_PUBLIC_ETL_SERVICE=DOCLING
|
||||||
|
# NEXT_PUBLIC_DEPLOYMENT_MODE=self-hosted
|
||||||
|
# NEXT_PUBLIC_APP_VERSION=
|
||||||
|
|
@ -58,6 +58,11 @@
|
||||||
--highlight: oklch(0.852 0.199 91.936);
|
--highlight: oklch(0.852 0.199 91.936);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
html[data-surfsense-auth-type="GOOGLE"] .runtime-auth-local,
|
||||||
|
html[data-surfsense-auth-type="LOCAL"] .runtime-auth-google {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
.dark {
|
.dark {
|
||||||
--background: oklch(0.145 0 0);
|
--background: oklch(0.145 0 0);
|
||||||
--foreground: oklch(0.985 0 0);
|
--foreground: oklch(0.985 0 0);
|
||||||
|
|
@ -270,12 +275,6 @@ button {
|
||||||
contain-intrinsic-size: 0 40px;
|
contain-intrinsic-size: 0 40px;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Monaco whole-line highlight for a cited source span (Phase E). */
|
|
||||||
.citation-line-highlight {
|
|
||||||
background-color: color-mix(in srgb, var(--primary) 16%, transparent);
|
|
||||||
box-shadow: inset 2px 0 0 0 var(--primary);
|
|
||||||
}
|
|
||||||
|
|
||||||
@source "../node_modules/@llamaindex/chat-ui/**/*.{ts,tsx}";
|
@source "../node_modules/@llamaindex/chat-ui/**/*.{ts,tsx}";
|
||||||
@source "../node_modules/streamdown/dist/*.js";
|
@source "../node_modules/streamdown/dist/*.js";
|
||||||
@source "../node_modules/@streamdown/code/dist/*.js";
|
@source "../node_modules/@streamdown/code/dist/*.js";
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ import type { Metadata, Viewport } from "next";
|
||||||
import "./globals.css";
|
import "./globals.css";
|
||||||
import { RootProvider } from "fumadocs-ui/provider/next";
|
import { RootProvider } from "fumadocs-ui/provider/next";
|
||||||
import { Roboto } from "next/font/google";
|
import { Roboto } from "next/font/google";
|
||||||
|
import Script from "next/script";
|
||||||
import { AnnouncementToastProvider } from "@/components/announcements/AnnouncementToastProvider";
|
import { AnnouncementToastProvider } from "@/components/announcements/AnnouncementToastProvider";
|
||||||
import { DesktopUpdateToast } from "@/components/desktop/desktop-update-toast";
|
import { DesktopUpdateToast } from "@/components/desktop/desktop-update-toast";
|
||||||
import { GlobalLoadingProvider } from "@/components/providers/GlobalLoadingProvider";
|
import { GlobalLoadingProvider } from "@/components/providers/GlobalLoadingProvider";
|
||||||
|
|
@ -16,8 +17,13 @@ import {
|
||||||
import { ThemeProvider } from "@/components/theme/theme-provider";
|
import { ThemeProvider } from "@/components/theme/theme-provider";
|
||||||
import { Toaster } from "@/components/ui/sonner";
|
import { Toaster } from "@/components/ui/sonner";
|
||||||
import { LocaleProvider } from "@/contexts/LocaleContext";
|
import { LocaleProvider } from "@/contexts/LocaleContext";
|
||||||
|
import { BUILD_TIME_AUTH_TYPE } from "@/lib/env-config";
|
||||||
import { PlatformProvider } from "@/contexts/platform-context";
|
import { PlatformProvider } from "@/contexts/platform-context";
|
||||||
import { ReactQueryClientProvider } from "@/lib/query-client/query-client.provider";
|
import { ReactQueryClientProvider } from "@/lib/query-client/query-client.provider";
|
||||||
|
import {
|
||||||
|
getRuntimeAuthInitScript,
|
||||||
|
resolveRuntimeAuthUiMode,
|
||||||
|
} from "@/lib/runtime-auth-config";
|
||||||
import { cn } from "@/lib/utils";
|
import { cn } from "@/lib/utils";
|
||||||
|
|
||||||
const roboto = Roboto({
|
const roboto = Roboto({
|
||||||
|
|
@ -131,8 +137,15 @@ export default function RootLayout({
|
||||||
// Language can be switched dynamically through LanguageSwitcher component
|
// Language can be switched dynamically through LanguageSwitcher component
|
||||||
// Locale state is managed by LocaleContext and persisted in localStorage
|
// Locale state is managed by LocaleContext and persisted in localStorage
|
||||||
return (
|
return (
|
||||||
<html lang="en" suppressHydrationWarning>
|
<html
|
||||||
|
lang="en"
|
||||||
|
data-surfsense-auth-type={resolveRuntimeAuthUiMode(BUILD_TIME_AUTH_TYPE)}
|
||||||
|
suppressHydrationWarning
|
||||||
|
>
|
||||||
<head>
|
<head>
|
||||||
|
<Script id="surfsense-runtime-auth-init" strategy="beforeInteractive">
|
||||||
|
{getRuntimeAuthInitScript(BUILD_TIME_AUTH_TYPE)}
|
||||||
|
</Script>
|
||||||
<link rel="preconnect" href="https://api.github.com" />
|
<link rel="preconnect" href="https://api.github.com" />
|
||||||
<OrganizationJsonLd />
|
<OrganizationJsonLd />
|
||||||
<WebSiteJsonLd />
|
<WebSiteJsonLd />
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,6 @@
|
||||||
import { atom } from "jotai";
|
import { atom } from "jotai";
|
||||||
import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom";
|
import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom";
|
||||||
|
|
||||||
export interface EditorLineRange {
|
|
||||||
start: number;
|
|
||||||
end: number;
|
|
||||||
}
|
|
||||||
|
|
||||||
interface EditorPanelState {
|
interface EditorPanelState {
|
||||||
isOpen: boolean;
|
isOpen: boolean;
|
||||||
kind: "document" | "local_file" | "memory";
|
kind: "document" | "local_file" | "memory";
|
||||||
|
|
@ -14,10 +9,6 @@ interface EditorPanelState {
|
||||||
searchSpaceId: number | null;
|
searchSpaceId: number | null;
|
||||||
memoryScope: "user" | "team" | null;
|
memoryScope: "user" | "team" | null;
|
||||||
title: string | null;
|
title: string | null;
|
||||||
// Citation line anchor: when set, the editor opens the raw source view
|
|
||||||
// scrolled to and highlighting this 1-based inclusive line range.
|
|
||||||
highlightLines: EditorLineRange | null;
|
|
||||||
forceSourceView: boolean;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const initialState: EditorPanelState = {
|
const initialState: EditorPanelState = {
|
||||||
|
|
@ -28,8 +19,6 @@ const initialState: EditorPanelState = {
|
||||||
searchSpaceId: null,
|
searchSpaceId: null,
|
||||||
memoryScope: null,
|
memoryScope: null,
|
||||||
title: null,
|
title: null,
|
||||||
highlightLines: null,
|
|
||||||
forceSourceView: false,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
export const editorPanelAtom = atom<EditorPanelState>(initialState);
|
export const editorPanelAtom = atom<EditorPanelState>(initialState);
|
||||||
|
|
@ -44,14 +33,7 @@ export const openEditorPanelAtom = atom(
|
||||||
get,
|
get,
|
||||||
set,
|
set,
|
||||||
payload:
|
payload:
|
||||||
| {
|
| { documentId: number; searchSpaceId: number; title?: string; kind?: "document" }
|
||||||
documentId: number;
|
|
||||||
searchSpaceId: number;
|
|
||||||
title?: string;
|
|
||||||
kind?: "document";
|
|
||||||
highlightLines?: EditorLineRange | null;
|
|
||||||
forceSourceView?: boolean;
|
|
||||||
}
|
|
||||||
| {
|
| {
|
||||||
kind: "local_file";
|
kind: "local_file";
|
||||||
localFilePath: string;
|
localFilePath: string;
|
||||||
|
|
@ -77,8 +59,6 @@ export const openEditorPanelAtom = atom(
|
||||||
searchSpaceId: payload.searchSpaceId ?? null,
|
searchSpaceId: payload.searchSpaceId ?? null,
|
||||||
memoryScope: null,
|
memoryScope: null,
|
||||||
title: payload.title ?? null,
|
title: payload.title ?? null,
|
||||||
highlightLines: null,
|
|
||||||
forceSourceView: false,
|
|
||||||
});
|
});
|
||||||
set(rightPanelTabAtom, "editor");
|
set(rightPanelTabAtom, "editor");
|
||||||
set(rightPanelCollapsedAtom, false);
|
set(rightPanelCollapsedAtom, false);
|
||||||
|
|
@ -93,8 +73,6 @@ export const openEditorPanelAtom = atom(
|
||||||
searchSpaceId: payload.searchSpaceId ?? null,
|
searchSpaceId: payload.searchSpaceId ?? null,
|
||||||
memoryScope: payload.memoryScope,
|
memoryScope: payload.memoryScope,
|
||||||
title: payload.title ?? null,
|
title: payload.title ?? null,
|
||||||
highlightLines: null,
|
|
||||||
forceSourceView: false,
|
|
||||||
});
|
});
|
||||||
set(rightPanelTabAtom, "editor");
|
set(rightPanelTabAtom, "editor");
|
||||||
set(rightPanelCollapsedAtom, false);
|
set(rightPanelCollapsedAtom, false);
|
||||||
|
|
@ -108,8 +86,6 @@ export const openEditorPanelAtom = atom(
|
||||||
searchSpaceId: payload.searchSpaceId,
|
searchSpaceId: payload.searchSpaceId,
|
||||||
memoryScope: null,
|
memoryScope: null,
|
||||||
title: payload.title ?? null,
|
title: payload.title ?? null,
|
||||||
highlightLines: payload.highlightLines ?? null,
|
|
||||||
forceSourceView: payload.forceSourceView ?? false,
|
|
||||||
});
|
});
|
||||||
set(rightPanelTabAtom, "editor");
|
set(rightPanelTabAtom, "editor");
|
||||||
set(rightPanelCollapsedAtom, false);
|
set(rightPanelCollapsedAtom, false);
|
||||||
|
|
|
||||||
|
|
@ -2,11 +2,9 @@
|
||||||
|
|
||||||
import { useSetAtom } from "jotai";
|
import { useSetAtom } from "jotai";
|
||||||
import { FileText } from "lucide-react";
|
import { FileText } from "lucide-react";
|
||||||
import { useParams } from "next/navigation";
|
|
||||||
import type { FC } from "react";
|
import type { FC } from "react";
|
||||||
import { useId, useState } from "react";
|
import { useId, useState } from "react";
|
||||||
import { openCitationPanelAtom } from "@/atoms/citation/citation-panel.atom";
|
import { openCitationPanelAtom } from "@/atoms/citation/citation-panel.atom";
|
||||||
import { openEditorPanelAtom } from "@/atoms/editor/editor-panel.atom";
|
|
||||||
import { useCitationMetadata } from "@/components/assistant-ui/citation-metadata-context";
|
import { useCitationMetadata } from "@/components/assistant-ui/citation-metadata-context";
|
||||||
import { CitationPanelContent } from "@/components/citation-panel/citation-panel";
|
import { CitationPanelContent } from "@/components/citation-panel/citation-panel";
|
||||||
import { Citation } from "@/components/tool-ui/citation";
|
import { Citation } from "@/components/tool-ui/citation";
|
||||||
|
|
@ -110,50 +108,6 @@ const NumericChunkCitation: FC<{ chunkId: number }> = ({ chunkId }) => {
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
interface LineCitationProps {
|
|
||||||
documentId: number;
|
|
||||||
startLine: number;
|
|
||||||
endLine: number;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Inline citation for a knowledge-base document line range
|
|
||||||
* (`[citation:d<documentId>#L<start>-<end>]`). Clicking opens the document in
|
|
||||||
* the editor's read-only source view, scrolled to and highlighting the cited
|
|
||||||
* lines — the same anchor the citation panel uses for chunk citations.
|
|
||||||
*/
|
|
||||||
export const LineCitation: FC<LineCitationProps> = ({ documentId, startLine, endLine }) => {
|
|
||||||
const openEditorPanel = useSetAtom(openEditorPanelAtom);
|
|
||||||
const params = useParams();
|
|
||||||
const searchSpaceId = Number(params?.search_space_id);
|
|
||||||
|
|
||||||
const label = startLine === endLine ? `L${startLine}` : `L${startLine}-${endLine}`;
|
|
||||||
|
|
||||||
const handleClick = () => {
|
|
||||||
if (!Number.isFinite(searchSpaceId)) return;
|
|
||||||
openEditorPanel({
|
|
||||||
documentId,
|
|
||||||
searchSpaceId,
|
|
||||||
highlightLines: { start: startLine, end: endLine },
|
|
||||||
forceSourceView: true,
|
|
||||||
});
|
|
||||||
};
|
|
||||||
|
|
||||||
return (
|
|
||||||
<Button
|
|
||||||
type="button"
|
|
||||||
variant="ghost"
|
|
||||||
onClick={handleClick}
|
|
||||||
className="ml-0.5 inline-flex h-5 min-w-5 items-center justify-center gap-0.5 rounded-md bg-popover px-1.5 text-[11px] font-medium text-popover-foreground/80 align-baseline"
|
|
||||||
title={`View cited lines ${startLine}–${endLine}`}
|
|
||||||
aria-label={`View cited document lines ${startLine} to ${endLine}`}
|
|
||||||
>
|
|
||||||
<FileText className="size-3" />
|
|
||||||
{label}
|
|
||||||
</Button>
|
|
||||||
);
|
|
||||||
};
|
|
||||||
|
|
||||||
import { tryGetHostname } from "@/lib/url";
|
import { tryGetHostname } from "@/lib/url";
|
||||||
|
|
||||||
interface UrlCitationProps {
|
interface UrlCitationProps {
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@
|
||||||
import Link from "next/link";
|
import Link from "next/link";
|
||||||
import { useState } from "react";
|
import { useState } from "react";
|
||||||
import { Button } from "@/components/ui/button";
|
import { Button } from "@/components/ui/button";
|
||||||
import { BUILD_TIME_AUTH_TYPE, buildBackendUrl } from "@/lib/env-config";
|
import { buildBackendUrl } from "@/lib/env-config";
|
||||||
import { trackLoginAttempt } from "@/lib/posthog/events";
|
import { trackLoginAttempt } from "@/lib/posthog/events";
|
||||||
import { cn } from "@/lib/utils";
|
import { cn } from "@/lib/utils";
|
||||||
|
|
||||||
|
|
@ -46,7 +46,6 @@ interface SignInButtonProps {
|
||||||
}
|
}
|
||||||
|
|
||||||
export const SignInButton = ({ variant = "desktop" }: SignInButtonProps) => {
|
export const SignInButton = ({ variant = "desktop" }: SignInButtonProps) => {
|
||||||
const isGoogleAuth = BUILD_TIME_AUTH_TYPE === "GOOGLE";
|
|
||||||
const [isRedirecting, setIsRedirecting] = useState(false);
|
const [isRedirecting, setIsRedirecting] = useState(false);
|
||||||
|
|
||||||
const handleGoogleLogin = () => {
|
const handleGoogleLogin = () => {
|
||||||
|
|
@ -56,44 +55,45 @@ export const SignInButton = ({ variant = "desktop" }: SignInButtonProps) => {
|
||||||
window.location.href = buildBackendUrl("/auth/google/authorize-redirect");
|
window.location.href = buildBackendUrl("/auth/google/authorize-redirect");
|
||||||
};
|
};
|
||||||
|
|
||||||
const getClassName = () => {
|
const getGoogleClassName = () => {
|
||||||
if (variant === "desktop") {
|
if (variant === "desktop") {
|
||||||
return isGoogleAuth
|
return "hidden rounded-full border border-white bg-white px-5 py-2 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] md:flex dark:border-white";
|
||||||
? "hidden rounded-full border border-white bg-white px-5 py-2 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] md:flex dark:border-white"
|
|
||||||
: "hidden rounded-full bg-black px-8 py-2 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] md:block dark:bg-white dark:text-black";
|
|
||||||
}
|
}
|
||||||
if (variant === "compact") {
|
if (variant === "compact") {
|
||||||
return isGoogleAuth
|
return "rounded-full border border-white bg-white px-4 py-1.5 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white";
|
||||||
? "rounded-full border border-white bg-white px-4 py-1.5 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white"
|
|
||||||
: "rounded-full bg-black px-6 py-1.5 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black";
|
|
||||||
}
|
}
|
||||||
// mobile
|
// mobile
|
||||||
return isGoogleAuth
|
return "w-full rounded-lg border border-white bg-white px-8 py-2.5 font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white touch-manipulation";
|
||||||
? "w-full rounded-lg border border-white bg-white px-8 py-2.5 font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white touch-manipulation"
|
|
||||||
: "w-full rounded-lg bg-black px-8 py-2 font-medium text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black text-center touch-manipulation";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
if (isGoogleAuth) {
|
const getLocalClassName = () => {
|
||||||
return (
|
if (variant === "desktop") {
|
||||||
|
return "hidden rounded-full bg-black px-8 py-2 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] md:block dark:bg-white dark:text-black";
|
||||||
|
}
|
||||||
|
if (variant === "compact") {
|
||||||
|
return "rounded-full bg-black px-6 py-1.5 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black";
|
||||||
|
}
|
||||||
|
return "w-full rounded-lg bg-black px-8 py-2 font-medium text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black text-center touch-manipulation";
|
||||||
|
};
|
||||||
|
|
||||||
|
return (
|
||||||
|
<>
|
||||||
<Button
|
<Button
|
||||||
type="button"
|
type="button"
|
||||||
variant="ghost"
|
variant="ghost"
|
||||||
onClick={handleGoogleLogin}
|
onClick={handleGoogleLogin}
|
||||||
disabled={isRedirecting}
|
disabled={isRedirecting}
|
||||||
className={cn(
|
className={cn(
|
||||||
"flex items-center justify-center gap-2 transition-colors duration-200 disabled:cursor-not-allowed disabled:opacity-50",
|
"runtime-auth-google flex items-center justify-center gap-2 transition-colors duration-200 disabled:cursor-not-allowed disabled:opacity-50",
|
||||||
getClassName()
|
getGoogleClassName()
|
||||||
)}
|
)}
|
||||||
>
|
>
|
||||||
<GoogleLogo className="h-4 w-4" />
|
<GoogleLogo className="h-4 w-4" />
|
||||||
<span>Sign In</span>
|
<span>Sign In</span>
|
||||||
</Button>
|
</Button>
|
||||||
);
|
<Link href="/login" className={cn("runtime-auth-local", getLocalClassName())}>
|
||||||
}
|
Sign In
|
||||||
|
</Link>
|
||||||
return (
|
</>
|
||||||
<Link href="/login" className={getClassName()}>
|
|
||||||
Sign In
|
|
||||||
</Link>
|
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -46,13 +46,6 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
|
||||||
|
|
||||||
const cited = useMemo(() => data?.chunks.find((c) => c.id === chunkId) ?? null, [data, chunkId]);
|
const cited = useMemo(() => data?.chunks.find((c) => c.id === chunkId) ?? null, [data, chunkId]);
|
||||||
|
|
||||||
const citedLineLabel = useMemo(() => {
|
|
||||||
const start = data?.cited_start_line;
|
|
||||||
const end = data?.cited_end_line;
|
|
||||||
if (start == null || end == null) return null;
|
|
||||||
return start === end ? `Line ${start}` : `Lines ${start}–${end}`;
|
|
||||||
}, [data?.cited_start_line, data?.cited_end_line]);
|
|
||||||
|
|
||||||
const totalChunks = data?.total_chunks ?? data?.chunks.length ?? 0;
|
const totalChunks = data?.total_chunks ?? data?.chunks.length ?? 0;
|
||||||
const startIndex = data?.chunk_start_index ?? 0;
|
const startIndex = data?.chunk_start_index ?? 0;
|
||||||
const hasMoreAbove = startIndex > 0;
|
const hasMoreAbove = startIndex > 0;
|
||||||
|
|
@ -82,15 +75,10 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
|
||||||
|
|
||||||
const handleOpenFullDocument = () => {
|
const handleOpenFullDocument = () => {
|
||||||
if (!data) return;
|
if (!data) return;
|
||||||
const hasLineAnchor = data.cited_start_line != null && data.cited_end_line != null;
|
|
||||||
openEditorPanel({
|
openEditorPanel({
|
||||||
documentId: data.id,
|
documentId: data.id,
|
||||||
searchSpaceId: data.search_space_id,
|
searchSpaceId: data.search_space_id,
|
||||||
title: data.title,
|
title: data.title,
|
||||||
highlightLines: hasLineAnchor
|
|
||||||
? { start: data.cited_start_line as number, end: data.cited_end_line as number }
|
|
||||||
: null,
|
|
||||||
forceSourceView: hasLineAnchor,
|
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -122,7 +110,6 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
<div className="flex items-center gap-3 shrink-0 text-[11px] text-muted-foreground">
|
<div className="flex items-center gap-3 shrink-0 text-[11px] text-muted-foreground">
|
||||||
{citedLineLabel && <span>{citedLineLabel}</span>}
|
|
||||||
{totalChunks > 0 && <span>{totalChunks} chunks</span>}
|
{totalChunks > 0 && <span>{totalChunks} chunks</span>}
|
||||||
{!isLoading && !error && data && (
|
{!isLoading && !error && data && (
|
||||||
<Button
|
<Button
|
||||||
|
|
@ -185,9 +172,7 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
|
||||||
Chunk #{chunk.id}
|
Chunk #{chunk.id}
|
||||||
</span>
|
</span>
|
||||||
{isCited && (
|
{isCited && (
|
||||||
<span className="text-[11px] font-semibold text-primary">
|
<span className="text-[11px] font-semibold text-primary">Cited chunk</span>
|
||||||
{citedLineLabel ? `Cited chunk · ${citedLineLabel}` : "Cited chunk"}
|
|
||||||
</span>
|
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
<div className="text-sm">
|
<div className="text-sm">
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
"use client";
|
"use client";
|
||||||
|
|
||||||
import type { ReactNode } from "react";
|
import type { ReactNode } from "react";
|
||||||
import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
|
import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
|
||||||
import {
|
import {
|
||||||
type CitationToken,
|
type CitationToken,
|
||||||
type CitationUrlMap,
|
type CitationUrlMap,
|
||||||
|
|
@ -21,16 +21,6 @@ export function renderCitationToken(token: CitationToken, ordinalKey: number): R
|
||||||
if (token.kind === "url") {
|
if (token.kind === "url") {
|
||||||
return <UrlCitation key={`citation-url-${ordinalKey}`} url={token.url} />;
|
return <UrlCitation key={`citation-url-${ordinalKey}`} url={token.url} />;
|
||||||
}
|
}
|
||||||
if (token.kind === "line") {
|
|
||||||
return (
|
|
||||||
<LineCitation
|
|
||||||
key={`citation-line-${token.documentId}-${token.startLine}-${ordinalKey}`}
|
|
||||||
documentId={token.documentId}
|
|
||||||
startLine={token.startLine}
|
|
||||||
endLine={token.endLine}
|
|
||||||
/>
|
|
||||||
);
|
|
||||||
}
|
|
||||||
return (
|
return (
|
||||||
<InlineCitation
|
<InlineCitation
|
||||||
key={`citation-${token.isDocsChunk ? "doc-" : ""}${token.chunkId}-${ordinalKey}`}
|
key={`citation-${token.isDocsChunk ? "doc-" : ""}${token.chunkId}-${ordinalKey}`}
|
||||||
|
|
|
||||||
|
|
@ -149,8 +149,6 @@ export function EditorPanelContent({
|
||||||
searchSpaceId,
|
searchSpaceId,
|
||||||
title,
|
title,
|
||||||
onClose,
|
onClose,
|
||||||
highlightLines = null,
|
|
||||||
forceSourceView = false,
|
|
||||||
}: {
|
}: {
|
||||||
kind?: "document" | "local_file" | "memory";
|
kind?: "document" | "local_file" | "memory";
|
||||||
documentId?: number;
|
documentId?: number;
|
||||||
|
|
@ -159,8 +157,6 @@ export function EditorPanelContent({
|
||||||
searchSpaceId?: number;
|
searchSpaceId?: number;
|
||||||
title: string | null;
|
title: string | null;
|
||||||
onClose?: () => void;
|
onClose?: () => void;
|
||||||
highlightLines?: { start: number; end: number } | null;
|
|
||||||
forceSourceView?: boolean;
|
|
||||||
}) {
|
}) {
|
||||||
const electronAPI = useElectronAPI();
|
const electronAPI = useElectronAPI();
|
||||||
const [editorDoc, setEditorDoc] = useState<EditorContent | null>(null);
|
const [editorDoc, setEditorDoc] = useState<EditorContent | null>(null);
|
||||||
|
|
@ -209,7 +205,7 @@ export function EditorPanelContent({
|
||||||
const isLargeDocument = docSizeBytes > plateMaxBytes || docLineCount > plateMaxLines;
|
const isLargeDocument = docSizeBytes > plateMaxBytes || docLineCount > plateMaxLines;
|
||||||
const viewerMode: ViewerMode = isMemoryMode
|
const viewerMode: ViewerMode = isMemoryMode
|
||||||
? "plate"
|
? "plate"
|
||||||
: editorDoc?.viewer_mode === "monaco" || isLargeDocument || forceSourceView
|
: editorDoc?.viewer_mode === "monaco" || isLargeDocument
|
||||||
? "monaco"
|
? "monaco"
|
||||||
: "plate";
|
: "plate";
|
||||||
|
|
||||||
|
|
@ -832,7 +828,6 @@ export function EditorPanelContent({
|
||||||
value={editorDoc.source_markdown}
|
value={editorDoc.source_markdown}
|
||||||
readOnly
|
readOnly
|
||||||
onChange={() => {}}
|
onChange={() => {}}
|
||||||
highlightLines={highlightLines}
|
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -923,8 +918,6 @@ function DesktopEditorPanel() {
|
||||||
searchSpaceId={panelState.searchSpaceId ?? undefined}
|
searchSpaceId={panelState.searchSpaceId ?? undefined}
|
||||||
title={panelState.title}
|
title={panelState.title}
|
||||||
onClose={closePanel}
|
onClose={closePanel}
|
||||||
highlightLines={panelState.highlightLines}
|
|
||||||
forceSourceView={panelState.forceSourceView}
|
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
|
|
@ -964,8 +957,6 @@ function MobileEditorDrawer() {
|
||||||
memoryScope={panelState.memoryScope ?? undefined}
|
memoryScope={panelState.memoryScope ?? undefined}
|
||||||
searchSpaceId={panelState.searchSpaceId ?? undefined}
|
searchSpaceId={panelState.searchSpaceId ?? undefined}
|
||||||
title={panelState.title}
|
title={panelState.title}
|
||||||
highlightLines={panelState.highlightLines}
|
|
||||||
forceSourceView={panelState.forceSourceView}
|
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
</DrawerContent>
|
</DrawerContent>
|
||||||
|
|
|
||||||
|
|
@ -3,10 +3,9 @@
|
||||||
import { type Descendant, KEYS } from "platejs";
|
import { type Descendant, KEYS } from "platejs";
|
||||||
import { createPlatePlugin, type PlateElementProps } from "platejs/react";
|
import { createPlatePlugin, type PlateElementProps } from "platejs/react";
|
||||||
import type { FC } from "react";
|
import type { FC } from "react";
|
||||||
import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
|
import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
|
||||||
import {
|
import {
|
||||||
CITATION_REGEX,
|
CITATION_REGEX,
|
||||||
type CitationToken,
|
|
||||||
type CitationUrlMap,
|
type CitationUrlMap,
|
||||||
parseTextWithCitations,
|
parseTextWithCitations,
|
||||||
} from "@/lib/citations/citation-parser";
|
} from "@/lib/citations/citation-parser";
|
||||||
|
|
@ -18,12 +17,9 @@ import {
|
||||||
*/
|
*/
|
||||||
export type CitationElementNode = {
|
export type CitationElementNode = {
|
||||||
type: "citation";
|
type: "citation";
|
||||||
kind: "chunk" | "doc" | "url" | "line";
|
kind: "chunk" | "doc" | "url";
|
||||||
chunkId?: number;
|
chunkId?: number;
|
||||||
url?: string;
|
url?: string;
|
||||||
documentId?: number;
|
|
||||||
startLine?: number;
|
|
||||||
endLine?: number;
|
|
||||||
/** Original literal token that produced this citation node. */
|
/** Original literal token that produced this citation node. */
|
||||||
rawText: string;
|
rawText: string;
|
||||||
children: [{ text: "" }];
|
children: [{ text: "" }];
|
||||||
|
|
@ -37,22 +33,11 @@ const CitationElement: FC<PlateElementProps<CitationElementNode>> = ({
|
||||||
element,
|
element,
|
||||||
}) => {
|
}) => {
|
||||||
const isUrl = element.kind === "url";
|
const isUrl = element.kind === "url";
|
||||||
const isLine =
|
|
||||||
element.kind === "line" &&
|
|
||||||
element.documentId !== undefined &&
|
|
||||||
element.startLine !== undefined &&
|
|
||||||
element.endLine !== undefined;
|
|
||||||
return (
|
return (
|
||||||
<span {...attributes} className="inline-flex align-baseline">
|
<span {...attributes} className="inline-flex align-baseline">
|
||||||
<span contentEditable={false}>
|
<span contentEditable={false}>
|
||||||
{isUrl && element.url ? (
|
{isUrl && element.url ? (
|
||||||
<UrlCitation url={element.url} />
|
<UrlCitation url={element.url} />
|
||||||
) : isLine ? (
|
|
||||||
<LineCitation
|
|
||||||
documentId={element.documentId as number}
|
|
||||||
startLine={element.startLine as number}
|
|
||||||
endLine={element.endLine as number}
|
|
||||||
/>
|
|
||||||
) : element.chunkId !== undefined ? (
|
) : element.chunkId !== undefined ? (
|
||||||
<InlineCitation chunkId={element.chunkId} isDocsChunk={element.kind === "doc"} />
|
<InlineCitation chunkId={element.chunkId} isDocsChunk={element.kind === "doc"} />
|
||||||
) : null}
|
) : null}
|
||||||
|
|
@ -112,7 +97,10 @@ function copyMarks(textNode: SlateText): Record<string, unknown> {
|
||||||
return marks;
|
return marks;
|
||||||
}
|
}
|
||||||
|
|
||||||
function makeCitationElement(rawText: string, segment: CitationToken): CitationElementNode {
|
function makeCitationElement(
|
||||||
|
rawText: string,
|
||||||
|
segment: { kind: "url"; url: string } | { kind: "chunk"; chunkId: number; isDocsChunk: boolean }
|
||||||
|
): CitationElementNode {
|
||||||
if (segment.kind === "url") {
|
if (segment.kind === "url") {
|
||||||
return {
|
return {
|
||||||
type: CITATION_TYPE,
|
type: CITATION_TYPE,
|
||||||
|
|
@ -122,17 +110,6 @@ function makeCitationElement(rawText: string, segment: CitationToken): CitationE
|
||||||
children: [{ text: "" }],
|
children: [{ text: "" }],
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
if (segment.kind === "line") {
|
|
||||||
return {
|
|
||||||
type: CITATION_TYPE,
|
|
||||||
kind: "line",
|
|
||||||
documentId: segment.documentId,
|
|
||||||
startLine: segment.startLine,
|
|
||||||
endLine: segment.endLine,
|
|
||||||
rawText,
|
|
||||||
children: [{ text: "" }],
|
|
||||||
};
|
|
||||||
}
|
|
||||||
return {
|
return {
|
||||||
type: CITATION_TYPE,
|
type: CITATION_TYPE,
|
||||||
kind: segment.isDocsChunk ? "doc" : "chunk",
|
kind: segment.isDocsChunk ? "doc" : "chunk",
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import dynamic from "next/dynamic";
|
import dynamic from "next/dynamic";
|
||||||
import { useTheme } from "next-themes";
|
import { useTheme } from "next-themes";
|
||||||
import { useCallback, useEffect, useRef } from "react";
|
import { useEffect, useRef } from "react";
|
||||||
import { Spinner } from "@/components/ui/spinner";
|
import { Spinner } from "@/components/ui/spinner";
|
||||||
|
|
||||||
const MonacoEditor = dynamic(() => import("@monaco-editor/react"), {
|
const MonacoEditor = dynamic(() => import("@monaco-editor/react"), {
|
||||||
|
|
@ -17,8 +17,6 @@ interface SourceCodeEditorProps {
|
||||||
readOnly?: boolean;
|
readOnly?: boolean;
|
||||||
fontSize?: number;
|
fontSize?: number;
|
||||||
onSave?: () => Promise<void> | void;
|
onSave?: () => Promise<void> | void;
|
||||||
/** 1-based inclusive line range to reveal and highlight (e.g. a citation). */
|
|
||||||
highlightLines?: { start: number; end: number } | null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export function SourceCodeEditor({
|
export function SourceCodeEditor({
|
||||||
|
|
@ -29,45 +27,10 @@ export function SourceCodeEditor({
|
||||||
readOnly = false,
|
readOnly = false,
|
||||||
fontSize = 12,
|
fontSize = 12,
|
||||||
onSave,
|
onSave,
|
||||||
highlightLines = null,
|
|
||||||
}: SourceCodeEditorProps) {
|
}: SourceCodeEditorProps) {
|
||||||
const { resolvedTheme } = useTheme();
|
const { resolvedTheme } = useTheme();
|
||||||
const onSaveRef = useRef(onSave);
|
const onSaveRef = useRef(onSave);
|
||||||
const monacoRef = useRef<any>(null);
|
const monacoRef = useRef<any>(null);
|
||||||
const editorRef = useRef<any>(null);
|
|
||||||
const decorationsRef = useRef<any>(null);
|
|
||||||
const highlightLinesRef = useRef(highlightLines);
|
|
||||||
highlightLinesRef.current = highlightLines;
|
|
||||||
|
|
||||||
const applyHighlight = useCallback(() => {
|
|
||||||
const editor = editorRef.current;
|
|
||||||
const monaco = monacoRef.current;
|
|
||||||
if (!editor || !monaco) return;
|
|
||||||
if (decorationsRef.current) {
|
|
||||||
decorationsRef.current.clear();
|
|
||||||
decorationsRef.current = null;
|
|
||||||
}
|
|
||||||
const range = highlightLinesRef.current;
|
|
||||||
if (!range) return;
|
|
||||||
const lineCount = editor.getModel()?.getLineCount() ?? range.end;
|
|
||||||
const start = Math.min(Math.max(1, Math.floor(range.start)), lineCount);
|
|
||||||
const end = Math.min(Math.max(start, Math.floor(range.end)), lineCount);
|
|
||||||
try {
|
|
||||||
decorationsRef.current = editor.createDecorationsCollection([
|
|
||||||
{
|
|
||||||
range: new monaco.Range(start, 1, end, 1),
|
|
||||||
options: { isWholeLine: true, className: "citation-line-highlight" },
|
|
||||||
},
|
|
||||||
]);
|
|
||||||
} catch {
|
|
||||||
// Decoration failure must not block the reveal below.
|
|
||||||
}
|
|
||||||
editor.revealLinesInCenter(start, end, monaco.editor.ScrollType.Immediate);
|
|
||||||
}, []);
|
|
||||||
|
|
||||||
useEffect(() => {
|
|
||||||
applyHighlight();
|
|
||||||
}, [applyHighlight, highlightLines?.start, highlightLines?.end]);
|
|
||||||
const normalizedModelPath = (() => {
|
const normalizedModelPath = (() => {
|
||||||
const raw = (path || "local-file.txt").trim();
|
const raw = (path || "local-file.txt").trim();
|
||||||
const withLeadingSlash = raw.startsWith("/") ? raw : `/${raw}`;
|
const withLeadingSlash = raw.startsWith("/") ? raw : `/${raw}`;
|
||||||
|
|
@ -141,16 +104,7 @@ export function SourceCodeEditor({
|
||||||
}}
|
}}
|
||||||
onMount={(editor, monaco) => {
|
onMount={(editor, monaco) => {
|
||||||
monacoRef.current = monaco;
|
monacoRef.current = monaco;
|
||||||
editorRef.current = editor;
|
|
||||||
applySidebarTheme(monaco);
|
applySidebarTheme(monaco);
|
||||||
// Reveal now, then once more after the first layout settles:
|
|
||||||
// the panel slide-in animation means the editor often has no
|
|
||||||
// usable viewport height on the initial frame.
|
|
||||||
applyHighlight();
|
|
||||||
const layoutSub = editor.onDidLayoutChange(() => {
|
|
||||||
applyHighlight();
|
|
||||||
layoutSub.dispose();
|
|
||||||
});
|
|
||||||
if (!isManualSaveEnabled) return;
|
if (!isManualSaveEnabled) return;
|
||||||
editor.addCommand(monaco.KeyMod.CtrlCmd | monaco.KeyCode.KeyS, () => {
|
editor.addCommand(monaco.KeyMod.CtrlCmd | monaco.KeyCode.KeyS, () => {
|
||||||
void onSaveRef.current?.();
|
void onSaveRef.current?.();
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ import {
|
||||||
getAssetLabel,
|
getAssetLabel,
|
||||||
usePrimaryDownload,
|
usePrimaryDownload,
|
||||||
} from "@/lib/desktop-download-utils";
|
} from "@/lib/desktop-download-utils";
|
||||||
import { BUILD_TIME_AUTH_TYPE, buildBackendUrl } from "@/lib/env-config";
|
import { buildBackendUrl } from "@/lib/env-config";
|
||||||
import { trackLoginAttempt } from "@/lib/posthog/events";
|
import { trackLoginAttempt } from "@/lib/posthog/events";
|
||||||
import { cn } from "@/lib/utils";
|
import { cn } from "@/lib/utils";
|
||||||
|
|
||||||
|
|
@ -314,7 +314,6 @@ export function HeroSection() {
|
||||||
}
|
}
|
||||||
|
|
||||||
function GetStartedButton() {
|
function GetStartedButton() {
|
||||||
const isGoogleAuth = BUILD_TIME_AUTH_TYPE === "GOOGLE";
|
|
||||||
const [isRedirecting, setIsRedirecting] = useState(false);
|
const [isRedirecting, setIsRedirecting] = useState(false);
|
||||||
|
|
||||||
const handleGoogleLogin = () => {
|
const handleGoogleLogin = () => {
|
||||||
|
|
@ -324,29 +323,26 @@ function GetStartedButton() {
|
||||||
window.location.href = buildBackendUrl("/auth/google/authorize-redirect");
|
window.location.href = buildBackendUrl("/auth/google/authorize-redirect");
|
||||||
};
|
};
|
||||||
|
|
||||||
if (isGoogleAuth) {
|
return (
|
||||||
return (
|
<>
|
||||||
<Button
|
<Button
|
||||||
type="button"
|
type="button"
|
||||||
variant="ghost"
|
variant="ghost"
|
||||||
onClick={handleGoogleLogin}
|
onClick={handleGoogleLogin}
|
||||||
disabled={isRedirecting}
|
disabled={isRedirecting}
|
||||||
className="h-14 w-full cursor-pointer gap-3 rounded-lg border border-white bg-white text-center text-base font-medium text-[#1f1f1f] shadow-sm transition duration-150 hover:bg-zinc-100 hover:text-[#1f1f1f] sm:w-56 dark:border-white"
|
className="runtime-auth-google h-14 w-full cursor-pointer gap-3 rounded-lg border border-white bg-white text-center text-base font-medium text-[#1f1f1f] shadow-sm transition duration-150 hover:bg-zinc-100 hover:text-[#1f1f1f] sm:w-56 dark:border-white"
|
||||||
>
|
>
|
||||||
<GoogleLogo className="h-5 w-5" />
|
<GoogleLogo className="h-5 w-5" />
|
||||||
<span>Continue with Google</span>
|
<span>Continue with Google</span>
|
||||||
</Button>
|
</Button>
|
||||||
);
|
<Button
|
||||||
}
|
asChild
|
||||||
|
variant="ghost"
|
||||||
return (
|
className="runtime-auth-local h-14 w-full rounded-lg bg-black text-center text-base font-medium text-white shadow-sm ring-1 shadow-black/10 ring-black/10 transition duration-150 active:scale-98 hover:bg-black sm:w-52 dark:bg-white dark:text-black dark:hover:bg-white"
|
||||||
<Button
|
>
|
||||||
asChild
|
<Link href="/login">Get Started</Link>
|
||||||
variant="ghost"
|
</Button>
|
||||||
className="h-14 w-full rounded-lg bg-black text-center text-base font-medium text-white shadow-sm ring-1 shadow-black/10 ring-black/10 transition duration-150 active:scale-98 hover:bg-black sm:w-52 dark:bg-white dark:text-black dark:hover:bg-white"
|
</>
|
||||||
>
|
|
||||||
<Link href="/login">Get Started</Link>
|
|
||||||
</Button>
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,6 @@ import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right
|
||||||
import { Button } from "@/components/ui/button";
|
import { Button } from "@/components/ui/button";
|
||||||
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
|
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
|
||||||
import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/features/chat-messages/hitl";
|
import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/features/chat-messages/hitl";
|
||||||
import { useMediaQuery } from "@/hooks/use-media-query";
|
|
||||||
import { cn } from "@/lib/utils";
|
import { cn } from "@/lib/utils";
|
||||||
import { DocumentsSidebar } from "../sidebar";
|
import { DocumentsSidebar } from "../sidebar";
|
||||||
|
|
||||||
|
|
@ -197,9 +196,6 @@ export function RightPanel({
|
||||||
const citationState = useAtomValue(citationPanelAtom);
|
const citationState = useAtomValue(citationPanelAtom);
|
||||||
const closeCitation = useSetAtom(closeCitationPanelAtom);
|
const closeCitation = useSetAtom(closeCitationPanelAtom);
|
||||||
const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom);
|
const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom);
|
||||||
// Desktop-only surface; mobile uses the dedicated Mobile* drawers. Without
|
|
||||||
// this guard both render together and two editors fight over one model.
|
|
||||||
const isDesktop = useMediaQuery("(min-width: 1024px)");
|
|
||||||
|
|
||||||
const documentsOpen = documentsPanel?.open ?? false;
|
const documentsOpen = documentsPanel?.open ?? false;
|
||||||
const reportOpen = reportState.isOpen && !!reportState.reportId;
|
const reportOpen = reportState.isOpen && !!reportState.reportId;
|
||||||
|
|
@ -271,7 +267,7 @@ export function RightPanel({
|
||||||
<CollapseButton onClick={() => setCollapsed(true)} />
|
<CollapseButton onClick={() => setCollapsed(true)} />
|
||||||
) : null;
|
) : null;
|
||||||
|
|
||||||
if (!isVisible || !isDesktop) return null;
|
if (!isVisible) return null;
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<aside
|
<aside
|
||||||
|
|
@ -312,8 +308,6 @@ export function RightPanel({
|
||||||
searchSpaceId={editorState.searchSpaceId ?? undefined}
|
searchSpaceId={editorState.searchSpaceId ?? undefined}
|
||||||
title={editorState.title}
|
title={editorState.title}
|
||||||
onClose={closeEditor}
|
onClose={closeEditor}
|
||||||
highlightLines={editorState.highlightLines}
|
|
||||||
forceSourceView={editorState.forceSourceView}
|
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
|
||||||
|
|
@ -272,6 +272,7 @@ export function ModelSelector({
|
||||||
type="button"
|
type="button"
|
||||||
variant="ghost"
|
variant="ghost"
|
||||||
size="sm"
|
size="sm"
|
||||||
|
aria-label="Select chat model"
|
||||||
className={cn(
|
className={cn(
|
||||||
"h-8 min-w-0 gap-2 rounded-md px-3 text-muted-foreground transition-colors",
|
"h-8 min-w-0 gap-2 rounded-md px-3 text-muted-foreground transition-colors",
|
||||||
"select-none",
|
"select-none",
|
||||||
|
|
|
||||||
|
|
@ -70,15 +70,10 @@ export const documentWithChunks = document.extend({
|
||||||
id: z.number(),
|
id: z.number(),
|
||||||
content: z.string(),
|
content: z.string(),
|
||||||
created_at: z.string(),
|
created_at: z.string(),
|
||||||
start_char: z.number().nullable().optional(),
|
|
||||||
end_char: z.number().nullable().optional(),
|
|
||||||
})
|
})
|
||||||
),
|
),
|
||||||
total_chunks: z.number().optional().default(0),
|
total_chunks: z.number().optional().default(0),
|
||||||
chunk_start_index: z.number().optional().default(0),
|
chunk_start_index: z.number().optional().default(0),
|
||||||
// 1-based inclusive line range of the cited chunk within source_markdown.
|
|
||||||
cited_start_line: z.number().nullable().optional(),
|
|
||||||
cited_end_line: z.number().nullable().optional(),
|
|
||||||
});
|
});
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -18,16 +18,12 @@ import { FENCED_OR_INLINE_CODE } from "@/lib/markdown/code-regions";
|
||||||
* sometimes emit.
|
* sometimes emit.
|
||||||
*/
|
*/
|
||||||
export const CITATION_REGEX =
|
export const CITATION_REGEX =
|
||||||
/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|d\d+#L\d+-\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g;
|
/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g;
|
||||||
|
|
||||||
/** Matches the knowledge-base line-citation form `d<documentId>#L<start>-<end>`. */
|
|
||||||
const LINE_CITATION_REGEX = /^d(\d+)#L(\d+)-(\d+)$/;
|
|
||||||
|
|
||||||
/** A single parsed citation reference. */
|
/** A single parsed citation reference. */
|
||||||
export type CitationToken =
|
export type CitationToken =
|
||||||
| { kind: "url"; url: string }
|
| { kind: "url"; url: string }
|
||||||
| { kind: "chunk"; chunkId: number; isDocsChunk: boolean }
|
| { kind: "chunk"; chunkId: number; isDocsChunk: boolean };
|
||||||
| { kind: "line"; documentId: number; startLine: number; endLine: number };
|
|
||||||
|
|
||||||
/** Output of `parseTextWithCitations` — interleaved text + citation tokens. */
|
/** Output of `parseTextWithCitations` — interleaved text + citation tokens. */
|
||||||
export type ParsedSegment = string | CitationToken;
|
export type ParsedSegment = string | CitationToken;
|
||||||
|
|
@ -99,15 +95,7 @@ export function parseTextWithCitations(text: string, urlMap: CitationUrlMap): Pa
|
||||||
|
|
||||||
const captured = match[1];
|
const captured = match[1];
|
||||||
|
|
||||||
const lineMatch = LINE_CITATION_REGEX.exec(captured);
|
if (captured.startsWith("http://") || captured.startsWith("https://")) {
|
||||||
if (lineMatch) {
|
|
||||||
segments.push({
|
|
||||||
kind: "line",
|
|
||||||
documentId: Number.parseInt(lineMatch[1], 10),
|
|
||||||
startLine: Number.parseInt(lineMatch[2], 10),
|
|
||||||
endLine: Number.parseInt(lineMatch[3], 10),
|
|
||||||
});
|
|
||||||
} else if (captured.startsWith("http://") || captured.startsWith("https://")) {
|
|
||||||
segments.push({ kind: "url", url: captured.trim() });
|
segments.push({ kind: "url", url: captured.trim() });
|
||||||
} else if (captured.startsWith("urlcite")) {
|
} else if (captured.startsWith("urlcite")) {
|
||||||
const url = urlMap.get(captured);
|
const url = urlMap.get(captured);
|
||||||
|
|
|
||||||
52
surfsense_web/lib/runtime-auth-config.ts
Normal file
52
surfsense_web/lib/runtime-auth-config.ts
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
export const RUNTIME_AUTH_TYPE_COOKIE_NAME = "surfsense_auth_type";
|
||||||
|
|
||||||
|
export type RuntimeAuthUiMode = "GOOGLE" | "LOCAL";
|
||||||
|
|
||||||
|
/**
 * Resolve a raw auth-type string to one of the two supported UI modes.
 *
 * Both inputs are compared after trimming and upper-casing, so the match is
 * case-insensitive and tolerant of surrounding whitespace.
 *
 * @param value - Preferred raw value. Used when it normalizes to exactly
 *   "GOOGLE" or "LOCAL".
 * @param fallback - Consulted only when `value` is missing or unrecognized.
 *   Defaults to "GOOGLE" when omitted (an explicit `null` does not trigger
 *   the default).
 * @returns "GOOGLE" or "LOCAL". If neither input is recognized the result is
 *   "LOCAL", because the fallback is only honoured when it normalizes to
 *   "GOOGLE".
 */
export function resolveRuntimeAuthUiMode(
	value: string | null | undefined,
	fallback: string | null | undefined = "GOOGLE"
): RuntimeAuthUiMode {
	const candidate = value?.trim().toUpperCase();
	if (candidate === "GOOGLE") return "GOOGLE";
	if (candidate === "LOCAL") return "LOCAL";

	// Anything that is not explicitly GOOGLE (including null/undefined/garbage)
	// collapses to LOCAL.
	const fallbackCandidate = fallback?.trim().toUpperCase();
	return fallbackCandidate === "GOOGLE" ? "GOOGLE" : "LOCAL";
}
|
||||||
|
|
||||||
|
/**
 * Build the source text of a self-invoking inline script that publishes the
 * auth UI mode to the page.
 *
 * When executed in a browser, the generated script:
 *  1. looks for the `RUNTIME_AUTH_TYPE_COOKIE_NAME` cookie in `document.cookie`,
 *  2. normalizes its value to "GOOGLE" or "LOCAL" (trimmed, case-insensitive),
 *     using the resolved fallback when the cookie is absent or unrecognized —
 *     the same precedence `resolveRuntimeAuthUiMode` applies,
 *  3. stores the result on `window.__SURFSENSE_AUTH_TYPE__` and on the
 *     `data-surfsense-auth-type` attribute of `<html>`.
 *
 * Any exception inside the script (for example a malformed percent-encoding
 * passed to `decodeURIComponent`) is caught and the fallback is published
 * instead.
 *
 * @param fallbackAuthType - Raw auth type used when no usable cookie exists;
 *   it is normalized through `resolveRuntimeAuthUiMode` before being embedded.
 * @returns JavaScript source text for the init script.
 */
export function getRuntimeAuthInitScript(fallbackAuthType: string): string {
	const fallback = resolveRuntimeAuthUiMode(fallbackAuthType);
	// JSON.stringify yields correctly quoted/escaped JS string literals.
	const cookieName = JSON.stringify(RUNTIME_AUTH_TYPE_COOKIE_NAME);
	const fallbackValue = JSON.stringify(fallback);

	return `
(function() {
	try {
		var cookieName = ${cookieName};
		var fallback = ${fallbackValue};
		var prefix = cookieName + "=";
		var rawValue = fallback;
		var cookies = document.cookie ? document.cookie.split(";") : [];
		for (var i = 0; i < cookies.length; i++) {
			var cookie = cookies[i].trim();
			if (cookie.indexOf(prefix) === 0) {
				rawValue = decodeURIComponent(cookie.slice(prefix.length));
				break;
			}
		}
		var candidate = String(rawValue || fallback).trim().toUpperCase();
		var normalized = candidate === "GOOGLE" || candidate === "LOCAL" ? candidate : fallback;
		window.__SURFSENSE_AUTH_TYPE__ = normalized;
		document.documentElement.setAttribute("data-surfsense-auth-type", normalized);
	} catch (_) {
		window.__SURFSENSE_AUTH_TYPE__ = ${fallbackValue};
		document.documentElement.setAttribute("data-surfsense-auth-type", ${fallbackValue});
	}
})();
`;
}
|
||||||
|
|
||||||
|
declare global {
|
||||||
|
interface Window {
|
||||||
|
__SURFSENSE_AUTH_TYPE__?: RuntimeAuthUiMode;
|
||||||
|
}
|
||||||
|
}
|
||||||
24
surfsense_web/proxy.ts
Normal file
24
surfsense_web/proxy.ts
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
import { NextResponse, type NextRequest } from "next/server";
|
||||||
|
import { BUILD_TIME_AUTH_TYPE } from "@/lib/env-config";
|
||||||
|
import {
|
||||||
|
RUNTIME_AUTH_TYPE_COOKIE_NAME,
|
||||||
|
resolveRuntimeAuthUiMode,
|
||||||
|
} from "@/lib/runtime-auth-config";
|
||||||
|
|
||||||
|
/**
 * Request proxy that attaches the resolved auth UI mode to the response as a
 * cookie.
 *
 * The mode is taken from the `AUTH_TYPE` environment variable, with
 * `BUILD_TIME_AUTH_TYPE` as the fallback. The request itself is passed through
 * untouched via `NextResponse.next()`.
 *
 * The cookie is set without `httpOnly`, so script running in the page can read
 * it. It is marked `secure` only when the incoming request URL uses HTTPS.
 *
 * @param request - Incoming request; only its URL protocol is inspected.
 * @returns The pass-through response carrying the auth-type cookie.
 */
export function proxy(request: NextRequest) {
	const authType = resolveRuntimeAuthUiMode(process.env.AUTH_TYPE, BUILD_TIME_AUTH_TYPE);
	const servedOverHttps = request.nextUrl.protocol === "https:";
	const oneYearInSeconds = 60 * 60 * 24 * 365;

	const response = NextResponse.next();
	response.cookies.set(RUNTIME_AUTH_TYPE_COOKIE_NAME, authType, {
		path: "/",
		maxAge: oneYearInSeconds,
		sameSite: "lax",
		secure: servedOverHttps,
	});
	return response;
}
|
||||||
|
|
||||||
|
export const config = {
|
||||||
|
matcher: ["/((?!api|auth|_next/static|_next/image|favicon.ico|.*\\..*).*)"],
|
||||||
|
};
|
||||||
Loading…
Add table
Add a link
Reference in a new issue