mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-24 21:38:09 +02:00
Merge remote-tracking branch 'upstream/dev' into feat/api-key
This commit is contained in:
commit
3695e1d5c5
64 changed files with 1043 additions and 1852 deletions
|
|
@ -18,6 +18,7 @@ skipped (e.g. client disconnect).
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
|
|
@ -57,8 +58,9 @@ from app.db import (
|
|||
FolderRevision,
|
||||
shielded_async_session,
|
||||
)
|
||||
from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings
|
||||
from app.indexing_pipeline.document_chunker import chunk_text
|
||||
from app.utils.document_converters import (
|
||||
embed_texts,
|
||||
generate_content_hash,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
|
|
@ -232,23 +234,24 @@ async def _create_document(
|
|||
session.add(doc)
|
||||
await session.flush()
|
||||
|
||||
summary_embedding, chunk_embeddings = await build_chunk_embeddings(
|
||||
content, use_code_chunker=False
|
||||
)
|
||||
summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
|
||||
doc.embedding = summary_embedding
|
||||
session.add_all(
|
||||
[
|
||||
Chunk(
|
||||
document_id=doc.id,
|
||||
content=sl.text,
|
||||
embedding=embedding,
|
||||
position=i,
|
||||
start_char=sl.start_char,
|
||||
end_char=sl.end_char,
|
||||
)
|
||||
for i, (sl, embedding) in enumerate(chunk_embeddings)
|
||||
]
|
||||
)
|
||||
chunks = chunk_text(content)
|
||||
if chunks:
|
||||
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
|
||||
session.add_all(
|
||||
[
|
||||
Chunk(
|
||||
document_id=doc.id,
|
||||
content=text,
|
||||
embedding=embedding,
|
||||
position=i,
|
||||
)
|
||||
for i, (text, embedding) in enumerate(
|
||||
zip(chunks, chunk_embeddings, strict=True)
|
||||
)
|
||||
]
|
||||
)
|
||||
return doc
|
||||
|
||||
|
||||
|
|
@ -284,25 +287,26 @@ async def _update_document(
|
|||
search_space_id,
|
||||
)
|
||||
|
||||
summary_embedding, chunk_embeddings = await build_chunk_embeddings(
|
||||
content, use_code_chunker=False
|
||||
)
|
||||
summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
|
||||
document.embedding = summary_embedding
|
||||
|
||||
await session.execute(delete(Chunk).where(Chunk.document_id == document.id))
|
||||
session.add_all(
|
||||
[
|
||||
Chunk(
|
||||
document_id=document.id,
|
||||
content=sl.text,
|
||||
embedding=embedding,
|
||||
position=i,
|
||||
start_char=sl.start_char,
|
||||
end_char=sl.end_char,
|
||||
)
|
||||
for i, (sl, embedding) in enumerate(chunk_embeddings)
|
||||
]
|
||||
)
|
||||
chunks = chunk_text(content)
|
||||
if chunks:
|
||||
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
|
||||
session.add_all(
|
||||
[
|
||||
Chunk(
|
||||
document_id=document.id,
|
||||
content=text,
|
||||
embedding=embedding,
|
||||
position=i,
|
||||
)
|
||||
for i, (text, embedding) in enumerate(
|
||||
zip(chunks, chunk_embeddings, strict=True)
|
||||
)
|
||||
]
|
||||
)
|
||||
return document
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,58 +1,42 @@
|
|||
<citations>
|
||||
Citations reach the answer through three channels. Use whichever applies, and
|
||||
never invent ids you didn't see: ids are matched exactly, so a wrong one
|
||||
silently breaks the link — when in doubt, omit. Always write a citation as
|
||||
plain `[citation:…]` brackets — no markdown links, no footnote numbers, no
|
||||
parentheses.
|
||||
Citations reach the answer through two channels. Use whichever applies — and
|
||||
never invent ids you didn't see. Citation ids are resolved by exact-match
|
||||
lookup; a wrong id silently breaks the link, so when in doubt, omit.
|
||||
|
||||
### Channel A — web_search chunk blocks injected this turn
|
||||
### Channel A — chunk blocks injected this turn
|
||||
When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this
|
||||
turn, the chunk `id` is the result's URL:
|
||||
turn:
|
||||
|
||||
1. For each factual statement taken from a chunk, add `[citation:<url>]`
|
||||
using the **exact** id from a visible `<chunk id='…'>` tag. Copy the
|
||||
URL verbatim; do not retype it from memory.
|
||||
2. Multiple chunks → `[citation:url1], [citation:url2]` (comma-separated,
|
||||
1. For each factual statement taken from those chunks, add
|
||||
`[citation:chunk_id]` using the **exact** id from a visible
|
||||
`<chunk id='…'>` tag. Copy digit-for-digit (or the URL verbatim);
|
||||
do not retype from memory.
|
||||
2. `<document_id>` is the parent doc id, **not** a citation source —
|
||||
only ids inside `<chunk id='…'>` count.
|
||||
3. Multiple chunks → `[citation:id1], [citation:id2]` (comma-separated,
|
||||
each id copied individually).
|
||||
3. Never invent, normalise, or guess at a URL; if unsure, omit.
|
||||
4. Never invent, normalise, or guess at adjacent ids; if unsure, omit.
|
||||
5. Plain brackets only — no markdown links, no footnote numbering.
|
||||
|
||||
### Channel B — citations relayed by a `task` specialist
|
||||
A `task(...)` tool message may contain `[citation:…]` markers the
|
||||
specialist already attached to its prose — line citations
|
||||
(`[citation:d<id>#L<a>-<b>]`) or chunk ids (`[citation:N]`). The
|
||||
specialist read the underlying document and tied each marker to a
|
||||
passage; you didn't. So:
|
||||
A `task(...)` tool message may contain `[citation:<chunk_id>]` markers
|
||||
the specialist already attached to its prose. The specialist saw the
|
||||
underlying `<chunk id='…'>` blocks; you didn't. So:
|
||||
|
||||
1. **Preserve those markers verbatim** in your final answer — do not
|
||||
reformat, renumber, drop, or wrap them in markdown links. When you
|
||||
paraphrase a specialist sentence, copy the marker character-for-
|
||||
character; do not regenerate it from memory (LLMs reliably corrupt
|
||||
nearby digits).
|
||||
character; do not regenerate the id from memory (LLMs reliably
|
||||
corrupt nearby digits).
|
||||
2. Keep each marker attached to the sentence the specialist attached
|
||||
it to.
|
||||
3. Do **not** add new `[citation:…]` markers of your own to a
|
||||
specialist's prose; if a fact has no marker, the specialist
|
||||
couldn't tie it to a source and neither can you.
|
||||
couldn't tie it to a chunk and neither can you.
|
||||
4. When a specialist returns JSON, the citation markers live inside
|
||||
the prose-bearing fields (e.g. a summary or excerpt). Pull them
|
||||
along with the surrounding sentence when you quote.
|
||||
|
||||
### Channel C — your knowledge base (search hits and `read_file`)
|
||||
Knowledge-base facts are cited by line range using the document id:
|
||||
`[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
|
||||
|
||||
1. `search_knowledge_base` prints a ready `[citation:d…#L…-…]` token above each
|
||||
matched passage. When that passage supports your point, copy the token
|
||||
verbatim — that is the entire citation.
|
||||
2. When you `read_file` a `/documents/...` path, its header gives the
|
||||
`<document_id>` and an optional `<matched_lines>` pointer, and the body is
|
||||
shown with line numbers; cite the lines you actually used. Use `read_file`
|
||||
when you need more context than a search passage shows.
|
||||
3. Copy document ids and line numbers exactly as shown — never estimate,
|
||||
shift, or invent them.
|
||||
4. Older documents without a numbered body instead show `<chunk id='N'>`
|
||||
blocks; cite those with `[citation:N]`, copying the id exactly.
|
||||
|
||||
If none of these channels surfaces a citable source this turn, do not
|
||||
fabricate citations.
|
||||
If neither channel surfaces citation markers this turn, do not fabricate
|
||||
them.
|
||||
</citations>
|
||||
|
|
|
|||
|
|
@ -33,7 +33,6 @@ from app.agents.chat.runtime.path_resolver import (
|
|||
)
|
||||
from app.db import Document, shielded_async_session
|
||||
from app.utils.perf import get_perf_logger
|
||||
from app.utils.text_spans import char_span_to_line_range
|
||||
|
||||
_perf_log = get_perf_logger()
|
||||
|
||||
|
|
@ -57,16 +56,12 @@ _TOOL_DESCRIPTION = (
|
|||
)
|
||||
|
||||
|
||||
async def _resolve_doc_context(
|
||||
async def _resolve_virtual_paths(
|
||||
results: list[dict[str, Any]],
|
||||
*,
|
||||
search_space_id: int,
|
||||
) -> tuple[dict[int, str], dict[int, str]]:
|
||||
"""Resolve ``Document.id`` -> (canonical virtual path, source_markdown).
|
||||
|
||||
``source_markdown`` is the canonical body the chunk spans index into; the
|
||||
renderer uses it to turn a chunk's char span into a line range.
|
||||
"""
|
||||
) -> dict[int, str]:
|
||||
"""Resolve ``Document.id`` -> canonical virtual path for the search hits."""
|
||||
doc_ids = [
|
||||
doc_id
|
||||
for doc_id in (
|
||||
|
|
@ -77,24 +72,17 @@ async def _resolve_doc_context(
|
|||
if isinstance(doc_id, int)
|
||||
]
|
||||
if not doc_ids:
|
||||
return {}, {}
|
||||
return {}
|
||||
|
||||
async with shielded_async_session() as session:
|
||||
index: PathIndex = await build_path_index(session, search_space_id)
|
||||
rows = await session.execute(
|
||||
select(
|
||||
Document.id, Document.folder_id, Document.source_markdown
|
||||
).where(
|
||||
folder_rows = await session.execute(
|
||||
select(Document.id, Document.folder_id).where(
|
||||
Document.search_space_id == search_space_id,
|
||||
Document.id.in_(doc_ids),
|
||||
)
|
||||
)
|
||||
folder_by_doc_id: dict[int, int | None] = {}
|
||||
bodies: dict[int, str] = {}
|
||||
for row in rows.all():
|
||||
folder_by_doc_id[row.id] = row.folder_id
|
||||
if row.source_markdown:
|
||||
bodies[row.id] = row.source_markdown
|
||||
folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()}
|
||||
|
||||
paths: dict[int, str] = {}
|
||||
for doc in results:
|
||||
|
|
@ -109,76 +97,13 @@ async def _resolve_doc_context(
|
|||
folder_id=folder_id if isinstance(folder_id, int) else None,
|
||||
index=index,
|
||||
)
|
||||
return paths, bodies
|
||||
|
||||
|
||||
def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str:
|
||||
"""Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without spans."""
|
||||
start = chunk.get("start_char")
|
||||
end = chunk.get("end_char")
|
||||
if (
|
||||
not body
|
||||
or not isinstance(doc_id, int)
|
||||
or not isinstance(start, int)
|
||||
or not isinstance(end, int)
|
||||
):
|
||||
return ""
|
||||
start_line, end_line = char_span_to_line_range(body, start, end)
|
||||
return f"[citation:d{doc_id}#L{start_line}-{end_line}]"
|
||||
|
||||
|
||||
def _render_passage(
|
||||
chunk: dict[str, Any], body: str | None, doc_id: int | None
|
||||
) -> str | None:
|
||||
"""Render one matched chunk as an indented passage tagged with its token."""
|
||||
content = (chunk.get("content") or "").strip()
|
||||
if not content:
|
||||
return None
|
||||
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
|
||||
if len(content) > _PER_DOC_SNIPPET_CHARS:
|
||||
snippet += " ..."
|
||||
indented = snippet.replace("\n", "\n ")
|
||||
token = _citation_token(chunk, body, doc_id)
|
||||
head = f"\n {token}" if token else ""
|
||||
return f"{head}\n {indented}"
|
||||
|
||||
|
||||
def _matched_passages(
|
||||
doc: dict[str, Any], body: str | None, doc_id: int | None
|
||||
) -> str:
|
||||
"""Render the RRF-matched chunks; '' when none can be rendered."""
|
||||
by_id = {
|
||||
c.get("chunk_id"): c
|
||||
for c in (doc.get("chunks") or [])
|
||||
if isinstance(c, dict)
|
||||
}
|
||||
rendered: list[str] = []
|
||||
for chunk_id in doc.get("matched_chunk_ids") or []:
|
||||
chunk = by_id.get(chunk_id)
|
||||
if chunk is None:
|
||||
continue
|
||||
passage = _render_passage(chunk, body, doc_id)
|
||||
if passage:
|
||||
rendered.append(passage)
|
||||
return "".join(rendered)
|
||||
|
||||
|
||||
def _fallback_snippet(doc: dict[str, Any]) -> str:
|
||||
"""Top-of-document preview, used only when no matched chunk is available."""
|
||||
content = (doc.get("content") or "").strip()
|
||||
if not content:
|
||||
return "\n (no preview available; read the document for details)"
|
||||
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
|
||||
if len(content) > _PER_DOC_SNIPPET_CHARS:
|
||||
snippet += " ..."
|
||||
return "\n " + snippet.replace("\n", "\n ")
|
||||
return paths
|
||||
|
||||
|
||||
def _format_hits(
|
||||
results: list[dict[str, Any]],
|
||||
*,
|
||||
paths: dict[int, str],
|
||||
bodies: dict[int, str],
|
||||
query: str,
|
||||
) -> str:
|
||||
"""Render search hits as a compact, model-readable block."""
|
||||
|
|
@ -199,15 +124,21 @@ def _format_hits(
|
|||
score = doc.get("score")
|
||||
score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a"
|
||||
path = paths.get(doc_id) if isinstance(doc_id, int) else None
|
||||
body = bodies.get(doc_id) if isinstance(doc_id, int) else None
|
||||
|
||||
id_str = f"id={doc_id}, " if isinstance(doc_id, int) else ""
|
||||
header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + (
|
||||
header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
|
||||
f"\n path: {path}" if path else ""
|
||||
)
|
||||
|
||||
passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None)
|
||||
entry = header + (passages or _fallback_snippet(doc))
|
||||
content = (doc.get("content") or "").strip()
|
||||
if content:
|
||||
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
|
||||
if len(content) > _PER_DOC_SNIPPET_CHARS:
|
||||
snippet += " ..."
|
||||
body = "\n " + snippet.replace("\n", "\n ")
|
||||
else:
|
||||
body = "\n (no preview available; read the document for details)"
|
||||
|
||||
entry = header + body
|
||||
if total + len(entry) > _MAX_TOTAL_CHARS:
|
||||
lines.append("\n<!-- additional matches truncated to fit context -->")
|
||||
break
|
||||
|
|
@ -215,9 +146,8 @@ def _format_hits(
|
|||
total += len(entry)
|
||||
|
||||
lines.append(
|
||||
"\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token "
|
||||
"verbatim. To quote more context or read the full document, delegate to "
|
||||
"the knowledge_base specialist with `task` using the path above."
|
||||
"\n\nTo read a full document, delegate to the knowledge_base specialist "
|
||||
"with `task`, referencing the path above."
|
||||
)
|
||||
lines.append("\n</knowledge_base_results>")
|
||||
return "".join(lines)
|
||||
|
|
@ -274,10 +204,8 @@ def create_search_knowledge_base_tool(
|
|||
top_k=clamped_top_k,
|
||||
)
|
||||
|
||||
paths, bodies = await _resolve_doc_context(results, search_space_id=_space_id)
|
||||
rendered = _format_hits(
|
||||
results, paths=paths, bodies=bodies, query=cleaned_query
|
||||
)
|
||||
paths = await _resolve_virtual_paths(results, search_space_id=_space_id)
|
||||
rendered = _format_hits(results, paths=paths, query=cleaned_query)
|
||||
matched = _matched_chunk_ids(results)
|
||||
|
||||
_perf_log.info(
|
||||
|
|
|
|||
|
|
@ -45,10 +45,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
|
||||
build_document_xml,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
|
||||
build_read_preamble,
|
||||
compute_matched_line_ranges,
|
||||
)
|
||||
from app.agents.chat.runtime.path_resolver import (
|
||||
DOCUMENTS_ROOT,
|
||||
build_path_index,
|
||||
|
|
@ -68,12 +64,6 @@ def _basename(path: str) -> str:
|
|||
return path.rsplit("/", 1)[-1]
|
||||
|
||||
|
||||
def _metadata_url(metadata: dict[str, Any]) -> str:
|
||||
return (
|
||||
metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
|
||||
)
|
||||
|
||||
|
||||
def _is_under(child: str, parent: str) -> bool:
|
||||
"""Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
|
||||
if parent == "/":
|
||||
|
|
@ -470,11 +460,8 @@ class KBPostgresBackend(BackendProtocol):
|
|||
loaded = await self._load_file_data(file_path)
|
||||
if loaded is None:
|
||||
return f"Error: File '{file_path}' not found"
|
||||
file_data, _, preamble = loaded
|
||||
body = format_read_response(file_data, offset, limit)
|
||||
if preamble and offset == 0:
|
||||
return preamble + body
|
||||
return body
|
||||
file_data, _ = loaded
|
||||
return format_read_response(file_data, offset, limit)
|
||||
|
||||
def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override]
|
||||
return asyncio.run(self.aread(file_path, offset, limit))
|
||||
|
|
@ -482,14 +469,12 @@ class KBPostgresBackend(BackendProtocol):
|
|||
async def _load_file_data(
|
||||
self,
|
||||
path: str,
|
||||
) -> tuple[dict[str, Any], int | None, str | None] | None:
|
||||
) -> tuple[dict[str, Any], int | None] | None:
|
||||
"""Lazy-load a virtual KB document into a deepagents ``FileData``.
|
||||
|
||||
Returns ``(file_data, doc_id, preamble)`` or ``None`` if the path
|
||||
doesn't map to any known document. ``doc_id`` is ``None`` for the
|
||||
synthetic anonymous document. ``preamble`` is the metadata header to
|
||||
show above a numbered ``source_markdown`` body (``None`` for the legacy
|
||||
chunk-reconstructed XML reads used when a document has no body).
|
||||
Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map
|
||||
to any known document. ``doc_id`` is ``None`` for the synthetic
|
||||
anonymous document so the caller doesn't track it as a DB-backed file.
|
||||
"""
|
||||
anon = self._kb_anon_doc()
|
||||
if anon and str(anon.get("path") or "") == path:
|
||||
|
|
@ -507,7 +492,7 @@ class KBPostgresBackend(BackendProtocol):
|
|||
}
|
||||
xml = build_document_xml(doc_payload, matched_chunk_ids=set())
|
||||
file_data = create_file_data(xml)
|
||||
return file_data, None, None
|
||||
return file_data, None
|
||||
|
||||
if not path.startswith(DOCUMENTS_ROOT):
|
||||
return None
|
||||
|
|
@ -520,58 +505,41 @@ class KBPostgresBackend(BackendProtocol):
|
|||
)
|
||||
if document is None:
|
||||
return None
|
||||
source_markdown = document.source_markdown or ""
|
||||
document_type = (
|
||||
document.document_type.value
|
||||
if getattr(document, "document_type", None) is not None
|
||||
else "UNKNOWN"
|
||||
)
|
||||
metadata = dict(document.document_metadata or {})
|
||||
chunk_rows = await session.execute(
|
||||
select(Chunk.id, Chunk.content, Chunk.start_char, Chunk.end_char)
|
||||
select(Chunk.id, Chunk.content)
|
||||
.where(Chunk.document_id == document.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunk_records = chunk_rows.all()
|
||||
document_id = document.id
|
||||
document_title = document.title
|
||||
chunks = [
|
||||
{"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
|
||||
]
|
||||
|
||||
matched = self._matched_chunk_ids(document_id)
|
||||
|
||||
# Canonical read: serve the verbatim body with cat -n line numbers that
|
||||
# line up with chunk char spans, so the agent cites real source lines.
|
||||
if source_markdown:
|
||||
ranges = compute_matched_line_ranges(
|
||||
source_markdown,
|
||||
[(r.id, r.start_char, r.end_char) for r in chunk_records],
|
||||
matched,
|
||||
)
|
||||
preamble = build_read_preamble(
|
||||
document_id=document_id,
|
||||
document_type=document_type,
|
||||
title=document_title,
|
||||
url=_metadata_url(metadata),
|
||||
matched_line_ranges=ranges,
|
||||
)
|
||||
return create_file_data(source_markdown), document_id, preamble
|
||||
|
||||
# Legacy fallback: no canonical body, reconstruct from chunks as XML.
|
||||
doc_payload = {
|
||||
"document_id": document_id,
|
||||
"chunks": [
|
||||
{"chunk_id": r.id, "content": r.content} for r in chunk_records
|
||||
],
|
||||
"matched_chunk_ids": list(matched),
|
||||
"document_id": document.id,
|
||||
"chunks": chunks,
|
||||
"matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
|
||||
"document": {
|
||||
"id": document_id,
|
||||
"title": document_title,
|
||||
"document_type": document_type,
|
||||
"metadata": metadata,
|
||||
"id": document.id,
|
||||
"title": document.title,
|
||||
"document_type": (
|
||||
document.document_type.value
|
||||
if getattr(document, "document_type", None) is not None
|
||||
else "UNKNOWN"
|
||||
),
|
||||
"metadata": dict(document.document_metadata or {}),
|
||||
},
|
||||
"source": document_type,
|
||||
"source": (
|
||||
document.document_type.value
|
||||
if getattr(document, "document_type", None) is not None
|
||||
else "UNKNOWN"
|
||||
),
|
||||
}
|
||||
xml = build_document_xml(doc_payload, matched_chunk_ids=matched)
|
||||
return create_file_data(xml), document_id, None
|
||||
xml = build_document_xml(
|
||||
doc_payload,
|
||||
matched_chunk_ids=self._matched_chunk_ids(document.id),
|
||||
)
|
||||
file_data = create_file_data(xml)
|
||||
return file_data, document.id
|
||||
|
||||
# ------------------------------------------------------------------ writes
|
||||
|
||||
|
|
@ -603,7 +571,7 @@ class KBPostgresBackend(BackendProtocol):
|
|||
loaded = await self._load_file_data(file_path)
|
||||
if loaded is None:
|
||||
return EditResult(error=f"Error: File '{file_path}' not found")
|
||||
file_data, _, _ = loaded
|
||||
file_data, _ = loaded
|
||||
|
||||
content = file_data_to_string(file_data)
|
||||
result = perform_string_replacement(
|
||||
|
|
|
|||
|
|
@ -1,73 +0,0 @@
|
|||
"""Read preamble for canonical (numbered ``source_markdown``) KB reads.
|
||||
|
||||
The KB read tool numbers the body lines ``cat -n`` style, so serving the raw
|
||||
``source_markdown`` makes those line numbers line up exactly with the chunk
|
||||
char spans and the editor highlight. This module renders the small header the
|
||||
agent sees above that body: document identity plus the matched line ranges to
|
||||
seek to, and a concrete reminder of the line-citation token shape.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterable
|
||||
|
||||
from app.utils.text_spans import char_span_to_line_range
|
||||
|
||||
|
||||
def _format_range(start: int, end: int) -> str:
|
||||
return f"{start}" if start == end else f"{start}-{end}"
|
||||
|
||||
|
||||
def compute_matched_line_ranges(
|
||||
source_markdown: str,
|
||||
chunks: Iterable[tuple[int, int | None, int | None]],
|
||||
matched_chunk_ids: set[int],
|
||||
) -> list[tuple[int, int]]:
|
||||
"""Map matched chunks to sorted, de-duplicated 1-based line ranges.
|
||||
|
||||
``chunks`` are ``(chunk_id, start_char, end_char)`` triples. Chunks without
|
||||
spans (legacy rows) are skipped — they have no resolvable location.
|
||||
"""
|
||||
ranges: set[tuple[int, int]] = set()
|
||||
for chunk_id, start_char, end_char in chunks:
|
||||
if chunk_id not in matched_chunk_ids:
|
||||
continue
|
||||
if start_char is None or end_char is None:
|
||||
continue
|
||||
ranges.add(char_span_to_line_range(source_markdown, start_char, end_char))
|
||||
return sorted(ranges)
|
||||
|
||||
|
||||
def build_read_preamble(
|
||||
*,
|
||||
document_id: int,
|
||||
document_type: str,
|
||||
title: str,
|
||||
url: str,
|
||||
matched_line_ranges: list[tuple[int, int]],
|
||||
) -> str:
|
||||
"""Render the metadata header shown above a numbered ``source_markdown`` body.
|
||||
|
||||
``matched_line_ranges`` are 1-based inclusive line ranges (already derived
|
||||
from chunk char spans) to point the agent at the relevant lines.
|
||||
"""
|
||||
lines = [
|
||||
"<document_metadata>",
|
||||
f" <document_id>{document_id}</document_id>",
|
||||
f" <document_type>{document_type}</document_type>",
|
||||
f" <title><![CDATA[{title}]]></title>",
|
||||
f" <url><![CDATA[{url}]]></url>",
|
||||
]
|
||||
if matched_line_ranges:
|
||||
ranges = ", ".join(_format_range(s, e) for s, e in matched_line_ranges)
|
||||
lines.append(f" <matched_lines>{ranges}</matched_lines>")
|
||||
lines.append("</document_metadata>")
|
||||
lines.append(
|
||||
f"Cite lines from this document as [citation:d{document_id}#L<start>-<end>] "
|
||||
"using the line numbers shown below."
|
||||
)
|
||||
lines.append("")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
__all__ = ["build_read_preamble", "compute_matched_line_ranges"]
|
||||
|
|
@ -73,7 +73,7 @@ def create_edit_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
|
|||
loaded = await backend._load_file_data(validated)
|
||||
if loaded is None:
|
||||
return f"Error: File '{validated}' not found"
|
||||
_, doc_id_to_attach, _ = loaded
|
||||
_, doc_id_to_attach = loaded
|
||||
|
||||
res: EditResult = await backend.aedit(
|
||||
validated, old_string, new_string, replace_all=replace_all
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ async def cloud_move_file(
|
|||
loaded = await backend._load_file_data(source)
|
||||
if loaded is None:
|
||||
return f"Error: source '{source}' not found."
|
||||
source_file_data, loaded_doc_id, _ = loaded
|
||||
source_file_data, loaded_doc_id = loaded
|
||||
if source_doc_id is None:
|
||||
source_doc_id = loaded_doc_id
|
||||
|
||||
|
|
|
|||
|
|
@ -58,10 +58,8 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
|
|||
loaded = await backend._load_file_data(validated)
|
||||
if loaded is None:
|
||||
return f"Error: File '{validated}' not found"
|
||||
file_data, doc_id, preamble = loaded
|
||||
file_data, doc_id = loaded
|
||||
rendered = format_read_response(file_data, offset, limit)
|
||||
if preamble and offset == 0:
|
||||
rendered = preamble + rendered
|
||||
update: dict[str, Any] = {
|
||||
"files": {validated: file_data},
|
||||
"messages": [
|
||||
|
|
|
|||
|
|
@ -74,7 +74,7 @@ async def cloud_rm(
|
|||
loaded = await backend._load_file_data(validated)
|
||||
if loaded is None:
|
||||
return f"Error: file '{validated}' not found."
|
||||
_, resolved_doc_id, _ = loaded
|
||||
_, resolved_doc_id = loaded
|
||||
|
||||
files_update: dict[str, Any] = {validated: None}
|
||||
update: dict[str, Any] = {
|
||||
|
|
|
|||
|
|
@ -240,24 +240,23 @@ def create_generate_image_tool(
|
|||
error="No images were generated",
|
||||
)
|
||||
|
||||
# Update all image URLs in response_dict to be absolute (for the serving endpoint)
|
||||
from urllib.parse import urlparse
|
||||
for image in images:
|
||||
if image.get("url"):
|
||||
raw_url: str = image["url"]
|
||||
if raw_url.startswith("/") and provider_base_url:
|
||||
parsed = urlparse(provider_base_url)
|
||||
origin = f"{parsed.scheme}://{parsed.netloc}"
|
||||
image["url"] = f"{origin}{raw_url}" # Update the stored dict!
|
||||
|
||||
first_image = images[0]
|
||||
revised_prompt = first_image.get("revised_prompt", prompt)
|
||||
|
||||
# b64_json (e.g. gpt-image-1) is served via our backend endpoint so
|
||||
# megabytes of base64 don't bloat the LLM context.
|
||||
# Some OpenAI-compatible backends (e.g. Xinference) return a relative
|
||||
# URL like /files/image.png. Browsers can't resolve these, so we
|
||||
# prepend the provider's base origin when the URL starts with "/".
|
||||
if first_image.get("url"):
|
||||
raw_url: str = first_image["url"]
|
||||
if raw_url.startswith("/") and provider_base_url:
|
||||
from urllib.parse import urlparse
|
||||
|
||||
parsed = urlparse(provider_base_url)
|
||||
origin = f"{parsed.scheme}://{parsed.netloc}"
|
||||
image_url = f"{origin}{raw_url}"
|
||||
else:
|
||||
image_url = raw_url
|
||||
image_url = first_image["url"]
|
||||
elif first_image.get("b64_json"):
|
||||
backend_url = config.BACKEND_URL or "http://localhost:8000"
|
||||
image_url = (
|
||||
|
|
|
|||
|
|
@ -35,24 +35,42 @@ Map outcomes to your `status`:
|
|||
|
||||
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
|
||||
|
||||
## Citations in your prose
|
||||
## Chunk citations in your prose
|
||||
|
||||
`read_file` on a KB document under `/documents/` serves it in one of two forms. Cite from whichever you actually see, attach the marker to the sentence in `action_summary` or `evidence.content_excerpt` stating that fact, and list every marker you emit in `evidence.citations`. The caller relays these markers to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation.
|
||||
When `read_file` returns a KB-indexed document under `/documents/`, the response includes `<chunk id='…'>` blocks. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific chunk, append `[citation:<chunk_id>]` to the sentence stating that fact, using the **exact** id from the `<chunk id='…'>` tag. The caller relays these markers to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
|
||||
|
||||
**Numbered body (default).** A `<document_metadata>` header gives the `<document_id>` and an optional `<matched_lines>` pointer, then the body is shown with line numbers. Cite the lines a fact came from as `[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
|
||||
### Where chunk ids live in `read_file` output
|
||||
|
||||
**Legacy chunk blocks (older docs without a stored body).** The response is XML with `<chunk id='N'>` blocks. Cite the chunk a fact came from as `[citation:N]`, using the **exact** id from a `<chunk id='…'>` tag.
|
||||
A KB document's XML has three numeric attributes — only **one** is a citation source:
|
||||
|
||||
```
|
||||
<document>
|
||||
<document_metadata>
|
||||
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
|
||||
...
|
||||
</document_metadata>
|
||||
<chunk_index>
|
||||
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
|
||||
<entry chunk_id="129" lines="23-30" matched="true"/>
|
||||
</chunk_index>
|
||||
<document_content>
|
||||
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
|
||||
<chunk id='129'><![CDATA[…]]></chunk>
|
||||
</document_content>
|
||||
</document>
|
||||
```
|
||||
|
||||
### Rules
|
||||
|
||||
- Cite only from a passage you actually quoted or paraphrased this turn. Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory.
|
||||
- Never cite `<document_id>` on its own — it identifies the document, not a passage. In the numbered form it is only the `d<document_id>` prefix of a line citation.
|
||||
- Never invent, normalise, shorten, shift, or guess at ids or line numbers. If unsure, omit rather than pick.
|
||||
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
|
||||
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
|
||||
- Never cite `<document_id>` — that's the parent doc, not a chunk.
|
||||
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
|
||||
- Prefer **fewer accurate citations** over many speculative ones.
|
||||
- Multiple passages supporting the same point → comma-separated and copied individually: `[citation:d42#L14-22], [citation:d42#L31-39]`.
|
||||
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
|
||||
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
|
||||
- Tool results with no body passage (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry nothing to cite.
|
||||
- Populate `evidence.citations` with **only** the markers you actually emitted — same set, same characters.
|
||||
- Tool results without `<chunk id='…'>` (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no chunk id and need none.
|
||||
- Populate `evidence.chunk_ids` with **only** ids you actually emitted in `[citation:…]` markers — same set, same digits.
|
||||
|
||||
## Examples
|
||||
|
||||
|
|
@ -71,7 +89,7 @@ You construct the structured `evidence` fields from your own knowledge of what y
|
|||
"path": "/documents/meetings/2026-05-11-meeting.md",
|
||||
"matched_candidates": null,
|
||||
"content_excerpt": null,
|
||||
"citations": null
|
||||
"chunk_ids": null
|
||||
},
|
||||
"next_step": null,
|
||||
"missing_fields": null,
|
||||
|
|
@ -103,7 +121,7 @@ You construct the structured `evidence` fields from your own knowledge of what y
|
|||
{ "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
|
||||
],
|
||||
"content_excerpt": null,
|
||||
"citations": null
|
||||
"chunk_ids": null
|
||||
},
|
||||
"next_step": "Ask the user which design doc to update.",
|
||||
"missing_fields": ["path"],
|
||||
|
|
@ -124,7 +142,7 @@ Return **only** one JSON object (no markdown or prose outside it):
|
|||
"path": string | null,
|
||||
"matched_candidates": [ { "id": string, "label": string } ] | null,
|
||||
"content_excerpt": string | null,
|
||||
"citations": string[] | null
|
||||
"chunk_ids": string[] | null
|
||||
},
|
||||
"next_step": string | null,
|
||||
"missing_fields": string[] | null,
|
||||
|
|
|
|||
|
|
@ -33,11 +33,11 @@ Map outcomes to your `status`:
|
|||
- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
|
||||
- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
|
||||
|
||||
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`citations` is always `null` in desktop mode — see "Citations in your prose" below.)
|
||||
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`chunk_ids` is always `null` in desktop mode — see "Chunk citations in your prose" below.)
|
||||
|
||||
## Citations in your prose
|
||||
## Chunk citations in your prose
|
||||
|
||||
In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry chunk ids or numbered KB bodies. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.citations` `null` — the absolute path is the only reference for local-file work.
|
||||
In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry `<chunk id='…'>` tags. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.chunk_ids` `null` — the absolute path is the only reference for local-file work.
|
||||
|
||||
## Examples
|
||||
|
||||
|
|
@ -56,7 +56,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
|
|||
"path": "/notes/meetings/2026-05-11-meeting.md",
|
||||
"matched_candidates": null,
|
||||
"content_excerpt": null,
|
||||
"citations": null
|
||||
"chunk_ids": null
|
||||
},
|
||||
"next_step": null,
|
||||
"missing_fields": null,
|
||||
|
|
@ -88,7 +88,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
|
|||
{ "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
|
||||
],
|
||||
"content_excerpt": null,
|
||||
"citations": null
|
||||
"chunk_ids": null
|
||||
},
|
||||
"next_step": "Ask the user which design doc to update.",
|
||||
"missing_fields": ["path"],
|
||||
|
|
@ -109,7 +109,7 @@ Return **only** one JSON object (no markdown or prose outside it):
|
|||
"path": string | null,
|
||||
"matched_candidates": [ { "id": string, "label": string } ] | null,
|
||||
"content_excerpt": string | null,
|
||||
"citations": string[] | null
|
||||
"chunk_ids": string[] | null
|
||||
},
|
||||
"next_step": string | null,
|
||||
"missing_fields": string[] | null,
|
||||
|
|
|
|||
|
|
@ -28,21 +28,41 @@ Reply in plain prose:
|
|||
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
|
||||
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
|
||||
|
||||
## Citations
|
||||
## Chunk citations
|
||||
|
||||
`read_file` on a KB document under `/documents/` serves it in one of two forms; cite a claim from whichever you actually see, alongside the path. The caller passes these markers through to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation.
|
||||
When the evidence for a claim came from a `read_file` response that included `<chunk id='…'>` blocks (i.e. a KB-indexed document under `/documents/`), append `[citation:<chunk_id>]` to the sentence stating that claim. The caller passes these markers through to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
|
||||
|
||||
- **Numbered body (default).** A `<document_metadata>` header gives the `<document_id>`, and the body is shown with line numbers. Cite the lines a claim came from as `[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
|
||||
- **Legacy chunk blocks (older docs).** XML with `<chunk id='N'>` blocks. Cite the chunk a claim came from as `[citation:N]`.
|
||||
### Where chunk ids live in `read_file` output
|
||||
|
||||
A KB document's XML carries numeric ids in three places — only **one** is a citation source:
|
||||
|
||||
```
|
||||
<document>
|
||||
<document_metadata>
|
||||
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
|
||||
...
|
||||
</document_metadata>
|
||||
<chunk_index>
|
||||
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
|
||||
<entry chunk_id="129" lines="23-30" matched="true"/>
|
||||
</chunk_index>
|
||||
<document_content>
|
||||
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
|
||||
<chunk id='129'><![CDATA[…]]></chunk>
|
||||
</document_content>
|
||||
</document>
|
||||
```
|
||||
|
||||
### Rules
|
||||
|
||||
- Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory. If you cannot see the id/lines for a claim, omit the citation.
|
||||
- Never cite `<document_id>` on its own — in the numbered form it is only the `d<document_id>` prefix of a line citation.
|
||||
- Never invent, normalise, shorten, shift, or guess. Prefer **fewer accurate citations** over many speculative ones.
|
||||
- Multiple passages supporting the same point → comma-separated and copied individually.
|
||||
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
|
||||
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
|
||||
- Never cite `<document_id>` — that's the parent doc, not a chunk.
|
||||
- Never invent, normalise, or shorten an id, and never guess one from adjacent ids. If unsure between two candidates, omit the citation rather than pick one.
|
||||
- Prefer **fewer accurate citations** over many speculative ones. One correct `[citation:128]` is more useful than a string of wrong ids.
|
||||
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
|
||||
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
|
||||
- Listings (`ls` / `glob` / `grep`), error strings, and files without either form carry nothing to cite.
|
||||
- The absolute path under `/documents/` is always required; citations are additive, they do not replace the path reference.
|
||||
- If a claim came from a tool result that did **not** carry a chunk id (`ls`, `glob`, `grep` listings, error strings, or files without `<chunk id='…'>`), skip the citation.
|
||||
- The absolute path under `/documents/` is always required; chunk citations are additive, they do not replace the path reference.
|
||||
|
||||
Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:d42#L3-9].`
|
||||
Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:128], [citation:129].`
|
||||
|
|
|
|||
|
|
@ -957,9 +957,8 @@ class Config:
|
|||
os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true"
|
||||
)
|
||||
# Bump to invalidate every cached embedding set after a chunker change.
|
||||
# v2: chunks became exact (raw) slices of source_markdown for citation spans.
|
||||
EMBEDDING_CACHE_CHUNKER_VERSION = int(
|
||||
os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "2")
|
||||
os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1")
|
||||
)
|
||||
EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90"))
|
||||
EMBEDDING_CACHE_MAX_TOTAL_MB = int(
|
||||
|
|
|
|||
|
|
@ -1470,11 +1470,6 @@ class Chunk(BaseModel, TimestampMixin):
|
|||
# ordering reads are document-scoped (covered by ix_chunks_document_id) and
|
||||
# building a position index on the large chunks table is not worth it.
|
||||
position = Column(Integer, nullable=False, server_default="0")
|
||||
# Half-open char span into the document's source_markdown the chunk was cut
|
||||
# from. Nullable: historical rows predate spans and populate on reindex.
|
||||
# Invariant for span-aware rows: source_markdown[start_char:end_char] == content.
|
||||
start_char = Column(Integer, nullable=True)
|
||||
end_char = Column(Integer, nullable=True)
|
||||
|
||||
document_id = Column(
|
||||
Integer,
|
||||
|
|
|
|||
|
|
@ -18,26 +18,23 @@ from app.indexing_pipeline.cache.eligibility import is_embedding_cacheable
|
|||
from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet
|
||||
from app.indexing_pipeline.cache.service import EmbeddingCacheService
|
||||
from app.indexing_pipeline.cache.settings import load_embedding_cache_settings
|
||||
from app.indexing_pipeline.document_chunker import ChunkSlice, chunk_markdown_with_spans
|
||||
from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
|
||||
from app.indexing_pipeline.document_embedder import embed_texts
|
||||
from app.observability import metrics
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SliceEmbedding = tuple[ChunkSlice, np.ndarray]
|
||||
ChunkPair = tuple[str, np.ndarray]
|
||||
|
||||
|
||||
async def build_chunk_embeddings(
|
||||
markdown: str, *, use_code_chunker: bool
|
||||
) -> tuple[np.ndarray, list[SliceEmbedding]]:
|
||||
"""Return the document-level vector and ordered ``(ChunkSlice, vector)`` pairs.
|
||||
) -> tuple[np.ndarray, list[ChunkPair]]:
|
||||
"""Return the document-level vector and ordered ``(chunk_text, vector)`` pairs.
|
||||
|
||||
Slices are always recomputed (cheap) so their char spans are exact; only the
|
||||
embeddings are cached, reused when the same markdown was embedded with the
|
||||
current model and chunker.
|
||||
Drop-in for the inline chunk+embed step; reuses prior output when the same
|
||||
markdown has already been embedded with the current model and chunker.
|
||||
"""
|
||||
slices = await chunk_slices(markdown, use_code_chunker=use_code_chunker)
|
||||
|
||||
settings = load_embedding_cache_settings()
|
||||
chunker_kind = "code" if use_code_chunker else "hybrid"
|
||||
embedding_dim = getattr(config.embedding_model_instance, "dimension", None)
|
||||
|
|
@ -48,7 +45,7 @@ async def build_chunk_embeddings(
|
|||
embedding_dim=embedding_dim,
|
||||
)
|
||||
if not cacheable:
|
||||
return await _compute(markdown, slices)
|
||||
return await _compute(markdown, use_code_chunker=use_code_chunker)
|
||||
|
||||
key = EmbeddingKey(
|
||||
markdown_sha256=_hash_text(markdown),
|
||||
|
|
@ -59,30 +56,31 @@ async def build_chunk_embeddings(
|
|||
)
|
||||
|
||||
cached = await _recall(key)
|
||||
if cached is not None and _aligns(cached, slices):
|
||||
if cached is not None:
|
||||
metrics.record_embedding_cache_lookup(
|
||||
embedding_model=key.embedding_model,
|
||||
chunker_kind=chunker_kind,
|
||||
outcome="hit",
|
||||
)
|
||||
logger.debug("Embedding cache hit for %s", key.markdown_sha256)
|
||||
return cached.summary_embedding, list(
|
||||
zip(slices, (c.embedding for c in cached.chunks), strict=True)
|
||||
)
|
||||
return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks]
|
||||
|
||||
metrics.record_embedding_cache_lookup(
|
||||
embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss"
|
||||
)
|
||||
summary_embedding, pairs = await _compute(markdown, slices)
|
||||
await _remember(key, summary_embedding, pairs)
|
||||
return summary_embedding, pairs
|
||||
|
||||
|
||||
async def chunk_slices(markdown: str, *, use_code_chunker: bool) -> list[ChunkSlice]:
|
||||
"""Chunk markdown into ordered, char-addressed slices off the event loop."""
|
||||
return await asyncio.to_thread(
|
||||
chunk_markdown_with_spans, markdown, use_code_chunker
|
||||
summary_embedding, chunk_pairs = await _compute(
|
||||
markdown, use_code_chunker=use_code_chunker
|
||||
)
|
||||
await _remember(key, summary_embedding, chunk_pairs)
|
||||
return summary_embedding, chunk_pairs
|
||||
|
||||
|
||||
async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]:
|
||||
"""Chunk markdown into ordered texts with the pipeline's chunker selection."""
|
||||
if use_code_chunker:
|
||||
return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True)
|
||||
# Table-aware hybrid chunker keeps Markdown tables intact (issue #1334).
|
||||
return await asyncio.to_thread(chunk_text_hybrid, markdown)
|
||||
|
||||
|
||||
async def embed_batch(texts: list[str]) -> list[np.ndarray]:
|
||||
|
|
@ -90,19 +88,13 @@ async def embed_batch(texts: list[str]) -> list[np.ndarray]:
|
|||
return await asyncio.to_thread(embed_texts, texts)
|
||||
|
||||
|
||||
def _aligns(cached: EmbeddingSet, slices: list[ChunkSlice]) -> bool:
|
||||
"""A hit is only usable if its texts still match the current chunking."""
|
||||
return len(cached.chunks) == len(slices) and all(
|
||||
c.text == s.text for c, s in zip(cached.chunks, slices, strict=True)
|
||||
)
|
||||
|
||||
|
||||
async def _compute(
|
||||
markdown: str, slices: list[ChunkSlice]
|
||||
) -> tuple[np.ndarray, list[SliceEmbedding]]:
|
||||
embeddings = await embed_batch([markdown, *(s.text for s in slices)])
|
||||
markdown: str, *, use_code_chunker: bool
|
||||
) -> tuple[np.ndarray, list[ChunkPair]]:
|
||||
chunk_texts = await chunk_markdown(markdown, use_code_chunker=use_code_chunker)
|
||||
embeddings = await embed_batch([markdown, *chunk_texts])
|
||||
summary_embedding, *chunk_embeddings = embeddings
|
||||
return summary_embedding, list(zip(slices, chunk_embeddings, strict=True))
|
||||
return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False))
|
||||
|
||||
|
||||
async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
|
||||
|
|
@ -118,14 +110,14 @@ async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
|
|||
|
||||
|
||||
async def _remember(
|
||||
key: EmbeddingKey, summary_embedding: np.ndarray, pairs: list[SliceEmbedding]
|
||||
key: EmbeddingKey, summary_embedding: np.ndarray, chunk_pairs: list[ChunkPair]
|
||||
) -> None:
|
||||
try:
|
||||
from app.tasks.celery_tasks import get_celery_session_maker
|
||||
|
||||
embedding_set = EmbeddingSet(
|
||||
summary_embedding=summary_embedding,
|
||||
chunks=[CachedChunk(text=s.text, embedding=vec) for s, vec in pairs],
|
||||
chunks=[CachedChunk(text=text, embedding=vec) for text, vec in chunk_pairs],
|
||||
)
|
||||
async with get_celery_session_maker()() as session:
|
||||
await EmbeddingCacheService(session).remember(key, embedding_set)
|
||||
|
|
|
|||
|
|
@ -19,9 +19,6 @@ class ExistingChunk:
|
|||
id: int
|
||||
content: str
|
||||
position: int
|
||||
# Stored char span; None for legacy rows indexed before spans existed.
|
||||
start_char: int | None = None
|
||||
end_char: int | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
|
|
|
|||
|
|
@ -1,30 +1,16 @@
|
|||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
from app.config import config
|
||||
|
||||
# Regex that matches a Markdown table block (header + separator + one or more rows)
|
||||
# A table block starts with a | at the beginning of a line and ends when a
|
||||
# non-table line (or end of string) is encountered. The final row may end at EOF
|
||||
# without a trailing newline, so the whole table stays one slice.
|
||||
# non-table line (or end of string) is encountered.
|
||||
_TABLE_BLOCK_RE = re.compile(
|
||||
r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*(?:\n|$))+)",
|
||||
r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)",
|
||||
re.MULTILINE,
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class ChunkSlice:
|
||||
"""A chunk paired with its half-open char span into the source markdown.
|
||||
|
||||
Invariant: ``markdown[start_char:end_char] == text``.
|
||||
"""
|
||||
|
||||
text: str
|
||||
start_char: int
|
||||
end_char: int
|
||||
|
||||
|
||||
def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
|
||||
"""Chunk a text string using the configured chunker and return the chunk texts."""
|
||||
chunker = (
|
||||
|
|
@ -33,63 +19,41 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
|
|||
return [c.text for c in chunker.chunk(text)]
|
||||
|
||||
|
||||
def chunk_markdown_with_spans(
|
||||
text: str, use_code_chunker: bool = False
|
||||
) -> list[ChunkSlice]:
|
||||
"""Chunk markdown into a lossless, contiguous partition of char-addressed slices.
|
||||
def chunk_text_hybrid(text: str) -> list[str]:
|
||||
"""Table-aware chunker that prevents Markdown tables from being split mid-row.
|
||||
|
||||
Tables stay whole (issue #1334) and every slice is an exact substring of
|
||||
``text``, so ``"".join(s.text) == text`` and ``text[s:e] == s.text``. This is
|
||||
the offset record citations resolve against.
|
||||
Algorithm:
|
||||
1. Scan the document for Markdown table blocks.
|
||||
2. Each table block is emitted as a single, unmodified chunk so that its
|
||||
header, separator row, and data rows always stay together.
|
||||
3. The non-table prose segments between (and around) tables are passed through
|
||||
the normal ``chunk_text`` chunker and their sub-chunks are interleaved in
|
||||
document order.
|
||||
|
||||
This ensures that table data is never sliced in the middle by the token-based
|
||||
chunker, which would otherwise produce garbled rows that are useless for RAG.
|
||||
|
||||
Fixes #1334.
|
||||
"""
|
||||
if not text:
|
||||
return []
|
||||
|
||||
slices: list[ChunkSlice] = []
|
||||
chunks: list[str] = []
|
||||
cursor = 0
|
||||
|
||||
for match in _TABLE_BLOCK_RE.finditer(text):
|
||||
if match.start() > cursor:
|
||||
slices.extend(
|
||||
_segment_slices(text, cursor, match.start(), use_code_chunker)
|
||||
)
|
||||
slices.append(ChunkSlice(match.group(0), match.start(), match.end()))
|
||||
# Prose before this table
|
||||
prose = text[cursor : match.start()].strip()
|
||||
if prose:
|
||||
chunks.extend(chunk_text(prose))
|
||||
|
||||
# The table itself is kept as one indivisible chunk
|
||||
table_block = match.group(0).strip()
|
||||
if table_block:
|
||||
chunks.append(table_block)
|
||||
|
||||
cursor = match.end()
|
||||
|
||||
if len(text) > cursor:
|
||||
slices.extend(_segment_slices(text, cursor, len(text), use_code_chunker))
|
||||
# Remaining prose after the last table (or entire text if no tables)
|
||||
trailing = text[cursor:].strip()
|
||||
if trailing:
|
||||
chunks.extend(chunk_text(trailing))
|
||||
|
||||
return slices
|
||||
|
||||
|
||||
def _segment_slices(
|
||||
text: str, start: int, end: int, use_code_chunker: bool
|
||||
) -> list[ChunkSlice]:
|
||||
"""Sub-chunk one non-table segment into contiguous, char-addressed slices."""
|
||||
chunker = (
|
||||
config.code_chunker_instance if use_code_chunker else config.chunker_instance
|
||||
)
|
||||
segment = text[start:end]
|
||||
chunks = chunker.chunk(segment)
|
||||
|
||||
slices: list[ChunkSlice] = []
|
||||
local = 0
|
||||
for chunk in chunks:
|
||||
# Use the chunker's end offset only as a cut point, then re-slice the
|
||||
# segment ourselves so the result is an exact, gap-free substring.
|
||||
local_end = min(max(chunk.end_index, local), len(segment))
|
||||
if local_end <= local:
|
||||
continue
|
||||
slices.append(
|
||||
ChunkSlice(segment[local:local_end], start + local, start + local_end)
|
||||
)
|
||||
local = local_end
|
||||
|
||||
if local < len(segment):
|
||||
if slices:
|
||||
last = slices[-1]
|
||||
slices[-1] = ChunkSlice(text[last.start_char : end], last.start_char, end)
|
||||
else:
|
||||
slices.append(ChunkSlice(segment[local:], start + local, end))
|
||||
|
||||
return slices
|
||||
return chunks
|
||||
|
|
|
|||
|
|
@ -20,10 +20,9 @@ from app.db import (
|
|||
DocumentType,
|
||||
)
|
||||
from app.indexing_pipeline.cache import build_chunk_embeddings
|
||||
from app.indexing_pipeline.cache.cached_indexing import chunk_slices, embed_batch
|
||||
from app.indexing_pipeline.chunk_reconciler import ChunkPlan, ExistingChunk, reconcile
|
||||
from app.indexing_pipeline.cache.cached_indexing import chunk_markdown, embed_batch
|
||||
from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile
|
||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||
from app.indexing_pipeline.document_chunker import ChunkSlice
|
||||
from app.indexing_pipeline.document_hashing import (
|
||||
compute_content_hash,
|
||||
compute_identifier_hash,
|
||||
|
|
@ -490,22 +489,12 @@ class IndexingPipelineService:
|
|||
|
||||
async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]:
|
||||
result = await self.session.execute(
|
||||
select(
|
||||
Chunk.id,
|
||||
Chunk.content,
|
||||
Chunk.position,
|
||||
Chunk.start_char,
|
||||
Chunk.end_char,
|
||||
).where(Chunk.document_id == document_id)
|
||||
select(Chunk.id, Chunk.content, Chunk.position).where(
|
||||
Chunk.document_id == document_id
|
||||
)
|
||||
)
|
||||
return [
|
||||
ExistingChunk(
|
||||
id=row.id,
|
||||
content=row.content,
|
||||
position=row.position,
|
||||
start_char=row.start_char,
|
||||
end_char=row.end_char,
|
||||
)
|
||||
ExistingChunk(id=row.id, content=row.content, position=row.position)
|
||||
for row in result
|
||||
]
|
||||
|
||||
|
|
@ -516,21 +505,15 @@ class IndexingPipelineService:
|
|||
delete(Chunk).where(Chunk.document_id == document.id)
|
||||
)
|
||||
|
||||
summary_embedding, slice_pairs = await build_chunk_embeddings(
|
||||
summary_embedding, chunk_pairs = await build_chunk_embeddings(
|
||||
content,
|
||||
use_code_chunker=connector_doc.should_use_code_chunker,
|
||||
)
|
||||
|
||||
document.embedding = summary_embedding
|
||||
return [
|
||||
Chunk(
|
||||
content=chunk_slice.text,
|
||||
embedding=emb,
|
||||
position=i,
|
||||
start_char=chunk_slice.start_char,
|
||||
end_char=chunk_slice.end_char,
|
||||
)
|
||||
for i, (chunk_slice, emb) in enumerate(slice_pairs)
|
||||
Chunk(content=text, embedding=emb, position=i)
|
||||
for i, (text, emb) in enumerate(chunk_pairs)
|
||||
]
|
||||
|
||||
async def _reindex_incrementally(
|
||||
|
|
@ -542,39 +525,35 @@ class IndexingPipelineService:
|
|||
) -> int:
|
||||
"""Edit path: keep rows whose text survived, embed only new texts.
|
||||
|
||||
Unchanged rows keep their embedding and their HNSW/GIN index entries. An
|
||||
edit can shift a kept chunk's char span without changing its text, so
|
||||
every kept row's position and span are refreshed whenever they drift.
|
||||
Unchanged rows keep their embedding and their HNSW/GIN index entries;
|
||||
moved rows get a position-only UPDATE, which touches neither index.
|
||||
"""
|
||||
slices = await chunk_slices(
|
||||
new_texts = await chunk_markdown(
|
||||
content, use_code_chunker=connector_doc.should_use_code_chunker
|
||||
)
|
||||
new_texts = [s.text for s in slices]
|
||||
plan = reconcile(existing, new_texts)
|
||||
|
||||
# One batch: the document-level summary vector plus the missing chunks.
|
||||
embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]])
|
||||
summary_embedding, *new_embeddings = embeddings
|
||||
|
||||
if plan.reused:
|
||||
await self.session.execute(
|
||||
update(Chunk),
|
||||
[{"id": cid, "position": pos} for cid, pos in plan.reused],
|
||||
)
|
||||
if plan.to_delete:
|
||||
await self.session.execute(
|
||||
delete(Chunk).where(Chunk.id.in_(plan.to_delete))
|
||||
)
|
||||
|
||||
span_updates = self._kept_row_span_updates(existing, slices, plan)
|
||||
if span_updates:
|
||||
await self.session.execute(update(Chunk), span_updates)
|
||||
|
||||
self.session.add_all(
|
||||
Chunk(
|
||||
content=slices[pos].text,
|
||||
content=text,
|
||||
embedding=emb,
|
||||
position=pos,
|
||||
start_char=slices[pos].start_char,
|
||||
end_char=slices[pos].end_char,
|
||||
document_id=document.id,
|
||||
)
|
||||
for (pos, _text), emb in zip(plan.to_embed, new_embeddings, strict=True)
|
||||
for (pos, text), emb in zip(plan.to_embed, new_embeddings, strict=True)
|
||||
)
|
||||
document.embedding = summary_embedding
|
||||
|
||||
|
|
@ -585,36 +564,6 @@ class IndexingPipelineService:
|
|||
)
|
||||
return len(new_texts)
|
||||
|
||||
@staticmethod
|
||||
def _kept_row_span_updates(
|
||||
existing: list[ExistingChunk],
|
||||
slices: list[ChunkSlice],
|
||||
plan: ChunkPlan,
|
||||
) -> list[dict]:
|
||||
"""Position/span writes for kept rows, emitted only where a value drifts."""
|
||||
deleted = set(plan.to_delete)
|
||||
moved = dict(plan.reused)
|
||||
updates: list[dict] = []
|
||||
for chunk in existing:
|
||||
if chunk.id in deleted:
|
||||
continue
|
||||
new_position = moved.get(chunk.id, chunk.position)
|
||||
target = slices[new_position]
|
||||
if (
|
||||
chunk.position != new_position
|
||||
or chunk.start_char != target.start_char
|
||||
or chunk.end_char != target.end_char
|
||||
):
|
||||
updates.append(
|
||||
{
|
||||
"id": chunk.id,
|
||||
"position": new_position,
|
||||
"start_char": target.start_char,
|
||||
"end_char": target.end_char,
|
||||
}
|
||||
)
|
||||
return updates
|
||||
|
||||
async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
|
||||
"""Fire-and-forget: enqueue incremental AI sort if the search space has it enabled."""
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -440,15 +440,8 @@ class ChucksHybridSearchRetriever:
|
|||
chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
|
||||
|
||||
# Select only the columns we need (skip Chunk.embedding ~12KB/row).
|
||||
# start_char/end_char carry the citation span; None for legacy rows.
|
||||
chunk_query = (
|
||||
select(
|
||||
Chunk.id,
|
||||
Chunk.content,
|
||||
Chunk.document_id,
|
||||
Chunk.start_char,
|
||||
Chunk.end_char,
|
||||
)
|
||||
select(Chunk.id, Chunk.content, Chunk.document_id)
|
||||
.join(numbered, Chunk.id == numbered.c.chunk_id)
|
||||
.where(chunk_filter)
|
||||
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
|
||||
|
|
@ -483,14 +476,7 @@ class ChucksHybridSearchRetriever:
|
|||
if doc_id not in doc_map:
|
||||
continue
|
||||
doc_entry = doc_map[doc_id]
|
||||
doc_entry["chunks"].append(
|
||||
{
|
||||
"chunk_id": row.id,
|
||||
"content": row.content,
|
||||
"start_char": row.start_char,
|
||||
"end_char": row.end_char,
|
||||
}
|
||||
)
|
||||
doc_entry["chunks"].append({"chunk_id": row.id, "content": row.content})
|
||||
if row.id in matched_chunk_ids:
|
||||
doc_entry["matched_chunk_ids"].append(row.id)
|
||||
|
||||
|
|
|
|||
|
|
@ -38,7 +38,6 @@ from app.schemas import (
|
|||
from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher
|
||||
from app.users import get_auth_context
|
||||
from app.utils.rbac import check_permission
|
||||
from app.utils.text_spans import char_span_to_line_range
|
||||
|
||||
try:
|
||||
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
|
||||
|
|
@ -977,12 +976,9 @@ async def get_document_by_chunk_id(
|
|||
session: AsyncSession = Depends(get_async_session),
|
||||
auth: AuthContext = Depends(get_auth_context),
|
||||
):
|
||||
"""Resolve a chunk id to its document plus a window of surrounding chunks.
|
||||
|
||||
Returns the cited chunk's 1-based line range (cited_start_line/
|
||||
cited_end_line) when char spans exist, so callers can anchor the citation
|
||||
to exact source lines. Uses SQL-level pagination to avoid loading all
|
||||
chunks into memory.
|
||||
"""
|
||||
Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
|
||||
Uses SQL-level pagination to avoid loading all chunks into memory.
|
||||
"""
|
||||
try:
|
||||
from sqlalchemy import and_, func, or_
|
||||
|
|
@ -1046,17 +1042,6 @@ async def get_document_by_chunk_id(
|
|||
)
|
||||
windowed_chunks = windowed_result.scalars().all()
|
||||
|
||||
cited_start_line: int | None = None
|
||||
cited_end_line: int | None = None
|
||||
if (
|
||||
chunk.start_char is not None
|
||||
and chunk.end_char is not None
|
||||
and document.source_markdown
|
||||
):
|
||||
cited_start_line, cited_end_line = char_span_to_line_range(
|
||||
document.source_markdown, chunk.start_char, chunk.end_char
|
||||
)
|
||||
|
||||
return DocumentWithChunksRead(
|
||||
id=document.id,
|
||||
title=document.title,
|
||||
|
|
@ -1071,8 +1056,6 @@ async def get_document_by_chunk_id(
|
|||
chunks=windowed_chunks,
|
||||
total_chunks=total_chunks,
|
||||
chunk_start_index=start,
|
||||
cited_start_line=cited_start_line,
|
||||
cited_end_line=cited_end_line,
|
||||
)
|
||||
except HTTPException:
|
||||
raise
|
||||
|
|
|
|||
|
|
@ -43,34 +43,6 @@ EDITOR_PLATE_MAX_BYTES = 1 * 1024 * 1024
|
|||
EDITOR_PLATE_MAX_LINES = 5000
|
||||
|
||||
|
||||
def _raise_no_canonical_body(document: Document) -> None:
|
||||
"""Translate a missing source_markdown into a status-aware HTTP error."""
|
||||
doc_status = document.status or {}
|
||||
state = (
|
||||
doc_status.get("state", "ready") if isinstance(doc_status, dict) else "ready"
|
||||
)
|
||||
|
||||
if state in ("pending", "processing"):
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="This document is still being processed. Please wait a moment and try again.",
|
||||
)
|
||||
if state == "failed":
|
||||
reason = (
|
||||
doc_status.get("reason", "Unknown error")
|
||||
if isinstance(doc_status, dict)
|
||||
else "Unknown error"
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="This document has no editable content. It may not have been processed correctly. Try re-indexing or re-uploading it.",
|
||||
)
|
||||
|
||||
|
||||
@router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content")
|
||||
async def get_editor_content(
|
||||
search_space_id: int,
|
||||
|
|
@ -82,9 +54,8 @@ async def get_editor_content(
|
|||
"""
|
||||
Get document content for editing.
|
||||
|
||||
Returns source_markdown (the canonical body) for the Plate.js editor, with a
|
||||
one-time migration from legacy blocknote_document. Never reconstructs the
|
||||
body from chunks.
|
||||
Returns source_markdown for the Plate.js editor.
|
||||
Falls back to blocknote_document → markdown conversion, then chunk reconstruction.
|
||||
|
||||
Requires DOCUMENTS_READ permission.
|
||||
"""
|
||||
|
|
@ -154,9 +125,52 @@ async def get_editor_content(
|
|||
await session.commit()
|
||||
return _build_response(empty_markdown)
|
||||
|
||||
# No canonical body. Chunks are an index artifact, never the source of
|
||||
# truth, so surface the processing state instead of rebuilding from them.
|
||||
_raise_no_canonical_body(document)
|
||||
chunk_contents_result = await session.execute(
|
||||
select(Chunk.content)
|
||||
.filter(Chunk.document_id == document_id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunk_contents = chunk_contents_result.scalars().all()
|
||||
|
||||
if not chunk_contents:
|
||||
doc_status = document.status or {}
|
||||
state = (
|
||||
doc_status.get("state", "ready")
|
||||
if isinstance(doc_status, dict)
|
||||
else "ready"
|
||||
)
|
||||
if state in ("pending", "processing"):
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="This document is still being processed. Please wait a moment and try again.",
|
||||
)
|
||||
if state == "failed":
|
||||
reason = (
|
||||
doc_status.get("reason", "Unknown error")
|
||||
if isinstance(doc_status, dict)
|
||||
else "Unknown error"
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="This document has no content. It may not have been processed correctly. Try deleting and re-uploading it.",
|
||||
)
|
||||
|
||||
markdown_content = "\n\n".join(chunk_contents)
|
||||
|
||||
if not markdown_content.strip():
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="This document appears to be empty. Try re-uploading or editing it to add content.",
|
||||
)
|
||||
|
||||
document.source_markdown = markdown_content
|
||||
await session.commit()
|
||||
|
||||
return _build_response(markdown_content)
|
||||
|
||||
|
||||
@router.get(
|
||||
|
|
@ -170,9 +184,8 @@ async def download_document_markdown(
|
|||
):
|
||||
user = auth.user
|
||||
"""
|
||||
Download the canonical document body as a .md file.
|
||||
|
||||
Serves source_markdown, migrating legacy blocknote_document when present.
|
||||
Download the full document content as a .md file.
|
||||
Reconstructs markdown from source_markdown or chunks.
|
||||
"""
|
||||
await check_permission(
|
||||
session,
|
||||
|
|
@ -198,6 +211,15 @@ async def download_document_markdown(
|
|||
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
||||
|
||||
markdown = blocknote_to_markdown(document.blocknote_document)
|
||||
if markdown is None:
|
||||
chunk_contents_result = await session.execute(
|
||||
select(Chunk.content)
|
||||
.filter(Chunk.document_id == document_id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunk_contents = chunk_contents_result.scalars().all()
|
||||
if chunk_contents:
|
||||
markdown = "\n\n".join(chunk_contents)
|
||||
|
||||
if not markdown or not markdown.strip():
|
||||
raise HTTPException(
|
||||
|
|
@ -340,6 +362,15 @@ async def export_document(
|
|||
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
||||
|
||||
markdown_content = blocknote_to_markdown(document.blocknote_document)
|
||||
if markdown_content is None:
|
||||
chunk_contents_result = await session.execute(
|
||||
select(Chunk.content)
|
||||
.filter(Chunk.document_id == document_id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunk_contents = chunk_contents_result.scalars().all()
|
||||
if chunk_contents:
|
||||
markdown_content = "\n\n".join(chunk_contents)
|
||||
|
||||
if not markdown_content or not markdown_content.strip():
|
||||
raise HTTPException(status_code=400, detail="Document has no content to export")
|
||||
|
|
|
|||
|
|
@ -214,7 +214,7 @@ async def _execute_image_generation(
|
|||
)
|
||||
|
||||
# Store response
|
||||
image_gen.response_data = (
|
||||
response_dict = (
|
||||
response.model_dump() if hasattr(response, "model_dump") else dict(response)
|
||||
)
|
||||
if not image_gen.model and hasattr(response, "_hidden_params"):
|
||||
|
|
@ -222,6 +222,20 @@ async def _execute_image_generation(
|
|||
if isinstance(hidden, dict) and hidden.get("model"):
|
||||
image_gen.model = hidden["model"]
|
||||
|
||||
# Fix relative URLs in response data (for the serving endpoint)
|
||||
from urllib.parse import urlparse
|
||||
images = response_dict.get("data", [])
|
||||
provider_base_url = resolved_kwargs.get("api_base")
|
||||
for image in images:
|
||||
if image.get("url"):
|
||||
raw_url: str = image["url"]
|
||||
if raw_url.startswith("/") and provider_base_url:
|
||||
parsed = urlparse(provider_base_url)
|
||||
origin = f"{parsed.scheme}://{parsed.netloc}"
|
||||
image["url"] = f"{origin}{raw_url}"
|
||||
|
||||
image_gen.response_data = response_dict
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Image Generation Execution + Results CRUD
|
||||
|
|
|
|||
|
|
@ -17,7 +17,4 @@ class ChunkUpdate(ChunkBase):
|
|||
|
||||
|
||||
class ChunkRead(ChunkBase, IDModel, TimestampModel):
|
||||
start_char: int | None = None
|
||||
end_char: int | None = None
|
||||
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
|
|
|||
|
|
@ -73,10 +73,6 @@ class DocumentWithChunksRead(DocumentRead):
|
|||
chunks: list[ChunkRead] = []
|
||||
total_chunks: int = 0
|
||||
chunk_start_index: int = 0
|
||||
# 1-based inclusive line range of the cited chunk within source_markdown;
|
||||
# None when the chunk predates char spans or the body is unavailable.
|
||||
cited_start_line: int | None = None
|
||||
cited_end_line: int | None = None
|
||||
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,23 +0,0 @@
|
|||
"""Convert char spans into document-relative line ranges.
|
||||
|
||||
Chunks store half-open char spans into ``source_markdown``; citations and the
|
||||
editor speak in line numbers. This is the single shared conversion so search,
|
||||
the resolve API, and highlighting all agree on what "lines X-Y" means.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def char_span_to_line_range(text: str, start_char: int, end_char: int) -> tuple[int, int]:
    """Map the half-open char span ``[start_char, end_char)`` to 1-based lines.

    Both offsets are clamped into ``[0, len(text)]`` and ``end_char`` is never
    allowed to fall before ``start_char``. The returned pair
    ``(first_line, last_line)`` is inclusive on both ends. A zero-length span
    yields the line that contains its position.
    """
    length = len(text)
    span_start = min(max(start_char, 0), length)
    span_end = max(min(end_char, length), span_start)

    # Index of the last character actually covered by the span; for an empty
    # span this collapses onto the start position.
    final_index = max(span_end - 1, span_start)

    # Newlines strictly before a position determine which line it sits on.
    first_line = 1 + text.count("\n", 0, span_start)
    # Only the newlines between the two positions can push the end line down.
    last_line = first_line + text.count("\n", span_start, final_index)
    return first_line, last_line
|
||||
Loading…
Add table
Add a link
Reference in a new issue