mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-20 21:18:13 +02:00
feat: serve numbered source_markdown reads with citation preamble
This commit is contained in:
parent
fc17b9becd
commit
188ae053ac
6 changed files with 78 additions and 44 deletions
|
|
@ -45,6 +45,10 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
|
||||
build_document_xml,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
|
||||
build_read_preamble,
|
||||
compute_matched_line_ranges,
|
||||
)
|
||||
from app.agents.chat.runtime.path_resolver import (
|
||||
DOCUMENTS_ROOT,
|
||||
build_path_index,
|
||||
|
|
@ -64,6 +68,12 @@ def _basename(path: str) -> str:
|
|||
return path.rsplit("/", 1)[-1]
|
||||
|
||||
|
||||
def _metadata_url(metadata: dict[str, Any]) -> str:
|
||||
return (
|
||||
metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
|
||||
)
|
||||
|
||||
|
||||
def _is_under(child: str, parent: str) -> bool:
|
||||
"""Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
|
||||
if parent == "/":
|
||||
|
|
@ -460,8 +470,11 @@ class KBPostgresBackend(BackendProtocol):
|
|||
loaded = await self._load_file_data(file_path)
|
||||
if loaded is None:
|
||||
return f"Error: File '{file_path}' not found"
|
||||
file_data, _ = loaded
|
||||
return format_read_response(file_data, offset, limit)
|
||||
file_data, _, preamble = loaded
|
||||
body = format_read_response(file_data, offset, limit)
|
||||
if preamble and offset == 0:
|
||||
return preamble + body
|
||||
return body
|
||||
|
||||
def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override]
|
||||
return asyncio.run(self.aread(file_path, offset, limit))
|
||||
|
|
@ -469,12 +482,14 @@ class KBPostgresBackend(BackendProtocol):
|
|||
async def _load_file_data(
|
||||
self,
|
||||
path: str,
|
||||
) -> tuple[dict[str, Any], int | None] | None:
|
||||
) -> tuple[dict[str, Any], int | None, str | None] | None:
|
||||
"""Lazy-load a virtual KB document into a deepagents ``FileData``.
|
||||
|
||||
Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map
|
||||
to any known document. ``doc_id`` is ``None`` for the synthetic
|
||||
anonymous document so the caller doesn't track it as a DB-backed file.
|
||||
Returns ``(file_data, doc_id, preamble)`` or ``None`` if the path
|
||||
doesn't map to any known document. ``doc_id`` is ``None`` for the
|
||||
synthetic anonymous document. ``preamble`` is the metadata header to
|
||||
show above a numbered ``source_markdown`` body (``None`` for the legacy
|
||||
chunk-reconstructed XML reads used when a document has no body).
|
||||
"""
|
||||
anon = self._kb_anon_doc()
|
||||
if anon and str(anon.get("path") or "") == path:
|
||||
|
|
@ -492,7 +507,7 @@ class KBPostgresBackend(BackendProtocol):
|
|||
}
|
||||
xml = build_document_xml(doc_payload, matched_chunk_ids=set())
|
||||
file_data = create_file_data(xml)
|
||||
return file_data, None
|
||||
return file_data, None, None
|
||||
|
||||
if not path.startswith(DOCUMENTS_ROOT):
|
||||
return None
|
||||
|
|
@ -505,41 +520,58 @@ class KBPostgresBackend(BackendProtocol):
|
|||
)
|
||||
if document is None:
|
||||
return None
|
||||
chunk_rows = await session.execute(
|
||||
select(Chunk.id, Chunk.content)
|
||||
.where(Chunk.document_id == document.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunks = [
|
||||
{"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
|
||||
]
|
||||
|
||||
doc_payload = {
|
||||
"document_id": document.id,
|
||||
"chunks": chunks,
|
||||
"matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
|
||||
"document": {
|
||||
"id": document.id,
|
||||
"title": document.title,
|
||||
"document_type": (
|
||||
document.document_type.value
|
||||
if getattr(document, "document_type", None) is not None
|
||||
else "UNKNOWN"
|
||||
),
|
||||
"metadata": dict(document.document_metadata or {}),
|
||||
},
|
||||
"source": (
|
||||
source_markdown = document.source_markdown or ""
|
||||
document_type = (
|
||||
document.document_type.value
|
||||
if getattr(document, "document_type", None) is not None
|
||||
else "UNKNOWN"
|
||||
),
|
||||
)
|
||||
metadata = dict(document.document_metadata or {})
|
||||
chunk_rows = await session.execute(
|
||||
select(Chunk.id, Chunk.content, Chunk.start_char, Chunk.end_char)
|
||||
.where(Chunk.document_id == document.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunk_records = chunk_rows.all()
|
||||
document_id = document.id
|
||||
document_title = document.title
|
||||
|
||||
matched = self._matched_chunk_ids(document_id)
|
||||
|
||||
# Canonical read: serve the verbatim body with cat -n line numbers that
|
||||
# line up with chunk char spans, so the agent cites real source lines.
|
||||
if source_markdown:
|
||||
ranges = compute_matched_line_ranges(
|
||||
source_markdown,
|
||||
[(r.id, r.start_char, r.end_char) for r in chunk_records],
|
||||
matched,
|
||||
)
|
||||
preamble = build_read_preamble(
|
||||
document_id=document_id,
|
||||
document_type=document_type,
|
||||
title=document_title,
|
||||
url=_metadata_url(metadata),
|
||||
matched_line_ranges=ranges,
|
||||
)
|
||||
return create_file_data(source_markdown), document_id, preamble
|
||||
|
||||
# Legacy fallback: no canonical body, reconstruct from chunks as XML.
|
||||
doc_payload = {
|
||||
"document_id": document_id,
|
||||
"chunks": [
|
||||
{"chunk_id": r.id, "content": r.content} for r in chunk_records
|
||||
],
|
||||
"matched_chunk_ids": list(matched),
|
||||
"document": {
|
||||
"id": document_id,
|
||||
"title": document_title,
|
||||
"document_type": document_type,
|
||||
"metadata": metadata,
|
||||
},
|
||||
"source": document_type,
|
||||
}
|
||||
xml = build_document_xml(
|
||||
doc_payload,
|
||||
matched_chunk_ids=self._matched_chunk_ids(document.id),
|
||||
)
|
||||
file_data = create_file_data(xml)
|
||||
return file_data, document.id
|
||||
xml = build_document_xml(doc_payload, matched_chunk_ids=matched)
|
||||
return create_file_data(xml), document_id, None
|
||||
|
||||
# ------------------------------------------------------------------ writes
|
||||
|
||||
|
|
@ -571,7 +603,7 @@ class KBPostgresBackend(BackendProtocol):
|
|||
loaded = await self._load_file_data(file_path)
|
||||
if loaded is None:
|
||||
return EditResult(error=f"Error: File '{file_path}' not found")
|
||||
file_data, _ = loaded
|
||||
file_data, _, _ = loaded
|
||||
|
||||
content = file_data_to_string(file_data)
|
||||
result = perform_string_replacement(
|
||||
|
|
|
|||
|
|
@ -73,7 +73,7 @@ def create_edit_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
|
|||
loaded = await backend._load_file_data(validated)
|
||||
if loaded is None:
|
||||
return f"Error: File '{validated}' not found"
|
||||
_, doc_id_to_attach = loaded
|
||||
_, doc_id_to_attach, _ = loaded
|
||||
|
||||
res: EditResult = await backend.aedit(
|
||||
validated, old_string, new_string, replace_all=replace_all
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ async def cloud_move_file(
|
|||
loaded = await backend._load_file_data(source)
|
||||
if loaded is None:
|
||||
return f"Error: source '{source}' not found."
|
||||
source_file_data, loaded_doc_id = loaded
|
||||
source_file_data, loaded_doc_id, _ = loaded
|
||||
if source_doc_id is None:
|
||||
source_doc_id = loaded_doc_id
|
||||
|
||||
|
|
|
|||
|
|
@ -58,8 +58,10 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
|
|||
loaded = await backend._load_file_data(validated)
|
||||
if loaded is None:
|
||||
return f"Error: File '{validated}' not found"
|
||||
file_data, doc_id = loaded
|
||||
file_data, doc_id, preamble = loaded
|
||||
rendered = format_read_response(file_data, offset, limit)
|
||||
if preamble and offset == 0:
|
||||
rendered = preamble + rendered
|
||||
update: dict[str, Any] = {
|
||||
"files": {validated: file_data},
|
||||
"messages": [
|
||||
|
|
|
|||
|
|
@ -74,7 +74,7 @@ async def cloud_rm(
|
|||
loaded = await backend._load_file_data(validated)
|
||||
if loaded is None:
|
||||
return f"Error: file '{validated}' not found."
|
||||
_, resolved_doc_id = loaded
|
||||
_, resolved_doc_id, _ = loaded
|
||||
|
||||
files_update: dict[str, Any] = {validated: None}
|
||||
update: dict[str, Any] = {
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ class _KBBackendStub(KBPostgresBackend):
|
|||
def __init__(self, *, children=None, file_data=None) -> None:
|
||||
self.als_info = AsyncMock(return_value=children or [])
|
||||
self._load_file_data = AsyncMock(
|
||||
return_value=(file_data, 17) if file_data is not None else None
|
||||
return_value=(file_data, 17, None) if file_data is not None else None
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue