mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-07-02 22:01:05 +02:00
feat: serve numbered source_markdown reads with citation preamble
This commit is contained in:
parent
fc17b9becd
commit
188ae053ac
6 changed files with 78 additions and 44 deletions
|
|
@ -45,6 +45,10 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
|
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
|
||||||
build_document_xml,
|
build_document_xml,
|
||||||
)
|
)
|
||||||
|
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
|
||||||
|
build_read_preamble,
|
||||||
|
compute_matched_line_ranges,
|
||||||
|
)
|
||||||
from app.agents.chat.runtime.path_resolver import (
|
from app.agents.chat.runtime.path_resolver import (
|
||||||
DOCUMENTS_ROOT,
|
DOCUMENTS_ROOT,
|
||||||
build_path_index,
|
build_path_index,
|
||||||
|
|
@ -64,6 +68,12 @@ def _basename(path: str) -> str:
|
||||||
return path.rsplit("/", 1)[-1]
|
return path.rsplit("/", 1)[-1]
|
||||||
|
|
||||||
|
|
||||||
|
def _metadata_url(metadata: dict[str, Any]) -> str:
|
||||||
|
return (
|
||||||
|
metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _is_under(child: str, parent: str) -> bool:
|
def _is_under(child: str, parent: str) -> bool:
|
||||||
"""Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
|
"""Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
|
||||||
if parent == "/":
|
if parent == "/":
|
||||||
|
|
@ -460,8 +470,11 @@ class KBPostgresBackend(BackendProtocol):
|
||||||
loaded = await self._load_file_data(file_path)
|
loaded = await self._load_file_data(file_path)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return f"Error: File '{file_path}' not found"
|
return f"Error: File '{file_path}' not found"
|
||||||
file_data, _ = loaded
|
file_data, _, preamble = loaded
|
||||||
return format_read_response(file_data, offset, limit)
|
body = format_read_response(file_data, offset, limit)
|
||||||
|
if preamble and offset == 0:
|
||||||
|
return preamble + body
|
||||||
|
return body
|
||||||
|
|
||||||
def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override]
|
def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override]
|
||||||
return asyncio.run(self.aread(file_path, offset, limit))
|
return asyncio.run(self.aread(file_path, offset, limit))
|
||||||
|
|
@ -469,12 +482,14 @@ class KBPostgresBackend(BackendProtocol):
|
||||||
async def _load_file_data(
|
async def _load_file_data(
|
||||||
self,
|
self,
|
||||||
path: str,
|
path: str,
|
||||||
) -> tuple[dict[str, Any], int | None] | None:
|
) -> tuple[dict[str, Any], int | None, str | None] | None:
|
||||||
"""Lazy-load a virtual KB document into a deepagents ``FileData``.
|
"""Lazy-load a virtual KB document into a deepagents ``FileData``.
|
||||||
|
|
||||||
Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map
|
Returns ``(file_data, doc_id, preamble)`` or ``None`` if the path
|
||||||
to any known document. ``doc_id`` is ``None`` for the synthetic
|
doesn't map to any known document. ``doc_id`` is ``None`` for the
|
||||||
anonymous document so the caller doesn't track it as a DB-backed file.
|
synthetic anonymous document. ``preamble`` is the metadata header to
|
||||||
|
show above a numbered ``source_markdown`` body (``None`` for the legacy
|
||||||
|
chunk-reconstructed XML reads used when a document has no body).
|
||||||
"""
|
"""
|
||||||
anon = self._kb_anon_doc()
|
anon = self._kb_anon_doc()
|
||||||
if anon and str(anon.get("path") or "") == path:
|
if anon and str(anon.get("path") or "") == path:
|
||||||
|
|
@ -492,7 +507,7 @@ class KBPostgresBackend(BackendProtocol):
|
||||||
}
|
}
|
||||||
xml = build_document_xml(doc_payload, matched_chunk_ids=set())
|
xml = build_document_xml(doc_payload, matched_chunk_ids=set())
|
||||||
file_data = create_file_data(xml)
|
file_data = create_file_data(xml)
|
||||||
return file_data, None
|
return file_data, None, None
|
||||||
|
|
||||||
if not path.startswith(DOCUMENTS_ROOT):
|
if not path.startswith(DOCUMENTS_ROOT):
|
||||||
return None
|
return None
|
||||||
|
|
@ -505,41 +520,58 @@ class KBPostgresBackend(BackendProtocol):
|
||||||
)
|
)
|
||||||
if document is None:
|
if document is None:
|
||||||
return None
|
return None
|
||||||
chunk_rows = await session.execute(
|
source_markdown = document.source_markdown or ""
|
||||||
select(Chunk.id, Chunk.content)
|
document_type = (
|
||||||
.where(Chunk.document_id == document.id)
|
|
||||||
.order_by(Chunk.position, Chunk.id)
|
|
||||||
)
|
|
||||||
chunks = [
|
|
||||||
{"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
|
|
||||||
]
|
|
||||||
|
|
||||||
doc_payload = {
|
|
||||||
"document_id": document.id,
|
|
||||||
"chunks": chunks,
|
|
||||||
"matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
|
|
||||||
"document": {
|
|
||||||
"id": document.id,
|
|
||||||
"title": document.title,
|
|
||||||
"document_type": (
|
|
||||||
document.document_type.value
|
|
||||||
if getattr(document, "document_type", None) is not None
|
|
||||||
else "UNKNOWN"
|
|
||||||
),
|
|
||||||
"metadata": dict(document.document_metadata or {}),
|
|
||||||
},
|
|
||||||
"source": (
|
|
||||||
document.document_type.value
|
document.document_type.value
|
||||||
if getattr(document, "document_type", None) is not None
|
if getattr(document, "document_type", None) is not None
|
||||||
else "UNKNOWN"
|
else "UNKNOWN"
|
||||||
),
|
)
|
||||||
|
metadata = dict(document.document_metadata or {})
|
||||||
|
chunk_rows = await session.execute(
|
||||||
|
select(Chunk.id, Chunk.content, Chunk.start_char, Chunk.end_char)
|
||||||
|
.where(Chunk.document_id == document.id)
|
||||||
|
.order_by(Chunk.position, Chunk.id)
|
||||||
|
)
|
||||||
|
chunk_records = chunk_rows.all()
|
||||||
|
document_id = document.id
|
||||||
|
document_title = document.title
|
||||||
|
|
||||||
|
matched = self._matched_chunk_ids(document_id)
|
||||||
|
|
||||||
|
# Canonical read: serve the verbatim body with cat -n line numbers that
|
||||||
|
# line up with chunk char spans, so the agent cites real source lines.
|
||||||
|
if source_markdown:
|
||||||
|
ranges = compute_matched_line_ranges(
|
||||||
|
source_markdown,
|
||||||
|
[(r.id, r.start_char, r.end_char) for r in chunk_records],
|
||||||
|
matched,
|
||||||
|
)
|
||||||
|
preamble = build_read_preamble(
|
||||||
|
document_id=document_id,
|
||||||
|
document_type=document_type,
|
||||||
|
title=document_title,
|
||||||
|
url=_metadata_url(metadata),
|
||||||
|
matched_line_ranges=ranges,
|
||||||
|
)
|
||||||
|
return create_file_data(source_markdown), document_id, preamble
|
||||||
|
|
||||||
|
# Legacy fallback: no canonical body, reconstruct from chunks as XML.
|
||||||
|
doc_payload = {
|
||||||
|
"document_id": document_id,
|
||||||
|
"chunks": [
|
||||||
|
{"chunk_id": r.id, "content": r.content} for r in chunk_records
|
||||||
|
],
|
||||||
|
"matched_chunk_ids": list(matched),
|
||||||
|
"document": {
|
||||||
|
"id": document_id,
|
||||||
|
"title": document_title,
|
||||||
|
"document_type": document_type,
|
||||||
|
"metadata": metadata,
|
||||||
|
},
|
||||||
|
"source": document_type,
|
||||||
}
|
}
|
||||||
xml = build_document_xml(
|
xml = build_document_xml(doc_payload, matched_chunk_ids=matched)
|
||||||
doc_payload,
|
return create_file_data(xml), document_id, None
|
||||||
matched_chunk_ids=self._matched_chunk_ids(document.id),
|
|
||||||
)
|
|
||||||
file_data = create_file_data(xml)
|
|
||||||
return file_data, document.id
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------ writes
|
# ------------------------------------------------------------------ writes
|
||||||
|
|
||||||
|
|
@ -571,7 +603,7 @@ class KBPostgresBackend(BackendProtocol):
|
||||||
loaded = await self._load_file_data(file_path)
|
loaded = await self._load_file_data(file_path)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return EditResult(error=f"Error: File '{file_path}' not found")
|
return EditResult(error=f"Error: File '{file_path}' not found")
|
||||||
file_data, _ = loaded
|
file_data, _, _ = loaded
|
||||||
|
|
||||||
content = file_data_to_string(file_data)
|
content = file_data_to_string(file_data)
|
||||||
result = perform_string_replacement(
|
result = perform_string_replacement(
|
||||||
|
|
|
||||||
|
|
@ -73,7 +73,7 @@ def create_edit_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
|
||||||
loaded = await backend._load_file_data(validated)
|
loaded = await backend._load_file_data(validated)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return f"Error: File '{validated}' not found"
|
return f"Error: File '{validated}' not found"
|
||||||
_, doc_id_to_attach = loaded
|
_, doc_id_to_attach, _ = loaded
|
||||||
|
|
||||||
res: EditResult = await backend.aedit(
|
res: EditResult = await backend.aedit(
|
||||||
validated, old_string, new_string, replace_all=replace_all
|
validated, old_string, new_string, replace_all=replace_all
|
||||||
|
|
|
||||||
|
|
@ -75,7 +75,7 @@ async def cloud_move_file(
|
||||||
loaded = await backend._load_file_data(source)
|
loaded = await backend._load_file_data(source)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return f"Error: source '{source}' not found."
|
return f"Error: source '{source}' not found."
|
||||||
source_file_data, loaded_doc_id = loaded
|
source_file_data, loaded_doc_id, _ = loaded
|
||||||
if source_doc_id is None:
|
if source_doc_id is None:
|
||||||
source_doc_id = loaded_doc_id
|
source_doc_id = loaded_doc_id
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -58,8 +58,10 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
|
||||||
loaded = await backend._load_file_data(validated)
|
loaded = await backend._load_file_data(validated)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return f"Error: File '{validated}' not found"
|
return f"Error: File '{validated}' not found"
|
||||||
file_data, doc_id = loaded
|
file_data, doc_id, preamble = loaded
|
||||||
rendered = format_read_response(file_data, offset, limit)
|
rendered = format_read_response(file_data, offset, limit)
|
||||||
|
if preamble and offset == 0:
|
||||||
|
rendered = preamble + rendered
|
||||||
update: dict[str, Any] = {
|
update: dict[str, Any] = {
|
||||||
"files": {validated: file_data},
|
"files": {validated: file_data},
|
||||||
"messages": [
|
"messages": [
|
||||||
|
|
|
||||||
|
|
@ -74,7 +74,7 @@ async def cloud_rm(
|
||||||
loaded = await backend._load_file_data(validated)
|
loaded = await backend._load_file_data(validated)
|
||||||
if loaded is None:
|
if loaded is None:
|
||||||
return f"Error: file '{validated}' not found."
|
return f"Error: file '{validated}' not found."
|
||||||
_, resolved_doc_id = loaded
|
_, resolved_doc_id, _ = loaded
|
||||||
|
|
||||||
files_update: dict[str, Any] = {validated: None}
|
files_update: dict[str, Any] = {validated: None}
|
||||||
update: dict[str, Any] = {
|
update: dict[str, Any] = {
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,7 @@ class _KBBackendStub(KBPostgresBackend):
|
||||||
def __init__(self, *, children=None, file_data=None) -> None:
|
def __init__(self, *, children=None, file_data=None) -> None:
|
||||||
self.als_info = AsyncMock(return_value=children or [])
|
self.als_info = AsyncMock(return_value=children or [])
|
||||||
self._load_file_data = AsyncMock(
|
self._load_file_data = AsyncMock(
|
||||||
return_value=(file_data, 17) if file_data is not None else None
|
return_value=(file_data, 17, None) if file_data is not None else None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue