mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-23 19:05:16 +02:00
refactor: add public URL handling for SurfSense documents across various components and schemas
This commit is contained in:
parent
ea087d1d23
commit
01d7379914
8 changed files with 35 additions and 8 deletions
|
|
@ -9,6 +9,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument
|
||||
from app.utils.document_converters import embed_text
|
||||
from app.utils.surfsense_docs import surfsense_docs_public_url
|
||||
|
||||
|
||||
def format_surfsense_docs_results(results: list[tuple]) -> str:
|
||||
|
|
@ -19,13 +20,14 @@ def format_surfsense_docs_results(results: list[tuple]) -> str:
|
|||
# Group chunks by document
|
||||
grouped: dict[int, dict] = {}
|
||||
for chunk, doc in results:
|
||||
public_url = surfsense_docs_public_url(doc.source)
|
||||
if doc.id not in grouped:
|
||||
grouped[doc.id] = {
|
||||
"document_id": f"doc-{doc.id}",
|
||||
"document_type": "SURFSENSE_DOCS",
|
||||
"title": doc.title,
|
||||
"url": doc.source,
|
||||
"metadata": {"source": doc.source},
|
||||
"url": public_url,
|
||||
"metadata": {"source": doc.source, "public_url": public_url},
|
||||
"chunks": [],
|
||||
}
|
||||
grouped[doc.id]["chunks"].append(
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
|
||||
from app.utils.document_converters import embed_text
|
||||
from app.utils.surfsense_docs import surfsense_docs_public_url
|
||||
|
||||
|
||||
def format_surfsense_docs_results(results: list[tuple]) -> str:
|
||||
|
|
@ -40,13 +41,14 @@ def format_surfsense_docs_results(results: list[tuple]) -> str:
|
|||
# Group chunks by document
|
||||
grouped: dict[int, dict] = {}
|
||||
for chunk, doc in results:
|
||||
public_url = surfsense_docs_public_url(doc.source)
|
||||
if doc.id not in grouped:
|
||||
grouped[doc.id] = {
|
||||
"document_id": f"doc-{doc.id}",
|
||||
"document_type": "SURFSENSE_DOCS",
|
||||
"title": doc.title,
|
||||
"url": doc.source,
|
||||
"metadata": {"source": doc.source},
|
||||
"url": public_url,
|
||||
"metadata": {"source": doc.source, "public_url": public_url},
|
||||
"chunks": [],
|
||||
}
|
||||
grouped[doc.id]["chunks"].append(
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ from app.schemas.surfsense_docs import (
|
|||
SurfsenseDocsDocumentWithChunksRead,
|
||||
)
|
||||
from app.users import current_active_user
|
||||
from app.utils.surfsense_docs import surfsense_docs_public_url
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
|
@ -76,6 +77,7 @@ async def get_surfsense_doc_by_chunk_id(
|
|||
id=document.id,
|
||||
title=document.title,
|
||||
source=document.source,
|
||||
public_url=surfsense_docs_public_url(document.source),
|
||||
content=document.content,
|
||||
chunks=[
|
||||
SurfsenseDocsChunkRead(id=c.id, content=c.content)
|
||||
|
|
@ -146,6 +148,7 @@ async def list_surfsense_docs(
|
|||
id=doc.id,
|
||||
title=doc.title,
|
||||
source=doc.source,
|
||||
public_url=surfsense_docs_public_url(doc.source),
|
||||
content=doc.content,
|
||||
created_at=doc.created_at,
|
||||
updated_at=doc.updated_at,
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ class SurfsenseDocsDocumentRead(BaseModel):
|
|||
id: int
|
||||
title: str
|
||||
source: str
|
||||
public_url: str
|
||||
content: str
|
||||
created_at: datetime | None = None
|
||||
updated_at: datetime | None = None
|
||||
|
|
@ -35,6 +36,7 @@ class SurfsenseDocsDocumentWithChunksRead(BaseModel):
|
|||
id: int
|
||||
title: str
|
||||
source: str
|
||||
public_url: str
|
||||
content: str
|
||||
chunks: list[SurfsenseDocsChunkRead]
|
||||
|
||||
|
|
|
|||
|
|
@ -78,6 +78,7 @@ from app.services.new_streaming_service import VercelStreamingService
|
|||
from app.tasks.chat.streaming.graph_stream.event_stream import stream_output
|
||||
from app.utils.content_utils import bootstrap_history_from_db
|
||||
from app.utils.perf import get_perf_logger, log_system_snapshot, trim_native_heap
|
||||
from app.utils.surfsense_docs import surfsense_docs_public_url
|
||||
from app.utils.user_message_multimodal import build_human_message_content
|
||||
|
||||
_background_tasks: set[asyncio.Task] = set()
|
||||
|
|
@ -239,14 +240,17 @@ def format_mentioned_surfsense_docs_as_context(
|
|||
)
|
||||
|
||||
for doc in documents:
|
||||
metadata_json = json.dumps({"source": doc.source}, ensure_ascii=False)
|
||||
public_url = surfsense_docs_public_url(doc.source)
|
||||
metadata_json = json.dumps(
|
||||
{"source": doc.source, "public_url": public_url}, ensure_ascii=False
|
||||
)
|
||||
|
||||
context_parts.append("<document>")
|
||||
context_parts.append("<document_metadata>")
|
||||
context_parts.append(f" <document_id>doc-{doc.id}</document_id>")
|
||||
context_parts.append(" <document_type>SURFSENSE_DOCS</document_type>")
|
||||
context_parts.append(f" <title><![CDATA[{doc.title}]]></title>")
|
||||
context_parts.append(f" <url><![CDATA[{doc.source}]]></url>")
|
||||
context_parts.append(f" <url><![CDATA[{public_url}]]></url>")
|
||||
context_parts.append(
|
||||
f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
|
||||
)
|
||||
|
|
|
|||
13
surfsense_backend/app/utils/surfsense_docs.py
Normal file
13
surfsense_backend/app/utils/surfsense_docs.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
"""Utilities for SurfSense's built-in documentation index."""
|
||||
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
DOCS_PUBLIC_ROOT = PurePosixPath("/docs")
|
||||
|
||||
|
||||
def surfsense_docs_public_url(source: str) -> str:
|
||||
"""Return the public docs route for an indexed documentation source path."""
|
||||
docs_path = PurePosixPath(source).with_suffix("")
|
||||
if docs_path.name == "index":
|
||||
docs_path = docs_path.parent
|
||||
return (DOCS_PUBLIC_ROOT / docs_path).as_posix()
|
||||
Loading…
Add table
Add a link
Reference in a new issue