Merge pull request #682 from CREDO23/sur-77-impr-add-surfsense-documentation-to-new-default-search

[Feature] Onboarding | Built-in documentation search tool
This commit is contained in:
Rohan Verma 2026-01-12 12:45:52 -08:00 committed by GitHub
commit 33586f1cb7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 874 additions and 20 deletions

View file

@ -165,6 +165,8 @@ COPY --from=frontend-builder /app/.next/standalone ./
COPY --from=frontend-builder /app/.next/static ./.next/static
COPY --from=frontend-builder /app/public ./public
COPY surfsense_web/content/docs /app/surfsense_web/content/docs
# ====================
# Setup Backend
# ====================

View file

@ -0,0 +1,165 @@
"""Add Surfsense docs tables for global documentation storage
Revision ID: 60
Revises: 59
"""
from collections.abc import Sequence
from alembic import op
from app.config import config
# revision identifiers, used by Alembic.
revision: str = "60"
down_revision: str | None = "59"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Get embedding dimension from config
EMBEDDING_DIM = config.embedding_model_instance.dimension
def upgrade() -> None:
"""Create surfsense_docs_documents and surfsense_docs_chunks tables."""
# Create surfsense_docs_documents table
op.execute(
f"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'surfsense_docs_documents'
) THEN
CREATE TABLE surfsense_docs_documents (
id SERIAL PRIMARY KEY,
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
source VARCHAR NOT NULL UNIQUE,
title VARCHAR NOT NULL,
content TEXT NOT NULL,
content_hash VARCHAR NOT NULL,
embedding vector({EMBEDDING_DIM}),
updated_at TIMESTAMP WITH TIME ZONE
);
END IF;
END$$;
"""
)
# Create indexes for surfsense_docs_documents
op.execute(
"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_indexes
WHERE tablename = 'surfsense_docs_documents' AND indexname = 'ix_surfsense_docs_documents_source'
) THEN
CREATE INDEX ix_surfsense_docs_documents_source ON surfsense_docs_documents(source);
END IF;
IF NOT EXISTS (
SELECT 1 FROM pg_indexes
WHERE tablename = 'surfsense_docs_documents' AND indexname = 'ix_surfsense_docs_documents_content_hash'
) THEN
CREATE INDEX ix_surfsense_docs_documents_content_hash ON surfsense_docs_documents(content_hash);
END IF;
IF NOT EXISTS (
SELECT 1 FROM pg_indexes
WHERE tablename = 'surfsense_docs_documents' AND indexname = 'ix_surfsense_docs_documents_updated_at'
) THEN
CREATE INDEX ix_surfsense_docs_documents_updated_at ON surfsense_docs_documents(updated_at);
END IF;
END$$;
"""
)
# Create surfsense_docs_chunks table
op.execute(
f"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'surfsense_docs_chunks'
) THEN
CREATE TABLE surfsense_docs_chunks (
id SERIAL PRIMARY KEY,
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
content TEXT NOT NULL,
embedding vector({EMBEDDING_DIM}),
document_id INTEGER NOT NULL REFERENCES surfsense_docs_documents(id) ON DELETE CASCADE
);
END IF;
END$$;
"""
)
# Create indexes for surfsense_docs_chunks
op.execute(
"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_indexes
WHERE tablename = 'surfsense_docs_chunks' AND indexname = 'ix_surfsense_docs_chunks_document_id'
) THEN
CREATE INDEX ix_surfsense_docs_chunks_document_id ON surfsense_docs_chunks(document_id);
END IF;
END$$;
"""
)
# Create vector indexes for similarity search
op.execute(
"""
CREATE INDEX IF NOT EXISTS surfsense_docs_documents_vector_index
ON surfsense_docs_documents USING hnsw (embedding public.vector_cosine_ops);
"""
)
op.execute(
"""
CREATE INDEX IF NOT EXISTS surfsense_docs_chunks_vector_index
ON surfsense_docs_chunks USING hnsw (embedding public.vector_cosine_ops);
"""
)
# Create full-text search indexes (same pattern as documents/chunks tables)
op.execute(
"""
CREATE INDEX IF NOT EXISTS surfsense_docs_documents_search_index
ON surfsense_docs_documents USING gin (to_tsvector('english', content));
"""
)
op.execute(
"""
CREATE INDEX IF NOT EXISTS surfsense_docs_chunks_search_index
ON surfsense_docs_chunks USING gin (to_tsvector('english', content));
"""
)
def downgrade() -> None:
"""Remove surfsense docs tables."""
# Drop full-text search indexes
op.execute("DROP INDEX IF EXISTS surfsense_docs_chunks_search_index")
op.execute("DROP INDEX IF EXISTS surfsense_docs_documents_search_index")
# Drop vector indexes
op.execute("DROP INDEX IF EXISTS surfsense_docs_chunks_vector_index")
op.execute("DROP INDEX IF EXISTS surfsense_docs_documents_vector_index")
# Drop regular indexes
op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_chunks_document_id")
op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_documents_updated_at")
op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_documents_content_hash")
op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_documents_source")
# Drop tables (chunks first due to FK)
op.execute("DROP TABLE IF EXISTS surfsense_docs_chunks")
op.execute("DROP TABLE IF EXISTS surfsense_docs_documents")

View file

@ -26,6 +26,13 @@ SURFSENSE_TOOLS_INSTRUCTIONS = """
<tools>
You have access to the following tools:
0. search_surfsense_docs: Search the official SurfSense documentation.
- Use this tool when the user asks anything about SurfSense itself (the application they are using).
- Args:
- query: The search query about SurfSense
- top_k: Number of documentation chunks to retrieve (default: 10)
- Returns: Documentation content with chunk IDs for citations (prefixed with 'doc-', e.g., [citation:doc-123])
1. search_knowledge_base: Search the user's personal knowledge base for relevant information.
- Args:
- query: The search query - be specific and include key terms
@ -152,6 +159,18 @@ You have access to the following tools:
- Airtable/Notion: Check field values, apply mapping above
</tools>
<tool_call_examples>
- User: "How do I install SurfSense?"
- Call: `search_surfsense_docs(query="installation setup")`
- User: "What connectors does SurfSense support?"
- Call: `search_surfsense_docs(query="available connectors integrations")`
- User: "How do I set up the Notion connector?"
- Call: `search_surfsense_docs(query="Notion connector setup configuration")`
- User: "How do I use Docker to run SurfSense?"
- Call: `search_surfsense_docs(query="Docker installation setup")`
- User: "Fetch all my notes and what's in them?"
- Call: `search_knowledge_base(query="*", top_k=50, connectors_to_search=["NOTE"])`
@ -308,7 +327,7 @@ The documents you receive are structured like this:
</document_content>
</document>
IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124). Do NOT cite document_id.
IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124, doc-45). Do NOT cite document_id.
</document_structure_example>
<citation_format>
@ -319,11 +338,13 @@ IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124). Do NOT cite docume
- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
- Copy the EXACT chunk id from the XML - if it says `<chunk id='doc-123'>`, use [citation:doc-123]
</citation_format>
<citation_examples>
CORRECT citation formats:
- [citation:5]
- [citation:doc-123] (for Surfsense documentation chunks)
- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
INCORRECT citation formats (DO NOT use):

View file

@ -6,6 +6,7 @@ To add a new tool, see the documentation in registry.py.
Available tools:
- search_knowledge_base: Search the user's personal knowledge base
- search_surfsense_docs: Search Surfsense documentation for usage help
- generate_podcast: Generate audio podcasts from content
- link_preview: Fetch rich previews for URLs
- display_image: Display images in chat
@ -31,6 +32,7 @@ from .registry import (
get_tool_by_name,
)
from .scrape_webpage import create_scrape_webpage_tool
from .search_surfsense_docs import create_search_surfsense_docs_tool
__all__ = [
# Registry
@ -43,6 +45,7 @@ __all__ = [
"create_link_preview_tool",
"create_scrape_webpage_tool",
"create_search_knowledge_base_tool",
"create_search_surfsense_docs_tool",
# Knowledge base utilities
"format_documents_for_context",
"get_all_tool_names",

View file

@ -48,6 +48,7 @@ from .knowledge_base import create_search_knowledge_base_tool
from .link_preview import create_link_preview_tool
from .podcast import create_generate_podcast_tool
from .scrape_webpage import create_scrape_webpage_tool
from .search_surfsense_docs import create_search_surfsense_docs_tool
# =============================================================================
# Tool Definition
@ -126,6 +127,15 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
requires=[], # firecrawl_api_key is optional
),
# Note: write_todos is now provided by TodoListMiddleware from deepagents
# Surfsense documentation search tool
ToolDefinition(
name="search_surfsense_docs",
description="Search Surfsense documentation for help with using the application",
factory=lambda deps: create_search_surfsense_docs_tool(
db_session=deps["db_session"],
),
requires=["db_session"],
),
# =========================================================================
# ADD YOUR CUSTOM TOOLS BELOW
# =========================================================================

View file

@ -0,0 +1,160 @@
"""
Surfsense documentation search tool.
This tool allows the agent to search the pre-indexed Surfsense documentation
to help users with questions about how to use the application.
The documentation is indexed at deployment time from MDX files and stored
in dedicated tables (surfsense_docs_documents, surfsense_docs_chunks).
"""
import json
from langchain_core.tools import tool
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument
def format_surfsense_docs_results(results: list[tuple]) -> str:
"""
Format search results into XML structure for the LLM context.
Uses the same XML structure as format_documents_for_context from knowledge_base.py
but with 'doc-' prefix on chunk IDs. This allows:
- LLM to use consistent [citation:doc-XXX] format
- Frontend to detect 'doc-' prefix and route to surfsense docs endpoint
Args:
results: List of (chunk, document) tuples from the database query
Returns:
Formatted XML string with documentation content and citation-ready chunks
"""
if not results:
return "No relevant Surfsense documentation found for your query."
# Group chunks by document
grouped: dict[int, dict] = {}
for chunk, doc in results:
if doc.id not in grouped:
grouped[doc.id] = {
"document_id": f"doc-{doc.id}",
"document_type": "SURFSENSE_DOCS",
"title": doc.title,
"url": doc.source,
"metadata": {"source": doc.source},
"chunks": [],
}
grouped[doc.id]["chunks"].append({
"chunk_id": f"doc-{chunk.id}",
"content": chunk.content,
})
# Render XML matching format_documents_for_context structure
parts: list[str] = []
for g in grouped.values():
metadata_json = json.dumps(g["metadata"], ensure_ascii=False)
parts.append("<document>")
parts.append("<document_metadata>")
parts.append(f" <document_id>{g['document_id']}</document_id>")
parts.append(f" <document_type>{g['document_type']}</document_type>")
parts.append(f" <title><![CDATA[{g['title']}]]></title>")
parts.append(f" <url><![CDATA[{g['url']}]]></url>")
parts.append(f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>")
parts.append("</document_metadata>")
parts.append("")
parts.append("<document_content>")
for ch in g["chunks"]:
parts.append(f" <chunk id='{ch['chunk_id']}'><![CDATA[{ch['content']}]]></chunk>")
parts.append("</document_content>")
parts.append("</document>")
parts.append("")
return "\n".join(parts).strip()
async def search_surfsense_docs_async(
query: str,
db_session: AsyncSession,
top_k: int = 10,
) -> str:
"""
Search Surfsense documentation using vector similarity.
Args:
query: The search query about Surfsense usage
db_session: Database session for executing queries
top_k: Number of results to return
Returns:
Formatted string with relevant documentation content
"""
# Get embedding for the query
query_embedding = config.embedding_model_instance.embed(query)
# Vector similarity search on chunks, joining with documents
stmt = (
select(SurfsenseDocsChunk, SurfsenseDocsDocument)
.join(
SurfsenseDocsDocument,
SurfsenseDocsChunk.document_id == SurfsenseDocsDocument.id,
)
.order_by(SurfsenseDocsChunk.embedding.op("<=>")(query_embedding))
.limit(top_k)
)
result = await db_session.execute(stmt)
rows = result.all()
return format_surfsense_docs_results(rows)
def create_search_surfsense_docs_tool(db_session: AsyncSession):
"""
Factory function to create the search_surfsense_docs tool.
Args:
db_session: Database session for executing queries
Returns:
A configured tool function for searching Surfsense documentation
"""
@tool
async def search_surfsense_docs(query: str, top_k: int = 10) -> str:
"""
Search Surfsense documentation for help with using the application.
Use this tool when the user asks questions about:
- How to use Surfsense features
- Installation and setup instructions
- Configuration options and settings
- Troubleshooting common issues
- Available connectors and integrations
- Browser extension usage
- API documentation
This searches the official Surfsense documentation that was indexed
at deployment time. It does NOT search the user's personal knowledge base.
Args:
query: The search query about Surfsense usage or features
top_k: Number of documentation chunks to retrieve (default: 10)
Returns:
Relevant documentation content formatted with chunk IDs for citations
"""
return await search_surfsense_docs_async(
query=query,
db_session=db_session,
top_k=top_k,
)
return search_surfsense_docs

View file

@ -13,6 +13,7 @@ from app.config import config
from app.db import User, create_db_and_tables, get_async_session
from app.routes import router as crud_router
from app.schemas import UserCreate, UserRead, UserUpdate
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
from app.users import SECRET, auth_backend, current_active_user, fastapi_users
@ -22,6 +23,8 @@ async def lifespan(app: FastAPI):
await create_db_and_tables()
# Setup LangGraph checkpointer tables for conversation persistence
await setup_checkpointer_tables()
# Seed Surfsense documentation
await seed_surfsense_docs()
yield
# Cleanup: close checkpointer connection on shutdown
await close_checkpointer()

View file

@ -428,6 +428,44 @@ class Chunk(BaseModel, TimestampMixin):
document = relationship("Document", back_populates="chunks")
class SurfsenseDocsDocument(BaseModel, TimestampMixin):
"""
Surfsense documentation storage.
Indexed at migration time from MDX files.
"""
__tablename__ = "surfsense_docs_documents"
source = Column(String, nullable=False, unique=True, index=True) # File path: "connectors/slack.mdx"
title = Column(String, nullable=False)
content = Column(Text, nullable=False)
content_hash = Column(String, nullable=False, index=True) # For detecting changes
embedding = Column(Vector(config.embedding_model_instance.dimension))
updated_at = Column(TIMESTAMP(timezone=True), nullable=True, index=True)
chunks = relationship(
"SurfsenseDocsChunk",
back_populates="document",
cascade="all, delete-orphan",
)
class SurfsenseDocsChunk(BaseModel, TimestampMixin):
"""Chunk storage for Surfsense documentation."""
__tablename__ = "surfsense_docs_chunks"
content = Column(Text, nullable=False)
embedding = Column(Vector(config.embedding_model_instance.dimension))
document_id = Column(
Integer,
ForeignKey("surfsense_docs_documents.id", ondelete="CASCADE"),
nullable=False,
)
document = relationship("SurfsenseDocsDocument", back_populates="chunks")
class Podcast(BaseModel, TimestampMixin):
"""Podcast model for storing generated podcasts."""

View file

@ -31,6 +31,7 @@ from .rbac_routes import router as rbac_router
from .search_source_connectors_routes import router as search_source_connectors_router
from .search_spaces_routes import router as search_spaces_router
from .slack_add_connector_route import router as slack_add_connector_router
from .surfsense_docs_routes import router as surfsense_docs_router
from .teams_add_connector_route import router as teams_add_connector_router
router = APIRouter()
@ -59,3 +60,4 @@ router.include_router(clickup_add_connector_router)
router.include_router(new_llm_config_router) # LLM configs with prompt configuration
router.include_router(logs_router)
router.include_router(circleback_webhook_router) # Circleback meeting webhooks
router.include_router(surfsense_docs_router) # Surfsense documentation for citations

View file

@ -0,0 +1,89 @@
"""
Routes for Surfsense documentation.
These endpoints support the citation system for Surfsense docs,
allowing the frontend to fetch document details when a user clicks
on a [citation:doc-XXX] link.
"""
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from app.db import (
SurfsenseDocsChunk,
SurfsenseDocsDocument,
User,
get_async_session,
)
from app.schemas.surfsense_docs import (
SurfsenseDocsChunkRead,
SurfsenseDocsDocumentWithChunksRead,
)
from app.users import current_active_user
router = APIRouter()
@router.get(
"/surfsense-docs/by-chunk/{chunk_id}",
response_model=SurfsenseDocsDocumentWithChunksRead,
)
async def get_surfsense_doc_by_chunk_id(
chunk_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Retrieves a Surfsense documentation document based on a chunk ID.
This endpoint is used by the frontend to resolve [citation:doc-XXX] links.
"""
try:
# Get the chunk
chunk_result = await session.execute(
select(SurfsenseDocsChunk).filter(SurfsenseDocsChunk.id == chunk_id)
)
chunk = chunk_result.scalars().first()
if not chunk:
raise HTTPException(
status_code=404,
detail=f"Surfsense docs chunk with id {chunk_id} not found",
)
# Get the associated document with all its chunks
document_result = await session.execute(
select(SurfsenseDocsDocument)
.options(selectinload(SurfsenseDocsDocument.chunks))
.filter(SurfsenseDocsDocument.id == chunk.document_id)
)
document = document_result.scalars().first()
if not document:
raise HTTPException(
status_code=404,
detail="Surfsense docs document not found",
)
# Sort chunks by ID
sorted_chunks = sorted(document.chunks, key=lambda x: x.id)
return SurfsenseDocsDocumentWithChunksRead(
id=document.id,
title=document.title,
source=document.source,
content=document.content,
chunks=[
SurfsenseDocsChunkRead(id=c.id, content=c.content)
for c in sorted_chunks
],
)
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=500,
detail=f"Failed to retrieve Surfsense documentation: {e!s}",
) from e

View file

@ -0,0 +1,27 @@
"""
Schemas for Surfsense documentation.
"""
from pydantic import BaseModel, ConfigDict
class SurfsenseDocsChunkRead(BaseModel):
"""Schema for a Surfsense docs chunk."""
id: int
content: str
model_config = ConfigDict(from_attributes=True)
class SurfsenseDocsDocumentWithChunksRead(BaseModel):
"""Schema for a Surfsense docs document with its chunks."""
id: int
title: str
source: str
content: str
chunks: list[SurfsenseDocsChunkRead]
model_config = ConfigDict(from_attributes=True)

View file

@ -0,0 +1,222 @@
"""
Surfsense documentation indexer.
Indexes MDX documentation files at startup.
"""
import hashlib
import logging
import re
from datetime import UTC, datetime
from pathlib import Path
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from app.config import config
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
logger = logging.getLogger(__name__)
# Path to docs relative to project root
DOCS_DIR = Path(__file__).resolve().parent.parent.parent.parent / "surfsense_web" / "content" / "docs"
def parse_mdx_frontmatter(content: str) -> tuple[str, str]:
"""
Parse MDX file to extract frontmatter title and content.
Args:
content: Raw MDX file content
Returns:
Tuple of (title, content_without_frontmatter)
"""
# Match frontmatter between --- markers
frontmatter_pattern = r"^---\s*\n(.*?)\n---\s*\n"
match = re.match(frontmatter_pattern, content, re.DOTALL)
if match:
frontmatter = match.group(1)
content_without_frontmatter = content[match.end():]
# Extract title from frontmatter
title_match = re.search(r"^title:\s*(.+)$", frontmatter, re.MULTILINE)
title = title_match.group(1).strip() if title_match else "Untitled"
# Remove quotes if present
title = title.strip("\"'")
return title, content_without_frontmatter.strip()
return "Untitled", content.strip()
def get_all_mdx_files() -> list[Path]:
"""
Get all MDX files from the docs directory.
Returns:
List of Path objects for each MDX file
"""
if not DOCS_DIR.exists():
logger.warning(f"Docs directory not found: {DOCS_DIR}")
return []
return list(DOCS_DIR.rglob("*.mdx"))
def generate_surfsense_docs_content_hash(content: str) -> str:
"""Generate SHA-256 hash for Surfsense docs content."""
return hashlib.sha256(content.encode("utf-8")).hexdigest()
def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]:
"""
Create chunks from Surfsense documentation content.
Args:
content: Document content to chunk
Returns:
List of SurfsenseDocsChunk objects with embeddings
"""
return [
SurfsenseDocsChunk(
content=chunk.text,
embedding=config.embedding_model_instance.embed(chunk.text),
)
for chunk in config.chunker_instance.chunk(content)
]
async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, int]:
"""
Index all Surfsense documentation files.
Args:
session: SQLAlchemy async session
Returns:
Tuple of (created, updated, skipped, deleted) counts
"""
created = 0
updated = 0
skipped = 0
deleted = 0
# Get all existing docs from database
existing_docs_result = await session.execute(
select(SurfsenseDocsDocument).options(selectinload(SurfsenseDocsDocument.chunks))
)
existing_docs = {doc.source: doc for doc in existing_docs_result.scalars().all()}
# Track which sources we've processed
processed_sources = set()
# Get all MDX files
mdx_files = get_all_mdx_files()
logger.info(f"Found {len(mdx_files)} MDX files to index")
for mdx_file in mdx_files:
try:
source = str(mdx_file.relative_to(DOCS_DIR))
processed_sources.add(source)
# Read file content
raw_content = mdx_file.read_text(encoding="utf-8")
title, content = parse_mdx_frontmatter(raw_content)
content_hash = generate_surfsense_docs_content_hash(raw_content)
if source in existing_docs:
existing_doc = existing_docs[source]
# Check if content changed
if existing_doc.content_hash == content_hash:
logger.debug(f"Skipping unchanged: {source}")
skipped += 1
continue
# Content changed - update document
logger.info(f"Updating changed document: {source}")
# Create new chunks
chunks = create_surfsense_docs_chunks(content)
# Update document fields
existing_doc.title = title
existing_doc.content = content
existing_doc.content_hash = content_hash
existing_doc.embedding = config.embedding_model_instance.embed(content)
existing_doc.chunks = chunks
existing_doc.updated_at = datetime.now(UTC)
updated += 1
else:
# New document - create it
logger.info(f"Creating new document: {source}")
chunks = create_surfsense_docs_chunks(content)
document = SurfsenseDocsDocument(
source=source,
title=title,
content=content,
content_hash=content_hash,
embedding=config.embedding_model_instance.embed(content),
chunks=chunks,
updated_at=datetime.now(UTC),
)
session.add(document)
created += 1
except Exception as e:
logger.error(f"Error processing {mdx_file}: {e}", exc_info=True)
continue
# Delete documents for removed files
for source, doc in existing_docs.items():
if source not in processed_sources:
logger.info(f"Deleting removed document: {source}")
await session.delete(doc)
deleted += 1
# Commit all changes
await session.commit()
logger.info(
f"Indexing complete: {created} created, {updated} updated, "
f"{skipped} skipped, {deleted} deleted"
)
return created, updated, skipped, deleted
async def seed_surfsense_docs() -> tuple[int, int, int, int]:
"""
Seed Surfsense documentation into the database.
This function indexes all MDX files from the docs directory.
It handles creating, updating, and deleting docs based on content changes.
Returns:
Tuple of (created, updated, skipped, deleted) counts
Returns (0, 0, 0, 0) if an error occurs
"""
logger.info("Starting Surfsense docs indexing...")
try:
async with async_session_maker() as session:
created, updated, skipped, deleted = await index_surfsense_docs(session)
logger.info(
f"Surfsense docs indexing complete: "
f"created={created}, updated={updated}, skipped={skipped}, deleted={deleted}"
)
return created, updated, skipped, deleted
except Exception as e:
logger.error(f"Failed to seed Surfsense docs: {e}", exc_info=True)
return 0, 0, 0, 0

View file

@ -0,0 +1,40 @@
#!/usr/bin/env python
"""
Seed Surfsense documentation into the database.
CLI wrapper for the seed_surfsense_docs function.
Can be run manually for debugging or re-indexing.
Usage:
python scripts/seed_surfsense_docs.py
"""
import asyncio
import sys
from pathlib import Path
# Add the parent directory to the path so we can import app modules
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
def main():
"""CLI entry point for seeding Surfsense docs."""
print("=" * 50)
print(" Surfsense Documentation Seeding")
print("=" * 50)
created, updated, skipped, deleted = asyncio.run(seed_surfsense_docs())
print()
print("Results:")
print(f" Created: {created}")
print(f" Updated: {updated}")
print(f" Skipped: {skipped}")
print(f" Deleted: {deleted}")
print("=" * 50)
if __name__ == "__main__":
main()

View file

@ -7,13 +7,15 @@ import { SourceDetailPanel } from "@/components/new-chat/source-detail-panel";
interface InlineCitationProps {
chunkId: number;
citationNumber: number;
isDocsChunk?: boolean;
}
/**
* Inline citation component for the new chat.
* Renders a clickable numbered badge that opens the SourceDetailPanel with document chunk details.
* Supports both regular knowledge base chunks and Surfsense documentation chunks.
*/
export const InlineCitation: FC<InlineCitationProps> = ({ chunkId, citationNumber }) => {
export const InlineCitation: FC<InlineCitationProps> = ({ chunkId, citationNumber, isDocsChunk = false }) => {
const [isOpen, setIsOpen] = useState(false);
return (
@ -21,10 +23,11 @@ export const InlineCitation: FC<InlineCitationProps> = ({ chunkId, citationNumbe
open={isOpen}
onOpenChange={setIsOpen}
chunkId={chunkId}
sourceType=""
title="Source"
sourceType={isDocsChunk ? "SURFSENSE_DOCS" : ""}
title={isDocsChunk ? "Surfsense Documentation" : "Source"}
description=""
url=""
isDocsChunk={isDocsChunk}
>
<span
onClick={() => setIsOpen(true)}

View file

@ -15,12 +15,13 @@ import { InlineCitation } from "@/components/assistant-ui/inline-citation";
import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
import { cn } from "@/lib/utils";
// Citation pattern: [citation:CHUNK_ID]
const CITATION_REGEX = /\[citation:(\d+)\]/g;
// Citation pattern: [citation:CHUNK_ID] or [citation:doc-CHUNK_ID]
const CITATION_REGEX = /\[citation:(doc-)?(\d+)\]/g;
// Track chunk IDs to citation numbers mapping for consistent numbering
// This map is reset when a new message starts rendering
let chunkIdToCitationNumber: Map<number, number> = new Map();
// Uses string keys to differentiate between doc and regular chunks (e.g., "doc-123" vs "123")
let chunkIdToCitationNumber: Map<string, number> = new Map();
let nextCitationNumber = 1;
/**
@ -33,16 +34,20 @@ export function resetCitationCounter() {
/**
* Gets or assigns a citation number for a chunk ID
* Uses string key to differentiate between doc and regular chunks
*/
function getCitationNumber(chunkId: number): number {
if (!chunkIdToCitationNumber.has(chunkId)) {
chunkIdToCitationNumber.set(chunkId, nextCitationNumber++);
function getCitationNumber(chunkId: number, isDocsChunk: boolean): number {
const key = isDocsChunk ? `doc-${chunkId}` : String(chunkId);
const existingNumber = chunkIdToCitationNumber.get(key);
if (existingNumber === undefined) {
chunkIdToCitationNumber.set(key, nextCitationNumber++);
}
return chunkIdToCitationNumber.get(chunkId)!;
return chunkIdToCitationNumber.get(key)!;
}
/**
* Parses text and replaces [citation:XXX] patterns with InlineCitation components
* Supports both regular chunks [citation:123] and docs chunks [citation:doc-123]
*/
function parseTextWithCitations(text: string): ReactNode[] {
const parts: ReactNode[] = [];
@ -59,14 +64,16 @@ function parseTextWithCitations(text: string): ReactNode[] {
parts.push(text.substring(lastIndex, match.index));
}
// Add the citation component
const chunkId = Number.parseInt(match[1], 10);
const citationNumber = getCitationNumber(chunkId);
// Check if this is a docs chunk (has "doc-" prefix)
const isDocsChunk = match[1] === "doc-";
const chunkId = Number.parseInt(match[2], 10);
const citationNumber = getCitationNumber(chunkId, isDocsChunk);
parts.push(
<InlineCitation
key={`citation-${chunkId}-${instanceIndex}`}
key={`citation-${isDocsChunk ? "doc-" : ""}${chunkId}-${instanceIndex}`}
chunkId={chunkId}
citationNumber={citationNumber}
isDocsChunk={isDocsChunk}
/>
);

View file

@ -21,10 +21,16 @@ import { Badge } from "@/components/ui/badge";
import { Button } from "@/components/ui/button";
import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible";
import { ScrollArea } from "@/components/ui/scroll-area";
import type {
GetDocumentByChunkResponse,
GetSurfsenseDocsByChunkResponse,
} from "@/contracts/types/document.types";
import { documentsApiService } from "@/lib/apis/documents-api.service";
import { cacheKeys } from "@/lib/query-client/cache-keys";
import { cn } from "@/lib/utils";
type DocumentData = GetDocumentByChunkResponse | GetSurfsenseDocsByChunkResponse;
interface SourceDetailPanelProps {
open: boolean;
onOpenChange: (open: boolean) => void;
@ -34,6 +40,7 @@ interface SourceDetailPanelProps {
description?: string;
url?: string;
children?: ReactNode;
isDocsChunk?: boolean;
}
const formatDocumentType = (type: string) => {
@ -114,6 +121,7 @@ export function SourceDetailPanel({
description,
url,
children,
isDocsChunk = false,
}: SourceDetailPanelProps) {
const scrollAreaRef = useRef<HTMLDivElement>(null);
const hasScrolledRef = useRef(false); // Use ref to avoid stale closures
@ -131,9 +139,16 @@ export function SourceDetailPanel({
data: documentData,
isLoading: isDocumentByChunkFetching,
error: documentByChunkFetchingError,
} = useQuery({
queryKey: cacheKeys.documents.byChunk(chunkId.toString()),
queryFn: () => documentsApiService.getDocumentByChunk({ chunk_id: chunkId }),
} = useQuery<DocumentData>({
queryKey: isDocsChunk
? cacheKeys.documents.byChunk(`doc-${chunkId}`)
: cacheKeys.documents.byChunk(chunkId.toString()),
queryFn: async () => {
if (isDocsChunk) {
return documentsApiService.getSurfsenseDocByChunk(chunkId);
}
return documentsApiService.getDocumentByChunk({ chunk_id: chunkId });
},
enabled: !!chunkId && open,
staleTime: 5 * 60 * 1000,
});
@ -325,7 +340,7 @@ export function SourceDetailPanel({
{documentData?.title || title || "Source Document"}
</h2>
<p className="text-sm text-muted-foreground mt-0.5">
{documentData
{documentData && "document_type" in documentData
? formatDocumentType(documentData.document_type)
: sourceType && formatDocumentType(sourceType)}
{documentData?.chunks && (
@ -491,7 +506,8 @@ export function SourceDetailPanel({
<ScrollArea className="flex-1" ref={scrollAreaRef}>
<div className="p-6 lg:p-8 max-w-4xl mx-auto space-y-6">
{/* Document Metadata */}
{documentData.document_metadata &&
{"document_metadata" in documentData &&
documentData.document_metadata &&
Object.keys(documentData.document_metadata).length > 0 && (
<motion.div
initial={{ opacity: 0, y: 10 }}

View file

@ -59,6 +59,26 @@ export const documentWithChunks = document.extend({
),
});
/**
* Surfsense documentation schemas
* Follows the same pattern as document/documentWithChunks
*/
export const surfsenseDocsChunk = z.object({
id: z.number(),
content: z.string(),
});
export const surfsenseDocsDocument = z.object({
id: z.number(),
title: z.string(),
source: z.string(),
content: z.string(),
});
export const surfsenseDocsDocumentWithChunks = surfsenseDocsDocument.extend({
chunks: z.array(surfsenseDocsChunk),
});
/**
* Get documents
*/
@ -154,6 +174,15 @@ export const getDocumentByChunkRequest = z.object({
export const getDocumentByChunkResponse = documentWithChunks;
/**
* Get Surfsense docs by chunk
*/
export const getSurfsenseDocsByChunkRequest = z.object({
chunk_id: z.number(),
});
export const getSurfsenseDocsByChunkResponse = surfsenseDocsDocumentWithChunks;
/**
* Update document
*/
@ -193,3 +222,8 @@ export type UpdateDocumentResponse = z.infer<typeof updateDocumentResponse>;
export type DeleteDocumentRequest = z.infer<typeof deleteDocumentRequest>;
export type DeleteDocumentResponse = z.infer<typeof deleteDocumentResponse>;
export type DocumentTypeEnum = z.infer<typeof documentTypeEnum>;
export type SurfsenseDocsChunk = z.infer<typeof surfsenseDocsChunk>;
export type SurfsenseDocsDocument = z.infer<typeof surfsenseDocsDocument>;
export type SurfsenseDocsDocumentWithChunks = z.infer<typeof surfsenseDocsDocumentWithChunks>;
export type GetSurfsenseDocsByChunkRequest = z.infer<typeof getSurfsenseDocsByChunkRequest>;
export type GetSurfsenseDocsByChunkResponse = z.infer<typeof getSurfsenseDocsByChunkResponse>;

View file

@ -17,6 +17,7 @@ import {
getDocumentsResponse,
getDocumentTypeCountsRequest,
getDocumentTypeCountsResponse,
getSurfsenseDocsByChunkResponse,
type SearchDocumentsRequest,
searchDocumentsRequest,
searchDocumentsResponse,
@ -209,6 +210,17 @@ class DocumentsApiService {
);
};
/**
* Get Surfsense documentation by chunk ID
* Used for resolving [citation:doc-XXX] citations
*/
getSurfsenseDocByChunk = async (chunkId: number) => {
return baseApiService.get(
`/api/v1/surfsense-docs/by-chunk/${chunkId}`,
getSurfsenseDocsByChunkResponse
);
};
/**
* Update a document
*/