From 961d74165674062b8d9eab7d5e1c311d9d50fbeb Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 15:26:55 +0200 Subject: [PATCH 01/20] feat: add SurfsenseDocsDocument model --- surfsense_backend/app/db.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index d54254f9c..abca893fb 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -428,6 +428,28 @@ class Chunk(BaseModel, TimestampMixin): document = relationship("Document", back_populates="chunks") +class SurfsenseDocsDocument(BaseModel, TimestampMixin): + """ + Surfsense documentation storage. + Indexed at migration time from MDX files. + """ + + __tablename__ = "surfsense_docs_documents" + + source = Column(String, nullable=False, unique=True, index=True) # File path: "connectors/slack.mdx" + title = Column(String, nullable=False) + content = Column(Text, nullable=False) + content_hash = Column(String, nullable=False, index=True) # For detecting changes + embedding = Column(Vector(config.embedding_model_instance.dimension)) + updated_at = Column(TIMESTAMP(timezone=True), nullable=True, index=True) + + chunks = relationship( + "SurfsenseDocsChunk", + back_populates="document", + cascade="all, delete-orphan", + ) + + class Podcast(BaseModel, TimestampMixin): """Podcast model for storing generated podcasts.""" From ba404cc1516dc90176637e1e01396b4dfeaf4856 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 15:28:36 +0200 Subject: [PATCH 02/20] feat: add SurfsenseDocsChunk model with relationship --- surfsense_backend/app/db.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index abca893fb..006d73358 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -450,6 +450,22 @@ class SurfsenseDocsDocument(BaseModel, TimestampMixin): ) +class SurfsenseDocsChunk(BaseModel, TimestampMixin): + """Chunk storage for Surfsense documentation.""" + + __tablename__ = "surfsense_docs_chunks" + + content = Column(Text, nullable=False) + embedding = Column(Vector(config.embedding_model_instance.dimension)) + + document_id = Column( + Integer, + ForeignKey("surfsense_docs_documents.id", ondelete="CASCADE"), + nullable=False, + ) + document = relationship("SurfsenseDocsDocument", back_populates="chunks") + + class Podcast(BaseModel, TimestampMixin): """Podcast model for storing generated podcasts.""" From fff851ae3fe07d6b2ac388296c4705083dc131bf Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 15:42:30 +0200 Subject: [PATCH 03/20] feat: create indexer module with MDX parsing --- .../app/tasks/surfsense_docs_indexer.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 surfsense_backend/app/tasks/surfsense_docs_indexer.py diff --git a/surfsense_backend/app/tasks/surfsense_docs_indexer.py b/surfsense_backend/app/tasks/surfsense_docs_indexer.py new file mode 100644 index 000000000..c5e846635 --- /dev/null +++ b/surfsense_backend/app/tasks/surfsense_docs_indexer.py @@ -0,0 +1,64 @@ +""" +Surfsense documentation indexer. +Indexes MDX documentation files at migration time. 
+""" + +import hashlib +import logging +import re +from pathlib import Path + +logger = logging.getLogger(__name__) + +# Path to docs relative to project root +DOCS_DIR = Path(__file__).resolve().parent.parent.parent.parent / "surfsense_web" / "content" / "docs" + + +def parse_mdx_frontmatter(content: str) -> tuple[str, str]: + """ + Parse MDX file to extract frontmatter title and content. + + Args: + content: Raw MDX file content + + Returns: + Tuple of (title, content_without_frontmatter) + """ + # Match frontmatter between --- markers + frontmatter_pattern = r"^---\s*\n(.*?)\n---\s*\n" + match = re.match(frontmatter_pattern, content, re.DOTALL) + + if match: + frontmatter = match.group(1) + content_without_frontmatter = content[match.end():] + + # Extract title from frontmatter + title_match = re.search(r"^title:\s*(.+)$", frontmatter, re.MULTILINE) + title = title_match.group(1).strip() if title_match else "Untitled" + + # Remove quotes if present + title = title.strip("\"'") + + return title, content_without_frontmatter.strip() + + return "Untitled", content.strip() + + +def get_all_mdx_files() -> list[Path]: + """ + Get all MDX files from the docs directory. + + Returns: + List of Path objects for each MDX file + """ + if not DOCS_DIR.exists(): + logger.warning(f"Docs directory not found: {DOCS_DIR}") + return [] + + return list(DOCS_DIR.rglob("*.mdx")) + + +def generate_surfsense_docs_content_hash(content: str) -> str: + """Generate SHA-256 hash for Surfsense docs content.""" + return hashlib.sha256(content.encode("utf-8")).hexdigest() + From 2e83ed8dcd7affb5d4e8570fdf48ca12031d4335 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 15:45:17 +0200 Subject: [PATCH 04/20] feat: add chunking and embedding logic to indexer --- .../app/tasks/surfsense_docs_indexer.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/surfsense_backend/app/tasks/surfsense_docs_indexer.py b/surfsense_backend/app/tasks/surfsense_docs_indexer.py index c5e846635..6b4c4c91c 100644 --- a/surfsense_backend/app/tasks/surfsense_docs_indexer.py +++ b/surfsense_backend/app/tasks/surfsense_docs_indexer.py @@ -8,6 +8,9 @@ import logging import re from pathlib import Path +from app.config import config +from app.db import SurfsenseDocsChunk + logger = logging.getLogger(__name__) # Path to docs relative to project root @@ -62,3 +65,22 @@ def generate_surfsense_docs_content_hash(content: str) -> str: """Generate SHA-256 hash for Surfsense docs content.""" return hashlib.sha256(content.encode("utf-8")).hexdigest() + +def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]: + """ + Create chunks from Surfsense documentation content. 
+ + Args: + content: Document content to chunk + + Returns: + List of SurfsenseDocsChunk objects with embeddings + """ + return [ + SurfsenseDocsChunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) + for chunk in config.chunker_instance.chunk(content) + ] + From 105f4c5c9d8eaaeb7a33636441cb8aa31b41dd43 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 15:58:27 +0200 Subject: [PATCH 05/20] feat: add create/update/skip/delete logic to indexer --- .../app/tasks/surfsense_docs_indexer.py | 109 +++++++++++++++++- 1 file changed, 108 insertions(+), 1 deletion(-) diff --git a/surfsense_backend/app/tasks/surfsense_docs_indexer.py b/surfsense_backend/app/tasks/surfsense_docs_indexer.py index 6b4c4c91c..51a1c0938 100644 --- a/surfsense_backend/app/tasks/surfsense_docs_indexer.py +++ b/surfsense_backend/app/tasks/surfsense_docs_indexer.py @@ -6,10 +6,14 @@ Indexes MDX documentation files at migration time. import hashlib import logging import re +from datetime import UTC, datetime from pathlib import Path +from sqlalchemy import select +from sqlalchemy.orm import Session, selectinload + from app.config import config -from app.db import SurfsenseDocsChunk +from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument logger = logging.getLogger(__name__) @@ -84,3 +88,106 @@ def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]: for chunk in config.chunker_instance.chunk(content) ] + +def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]: + """ + Index all Surfsense documentation files. + + Args: + session: SQLAlchemy sync session + + Returns: + Tuple of (created, updated, skipped, deleted) counts + """ + created = 0 + updated = 0 + skipped = 0 + deleted = 0 + + # Get all existing docs from database + existing_docs_result = session.execute( + select(SurfsenseDocsDocument).options(selectinload(SurfsenseDocsDocument.chunks)) + ) + existing_docs = {doc.source: doc for doc in existing_docs_result.scalars().all()} + + # Track which sources we've processed + processed_sources = set() + + # Get all MDX files + mdx_files = get_all_mdx_files() + logger.info(f"Found {len(mdx_files)} MDX files to index") + + for mdx_file in mdx_files: + try: + source = str(mdx_file.relative_to(DOCS_DIR)) + processed_sources.add(source) + + # Read file content + raw_content = mdx_file.read_text(encoding="utf-8") + title, content = parse_mdx_frontmatter(raw_content) + content_hash = generate_surfsense_docs_content_hash(raw_content) + + if source in existing_docs: + existing_doc = existing_docs[source] + + # Check if content changed + if existing_doc.content_hash == content_hash: + logger.debug(f"Skipping unchanged: {source}") + skipped += 1 + continue + + # Content changed - update document + logger.info(f"Updating changed document: {source}") + + # Create new chunks + chunks = create_surfsense_docs_chunks(content) + + # Update document fields + existing_doc.title = title + existing_doc.content = content + existing_doc.content_hash = content_hash + existing_doc.embedding = config.embedding_model_instance.embed(content) + existing_doc.chunks = chunks + existing_doc.updated_at = datetime.now(UTC) + + updated += 1 + else: + # New document - create it + logger.info(f"Creating new document: {source}") + + chunks = create_surfsense_docs_chunks(content) + + document = SurfsenseDocsDocument( + source=source, + title=title, + content=content, + content_hash=content_hash, + embedding=config.embedding_model_instance.embed(content), + chunks=chunks, + 
updated_at=datetime.now(UTC), + ) + + session.add(document) + created += 1 + + except Exception as e: + logger.error(f"Error processing {mdx_file}: {e}", exc_info=True) + continue + + # Delete documents for removed files + for source, doc in existing_docs.items(): + if source not in processed_sources: + logger.info(f"Deleting removed document: {source}") + session.delete(doc) + deleted += 1 + + # Commit all changes + session.commit() + + logger.info( + f"Indexing complete: {created} created, {updated} updated, " + f"{skipped} skipped, {deleted} deleted" + ) + + return created, updated, skipped, deleted + From f30f39b5e960548c2ff267a30cdc9a9e56d13d0b Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 16:39:39 +0200 Subject: [PATCH 06/20] feat: create migration for Surfsense docs tables --- .../versions/60_add_surfsense_docs_tables.py | 165 ++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 surfsense_backend/alembic/versions/60_add_surfsense_docs_tables.py diff --git a/surfsense_backend/alembic/versions/60_add_surfsense_docs_tables.py b/surfsense_backend/alembic/versions/60_add_surfsense_docs_tables.py new file mode 100644 index 000000000..7e5aa9437 --- /dev/null +++ b/surfsense_backend/alembic/versions/60_add_surfsense_docs_tables.py @@ -0,0 +1,165 @@ +"""Add Surfsense docs tables for global documentation storage + +Revision ID: 60 +Revises: 59 +""" + +from collections.abc import Sequence + +from alembic import op + +from app.config import config + +# revision identifiers, used by Alembic. +revision: str = "60" +down_revision: str | None = "59" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +# Get embedding dimension from config +EMBEDDING_DIM = config.embedding_model_instance.dimension + + +def upgrade() -> None: + """Create surfsense_docs_documents and surfsense_docs_chunks tables.""" + + # Create surfsense_docs_documents table + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_name = 'surfsense_docs_documents' + ) THEN + CREATE TABLE surfsense_docs_documents ( + id SERIAL PRIMARY KEY, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + source VARCHAR NOT NULL UNIQUE, + title VARCHAR NOT NULL, + content TEXT NOT NULL, + content_hash VARCHAR NOT NULL, + embedding vector({EMBEDDING_DIM}), + updated_at TIMESTAMP WITH TIME ZONE + ); + END IF; + END$$; + """ + ) + + # Create indexes for surfsense_docs_documents + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'surfsense_docs_documents' AND indexname = 'ix_surfsense_docs_documents_source' + ) THEN + CREATE INDEX ix_surfsense_docs_documents_source ON surfsense_docs_documents(source); + END IF; + + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'surfsense_docs_documents' AND indexname = 'ix_surfsense_docs_documents_content_hash' + ) THEN + CREATE INDEX ix_surfsense_docs_documents_content_hash ON surfsense_docs_documents(content_hash); + END IF; + + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'surfsense_docs_documents' AND indexname = 'ix_surfsense_docs_documents_updated_at' + ) THEN + CREATE INDEX ix_surfsense_docs_documents_updated_at ON surfsense_docs_documents(updated_at); + END IF; + END$$; + """ + ) + + # Create surfsense_docs_chunks table + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_name = 'surfsense_docs_chunks' + ) THEN + CREATE TABLE 
surfsense_docs_chunks ( + id SERIAL PRIMARY KEY, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + content TEXT NOT NULL, + embedding vector({EMBEDDING_DIM}), + document_id INTEGER NOT NULL REFERENCES surfsense_docs_documents(id) ON DELETE CASCADE + ); + END IF; + END$$; + """ + ) + + # Create indexes for surfsense_docs_chunks + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'surfsense_docs_chunks' AND indexname = 'ix_surfsense_docs_chunks_document_id' + ) THEN + CREATE INDEX ix_surfsense_docs_chunks_document_id ON surfsense_docs_chunks(document_id); + END IF; + END$$; + """ + ) + + # Create vector indexes for similarity search + op.execute( + """ + CREATE INDEX IF NOT EXISTS surfsense_docs_documents_vector_index + ON surfsense_docs_documents USING hnsw (embedding public.vector_cosine_ops); + """ + ) + + op.execute( + """ + CREATE INDEX IF NOT EXISTS surfsense_docs_chunks_vector_index + ON surfsense_docs_chunks USING hnsw (embedding public.vector_cosine_ops); + """ + ) + + # Create full-text search indexes (same pattern as documents/chunks tables) + op.execute( + """ + CREATE INDEX IF NOT EXISTS surfsense_docs_documents_search_index + ON surfsense_docs_documents USING gin (to_tsvector('english', content)); + """ + ) + + op.execute( + """ + CREATE INDEX IF NOT EXISTS surfsense_docs_chunks_search_index + ON surfsense_docs_chunks USING gin (to_tsvector('english', content)); + """ + ) + + +def downgrade() -> None: + """Remove surfsense docs tables.""" + # Drop full-text search indexes + op.execute("DROP INDEX IF EXISTS surfsense_docs_chunks_search_index") + op.execute("DROP INDEX IF EXISTS surfsense_docs_documents_search_index") + + # Drop vector indexes + op.execute("DROP INDEX IF EXISTS surfsense_docs_chunks_vector_index") + op.execute("DROP INDEX IF EXISTS surfsense_docs_documents_vector_index") + + # Drop regular indexes + op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_chunks_document_id") + op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_documents_updated_at") + op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_documents_content_hash") + op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_documents_source") + + # Drop tables (chunks first due to FK) + op.execute("DROP TABLE IF EXISTS surfsense_docs_chunks") + op.execute("DROP TABLE IF EXISTS surfsense_docs_documents") + From ec145431f2ad1455585d32e1942d51fbb9fb9022 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 16:49:33 +0200 Subject: [PATCH 07/20] feat: add seeding script for Surfsense docs (run after migrations) --- .../scripts/seed_surfsense_docs.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 surfsense_backend/scripts/seed_surfsense_docs.py diff --git a/surfsense_backend/scripts/seed_surfsense_docs.py b/surfsense_backend/scripts/seed_surfsense_docs.py new file mode 100644 index 000000000..2e9eee649 --- /dev/null +++ b/surfsense_backend/scripts/seed_surfsense_docs.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +""" +Seed Surfsense documentation into the database. +Run this script after migrations to index MDX documentation files. 
+ +Usage: + python scripts/seed_surfsense_docs.py +""" + +import sys +from pathlib import Path + +# Add the parent directory to the path so we can import app modules +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from sqlalchemy import create_engine +from sqlalchemy.orm import Session + +from app.config import config +from app.tasks.surfsense_docs_indexer import index_surfsense_docs + + +def main(): + """Main entry point for seeding Surfsense docs.""" + print("Starting Surfsense docs seeding...") + + # Create sync engine from database URL + # Convert async URL to sync if needed + database_url = config.DATABASE_URL + if database_url.startswith("postgresql+asyncpg://"): + database_url = database_url.replace("postgresql+asyncpg://", "postgresql://") + + engine = create_engine(database_url) + + with Session(engine) as session: + created, updated, skipped, deleted = index_surfsense_docs(session) + + print(f"\nSurfsense docs seeding complete:") + print(f" Created: {created}") + print(f" Updated: {updated}") + print(f" Skipped: {skipped}") + print(f" Deleted: {deleted}") + + +if __name__ == "__main__": + main() + From 6f672361432ccccb3a9ba40f7ac3a1e31a76ed0a Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 16:58:09 +0200 Subject: [PATCH 08/20] feat: add docs seeding function to all-in-one entrypoint --- scripts/docker/entrypoint-allinone.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/scripts/docker/entrypoint-allinone.sh b/scripts/docker/entrypoint-allinone.sh index 8248968ab..ab21b2658 100644 --- a/scripts/docker/entrypoint-allinone.sh +++ b/scripts/docker/entrypoint-allinone.sh @@ -145,9 +145,29 @@ run_migrations() { echo "✅ Database migrations complete" } +# ================================================ +# Seed Surfsense documentation +# ================================================ +seed_surfsense_docs() { + echo "📚 Seeding Surfsense documentation..." + + # Start PostgreSQL temporarily for seeding + su - postgres -c "/usr/lib/postgresql/14/bin/pg_ctl -D /data/postgres -l /tmp/postgres_seed.log start" + sleep 5 + + cd /app/backend + python scripts/seed_surfsense_docs.py || echo "⚠️ Docs seeding may have already been done" + + # Stop PostgreSQL + su - postgres -c "/usr/lib/postgresql/14/bin/pg_ctl -D /data/postgres stop" + + echo "✅ Surfsense documentation seeded" +} + # Run migrations on first start or when explicitly requested if [ ! -f /data/.migrations_run ] || [ "${FORCE_MIGRATIONS:-false}" = "true" ]; then run_migrations + seed_surfsense_docs touch /data/.migrations_run fi From 4aa686480e9ce5db150353f0ee466c8ba3cf80f7 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 17:04:58 +0200 Subject: [PATCH 09/20] refactor: decouple docs seeding from migrations with separate flags --- scripts/docker/entrypoint-allinone.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/docker/entrypoint-allinone.sh b/scripts/docker/entrypoint-allinone.sh index ab21b2658..0888facf1 100644 --- a/scripts/docker/entrypoint-allinone.sh +++ b/scripts/docker/entrypoint-allinone.sh @@ -167,10 +167,15 @@ seed_surfsense_docs() { # Run migrations on first start or when explicitly requested if [ ! -f /data/.migrations_run ] || [ "${FORCE_MIGRATIONS:-false}" = "true" ]; then run_migrations - seed_surfsense_docs touch /data/.migrations_run fi +# Seed docs on first start or when explicitly requested +if [ ! 
-f /data/.docs_seeded ] || [ "${FORCE_SEED_DOCS:-false}" = "true" ]; then
+    seed_surfsense_docs
+    touch /data/.docs_seeded
+fi
+
 # ================================================
 # Environment Variables Info
 # ================================================

From 1be9de9c240415e12825a6d33e75d71144fcc5eb Mon Sep 17 00:00:00 2001
From: CREDO23
Date: Fri, 9 Jan 2026 17:52:31 +0200
Subject: [PATCH 10/20] feat: add search_surfsense_docs tool with vector search

---
 .../new_chat/tools/search_surfsense_docs.py   | 160 ++++++++++++++++++
 1 file changed, 160 insertions(+)
 create mode 100644 surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py

diff --git a/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py
new file mode 100644
index 000000000..21f3942ab
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py
@@ -0,0 +1,160 @@
+"""
+Surfsense documentation search tool.
+
+This tool allows the agent to search the pre-indexed Surfsense documentation
+to help users with questions about how to use the application.
+
+The documentation is indexed at deployment time from MDX files and stored
+in dedicated tables (surfsense_docs_documents, surfsense_docs_chunks).
+"""
+
+import json
+
+from langchain_core.tools import tool
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.config import config
+from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument
+
+
+def format_surfsense_docs_results(results: list[tuple]) -> str:
+    """
+    Format search results into XML structure for the LLM context.
+
+    Uses the same XML structure as format_documents_for_context from knowledge_base.py
+    but with 'doc-' prefix on chunk IDs. This allows:
+    - LLM to use consistent [citation:doc-XXX] format
+    - Frontend to detect 'doc-' prefix and route to surfsense docs endpoint
+
+    Args:
+        results: List of (chunk, document) tuples from the database query
+
+    Returns:
+        Formatted XML string with documentation content and citation-ready chunks
+    """
+    if not results:
+        return "No relevant Surfsense documentation found for your query."
+
+    # Group chunks by document
+    grouped: dict[int, dict] = {}
+    for chunk, doc in results:
+        if doc.id not in grouped:
+            grouped[doc.id] = {
+                "document_id": f"doc-{doc.id}",
+                "document_type": "SURFSENSE_DOCS",
+                "title": doc.title,
+                "url": doc.source,
+                "metadata": {"source": doc.source},
+                "chunks": [],
+            }
+        grouped[doc.id]["chunks"].append({
+            "chunk_id": f"doc-{chunk.id}",
+            "content": chunk.content,
+        })
+
+    # Render XML matching format_documents_for_context structure
+    parts: list[str] = []
+    for g in grouped.values():
+        metadata_json = json.dumps(g["metadata"], ensure_ascii=False)
+
+        parts.append("<document>")
+        parts.append("    <metadata>")
+        parts.append(f"        <document_id>{g['document_id']}</document_id>")
+        parts.append(f"        <document_type>{g['document_type']}</document_type>")
+        parts.append(f"        <title><![CDATA[{g['title']}]]></title>")
+        parts.append(f"        <url><![CDATA[{g['url']}]]></url>")
+        parts.append(f"        <source_metadata><![CDATA[{metadata_json}]]></source_metadata>")
+        parts.append("    </metadata>")
+        parts.append("    <content>")
+        parts.append("        <chunks>")
+
+        for ch in g["chunks"]:
+            parts.append(f'            <chunk id="{ch["chunk_id"]}"><![CDATA[{ch["content"]}]]></chunk>')
+
+        parts.append("        </chunks>")
+        parts.append("    </content>")
+        parts.append("</document>")
+
+    return "\n".join(parts).strip()
+
+
+async def search_surfsense_docs_async(
+    query: str,
+    db_session: AsyncSession,
+    top_k: int = 5,
+) -> str:
+    """
+    Search Surfsense documentation using vector similarity.
+ + Args: + query: The search query about Surfsense usage + db_session: Database session for executing queries + top_k: Number of results to return + + Returns: + Formatted string with relevant documentation content + """ + # Get embedding for the query + query_embedding = config.embedding_model_instance.embed(query) + + # Vector similarity search on chunks, joining with documents + stmt = ( + select(SurfsenseDocsChunk, SurfsenseDocsDocument) + .join( + SurfsenseDocsDocument, + SurfsenseDocsChunk.document_id == SurfsenseDocsDocument.id, + ) + .order_by(SurfsenseDocsChunk.embedding.op("<=>")(query_embedding)) + .limit(top_k) + ) + + result = await db_session.execute(stmt) + rows = result.all() + + return format_surfsense_docs_results(rows) + + +def create_search_surfsense_docs_tool(db_session: AsyncSession): + """ + Factory function to create the search_surfsense_docs tool. + + Args: + db_session: Database session for executing queries + + Returns: + A configured tool function for searching Surfsense documentation + """ + + @tool + async def search_surfsense_docs(query: str, top_k: int = 5) -> str: + """ + Search Surfsense documentation for help with using the application. + + Use this tool when the user asks questions about: + - How to use Surfsense features + - Installation and setup instructions + - Configuration options and settings + - Troubleshooting common issues + - Available connectors and integrations + - Browser extension usage + - API documentation + + This searches the official Surfsense documentation that was indexed + at deployment time. It does NOT search the user's personal knowledge base. + + Args: + query: The search query about Surfsense usage or features + top_k: Number of documentation chunks to retrieve (default: 5) + + Returns: + Relevant documentation content formatted with chunk IDs for citations + """ + return await search_surfsense_docs_async( + query=query, + db_session=db_session, + top_k=top_k, + ) + + return search_surfsense_docs + From c4d214baa4c90a90dfba3da6627131e32a4e97ec Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 17:58:14 +0200 Subject: [PATCH 11/20] feat: register search_surfsense_docs tool in agent toolkit --- .../app/agents/new_chat/tools/__init__.py | 3 +++ .../app/agents/new_chat/tools/registry.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/surfsense_backend/app/agents/new_chat/tools/__init__.py b/surfsense_backend/app/agents/new_chat/tools/__init__.py index b89988327..b531d9b4d 100644 --- a/surfsense_backend/app/agents/new_chat/tools/__init__.py +++ b/surfsense_backend/app/agents/new_chat/tools/__init__.py @@ -6,6 +6,7 @@ To add a new tool, see the documentation in registry.py. 
Available tools: - search_knowledge_base: Search the user's personal knowledge base +- search_surfsense_docs: Search Surfsense documentation for usage help - generate_podcast: Generate audio podcasts from content - link_preview: Fetch rich previews for URLs - display_image: Display images in chat @@ -31,6 +32,7 @@ from .registry import ( get_tool_by_name, ) from .scrape_webpage import create_scrape_webpage_tool +from .search_surfsense_docs import create_search_surfsense_docs_tool __all__ = [ # Registry @@ -43,6 +45,7 @@ __all__ = [ "create_link_preview_tool", "create_scrape_webpage_tool", "create_search_knowledge_base_tool", + "create_search_surfsense_docs_tool", # Knowledge base utilities "format_documents_for_context", "get_all_tool_names", diff --git a/surfsense_backend/app/agents/new_chat/tools/registry.py b/surfsense_backend/app/agents/new_chat/tools/registry.py index bc305aecc..c7439bf8f 100644 --- a/surfsense_backend/app/agents/new_chat/tools/registry.py +++ b/surfsense_backend/app/agents/new_chat/tools/registry.py @@ -48,6 +48,7 @@ from .knowledge_base import create_search_knowledge_base_tool from .link_preview import create_link_preview_tool from .podcast import create_generate_podcast_tool from .scrape_webpage import create_scrape_webpage_tool +from .search_surfsense_docs import create_search_surfsense_docs_tool # ============================================================================= # Tool Definition @@ -126,6 +127,15 @@ BUILTIN_TOOLS: list[ToolDefinition] = [ requires=[], # firecrawl_api_key is optional ), # Note: write_todos is now provided by TodoListMiddleware from deepagents + # Surfsense documentation search tool + ToolDefinition( + name="search_surfsense_docs", + description="Search Surfsense documentation for help with using the application", + factory=lambda deps: create_search_surfsense_docs_tool( + db_session=deps["db_session"], + ), + requires=["db_session"], + ), # ========================================================================= # ADD YOUR CUSTOM TOOLS BELOW # ========================================================================= From 3539b2a83da6be0997c7c63c74b862228fd41291 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 18:07:14 +0200 Subject: [PATCH 12/20] feat: add surfsense docs citation endpoint --- surfsense_backend/app/routes/__init__.py | 2 + .../app/routes/surfsense_docs_routes.py | 89 +++++++++++++++++++ .../app/schemas/surfsense_docs.py | 27 ++++++ 3 files changed, 118 insertions(+) create mode 100644 surfsense_backend/app/routes/surfsense_docs_routes.py create mode 100644 surfsense_backend/app/schemas/surfsense_docs.py diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py index b4e94c732..4b6df350a 100644 --- a/surfsense_backend/app/routes/__init__.py +++ b/surfsense_backend/app/routes/__init__.py @@ -31,6 +31,7 @@ from .rbac_routes import router as rbac_router from .search_source_connectors_routes import router as search_source_connectors_router from .search_spaces_routes import router as search_spaces_router from .slack_add_connector_route import router as slack_add_connector_router +from .surfsense_docs_routes import router as surfsense_docs_router from .teams_add_connector_route import router as teams_add_connector_router router = APIRouter() @@ -59,3 +60,4 @@ router.include_router(clickup_add_connector_router) router.include_router(new_llm_config_router) # LLM configs with prompt configuration router.include_router(logs_router) 
router.include_router(circleback_webhook_router) # Circleback meeting webhooks +router.include_router(surfsense_docs_router) # Surfsense documentation for citations diff --git a/surfsense_backend/app/routes/surfsense_docs_routes.py b/surfsense_backend/app/routes/surfsense_docs_routes.py new file mode 100644 index 000000000..a2de65568 --- /dev/null +++ b/surfsense_backend/app/routes/surfsense_docs_routes.py @@ -0,0 +1,89 @@ +""" +Routes for Surfsense documentation. + +These endpoints support the citation system for Surfsense docs, +allowing the frontend to fetch document details when a user clicks +on a [citation:doc-XXX] link. +""" + +from fastapi import APIRouter, Depends, HTTPException +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from app.db import ( + SurfsenseDocsChunk, + SurfsenseDocsDocument, + User, + get_async_session, +) +from app.schemas.surfsense_docs import ( + SurfsenseDocsChunkRead, + SurfsenseDocsDocumentWithChunksRead, +) +from app.users import current_active_user + +router = APIRouter() + + +@router.get( + "/surfsense-docs/by-chunk/{chunk_id}", + response_model=SurfsenseDocsDocumentWithChunksRead, +) +async def get_surfsense_doc_by_chunk_id( + chunk_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + Retrieves a Surfsense documentation document based on a chunk ID. + + This endpoint is used by the frontend to resolve [citation:doc-XXX] links. + """ + try: + # Get the chunk + chunk_result = await session.execute( + select(SurfsenseDocsChunk).filter(SurfsenseDocsChunk.id == chunk_id) + ) + chunk = chunk_result.scalars().first() + + if not chunk: + raise HTTPException( + status_code=404, + detail=f"Surfsense docs chunk with id {chunk_id} not found", + ) + + # Get the associated document with all its chunks + document_result = await session.execute( + select(SurfsenseDocsDocument) + .options(selectinload(SurfsenseDocsDocument.chunks)) + .filter(SurfsenseDocsDocument.id == chunk.document_id) + ) + document = document_result.scalars().first() + + if not document: + raise HTTPException( + status_code=404, + detail="Surfsense docs document not found", + ) + + # Sort chunks by ID + sorted_chunks = sorted(document.chunks, key=lambda x: x.id) + + return SurfsenseDocsDocumentWithChunksRead( + id=document.id, + title=document.title, + source=document.source, + content=document.content, + chunks=[ + SurfsenseDocsChunkRead(id=c.id, content=c.content) + for c in sorted_chunks + ], + ) + except HTTPException: + raise + except Exception as e: + raise HTTPException( + status_code=500, + detail=f"Failed to retrieve Surfsense documentation: {e!s}", + ) from e diff --git a/surfsense_backend/app/schemas/surfsense_docs.py b/surfsense_backend/app/schemas/surfsense_docs.py new file mode 100644 index 000000000..7464df342 --- /dev/null +++ b/surfsense_backend/app/schemas/surfsense_docs.py @@ -0,0 +1,27 @@ +""" +Schemas for Surfsense documentation. 
+""" + +from pydantic import BaseModel, ConfigDict + + +class SurfsenseDocsChunkRead(BaseModel): + """Schema for a Surfsense docs chunk.""" + + id: int + content: str + + model_config = ConfigDict(from_attributes=True) + + +class SurfsenseDocsDocumentWithChunksRead(BaseModel): + """Schema for a Surfsense docs document with its chunks.""" + + id: int + title: str + source: str + content: str + chunks: list[SurfsenseDocsChunkRead] + + model_config = ConfigDict(from_attributes=True) + From abd3bace53e6280e84c723653b2f123ad17e8729 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 18:21:59 +0200 Subject: [PATCH 13/20] feat: add frontend support for surfsense docs citations --- .../assistant-ui/inline-citation.tsx | 9 ++++--- .../components/assistant-ui/markdown-text.tsx | 26 ++++++++++++------- .../new-chat/source-detail-panel.tsx | 11 ++++++-- .../lib/apis/documents-api.service.ts | 12 +++++++++ 4 files changed, 43 insertions(+), 15 deletions(-) diff --git a/surfsense_web/components/assistant-ui/inline-citation.tsx b/surfsense_web/components/assistant-ui/inline-citation.tsx index 065f37e8e..9eab9a3c3 100644 --- a/surfsense_web/components/assistant-ui/inline-citation.tsx +++ b/surfsense_web/components/assistant-ui/inline-citation.tsx @@ -7,13 +7,15 @@ import { SourceDetailPanel } from "@/components/new-chat/source-detail-panel"; interface InlineCitationProps { chunkId: number; citationNumber: number; + isDocsChunk?: boolean; } /** * Inline citation component for the new chat. * Renders a clickable numbered badge that opens the SourceDetailPanel with document chunk details. + * Supports both regular knowledge base chunks and Surfsense documentation chunks. */ -export const InlineCitation: FC = ({ chunkId, citationNumber }) => { +export const InlineCitation: FC = ({ chunkId, citationNumber, isDocsChunk = false }) => { const [isOpen, setIsOpen] = useState(false); return ( @@ -21,10 +23,11 @@ export const InlineCitation: FC = ({ chunkId, citationNumbe open={isOpen} onOpenChange={setIsOpen} chunkId={chunkId} - sourceType="" - title="Source" + sourceType={isDocsChunk ? "SURFSENSE_DOCS" : ""} + title={isDocsChunk ? "Surfsense Documentation" : "Source"} description="" url="" + isDocsChunk={isDocsChunk} > setIsOpen(true)} diff --git a/surfsense_web/components/assistant-ui/markdown-text.tsx b/surfsense_web/components/assistant-ui/markdown-text.tsx index 41d6143b9..532ae7663 100644 --- a/surfsense_web/components/assistant-ui/markdown-text.tsx +++ b/surfsense_web/components/assistant-ui/markdown-text.tsx @@ -15,8 +15,8 @@ import { InlineCitation } from "@/components/assistant-ui/inline-citation"; import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button"; import { cn } from "@/lib/utils"; -// Citation pattern: [citation:CHUNK_ID] -const CITATION_REGEX = /\[citation:(\d+)\]/g; +// Citation pattern: [citation:CHUNK_ID] or [citation:doc-CHUNK_ID] +const CITATION_REGEX = /\[citation:(doc-)?(\d+)\]/g; // Track chunk IDs to citation numbers mapping for consistent numbering // This map is reset when a new message starts rendering @@ -33,16 +33,20 @@ export function resetCitationCounter() { /** * Gets or assigns a citation number for a chunk ID + * Uses string key to differentiate between doc and regular chunks */ -function getCitationNumber(chunkId: number): number { - if (!chunkIdToCitationNumber.has(chunkId)) { - chunkIdToCitationNumber.set(chunkId, nextCitationNumber++); +function getCitationNumber(chunkId: number, isDocsChunk: boolean): number { + const key = isDocsChunk ? 
`doc-${chunkId}` : String(chunkId); + const existingNumber = chunkIdToCitationNumber.get(key as unknown as number); + if (existingNumber === undefined) { + chunkIdToCitationNumber.set(key as unknown as number, nextCitationNumber++); } - return chunkIdToCitationNumber.get(chunkId)!; + return chunkIdToCitationNumber.get(key as unknown as number)!; } /** * Parses text and replaces [citation:XXX] patterns with InlineCitation components + * Supports both regular chunks [citation:123] and docs chunks [citation:doc-123] */ function parseTextWithCitations(text: string): ReactNode[] { const parts: ReactNode[] = []; @@ -59,14 +63,16 @@ function parseTextWithCitations(text: string): ReactNode[] { parts.push(text.substring(lastIndex, match.index)); } - // Add the citation component - const chunkId = Number.parseInt(match[1], 10); - const citationNumber = getCitationNumber(chunkId); + // Check if this is a docs chunk (has "doc-" prefix) + const isDocsChunk = match[1] === "doc-"; + const chunkId = Number.parseInt(match[2], 10); + const citationNumber = getCitationNumber(chunkId, isDocsChunk); parts.push( ); diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx index 35249dc50..dc0c3c3f8 100644 --- a/surfsense_web/components/new-chat/source-detail-panel.tsx +++ b/surfsense_web/components/new-chat/source-detail-panel.tsx @@ -34,6 +34,7 @@ interface SourceDetailPanelProps { description?: string; url?: string; children?: ReactNode; + isDocsChunk?: boolean; } const formatDocumentType = (type: string) => { @@ -114,6 +115,7 @@ export function SourceDetailPanel({ description, url, children, + isDocsChunk = false, }: SourceDetailPanelProps) { const scrollAreaRef = useRef(null); const hasScrolledRef = useRef(false); // Use ref to avoid stale closures @@ -132,8 +134,13 @@ export function SourceDetailPanel({ isLoading: isDocumentByChunkFetching, error: documentByChunkFetchingError, } = useQuery({ - queryKey: cacheKeys.documents.byChunk(chunkId.toString()), - queryFn: () => documentsApiService.getDocumentByChunk({ chunk_id: chunkId }), + queryKey: isDocsChunk + ? cacheKeys.documents.byChunk(`doc-${chunkId}`) + : cacheKeys.documents.byChunk(chunkId.toString()), + queryFn: () => + isDocsChunk + ? 
documentsApiService.getSurfsenseDocByChunk(chunkId) + : documentsApiService.getDocumentByChunk({ chunk_id: chunkId }), enabled: !!chunkId && open, staleTime: 5 * 60 * 1000, }); diff --git a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts index cf7a4b778..372baee4d 100644 --- a/surfsense_web/lib/apis/documents-api.service.ts +++ b/surfsense_web/lib/apis/documents-api.service.ts @@ -209,6 +209,18 @@ class DocumentsApiService { ); }; + /** + * Get Surfsense documentation by chunk ID + * Used for resolving [citation:doc-XXX] citations + */ + getSurfsenseDocByChunk = async (chunkId: number) => { + // Response shape matches getDocumentByChunkResponse structure + return baseApiService.get( + `/api/v1/surfsense-docs/by-chunk/${chunkId}`, + getDocumentByChunkResponse + ); + }; + /** * Update a document */ From 2c3d625b35613a38bb3e58aba359af917afa163a Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 9 Jan 2026 20:11:47 +0200 Subject: [PATCH 14/20] fix: increase top_k from 5 to 10 to match knowledge base --- .../app/agents/new_chat/tools/search_surfsense_docs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py index 21f3942ab..a34e16ff2 100644 --- a/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py +++ b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py @@ -82,7 +82,7 @@ def format_surfsense_docs_results(results: list[tuple]) -> str: async def search_surfsense_docs_async( query: str, db_session: AsyncSession, - top_k: int = 5, + top_k: int = 10, ) -> str: """ Search Surfsense documentation using vector similarity. @@ -127,7 +127,7 @@ def create_search_surfsense_docs_tool(db_session: AsyncSession): """ @tool - async def search_surfsense_docs(query: str, top_k: int = 5) -> str: + async def search_surfsense_docs(query: str, top_k: int = 10) -> str: """ Search Surfsense documentation for help with using the application. 
@@ -145,7 +145,7 @@ def create_search_surfsense_docs_tool(db_session: AsyncSession): Args: query: The search query about Surfsense usage or features - top_k: Number of documentation chunks to retrieve (default: 5) + top_k: Number of documentation chunks to retrieve (default: 10) Returns: Relevant documentation content formatted with chunk IDs for citations From 42473fe4edb4a2872bd5c50dc87e24dd238d6e40 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Mon, 12 Jan 2026 18:06:51 +0200 Subject: [PATCH 15/20] feat: add Zod schemas for Surfsense docs and fix citation handling --- .../components/assistant-ui/markdown-text.tsx | 9 ++--- .../new-chat/source-detail-panel.tsx | 23 +++++++++---- .../contracts/types/document.types.ts | 34 +++++++++++++++++++ .../lib/apis/documents-api.service.ts | 4 +-- 4 files changed, 57 insertions(+), 13 deletions(-) diff --git a/surfsense_web/components/assistant-ui/markdown-text.tsx b/surfsense_web/components/assistant-ui/markdown-text.tsx index 532ae7663..5bc905645 100644 --- a/surfsense_web/components/assistant-ui/markdown-text.tsx +++ b/surfsense_web/components/assistant-ui/markdown-text.tsx @@ -20,7 +20,8 @@ const CITATION_REGEX = /\[citation:(doc-)?(\d+)\]/g; // Track chunk IDs to citation numbers mapping for consistent numbering // This map is reset when a new message starts rendering -let chunkIdToCitationNumber: Map = new Map(); +// Uses string keys to differentiate between doc and regular chunks (e.g., "doc-123" vs "123") +let chunkIdToCitationNumber: Map = new Map(); let nextCitationNumber = 1; /** @@ -37,11 +38,11 @@ export function resetCitationCounter() { */ function getCitationNumber(chunkId: number, isDocsChunk: boolean): number { const key = isDocsChunk ? `doc-${chunkId}` : String(chunkId); - const existingNumber = chunkIdToCitationNumber.get(key as unknown as number); + const existingNumber = chunkIdToCitationNumber.get(key); if (existingNumber === undefined) { - chunkIdToCitationNumber.set(key as unknown as number, nextCitationNumber++); + chunkIdToCitationNumber.set(key, nextCitationNumber++); } - return chunkIdToCitationNumber.get(key as unknown as number)!; + return chunkIdToCitationNumber.get(key)!; } /** diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx index dc0c3c3f8..df2809fdb 100644 --- a/surfsense_web/components/new-chat/source-detail-panel.tsx +++ b/surfsense_web/components/new-chat/source-detail-panel.tsx @@ -21,10 +21,16 @@ import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"; import { ScrollArea } from "@/components/ui/scroll-area"; +import type { + GetDocumentByChunkResponse, + GetSurfsenseDocsByChunkResponse, +} from "@/contracts/types/document.types"; import { documentsApiService } from "@/lib/apis/documents-api.service"; import { cacheKeys } from "@/lib/query-client/cache-keys"; import { cn } from "@/lib/utils"; +type DocumentData = GetDocumentByChunkResponse | GetSurfsenseDocsByChunkResponse; + interface SourceDetailPanelProps { open: boolean; onOpenChange: (open: boolean) => void; @@ -133,14 +139,16 @@ export function SourceDetailPanel({ data: documentData, isLoading: isDocumentByChunkFetching, error: documentByChunkFetchingError, - } = useQuery({ + } = useQuery({ queryKey: isDocsChunk ? 
cacheKeys.documents.byChunk(`doc-${chunkId}`) : cacheKeys.documents.byChunk(chunkId.toString()), - queryFn: () => - isDocsChunk - ? documentsApiService.getSurfsenseDocByChunk(chunkId) - : documentsApiService.getDocumentByChunk({ chunk_id: chunkId }), + queryFn: async () => { + if (isDocsChunk) { + return documentsApiService.getSurfsenseDocByChunk(chunkId); + } + return documentsApiService.getDocumentByChunk({ chunk_id: chunkId }); + }, enabled: !!chunkId && open, staleTime: 5 * 60 * 1000, }); @@ -332,7 +340,7 @@ export function SourceDetailPanel({ {documentData?.title || title || "Source Document"}

- {documentData + {documentData && "document_type" in documentData ? formatDocumentType(documentData.document_type) : sourceType && formatDocumentType(sourceType)} {documentData?.chunks && ( @@ -498,7 +506,8 @@ export function SourceDetailPanel({

{/* Document Metadata */} - {documentData.document_metadata && + {"document_metadata" in documentData && + documentData.document_metadata && Object.keys(documentData.document_metadata).length > 0 && ( ; export type DeleteDocumentRequest = z.infer; export type DeleteDocumentResponse = z.infer; export type DocumentTypeEnum = z.infer; +export type SurfsenseDocsChunk = z.infer; +export type SurfsenseDocsDocument = z.infer; +export type SurfsenseDocsDocumentWithChunks = z.infer; +export type GetSurfsenseDocsByChunkRequest = z.infer; +export type GetSurfsenseDocsByChunkResponse = z.infer; diff --git a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts index 372baee4d..2e7d18e44 100644 --- a/surfsense_web/lib/apis/documents-api.service.ts +++ b/surfsense_web/lib/apis/documents-api.service.ts @@ -17,6 +17,7 @@ import { getDocumentsResponse, getDocumentTypeCountsRequest, getDocumentTypeCountsResponse, + getSurfsenseDocsByChunkResponse, type SearchDocumentsRequest, searchDocumentsRequest, searchDocumentsResponse, @@ -214,10 +215,9 @@ class DocumentsApiService { * Used for resolving [citation:doc-XXX] citations */ getSurfsenseDocByChunk = async (chunkId: number) => { - // Response shape matches getDocumentByChunkResponse structure return baseApiService.get( `/api/v1/surfsense-docs/by-chunk/${chunkId}`, - getDocumentByChunkResponse + getSurfsenseDocsByChunkResponse ); }; From f6621f9a9a2235972f11b9f4f671459e4f650e5e Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Mon, 12 Jan 2026 18:28:38 +0200 Subject: [PATCH 16/20] fix: copy docs content to Docker image for Surfsense docs indexer --- Dockerfile.allinone | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile.allinone b/Dockerfile.allinone index 1c04ffb99..33ae32023 100644 --- a/Dockerfile.allinone +++ b/Dockerfile.allinone @@ -165,6 +165,9 @@ COPY --from=frontend-builder /app/.next/standalone ./ COPY --from=frontend-builder /app/.next/static ./.next/static COPY --from=frontend-builder /app/public ./public +# Copy docs content for Surfsense docs indexer (used at runtime for seeding) +COPY surfsense_web/content/docs /app/surfsense_web/content/docs + # ==================== # Setup Backend # ==================== From 96545056cdf2a88d608e983cfe5e1c63e195e6f5 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Mon, 12 Jan 2026 20:15:11 +0200 Subject: [PATCH 17/20] refactor: async docs seeding in FastAPI lifespan --- scripts/docker/entrypoint-allinone.sh | 25 +---------- surfsense_backend/app/app.py | 3 ++ .../app/tasks/surfsense_docs_indexer.py | 45 +++++++++++++++---- .../scripts/seed_surfsense_docs.py | 41 +++++++---------- 4 files changed, 58 insertions(+), 56 deletions(-) diff --git a/scripts/docker/entrypoint-allinone.sh b/scripts/docker/entrypoint-allinone.sh index 0888facf1..ef0ef28ed 100644 --- a/scripts/docker/entrypoint-allinone.sh +++ b/scripts/docker/entrypoint-allinone.sh @@ -145,36 +145,13 @@ run_migrations() { echo "✅ Database migrations complete" } -# ================================================ -# Seed Surfsense documentation -# ================================================ -seed_surfsense_docs() { - echo "📚 Seeding Surfsense documentation..." 
- - # Start PostgreSQL temporarily for seeding - su - postgres -c "/usr/lib/postgresql/14/bin/pg_ctl -D /data/postgres -l /tmp/postgres_seed.log start" - sleep 5 - - cd /app/backend - python scripts/seed_surfsense_docs.py || echo "⚠️ Docs seeding may have already been done" - - # Stop PostgreSQL - su - postgres -c "/usr/lib/postgresql/14/bin/pg_ctl -D /data/postgres stop" - - echo "✅ Surfsense documentation seeded" -} - # Run migrations on first start or when explicitly requested if [ ! -f /data/.migrations_run ] || [ "${FORCE_MIGRATIONS:-false}" = "true" ]; then run_migrations touch /data/.migrations_run fi -# Seed docs on first start or when explicitly requested -if [ ! -f /data/.docs_seeded ] || [ "${FORCE_SEED_DOCS:-false}" = "true" ]; then - seed_surfsense_docs - touch /data/.docs_seeded -fi +# Note: Surfsense docs seeding is now handled by FastAPI startup (app.py lifespan) # ================================================ # Environment Variables Info diff --git a/surfsense_backend/app/app.py b/surfsense_backend/app/app.py index 993961148..3ad9d89bc 100644 --- a/surfsense_backend/app/app.py +++ b/surfsense_backend/app/app.py @@ -13,6 +13,7 @@ from app.config import config from app.db import User, create_db_and_tables, get_async_session from app.routes import router as crud_router from app.schemas import UserCreate, UserRead, UserUpdate +from app.tasks.surfsense_docs_indexer import seed_surfsense_docs from app.users import SECRET, auth_backend, current_active_user, fastapi_users @@ -22,6 +23,8 @@ async def lifespan(app: FastAPI): await create_db_and_tables() # Setup LangGraph checkpointer tables for conversation persistence await setup_checkpointer_tables() + # Seed Surfsense documentation + await seed_surfsense_docs() yield # Cleanup: close checkpointer connection on shutdown await close_checkpointer() diff --git a/surfsense_backend/app/tasks/surfsense_docs_indexer.py b/surfsense_backend/app/tasks/surfsense_docs_indexer.py index 51a1c0938..f2c1e69ba 100644 --- a/surfsense_backend/app/tasks/surfsense_docs_indexer.py +++ b/surfsense_backend/app/tasks/surfsense_docs_indexer.py @@ -1,6 +1,6 @@ """ Surfsense documentation indexer. -Indexes MDX documentation files at migration time. +Indexes MDX documentation files at startup. """ import hashlib @@ -10,10 +10,11 @@ from datetime import UTC, datetime from pathlib import Path from sqlalchemy import select -from sqlalchemy.orm import Session, selectinload +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload from app.config import config -from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument +from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker logger = logging.getLogger(__name__) @@ -89,12 +90,12 @@ def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]: ] -def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]: +async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, int]: """ Index all Surfsense documentation files. 
Args: - session: SQLAlchemy sync session + session: SQLAlchemy async session Returns: Tuple of (created, updated, skipped, deleted) counts @@ -105,7 +106,7 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]: deleted = 0 # Get all existing docs from database - existing_docs_result = session.execute( + existing_docs_result = await session.execute( select(SurfsenseDocsDocument).options(selectinload(SurfsenseDocsDocument.chunks)) ) existing_docs = {doc.source: doc for doc in existing_docs_result.scalars().all()} @@ -178,11 +179,11 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]: for source, doc in existing_docs.items(): if source not in processed_sources: logger.info(f"Deleting removed document: {source}") - session.delete(doc) + await session.delete(doc) deleted += 1 # Commit all changes - session.commit() + await session.commit() logger.info( f"Indexing complete: {created} created, {updated} updated, " @@ -191,3 +192,31 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]: return created, updated, skipped, deleted + +async def seed_surfsense_docs() -> tuple[int, int, int, int]: + """ + Seed Surfsense documentation into the database. + + This function indexes all MDX files from the docs directory. + It handles creating, updating, and deleting docs based on content changes. + + Returns: + Tuple of (created, updated, skipped, deleted) counts + Returns (0, 0, 0, 0) if an error occurs + """ + logger.info("Starting Surfsense docs indexing...") + + try: + async with async_session_maker() as session: + created, updated, skipped, deleted = await index_surfsense_docs(session) + + logger.info( + f"Surfsense docs indexing complete: " + f"created={created}, updated={updated}, skipped={skipped}, deleted={deleted}" + ) + + return created, updated, skipped, deleted + + except Exception as e: + logger.error(f"Failed to seed Surfsense docs: {e}", exc_info=True) + return 0, 0, 0, 0 diff --git a/surfsense_backend/scripts/seed_surfsense_docs.py b/surfsense_backend/scripts/seed_surfsense_docs.py index 2e9eee649..d9536bf91 100644 --- a/surfsense_backend/scripts/seed_surfsense_docs.py +++ b/surfsense_backend/scripts/seed_surfsense_docs.py @@ -1,47 +1,40 @@ #!/usr/bin/env python """ Seed Surfsense documentation into the database. -Run this script after migrations to index MDX documentation files. + +CLI wrapper for the seed_surfsense_docs function. +Can be run manually for debugging or re-indexing. 
Usage: python scripts/seed_surfsense_docs.py """ +import asyncio import sys from pathlib import Path # Add the parent directory to the path so we can import app modules sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) -from sqlalchemy import create_engine -from sqlalchemy.orm import Session - -from app.config import config -from app.tasks.surfsense_docs_indexer import index_surfsense_docs +from app.tasks.surfsense_docs_indexer import seed_surfsense_docs def main(): - """Main entry point for seeding Surfsense docs.""" - print("Starting Surfsense docs seeding...") + """CLI entry point for seeding Surfsense docs.""" + print("=" * 50) + print(" Surfsense Documentation Seeding") + print("=" * 50) - # Create sync engine from database URL - # Convert async URL to sync if needed - database_url = config.DATABASE_URL - if database_url.startswith("postgresql+asyncpg://"): - database_url = database_url.replace("postgresql+asyncpg://", "postgresql://") + created, updated, skipped, deleted = asyncio.run(seed_surfsense_docs()) - engine = create_engine(database_url) - - with Session(engine) as session: - created, updated, skipped, deleted = index_surfsense_docs(session) - - print(f"\nSurfsense docs seeding complete:") - print(f" Created: {created}") - print(f" Updated: {updated}") - print(f" Skipped: {skipped}") - print(f" Deleted: {deleted}") + print() + print("Results:") + print(f" Created: {created}") + print(f" Updated: {updated}") + print(f" Skipped: {skipped}") + print(f" Deleted: {deleted}") + print("=" * 50) if __name__ == "__main__": main() - From 19ef32539d33a2c30f757c062576b267abd426a6 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Mon, 12 Jan 2026 20:20:48 +0200 Subject: [PATCH 18/20] cleanup: remove obsolete comments --- Dockerfile.allinone | 1 - scripts/docker/entrypoint-allinone.sh | 2 -- 2 files changed, 3 deletions(-) diff --git a/Dockerfile.allinone b/Dockerfile.allinone index 33ae32023..95893c0b5 100644 --- a/Dockerfile.allinone +++ b/Dockerfile.allinone @@ -165,7 +165,6 @@ COPY --from=frontend-builder /app/.next/standalone ./ COPY --from=frontend-builder /app/.next/static ./.next/static COPY --from=frontend-builder /app/public ./public -# Copy docs content for Surfsense docs indexer (used at runtime for seeding) COPY surfsense_web/content/docs /app/surfsense_web/content/docs # ==================== diff --git a/scripts/docker/entrypoint-allinone.sh b/scripts/docker/entrypoint-allinone.sh index ef0ef28ed..8248968ab 100644 --- a/scripts/docker/entrypoint-allinone.sh +++ b/scripts/docker/entrypoint-allinone.sh @@ -151,8 +151,6 @@ if [ ! 
-f /data/.migrations_run ] || [ "${FORCE_MIGRATIONS:-false}" = "true" ]; touch /data/.migrations_run fi -# Note: Surfsense docs seeding is now handled by FastAPI startup (app.py lifespan) - # ================================================ # Environment Variables Info # ================================================ From 842004e6170410c50021ad3d3286c669b8926781 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Mon, 12 Jan 2026 20:54:00 +0200 Subject: [PATCH 19/20] docs: add tool examples to system prompt --- .../app/agents/new_chat/system_prompt.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/surfsense_backend/app/agents/new_chat/system_prompt.py b/surfsense_backend/app/agents/new_chat/system_prompt.py index f9dfdb025..169363fe9 100644 --- a/surfsense_backend/app/agents/new_chat/system_prompt.py +++ b/surfsense_backend/app/agents/new_chat/system_prompt.py @@ -26,6 +26,13 @@ SURFSENSE_TOOLS_INSTRUCTIONS = """ You have access to the following tools: +0. search_surfsense_docs: Search the official SurfSense documentation. + - Use this tool when the user asks anything about SurfSense itself (the application they are using). + - Args: + - query: The search query about SurfSense + - top_k: Number of documentation chunks to retrieve (default: 10) + - Returns: Documentation content with chunk IDs for citations (prefixed with 'doc-', e.g., [citation:doc-123]) + 1. search_knowledge_base: Search the user's personal knowledge base for relevant information. - Args: - query: The search query - be specific and include key terms @@ -152,6 +159,18 @@ You have access to the following tools: - Airtable/Notion: Check field values, apply mapping above +- User: "How do I install SurfSense?" + - Call: `search_surfsense_docs(query="installation setup")` + +- User: "What connectors does SurfSense support?" + - Call: `search_surfsense_docs(query="available connectors integrations")` + +- User: "How do I set up the Notion connector?" + - Call: `search_surfsense_docs(query="Notion connector setup configuration")` + +- User: "How do I use Docker to run SurfSense?" + - Call: `search_surfsense_docs(query="Docker installation setup")` + - User: "Fetch all my notes and what's in them?" - Call: `search_knowledge_base(query="*", top_k=50, connectors_to_search=["NOTE"])` From c0b97fcc68cae577d6273a8570ff7717d364d814 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Mon, 12 Jan 2026 21:00:06 +0200 Subject: [PATCH 20/20] fix: add doc- prefix examples to citation instructions --- surfsense_backend/app/agents/new_chat/system_prompt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/surfsense_backend/app/agents/new_chat/system_prompt.py b/surfsense_backend/app/agents/new_chat/system_prompt.py index 169363fe9..15fc17022 100644 --- a/surfsense_backend/app/agents/new_chat/system_prompt.py +++ b/surfsense_backend/app/agents/new_chat/system_prompt.py @@ -327,7 +327,7 @@ The documents you receive are structured like this: -IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124). Do NOT cite document_id. +IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124, doc-45). Do NOT cite document_id. @@ -338,11 +338,13 @@ IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124). Do NOT cite docume - NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format - NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". 
Always use plain square brackets only
 - NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
+- Copy the EXACT chunk id from the XML - if it says `<chunk id="doc-123">`, use [citation:doc-123]


 CORRECT citation formats:
 - [citation:5]
+- [citation:doc-123] (for Surfsense documentation chunks)
 - [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]

 INCORRECT citation formats (DO NOT use):
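
Taken together, the series wires up three pieces: indexing (docs seeding at FastAPI startup), retrieval (the search_surfsense_docs agent tool), and citation resolution (the by-chunk endpoint). A minimal sketch of exercising that flow end to end against a migrated database follows; the script name and the query string are illustrative, and the imports assume the modules introduced in the patches above:

#!/usr/bin/env python
# smoke_test_surfsense_docs.py - hypothetical sketch, not part of this series.
import asyncio

from app.agents.new_chat.tools.search_surfsense_docs import search_surfsense_docs_async
from app.db import async_session_maker
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs


async def main() -> None:
    # Idempotent: unchanged MDX files are skipped via the content_hash check.
    created, updated, skipped, deleted = await seed_surfsense_docs()
    print(f"Seeded: {created} created, {updated} updated, {skipped} skipped, {deleted} deleted")

    async with async_session_maker() as session:
        # Chunk ids come back with the 'doc-' prefix, so the model can emit
        # [citation:doc-123] and the frontend resolves it through
        # GET /api/v1/surfsense-docs/by-chunk/123.
        xml = await search_surfsense_docs_async(
            query="How do I configure the Notion connector?",
            db_session=session,
            top_k=5,
        )
        print(xml)


if __name__ == "__main__":
    asyncio.run(main())

One caveat: seed_surfsense_docs() catches its own exceptions and returns (0, 0, 0, 0) on failure, so a real smoke test should also assert that at least one document exists after seeding rather than relying on the counts alone.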