feat: implement and test index method

2026-07-12 22:42:13 +02:00 · 2026-02-25 01:40:30 +02:00 · 2026-02-25 01:40:30 +02:00 · 61e50834e6
commit 61e50834e6
parent 497ed681d5
8 changed files with 218 additions and 31 deletions
--- a/surfsense_backend/app/indexing_pipeline/connector_document.py
+++ b/surfsense_backend/app/indexing_pipeline/connector_document.py
@ -4,6 +4,7 @@ from app.db import DocumentType


 class ConnectorDocument(BaseModel):
+    """Canonical data transfer object produced by connector adapters and consumed by the indexing pipeline."""
    title: str
    source_markdown: str
    unique_id: str
--- a/surfsense_backend/app/indexing_pipeline/document_chunker.py
+++ b/surfsense_backend/app/indexing_pipeline/document_chunker.py
@ -0,0 +1,6 @@
+from app.config import config
+
+
+def chunk_text(text: str) -> list[str]:
+    """Split ``text`` with ``config.chunker_instance`` and return the ``.text`` of every chunk."""
+    return [piece.text for piece in config.chunker_instance.chunk(text)]
--- a/surfsense_backend/app/indexing_pipeline/document_embedder.py
+++ b/surfsense_backend/app/indexing_pipeline/document_embedder.py
@ -0,0 +1,6 @@
+from app.config import config
+
+
+def embed_text(text: str) -> list[float]:
+    """Return the embedding of ``text`` produced by ``config.embedding_model_instance.embed``."""
+    return config.embedding_model_instance.embed(text)
--- a/surfsense_backend/app/indexing_pipeline/document_hashing.py
+++ b/surfsense_backend/app/indexing_pipeline/document_hashing.py
@ -4,10 +4,12 @@ from app.indexing_pipeline.connector_document import ConnectorDocument


 def compute_unique_identifier_hash(doc: ConnectorDocument) -> str:
+    """Return the SHA-256 hex digest of the document's type value, unique_id and search_space_id."""
    combined = f"{doc.document_type.value}:{doc.unique_id}:{doc.search_space_id}"
    return hashlib.sha256(combined.encode("utf-8")).hexdigest()


 def compute_content_hash(doc: ConnectorDocument) -> str:
+    """Return the SHA-256 hex digest of the source markdown prefixed with the search space id."""
    combined = f"{doc.search_space_id}:{doc.source_markdown}"
    return hashlib.sha256(combined.encode("utf-8")).hexdigest()
--- a/surfsense_backend/app/indexing_pipeline/document_summarizer.py
+++ b/surfsense_backend/app/indexing_pipeline/document_summarizer.py
@ -0,0 +1,28 @@
+from app.prompts import SUMMARY_PROMPT_TEMPLATE
+from app.utils.document_converters import optimize_content_for_context_window
+
+
+async def summarize_document(source_markdown: str, llm, metadata: dict | None = None) -> str:
+    """Summarize a document with an LLM and, when metadata is truthy, prepend a metadata header."""
+    prepared_content = optimize_content_for_context_window(
+        source_markdown, metadata, getattr(llm, "model", "gpt-3.5-turbo")  # fallback model name
+    )
+    document_payload = (
+        f"<DOCUMENT><DOCUMENT_METADATA>\n\n{metadata}\n\n</DOCUMENT_METADATA>"
+        f"\n\n<DOCUMENT_CONTENT>\n\n{prepared_content}\n\n</DOCUMENT_CONTENT></DOCUMENT>"
+    )
+    response = await (SUMMARY_PROMPT_TEMPLATE | llm).ainvoke({"document": document_payload})
+    summary_text = response.content
+
+    if not metadata:
+        return summary_text
+
+    # Only truthy metadata values get a bold, title-cased label line in the header.
+    header_lines = ["# DOCUMENT METADATA"]
+    header_lines += [
+        f"**{key.replace('_', ' ').title()}:** {value}"
+        for key, value in metadata.items()
+        if value
+    ]
+    metadata_section = "\n".join(header_lines)
+    return f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_text}"
--- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
+++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
@ -4,14 +4,16 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import object_session
 from sqlalchemy.orm.attributes import set_committed_value

-from app.config import config
-from app.db import Document, DocumentStatus
+from app.db import Chunk, Document, DocumentStatus
 from app.indexing_pipeline.connector_document import ConnectorDocument
+from app.indexing_pipeline.document_chunker import chunk_text
+from app.indexing_pipeline.document_embedder import embed_text
 from app.indexing_pipeline.document_hashing import compute_content_hash, compute_unique_identifier_hash
-from app.utils.document_converters import create_document_chunks, generate_document_summary
+from app.indexing_pipeline.document_summarizer import summarize_document


 def _safe_set_chunks(document: Document, chunks: list) -> None:
+    """Assign chunks to a document without triggering SQLAlchemy async lazy loading."""
    set_committed_value(document, "chunks", chunks)
    session = object_session(document)
    if session is not None:
@ -22,12 +24,17 @@ def _safe_set_chunks(document: Document, chunks: list) -> None:


 class IndexingPipelineService:
+    """Single pipeline for indexing connector documents. All connectors use this service."""
+
    def __init__(self, session: AsyncSession) -> None:
        self.session = session

    async def prepare_for_indexing(
        self, connector_docs: list[ConnectorDocument]
    ) -> list[Document]:
+        """
+        Persist new documents and detect changes, returning only those that need indexing.
+        """
        documents = []

        for connector_doc in connector_docs:
@ -73,19 +80,26 @@ class IndexingPipelineService:
    async def index(
        self, document: Document, connector_doc: ConnectorDocument, llm
    ) -> None:
+        """
+        Run summarization, embedding, and chunking for a document and persist the results.
+        """
        try:
            document.status = DocumentStatus.processing()
            await self.session.commit()

            if connector_doc.should_summarize:
-                content, embedding = await generate_document_summary(
+                content = await summarize_document(
                    connector_doc.source_markdown, llm, connector_doc.metadata
                )
            else:
                content = connector_doc.source_markdown
-                embedding = config.embedding_model_instance.embed(content)

-            chunks = await create_document_chunks(connector_doc.source_markdown)
+            embedding = embed_text(content)
+
+            chunks = [
+                Chunk(content=text, embedding=embed_text(text))
+                for text in chunk_text(connector_doc.source_markdown)
+            ]

            document.source_markdown = connector_doc.source_markdown
            document.content = content