feat: enhance legacy document migration for Google connectors

- Implemented fallback logic in the Google Calendar, Drive, and Gmail indexers to handle legacy Composio document types, ensuring smooth migration to native types.
- Updated document indexing functions to check for existing documents using both primary and legacy hashes, improving data integrity during indexing.
2026-04-27 17:56:25 +02:00 · 2026-03-20 03:39:05 +05:30 · 2026-03-20 03:39:05 +05:30 · aaf34800e6
commit aaf34800e6
parent 8e7cda31c5
4 changed files with 99 additions and 6 deletions
--- a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
+++ b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
@@ -614,10 +614,23 @@ async def search_knowledge_base_async(
    connectors = _normalize_connectors(connectors_to_search, available_connectors)

    # --- Optimization 1: skip connectors that have zero indexed documents ---
+    # Native Google types must also match their legacy Composio equivalents
+    # (old documents may still carry the Composio type until re-indexed).
+    _NATIVE_TO_LEGACY: dict[str, str] = {
+        "GOOGLE_DRIVE_FILE": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+        "GOOGLE_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR",
+        "GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
+    }
+
    if available_document_types:
        doc_types_set = set(available_document_types)
        before_count = len(connectors)
-        connectors = [c for c in connectors if c in doc_types_set]
+        connectors = [
+            c
+            for c in connectors
+            if c in doc_types_set
+            or _NATIVE_TO_LEGACY.get(c, "") in doc_types_set
+        ]
        skipped = before_count - len(connectors)
        if skipped:
            perf.info(
@@ -793,6 +806,10 @@ async def search_knowledge_base_async(

        deduplicated.append(doc)

+    # Sort by RRF score so the most relevant documents from ANY connector
+    # appear first, preventing budget truncation from hiding top results.
+    deduplicated.sort(key=lambda d: d.get("score", 0), reverse=True)
+
    output_budget = _compute_tool_output_budget(max_input_tokens)
    result = format_documents_for_context(deduplicated, max_chars=output_budget)