mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-27 17:56:25 +02:00
feat: enhance legacy document migration for Google connectors
- Implemented fallback logic in Google Calendar, Drive, and Gmail indexers to handle legacy Composio document types, ensuring smooth migration to native types. - Updated document indexing functions to check for existing documents using both primary and legacy hashes, improving data integrity during indexing.
This commit is contained in:
parent
8e7cda31c5
commit
aaf34800e6
4 changed files with 99 additions and 6 deletions
|
|
@ -614,10 +614,23 @@ async def search_knowledge_base_async(
|
|||
connectors = _normalize_connectors(connectors_to_search, available_connectors)
|
||||
|
||||
# --- Optimization 1: skip connectors that have zero indexed documents ---
|
||||
# Native Google types must also match their legacy Composio equivalents
|
||||
# (old documents may still carry the Composio type until re-indexed).
|
||||
_NATIVE_TO_LEGACY: dict[str, str] = {
|
||||
"GOOGLE_DRIVE_FILE": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
|
||||
"GOOGLE_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR",
|
||||
"GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
|
||||
}
|
||||
|
||||
if available_document_types:
|
||||
doc_types_set = set(available_document_types)
|
||||
before_count = len(connectors)
|
||||
connectors = [c for c in connectors if c in doc_types_set]
|
||||
connectors = [
|
||||
c
|
||||
for c in connectors
|
||||
if c in doc_types_set
|
||||
or _NATIVE_TO_LEGACY.get(c, "") in doc_types_set
|
||||
]
|
||||
skipped = before_count - len(connectors)
|
||||
if skipped:
|
||||
perf.info(
|
||||
|
|
@ -793,6 +806,10 @@ async def search_knowledge_base_async(
|
|||
|
||||
deduplicated.append(doc)
|
||||
|
||||
# Sort by RRF score so the most relevant documents from ANY connector
|
||||
# appear first, preventing budget truncation from hiding top results.
|
||||
deduplicated.sort(key=lambda d: d.get("score", 0), reverse=True)
|
||||
|
||||
output_budget = _compute_tool_output_budget(max_input_tokens)
|
||||
result = format_documents_for_context(deduplicated, max_chars=output_budget)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue