feat: enhance Google connectors indexing with content extraction and document migration

- Added `download_and_extract_content` function to extract content from Google Drive files as markdown.
- Updated Google Drive indexer to utilize the new content extraction method.
- Implemented document migration logic to update legacy Composio document types to their native Google types.
- Introduced identifier hashing for stable document identification.
- Improved file pre-filtering to handle unchanged and rename-only files efficiently.
2026-04-26 01:06:23 +02:00 · 2026-03-25 18:33:44 +05:30 · 2026-03-25 18:33:44 +05:30 · f7b52470eb
commit f7b52470eb
parent 2da6fd89ea
8 changed files with 951 additions and 1588 deletions
--- a/surfsense_backend/app/indexing_pipeline/document_hashing.py
+++ b/surfsense_backend/app/indexing_pipeline/document_hashing.py
@@ -3,10 +3,17 @@ import hashlib
 from app.indexing_pipeline.connector_document import ConnectorDocument


+def compute_identifier_hash(
+    document_type_value: str, unique_id: str, search_space_id: int
+) -> str:
+    """Return a stable SHA-256 hash from raw identity components."""
+    combined = f"{document_type_value}:{unique_id}:{search_space_id}"
+    return hashlib.sha256(combined.encode("utf-8")).hexdigest()
+
+
 def compute_unique_identifier_hash(doc: ConnectorDocument) -> str:
    """Return a stable SHA-256 hash identifying a document by its source identity."""
-    combined = f"{doc.document_type.value}:{doc.unique_id}:{doc.search_space_id}"
-    return hashlib.sha256(combined.encode("utf-8")).hexdigest()
+    return compute_identifier_hash(doc.document_type.value, doc.unique_id, doc.search_space_id)


 def compute_content_hash(doc: ConnectorDocument) -> str: