feat: enhance Google connectors indexing with content extraction and document migration

- Added `download_and_extract_content` function to extract content from Google Drive files as markdown.
- Updated Google Drive indexer to utilize the new content extraction method.
- Implemented document migration logic to update legacy Composio document types to their native Google types.
- Introduced identifier hashing for stable document identification.
- Improved file pre-filtering to handle unchanged and rename-only files efficiently.
2026-04-25 00:36:31 +02:00 · 2026-03-25 18:33:44 +05:30 · 2026-03-25 18:33:44 +05:30 · f7b52470eb
commit f7b52470eb
parent 2da6fd89ea
8 changed files with 951 additions and 1588 deletions
--- a/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py
+++ b/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py
@ -3,6 +3,7 @@ import pytest
 from app.db import DocumentType
 from app.indexing_pipeline.document_hashing import (
    compute_content_hash,
+    compute_identifier_hash,
    compute_unique_identifier_hash,
 )

@ -61,3 +62,23 @@ def test_different_content_produces_different_content_hash(make_connector_docume
    doc_a = make_connector_document(source_markdown="Original content")
    doc_b = make_connector_document(source_markdown="Updated content")
    assert compute_content_hash(doc_a) != compute_content_hash(doc_b)
+
+
+def test_compute_identifier_hash_matches_connector_doc_hash(make_connector_document):
+    """compute_identifier_hash(raw args) equals compute_unique_identifier_hash(doc)."""
+    doc = make_connector_document(
+        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
+        unique_id="msg-123",
+        search_space_id=5,
+    )
+    raw_hash = compute_identifier_hash("GOOGLE_GMAIL_CONNECTOR", "msg-123", 5)
+    assert raw_hash == compute_unique_identifier_hash(doc)
+
+
+def test_compute_identifier_hash_differs_for_different_inputs():
+    """Changing only the unique id, search space, or document type changes the hash."""
+    h1 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-1", 1)
+    h2 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-2", 1)
+    h3 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-1", 2)
+    h4 = compute_identifier_hash("COMPOSIO_GOOGLE_DRIVE_CONNECTOR", "file-1", 1)
+    assert len({h1, h2, h3, h4}) == 4