mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 09:16:22 +02:00
feat: enhance Google connectors indexing with content extraction and document migration
- Added `download_and_extract_content` function to extract content from Google Drive files as markdown.
- Updated Google Drive indexer to utilize the new content extraction method.
- Implemented document migration logic to update legacy Composio document types to their native Google types.
- Introduced identifier hashing for stable document identification.
- Improved file pre-filtering to handle unchanged and rename-only files efficiently.
This commit is contained in:
parent
2da6fd89ea
commit
f7b52470eb
8 changed files with 951 additions and 1588 deletions
|
|
@ -3,6 +3,7 @@ import pytest
|
|||
from app.db import DocumentType
|
||||
from app.indexing_pipeline.document_hashing import (
|
||||
compute_content_hash,
|
||||
compute_identifier_hash,
|
||||
compute_unique_identifier_hash,
|
||||
)
|
||||
|
||||
|
|
@ -61,3 +62,23 @@ def test_different_content_produces_different_content_hash(make_connector_docume
|
|||
doc_a = make_connector_document(source_markdown="Original content")
|
||||
doc_b = make_connector_document(source_markdown="Updated content")
|
||||
assert compute_content_hash(doc_a) != compute_content_hash(doc_b)
|
||||
|
||||
|
||||
def test_compute_identifier_hash_matches_connector_doc_hash(make_connector_document):
    """Hashing raw arguments agrees with hashing the equivalent ConnectorDocument."""
    # Build a document whose type, unique id and search space mirror the
    # raw arguments passed to compute_identifier_hash below.
    connector_doc = make_connector_document(
        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
        unique_id="msg-123",
        search_space_id=5,
    )

    hash_from_raw_args = compute_identifier_hash("GOOGLE_GMAIL_CONNECTOR", "msg-123", 5)
    hash_from_document = compute_unique_identifier_hash(connector_doc)

    assert hash_from_raw_args == hash_from_document
|
||||
|
||||
|
||||
def test_compute_identifier_hash_differs_for_different_inputs():
    """Each distinct argument combination yields its own hash."""
    # The first tuple is the baseline; every following tuple changes exactly
    # one of its three positional arguments.
    argument_sets = [
        ("GOOGLE_DRIVE_FILE", "file-1", 1),
        ("GOOGLE_DRIVE_FILE", "file-2", 1),
        ("GOOGLE_DRIVE_FILE", "file-1", 2),
        ("COMPOSIO_GOOGLE_DRIVE_CONNECTOR", "file-1", 1),
    ]

    # A set collapses equal hashes, so any collision shrinks it below four.
    distinct_hashes = {compute_identifier_hash(*args) for args in argument_sets}

    assert len(distinct_hashes) == 4
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue