mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 01:06:23 +02:00
feat: enhance Google connectors indexing with content extraction and document migration
- Added `download_and_extract_content` function to extract content from Google Drive files as markdown.
- Updated Google Drive indexer to utilize the new content extraction method.
- Implemented document migration logic to update legacy Composio document types to their native Google types.
- Introduced identifier hashing for stable document identification.
- Improved file pre-filtering to handle unchanged and rename-only files efficiently.
This commit is contained in:
parent
2da6fd89ea
commit
f7b52470eb
8 changed files with 951 additions and 1588 deletions
|
|
@ -3,10 +3,17 @@ import hashlib
|
|||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||
|
||||
|
||||
def compute_identifier_hash(
    document_type_value: str, unique_id: str, search_space_id: int
) -> str:
    """Return a stable SHA-256 hash from raw identity components.

    The three components are joined with ``:`` in the order
    ``document_type_value:unique_id:search_space_id``, encoded as UTF-8,
    and hashed with SHA-256.

    Args:
        document_type_value: String value of the document type.
        unique_id: Source-side identifier of the document.
        search_space_id: Identifier of the search space the document is in.

    Returns:
        The hexadecimal SHA-256 digest of the combined identity string.
    """
    # The exact layout of this string determines the digest; changing it
    # would change the hash produced for the same inputs.
    combined = f"{document_type_value}:{unique_id}:{search_space_id}"
    return hashlib.sha256(combined.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def compute_unique_identifier_hash(doc: ConnectorDocument) -> str:
    """Return a stable SHA-256 hash identifying a document by its source identity.

    Delegates to ``compute_identifier_hash`` so that hashes built from a
    ``ConnectorDocument`` and hashes built from raw components share a
    single implementation.

    Args:
        doc: Document whose ``document_type.value``, ``unique_id`` and
            ``search_space_id`` form its identity.

    Returns:
        The hexadecimal SHA-256 digest of ``type:unique_id:search_space_id``.
    """
    return compute_identifier_hash(
        doc.document_type.value, doc.unique_id, doc.search_space_id
    )
|
||||
|
||||
|
||||
def compute_content_hash(doc: ConnectorDocument) -> str:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue