SurfSense/surfsense_backend/app/indexing_pipeline/document_hashing.py
Anish Sarkar f7b52470eb feat: enhance Google connectors indexing with content extraction and document migration
- Added `download_and_extract_content` function to extract content from Google Drive files as markdown.
- Updated Google Drive indexer to utilize the new content extraction method.
- Implemented document migration logic to update legacy Composio document types to their native Google types.
- Introduced identifier hashing for stable document identification.
- Improved file pre-filtering to handle unchanged and rename-only files efficiently.
2026-03-25 18:33:44 +05:30

22 lines
920 B
Python

import hashlib
from app.indexing_pipeline.connector_document import ConnectorDocument
def compute_identifier_hash(
    document_type_value: str, unique_id: str, search_space_id: int
) -> str:
    """Return a stable SHA-256 hex digest built from raw identity components.

    The three components are joined with ``:`` separators, so the same
    (type, id, search-space) triple always maps to the same digest.
    """
    # Join the identity parts with ':' — equivalent to the f-string form,
    # but makes the separator convention explicit.
    identity = ":".join((document_type_value, unique_id, str(search_space_id)))
    digest = hashlib.sha256(identity.encode("utf-8"))
    return digest.hexdigest()
def compute_unique_identifier_hash(doc: ConnectorDocument) -> str:
    """Return a stable SHA-256 hash identifying *doc* by its source identity.

    Delegates to :func:`compute_identifier_hash` using the document's
    type value, its connector-side unique id, and its search space id,
    so the digest is stable across re-indexing runs.
    """
    type_value = doc.document_type.value
    return compute_identifier_hash(
        type_value,
        doc.unique_id,
        doc.search_space_id,
    )
def compute_content_hash(doc: ConnectorDocument) -> str:
    """Return a SHA-256 hash of the document's content, scoped to its search space.

    Prefixing the search space id means identical markdown indexed into
    two different search spaces yields two distinct content hashes.
    """
    # Scope the content to the search space, then hash the UTF-8 bytes.
    payload = f"{doc.search_space_id}:{doc.source_markdown}".encode("utf-8")
    return hashlib.sha256(payload).hexdigest()