2026-02-24 22:48:40 +02:00
|
|
|
import hashlib
|
|
|
|
|
|
|
|
|
|
from app.indexing_pipeline.connector_document import ConnectorDocument
|
|
|
|
|
|
|
|
|
|
|
2026-03-25 18:33:44 +05:30
|
|
|
def compute_identifier_hash(
    document_type_value: str, unique_id: str, search_space_id: int
) -> str:
    """Hash a document's raw identity components into a SHA-256 hex digest.

    The components are joined with ``:`` in the order document type value,
    unique id, search space id. The joined string is UTF-8 encoded and
    hashed, so identical inputs always yield the same digest.

    Args:
        document_type_value: String value naming the document type.
        unique_id: Identifier of the document.
        search_space_id: Id of the search space the document belongs to.

    Returns:
        The hexadecimal SHA-256 digest of the joined components.
    """
    identity_key = f"{document_type_value}:{unique_id}:{search_space_id}"
    hasher = hashlib.sha256()
    hasher.update(identity_key.encode("utf-8"))
    return hasher.hexdigest()
|
|
|
|
|
|
|
|
|
|
|
2026-02-24 22:48:40 +02:00
|
|
|
def compute_unique_identifier_hash(doc: ConnectorDocument) -> str:
    """Return the identity hash of a connector document.

    Reads the document's type value, unique id and search space id and
    delegates to ``compute_identifier_hash``, so the result depends only on
    those three fields.

    Args:
        doc: Document exposing ``document_type.value``, ``unique_id`` and
            ``search_space_id``.

    Returns:
        The hex digest produced by ``compute_identifier_hash``.
    """
    type_value = doc.document_type.value
    source_id = doc.unique_id
    space_id = doc.search_space_id
    return compute_identifier_hash(type_value, source_id, space_id)
|
2026-02-24 22:48:40 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def compute_content_hash(doc: ConnectorDocument) -> str:
    """Hash a document's markdown content together with its search space id.

    The search space id is prefixed to the markdown (separated by ``:``)
    before hashing, so the same content in two different search spaces
    produces two different digests.

    Args:
        doc: Document exposing ``search_space_id`` and ``source_markdown``.

    Returns:
        The hexadecimal SHA-256 digest of the UTF-8 encoded
        ``"<search_space_id>:<source_markdown>"`` string.
    """
    scoped_content = f"{doc.search_space_id}:{doc.source_markdown}"
    hasher = hashlib.sha256()
    hasher.update(scoped_content.encode("utf-8"))
    return hasher.hexdigest()
|