mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 08:46:22 +02:00
feat: add unique identifier hash for documents to prevent duplicates across various connectors
This commit is contained in:
parent
673bf6f3c1
commit
c99cd710ea
22 changed files with 1631 additions and 356 deletions
|
|
@@ -29,3 +29,27 @@ async def check_duplicate_document(
|
|||
select(Document).where(Document.content_hash == content_hash)
|
||||
)
|
||||
return existing_doc_result.scalars().first()
|
||||
|
||||
|
||||
async def check_document_by_unique_identifier(
    session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
    """
    Look up a document by its unique identifier hash.

    Chunks are eagerly loaded via ``selectinload`` so that a subsequent
    update of the returned document does not trigger lazy loading.

    Args:
        session: Database session
        unique_identifier_hash: Hash of the unique identifier from the source

    Returns:
        Existing document if found, None otherwise
    """
    from sqlalchemy.orm import selectinload

    stmt = (
        select(Document)
        .options(selectinload(Document.chunks))
        .where(Document.unique_identifier_hash == unique_identifier_hash)
    )
    result = await session.execute(stmt)
    return result.scalars().first()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue