Merge pull request #404 from MODSetter/dev

feat: add unique identifier hash for documents to prevent duplicates across various connectors
2026-06-06 20:15:17 +02:00 · 2025-10-14 21:19:53 -07:00 · 2025-10-14 21:19:53 -07:00 · d868bae134
commit d868bae134
parent 1f3e4bfa09 97f06899e4
22 changed files with 1631 additions and 356 deletions
--- a/surfsense_backend/alembic/versions/29_add_unique_identifier_hash_to_documents.py
+++ b/surfsense_backend/alembic/versions/29_add_unique_identifier_hash_to_documents.py
@ -0,0 +1,54 @@
 """Add unique_identifier_hash column to documents table
 Revision ID: 29
 Revises: 28
 """
 from collections.abc import Sequence
 import sqlalchemy as sa
 from sqlalchemy import inspect
 from alembic import op
 # revision identifiers, used by Alembic.
 revision: str = "29"
 down_revision: str | None = "28"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 def upgrade() -> None:
    bind = op.get_bind()
    inspector = inspect(bind)
    columns = [col["name"] for col in inspector.get_columns("documents")]
    # Only add the column if it doesn't already exist
    if "unique_identifier_hash" not in columns:
        op.add_column(
            "documents",
            sa.Column("unique_identifier_hash", sa.String(), nullable=True),
        )
        op.create_index(
            op.f("ix_documents_unique_identifier_hash"),
            "documents",
            ["unique_identifier_hash"],
            unique=False,
        )
        op.create_unique_constraint(
            op.f("uq_documents_unique_identifier_hash"),
            "documents",
            ["unique_identifier_hash"],
        )
    else:
        print(
            "Column 'unique_identifier_hash' already exists. Skipping column creation."
        )
 def downgrade() -> None:
    op.drop_constraint(
        op.f("uq_documents_unique_identifier_hash"), "documents", type_="unique"
    )
    op.drop_index(op.f("ix_documents_unique_identifier_hash"), table_name="documents")
    op.drop_column("documents", "unique_identifier_hash")
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@ -174,6 +174,7 @@ class Document(BaseModel, TimestampMixin):
    content = Column(Text, nullable=False)
    content_hash = Column(String, nullable=False, index=True, unique=True)
    unique_identifier_hash = Column(String, nullable=True, index=True, unique=True)
    embedding = Column(Vector(config.embedding_model_instance.dimension))
    search_space_id = Column(
--- a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py
@ -16,11 +16,12 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
    calculate_date_range,
-    check_duplicate_document_by_hash,
+    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -240,25 +241,100 @@ async def index_airtable_records(
                                documents_skipped += 1
                                continue
                            record_id = record.get("id", "Unknown")
                            # Generate unique identifier hash for this Airtable record
                            unique_identifier_hash = generate_unique_identifier_hash(
                                DocumentType.AIRTABLE_CONNECTOR,
                                record_id,
                                search_space_id,
                            )
                            # Generate content hash
                            content_hash = generate_content_hash(
                                markdown_content, search_space_id
                            )
-                            # Check if document already exists
+                            # Check if document with this unique identifier already exists
-                            existing_document_by_hash = (
+                            existing_document = (
-                                await check_duplicate_document_by_hash(
+                                await check_document_by_unique_identifier(
-                                    session, content_hash
+                                    session, unique_identifier_hash
                                )
                            )
-                            if existing_document_by_hash:
+                            if existing_document:
-                                logger.info(
+                                # Document exists - check if content has changed
-                                    f"Document with content hash {content_hash} already exists for message {record.get('id')}. Skipping processing."
+                                if existing_document.content_hash == content_hash:
-                                )
+                                    logger.info(
-                                documents_skipped += 1
+                                        f"Document for Airtable record {record_id} unchanged. Skipping."
-                                continue
+                                    )
                                    documents_skipped += 1
                                    continue
                                else:
                                    # Content has changed - update the existing document
                                    logger.info(
                                        f"Content changed for Airtable record {record_id}. Updating document."
                                    )
                                    # Generate document summary
                                    user_llm = await get_user_long_context_llm(
                                        session, user_id, search_space_id
                                    )
                                    if user_llm:
                                        document_metadata = {
                                            "record_id": record_id,
                                            "created_time": record.get(
                                                "CREATED_TIME()", ""
                                            ),
                                            "document_type": "Airtable Record",
                                            "connector_type": "Airtable",
                                        }
                                        (
                                            summary_content,
                                            summary_embedding,
                                        ) = await generate_document_summary(
                                            markdown_content,
                                            user_llm,
                                            document_metadata,
                                        )
                                    else:
                                        summary_content = (
                                            f"Airtable Record: {record_id}\n\n"
                                        )
                                        summary_embedding = (
                                            config.embedding_model_instance.embed(
                                                summary_content
                                            )
                                        )
                                    # Process chunks
                                    chunks = await create_document_chunks(
                                        markdown_content
                                    )
                                    # Update existing document
                                    existing_document.title = (
                                        f"Airtable Record: {record_id}"
                                    )
                                    existing_document.content = summary_content
                                    existing_document.content_hash = content_hash
                                    existing_document.embedding = summary_embedding
                                    existing_document.document_metadata = {
                                        "record_id": record_id,
                                        "created_time": record.get(
                                            "CREATED_TIME()", ""
                                        ),
                                    }
                                    existing_document.chunks = chunks
                                    documents_indexed += 1
                                    logger.info(
                                        f"Successfully updated Airtable record {record_id}"
                                    )
                                    continue
                            # Document doesn't exist - create new one
                            # Generate document summary
                            user_llm = await get_user_long_context_llm(
                                session, user_id, search_space_id
@ -266,7 +342,7 @@ async def index_airtable_records(
                            if user_llm:
                                document_metadata = {
-                                    "record_id": record.get("id", "Unknown"),
+                                    "record_id": record_id,
                                    "created_time": record.get("CREATED_TIME()", ""),
                                    "document_type": "Airtable Record",
                                    "connector_type": "Airtable",
@ -279,7 +355,7 @@ async def index_airtable_records(
                                )
                            else:
                                # Fallback to simple summary if no LLM configured
-                                summary_content = f"Airtable Record: {record.get('id', 'Unknown')}\n\n"
+                                summary_content = f"Airtable Record: {record_id}\n\n"
                                summary_embedding = (
                                    config.embedding_model_instance.embed(
                                        summary_content
@ -291,18 +367,19 @@ async def index_airtable_records(
                            # Create and store new document
                            logger.info(
-                                f"Creating new document for Airtable record: {record.get('id', 'Unknown')}"
+                                f"Creating new document for Airtable record: {record_id}"
                            )
                            document = Document(
                                search_space_id=search_space_id,
-                                title=f"Airtable Record: {record.get('id', 'Unknown')}",
+                                title=f"Airtable Record: {record_id}",
                                document_type=DocumentType.AIRTABLE_CONNECTOR,
                                document_metadata={
-                                    "record_id": record.get("id", "Unknown"),
+                                    "record_id": record_id,
                                    "created_time": record.get("CREATED_TIME()", ""),
                                },
                                content=summary_content,
                                content_hash=content_hash,
                                unique_identifier_hash=unique_identifier_hash,
                                embedding=summary_embedding,
                                chunks=chunks,
                            )
--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@ -37,6 +37,30 @@ async def check_duplicate_document_by_hash(
    return existing_doc_result.scalars().first()
 async def check_document_by_unique_identifier(
    session: AsyncSession, unique_identifier_hash: str
 ) -> Document | None:
    """
    Check if a document with the given unique identifier hash already exists.
    Eagerly loads chunks to avoid lazy loading issues during updates.
    Args:
        session: Database session
        unique_identifier_hash: Hash of the unique identifier from the source system
    Returns:
        Existing document if found, None otherwise
    """
    from sqlalchemy.orm import selectinload
    existing_doc_result = await session.execute(
        select(Document)
        .options(selectinload(Document.chunks))
        .where(Document.unique_identifier_hash == unique_identifier_hash)
    )
    return existing_doc_result.scalars().first()
 async def get_connector_by_id(
    session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType
 ) -> SearchSourceConnector | None:
--- a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
@ -16,10 +16,11 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
-    check_duplicate_document_by_hash,
+    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -209,18 +210,92 @@ async def index_clickup_tasks(
                        documents_skipped += 1
                        continue
-                    # Hash for duplicates
+                    # Generate unique identifier hash for this ClickUp task
-                    content_hash = generate_content_hash(task_content, search_space_id)
+                    unique_identifier_hash = generate_unique_identifier_hash(
-                    existing_document_by_hash = await check_duplicate_document_by_hash(
+                        DocumentType.CLICKUP_CONNECTOR, task_id, search_space_id
                        session, content_hash
                    )
                    if existing_document_by_hash:
                        logger.info(
                            f"Document with content hash {content_hash} already exists for task {task_name}. Skipping processing."
                        )
                        documents_skipped += 1
                        continue
                    # Generate content hash
                    content_hash = generate_content_hash(task_content, search_space_id)
                    # Check if document with this unique identifier already exists
                    existing_document = await check_document_by_unique_identifier(
                        session, unique_identifier_hash
                    )
                    if existing_document:
                        # Document exists - check if content has changed
                        if existing_document.content_hash == content_hash:
                            logger.info(
                                f"Document for ClickUp task {task_name} unchanged. Skipping."
                            )
                            documents_skipped += 1
                            continue
                        else:
                            # Content has changed - update the existing document
                            logger.info(
                                f"Content changed for ClickUp task {task_name}. Updating document."
                            )
                            # Generate summary with metadata
                            user_llm = await get_user_long_context_llm(
                                session, user_id, search_space_id
                            )
                            if user_llm:
                                document_metadata = {
                                    "task_id": task_id,
                                    "task_name": task_name,
                                    "task_status": task_status,
                                    "task_priority": task_priority,
                                    "task_list": task_list_name,
                                    "task_space": task_space_name,
                                    "assignees": len(task_assignees),
                                    "document_type": "ClickUp Task",
                                    "connector_type": "ClickUp",
                                }
                                (
                                    summary_content,
                                    summary_embedding,
                                ) = await generate_document_summary(
                                    task_content, user_llm, document_metadata
                                )
                            else:
                                summary_content = task_content
                                summary_embedding = (
                                    config.embedding_model_instance.embed(task_content)
                                )
                            # Process chunks
                            chunks = await create_document_chunks(task_content)
                            # Update existing document
                            existing_document.title = f"Task - {task_name}"
                            existing_document.content = summary_content
                            existing_document.content_hash = content_hash
                            existing_document.embedding = summary_embedding
                            existing_document.document_metadata = {
                                "task_id": task_id,
                                "task_name": task_name,
                                "task_status": task_status,
                                "task_priority": task_priority,
                                "task_assignees": task_assignees,
                                "task_due_date": task_due_date,
                                "task_created": task_created,
                                "task_updated": task_updated,
                                "indexed_at": datetime.now().strftime(
                                    "%Y-%m-%d %H:%M:%S"
                                ),
                            }
                            existing_document.chunks = chunks
                            documents_indexed += 1
                            logger.info(
                                f"Successfully updated ClickUp task {task_name}"
                            )
                            continue
                    # Document doesn't exist - create new one
                    # Generate summary with metadata
                    user_llm = await get_user_long_context_llm(
                        session, user_id, search_space_id
@ -270,6 +345,7 @@ async def index_clickup_tasks(
                        },
                        content=summary_content,
                        content_hash=content_hash,
                        unique_identifier_hash=unique_identifier_hash,
                        embedding=summary_embedding,
                        chunks=chunks,
                    )
--- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
@ -16,11 +16,12 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
    calculate_date_range,
-    check_duplicate_document_by_hash,
+    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -217,26 +218,97 @@ async def index_confluence_pages(
                    documents_skipped += 1
                    continue
                # Generate unique identifier hash for this Confluence page
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.CONFLUENCE_CONNECTOR, page_id, search_space_id
                )
                # Generate content hash
                content_hash = generate_content_hash(full_content, search_space_id)
-                # Check if document already exists
+                # Check if document with this unique identifier already exists
-                existing_document_by_hash = await check_duplicate_document_by_hash(
+                existing_document = await check_document_by_unique_identifier(
-                    session, content_hash
+                    session, unique_identifier_hash
                )
-                if existing_document_by_hash:
+                comment_count = len(comments)
                    logger.info(
                        f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing."
                    )
                    documents_skipped += 1
                    continue
                if existing_document:
                    # Document exists - check if content has changed
                    if existing_document.content_hash == content_hash:
                        logger.info(
                            f"Document for Confluence page {page_title} unchanged. Skipping."
                        )
                        documents_skipped += 1
                        continue
                    else:
                        # Content has changed - update the existing document
                        logger.info(
                            f"Content changed for Confluence page {page_title}. Updating document."
                        )
                        # Generate summary with metadata
                        user_llm = await get_user_long_context_llm(
                            session, user_id, search_space_id
                        )
                        if user_llm:
                            document_metadata = {
                                "page_title": page_title,
                                "page_id": page_id,
                                "space_id": space_id,
                                "comment_count": comment_count,
                                "document_type": "Confluence Page",
                                "connector_type": "Confluence",
                            }
                            (
                                summary_content,
                                summary_embedding,
                            ) = await generate_document_summary(
                                full_content, user_llm, document_metadata
                            )
                        else:
                            summary_content = f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
                            if page_content:
                                content_preview = page_content[:1000]
                                if len(page_content) > 1000:
                                    content_preview += "..."
                                summary_content += (
                                    f"Content Preview: {content_preview}\n\n"
                                )
                            summary_content += f"Comments: {comment_count}"
                            summary_embedding = config.embedding_model_instance.embed(
                                summary_content
                            )
                        # Process chunks
                        chunks = await create_document_chunks(full_content)
                        # Update existing document
                        existing_document.title = f"Confluence - {page_title}"
                        existing_document.content = summary_content
                        existing_document.content_hash = content_hash
                        existing_document.embedding = summary_embedding
                        existing_document.document_metadata = {
                            "page_id": page_id,
                            "page_title": page_title,
                            "space_id": space_id,
                            "comment_count": comment_count,
                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        }
                        existing_document.chunks = chunks
                        documents_indexed += 1
                        logger.info(
                            f"Successfully updated Confluence page {page_title}"
                        )
                        continue
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
                )
                comment_count = len(comments)
                if user_llm:
                    document_metadata = {
@ -287,6 +359,7 @@ async def index_confluence_pages(
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )
--- a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
@ -16,11 +16,12 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
    build_document_metadata_string,
-    check_duplicate_document_by_hash,
+    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -307,23 +308,98 @@ async def index_discord_messages(
                        combined_document_string = build_document_metadata_string(
                            metadata_sections
                        )
                        # Generate unique identifier hash for this Discord channel
                        unique_identifier_hash = generate_unique_identifier_hash(
                            DocumentType.DISCORD_CONNECTOR, channel_id, search_space_id
                        )
                        # Generate content hash
                        content_hash = generate_content_hash(
                            combined_document_string, search_space_id
                        )
-                        # Skip duplicates by hash
+                        # Check if document with this unique identifier already exists
-                        existing_document_by_hash = (
+                        existing_document = await check_document_by_unique_identifier(
-                            await check_duplicate_document_by_hash(
+                            session, unique_identifier_hash
                                session, content_hash
                            )
                        )
                        if existing_document_by_hash:
                            logger.info(
                                f"Document with content hash {content_hash} already exists for channel {guild_name}#{channel_name}. Skipping processing."
                            )
                            documents_skipped += 1
                            continue
                        if existing_document:
                            # Document exists - check if content has changed
                            if existing_document.content_hash == content_hash:
                                logger.info(
                                    f"Document for Discord channel {guild_name}#{channel_name} unchanged. Skipping."
                                )
                                documents_skipped += 1
                                continue
                            else:
                                # Content has changed - update the existing document
                                logger.info(
                                    f"Content changed for Discord channel {guild_name}#{channel_name}. Updating document."
                                )
                                # Get user's long context LLM
                                user_llm = await get_user_long_context_llm(
                                    session, user_id, search_space_id
                                )
                                if not user_llm:
                                    logger.error(
                                        f"No long context LLM configured for user {user_id}"
                                    )
                                    skipped_channels.append(
                                        f"{guild_name}#{channel_name} (no LLM configured)"
                                    )
                                    documents_skipped += 1
                                    continue
                                # Generate summary with metadata
                                document_metadata = {
                                    "guild_name": guild_name,
                                    "channel_name": channel_name,
                                    "message_count": len(formatted_messages),
                                    "document_type": "Discord Channel Messages",
                                    "connector_type": "Discord",
                                }
                                (
                                    summary_content,
                                    summary_embedding,
                                ) = await generate_document_summary(
                                    combined_document_string,
                                    user_llm,
                                    document_metadata,
                                )
                                # Chunks from channel content
                                chunks = await create_document_chunks(channel_content)
                                # Update existing document
                                existing_document.title = (
                                    f"Discord - {guild_name}#{channel_name}"
                                )
                                existing_document.content = summary_content
                                existing_document.content_hash = content_hash
                                existing_document.embedding = summary_embedding
                                existing_document.document_metadata = {
                                    "guild_name": guild_name,
                                    "guild_id": guild_id,
                                    "channel_name": channel_name,
                                    "channel_id": channel_id,
                                    "message_count": len(formatted_messages),
                                    "start_date": start_date_iso,
                                    "end_date": end_date_iso,
                                    "indexed_at": datetime.now(UTC).strftime(
                                        "%Y-%m-%d %H:%M:%S"
                                    ),
                                }
                                existing_document.chunks = chunks
                                documents_indexed += 1
                                logger.info(
                                    f"Successfully updated Discord channel {guild_name}#{channel_name}"
                                )
                                continue
                        # Document doesn't exist - create new one
                        # Get user's long context LLM
                        user_llm = await get_user_long_context_llm(
                            session, user_id, search_space_id
@ -375,6 +451,7 @@ async def index_discord_messages(
                            },
                            content=summary_content,
                            content_hash=content_hash,
                            unique_identifier_hash=unique_identifier_hash,
                            embedding=summary_embedding,
                            chunks=chunks,
                        )
--- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
@ -16,10 +16,11 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
-    check_duplicate_document_by_hash,
+    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
 )
@ -199,19 +200,101 @@ async def index_github_repos(
                        )
                        continue  # Skip if content fetch failed
-                    content_hash = generate_content_hash(file_content, search_space_id)
+                    # Generate unique identifier hash for this GitHub file
-
+                    unique_identifier_hash = generate_unique_identifier_hash(
-                    # Check if document with this content hash already exists
+                        DocumentType.GITHUB_CONNECTOR, file_sha, search_space_id
                    existing_document_by_hash = await check_duplicate_document_by_hash(
                        session, content_hash
                    )
-                    if existing_document_by_hash:
+                    # Generate content hash
-                        logger.info(
+                    content_hash = generate_content_hash(file_content, search_space_id)
                            f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing."
                        )
                        continue
                    # Check if document with this unique identifier already exists
                    existing_document = await check_document_by_unique_identifier(
                        session, unique_identifier_hash
                    )
                    if existing_document:
                        # Document exists - check if content has changed
                        if existing_document.content_hash == content_hash:
                            logger.info(
                                f"Document for GitHub file {full_path_key} unchanged. Skipping."
                            )
                            continue
                        else:
                            # Content has changed - update the existing document
                            logger.info(
                                f"Content changed for GitHub file {full_path_key}. Updating document."
                            )
                            # Generate summary with metadata
                            user_llm = await get_user_long_context_llm(
                                session, user_id, search_space_id
                            )
                            if user_llm:
                                file_extension = (
                                    file_path.split(".")[-1]
                                    if "." in file_path
                                    else None
                                )
                                document_metadata = {
                                    "file_path": full_path_key,
                                    "repository": repo_full_name,
                                    "file_type": file_extension or "unknown",
                                    "document_type": "GitHub Repository File",
                                    "connector_type": "GitHub",
                                }
                                (
                                    summary_content,
                                    summary_embedding,
                                ) = await generate_document_summary(
                                    file_content, user_llm, document_metadata
                                )
                            else:
                                summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
                                summary_embedding = (
                                    config.embedding_model_instance.embed(
                                        summary_content
                                    )
                                )
                            # Chunk the content
                            try:
                                if hasattr(config, "code_chunker_instance"):
                                    chunks_data = [
                                        await create_document_chunks(file_content)
                                    ][0]
                                else:
                                    chunks_data = await create_document_chunks(
                                        file_content
                                    )
                            except Exception as chunk_err:
                                logger.error(
                                    f"Failed to chunk file {full_path_key}: {chunk_err}"
                                )
                                continue
                            # Update existing document
                            existing_document.title = f"GitHub - {full_path_key}"
                            existing_document.content = summary_content
                            existing_document.content_hash = content_hash
                            existing_document.embedding = summary_embedding
                            existing_document.document_metadata = {
                                "file_path": file_path,
                                "file_sha": file_sha,
                                "file_url": file_url,
                                "repository": repo_full_name,
                                "indexed_at": datetime.now(UTC).strftime(
                                    "%Y-%m-%d %H:%M:%S"
                                ),
                            }
                            existing_document.chunks = chunks_data
                            logger.info(
                                f"Successfully updated GitHub file {full_path_key}"
                            )
                            continue
                    # Document doesn't exist - create new one
                    # Generate summary with metadata
                    user_llm = await get_user_long_context_llm(
                        session, user_id, search_space_id
@ -290,6 +373,7 @@ async def index_github_repos(
                        document_metadata=doc_metadata,
                        content=summary_content,  # Store summary
                        content_hash=content_hash,
                        unique_identifier_hash=unique_identifier_hash,
                        embedding=summary_embedding,
                        search_space_id=search_space_id,
                        chunks=chunks_data,  # Associate chunks directly
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@ -17,9 +17,11 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -248,23 +250,99 @@ async def index_google_calendar_events(
                location = event.get("location", "")
                description = event.get("description", "")
                # Generate unique identifier hash for this Google Calendar event
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.GOOGLE_CALENDAR_CONNECTOR, event_id, search_space_id
                )
                # Generate content hash
                content_hash = generate_content_hash(event_markdown, search_space_id)
-                # Duplicate check via simple query using helper in base
+                # Check if document with this unique identifier already exists
-                from .base import (
+                existing_document = await check_document_by_unique_identifier(
-                    check_duplicate_document_by_hash,  # local import to avoid circular at module import
+                    session, unique_identifier_hash
                )
-                existing_document_by_hash = await check_duplicate_document_by_hash(
+                if existing_document:
-                    session, content_hash
+                    # Document exists - check if content has changed
-                )
+                    if existing_document.content_hash == content_hash:
-                if existing_document_by_hash:
+                        logger.info(
-                    logger.info(
+                            f"Document for Google Calendar event {event_summary} unchanged. Skipping."
-                        f"Document with content hash {content_hash} already exists for event {event_summary}. Skipping processing."
+                        )
-                    )
+                        documents_skipped += 1
-                    documents_skipped += 1
+                        continue
-                    continue
+                    else:
                        # Content has changed - update the existing document
                        logger.info(
                            f"Content changed for Google Calendar event {event_summary}. Updating document."
                        )
                        # Generate summary with metadata
                        user_llm = await get_user_long_context_llm(
                            session, user_id, search_space_id
                        )
                        if user_llm:
                            document_metadata = {
                                "event_id": event_id,
                                "event_summary": event_summary,
                                "calendar_id": calendar_id,
                                "start_time": start_time,
                                "end_time": end_time,
                                "location": location or "No location",
                                "document_type": "Google Calendar Event",
                                "connector_type": "Google Calendar",
                            }
                            (
                                summary_content,
                                summary_embedding,
                            ) = await generate_document_summary(
                                event_markdown, user_llm, document_metadata
                            )
                        else:
                            summary_content = (
                                f"Google Calendar Event: {event_summary}\n\n"
                            )
                            summary_content += f"Calendar: {calendar_id}\n"
                            summary_content += f"Start: {start_time}\n"
                            summary_content += f"End: {end_time}\n"
                            if location:
                                summary_content += f"Location: {location}\n"
                            if description:
                                desc_preview = description[:1000]
                                if len(description) > 1000:
                                    desc_preview += "..."
                                summary_content += f"Description: {desc_preview}\n"
                            summary_embedding = config.embedding_model_instance.embed(
                                summary_content
                            )
                        # Process chunks
                        chunks = await create_document_chunks(event_markdown)
                        # Update existing document
                        existing_document.title = f"Calendar Event - {event_summary}"
                        existing_document.content = summary_content
                        existing_document.content_hash = content_hash
                        existing_document.embedding = summary_embedding
                        existing_document.document_metadata = {
                            "event_id": event_id,
                            "event_summary": event_summary,
                            "calendar_id": calendar_id,
                            "start_time": start_time,
                            "end_time": end_time,
                            "location": location,
                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        }
                        existing_document.chunks = chunks
                        documents_indexed += 1
                        logger.info(
                            f"Successfully updated Google Calendar event {event_summary}"
                        )
                        continue
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
@ -320,6 +398,7 @@ async def index_google_calendar_events(
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )
--- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
@ -21,10 +21,11 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
-    check_duplicate_document_by_hash,
+    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -194,21 +195,85 @@ async def index_google_gmail_messages(
                    documents_skipped += 1
                    continue
                # Generate unique identifier hash for this Gmail message
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.GOOGLE_GMAIL_CONNECTOR, message_id, search_space_id
                )
                # Generate content hash
                content_hash = generate_content_hash(markdown_content, search_space_id)
-                # Check if document already exists
+                # Check if document with this unique identifier already exists
-                existing_document_by_hash = await check_duplicate_document_by_hash(
+                existing_document = await check_document_by_unique_identifier(
-                    session, content_hash
+                    session, unique_identifier_hash
                )
-                if existing_document_by_hash:
+                if existing_document:
-                    logger.info(
+                    # Document exists - check if content has changed
-                        f"Document with content hash {content_hash} already exists for message {message_id}. Skipping processing."
+                    if existing_document.content_hash == content_hash:
-                    )
+                        logger.info(
-                    documents_skipped += 1
+                            f"Document for Gmail message {subject} unchanged. Skipping."
-                    continue
+                        )
                        documents_skipped += 1
                        continue
                    else:
                        # Content has changed - update the existing document
                        logger.info(
                            f"Content changed for Gmail message {subject}. Updating document."
                        )
                        # Generate summary with metadata
                        user_llm = await get_user_long_context_llm(
                            session, user_id, search_space_id
                        )
                        if user_llm:
                            document_metadata = {
                                "message_id": message_id,
                                "thread_id": thread_id,
                                "subject": subject,
                                "sender": sender,
                                "date": date_str,
                                "document_type": "Gmail Message",
                                "connector_type": "Google Gmail",
                            }
                            (
                                summary_content,
                                summary_embedding,
                            ) = await generate_document_summary(
                                markdown_content, user_llm, document_metadata
                            )
                        else:
                            summary_content = f"Google Gmail Message: {subject}\n\n"
                            summary_content += f"Sender: {sender}\n"
                            summary_content += f"Date: {date_str}\n"
                            summary_embedding = config.embedding_model_instance.embed(
                                summary_content
                            )
                        # Process chunks
                        chunks = await create_document_chunks(markdown_content)
                        # Update existing document
                        existing_document.title = f"Gmail: {subject}"
                        existing_document.content = summary_content
                        existing_document.content_hash = content_hash
                        existing_document.embedding = summary_embedding
                        existing_document.document_metadata = {
                            "message_id": message_id,
                            "thread_id": thread_id,
                            "subject": subject,
                            "sender": sender,
                            "date": date_str,
                            "connector_id": connector_id,
                        }
                        existing_document.chunks = chunks
                        documents_indexed += 1
                        logger.info(f"Successfully updated Gmail message {subject}")
                        continue
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
@ -258,6 +323,7 @@ async def index_google_gmail_messages(
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )
--- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
@ -16,11 +16,12 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
    calculate_date_range,
-    check_duplicate_document_by_hash,
+    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -200,26 +201,96 @@ async def index_jira_issues(
                    documents_skipped += 1
                    continue
                # Generate unique identifier hash for this Jira issue
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.JIRA_CONNECTOR, issue_id, search_space_id
                )
                # Generate content hash
                content_hash = generate_content_hash(issue_content, search_space_id)
-                # Check if document already exists
+                # Check if document with this unique identifier already exists
-                existing_document_by_hash = await check_duplicate_document_by_hash(
+                existing_document = await check_document_by_unique_identifier(
-                    session, content_hash
+                    session, unique_identifier_hash
                )
-                if existing_document_by_hash:
+                comment_count = len(formatted_issue.get("comments", []))
                    logger.info(
                        f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing."
                    )
                    documents_skipped += 1
                    continue
                if existing_document:
                    # Document exists - check if content has changed
                    if existing_document.content_hash == content_hash:
                        logger.info(
                            f"Document for Jira issue {issue_identifier} unchanged. Skipping."
                        )
                        documents_skipped += 1
                        continue
                    else:
                        # Content has changed - update the existing document
                        logger.info(
                            f"Content changed for Jira issue {issue_identifier}. Updating document."
                        )
                        # Generate summary with metadata
                        user_llm = await get_user_long_context_llm(
                            session, user_id, search_space_id
                        )
                        if user_llm:
                            document_metadata = {
                                "issue_key": issue_identifier,
                                "issue_title": issue_title,
                                "status": formatted_issue.get("status", "Unknown"),
                                "priority": formatted_issue.get("priority", "Unknown"),
                                "comment_count": comment_count,
                                "document_type": "Jira Issue",
                                "connector_type": "Jira",
                            }
                            (
                                summary_content,
                                summary_embedding,
                            ) = await generate_document_summary(
                                issue_content, user_llm, document_metadata
                            )
                        else:
                            summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
                            if formatted_issue.get("description"):
                                summary_content += f"Description: {formatted_issue.get('description')}\n\n"
                            summary_content += f"Comments: {comment_count}"
                            summary_embedding = config.embedding_model_instance.embed(
                                summary_content
                            )
                        # Process chunks
                        chunks = await create_document_chunks(issue_content)
                        # Update existing document
                        existing_document.title = (
                            f"Jira - {issue_identifier}: {issue_title}"
                        )
                        existing_document.content = summary_content
                        existing_document.content_hash = content_hash
                        existing_document.embedding = summary_embedding
                        existing_document.document_metadata = {
                            "issue_id": issue_id,
                            "issue_identifier": issue_identifier,
                            "issue_title": issue_title,
                            "state": formatted_issue.get("status", "Unknown"),
                            "comment_count": comment_count,
                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        }
                        existing_document.chunks = chunks
                        documents_indexed += 1
                        logger.info(
                            f"Successfully updated Jira issue {issue_identifier}"
                        )
                        continue
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
                )
                comment_count = len(formatted_issue.get("comments", []))
                if user_llm:
                    document_metadata = {
@ -270,6 +341,7 @@ async def index_jira_issues(
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )
--- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
@ -16,11 +16,12 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
    calculate_date_range,
-    check_duplicate_document_by_hash,
+    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -213,27 +214,101 @@ async def index_linear_issues(
                    documents_skipped += 1
                    continue
-                content_hash = generate_content_hash(issue_content, search_space_id)
+                # Generate unique identifier hash for this Linear issue
-
+                unique_identifier_hash = generate_unique_identifier_hash(
-                # Check if document with this content hash already exists
+                    DocumentType.LINEAR_CONNECTOR, issue_id, search_space_id
                existing_document_by_hash = await check_duplicate_document_by_hash(
                    session, content_hash
                )
-                if existing_document_by_hash:
+                # Generate content hash
-                    logger.info(
+                content_hash = generate_content_hash(issue_content, search_space_id)
                        f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing."
                    )
                    documents_skipped += 1
                    continue
                # Check if document with this unique identifier already exists
                existing_document = await check_document_by_unique_identifier(
                    session, unique_identifier_hash
                )
                state = formatted_issue.get("state", "Unknown")
                description = formatted_issue.get("description", "")
                comment_count = len(formatted_issue.get("comments", []))
                if existing_document:
                    # Document exists - check if content has changed
                    if existing_document.content_hash == content_hash:
                        logger.info(
                            f"Document for Linear issue {issue_identifier} unchanged. Skipping."
                        )
                        documents_skipped += 1
                        continue
                    else:
                        # Content has changed - update the existing document
                        logger.info(
                            f"Content changed for Linear issue {issue_identifier}. Updating document."
                        )
                        # Generate summary with metadata
                        user_llm = await get_user_long_context_llm(
                            session, user_id, search_space_id
                        )
                        if user_llm:
                            document_metadata = {
                                "issue_id": issue_identifier,
                                "issue_title": issue_title,
                                "state": state,
                                "priority": formatted_issue.get("priority", "Unknown"),
                                "comment_count": comment_count,
                                "document_type": "Linear Issue",
                                "connector_type": "Linear",
                            }
                            (
                                summary_content,
                                summary_embedding,
                            ) = await generate_document_summary(
                                issue_content, user_llm, document_metadata
                            )
                        else:
                            # Fallback to simple summary if no LLM configured
                            if description and len(description) > 1000:
                                description = description[:997] + "..."
                            summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
                            if description:
                                summary_content += f"Description: {description}\n\n"
                            summary_content += f"Comments: {comment_count}"
                            summary_embedding = config.embedding_model_instance.embed(
                                summary_content
                            )
                        # Process chunks
                        chunks = await create_document_chunks(issue_content)
                        # Update existing document
                        existing_document.title = (
                            f"Linear - {issue_identifier}: {issue_title}"
                        )
                        existing_document.content = summary_content
                        existing_document.content_hash = content_hash
                        existing_document.embedding = summary_embedding
                        existing_document.document_metadata = {
                            "issue_id": issue_id,
                            "issue_identifier": issue_identifier,
                            "issue_title": issue_title,
                            "state": state,
                            "comment_count": comment_count,
                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        }
                        existing_document.chunks = chunks
                        documents_indexed += 1
                        logger.info(
                            f"Successfully updated Linear issue {issue_identifier}"
                        )
                        continue
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
                )
                state = formatted_issue.get("state", "Unknown")
                description = formatted_issue.get("description", "")
                comment_count = len(formatted_issue.get("comments", []))
                if user_llm:
                    document_metadata = {
@ -285,6 +360,7 @@ async def index_linear_issues(
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )
--- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
@ -16,9 +16,11 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -254,21 +256,108 @@ async def index_luma_events(
                description = event_data.get("description", "")
                cover_url = event_data.get("cover_url", "")
                # Generate unique identifier hash for this Luma event
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.LUMA_CONNECTOR, event_id, search_space_id
                )
                # Generate content hash
                content_hash = generate_content_hash(event_markdown, search_space_id)
-                # Duplicate check via simple query using helper in base
+                # Check if document with this unique identifier already exists
-                from .base import check_duplicate_document_by_hash
+                existing_document = await check_document_by_unique_identifier(
-
+                    session, unique_identifier_hash
                existing_document_by_hash = await check_duplicate_document_by_hash(
                    session, content_hash
                )
                if existing_document_by_hash:
                    logger.info(
                        f"Document with content hash {content_hash} already exists for event {event_name}. Skipping processing."
                    )
                    documents_skipped += 1
                    continue
                if existing_document:
                    # Document exists - check if content has changed
                    if existing_document.content_hash == content_hash:
                        logger.info(
                            f"Document for Luma event {event_name} unchanged. Skipping."
                        )
                        documents_skipped += 1
                        continue
                    else:
                        # Content has changed - update the existing document
                        logger.info(
                            f"Content changed for Luma event {event_name}. Updating document."
                        )
                        # Generate summary with metadata
                        user_llm = await get_user_long_context_llm(
                            session, user_id, search_space_id
                        )
                        if user_llm:
                            document_metadata = {
                                "event_id": event_id,
                                "event_name": event_name,
                                "event_url": event_url,
                                "start_at": start_at,
                                "end_at": end_at,
                                "timezone": timezone,
                                "location": location or "No location",
                                "city": city,
                                "hosts": host_names,
                                "document_type": "Luma Event",
                                "connector_type": "Luma",
                            }
                            (
                                summary_content,
                                summary_embedding,
                            ) = await generate_document_summary(
                                event_markdown, user_llm, document_metadata
                            )
                        else:
                            summary_content = f"Luma Event: {event_name}\n\n"
                            if event_url:
                                summary_content += f"URL: {event_url}\n"
                            summary_content += f"Start: {start_at}\n"
                            summary_content += f"End: {end_at}\n"
                            if timezone:
                                summary_content += f"Timezone: {timezone}\n"
                            if location:
                                summary_content += f"Location: {location}\n"
                            if city:
                                summary_content += f"City: {city}\n"
                            if host_names:
                                summary_content += f"Hosts: {host_names}\n"
                            if description:
                                desc_preview = description[:1000]
                                if len(description) > 1000:
                                    desc_preview += "..."
                                summary_content += f"Description: {desc_preview}\n"
                            summary_embedding = config.embedding_model_instance.embed(
                                summary_content
                            )
                        # Process chunks
                        chunks = await create_document_chunks(event_markdown)
                        # Update existing document
                        existing_document.title = f"Luma Event - {event_name}"
                        existing_document.content = summary_content
                        existing_document.content_hash = content_hash
                        existing_document.embedding = summary_embedding
                        existing_document.document_metadata = {
                            "event_id": event_id,
                            "event_name": event_name,
                            "event_url": event_url,
                            "start_at": start_at,
                            "end_at": end_at,
                            "timezone": timezone,
                            "location": location,
                            "city": city,
                            "hosts": host_names,
                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        }
                        existing_document.chunks = chunks
                        documents_indexed += 1
                        logger.info(f"Successfully updated Luma event {event_name}")
                        continue
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
@ -340,6 +429,7 @@ async def index_luma_events(
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )
--- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
@ -15,11 +15,12 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
    build_document_metadata_string,
-    check_duplicate_document_by_hash,
+    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -282,22 +283,82 @@ async def index_notion_pages(
                combined_document_string = build_document_metadata_string(
                    metadata_sections
                )
                # Generate unique identifier hash for this Notion page
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.NOTION_CONNECTOR, page_id, search_space_id
                )
                # Generate content hash
                content_hash = generate_content_hash(
                    combined_document_string, search_space_id
                )
-                # Check if document with this content hash already exists
+                # Check if document with this unique identifier already exists
-                existing_document_by_hash = await check_duplicate_document_by_hash(
+                existing_document = await check_document_by_unique_identifier(
-                    session, content_hash
+                    session, unique_identifier_hash
                )
-                if existing_document_by_hash:
+                if existing_document:
-                    logger.info(
+                    # Document exists - check if content has changed
-                        f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing."
+                    if existing_document.content_hash == content_hash:
-                    )
+                        logger.info(
-                    documents_skipped += 1
+                            f"Document for Notion page {page_title} unchanged. Skipping."
-                    continue
+                        )
                        documents_skipped += 1
                        continue
                    else:
                        # Content has changed - update the existing document
                        logger.info(
                            f"Content changed for Notion page {page_title}. Updating document."
                        )
                        # Get user's long context LLM
                        user_llm = await get_user_long_context_llm(
                            session, user_id, search_space_id
                        )
                        if not user_llm:
                            logger.error(
                                f"No long context LLM configured for user {user_id}"
                            )
                            skipped_pages.append(f"{page_title} (no LLM configured)")
                            documents_skipped += 1
                            continue
                        # Generate summary with metadata
                        document_metadata = {
                            "page_title": page_title,
                            "page_id": page_id,
                            "document_type": "Notion Page",
                            "connector_type": "Notion",
                        }
                        (
                            summary_content,
                            summary_embedding,
                        ) = await generate_document_summary(
                            markdown_content, user_llm, document_metadata
                        )
                        # Process chunks
                        chunks = await create_document_chunks(markdown_content)
                        # Update existing document
                        existing_document.title = f"Notion - {page_title}"
                        existing_document.content = summary_content
                        existing_document.content_hash = content_hash
                        existing_document.embedding = summary_embedding
                        existing_document.document_metadata = {
                            "page_title": page_title,
                            "page_id": page_id,
                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        }
                        existing_document.chunks = chunks
                        documents_indexed += 1
                        logger.info(f"Successfully updated Notion page: {page_title}")
                        continue
                # Document doesn't exist - create new one
                # Get user's long context LLM
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
@ -336,6 +397,7 @@ async def index_notion_pages(
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )
--- a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
@ -15,12 +15,13 @@ from app.services.task_logging_service import TaskLoggingService
 from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_unique_identifier_hash,
 )
 from .base import (
    build_document_metadata_markdown,
    calculate_date_range,
-    check_duplicate_document_by_hash,
+    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -235,6 +236,7 @@ async def index_slack_messages(
                for msg in formatted_messages:
                    timestamp = msg.get("datetime", "Unknown Time")
                    msg_ts = msg.get("ts", timestamp)  # Get original Slack timestamp
                    msg_user_name = msg.get("user_name", "Unknown User")
                    msg_user_email = msg.get("user_email", "Unknown Email")
                    msg_text = msg.get("text", "")
@ -261,22 +263,68 @@ async def index_slack_messages(
                    combined_document_string = build_document_metadata_markdown(
                        metadata_sections
                    )
                    # Generate unique identifier hash for this Slack message
                    unique_identifier = f"{channel_id}_{msg_ts}"
                    unique_identifier_hash = generate_unique_identifier_hash(
                        DocumentType.SLACK_CONNECTOR, unique_identifier, search_space_id
                    )
                    # Generate content hash
                    content_hash = generate_content_hash(
                        combined_document_string, search_space_id
                    )
-                    # Check if document with this content hash already exists
+                    # Check if document with this unique identifier already exists
-                    existing_document_by_hash = await check_duplicate_document_by_hash(
+                    existing_document = await check_document_by_unique_identifier(
-                        session, content_hash
+                        session, unique_identifier_hash
                    )
-                    if existing_document_by_hash:
+                    if existing_document:
-                        logger.info(
+                        # Document exists - check if content has changed
-                            f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing."
+                        if existing_document.content_hash == content_hash:
-                        )
+                            logger.info(
-                        documents_skipped += 1
+                                f"Document for Slack message {msg_ts} in channel {channel_name} unchanged. Skipping."
-                        continue
+                            )
                            documents_skipped += 1
                            continue
                        else:
                            # Content has changed - update the existing document
                            logger.info(
                                f"Content changed for Slack message {msg_ts} in channel {channel_name}. Updating document."
                            )
                            # Update chunks and embedding
                            chunks = await create_document_chunks(
                                combined_document_string
                            )
                            doc_embedding = config.embedding_model_instance.embed(
                                combined_document_string
                            )
                            # Update existing document
                            existing_document.content = combined_document_string
                            existing_document.content_hash = content_hash
                            existing_document.embedding = doc_embedding
                            existing_document.document_metadata = {
                                "channel_name": channel_name,
                                "channel_id": channel_id,
                                "start_date": start_date_str,
                                "end_date": end_date_str,
                                "message_count": len(formatted_messages),
                                "indexed_at": datetime.now().strftime(
                                    "%Y-%m-%d %H:%M:%S"
                                ),
                            }
                            # Delete old chunks and add new ones
                            existing_document.chunks = chunks
                            documents_indexed += 1
                            logger.info(f"Successfully updated Slack message {msg_ts}")
                            continue
                    # Document doesn't exist - create new one
                    # Process chunks
                    chunks = await create_document_chunks(combined_document_string)
                    doc_embedding = config.embedding_model_instance.embed(
@ -300,6 +348,7 @@ async def index_slack_messages(
                        embedding=doc_embedding,
                        chunks=chunks,
                        content_hash=content_hash,
                        unique_identifier_hash=unique_identifier_hash,
                    )
                    session.add(document)
--- a/surfsense_backend/app/tasks/document_processors/base.py
+++ b/surfsense_backend/app/tasks/document_processors/base.py
@ -29,3 +29,27 @@ async def check_duplicate_document(
        select(Document).where(Document.content_hash == content_hash)
    )
    return existing_doc_result.scalars().first()
 async def check_document_by_unique_identifier(
    session: AsyncSession, unique_identifier_hash: str
 ) -> Document | None:
    """
    Check if a document with the given unique identifier hash already exists.
    Eagerly loads chunks to avoid lazy loading issues during updates.
    Args:
        session: Database session
        unique_identifier_hash: Hash of the unique identifier from the source
    Returns:
        Existing document if found, None otherwise
    """
    from sqlalchemy.orm import selectinload
    existing_doc_result = await session.execute(
        select(Document)
        .options(selectinload(Document.chunks))
        .where(Document.unique_identifier_hash == unique_identifier_hash)
    )
    return existing_doc_result.scalars().first()
--- a/surfsense_backend/app/tasks/document_processors/extension_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/extension_processor.py
@ -15,10 +15,11 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
-    check_duplicate_document,
+    check_document_by_unique_identifier,
 )
@ -85,25 +86,42 @@ async def add_extension_received_document(
        document_parts.append("</DOCUMENT>")
        combined_document_string = "\n".join(document_parts)
        # Generate unique identifier hash for this extension document (using URL)
        unique_identifier_hash = generate_unique_identifier_hash(
            DocumentType.EXTENSION, content.metadata.VisitedWebPageURL, search_space_id
        )
        # Generate content hash
        content_hash = generate_content_hash(combined_document_string, search_space_id)
-        # Check if document with this content hash already exists
+        # Check if document with this unique identifier already exists
-        existing_document = await check_duplicate_document(session, content_hash)
+        existing_document = await check_document_by_unique_identifier(
-        if existing_document:
+            session, unique_identifier_hash
-            await task_logger.log_task_success(
+        )
                log_entry,
                f"Extension document already exists: {content.metadata.VisitedWebPageTitle}",
                {
                    "duplicate_detected": True,
                    "existing_document_id": existing_document.id,
                },
            )
            logging.info(
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document
-        # Get user's long context LLM
+        if existing_document:
            # Document exists - check if content has changed
            if existing_document.content_hash == content_hash:
                await task_logger.log_task_success(
                    log_entry,
                    f"Extension document unchanged: {content.metadata.VisitedWebPageTitle}",
                    {
                        "duplicate_detected": True,
                        "existing_document_id": existing_document.id,
                    },
                )
                logging.info(
                    f"Document for URL {content.metadata.VisitedWebPageURL} unchanged. Skipping."
                )
                return existing_document
            else:
                # Content has changed - update the existing document
                logging.info(
                    f"Content changed for URL {content.metadata.VisitedWebPageURL}. Updating document."
                )
        # Get user's long context LLM (needed for both create and update)
        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
        if not user_llm:
            raise RuntimeError(
@ -127,21 +145,36 @@ async def add_extension_received_document(
        # Process chunks
        chunks = await create_document_chunks(content.pageContent)
-        # Create and store document
+        # Update or create document
-        document = Document(
+        if existing_document:
-            search_space_id=search_space_id,
+            # Update existing document
-            title=content.metadata.VisitedWebPageTitle,
+            existing_document.title = content.metadata.VisitedWebPageTitle
-            document_type=DocumentType.EXTENSION,
+            existing_document.content = summary_content
-            document_metadata=content.metadata.model_dump(),
+            existing_document.content_hash = content_hash
-            content=summary_content,
+            existing_document.embedding = summary_embedding
-            embedding=summary_embedding,
+            existing_document.document_metadata = content.metadata.model_dump()
-            chunks=chunks,
+            existing_document.chunks = chunks
            content_hash=content_hash,
        )
-        session.add(document)
+            await session.commit()
-        await session.commit()
+            await session.refresh(existing_document)
-        await session.refresh(document)
+            document = existing_document
        else:
            # Create new document
            document = Document(
                search_space_id=search_space_id,
                title=content.metadata.VisitedWebPageTitle,
                document_type=DocumentType.EXTENSION,
                document_metadata=content.metadata.model_dump(),
                content=summary_content,
                embedding=summary_embedding,
                chunks=chunks,
                content_hash=content_hash,
                unique_identifier_hash=unique_identifier_hash,
            )
            session.add(document)
            await session.commit()
            await session.refresh(document)
        # Log success
        await task_logger.log_task_success(
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -15,10 +15,11 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
-    check_duplicate_document,
+    check_document_by_unique_identifier,
 )
@ -47,19 +48,31 @@ async def add_received_file_document_using_unstructured(
            unstructured_processed_elements
        )
        # Generate unique identifier hash for this file
        unique_identifier_hash = generate_unique_identifier_hash(
            DocumentType.FILE, file_name, search_space_id
        )
        # Generate content hash
        content_hash = generate_content_hash(file_in_markdown, search_space_id)
-        # Check if document with this content hash already exists
+        # Check if document with this unique identifier already exists
-        existing_document = await check_duplicate_document(session, content_hash)
+        existing_document = await check_document_by_unique_identifier(
            session, unique_identifier_hash
        )
        if existing_document:
-            logging.info(
+            # Document exists - check if content has changed
-                f"Document with content hash {content_hash} already exists. Skipping processing."
+            if existing_document.content_hash == content_hash:
-            )
+                logging.info(f"Document for file {file_name} unchanged. Skipping.")
-            return existing_document
+                return existing_document
            else:
                # Content has changed - update the existing document
                logging.info(
                    f"Content changed for file {file_name}. Updating document."
                )
-        # TODO: Check if file_markdown exceeds token limit of embedding model
+        # Get user's long context LLM (needed for both create and update)
        # Get user's long context LLM
        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
        if not user_llm:
            raise RuntimeError(
@ -79,24 +92,42 @@ async def add_received_file_document_using_unstructured(
        # Process chunks
        chunks = await create_document_chunks(file_in_markdown)
-        # Create and store document
+        # Update or create document
-        document = Document(
+        if existing_document:
-            search_space_id=search_space_id,
+            # Update existing document
-            title=file_name,
+            existing_document.title = file_name
-            document_type=DocumentType.FILE,
+            existing_document.content = summary_content
-            document_metadata={
+            existing_document.content_hash = content_hash
            existing_document.embedding = summary_embedding
            existing_document.document_metadata = {
                "FILE_NAME": file_name,
                "ETL_SERVICE": "UNSTRUCTURED",
-            },
+            }
-            content=summary_content,
+            existing_document.chunks = chunks
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )
-        session.add(document)
+            await session.commit()
-        await session.commit()
+            await session.refresh(existing_document)
-        await session.refresh(document)
+            document = existing_document
        else:
            # Create new document
            document = Document(
                search_space_id=search_space_id,
                title=file_name,
                document_type=DocumentType.FILE,
                document_metadata={
                    "FILE_NAME": file_name,
                    "ETL_SERVICE": "UNSTRUCTURED",
                },
                content=summary_content,
                embedding=summary_embedding,
                chunks=chunks,
                content_hash=content_hash,
                unique_identifier_hash=unique_identifier_hash,
            )
            session.add(document)
            await session.commit()
            await session.refresh(document)
        return document
    except SQLAlchemyError as db_error:
@ -131,17 +162,31 @@ async def add_received_file_document_using_llamacloud(
        # Combine all markdown documents into one
        file_in_markdown = llamacloud_markdown_document
        # Generate unique identifier hash for this file
        unique_identifier_hash = generate_unique_identifier_hash(
            DocumentType.FILE, file_name, search_space_id
        )
        # Generate content hash
        content_hash = generate_content_hash(file_in_markdown, search_space_id)
-        # Check if document with this content hash already exists
+        # Check if document with this unique identifier already exists
-        existing_document = await check_duplicate_document(session, content_hash)
+        existing_document = await check_document_by_unique_identifier(
-        if existing_document:
+            session, unique_identifier_hash
-            logging.info(
+        )
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document
-        # Get user's long context LLM
+        if existing_document:
            # Document exists - check if content has changed
            if existing_document.content_hash == content_hash:
                logging.info(f"Document for file {file_name} unchanged. Skipping.")
                return existing_document
            else:
                # Content has changed - update the existing document
                logging.info(
                    f"Content changed for file {file_name}. Updating document."
                )
        # Get user's long context LLM (needed for both create and update)
        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
        if not user_llm:
            raise RuntimeError(
@ -161,24 +206,42 @@ async def add_received_file_document_using_llamacloud(
        # Process chunks
        chunks = await create_document_chunks(file_in_markdown)
-        # Create and store document
+        # Update or create document
-        document = Document(
+        if existing_document:
-            search_space_id=search_space_id,
+            # Update existing document
-            title=file_name,
+            existing_document.title = file_name
-            document_type=DocumentType.FILE,
+            existing_document.content = summary_content
-            document_metadata={
+            existing_document.content_hash = content_hash
            existing_document.embedding = summary_embedding
            existing_document.document_metadata = {
                "FILE_NAME": file_name,
                "ETL_SERVICE": "LLAMACLOUD",
-            },
+            }
-            content=summary_content,
+            existing_document.chunks = chunks
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )
-        session.add(document)
+            await session.commit()
-        await session.commit()
+            await session.refresh(existing_document)
-        await session.refresh(document)
+            document = existing_document
        else:
            # Create new document
            document = Document(
                search_space_id=search_space_id,
                title=file_name,
                document_type=DocumentType.FILE,
                document_metadata={
                    "FILE_NAME": file_name,
                    "ETL_SERVICE": "LLAMACLOUD",
                },
                content=summary_content,
                embedding=summary_embedding,
                chunks=chunks,
                content_hash=content_hash,
                unique_identifier_hash=unique_identifier_hash,
            )
            session.add(document)
            await session.commit()
            await session.refresh(document)
        return document
    except SQLAlchemyError as db_error:
@ -214,17 +277,31 @@ async def add_received_file_document_using_docling(
    try:
        file_in_markdown = docling_markdown_document
        # Generate unique identifier hash for this file
        unique_identifier_hash = generate_unique_identifier_hash(
            DocumentType.FILE, file_name, search_space_id
        )
        # Generate content hash
        content_hash = generate_content_hash(file_in_markdown, search_space_id)
-        # Check if document with this content hash already exists
+        # Check if document with this unique identifier already exists
-        existing_document = await check_duplicate_document(session, content_hash)
+        existing_document = await check_document_by_unique_identifier(
-        if existing_document:
+            session, unique_identifier_hash
-            logging.info(
+        )
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document
-        # Get user's long context LLM
+        if existing_document:
            # Document exists - check if content has changed
            if existing_document.content_hash == content_hash:
                logging.info(f"Document for file {file_name} unchanged. Skipping.")
                return existing_document
            else:
                # Content has changed - update the existing document
                logging.info(
                    f"Content changed for file {file_name}. Updating document."
                )
        # Get user's long context LLM (needed for both create and update)
        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
        if not user_llm:
            raise RuntimeError(
@ -268,20 +345,38 @@ async def add_received_file_document_using_docling(
        # Process chunks
        chunks = await create_document_chunks(file_in_markdown)
-        # Create and store document
+        # Update or create document
-        document = Document(
+        if existing_document:
-            search_space_id=search_space_id,
+            # Update existing document
-            title=file_name,
+            existing_document.title = file_name
-            document_type=DocumentType.FILE,
+            existing_document.content = enhanced_summary_content
-            document_metadata={
+            existing_document.content_hash = content_hash
            existing_document.embedding = summary_embedding
            existing_document.document_metadata = {
                "FILE_NAME": file_name,
                "ETL_SERVICE": "DOCLING",
-            },
+            }
-            content=enhanced_summary_content,
+            existing_document.chunks = chunks
-            embedding=summary_embedding,
+
-            chunks=chunks,
+            await session.commit()
-            content_hash=content_hash,
+            await session.refresh(existing_document)
-        )
+            document = existing_document
        else:
            # Create new document
            document = Document(
                search_space_id=search_space_id,
                title=file_name,
                document_type=DocumentType.FILE,
                document_metadata={
                    "FILE_NAME": file_name,
                    "ETL_SERVICE": "DOCLING",
                },
                content=enhanced_summary_content,
                embedding=summary_embedding,
                chunks=chunks,
                content_hash=content_hash,
                unique_identifier_hash=unique_identifier_hash,
            )
        session.add(document)
        await session.commit()
--- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
@ -14,10 +14,11 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
-    check_duplicate_document,
+    check_document_by_unique_identifier,
 )
@ -56,25 +57,41 @@ async def add_received_markdown_file_document(
    )
    try:
        # Generate unique identifier hash for this markdown file
        unique_identifier_hash = generate_unique_identifier_hash(
            DocumentType.FILE, file_name, search_space_id
        )
        # Generate content hash
        content_hash = generate_content_hash(file_in_markdown, search_space_id)
-        # Check if document with this content hash already exists
+        # Check if document with this unique identifier already exists
-        existing_document = await check_duplicate_document(session, content_hash)
+        existing_document = await check_document_by_unique_identifier(
-        if existing_document:
+            session, unique_identifier_hash
-            await task_logger.log_task_success(
+        )
                log_entry,
                f"Markdown file document already exists: {file_name}",
                {
                    "duplicate_detected": True,
                    "existing_document_id": existing_document.id,
                },
            )
            logging.info(
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document
-        # Get user's long context LLM
+        if existing_document:
            # Document exists - check if content has changed
            if existing_document.content_hash == content_hash:
                await task_logger.log_task_success(
                    log_entry,
                    f"Markdown file document unchanged: {file_name}",
                    {
                        "duplicate_detected": True,
                        "existing_document_id": existing_document.id,
                    },
                )
                logging.info(
                    f"Document for markdown file {file_name} unchanged. Skipping."
                )
                return existing_document
            else:
                # Content has changed - update the existing document
                logging.info(
                    f"Content changed for markdown file {file_name}. Updating document."
                )
        # Get user's long context LLM (needed for both create and update)
        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
        if not user_llm:
            raise RuntimeError(
@ -93,23 +110,40 @@ async def add_received_markdown_file_document(
        # Process chunks
        chunks = await create_document_chunks(file_in_markdown)
-        # Create and store document
+        # Update or create document
-        document = Document(
+        if existing_document:
-            search_space_id=search_space_id,
+            # Update existing document
-            title=file_name,
+            existing_document.title = file_name
-            document_type=DocumentType.FILE,
+            existing_document.content = summary_content
-            document_metadata={
+            existing_document.content_hash = content_hash
            existing_document.embedding = summary_embedding
            existing_document.document_metadata = {
                "FILE_NAME": file_name,
-            },
+            }
-            content=summary_content,
+            existing_document.chunks = chunks
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )
-        session.add(document)
+            await session.commit()
-        await session.commit()
+            await session.refresh(existing_document)
-        await session.refresh(document)
+            document = existing_document
        else:
            # Create new document
            document = Document(
                search_space_id=search_space_id,
                title=file_name,
                document_type=DocumentType.FILE,
                document_metadata={
                    "FILE_NAME": file_name,
                },
                content=summary_content,
                embedding=summary_embedding,
                chunks=chunks,
                content_hash=content_hash,
                unique_identifier_hash=unique_identifier_hash,
            )
            session.add(document)
            await session.commit()
            await session.refresh(document)
        # Log success
        await task_logger.log_task_success(
--- a/surfsense_backend/app/tasks/document_processors/url_crawler.py
+++ b/surfsense_backend/app/tasks/document_processors/url_crawler.py
@ -17,10 +17,11 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
-    check_duplicate_document,
+    check_document_by_unique_identifier,
    md,
 )
@ -129,31 +130,49 @@ async def add_crawled_url_document(
        document_parts.append("</DOCUMENT>")
        combined_document_string = "\n".join(document_parts)
        content_hash = generate_content_hash(combined_document_string, search_space_id)
-        # Check for duplicates
+        # Generate unique identifier hash for this URL
-        await task_logger.log_task_progress(
+        unique_identifier_hash = generate_unique_identifier_hash(
-            log_entry,
+            DocumentType.CRAWLED_URL, url, search_space_id
            f"Checking for duplicate content: {url}",
            {"stage": "duplicate_check", "content_hash": content_hash},
        )
-        existing_document = await check_duplicate_document(session, content_hash)
+        # Generate content hash
-        if existing_document:
+        content_hash = generate_content_hash(combined_document_string, search_space_id)
            await task_logger.log_task_success(
                log_entry,
                f"Document already exists for URL: {url}",
                {
                    "duplicate_detected": True,
                    "existing_document_id": existing_document.id,
                },
            )
            logging.info(
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document
-        # Get LLM for summary generation
+        # Check if document with this unique identifier already exists
        await task_logger.log_task_progress(
            log_entry,
            f"Checking for existing URL: {url}",
            {"stage": "duplicate_check", "url": url},
        )
        existing_document = await check_document_by_unique_identifier(
            session, unique_identifier_hash
        )
        if existing_document:
            # Document exists - check if content has changed
            if existing_document.content_hash == content_hash:
                await task_logger.log_task_success(
                    log_entry,
                    f"URL document unchanged: {url}",
                    {
                        "duplicate_detected": True,
                        "existing_document_id": existing_document.id,
                    },
                )
                logging.info(f"Document for URL {url} unchanged. Skipping.")
                return existing_document
            else:
                # Content has changed - update the existing document
                logging.info(f"Content changed for URL {url}. Updating document.")
                await task_logger.log_task_progress(
                    log_entry,
                    f"Updating URL document: {url}",
                    {"stage": "document_update", "url": url},
                )
        # Get LLM for summary generation (needed for both create and update)
        await task_logger.log_task_progress(
            log_entry,
            f"Preparing for summary generation: {url}",
@ -194,27 +213,50 @@ async def add_crawled_url_document(
        chunks = await create_document_chunks(content_in_markdown)
-        # Create and store document
+        # Update or create document
-        await task_logger.log_task_progress(
+        if existing_document:
-            log_entry,
+            # Update existing document
-            f"Creating document in database for URL: {url}",
+            await task_logger.log_task_progress(
-            {"stage": "document_creation", "chunks_count": len(chunks)},
+                log_entry,
-        )
+                f"Updating document in database for URL: {url}",
                {"stage": "document_update", "chunks_count": len(chunks)},
            )
-        document = Document(
+            existing_document.title = (
-            search_space_id=search_space_id,
+                url_crawled[0].metadata["title"]
-            title=url_crawled[0].metadata["title"]
+                if isinstance(crawl_loader, FireCrawlLoader)
-            if isinstance(crawl_loader, FireCrawlLoader)
+                else url_crawled[0].metadata["source"]
-            else url_crawled[0].metadata["source"],
+            )
-            document_type=DocumentType.CRAWLED_URL,
+            existing_document.content = summary_content
-            document_metadata=url_crawled[0].metadata,
+            existing_document.content_hash = content_hash
-            content=summary_content,
+            existing_document.embedding = summary_embedding
-            embedding=summary_embedding,
+            existing_document.document_metadata = url_crawled[0].metadata
-            chunks=chunks,
+            existing_document.chunks = chunks
            content_hash=content_hash,
        )
-        session.add(document)
+            document = existing_document
        else:
            # Create new document
            await task_logger.log_task_progress(
                log_entry,
                f"Creating document in database for URL: {url}",
                {"stage": "document_creation", "chunks_count": len(chunks)},
            )
            document = Document(
                search_space_id=search_space_id,
                title=url_crawled[0].metadata["title"]
                if isinstance(crawl_loader, FireCrawlLoader)
                else url_crawled[0].metadata["source"],
                document_type=DocumentType.CRAWLED_URL,
                document_metadata=url_crawled[0].metadata,
                content=summary_content,
                embedding=summary_embedding,
                chunks=chunks,
                content_hash=content_hash,
                unique_identifier_hash=unique_identifier_hash,
            )
            session.add(document)
        await session.commit()
        await session.refresh(document)
--- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py
@ -17,10 +17,11 @@ from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from .base import (
-    check_duplicate_document,
+    check_document_by_unique_identifier,
 )
@ -201,32 +202,54 @@ async def add_youtube_video_document(
        document_parts.append("</DOCUMENT>")
        combined_document_string = "\n".join(document_parts)
        content_hash = generate_content_hash(combined_document_string, search_space_id)
-        # Check for duplicates
+        # Generate unique identifier hash for this YouTube video
-        await task_logger.log_task_progress(
+        unique_identifier_hash = generate_unique_identifier_hash(
-            log_entry,
+            DocumentType.YOUTUBE_VIDEO, video_id, search_space_id
            f"Checking for duplicate video content: {video_id}",
            {"stage": "duplicate_check", "content_hash": content_hash},
        )
-        existing_document = await check_duplicate_document(session, content_hash)
+        # Generate content hash
-        if existing_document:
+        content_hash = generate_content_hash(combined_document_string, search_space_id)
            await task_logger.log_task_success(
                log_entry,
                f"YouTube video document already exists: {video_data.get('title', 'YouTube Video')}",
                {
                    "duplicate_detected": True,
                    "existing_document_id": existing_document.id,
                    "video_id": video_id,
                },
            )
            logging.info(
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document
-        # Get LLM for summary generation
+        # Check if document with this unique identifier already exists
        await task_logger.log_task_progress(
            log_entry,
            f"Checking for existing video: {video_id}",
            {"stage": "duplicate_check", "video_id": video_id},
        )
        existing_document = await check_document_by_unique_identifier(
            session, unique_identifier_hash
        )
        if existing_document:
            # Document exists - check if content has changed
            if existing_document.content_hash == content_hash:
                await task_logger.log_task_success(
                    log_entry,
                    f"YouTube video document unchanged: {video_data.get('title', 'YouTube Video')}",
                    {
                        "duplicate_detected": True,
                        "existing_document_id": existing_document.id,
                        "video_id": video_id,
                    },
                )
                logging.info(
                    f"Document for YouTube video {video_id} unchanged. Skipping."
                )
                return existing_document
            else:
                # Content has changed - update the existing document
                logging.info(
                    f"Content changed for YouTube video {video_id}. Updating document."
                )
                await task_logger.log_task_progress(
                    log_entry,
                    f"Updating YouTube video document: {video_data.get('title', 'YouTube Video')}",
                    {"stage": "document_update", "video_id": video_id},
                )
        # Get LLM for summary generation (needed for both create and update)
        await task_logger.log_task_progress(
            log_entry,
            f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
@ -270,33 +293,60 @@ async def add_youtube_video_document(
        chunks = await create_document_chunks(combined_document_string)
-        # Create document
+        # Update or create document
-        await task_logger.log_task_progress(
+        if existing_document:
-            log_entry,
+            # Update existing document
-            f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
+            await task_logger.log_task_progress(
-            {"stage": "document_creation", "chunks_count": len(chunks)},
+                log_entry,
-        )
+                f"Updating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
                {"stage": "document_update", "chunks_count": len(chunks)},
            )
-        document = Document(
+            existing_document.title = video_data.get("title", "YouTube Video")
-            title=video_data.get("title", "YouTube Video"),
+            existing_document.content = summary_content
-            document_type=DocumentType.YOUTUBE_VIDEO,
+            existing_document.content_hash = content_hash
-            document_metadata={
+            existing_document.embedding = summary_embedding
            existing_document.document_metadata = {
                "url": url,
                "video_id": video_id,
                "video_title": video_data.get("title", "YouTube Video"),
                "author": video_data.get("author_name", "Unknown"),
                "thumbnail": video_data.get("thumbnail_url", ""),
-            },
+            }
-            content=summary_content,
+            existing_document.chunks = chunks
            embedding=summary_embedding,
            chunks=chunks,
            search_space_id=search_space_id,
            content_hash=content_hash,
        )
-        session.add(document)
+            await session.commit()
-        await session.commit()
+            await session.refresh(existing_document)
-        await session.refresh(document)
+            document = existing_document
        else:
            # Create new document
            await task_logger.log_task_progress(
                log_entry,
                f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
                {"stage": "document_creation", "chunks_count": len(chunks)},
            )
            document = Document(
                title=video_data.get("title", "YouTube Video"),
                document_type=DocumentType.YOUTUBE_VIDEO,
                document_metadata={
                    "url": url,
                    "video_id": video_id,
                    "video_title": video_data.get("title", "YouTube Video"),
                    "author": video_data.get("author_name", "Unknown"),
                    "thumbnail": video_data.get("thumbnail_url", ""),
                },
                content=summary_content,
                embedding=summary_embedding,
                chunks=chunks,
                search_space_id=search_space_id,
                content_hash=content_hash,
                unique_identifier_hash=unique_identifier_hash,
            )
            session.add(document)
            await session.commit()
            await session.refresh(document)
        # Log success
        await task_logger.log_task_success(
--- a/surfsense_backend/app/utils/document_converters.py
+++ b/surfsense_backend/app/utils/document_converters.py
@ -3,7 +3,7 @@ import hashlib
 from litellm import get_model_info, token_counter
 from app.config import config
-from app.db import Chunk
+from app.db import Chunk, DocumentType
 from app.prompts import SUMMARY_PROMPT_TEMPLATE
@ -308,3 +308,40 @@ def generate_content_hash(content: str, search_space_id: int) -> str:
    """Generate SHA-256 hash for the given content combined with search space ID."""
    combined_data = f"{search_space_id}:{content}"
    return hashlib.sha256(combined_data.encode("utf-8")).hexdigest()
 def generate_unique_identifier_hash(
    document_type: DocumentType,
    unique_identifier: str | int | float,
    search_space_id: int,
 ) -> str:
    """
    Generate SHA-256 hash for a unique document identifier from connector sources.
    This function creates a consistent hash based on the document type, its unique
    identifier from the source system, and the search space ID. This helps prevent
    duplicate documents when syncing from various connectors like Slack, Notion, Jira, etc.
    Args:
        document_type: The type of document (e.g., SLACK_CONNECTOR, NOTION_CONNECTOR)
        unique_identifier: The unique ID from the source system (e.g., message ID, page ID)
        search_space_id: The search space this document belongs to
    Returns:
        str: SHA-256 hash string representing the unique document identifier
    Example:
        >>> generate_unique_identifier_hash(
        ...     DocumentType.SLACK_CONNECTOR,
        ...     "1234567890.123456",
        ...     42
        ... )
        'a1b2c3d4e5f6...'
    """
    # Convert unique_identifier to string to handle different types
    identifier_str = str(unique_identifier)
    # Combine document type value, unique identifier, and search space ID
    combined_data = f"{document_type.value}:{identifier_str}:{search_space_id}"
    return hashlib.sha256(combined_data.encode("utf-8")).hexdigest()