Merge remote-tracking branch 'upstream/dev' into fix/notion-connector

2026-04-30 03:16:25 +02:00 · 2026-01-29 10:45:31 +05:30 · 2026-01-29 10:45:31 +05:30 · 1658724fb2
commit 1658724fb2
parent 59d5bf9aa5 0b65c3a98c
32 changed files with 633 additions and 78 deletions
--- a/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py
@ -55,7 +55,9 @@ def _clear_generating_podcast(search_space_id: int) -> None:
        client = redis.from_url(redis_url, decode_responses=True)
        key = f"podcast:generating:{search_space_id}"
        client.delete(key)
-        logger.info(f"Cleared generating podcast key for search_space_id={search_space_id}")
+        logger.info(
+            f"Cleared generating podcast key for search_space_id={search_space_id}"
+        )
    except Exception as e:
        logger.warning(f"Could not clear generating podcast key: {e}")

@ -119,9 +121,7 @@ async def _generate_content_podcast(
 ) -> dict:
    """Generate content-based podcast and update existing record."""
    async with get_celery_session_maker()() as session:
-        result = await session.execute(
-            select(Podcast).filter(Podcast.id == podcast_id)
-        )
+        result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
        podcast = result.scalars().first()

        if not podcast:
--- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
@ -156,6 +156,41 @@ async def _check_and_trigger_schedules():
                            )
                            await session.commit()
                            continue
+
+                    # Special handling for Webcrawler - skip if no URLs configured
+                    elif (
+                        connector.connector_type
+                        == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR
+                    ):
+                        from app.utils.webcrawler_utils import parse_webcrawler_urls
+
+                        connector_config = connector.config or {}
+                        urls = parse_webcrawler_urls(
+                            connector_config.get("INITIAL_URLS")
+                        )
+
+                        if urls:
+                            task.delay(
+                                connector.id,
+                                connector.search_space_id,
+                                str(connector.user_id),
+                                None,  # start_date
+                                None,  # end_date
+                            )
+                        else:
+                            # No URLs configured - skip indexing but still update next_scheduled_at
+                            logger.info(
+                                f"Webcrawler connector {connector.id} has no URLs configured, "
+                                "skipping periodic indexing (will check again at next scheduled time)"
+                            )
+                            from datetime import timedelta
+
+                            connector.next_scheduled_at = now + timedelta(
+                                minutes=connector.indexing_frequency_minutes
+                            )
+                            await session.commit()
+                            continue
+
                    else:
                        task.delay(
                            connector.id,
--- a/surfsense_backend/app/tasks/composio_indexer.py
+++ b/surfsense_backend/app/tasks/composio_indexer.py
@ -86,7 +86,7 @@ async def index_composio_connector(
    end_date: str | None = None,
    update_last_indexed: bool = True,
    max_items: int = 1000,
-) -> tuple[int, str]:
+) -> tuple[int, int, str | None]:
    """
    Index content from a Composio connector.

@ -104,7 +104,7 @@ async def index_composio_connector(
        max_items: Maximum number of items to fetch

    Returns:
-        Tuple of (number_of_indexed_items, error_message or None)
+        Tuple of (number_of_indexed_items, number_of_skipped_items, error_message or None)
    """
    task_logger = TaskLoggingService(session, search_space_id)

@ -132,14 +132,14 @@ async def index_composio_connector(
            await task_logger.log_task_failure(
                log_entry, error_msg, {"error_type": "InvalidConnectorType"}
            )
-            return 0, error_msg
+            return 0, 0, error_msg

        if not connector:
            error_msg = f"Composio connector with ID {connector_id} not found"
            await task_logger.log_task_failure(
                log_entry, error_msg, {"error_type": "ConnectorNotFound"}
            )
-            return 0, error_msg
+            return 0, 0, error_msg

        # Get toolkit ID from config
        toolkit_id = connector.config.get("toolkit_id")
@ -150,7 +150,7 @@ async def index_composio_connector(
            await task_logger.log_task_failure(
                log_entry, error_msg, {"error_type": "MissingToolkitId"}
            )
-            return 0, error_msg
+            return 0, 0, error_msg

        # Check if toolkit is indexable
        if toolkit_id not in INDEXABLE_TOOLKITS:
@ -158,7 +158,7 @@ async def index_composio_connector(
            await task_logger.log_task_failure(
                log_entry, error_msg, {"error_type": "ToolkitNotIndexable"}
            )
-            return 0, error_msg
+            return 0, 0, error_msg

        # Get indexer function from registry
        try:
@ -167,7 +167,7 @@ async def index_composio_connector(
            await task_logger.log_task_failure(
                log_entry, str(e), {"error_type": "NoIndexerImplemented"}
            )
-            return 0, str(e)
+            return 0, 0, str(e)

        # Build kwargs for the indexer function
        kwargs = {
@ -199,7 +199,7 @@ async def index_composio_connector(
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(f"Database error: {db_error!s}", exc_info=True)
-        return 0, f"Database error: {db_error!s}"
+        return 0, 0, f"Database error: {db_error!s}"
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
@ -209,4 +209,4 @@ async def index_composio_connector(
            {"error_type": type(e).__name__},
        )
        logger.error(f"Failed to index Composio connector: {e!s}", exc_info=True)
-        return 0, f"Failed to index Composio connector: {e!s}"
+        return 0, 0, f"Failed to index Composio connector: {e!s}"
--- a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py
@ -20,6 +20,7 @@ from app.utils.document_converters import (
 from .base import (
    calculate_date_range,
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -317,6 +318,24 @@ async def index_airtable_records(
                                    )
                                    continue

+                            # Document doesn't exist by unique_identifier_hash
+                            # Check if a document with the same content_hash exists (from another connector)
+                            with session.no_autoflush:
+                                duplicate_by_content = (
+                                    await check_duplicate_document_by_hash(
+                                        session, content_hash
+                                    )
+                                )
+
+                            if duplicate_by_content:
+                                logger.info(
+                                    f"Airtable record {record_id} already indexed by another connector "
+                                    f"(existing document ID: {duplicate_by_content.id}, "
+                                    f"type: {duplicate_by_content.document_type}). Skipping."
+                                )
+                                documents_skipped += 1
+                                continue
+
                            # Document doesn't exist - create new one
                            # Generate document summary
                            user_llm = await get_user_long_context_llm(
--- a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
@ -22,6 +22,7 @@ from app.utils.document_converters import (
 from .base import (
    calculate_date_range,
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -308,6 +309,22 @@ async def index_bookstack_pages(
                        logger.info(f"Successfully updated BookStack page {page_name}")
                        continue

+                # Document doesn't exist by unique_identifier_hash
+                # Check if a document with the same content_hash exists (from another connector)
+                with session.no_autoflush:
+                    duplicate_by_content = await check_duplicate_document_by_hash(
+                        session, content_hash
+                    )
+
+                if duplicate_by_content:
+                    logger.info(
+                        f"BookStack page {page_name} already indexed by another connector "
+                        f"(existing document ID: {duplicate_by_content.id}, "
+                        f"type: {duplicate_by_content.document_type}). Skipping."
+                    )
+                    documents_skipped += 1
+                    continue
+
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
--- a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
@ -22,6 +22,7 @@ from app.utils.document_converters import (

 from .base import (
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -302,6 +303,22 @@ async def index_clickup_tasks(
                            )
                            continue

+                    # Document doesn't exist by unique_identifier_hash
+                    # Check if a document with the same content_hash exists (from another connector)
+                    with session.no_autoflush:
+                        duplicate_by_content = await check_duplicate_document_by_hash(
+                            session, content_hash
+                        )
+
+                    if duplicate_by_content:
+                        logger.info(
+                            f"ClickUp task {task_name} already indexed by another connector "
+                            f"(existing document ID: {duplicate_by_content.id}, "
+                            f"type: {duplicate_by_content.document_type}). Skipping."
+                        )
+                        documents_skipped += 1
+                        continue
+
                    # Document doesn't exist - create new one
                    # Generate summary with metadata
                    user_llm = await get_user_long_context_llm(
--- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
@ -23,6 +23,7 @@ from app.utils.document_converters import (
 from .base import (
    calculate_date_range,
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -306,6 +307,22 @@ async def index_confluence_pages(
                        )
                        continue

+                # Document doesn't exist by unique_identifier_hash
+                # Check if a document with the same content_hash exists (from another connector)
+                with session.no_autoflush:
+                    duplicate_by_content = await check_duplicate_document_by_hash(
+                        session, content_hash
+                    )
+
+                if duplicate_by_content:
+                    logger.info(
+                        f"Confluence page {page_title} already indexed by another connector "
+                        f"(existing document ID: {duplicate_by_content.id}, "
+                        f"type: {duplicate_by_content.document_type}). Skipping."
+                    )
+                    documents_skipped += 1
+                    continue
+
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
--- a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
@ -21,6 +21,7 @@ from app.utils.document_converters import (
 from .base import (
    build_document_metadata_markdown,
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -454,6 +455,24 @@ async def index_discord_messages(
                                    )
                                    continue

+                            # Document doesn't exist by unique_identifier_hash
+                            # Check if a document with the same content_hash exists (from another connector)
+                            with session.no_autoflush:
+                                duplicate_by_content = (
+                                    await check_duplicate_document_by_hash(
+                                        session, content_hash
+                                    )
+                                )
+
+                            if duplicate_by_content:
+                                logger.info(
+                                    f"Discord message {msg_id} in {guild_name}#{channel_name} already indexed by another connector "
+                                    f"(existing document ID: {duplicate_by_content.id}, "
+                                    f"type: {duplicate_by_content.document_type}). Skipping."
+                                )
+                                documents_skipped += 1
+                                continue
+
                            # Document doesn't exist - create new one
                            # Process chunks
                            chunks = await create_document_chunks(
--- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
@ -24,6 +24,7 @@ from app.utils.document_converters import (

 from .base import (
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -319,6 +320,21 @@ async def _process_repository_digest(
            # Delete existing document to replace with new one
            await session.delete(existing_document)
            await session.flush()
+    else:
+        # Document doesn't exist by unique_identifier_hash
+        # Check if a document with the same content_hash exists (from another connector)
+        with session.no_autoflush:
+            duplicate_by_content = await check_duplicate_document_by_hash(
+                session, content_hash
+            )
+
+        if duplicate_by_content:
+            logger.info(
+                f"Repository {repo_full_name} already indexed by another connector "
+                f"(existing document ID: {duplicate_by_content.id}, "
+                f"type: {duplicate_by_content.document_type}). Skipping."
+            )
+            return 0

    # Generate summary using LLM (ONE call per repository!)
    user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
--- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
@ -24,7 +24,9 @@ from app.utils.document_converters import (
 )

 from .base import (
+    calculate_date_range,
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -163,10 +165,22 @@ async def index_google_gmail_messages(
            credentials, session, user_id, connector_id
        )

+        # Calculate date range using last_indexed_at if dates not provided
+        # This ensures Gmail uses the same date logic as other connectors
+        # (uses last_indexed_at → now, or 365 days back for first-time indexing)
+        calculated_start_date, calculated_end_date = calculate_date_range(
+            connector, start_date, end_date, default_days_back=365
+        )
+
        # Fetch recent Google gmail messages
-        logger.info(f"Fetching recent emails for connector {connector_id}")
+        logger.info(
+            f"Fetching emails for connector {connector_id} "
+            f"from {calculated_start_date} to {calculated_end_date}"
+        )
        messages, error = await gmail_connector.get_recent_messages(
-            max_results=max_messages, start_date=start_date, end_date=end_date
+            max_results=max_messages,
+            start_date=calculated_start_date,
+            end_date=calculated_end_date,
        )

        if error:
@ -316,6 +330,22 @@ async def index_google_gmail_messages(
                        logger.info(f"Successfully updated Gmail message {subject}")
                        continue

+                # Document doesn't exist by unique_identifier_hash
+                # Check if a document with the same content_hash exists (from another connector)
+                with session.no_autoflush:
+                    duplicate_by_content = await check_duplicate_document_by_hash(
+                        session, content_hash
+                    )
+
+                if duplicate_by_content:
+                    logger.info(
+                        f"Gmail message {subject} already indexed by another connector "
+                        f"(existing document ID: {duplicate_by_content.id}, "
+                        f"type: {duplicate_by_content.document_type}). Skipping."
+                    )
+                    documents_skipped += 1
+                    continue
+
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
--- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
@ -23,6 +23,7 @@ from app.utils.document_converters import (
 from .base import (
    calculate_date_range,
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -284,6 +285,22 @@ async def index_jira_issues(
                        )
                        continue

+                # Document doesn't exist by unique_identifier_hash
+                # Check if a document with the same content_hash exists (from another connector)
+                with session.no_autoflush:
+                    duplicate_by_content = await check_duplicate_document_by_hash(
+                        session, content_hash
+                    )
+
+                if duplicate_by_content:
+                    logger.info(
+                        f"Jira issue {issue_identifier} already indexed by another connector "
+                        f"(existing document ID: {duplicate_by_content.id}, "
+                        f"type: {duplicate_by_content.document_type}). Skipping."
+                    )
+                    documents_skipped += 1
+                    continue
+
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
--- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
@ -22,6 +22,7 @@ from app.utils.document_converters import (
 from .base import (
    calculate_date_range,
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -315,6 +316,22 @@ async def index_linear_issues(
                        )
                        continue

+                # Document doesn't exist by unique_identifier_hash
+                # Check if a document with the same content_hash exists (from another connector)
+                with session.no_autoflush:
+                    duplicate_by_content = await check_duplicate_document_by_hash(
+                        session, content_hash
+                    )
+
+                if duplicate_by_content:
+                    logger.info(
+                        f"Linear issue {issue_identifier} already indexed by another connector "
+                        f"(existing document ID: {duplicate_by_content.id}, "
+                        f"type: {duplicate_by_content.document_type}). Skipping."
+                    )
+                    documents_skipped += 1
+                    continue
+
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
--- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
@ -21,6 +21,7 @@ from app.utils.document_converters import (

 from .base import (
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -363,6 +364,22 @@ async def index_luma_events(
                        logger.info(f"Successfully updated Luma event {event_name}")
                        continue

+                # Document doesn't exist by unique_identifier_hash
+                # Check if a document with the same content_hash exists (from another connector)
+                with session.no_autoflush:
+                    duplicate_by_content = await check_duplicate_document_by_hash(
+                        session, content_hash
+                    )
+
+                if duplicate_by_content:
+                    logger.info(
+                        f"Luma event {event_name} already indexed by another connector "
+                        f"(existing document ID: {duplicate_by_content.id}, "
+                        f"type: {duplicate_by_content.document_type}). Skipping."
+                    )
+                    documents_skipped += 1
+                    continue
+
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
--- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
@ -23,6 +23,7 @@ from .base import (
    build_document_metadata_string,
    calculate_date_range,
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -388,6 +389,22 @@ async def index_notion_pages(

                        continue

+                # Document doesn't exist by unique_identifier_hash
+                # Check if a document with the same content_hash exists (from another connector)
+                with session.no_autoflush:
+                    duplicate_by_content = await check_duplicate_document_by_hash(
+                        session, content_hash
+                    )
+
+                if duplicate_by_content:
+                    logger.info(
+                        f"Notion page {page_title} already indexed by another connector "
+                        f"(existing document ID: {duplicate_by_content.id}, "
+                        f"type: {duplicate_by_content.document_type}). Skipping."
+                    )
+                    documents_skipped += 1
+                    continue
+
                # Document doesn't exist - create new one
                # Get user's long context LLM
                user_llm = await get_user_long_context_llm(
--- a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py
@ -28,6 +28,7 @@ from app.utils.document_converters import (
 from .base import (
    build_document_metadata_string,
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -426,6 +427,22 @@ async def index_obsidian_vault(
                    indexed_count += 1

                else:
+                    # Document doesn't exist by unique_identifier_hash
+                    # Check if a document with the same content_hash exists (from another connector)
+                    with session.no_autoflush:
+                        duplicate_by_content = await check_duplicate_document_by_hash(
+                            session, content_hash
+                        )
+
+                    if duplicate_by_content:
+                        logger.info(
+                            f"Obsidian note {title} already indexed by another connector "
+                            f"(existing document ID: {duplicate_by_content.id}, "
+                            f"type: {duplicate_by_content.document_type}). Skipping."
+                        )
+                        skipped_count += 1
+                        continue
+
                    # Create new document
                    logger.info(f"Indexing new note: {title}")

--- a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
@ -22,6 +22,7 @@ from .base import (
    build_document_metadata_markdown,
    calculate_date_range,
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -325,6 +326,22 @@ async def index_slack_messages(
                            logger.info(f"Successfully updated Slack message {msg_ts}")
                            continue

+                    # Document doesn't exist by unique_identifier_hash
+                    # Check if a document with the same content_hash exists (from another connector)
+                    with session.no_autoflush:
+                        duplicate_by_content = await check_duplicate_document_by_hash(
+                            session, content_hash
+                        )
+
+                    if duplicate_by_content:
+                        logger.info(
+                            f"Slack message {msg_ts} in channel {channel_name} already indexed by another connector "
+                            f"(existing document ID: {duplicate_by_content.id}, "
+                            f"type: {duplicate_by_content.document_type}). Skipping."
+                        )
+                        documents_skipped += 1
+                        continue
+
                    # Document doesn't exist - create new one
                    # Process chunks
                    chunks = await create_document_chunks(combined_document_string)
--- a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py
@ -21,6 +21,7 @@ from .base import (
    build_document_metadata_markdown,
    calculate_date_range,
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -354,6 +355,27 @@ async def index_teams_messages(
                                    )
                                    continue

+                            # Document doesn't exist by unique_identifier_hash
+                            # Check if a document with the same content_hash exists (from another connector)
+                            with session.no_autoflush:
+                                duplicate_by_content = (
+                                    await check_duplicate_document_by_hash(
+                                        session, content_hash
+                                    )
+                                )
+
+                            if duplicate_by_content:
+                                logger.info(
+                                    "Teams message %s in channel %s already indexed by another connector "
+                                    "(existing document ID: %s, type: %s). Skipping.",
+                                    message_id,
+                                    channel_name,
+                                    duplicate_by_content.id,
+                                    duplicate_by_content.document_type,
+                                )
+                                documents_skipped += 1
+                                continue
+
                            # Document doesn't exist - create new one
                            # Process chunks
                            chunks = await create_document_chunks(
--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@ -18,9 +18,11 @@ from app.utils.document_converters import (
    generate_document_summary,
    generate_unique_identifier_hash,
 )
+from app.utils.webcrawler_utils import parse_webcrawler_urls

 from .base import (
    check_document_by_unique_identifier,
+    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
@ -96,13 +98,7 @@ async def index_crawled_urls(
        api_key = connector.config.get("FIRECRAWL_API_KEY")

        # Get URLs from connector config
-        initial_urls = connector.config.get("INITIAL_URLS", "")
-        if isinstance(initial_urls, str):
-            urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
-        elif isinstance(initial_urls, list):
-            urls = [url.strip() for url in initial_urls if url.strip()]
-        else:
-            urls = []
+        urls = parse_webcrawler_urls(connector.config.get("INITIAL_URLS"))

        logger.info(
            f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
@ -281,6 +277,22 @@ async def index_crawled_urls(
                        logger.info(f"Successfully updated URL {url}")
                        continue

+                # Document doesn't exist by unique_identifier_hash
+                # Check if a document with the same content_hash exists (from another connector)
+                with session.no_autoflush:
+                    duplicate_by_content = await check_duplicate_document_by_hash(
+                        session, content_hash
+                    )
+
+                if duplicate_by_content:
+                    logger.info(
+                        f"URL {url} already indexed by another connector "
+                        f"(existing document ID: {duplicate_by_content.id}, "
+                        f"type: {duplicate_by_content.document_type}). Skipping."
+                    )
+                    documents_skipped += 1
+                    continue
+
                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -55,7 +55,9 @@ LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
 )

 # Timeout calculation constants
-UPLOAD_BYTES_PER_SECOND_SLOW = 100 * 1024  # 100 KB/s (conservative for slow connections)
+UPLOAD_BYTES_PER_SECOND_SLOW = (
+    100 * 1024
+)  # 100 KB/s (conservative for slow connections)
 MIN_UPLOAD_TIMEOUT = 120  # Minimum 2 minutes for any file
 MAX_UPLOAD_TIMEOUT = 1800  # Maximum 30 minutes for very large files
 BASE_JOB_TIMEOUT = 600  # 10 minutes base for job processing
@ -219,19 +221,19 @@ async def find_existing_document_with_migration(
 def calculate_upload_timeout(file_size_bytes: int) -> float:
    """
    Calculate appropriate upload timeout based on file size.
-    
+
    Assumes a conservative slow connection speed to handle worst-case scenarios.
-    
+
    Args:
        file_size_bytes: Size of the file in bytes
-        
+
    Returns:
        Timeout in seconds
    """
    # Calculate time needed at slow connection speed
    # Add 50% buffer for network variability and SSL overhead
    estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
-    
+
    # Clamp to reasonable bounds
    return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT))

@ -239,21 +241,21 @@ def calculate_upload_timeout(file_size_bytes: int) -> float:
 def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
    """
    Calculate job processing timeout based on page count and file size.
-    
+
    Args:
        estimated_pages: Estimated number of pages
        file_size_bytes: Size of the file in bytes
-        
+
    Returns:
        Timeout in seconds
    """
    # Base timeout + time per page
    page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
-    
+
    # Also consider file size (large images take longer to process)
    # ~1 minute per 10MB of file size
    size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
-    
+
    # Use the larger of the two estimates
    return max(page_based_timeout, size_based_timeout)

@ -284,18 +286,18 @@ async def parse_with_llamacloud_retry(
    """
    import os
    import random
-    
+
    from llama_cloud_services import LlamaParse
    from llama_cloud_services.parse.utils import ResultType

    # Get file size for timeout calculations
    file_size_bytes = os.path.getsize(file_path)
    file_size_mb = file_size_bytes / (1024 * 1024)
-    
+
    # Calculate dynamic timeouts based on file size and page count
    upload_timeout = calculate_upload_timeout(file_size_bytes)
    job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
-    
+
    # HTTP client timeouts - scaled based on file size
    # Write timeout is critical for large file uploads
    custom_timeout = httpx.Timeout(
@ -304,7 +306,7 @@ async def parse_with_llamacloud_retry(
        write=upload_timeout,  # Dynamic based on file size (upload time)
        pool=120.0,  # 2 minutes to acquire connection from pool
    )
-    
+
    logging.info(
        f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
@ -335,14 +337,14 @@ async def parse_with_llamacloud_retry(

                # Parse the file asynchronously
                result = await parser.aparse(file_path)
-                
+
                # Success - log if we had previous failures
                if attempt > 1:
                    logging.info(
                        f"LlamaCloud upload succeeded on attempt {attempt} after "
                        f"{len(attempt_errors)} failures"
                    )
-                
+
                return result

        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
@ -355,8 +357,7 @@ async def parse_with_llamacloud_retry(
                # Calculate exponential backoff with jitter
                # Base delay doubles each attempt, capped at max delay
                base_delay = min(
-                    LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
-                    LLAMACLOUD_MAX_DELAY
+                    LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), LLAMACLOUD_MAX_DELAY
                )
                # Add random jitter (±25%) to prevent thundering herd
                jitter = base_delay * 0.25 * (2 * random.random() - 1)