Merge remote-tracking branch 'upstream/dev' into dev

2026-06-28 21:49:40 +02:00 · 2026-02-03 14:24:24 +02:00 · 2026-02-03 14:24:24 +02:00 · 60330622bf
commit 60330622bf
parent 8d9dfc7aa4 e172983042
54 changed files with 1542 additions and 34 deletions
--- a/surfsense_backend/.env.example
+++ b/surfsense_backend/.env.example
@ -3,6 +3,12 @@ DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
 #Celery Config
 CELERY_BROKER_URL=redis://localhost:6379/0
 CELERY_RESULT_BACKEND=redis://localhost:6379/0
+# Optional: isolate queues when sharing Redis with other apps
+CELERY_TASK_DEFAULT_QUEUE=surfsense
+
+# Redis for app-level features (heartbeats, podcast markers)
+# Defaults to CELERY_BROKER_URL when not set
+REDIS_APP_URL=redis://localhost:6379/0

 #Electric(for migrations only)
 ELECTRIC_DB_USER=electric
--- a/surfsense_backend/alembic/versions/86_add_document_created_by.py
+++ b/surfsense_backend/alembic/versions/86_add_document_created_by.py
@ -0,0 +1,185 @@
+"""Add created_by_id column to documents table for document ownership tracking
+
+Revision ID: 86
+Revises: 85
+Create Date: 2026-02-02
+
+Changes:
+1. Add created_by_id column (UUID, nullable, foreign key to user.id)
+2. Create index on created_by_id for performance
+3. Backfill existing documents with search space owner's user_id (with progress indicator)
+"""
+
+import sys
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "86"
+down_revision: str | None = "85"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+# Batch size for backfill operation
+BATCH_SIZE = 5000
+
+
+def upgrade() -> None:
+    """Add created_by_id column to documents and backfill with search space owner.
+
+    Runs four steps in order, printing progress to stdout:
+
+    1. Add the ``created_by_id`` UUID column if it is not already present.
+    2. Create the ``ix_documents_created_by_id`` index if it does not exist.
+    3. Add the ``fk_documents_created_by_id`` foreign key to ``"user"(id)``
+       with ``ON DELETE SET NULL`` if it is not already present.
+    4. Backfill ``created_by_id`` from ``searchspaces.user_id`` in batches of
+       ``BATCH_SIZE`` rows, reporting progress after every batch.
+
+    Each DDL step is guarded by an existence check, so it is skipped when the
+    object it would create is already there.
+    """
+
+    # 1. Add created_by_id column (nullable for backward compatibility)
+    print("Step 1/4: Adding created_by_id column...")
+    op.execute(
+        """
+        DO $$
+        BEGIN
+            IF NOT EXISTS (
+                SELECT 1 FROM information_schema.columns
+                WHERE table_name = 'documents' AND column_name = 'created_by_id'
+            ) THEN
+                ALTER TABLE documents
+                ADD COLUMN created_by_id UUID;
+            END IF;
+        END$$;
+        """
+    )
+    print("  Done: created_by_id column added.")
+
+    # 2. Create index on created_by_id for efficient queries
+    print("Step 2/4: Creating index on created_by_id...")
+    op.execute(
+        """
+        CREATE INDEX IF NOT EXISTS ix_documents_created_by_id
+        ON documents (created_by_id);
+        """
+    )
+    print("  Done: Index created.")
+
+    # 3. Add foreign key constraint with ON DELETE SET NULL
+    # First check if constraint already exists
+    print("Step 3/4: Adding foreign key constraint...")
+    op.execute(
+        """
+        DO $$
+        BEGIN
+            IF NOT EXISTS (
+                SELECT 1 FROM information_schema.table_constraints
+                WHERE constraint_name = 'fk_documents_created_by_id'
+                AND table_name = 'documents'
+            ) THEN
+                ALTER TABLE documents
+                ADD CONSTRAINT fk_documents_created_by_id
+                FOREIGN KEY (created_by_id) REFERENCES "user"(id)
+                ON DELETE SET NULL;
+            END IF;
+        END$$;
+        """
+    )
+    print("  Done: Foreign key constraint added.")
+
+    # 4. Backfill existing documents with search space owner's user_id
+    # Process in batches with progress indicator
+    print("Step 4/4: Backfilling created_by_id for existing documents...")
+    
+    connection = op.get_bind()
+    
+    # Get total count of documents that need backfilling
+    # (measured once, up front; it is only used for progress reporting and
+    # as the upper bound of the loop below)
+    result = connection.execute(
+        sa.text("""
+            SELECT COUNT(*) FROM documents WHERE created_by_id IS NULL
+        """)
+    )
+    total_count = result.scalar()
+    
+    if total_count == 0:
+        print("  No documents need backfilling. Skipping.")
+        return
+    
+    print(f"  Total documents to backfill: {total_count:,}")
+    
+    processed = 0
+    batch_num = 0
+    
+    # Loop ends when the running total reaches the initial count, or earlier
+    # if a batch updates nothing (see the rowcount check below).
+    while processed < total_count:
+        batch_num += 1
+        
+        # Update one batch: the subquery picks up to :batch_size document ids
+        # that still have no created_by_id, and only those rows are updated.
+        # Batching is keyed on documents.id (ctid is not used here).
+        result = connection.execute(
+            sa.text("""
+                UPDATE documents
+                SET created_by_id = searchspaces.user_id
+                FROM searchspaces
+                WHERE documents.search_space_id = searchspaces.id
+                AND documents.created_by_id IS NULL
+                AND documents.id IN (
+                    SELECT d.id FROM documents d
+                    WHERE d.created_by_id IS NULL
+                    LIMIT :batch_size
+                )
+            """),
+            {"batch_size": BATCH_SIZE}
+        )
+        
+        rows_updated = result.rowcount
+        if rows_updated == 0:
+            # No more rows to update
+            break
+            
+        processed += rows_updated
+        # Clamp so the displayed percentage never exceeds 100
+        progress_pct = min(100.0, (processed / total_count) * 100)
+        
+        # Print progress with carriage return for in-place update
+        sys.stdout.write(f"\r  Progress: {processed:,}/{total_count:,} documents ({progress_pct:.1f}%) - Batch {batch_num}")
+        sys.stdout.flush()
+    
+    # Final newline after progress
+    print()
+    print(f"  Done: Backfilled {processed:,} documents.")
+
+
+def downgrade() -> None:
+    """Remove created_by_id column from documents.
+
+    The foreign key constraint, the index and finally the column itself are
+    dropped, in that order. Every statement checks for existence first, so
+    objects that are already gone are simply skipped.
+    """
+    teardown_statements = (
+        # Foreign key constraint goes first
+        """
+        DO $$
+        BEGIN
+            IF EXISTS (
+                SELECT 1 FROM information_schema.table_constraints
+                WHERE constraint_name = 'fk_documents_created_by_id'
+                AND table_name = 'documents'
+            ) THEN
+                ALTER TABLE documents
+                DROP CONSTRAINT fk_documents_created_by_id;
+            END IF;
+        END$$;
+        """,
+        # Then the index
+        """
+        DROP INDEX IF EXISTS ix_documents_created_by_id;
+        """,
+        # Finally the column
+        """
+        DO $$
+        BEGIN
+            IF EXISTS (
+                SELECT 1 FROM information_schema.columns
+                WHERE table_name = 'documents' AND column_name = 'created_by_id'
+            ) THEN
+                ALTER TABLE documents
+                DROP COLUMN created_by_id;
+            END IF;
+        END$$;
+        """,
+    )
+
+    for statement in teardown_statements:
+        op.execute(statement)
--- a/surfsense_backend/alembic/versions/87_add_document_connector_id.py
+++ b/surfsense_backend/alembic/versions/87_add_document_connector_id.py
@ -0,0 +1,170 @@
+"""Add connector_id column to documents table for linking documents to their source connector
+
+Revision ID: 87
+Revises: 86
+Create Date: 2026-02-02
+
+Changes:
+1. Add connector_id column (Integer, nullable, foreign key to search_source_connectors.id)
+2. Create index on connector_id for efficient bulk deletion queries
+3. SET NULL on delete - allows controlled cleanup in application code
+4. Backfill existing documents based on document_type and search_space_id matching
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "87"
+down_revision: str | None = "86"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    """Add connector_id column to documents and backfill from existing connectors.
+
+    Steps, in order:
+
+    1. Add the ``connector_id`` INTEGER column if it is missing.
+    2. Create the ``ix_documents_connector_id`` index if it does not exist.
+    3. Add the ``fk_documents_connector_id`` foreign key to
+       ``search_source_connectors(id)`` with ``ON DELETE SET NULL`` if missing.
+    4. Backfill ``connector_id`` for existing documents by mapping their
+       ``document_type`` to a connector type within the same search space.
+       A document is only backfilled when its search space has exactly one
+       connector of the mapped type; ambiguous documents keep NULL.
+    """
+
+    # 1. Add connector_id column (nullable - for manually uploaded docs without connector)
+    op.execute(
+        """
+        DO $$
+        BEGIN
+            IF NOT EXISTS (
+                SELECT 1 FROM information_schema.columns
+                WHERE table_name = 'documents' AND column_name = 'connector_id'
+            ) THEN
+                ALTER TABLE documents
+                ADD COLUMN connector_id INTEGER;
+            END IF;
+        END$$;
+        """
+    )
+
+    # 2. Create index on connector_id for efficient cleanup queries
+    op.execute(
+        """
+        CREATE INDEX IF NOT EXISTS ix_documents_connector_id
+        ON documents (connector_id);
+        """
+    )
+
+    # 3. Add foreign key constraint with ON DELETE SET NULL
+    # SET NULL allows us to delete documents in controlled batches before deleting connector
+    op.execute(
+        """
+        DO $$
+        BEGIN
+            IF NOT EXISTS (
+                SELECT 1 FROM information_schema.table_constraints
+                WHERE constraint_name = 'fk_documents_connector_id'
+                AND table_name = 'documents'
+            ) THEN
+                ALTER TABLE documents
+                ADD CONSTRAINT fk_documents_connector_id
+                FOREIGN KEY (connector_id) REFERENCES search_source_connectors(id)
+                ON DELETE SET NULL;
+            END IF;
+        END$$;
+        """
+    )
+
+    # 4. Backfill existing documents with connector_id based on document_type matching
+    # This maps document types to their corresponding connector types
+    # Only backfills for documents in search spaces that have exactly one connector of that type
+
+    # Map of document_type -> connector_type for backfilling
+    document_connector_mappings = [
+        ("NOTION_CONNECTOR", "NOTION_CONNECTOR"),
+        ("SLACK_CONNECTOR", "SLACK_CONNECTOR"),
+        ("TEAMS_CONNECTOR", "TEAMS_CONNECTOR"),
+        ("GITHUB_CONNECTOR", "GITHUB_CONNECTOR"),
+        ("LINEAR_CONNECTOR", "LINEAR_CONNECTOR"),
+        ("DISCORD_CONNECTOR", "DISCORD_CONNECTOR"),
+        ("JIRA_CONNECTOR", "JIRA_CONNECTOR"),
+        ("CONFLUENCE_CONNECTOR", "CONFLUENCE_CONNECTOR"),
+        ("CLICKUP_CONNECTOR", "CLICKUP_CONNECTOR"),
+        ("GOOGLE_CALENDAR_CONNECTOR", "GOOGLE_CALENDAR_CONNECTOR"),
+        ("GOOGLE_GMAIL_CONNECTOR", "GOOGLE_GMAIL_CONNECTOR"),
+        ("GOOGLE_DRIVE_FILE", "GOOGLE_DRIVE_CONNECTOR"),
+        ("AIRTABLE_CONNECTOR", "AIRTABLE_CONNECTOR"),
+        ("LUMA_CONNECTOR", "LUMA_CONNECTOR"),
+        ("ELASTICSEARCH_CONNECTOR", "ELASTICSEARCH_CONNECTOR"),
+        ("BOOKSTACK_CONNECTOR", "BOOKSTACK_CONNECTOR"),
+        ("CIRCLEBACK", "CIRCLEBACK_CONNECTOR"),
+        ("OBSIDIAN_CONNECTOR", "OBSIDIAN_CONNECTOR"),
+        ("COMPOSIO_GOOGLE_DRIVE_CONNECTOR", "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"),
+        ("COMPOSIO_GMAIL_CONNECTOR", "COMPOSIO_GMAIL_CONNECTOR"),
+        ("COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"),
+        ("CRAWLED_URL", "WEBCRAWLER_CONNECTOR"),
+    ]
+
+    for doc_type, connector_type in document_connector_mappings:
+        # Backfill connector_id for documents where:
+        # 1. Document has this document_type
+        # 2. Document doesn't already have a connector_id
+        # 3. There's exactly one connector of this type in the same search space
+        #
+        # Condition 3 is enforced by HAVING COUNT(*) = 1. When a search space
+        # holds several connectors of the same type there is no way to tell
+        # which one produced a given document, and connector_id is meant to
+        # drive document cleanup on connector deletion, so a wrong guess is
+        # worse than leaving the column NULL.
+        #
+        # doc_type / connector_type come from the hard-coded list above, so
+        # interpolating them into the statement is safe.
+        op.execute(
+            f"""
+            UPDATE documents d
+            SET connector_id = sole.connector_id
+            FROM (
+                SELECT ssc.search_space_id, MIN(ssc.id) AS connector_id
+                FROM search_source_connectors ssc
+                WHERE ssc.connector_type = '{connector_type}'
+                GROUP BY ssc.search_space_id
+                HAVING COUNT(*) = 1
+            ) AS sole
+            WHERE d.search_space_id = sole.search_space_id
+            AND d.document_type = '{doc_type}'
+            AND d.connector_id IS NULL;
+            """
+        )
+
+
+def downgrade() -> None:
+    """Remove connector_id column from documents.
+
+    Drops the foreign key constraint, then the index, then the column.
+    Each statement is guarded by an existence check, so objects that are
+    already missing are skipped.
+    """
+    teardown_statements = (
+        # Foreign key constraint goes first
+        """
+        DO $$
+        BEGIN
+            IF EXISTS (
+                SELECT 1 FROM information_schema.table_constraints
+                WHERE constraint_name = 'fk_documents_connector_id'
+                AND table_name = 'documents'
+            ) THEN
+                ALTER TABLE documents
+                DROP CONSTRAINT fk_documents_connector_id;
+            END IF;
+        END$$;
+        """,
+        # Then the index
+        """
+        DROP INDEX IF EXISTS ix_documents_connector_id;
+        """,
+        # Finally the column
+        """
+        DO $$
+        BEGIN
+            IF EXISTS (
+                SELECT 1 FROM information_schema.columns
+                WHERE table_name = 'documents' AND column_name = 'connector_id'
+            ) THEN
+                ALTER TABLE documents
+                DROP COLUMN connector_id;
+            END IF;
+        END$$;
+        """,
+    )
+
+    for statement in teardown_statements:
+        op.execute(statement)
--- a/surfsense_backend/alembic/versions/88_make_podcast_transcript_nullable.py
+++ b/surfsense_backend/alembic/versions/88_make_podcast_transcript_nullable.py
@ -0,0 +1,58 @@
+"""Make podcast_transcript nullable
+
+Revision ID: 88
+Revises: 87
+Create Date: 2026-02-02
+
+The podcast workflow now creates a podcast record with PENDING status first,
+then fills in the transcript after generation completes. This requires
+podcast_transcript to be nullable.
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+revision: str = "88"
+down_revision: str | None = "87"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    """Allow NULL in podcasts.podcast_transcript and drop its server default."""
+    # Lift the NOT NULL constraint
+    drop_not_null = """
+        ALTER TABLE podcasts
+        ALTER COLUMN podcast_transcript DROP NOT NULL;
+        """
+    # Remove the column default
+    drop_default = """
+        ALTER TABLE podcasts
+        ALTER COLUMN podcast_transcript DROP DEFAULT;
+        """
+    for statement in (drop_not_null, drop_default):
+        op.execute(statement)
+
+
+def downgrade() -> None:
+    """Restore the '{}' default and NOT NULL constraint on podcast_transcript.
+
+    Rows holding NULL are rewritten to an empty JSON object first, so that
+    the NOT NULL constraint can be re-applied afterwards.
+    """
+    restore_statements = (
+        # Replace NULL transcripts with an empty JSON object
+        """
+        UPDATE podcasts
+        SET podcast_transcript = '{}'::jsonb
+        WHERE podcast_transcript IS NULL;
+        """,
+        # Bring back the column default
+        """
+        ALTER TABLE podcasts
+        ALTER COLUMN podcast_transcript SET DEFAULT '{}';
+        """,
+        # Re-apply NOT NULL
+        """
+        ALTER TABLE podcasts
+        ALTER COLUMN podcast_transcript SET NOT NULL;
+        """,
+    )
+    for statement in restore_statements:
+        op.execute(statement)
--- a/surfsense_backend/alembic/versions/89_make_podcast_file_location_nullable.py
+++ b/surfsense_backend/alembic/versions/89_make_podcast_file_location_nullable.py
@ -0,0 +1,46 @@
+"""Make podcast file_location nullable
+
+Revision ID: 89
+Revises: 88
+Create Date: 2026-02-03
+
+The podcast workflow creates a podcast record with PENDING status first,
+then fills in the file_location after audio generation completes. This requires
+file_location to be nullable.
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+revision: str = "89"
+down_revision: str | None = "88"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    """Allow NULL in podcasts.file_location."""
+    drop_not_null = """
+        ALTER TABLE podcasts
+        ALTER COLUMN file_location DROP NOT NULL;
+        """
+    op.execute(drop_not_null)
+
+
+def downgrade() -> None:
+    """Restore the NOT NULL constraint on podcasts.file_location.
+
+    Rows holding NULL are rewritten to an empty string first, so that the
+    NOT NULL constraint can be re-applied afterwards.
+    """
+    # Replace NULL locations with an empty string
+    fill_nulls = """
+        UPDATE podcasts
+        SET file_location = ''
+        WHERE file_location IS NULL;
+        """
+    # Re-apply NOT NULL
+    set_not_null = """
+        ALTER TABLE podcasts
+        ALTER COLUMN file_location SET NOT NULL;
+        """
+    for statement in (fill_nulls, set_not_null):
+        op.execute(statement)
--- a/surfsense_backend/app/agents/new_chat/tools/podcast.py
+++ b/surfsense_backend/app/agents/new_chat/tools/podcast.py
@ -21,8 +21,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.db import Podcast, PodcastStatus

 # Redis connection for tracking active podcast tasks
-# Uses the same Redis instance as Celery
-REDIS_URL = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
+# Defaults to the Celery broker when REDIS_APP_URL is not set
+REDIS_URL = os.getenv(
+    "REDIS_APP_URL",
+    os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0"),
+)
 _redis_client: redis.Redis | None = None


--- a/surfsense_backend/app/celery_app.py
+++ b/surfsense_backend/app/celery_app.py
@ -26,6 +26,7 @@ def init_worker(**kwargs):
 # Get Celery configuration from environment
 CELERY_BROKER_URL = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
 CELERY_RESULT_BACKEND = os.getenv("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
+CELERY_TASK_DEFAULT_QUEUE = os.getenv("CELERY_TASK_DEFAULT_QUEUE", "surfsense")

 # Get schedule checker interval from environment
 # Format: "<number><unit>" where unit is 'm' (minutes) or 'h' (hours)
@ -80,6 +81,7 @@ celery_app = Celery(
        "app.tasks.celery_tasks.blocknote_migration_tasks",
        "app.tasks.celery_tasks.document_reindex_tasks",
        "app.tasks.celery_tasks.stale_notification_cleanup_task",
+        "app.tasks.celery_tasks.connector_deletion_task",
    ],
 )

@ -91,6 +93,9 @@ celery_app.conf.update(
    result_serializer="json",
    timezone="UTC",
    enable_utc=True,
+    task_default_queue=CELERY_TASK_DEFAULT_QUEUE,
+    task_default_exchange=CELERY_TASK_DEFAULT_QUEUE,
+    task_default_routing_key=CELERY_TASK_DEFAULT_QUEUE,
    # Task execution settings
    task_track_started=True,
    task_time_limit=28800,  # 8 hour hard limit
--- a/surfsense_backend/app/config/global_llm_config.example.yaml
+++ b/surfsense_backend/app/config/global_llm_config.example.yaml
@ -122,8 +122,52 @@ global_llm_configs:
    use_default_system_instructions: false
    citations_enabled: true

-  # Example: Groq - Fast inference
+  # Example: Azure OpenAI GPT-4o
+  # IMPORTANT: For Azure deployments, always include 'base_model' in litellm_params
+  # to enable accurate token counting, cost tracking, and max token limits
  - id: -5
+    name: "Global Azure GPT-4o"
+    description: "Azure OpenAI GPT-4o deployment"
+    provider: "AZURE"
+    # model_name format for Azure: azure/<your-deployment-name>
+    model_name: "azure/gpt-4o-deployment"
+    api_key: "your-azure-api-key-here"
+    api_base: "https://your-resource.openai.azure.com"
+    api_version: "2024-02-15-preview"  # Azure API version
+    rpm: 1000
+    tpm: 150000
+    litellm_params:
+      temperature: 0.7
+      max_tokens: 4000
+      # REQUIRED for Azure: Specify the underlying OpenAI model
+      # This fixes "Could not identify azure model" warnings
+      # Common base_model values: gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini, gpt-3.5-turbo
+      base_model: "gpt-4o"
+    system_instructions: ""
+    use_default_system_instructions: true
+    citations_enabled: true
+
+  # Example: Azure OpenAI GPT-4 Turbo
+  - id: -6
+    name: "Global Azure GPT-4 Turbo"
+    description: "Azure OpenAI GPT-4 Turbo deployment"
+    provider: "AZURE"
+    model_name: "azure/gpt-4-turbo-deployment"
+    api_key: "your-azure-api-key-here"
+    api_base: "https://your-resource.openai.azure.com"
+    api_version: "2024-02-15-preview"
+    rpm: 500
+    tpm: 100000
+    litellm_params:
+      temperature: 0.7
+      max_tokens: 4000
+      base_model: "gpt-4-turbo"  # Maps to gpt-4-turbo-preview
+    system_instructions: ""
+    use_default_system_instructions: true
+    citations_enabled: true
+
+  # Example: Groq - Fast inference
+  - id: -7
    name: "Global Groq Llama 3"
    description: "Ultra-fast Llama 3 70B via Groq"
    provider: "GROQ"
@ -150,3 +194,11 @@ global_llm_configs:
 # - All standard LiteLLM providers are supported
 # - rpm/tpm: Optional rate limits for load balancing (requests/tokens per minute)
 #   These help the router distribute load evenly and avoid rate limit errors
+#
+# AZURE-SPECIFIC NOTES:
+# - Always add 'base_model' in litellm_params for Azure deployments
+# - This fixes "Could not identify azure model 'X'" warnings
+# - base_model should match the underlying OpenAI model (e.g., gpt-4o, gpt-4-turbo, gpt-3.5-turbo)
+# - model_name format: "azure/<your-deployment-name>"
+# - api_version: Use a recent Azure API version (e.g., "2024-02-15-preview")
+# - See: https://docs.litellm.ai/docs/proxy/cost_tracking#spend-tracking-for-azure-openai-models
--- a/surfsense_backend/app/connectors/composio_gmail_connector.py
+++ b/surfsense_backend/app/connectors/composio_gmail_connector.py
@ -394,6 +394,8 @@ async def _process_gmail_message_batch(
                embedding=summary_embedding,
                chunks=chunks,
                updated_at=get_current_timestamp(),
+                created_by_id=user_id,
+                connector_id=connector_id,
            )
            session.add(document)
            documents_indexed += 1
--- a/surfsense_backend/app/connectors/composio_google_calendar_connector.py
+++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py
@ -442,6 +442,8 @@ async def index_composio_google_calendar(
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
+                    created_by_id=user_id,
+                    connector_id=connector_id,
                )
                session.add(document)
                documents_indexed += 1
--- a/surfsense_backend/app/connectors/composio_google_drive_connector.py
+++ b/surfsense_backend/app/connectors/composio_google_drive_connector.py
@ -1248,7 +1248,6 @@ async def _process_single_drive_file(
            "file_name": file_name,
            "FILE_NAME": file_name,  # For compatibility
            "mime_type": mime_type,
-            "connector_id": connector_id,
            "toolkit_id": "googledrive",
            "source": "composio",
        },
@ -1258,6 +1257,8 @@ async def _process_single_drive_file(
        embedding=summary_embedding,
        chunks=chunks,
        updated_at=get_current_timestamp(),
+        created_by_id=user_id,
+        connector_id=connector_id,
    )
    session.add(document)

--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@ -25,6 +25,7 @@ async def download_and_process_file(
    session: AsyncSession,
    task_logger: TaskLoggingService,
    log_entry: Log,
+    connector_id: int | None = None,
 ) -> tuple[Any, str | None, dict[str, Any] | None]:
    """
    Download Google Drive file and process using Surfsense file processors.
@ -37,6 +38,7 @@ async def download_and_process_file(
        session: Database session
        task_logger: Task logging service
        log_entry: Log entry for tracking
+        connector_id: ID of the connector (for de-indexing support)

    Returns:
        Tuple of (Document object if successful, error message if failed, file metadata dict)
@ -92,6 +94,9 @@ async def download_and_process_file(
                "source_connector": "google_drive",
            },
        }
+        # Include connector_id for de-indexing support
+        if connector_id is not None:
+            connector_info["connector_id"] = connector_id

        # Add additional Drive metadata if available
        if "modifiedTime" in file:
--- a/surfsense_backend/app/connectors/google_drive/credentials.py
+++ b/surfsense_backend/app/connectors/google_drive/credentials.py
@ -127,7 +127,12 @@ async def get_valid_credentials(
                    )
                creds_dict["_token_encrypted"] = True

-            connector.config = creds_dict
+            # IMPORTANT: Merge new credentials with existing config to preserve
+            # user settings like selected_folders, selected_files, indexing_options,
+            # folder_tokens, etc. that would otherwise be wiped on token refresh.
+            existing_config = connector.config.copy() if connector.config else {}
+            existing_config.update(creds_dict)
+            connector.config = existing_config
            flag_modified(connector, "config")
            await session.commit()

--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@ -761,7 +761,27 @@ class Document(BaseModel, TimestampMixin):
    search_space_id = Column(
        Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
    )
+
+    # Track who created/uploaded this document
+    created_by_id = Column(
+        UUID(as_uuid=True),
+        ForeignKey("user.id", ondelete="SET NULL"),
+        nullable=True,  # Nullable for backward compatibility with existing records
+        index=True,
+    )
+
+    # Track which connector created this document (for cleanup on connector deletion)
+    connector_id = Column(
+        Integer,
+        ForeignKey("search_source_connectors.id", ondelete="SET NULL"),
+        nullable=True,  # Nullable for manually uploaded docs without connector
+        index=True,
+    )
+
+    # Relationships
    search_space = relationship("SearchSpace", back_populates="documents")
+    created_by = relationship("User", back_populates="documents")
+    connector = relationship("SearchSourceConnector", back_populates="documents")
    chunks = relationship(
        "Chunk", back_populates="document", cascade="all, delete-orphan"
    )
@ -990,6 +1010,9 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
        UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False
    )

+    # Documents created by this connector (for cleanup on connector deletion)
+    documents = relationship("Document", back_populates="connector")
+

 class NewLLMConfig(BaseModel, TimestampMixin):
    """
@ -1296,6 +1319,13 @@ if config.AUTH_TYPE == "GOOGLE":
            passive_deletes=True,
        )

+        # Documents created/uploaded by this user
+        documents = relationship(
+            "Document",
+            back_populates="created_by",
+            passive_deletes=True,
+        )
+
        # User memories for personalized AI responses
        memories = relationship(
            "UserMemory",
@ -1354,6 +1384,13 @@ else:
            passive_deletes=True,
        )

+        # Documents created/uploaded by this user
+        documents = relationship(
+            "Document",
+            back_populates="created_by",
+            passive_deletes=True,
+        )
+
        # User memories for personalized AI responses
        memories = relationship(
            "UserMemory",
--- a/surfsense_backend/app/routes/circleback_webhook_route.py
+++ b/surfsense_backend/app/routes/circleback_webhook_route.py
@ -9,8 +9,12 @@ import logging
 from datetime import datetime
 from typing import Any

-from fastapi import APIRouter, HTTPException
+from fastapi import APIRouter, Depends, HTTPException
 from pydantic import BaseModel, Field
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import SearchSourceConnector, SearchSourceConnectorType, get_async_session

 logger = logging.getLogger(__name__)

@ -212,6 +216,7 @@ def format_circleback_meeting_to_markdown(payload: CirclebackWebhookPayload) ->
 async def receive_circleback_webhook(
    search_space_id: int,
    payload: CirclebackWebhookPayload,
+    session: AsyncSession = Depends(get_async_session),
 ):
    """
    Receive and process a Circleback webhook.
@ -223,6 +228,7 @@ async def receive_circleback_webhook(
    Args:
        search_space_id: The ID of the search space to save the document to
        payload: The Circleback webhook payload containing meeting data
+        session: Database session for looking up the connector

    Returns:
        Success message with document details
@ -236,6 +242,26 @@ async def receive_circleback_webhook(
            f"Received Circleback webhook for meeting {payload.id} in search space {search_space_id}"
        )

+        # Look up the Circleback connector for this search space
+        connector_result = await session.execute(
+            select(SearchSourceConnector.id).where(
+                SearchSourceConnector.search_space_id == search_space_id,
+                SearchSourceConnector.connector_type
+                == SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
+            )
+        )
+        connector_id = connector_result.scalar_one_or_none()
+
+        if connector_id:
+            logger.info(
+                f"Found Circleback connector {connector_id} for search space {search_space_id}"
+            )
+        else:
+            logger.warning(
+                f"No Circleback connector found for search space {search_space_id}. "
+                "Document will be created without connector_id."
+            )
+
        # Convert to markdown
        markdown_content = format_circleback_meeting_to_markdown(payload)

@ -264,6 +290,7 @@ async def receive_circleback_webhook(
            markdown_content=markdown_content,
            metadata=meeting_metadata,
            search_space_id=search_space_id,
+            connector_id=connector_id,
        )

        logger.info(
--- a/surfsense_backend/app/routes/composio_routes.py
+++ b/surfsense_backend/app/routes/composio_routes.py
@ -20,6 +20,7 @@ from pydantic import ValidationError
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
+from sqlalchemy.orm.attributes import flag_modified

 from app.config import config
 from app.db import (
@ -330,10 +331,19 @@ async def composio_callback(
                    )

            # Update existing connector with new connected_account_id
+            # IMPORTANT: Merge new credentials with existing config to preserve
+            # user settings like selected_folders, selected_files, indexing_options,
+            # drive_page_token, etc. that would otherwise be wiped on reconnection.
            logger.info(
                f"Updating existing Composio connector {existing_connector.id} with new connected_account_id {final_connected_account_id}"
            )
-            existing_connector.config = connector_config
+            existing_config = (
+                existing_connector.config.copy() if existing_connector.config else {}
+            )
+            existing_config.update(connector_config)
+            existing_connector.config = existing_config
+
+            flag_modified(existing_connector, "config")
            await session.commit()
            await session.refresh(existing_connector)

--- a/surfsense_backend/app/routes/notes_routes.py
+++ b/surfsense_backend/app/routes/notes_routes.py
@ -76,6 +76,7 @@ async def create_note(
        document_metadata={"NOTE": True},
        embedding=None,  # Will be generated on first reindex
        updated_at=datetime.now(UTC),
+        created_by_id=user.id,  # Track who created this note
    )

    session.add(document)
@ -93,6 +94,7 @@ async def create_note(
        search_space_id=document.search_space_id,
        created_at=document.created_at,
        updated_at=document.updated_at,
+        created_by_id=document.created_by_id,
    )


--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@ -91,7 +91,10 @@ def get_heartbeat_redis_client() -> redis.Redis:
    """Get or create Redis client for heartbeat tracking."""
    global _heartbeat_redis_client
    if _heartbeat_redis_client is None:
-        redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
+        redis_url = os.getenv(
+            "REDIS_APP_URL",
+            os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0"),
+        )
        _heartbeat_redis_client = redis.from_url(redis_url, decode_responses=True)
    return _heartbeat_redis_client

@ -524,9 +527,17 @@ async def delete_search_source_connector(
    user: User = Depends(current_active_user),
 ):
    """
-    Delete a search source connector.
+    Delete a search source connector and all its associated documents.
+
+    The deletion runs in background via Celery task. User is notified
+    via the notification system when complete (no polling required).
+
    Requires CONNECTORS_DELETE permission.
    """
+    from app.tasks.celery_tasks.connector_deletion_task import (
+        delete_connector_with_documents_task,
+    )
+
    try:
        # Get the connector first
        result = await session.execute(
@ -548,7 +559,12 @@ async def delete_search_source_connector(
            "You don't have permission to delete this connector",
        )

-        # Delete any periodic schedule associated with this connector
+        # Store connector info before we queue the deletion task
+        connector_name = db_connector.name
+        connector_type = db_connector.connector_type.value
+        search_space_id = db_connector.search_space_id
+
+        # Delete any periodic schedule associated with this connector (lightweight, sync)
        if db_connector.periodic_indexing_enabled:
            success = delete_periodic_schedule(connector_id)
            if not success:
@ -556,7 +572,7 @@ async def delete_search_source_connector(
                    f"Failed to delete periodic schedule for connector {connector_id}"
                )

-        # For Composio connectors, also delete the connected account in Composio
+        # For Composio connectors, delete the connected account in Composio (lightweight API call, sync)
        composio_connector_types = [
            SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
            SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
@ -588,16 +604,33 @@ async def delete_search_source_connector(
                        f"Error deleting Composio connected account {composio_connected_account_id}: {composio_error!s}"
                    )

-        await session.delete(db_connector)
-        await session.commit()
-        return {"message": "Search source connector deleted successfully"}
+        # Queue background task to delete documents and connector
+        # This handles potentially large document counts without blocking the API
+        delete_connector_with_documents_task.delay(
+            connector_id=connector_id,
+            user_id=str(user.id),
+            search_space_id=search_space_id,
+            connector_name=connector_name,
+            connector_type=connector_type,
+        )
+
+        logger.info(
+            f"Queued deletion task for connector {connector_id} ({connector_name})"
+        )
+
+        return {
+            "message": "Connector deletion started. You will be notified when complete.",
+            "status": "queued",
+            "connector_id": connector_id,
+            "connector_name": connector_name,
+        }
    except HTTPException:
        raise
    except Exception as e:
        await session.rollback()
        raise HTTPException(
            status_code=500,
-            detail=f"Failed to delete search source connector: {e!s}",
+            detail=f"Failed to start connector deletion: {e!s}",
        ) from e


--- a/surfsense_backend/app/schemas/documents.py
+++ b/surfsense_backend/app/schemas/documents.py
@ -1,5 +1,6 @@
 from datetime import datetime
 from typing import TypeVar
+from uuid import UUID

 from pydantic import BaseModel, ConfigDict

@ -51,6 +52,7 @@ class DocumentRead(BaseModel):
    created_at: datetime
    updated_at: datetime | None
    search_space_id: int
+    created_by_id: UUID | None = None  # User who created/uploaded this document

    model_config = ConfigDict(from_attributes=True)

--- a/surfsense_backend/app/tasks/celery_tasks/connector_deletion_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/connector_deletion_task.py
@ -0,0 +1,269 @@
+"""Celery task for background connector deletion.
+
+This task handles the deletion of all documents associated with a connector
+in the background, then deletes the connector itself. User is notified via
+the notification system when complete (no polling required).
+
+Features:
+- Batch deletion to handle large document counts
+- Automatic retry on failure
+- Progress logged after each deleted batch
+- Handles both success and failure notifications
+"""
+
+import asyncio
+import logging
+from uuid import UUID
+
+from sqlalchemy import delete, func, select
+from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
+from sqlalchemy.pool import NullPool
+
+from app.celery_app import celery_app
+from app.config import config
+from app.db import Document, Notification, SearchSourceConnector
+
+logger = logging.getLogger(__name__)
+
+# Batch size for document deletion
+DELETION_BATCH_SIZE = 500
+
+
+def _get_celery_session_maker():
+    """Build a dedicated engine and session factory for a Celery task run.
+
+    The engine uses ``NullPool`` so no connections are pooled by it.
+
+    Returns:
+        Tuple ``(session_maker, engine)``. The engine is returned alongside
+        the factory so the caller can dispose of it when finished.
+    """
+    task_engine = create_async_engine(
+        config.DATABASE_URL, poolclass=NullPool, echo=False
+    )
+    session_factory = async_sessionmaker(task_engine, expire_on_commit=False)
+    return session_factory, task_engine
+
+
+@celery_app.task(
+    bind=True,
+    name="delete_connector_with_documents",
+    max_retries=3,
+    default_retry_delay=60,
+    autoretry_for=(Exception,),
+    retry_backoff=True,
+)
+def delete_connector_with_documents_task(
+    self,
+    connector_id: int,
+    user_id: str,
+    search_space_id: int,
+    connector_name: str,
+    connector_type: str,
+):
+    """
+    Background task to delete a connector and all its associated documents.
+
+    Creates a notification when the deletion succeeds, or when it has failed
+    and no automatic retries remain. Failed attempts that Celery will retry
+    are only logged, so the user is not told to "try again" while the task
+    is still being retried on their behalf.
+
+    Args:
+        connector_id: ID of the connector to delete
+        user_id: ID of the user who initiated the deletion
+        search_space_id: ID of the search space
+        connector_name: Name of the connector (for notification message)
+        connector_type: Type of the connector (for logging)
+
+    Returns:
+        Summary dict produced by ``_delete_connector_async``.
+    """
+    # autoretry_for re-runs this task on any exception until max_retries is
+    # exhausted; only the last attempt should surface a failure to the user.
+    max_retries = self.max_retries
+    is_final_attempt = max_retries is not None and self.request.retries >= max_retries
+
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+
+    try:
+        return loop.run_until_complete(
+            _delete_connector_async(
+                connector_id=connector_id,
+                user_id=user_id,
+                search_space_id=search_space_id,
+                connector_name=connector_name,
+                connector_type=connector_type,
+                notify_on_failure=is_final_attempt,
+            )
+        )
+    finally:
+        loop.close()
+
+
+async def _delete_connector_async(
+    connector_id: int,
+    user_id: str,
+    search_space_id: int,
+    connector_name: str,
+    connector_type: str,
+    notify_on_failure: bool = True,
+) -> dict:
+    """
+    Async implementation of connector deletion.
+
+    Steps:
+    1. Count total documents to delete
+    2. Delete documents in batches (chunks cascade automatically)
+    3. Delete the connector record
+    4. Create success notification
+
+    On failure the exception is logged and re-raised. A failure notification
+    is written first only when ``notify_on_failure`` is true.
+
+    Note: each batch is committed as it is deleted and ``total_deleted`` starts
+    at zero on every call, so after a retry the reported count covers only the
+    documents removed by the attempt that finished.
+
+    Args:
+        connector_id: ID of the connector to delete
+        user_id: ID (UUID string) of the user to notify
+        search_space_id: ID of the search space the notification belongs to
+        connector_name: Name of the connector (for notification message)
+        connector_type: Type of the connector (stored in notification metadata)
+        notify_on_failure: Whether to create a failure notification before
+            re-raising. Pass False when the caller is going to retry.
+
+    Returns:
+        Dict with ``status``, ``connector_id``, ``connector_name`` and
+        ``documents_deleted``.
+
+    Raises:
+        Exception: Whatever error interrupted the deletion, re-raised unchanged.
+    """
+    session_maker, engine = _get_celery_session_maker()
+    total_deleted = 0
+
+    try:
+        async with session_maker() as session:
+            # Step 1: Count total documents for this connector
+            count_result = await session.execute(
+                select(func.count(Document.id)).where(
+                    Document.connector_id == connector_id
+                )
+            )
+            total_docs = count_result.scalar() or 0
+
+            logger.info(
+                f"Starting deletion of connector {connector_id} ({connector_name}). "
+                f"Documents to delete: {total_docs}"
+            )
+
+            # Step 2: Delete documents in batches
+            while True:
+                # Get batch of document IDs
+                result = await session.execute(
+                    select(Document.id)
+                    .where(Document.connector_id == connector_id)
+                    .limit(DELETION_BATCH_SIZE)
+                )
+                doc_ids = [row[0] for row in result.fetchall()]
+
+                if not doc_ids:
+                    break
+
+                # Delete this batch (chunks are deleted via CASCADE)
+                await session.execute(delete(Document).where(Document.id.in_(doc_ids)))
+                # Commit per batch so completed work survives a later failure
+                await session.commit()
+
+                total_deleted += len(doc_ids)
+                logger.info(
+                    f"Deleted batch of {len(doc_ids)} documents. "
+                    f"Progress: {total_deleted}/{total_docs}"
+                )
+
+            # Step 3: Delete the connector record
+            result = await session.execute(
+                select(SearchSourceConnector).where(
+                    SearchSourceConnector.id == connector_id
+                )
+            )
+            connector = result.scalar_one_or_none()
+
+            if connector:
+                await session.delete(connector)
+                logger.info(f"Deleted connector record: {connector_id}")
+            else:
+                logger.warning(
+                    f"Connector {connector_id} not found - may have been already deleted"
+                )
+
+            # Step 4: Create success notification
+            # (committed together with the connector deletion above)
+            doc_text = "document" if total_deleted == 1 else "documents"
+            notification = Notification(
+                user_id=UUID(user_id),
+                search_space_id=search_space_id,
+                type="connector_deletion",
+                title=f"{connector_name} Removed",
+                message=f"Connector and {total_deleted} {doc_text} have been removed from your knowledge base.",
+                notification_metadata={
+                    "connector_id": connector_id,
+                    "connector_name": connector_name,
+                    "connector_type": connector_type,
+                    "documents_deleted": total_deleted,
+                    "status": "completed",
+                },
+            )
+            session.add(notification)
+            await session.commit()
+
+            logger.info(
+                f"Connector {connector_id} ({connector_name}) deleted successfully. "
+                f"Total documents deleted: {total_deleted}"
+            )
+
+            return {
+                "status": "success",
+                "connector_id": connector_id,
+                "connector_name": connector_name,
+                "documents_deleted": total_deleted,
+            }
+
+    except Exception as e:
+        logger.error(
+            f"Failed to delete connector {connector_id} ({connector_name}): {e!s}",
+            exc_info=True,
+        )
+
+        if notify_on_failure:
+            # Create failure notification in a fresh session; the one used
+            # above has already been closed by its context manager.
+            try:
+                async with session_maker() as session:
+                    notification = Notification(
+                        user_id=UUID(user_id),
+                        search_space_id=search_space_id,
+                        type="connector_deletion",
+                        title=f"Failed to Remove {connector_name}",
+                        message="Something went wrong while removing this connector. Please try again.",
+                        notification_metadata={
+                            "connector_id": connector_id,
+                            "connector_name": connector_name,
+                            "connector_type": connector_type,
+                            "documents_deleted": total_deleted,
+                            "status": "failed",
+                            "error": str(e),
+                        },
+                    )
+                    session.add(notification)
+                    await session.commit()
+            except Exception as notify_error:
+                # Best effort: never let a notification problem mask the
+                # original deletion error.
+                logger.error(
+                    f"Failed to create failure notification: {notify_error!s}",
+                    exc_info=True,
+                )
+        else:
+            logger.info(
+                f"Skipping failure notification for connector {connector_id}: "
+                "the deletion will be retried"
+            )
+
+        # Re-raise to trigger Celery retry
+        raise
+
+    finally:
+        await engine.dispose()
+
+
+async def delete_documents_by_connector_id(
+    session,
+    connector_id: int,
+    batch_size: int = DELETION_BATCH_SIZE,
+) -> int:
+    """
+    Delete all documents associated with a connector in batches.
+
+    This is a utility function that can be used independently of the Celery task
+    for synchronous deletion scenarios (e.g., small document counts).
+
+    The session is committed after every batch, so any other pending changes
+    on ``session`` are committed as well and deleted batches are not rolled
+    back if a later batch fails.
+
+    Args:
+        session: AsyncSession instance
+        connector_id: ID of the connector
+        batch_size: Number of documents to delete per batch (must be >= 1)
+
+    Returns:
+        Total number of documents deleted
+
+    Raises:
+        ValueError: If ``batch_size`` is less than 1.
+    """
+    # LIMIT 0 would return no rows, ending the loop immediately and reporting
+    # "0 deleted" while every document is still present.
+    if batch_size < 1:
+        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
+
+    total_deleted = 0
+
+    while True:
+        result = await session.execute(
+            select(Document.id)
+            .where(Document.connector_id == connector_id)
+            .limit(batch_size)
+        )
+        doc_ids = result.scalars().all()
+
+        if not doc_ids:
+            break
+
+        await session.execute(delete(Document).where(Document.id.in_(doc_ids)))
+        await session.commit()
+        total_deleted += len(doc_ids)
+
+    return total_deleted
--- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
@ -323,6 +323,28 @@ def process_file_upload_task(
        user_id: ID of the user
    """
    import asyncio
+    import os
+    import traceback
+
+    logger.info(
+        f"[process_file_upload] Task started - file: {filename}, "
+        f"search_space_id: {search_space_id}, user_id: {user_id}"
+    )
+    logger.info(f"[process_file_upload] File path: {file_path}")
+
+    # Check if file exists and is accessible
+    if not os.path.exists(file_path):
+        logger.error(
+            f"[process_file_upload] File does not exist: {file_path}. "
+            "The temp file may have been cleaned up before the task ran."
+        )
+        return
+
+    try:
+        file_size = os.path.getsize(file_path)
+        logger.info(f"[process_file_upload] File size: {file_size} bytes")
+    except Exception as e:
+        logger.warning(f"[process_file_upload] Could not get file size: {e}")

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
@ -331,6 +353,15 @@ def process_file_upload_task(
        loop.run_until_complete(
            _process_file_upload(file_path, filename, search_space_id, user_id)
        )
+        logger.info(
+            f"[process_file_upload] Task completed successfully for: {filename}"
+        )
+    except Exception as e:
+        logger.error(
+            f"[process_file_upload] Task failed for {filename}: {e}\n"
+            f"Traceback:\n{traceback.format_exc()}"
+        )
+        raise
    finally:
        loop.close()

@ -343,16 +374,22 @@ async def _process_file_upload(

    from app.tasks.document_processors.file_processors import process_file_in_background

+    logger.info(f"[_process_file_upload] Starting async processing for: {filename}")
+
    async with get_celery_session_maker()() as session:
+        logger.info(f"[_process_file_upload] Database session created for: {filename}")
        task_logger = TaskLoggingService(session, search_space_id)

        # Get file size for notification metadata
        try:
            file_size = os.path.getsize(file_path)
-        except Exception:
+            logger.info(f"[_process_file_upload] File size: {file_size} bytes")
+        except Exception as e:
+            logger.warning(f"[_process_file_upload] Could not get file size: {e}")
            file_size = None

        # Create notification for document processing
+        logger.info(f"[_process_file_upload] Creating notification for: {filename}")
        notification = (
            await NotificationService.document_processing.notify_processing_started(
                session=session,
@ -363,6 +400,9 @@ async def _process_file_upload(
                file_size=file_size,
            )
        )
+        logger.info(
+            f"[_process_file_upload] Notification created with ID: {notification.id if notification else 'None'}"
+        )

        log_entry = await task_logger.log_task_start(
            task_name="process_file_upload",
@ -505,6 +545,7 @@ def process_circleback_meeting_task(
    markdown_content: str,
    metadata: dict,
    search_space_id: int,
+    connector_id: int | None = None,
 ):
    """
    Celery task to process Circleback meeting webhook data.
@ -515,6 +556,7 @@ def process_circleback_meeting_task(
        markdown_content: Meeting content formatted as markdown
        metadata: Meeting metadata dictionary
        search_space_id: ID of the search space
+        connector_id: ID of the Circleback connector (for deletion support)
    """
    import asyncio

@ -529,6 +571,7 @@ def process_circleback_meeting_task(
                markdown_content,
                metadata,
                search_space_id,
+                connector_id,
            )
        )
    finally:
@ -541,6 +584,7 @@ async def _process_circleback_meeting(
    markdown_content: str,
    metadata: dict,
    search_space_id: int,
+    connector_id: int | None = None,
 ):
    """Process Circleback meeting with new session."""
    from app.tasks.document_processors.circleback_processor import (
@ -597,6 +641,7 @@ async def _process_circleback_meeting(
                markdown_content=markdown_content,
                metadata=metadata,
                search_space_id=search_space_id,
+                connector_id=connector_id,
            )

            if result:
--- a/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py
@ -51,7 +51,10 @@ def _clear_generating_podcast(search_space_id: int) -> None:
    import redis

    try:
-        redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
+        redis_url = os.getenv(
+            "REDIS_APP_URL",
+            os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0"),
+        )
        client = redis.from_url(redis_url, decode_responses=True)
        key = f"podcast:generating:{search_space_id}"
        client.delete(key)
--- a/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py
@ -36,7 +36,10 @@ def get_redis_client() -> redis.Redis:
    """Get or create Redis client for heartbeat checking."""
    global _redis_client
    if _redis_client is None:
-        redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
+        redis_url = os.getenv(
+            "REDIS_APP_URL",
+            os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0"),
+        )
        _redis_client = redis.from_url(redis_url, decode_responses=True)
    return _redis_client

--- a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py
@ -417,6 +417,8 @@ async def index_airtable_records(
                                embedding=summary_embedding,
                                chunks=chunks,
                                updated_at=get_current_timestamp(),
+                                created_by_id=user_id,
+                                connector_id=connector_id,
                            )

                            session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
@ -396,6 +396,8 @@ async def index_bookstack_pages(
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
+                    created_by_id=user_id,
+                    connector_id=connector_id,
                )

                session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
@ -395,6 +395,8 @@ async def index_clickup_tasks(
                        embedding=summary_embedding,
                        chunks=chunks,
                        updated_at=get_current_timestamp(),
+                        created_by_id=user_id,
+                        connector_id=connector_id,
                    )

                    session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
@ -402,6 +402,8 @@ async def index_confluence_pages(
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
+                    created_by_id=user_id,
+                    connector_id=connector_id,
                )

                session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
@ -527,6 +527,8 @@ async def index_discord_messages(
                                content_hash=content_hash,
                                unique_identifier_hash=unique_identifier_hash,
                                updated_at=get_current_timestamp(),
+                                created_by_id=user_id,
+                                connector_id=connector_id,
                            )

                            session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py
@ -292,6 +292,8 @@ async def index_elasticsearch_documents(
                        document_metadata=metadata,
                        search_space_id=search_space_id,
                        updated_at=get_current_timestamp(),
+                        created_by_id=user_id,
+                        connector_id=connector_id,
                    )

                    # Create chunks and attach to document (persist via relationship)
--- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
@ -220,6 +220,7 @@ async def index_github_repos(
                    user_id=user_id,
                    task_logger=task_logger,
                    log_entry=log_entry,
+                    connector_id=connector_id,
                )

                documents_processed += docs_created
@ -292,6 +293,7 @@ async def _process_repository_digest(
    user_id: str,
    task_logger: TaskLoggingService,
    log_entry,
+    connector_id: int,
 ) -> int:
    """
    Process a repository digest and create documents.
@ -426,6 +428,8 @@ async def _process_repository_digest(
        search_space_id=search_space_id,
        chunks=chunks_data,
        updated_at=get_current_timestamp(),
+        created_by_id=user_id,
+        connector_id=connector_id,
    )

    session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@ -499,6 +499,8 @@ async def index_google_calendar_events(
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
+                    created_by_id=user_id,
+                    connector_id=connector_id,
                )

                session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@ -767,6 +767,7 @@ async def _process_single_file(
            session=session,
            task_logger=task_logger,
            log_entry=log_entry,
+            connector_id=connector_id,
        )

        if error:
--- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
@ -413,7 +413,6 @@ async def index_google_gmail_messages(
                        "subject": subject,
                        "sender": sender,
                        "date": date_str,
-                        "connector_id": connector_id,
                    },
                    content=summary_content,
                    content_hash=content_hash,
@ -421,6 +420,8 @@ async def index_google_gmail_messages(
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
+                    created_by_id=user_id,
+                    connector_id=connector_id,
                )
                session.add(document)
                documents_indexed += 1
--- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
@ -380,6 +380,8 @@ async def index_jira_issues(
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
+                    created_by_id=user_id,
+                    connector_id=connector_id,
                )

                session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
@ -413,6 +413,8 @@ async def index_linear_issues(
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
+                    created_by_id=user_id,
+                    connector_id=connector_id,
                )

                session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
@ -476,6 +476,8 @@ async def index_luma_events(
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
+                    created_by_id=user_id,
+                    connector_id=connector_id,
                )

                session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
@ -398,6 +398,7 @@ async def index_notion_pages(
                        }
                        existing_document.chunks = chunks
                        existing_document.updated_at = get_current_timestamp()
+                        existing_document.connector_id = connector_id

                        documents_indexed += 1
                        logger.info(f"Successfully updated Notion page: {page_title}")
@ -470,6 +471,8 @@ async def index_notion_pages(
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
+                    created_by_id=user_id,
+                    connector_id=connector_id,
                )

                session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py
@ -500,6 +500,8 @@ async def index_obsidian_vault(
                        embedding=embedding,
                        chunks=chunks,
                        updated_at=get_current_timestamp(),
+                        created_by_id=user_id,
+                        connector_id=connector_id,
                    )

                    session.add(new_document)
--- a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
@ -389,6 +389,8 @@ async def index_slack_messages(
                        content_hash=content_hash,
                        unique_identifier_hash=unique_identifier_hash,
                        updated_at=get_current_timestamp(),
+                        created_by_id=user_id,
+                        connector_id=connector_id,
                    )

                    session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py
@ -430,6 +430,8 @@ async def index_teams_messages(
                                content_hash=content_hash,
                                unique_identifier_hash=unique_identifier_hash,
                                updated_at=get_current_timestamp(),
+                                created_by_id=user_id,
+                                connector_id=connector_id,
                            )

                            session.add(document)
--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@ -371,6 +371,8 @@ async def index_crawled_urls(
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
+                    created_by_id=user_id,
+                    connector_id=connector_id,
                )

                session.add(document)
--- a/surfsense_backend/app/tasks/document_processors/circleback_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/circleback_processor.py
@ -8,10 +8,17 @@ and stores it as searchable documents in the database.
 import logging
 from typing import Any

+from sqlalchemy import select
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession

-from app.db import Document, DocumentType
+from app.db import (
+    Document,
+    DocumentType,
+    SearchSourceConnector,
+    SearchSourceConnectorType,
+    SearchSpace,
+)
 from app.services.llm_service import get_document_summary_llm
 from app.utils.document_converters import (
    create_document_chunks,
@ -35,6 +42,7 @@ async def add_circleback_meeting_document(
    markdown_content: str,
    metadata: dict[str, Any],
    search_space_id: int,
+    connector_id: int | None = None,
 ) -> Document | None:
    """
    Process and store a Circleback meeting document.
@ -46,6 +54,7 @@ async def add_circleback_meeting_document(
        markdown_content: Meeting content formatted as markdown
        metadata: Meeting metadata dictionary
        search_space_id: ID of the search space
+        connector_id: ID of the Circleback connector (for deletion support)

    Returns:
        Document object if successful, None if failed or duplicate
@ -125,6 +134,30 @@ async def add_circleback_meeting_document(
            **metadata,
        }

+        # Fetch the user who set up the Circleback connector (preferred)
+        # or fall back to search space owner if no connector found
+        created_by_user_id = None
+
+        # Try to find the Circleback connector for this search space
+        connector_result = await session.execute(
+            select(SearchSourceConnector.user_id).where(
+                SearchSourceConnector.search_space_id == search_space_id,
+                SearchSourceConnector.connector_type
+                == SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
+            )
+        )
+        connector_user = connector_result.scalar_one_or_none()
+
+        if connector_user:
+            # Use the user who set up the Circleback connector
+            created_by_user_id = connector_user
+        else:
+            # Fallback: use search space owner if no connector found
+            search_space_result = await session.execute(
+                select(SearchSpace.user_id).where(SearchSpace.id == search_space_id)
+            )
+            created_by_user_id = search_space_result.scalar_one_or_none()
+
        # Update or create document
        if existing_document:
            # Update existing document
@ -138,6 +171,9 @@ async def add_circleback_meeting_document(
            existing_document.blocknote_document = blocknote_json
            existing_document.content_needs_reindexing = False
            existing_document.updated_at = get_current_timestamp()
+            # Backfill connector_id on documents created before this field existed
+            if connector_id is not None:
+                existing_document.connector_id = connector_id

            await session.commit()
            await session.refresh(existing_document)
@ -160,6 +196,8 @@ async def add_circleback_meeting_document(
                blocknote_document=blocknote_json,
                content_needs_reindexing=False,
                updated_at=get_current_timestamp(),
+                created_by_id=created_by_user_id,
+                connector_id=connector_id,
            )

            session.add(document)
--- a/surfsense_backend/app/tasks/document_processors/extension_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/extension_processor.py
@ -185,6 +185,7 @@ async def add_extension_received_document(
                unique_identifier_hash=unique_identifier_hash,
                blocknote_document=blocknote_json,
                updated_at=get_current_timestamp(),
+                created_by_id=user_id,
            )

            session.add(document)
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -526,6 +526,8 @@ async def add_received_file_document_using_unstructured(
                blocknote_document=blocknote_json,
                content_needs_reindexing=False,
                updated_at=get_current_timestamp(),
+                created_by_id=user_id,
+                connector_id=connector.get("connector_id") if connector else None,
            )

            session.add(document)
@ -665,6 +667,8 @@ async def add_received_file_document_using_llamacloud(
                blocknote_document=blocknote_json,
                content_needs_reindexing=False,
                updated_at=get_current_timestamp(),
+                created_by_id=user_id,
+                connector_id=connector.get("connector_id") if connector else None,
            )

            session.add(document)
@ -829,6 +833,8 @@ async def add_received_file_document_using_docling(
                blocknote_document=blocknote_json,
                content_needs_reindexing=False,
                updated_at=get_current_timestamp(),
+                created_by_id=user_id,
+                connector_id=connector.get("connector_id") if connector else None,
            )

            session.add(document)
@ -849,7 +855,7 @@ async def add_received_file_document_using_docling(
 async def _update_document_from_connector(
    document: Document | None, connector: dict | None, session: AsyncSession
 ) -> None:
-    """Helper to update document type and metadata from connector info."""
+    """Helper to update document type, metadata, and connector_id from connector info."""
    if document and connector:
        if "type" in connector:
            document.document_type = connector["type"]
@ -861,6 +867,9 @@ async def _update_document_from_connector(
                # Expand existing metadata with connector metadata
                merged = {**document.document_metadata, **connector["metadata"]}
                document.document_metadata = merged
+        # Set connector_id if provided for de-indexing support
+        if "connector_id" in connector:
+            document.connector_id = connector["connector_id"]
        await session.commit()


--- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
@ -295,6 +295,8 @@ async def add_received_markdown_file_document(
                unique_identifier_hash=primary_hash,
                blocknote_document=blocknote_json,
                updated_at=get_current_timestamp(),
+                created_by_id=user_id,
+                connector_id=connector.get("connector_id") if connector else None,
            )

            session.add(document)
--- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py
@ -357,6 +357,7 @@ async def add_youtube_video_document(
                unique_identifier_hash=unique_identifier_hash,
                blocknote_document=blocknote_json,
                updated_at=get_current_timestamp(),
+                created_by_id=user_id,
            )

            session.add(document)
--- a/surfsense_backend/scripts/docker/entrypoint.sh
+++ b/surfsense_backend/scripts/docker/entrypoint.sh
@ -39,7 +39,7 @@ backend_pid=$!
 sleep 5

 echo "Starting Celery Worker..."
-celery -A app.celery_app worker --loglevel=info &
+celery -A app.celery_app worker --loglevel=info --autoscale=128,4 &
 celery_worker_pid=$!

 # Wait a bit for worker to initialize