Merge pull request #1509 from MODSetter/dev

feat(release: 0.0.29): ETL/embedding caches, unified model connections, reverse-proxy support, podcast & indexing improvements
2026-06-26 21:39:43 +02:00 · 2026-06-17 23:46:24 -07:00 · 2026-06-17 23:46:24 -07:00 · c941907448
commit c941907448
parent 77688ac80c 0729e5a915
408 changed files with 15877 additions and 16310 deletions
--- a/.github/workflows/desktop-release.yml
+++ b/.github/workflows/desktop-release.yml
@ -95,10 +95,12 @@ jobs:
        run: pnpm build
        working-directory: surfsense_web
        env:
-          NEXT_PUBLIC_FASTAPI_BACKEND_URL: ${{ vars.NEXT_PUBLIC_FASTAPI_BACKEND_URL }}
+          NEXT_PUBLIC_FASTAPI_BACKEND_URL: ${{ vars.HOSTED_BACKEND_URL }}
          SURFSENSE_BACKEND_INTERNAL_URL: ${{ vars.HOSTED_BACKEND_URL }}
          NEXT_PUBLIC_ZERO_CACHE_URL: ${{ vars.NEXT_PUBLIC_ZERO_CACHE_URL }}
          NEXT_PUBLIC_DEPLOYMENT_MODE: ${{ vars.NEXT_PUBLIC_DEPLOYMENT_MODE }}
-          NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: ${{ vars.NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE }}
+          NEXT_PUBLIC_AUTH_TYPE: ${{ vars.NEXT_PUBLIC_AUTH_TYPE }}
          NEXT_PUBLIC_ETL_SERVICE: ${{ vars.NEXT_PUBLIC_ETL_SERVICE }}
          NEXT_PUBLIC_POSTHOG_KEY: ${{ secrets.NEXT_PUBLIC_POSTHOG_KEY }}
      - name: Install desktop dependencies
@ -109,6 +111,7 @@ jobs:
        run: pnpm build
        working-directory: surfsense_desktop
        env:
          HOSTED_BACKEND_URL: ${{ vars.HOSTED_BACKEND_URL }}
          HOSTED_FRONTEND_URL: ${{ vars.HOSTED_FRONTEND_URL }}
          POSTHOG_KEY: ${{ secrets.POSTHOG_KEY }}
          POSTHOG_HOST: ${{ vars.POSTHOG_HOST }}
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@ -199,11 +199,6 @@ jobs:
          build-args: |
            ${{ matrix.image == 'backend' && format('USE_CUDA={0}', matrix.use_cuda) || '' }}
            ${{ matrix.image == 'backend' && format('CUDA_EXTRA={0}', matrix.cuda_extra) || '' }}
            ${{ matrix.image == 'web' && 'NEXT_PUBLIC_FASTAPI_BACKEND_URL=__NEXT_PUBLIC_FASTAPI_BACKEND_URL__' || '' }}
            ${{ matrix.image == 'web' && 'NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=__NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE__' || '' }}
            ${{ matrix.image == 'web' && 'NEXT_PUBLIC_ETL_SERVICE=__NEXT_PUBLIC_ETL_SERVICE__' || '' }}
            ${{ matrix.image == 'web' && 'NEXT_PUBLIC_ZERO_CACHE_URL=__NEXT_PUBLIC_ZERO_CACHE_URL__' || '' }}
            ${{ matrix.image == 'web' && 'NEXT_PUBLIC_DEPLOYMENT_MODE=__NEXT_PUBLIC_DEPLOYMENT_MODE__' || '' }}
      - name: Export digest
        run: |
--- a/.github/workflows/e2e-tests.yml
+++ b/.github/workflows/e2e-tests.yml
@ -27,9 +27,10 @@ jobs:
      PLAYWRIGHT_TEST_EMAIL: e2e-test@surfsense.net
      PLAYWRIGHT_TEST_PASSWORD: E2eTestPassword123!
      # Frontend env: Playwright's webServer (surfsense_web/playwright.config.ts)
-      # spawns `pnpm build && pnpm start` in CI; these get baked into the build.
+      # spawns `pnpm build && pnpm start` in CI.
      NEXT_PUBLIC_FASTAPI_BACKEND_URL: http://localhost:8000
-      NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: LOCAL
+      SURFSENSE_BACKEND_INTERNAL_URL: http://localhost:8000
      AUTH_TYPE: LOCAL
      # Shared secret for the test-only POST /__e2e__/auth/token endpoint.
      # Must match docker-compose.e2e.yml's backend env (x-backend-env).
      E2E_MINT_SECRET: e2e-mint-secret-not-for-production
--- a/2
+++ b/2
@ -1 +1 @@
-0.0.28
+0.0.29
--- a/docker/.env.example
+++ b/docker/.env.example
@ -30,6 +30,9 @@ SECRET_KEY=replace_me_with_a_random_string
 # Auth type: LOCAL (email/password) or GOOGLE (OAuth)
 AUTH_TYPE=LOCAL
 # Deployment mode: self-hosted enables local filesystem connectors; cloud hides them.
 DEPLOYMENT_MODE=self-hosted
 # Allow new user registrations (TRUE or FALSE)
 # REGISTRATION_ENABLED=TRUE
@ -43,51 +46,47 @@ ETL_SERVICE=DOCLING
 EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
 # ------------------------------------------------------------------------------
-# Ports (change to avoid conflicts with other services on your machine)
+# How You Access SurfSense
 # ------------------------------------------------------------------------------
-
+# One public URL. Browser traffic stays same-origin and Caddy routes internally.
-# BACKEND_PORT=8929
+SURFSENSE_PUBLIC_URL=http://localhost:3929
 # FRONTEND_PORT=3929
 # ZERO_CACHE_PORT=5929
 # SEARXNG_PORT=8888
 # FLOWER_PORT=5555
 # ==============================================================================
 # DEV COMPOSE ONLY (docker-compose.dev.yml)
 # You need these only if you are running `docker-compose.dev.yml`.
 # ==============================================================================
 # -- pgAdmin (database GUI) --
 # PGADMIN_PORT=5050
 # PGADMIN_DEFAULT_EMAIL=admin@surfsense.com
 # PGADMIN_DEFAULT_PASSWORD=surfsense
 # -- Redis exposed port (dev only; Redis is internal-only in prod) --
 # REDIS_PORT=6379
 # -- WhatsApp bridge exposed port (dev/hybrid only; prod keeps it Docker-internal) --
 # WHATSAPP_BRIDGE_PORT=9929
 # -- Frontend Build Args --
 # In dev, the frontend is built from source and these are passed as build args.
 # In prod, they are automatically derived from AUTH_TYPE, ETL_SERVICE, and the port settings above.
 # NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL
 # NEXT_PUBLIC_ETL_SERVICE=DOCLING
 # NEXT_PUBLIC_DEPLOYMENT_MODE=self-hosted
 # ------------------------------------------------------------------------------
-# Custom Domain / Reverse Proxy
+# Public Ports
 # ------------------------------------------------------------------------------
-# ONLY set these if you are serving SurfSense on a real domain via a reverse
+# Production Docker exposes only Caddy to your machine. Caddy then routes
-# proxy (e.g. Caddy, Nginx, Cloudflare Tunnel).
+# frontend, backend, and zero-cache traffic internally.
 # For standard localhost deployments, leave all of these commented out;
 # they are automatically derived from the port settings above.
 #
-# NEXT_FRONTEND_URL=https://app.yourdomain.com
+# Local default: LISTEN_HTTP_PORT=3929
-# BACKEND_URL=https://api.yourdomain.com
+# Domain default: LISTEN_HTTP_PORT=80 and LISTEN_HTTPS_PORT=443
-# NEXT_PUBLIC_FASTAPI_BACKEND_URL=https://api.yourdomain.com
+LISTEN_HTTP_PORT=3929
-# NEXT_PUBLIC_ZERO_CACHE_URL=https://zero.yourdomain.com
+LISTEN_HTTPS_PORT=443
-# FASTAPI_BACKEND_INTERNAL_URL=http://backend:8000
+
 # ------------------------------------------------------------------------------
 # Custom Domain / HTTPS
 # ------------------------------------------------------------------------------
 # Leave SURFSENSE_SITE_ADDRESS as :80 for local HTTP.
 # Set it to your domain to enable automatic HTTPS:
 # SURFSENSE_SITE_ADDRESS=surf.example.com
 # CERT_EMAIL=you@example.com
 SURFSENSE_SITE_ADDRESS=:80
 CERT_EMAIL=
 # ------------------------------------------------------------------------------
 # Advanced Reverse Proxy Settings
 # ------------------------------------------------------------------------------
 # Usually do not change these. They are for custom certificate setup, CDNs/load
 # balancers, trusted proxy IPs, or changing upload limits.
 #
 # CERT_ACME_CA=https://acme-v02.api.letsencrypt.org/directory
 # CERT_ACME_DNS=
 # If a CDN/load balancer sits in front of Caddy, narrow this to that proxy's CIDRs.
 # TRUSTED_PROXIES=0.0.0.0/0
 # SURFSENSE_MAX_BODY_SIZE=5GB
 #
 # Browser API and Zero URLs are same-origin relative behind bundled Caddy.
 # Next.js server-side calls use Docker DNS through SURFSENSE_BACKEND_INTERNAL_URL
 # set internally by docker-compose.yml. Usually do not override it.
 # ------------------------------------------------------------------------------
 # Zero-cache (real-time sync)
@ -108,10 +107,9 @@ EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
 # Sync worker tuning. zero-cache defaults ZERO_NUM_SYNC_WORKERS to the number
 # of CPU cores, which can exceed the connection pool limits on high-core machines.
-# Each sync worker needs at least 1 connection from both the UPSTREAM and CVR
+# Each sync worker needs at least 1 connection from both the UPSTREAM and CVR pools.
-# pools, so these constraints must hold:
+# Keep ZERO_UPSTREAM_MAX_CONNS and ZERO_CVR_MAX_CONNS greater than or equal to
-#   ZERO_UPSTREAM_MAX_CONNS >= ZERO_NUM_SYNC_WORKERS
+# ZERO_NUM_SYNC_WORKERS.
 #   ZERO_CVR_MAX_CONNS      >= ZERO_NUM_SYNC_WORKERS
 # Default of 4 workers is sufficient for self-hosted / personal use.
 # ZERO_NUM_SYNC_WORKERS=4
 # ZERO_UPSTREAM_MAX_CONNS=20
@ -125,16 +123,16 @@ EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
 # ZERO_QUERY_URL: where zero-cache forwards query requests for resolution.
 # ZERO_MUTATE_URL: required by zero-cache when auth tokens are used, even though
-#   SurfSense does not use Zero mutators. Setting both URLs tells zero-cache to
+# SurfSense does not use Zero mutators. Setting both URLs tells zero-cache to
-#   skip its own JWT verification and let the app endpoints handle auth instead.
+# skip its own JWT verification and let the app endpoints handle auth instead.
-#   The mutate endpoint is a no-op that returns an empty response.
+# The mutate endpoint is a no-op that returns an empty response.
 # Default: Docker service networking (http://frontend:3000/api/zero/...).
 # Override when running the frontend outside Docker:
-#   ZERO_QUERY_URL=http://host.docker.internal:3000/api/zero/query
+# ZERO_QUERY_URL=http://host.docker.internal:3000/api/zero/query
-#   ZERO_MUTATE_URL=http://host.docker.internal:3000/api/zero/mutate
+# ZERO_MUTATE_URL=http://host.docker.internal:3000/api/zero/mutate
-# Override for custom domain:
+# Override for custom domain only when zero-cache is not in the bundled Docker network:
-#   ZERO_QUERY_URL=https://app.yourdomain.com/api/zero/query
+# ZERO_QUERY_URL=https://surf.example.com/api/zero/query
-#   ZERO_MUTATE_URL=https://app.yourdomain.com/api/zero/mutate
+# ZERO_MUTATE_URL=https://surf.example.com/api/zero/mutate
 # ZERO_QUERY_URL=http://frontend:3000/api/zero/query
 # ZERO_MUTATE_URL=http://frontend:3000/api/zero/mutate
@ -222,73 +220,74 @@ STT_SERVICE=local/base
 # ------------------------------------------------------------------------------
 # -- Google Connectors --
-# GOOGLE_CALENDAR_REDIRECT_URI=http://localhost:8000/api/v1/auth/google/calendar/connector/callback
+# GOOGLE_CALENDAR_REDIRECT_URI=http://localhost:3929/api/v1/auth/google/calendar/connector/callback
-# GOOGLE_GMAIL_REDIRECT_URI=http://localhost:8000/api/v1/auth/google/gmail/connector/callback
+# GOOGLE_GMAIL_REDIRECT_URI=http://localhost:3929/api/v1/auth/google/gmail/connector/callback
-# GOOGLE_DRIVE_REDIRECT_URI=http://localhost:8000/api/v1/auth/google/drive/connector/callback
+# GOOGLE_DRIVE_REDIRECT_URI=http://localhost:3929/api/v1/auth/google/drive/connector/callback
 # -- Notion --
 # NOTION_CLIENT_ID=
 # NOTION_CLIENT_SECRET=
-# NOTION_REDIRECT_URI=http://localhost:8000/api/v1/auth/notion/connector/callback
+# NOTION_REDIRECT_URI=http://localhost:3929/api/v1/auth/notion/connector/callback
 # -- Slack --
 # SLACK_CLIENT_ID=
 # SLACK_CLIENT_SECRET=
-# SLACK_REDIRECT_URI=http://localhost:8000/api/v1/auth/slack/connector/callback
+# SLACK_REDIRECT_URI=http://localhost:3929/api/v1/auth/slack/connector/callback
 # -- Discord --
 # DISCORD_CLIENT_ID=
 # DISCORD_CLIENT_SECRET=
-# DISCORD_REDIRECT_URI=http://localhost:8000/api/v1/auth/discord/connector/callback
+# DISCORD_REDIRECT_URI=http://localhost:3929/api/v1/auth/discord/connector/callback
 # DISCORD_BOT_TOKEN=
 # -- Atlassian (Jira & Confluence) --
 # ATLASSIAN_CLIENT_ID=
 # ATLASSIAN_CLIENT_SECRET=
-# JIRA_REDIRECT_URI=http://localhost:8000/api/v1/auth/jira/connector/callback
+# JIRA_REDIRECT_URI=http://localhost:3929/api/v1/auth/jira/connector/callback
-# CONFLUENCE_REDIRECT_URI=http://localhost:8000/api/v1/auth/confluence/connector/callback
+# CONFLUENCE_REDIRECT_URI=http://localhost:3929/api/v1/auth/confluence/connector/callback
 # -- Linear --
 # LINEAR_CLIENT_ID=
 # LINEAR_CLIENT_SECRET=
-# LINEAR_REDIRECT_URI=http://localhost:8000/api/v1/auth/linear/connector/callback
+# LINEAR_REDIRECT_URI=http://localhost:3929/api/v1/auth/linear/connector/callback
 # -- ClickUp --
 # CLICKUP_CLIENT_ID=
 # CLICKUP_CLIENT_SECRET=
-# CLICKUP_REDIRECT_URI=http://localhost:8000/api/v1/auth/clickup/connector/callback
+# CLICKUP_REDIRECT_URI=http://localhost:3929/api/v1/auth/clickup/connector/callback
 # -- Airtable --
 # AIRTABLE_CLIENT_ID=
 # AIRTABLE_CLIENT_SECRET=
-# AIRTABLE_REDIRECT_URI=http://localhost:8000/api/v1/auth/airtable/connector/callback
+# AIRTABLE_REDIRECT_URI=http://localhost:3929/api/v1/auth/airtable/connector/callback
 # -- Microsoft OAuth (Teams & OneDrive) --
 # MICROSOFT_CLIENT_ID=
 # MICROSOFT_CLIENT_SECRET=
-# TEAMS_REDIRECT_URI=http://localhost:8000/api/v1/auth/teams/connector/callback
+# TEAMS_REDIRECT_URI=http://localhost:3929/api/v1/auth/teams/connector/callback
-# ONEDRIVE_REDIRECT_URI=http://localhost:8000/api/v1/auth/onedrive/connector/callback
+# ONEDRIVE_REDIRECT_URI=http://localhost:3929/api/v1/auth/onedrive/connector/callback
 # -- Dropbox --
 # DROPBOX_APP_KEY=
 # DROPBOX_APP_SECRET=
-# DROPBOX_REDIRECT_URI=http://localhost:8000/api/v1/auth/dropbox/connector/callback
+# DROPBOX_REDIRECT_URI=http://localhost:3929/api/v1/auth/dropbox/connector/callback
 # -- Composio --
 # COMPOSIO_API_KEY=
 # COMPOSIO_ENABLED=TRUE
-# COMPOSIO_REDIRECT_URI=http://localhost:8000/api/v1/auth/composio/connector/callback
+# COMPOSIO_REDIRECT_URI=http://localhost:3929/api/v1/auth/composio/connector/callback
 # ------------------------------------------------------------------------------
 # Messaging Channels (optional)
 # ------------------------------------------------------------------------------
 # Configure only the external chat channels you want to use.
 # GATEWAY_ENABLED=TRUE
 # -- Telegram --
 # TELEGRAM_SHARED_BOT_TOKEN=
 # TELEGRAM_SHARED_BOT_USERNAME=
 # TELEGRAM_WEBHOOK_SECRET=
-# GATEWAY_BASE_URL=http://localhost:8929
+# GATEWAY_BASE_URL=http://localhost:3929
 # GATEWAY_TELEGRAM_INTAKE_MODE=webhook
 # -- WhatsApp --
@ -307,20 +306,20 @@ STT_SERVICE=local/base
 #
 # GATEWAY_SLACK_ENABLED=FALSE
 # GATEWAY_SLACK_SIGNING_SECRET=
-# GATEWAY_SLACK_REDIRECT_URI=http://localhost:8929/api/v1/gateway/slack/callback
+# GATEWAY_SLACK_REDIRECT_URI=http://localhost:3929/api/v1/gateway/slack/callback
 # -- Discord --
 # Uses DISCORD_CLIENT_ID, DISCORD_CLIENT_SECRET, and DISCORD_BOT_TOKEN from the
 # Discord connector section.
 #
 # GATEWAY_DISCORD_ENABLED=FALSE
-# GATEWAY_DISCORD_REDIRECT_URI=http://localhost:8929/api/v1/gateway/discord/callback
+# GATEWAY_DISCORD_REDIRECT_URI=http://localhost:3929/api/v1/gateway/discord/callback
 # ------------------------------------------------------------------------------
 # SearXNG (bundled web search, works out of the box with no config needed)
 # ------------------------------------------------------------------------------
 # SearXNG provides web search to all search spaces automatically.
-# To access the SearXNG UI directly: http://localhost:8888
+# To access the SearXNG UI directly in dev/deps-only compose: http://localhost:8888
 # To disable the service entirely: docker compose up --scale searxng=0
 # To point at your own SearXNG instance instead of the bundled one:
 # SEARXNG_DEFAULT_HOST=http://your-searxng:8080
@ -457,3 +456,36 @@ NOLOGIN_MODE_ENABLED=FALSE
 # RESIDENTIAL_PROXY_HOSTNAME=
 # RESIDENTIAL_PROXY_LOCATION=
 # RESIDENTIAL_PROXY_TYPE=1
 # ==============================================================================
 # DEV / DEPS-ONLY COMPOSE OVERRIDES
 # These are only needed for docker-compose.dev.yml or docker-compose.deps-only.yml.
 # Production Docker exposes Caddy only; raw app ports below do not affect
 # docker-compose.yml.
 # ==============================================================================
 # -- pgAdmin (database GUI, dev/deps-only only) --
 # PGADMIN_PORT=5050
 # PGADMIN_DEFAULT_EMAIL=admin@surfsense.com
 # PGADMIN_DEFAULT_PASSWORD=surfsense
 # -- Redis exposed port (dev/deps-only only; Redis is internal-only in prod) --
 # REDIS_PORT=6379
 # -- SearXNG exposed port (dev/deps-only only; internal-only in prod) --
 # SEARXNG_PORT=8888
 # -- WhatsApp bridge exposed port (dev/hybrid only; prod keeps it Docker-internal) --
 # WHATSAPP_BRIDGE_PORT=9929
 # -- Raw app ports (dev/deps-only only; prod exposes Caddy instead) --
 # BACKEND_PORT=8000
 # FRONTEND_PORT=3000
 # ZERO_CACHE_PORT=4848
 # -- Frontend runtime flags (prod and dev compose) --
 # The frontend reads these at request time in Docker; no NEXT_PUBLIC_* rebuild
 # or startup substitution is required.
 # AUTH_TYPE=LOCAL
 # ETL_SERVICE=DOCLING
 # DEPLOYMENT_MODE=self-hosted
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@ -257,16 +257,15 @@ services:
  frontend:
    build:
      context: ../surfsense_web
      args:
        NEXT_PUBLIC_FASTAPI_BACKEND_URL: ${NEXT_PUBLIC_FASTAPI_BACKEND_URL:-http://localhost:8000}
        NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: ${NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE:-LOCAL}
        NEXT_PUBLIC_ETL_SERVICE: ${NEXT_PUBLIC_ETL_SERVICE:-DOCLING}
        NEXT_PUBLIC_ZERO_CACHE_URL: ${NEXT_PUBLIC_ZERO_CACHE_URL:-http://localhost:${ZERO_CACHE_PORT:-4848}}
        NEXT_PUBLIC_DEPLOYMENT_MODE: ${NEXT_PUBLIC_DEPLOYMENT_MODE:-self-hosted}
    ports:
      - "${FRONTEND_PORT:-3000}:3000"
    env_file:
      - ../surfsense_web/.env
    environment:
      AUTH_TYPE: ${AUTH_TYPE:-LOCAL}
      ETL_SERVICE: ${ETL_SERVICE:-DOCLING}
      DEPLOYMENT_MODE: ${DEPLOYMENT_MODE:-self-hosted}
      SURFSENSE_BACKEND_INTERNAL_URL: http://backend:8000
    depends_on:
      backend:
        condition: service_healthy
--- a/docker/docker-compose.proxy.yml
+++ b/docker/docker-compose.proxy.yml
@ -0,0 +1,54 @@
 # =============================================================================
 # SurfSense — Optional Caddy reverse-proxy overlay
 # =============================================================================
 # Usage (from docker/):
 #   PROXY_HTTP_PORT=8080 SURFSENSE_PUBLIC_URL=http://localhost:8080 \
 #     docker compose -f docker-compose.yml -f docker-compose.proxy.yml up -d
 #
 # This overlay is for validation and custom deployments. The production
 # docker-compose.yml includes Caddy by default.
 # =============================================================================
 services:
  backend:
    ports:
      - "${BACKEND_PORT:-8929}:8000"
  zero-cache:
    ports:
      - "${ZERO_CACHE_PORT:-5929}:4848"
  frontend:
    ports:
      - "${FRONTEND_PORT:-3929}:3000"
  proxy:
    image: caddy:2-alpine
    restart: unless-stopped
    ports:
      - "${PROXY_HTTP_PORT:-8080}:80"
      - "${PROXY_HTTPS_PORT:-8443}:443"
    volumes:
      - ./proxy/Caddyfile:/etc/caddy/Caddyfile:ro
      - caddy_data:/data
      - caddy_config:/config
    environment:
      SURFSENSE_SITE_ADDRESS: ${SURFSENSE_SITE_ADDRESS:-:80}
      CERT_EMAIL: ${CERT_EMAIL:-}
      CERT_ACME_CA: ${CERT_ACME_CA:-https://acme-v02.api.letsencrypt.org/directory}
      CERT_ACME_DNS: ${CERT_ACME_DNS:-}
      TRUSTED_PROXIES: ${TRUSTED_PROXIES:-0.0.0.0/0}
      SURFSENSE_MAX_BODY_SIZE: ${SURFSENSE_MAX_BODY_SIZE:-5GB}
    depends_on:
      frontend:
        condition: service_started
      backend:
        condition: service_healthy
      zero-cache:
        condition: service_healthy
 volumes:
  caddy_data:
    name: surfsense-caddy-data
  caddy_config:
    name: surfsense-caddy-config
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@ -94,10 +94,39 @@ services:
      timeout: 5s
      retries: 5
  # Single public entry point for the Docker stack. Comment this service out
  # only if you front SurfSense with your own reverse proxy.
  proxy:
    image: caddy:2-alpine
    # For DNS-01/wildcard certificates, replace image with:
    # build: ./proxy
    restart: unless-stopped
    ports:
      - "${LISTEN_HTTP_PORT:-3929}:80"
      - "${LISTEN_HTTPS_PORT:-443}:443"
    volumes:
      - ./proxy/Caddyfile:/etc/caddy/Caddyfile:ro
      - caddy_data:/data
      - caddy_config:/config
    environment:
      SURFSENSE_SITE_ADDRESS: ${SURFSENSE_SITE_ADDRESS:-:80}
      CERT_EMAIL: ${CERT_EMAIL:-}
      CERT_ACME_CA: ${CERT_ACME_CA:-https://acme-v02.api.letsencrypt.org/directory}
      CERT_ACME_DNS: ${CERT_ACME_DNS:-}
      TRUSTED_PROXIES: ${TRUSTED_PROXIES:-0.0.0.0/0}
      SURFSENSE_MAX_BODY_SIZE: ${SURFSENSE_MAX_BODY_SIZE:-5GB}
    depends_on:
      frontend:
        condition: service_started
      backend:
        condition: service_healthy
      zero-cache:
        condition: service_healthy
  backend:
    image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest}${SURFSENSE_VARIANT:+-${SURFSENSE_VARIANT}}
-    ports:
+    expose:
-      - "${BACKEND_PORT:-8929}:8000"
+      - "8000"
    volumes:
      - shared_temp:/shared_tmp
      - object_store:/app/.local_object_store
@ -115,7 +144,8 @@ services:
      UVICORN_LOOP: asyncio
      UNSTRUCTURED_HAS_PATCHED_LOOP: "1"
      FILE_STORAGE_LOCAL_PATH: /app/.local_object_store
-      NEXT_FRONTEND_URL: ${NEXT_FRONTEND_URL:-http://localhost:${FRONTEND_PORT:-3929}}
+      NEXT_FRONTEND_URL: ${NEXT_FRONTEND_URL:-${SURFSENSE_PUBLIC_URL:-http://localhost:${LISTEN_HTTP_PORT:-3929}}}
      BACKEND_URL: ${BACKEND_URL:-${SURFSENSE_PUBLIC_URL:-http://localhost:${LISTEN_HTTP_PORT:-3929}}}
      SEARXNG_DEFAULT_HOST: ${SEARXNG_DEFAULT_HOST:-http://searxng:8080}
      WHATSAPP_BRIDGE_URL: ${WHATSAPP_BRIDGE_URL:-http://whatsapp-bridge:9929}
      # Daytona Sandbox – uncomment and set credentials to enable cloud code execution
@ -221,8 +251,8 @@ services:
  zero-cache:
    image: rocicorp/zero:1.4.0
-    ports:
+    expose:
-      - "${ZERO_CACHE_PORT:-5929}:4848"
+      - "4848"
    extra_hosts:
      - "host.docker.internal:host-gateway"
    environment:
@ -256,16 +286,13 @@ services:
  frontend:
    image: ghcr.io/modsetter/surfsense-web:${SURFSENSE_VERSION:-latest}
-    ports:
+    expose:
-      - "${FRONTEND_PORT:-3929}:3000"
+      - "3000"
    environment:
-      NEXT_PUBLIC_FASTAPI_BACKEND_URL: ${NEXT_PUBLIC_FASTAPI_BACKEND_URL:-http://localhost:${BACKEND_PORT:-8929}}
+      AUTH_TYPE: ${AUTH_TYPE:-LOCAL}
-      NEXT_PUBLIC_ZERO_CACHE_URL: ${NEXT_PUBLIC_ZERO_CACHE_URL:-http://localhost:${ZERO_CACHE_PORT:-5929}}
+      ETL_SERVICE: ${ETL_SERVICE:-DOCLING}
-      NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: ${AUTH_TYPE:-LOCAL}
+      DEPLOYMENT_MODE: ${DEPLOYMENT_MODE:-self-hosted}
-      NEXT_PUBLIC_ETL_SERVICE: ${ETL_SERVICE:-DOCLING}
+      SURFSENSE_BACKEND_INTERNAL_URL: http://backend:8000
      NEXT_PUBLIC_DEPLOYMENT_MODE: ${DEPLOYMENT_MODE:-self-hosted}
      NEXT_PUBLIC_WHATSAPP_DISPLAY_PHONE_NUMBER: ${WHATSAPP_SHARED_DISPLAY_PHONE_NUMBER:-}
      FASTAPI_BACKEND_INTERNAL_URL: ${FASTAPI_BACKEND_INTERNAL_URL:-http://backend:8000}
    labels:
      - "com.centurylinklabs.watchtower.enable=true"
    depends_on:
@ -286,5 +313,9 @@ volumes:
    name: surfsense-object-store
  zero_cache_data:
    name: surfsense-zero-cache
  caddy_data:
    name: surfsense-caddy-data
  caddy_config:
    name: surfsense-caddy-config
  whatsapp_sessions:
    name: surfsense-whatsapp-sessions
--- a/docker/proxy/Caddyfile
+++ b/docker/proxy/Caddyfile
@ -0,0 +1,45 @@
 {
 	# Optional ACME/global settings. These are harmless in the default :80
 	# localhost mode and become active when SURFSENSE_SITE_ADDRESS is a domain.
 	{$CERT_EMAIL}
 	acme_ca {$CERT_ACME_CA:https://acme-v02.api.letsencrypt.org/directory}
 	{$CERT_ACME_DNS}
 	servers {
 		client_ip_headers X-Forwarded-For X-Real-IP
 		trusted_proxies static {$TRUSTED_PROXIES:0.0.0.0/0}
 	}
 }
 (surfsense_proxy) {
 	request_body {
 		max_size {$SURFSENSE_MAX_BODY_SIZE:5GB}
 	}
 	# Frontend-owned auth page (the post-login token handler). More specific than
 	# /auth/*, so Caddy's matcher-specificity sort routes it here, not to backend.
 	reverse_proxy /auth/callback* frontend:3000
 	# Backend auth routes (FastAPI Users + OAuth helpers).
 	reverse_proxy /auth/* backend:8000
 	# Backend user profile routes (FastAPI Users users router, mounted at /users).
 	reverse_proxy /users/* backend:8000
 	# Backend REST, streaming, connector OAuth, and messaging gateway endpoints.
 	# FastAPI already serves /api/v1, so the path is forwarded unchanged.
 	reverse_proxy /api/v1/* backend:8000 {
 		flush_interval -1
 	}
 	# Zero accepts a single path-component base URL (Zero >= 0.6).
 	# Preserve /zero so browser cacheURL can be ${SURFSENSE_PUBLIC_URL}/zero.
 	reverse_proxy /zero/* zero-cache:4848
 	# Next.js app and frontend-owned API routes:
 	# /api/zero/*, /api/search, /api/contact, etc.
 	reverse_proxy /* frontend:3000
 }
 {$SURFSENSE_SITE_ADDRESS::80} {
 	import surfsense_proxy
 }
--- a/docker/proxy/Dockerfile
+++ b/docker/proxy/Dockerfile
@ -0,0 +1,10 @@
 FROM caddy:2-builder-alpine AS builder
 RUN xcaddy build \
 	--with github.com/caddy-dns/cloudflare \
 	--with github.com/caddy-dns/digitalocean
 FROM caddy:2-alpine
 COPY --from=builder /usr/bin/caddy /usr/bin/caddy
 COPY Caddyfile /etc/caddy/Caddyfile
--- a/docker/scripts/install.sh
+++ b/docker/scripts/install.sh
@ -333,11 +333,13 @@ step "Downloading SurfSense files"
 info "Installation directory: ${INSTALL_DIR}"
 mkdir -p "${INSTALL_DIR}/scripts"
 mkdir -p "${INSTALL_DIR}/searxng"
 mkdir -p "${INSTALL_DIR}/proxy"
 FILES=(
    "docker/docker-compose.yml:docker-compose.yml"
    "docker/docker-compose.gpu.yml:docker-compose.gpu.yml"
    "docker/.env.example:.env.example"
    "docker/proxy/Caddyfile:proxy/Caddyfile"
    "docker/postgresql.conf:postgresql.conf"
    "docker/scripts/migrate-database.sh:scripts/migrate-database.sh"
    "docker/searxng/settings.yml:searxng/settings.yml"
@ -532,9 +534,12 @@ _variant_display=$(grep '^SURFSENSE_VARIANT=' "${INSTALL_DIR}/.env" 2>/dev/null
 _variant_display="${_variant_display:-cpu}"
 step "SurfSense is now installed [${_version_display}]"
-info "  Frontend:  http://localhost:3929"
+_public_url=$(grep '^SURFSENSE_PUBLIC_URL=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2- | tr -d '"' | head -1 || true)
-info "  Backend:   http://localhost:8929"
+_public_url="${_public_url:-http://localhost:3929}"
-info "  API Docs:  http://localhost:8929/docs"
+
 info "  SurfSense: ${_public_url}"
 info "  Backend:   ${_public_url}/api/v1"
 info "  Zero sync: ${_public_url}/zero"
 info ""
 info "  Config:    ${INSTALL_DIR}/.env"
 info "  Variant:   ${_variant_display}"
--- a/surfsense_backend/.env.example
+++ b/surfsense_backend/.env.example
@ -30,12 +30,9 @@ CELERY_TASK_DEFAULT_QUEUE=surfsense
 # Optional: TTL in seconds for connector indexing lock key
 # CONNECTOR_INDEXING_LOCK_TTL_SECONDS=28800
-# Messaging Gateway (global)
+# Messaging Gateway: disabled by default; set TRUE to enable chat integrations.
-# GATEWAY_ENABLED: master switch for ALL messaging gateway channels (Telegram, WhatsApp,
+# Supported messaging gateways: WhatsApp, Telegram, Discord, Slack
-# Slack, Discord). When FALSE, no gateway background workers/supervisors start and all
+# GATEWAY_ENABLED=TRUE
 # gateway HTTP routes (webhooks, OAuth callbacks, pairing) return 404. Set per-channel
 # flags below to control individual platforms once the gateway is enabled.
 GATEWAY_ENABLED=TRUE
 # Telegram Gateway
 # TELEGRAM_WEBHOOK_SECRET must be 1-256 chars and contain only A-Z, a-z, 0-9, _ or -
@ -326,6 +323,42 @@ FILE_STORAGE_BACKEND=local
 # AZURE_STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...;EndpointSuffix=core.windows.net
 # AZURE_STORAGE_CONTAINER=surfsense-documents
 # ETL Parse Cache
 # Reuse parser output for identical file bytes across workspaces (skips paid
 # re-parsing on LlamaCloud / Azure DI / Unstructured). Off by default.
 ETL_CACHE_ENABLED=false
 # Bump to invalidate all cached entries after a parser/behaviour change.
 # ETL_CACHE_PARSER_VERSION=1
 # Prune entries unused for this many days.
 # ETL_CACHE_TTL_DAYS=90
 # Soft cap on total cached markdown; coldest entries are evicted past it.
 # ETL_CACHE_MAX_TOTAL_MB=5120
 # Rows deleted per eviction pass.
 # ETL_CACHE_EVICTION_BATCH=500
 # Optional dedicated blob storage; unset reuses the main file storage backend.
 # ETL_CACHE_STORAGE_BACKEND=azure
 # ETL_CACHE_STORAGE_CONTAINER=surfsense-etl-cache
 # ETL_CACHE_STORAGE_LOCAL_PATH=/var/lib/surfsense/etl-cache
 # Embedding Cache
 # Reuse chunk+embedding output for identical markdown across workspaces (skips
 # re-chunking and re-embedding). Blobs share the ETL_CACHE_STORAGE_* backend.
 # Off by default.
 EMBEDDING_CACHE_ENABLED=false
 # Bump to invalidate all cached embedding sets after a chunker change.
 # EMBEDDING_CACHE_CHUNKER_VERSION=1
 # Prune entries unused for this many days.
 # EMBEDDING_CACHE_TTL_DAYS=90
 # Soft cap on total cached embeddings; coldest entries are evicted past it.
 # EMBEDDING_CACHE_MAX_TOTAL_MB=5120
 # Rows deleted per eviction pass.
 # EMBEDDING_CACHE_EVICTION_BATCH=500
 # Incremental re-indexing: on document edits, keep chunks whose text is
 # unchanged (reusing their embeddings) and embed only new/changed ones.
 # Set to false to fall back to delete-all + full re-embed (kill switch).
 # CHUNK_RECONCILE_ENABLED=true
 # Daytona Sandbox (isolated code execution)
 # DAYTONA_SANDBOX_ENABLED=FALSE
 # DAYTONA_API_KEY=your-daytona-api-key
@ -365,7 +398,9 @@ LANGSMITH_PROJECT=surfsense
 # SURFSENSE_ENABLE_LLM_TOOL_SELECTOR=false   # adds a per-turn LLM call
 # Observability - OTel
-# SURFSENSE_ENABLE_OTEL=false
+# Disabled by default. Uncomment to enable OpenTelemetry.
 # SURFSENSE_ENABLE_OTEL=true
 # OpenTelemetry - endpoint enables export; absent = no-op.
 # Production should point at an OTel Collector. For local docker-compose.dev.yml,
 # use http://otel-lgtm:4317 instead.
--- a/surfsense_backend/alembic/versions/138_add_thread_auto_model_pinning_fields.py
+++ b/surfsense_backend/alembic/versions/138_add_thread_auto_model_pinning_fields.py
@ -4,7 +4,7 @@ Revision ID: 138
 Revises: 137
 Create Date: 2026-04-30
-Add a single thread-level column to persist the Auto (Fastest) model pin:
+Add a single thread-level column to persist the Auto model pin:
 - pinned_llm_config_id: concrete resolved global LLM config id used for this
  thread. NULL means "no pin; Auto will resolve on next turn".
--- a/surfsense_backend/alembic/versions/158_evolve_podcasts_lifecycle.py
+++ b/surfsense_backend/alembic/versions/158_evolve_podcasts_lifecycle.py
@ -15,6 +15,19 @@ down_revision: str | None = "157"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 PUBLICATION_NAME = "zero_publication"
 TARGET_STATUS_LABELS = (
    "pending",
    "awaiting_brief",
    "drafting",
    "awaiting_review",
    "rendering",
    "ready",
    "failed",
    "cancelled",
 )
 LEGACY_STATUS_LABELS = ("pending", "generating", "ready", "failed")
 def _drop_podcasts_from_publication() -> None:
    """Detach podcasts from zero_publication so status can be retyped.
@ -28,31 +41,103 @@ def _drop_podcasts_from_publication() -> None:
    published = conn.execute(
        sa.text(
            "SELECT 1 FROM pg_publication_tables "
-            "WHERE pubname = 'zero_publication' "
+            "WHERE pubname = :publication "
            "AND schemaname = current_schema() AND tablename = 'podcasts'"
-        )
+        ),
        {"publication": PUBLICATION_NAME},
    ).fetchone()
    if published:
-        op.execute('ALTER PUBLICATION "zero_publication" DROP TABLE "podcasts";')
+        op.execute(f'ALTER PUBLICATION "{PUBLICATION_NAME}" DROP TABLE "podcasts";')
-def upgrade() -> None:
+def _enum_labels(type_name: str) -> list[str] | None:
-    _drop_podcasts_from_publication()
+    rows = (
        op.get_bind()
        .execute(
            sa.text(
                "SELECT e.enumlabel "
                "FROM pg_type t "
                "JOIN pg_namespace n ON n.oid = t.typnamespace "
                "JOIN pg_enum e ON e.enumtypid = t.oid "
                "WHERE n.nspname = current_schema() AND t.typname = :type_name "
                "ORDER BY e.enumsortorder"
            ),
            {"type_name": type_name},
        )
        .fetchall()
    )
    if not rows:
        return None
    return [str(row[0]) for row in rows]
-    # Retype the status enum by swapping in a fresh type and casting existing
+
-    # rows. The legacy transient value 'generating' maps onto 'rendering'.
+def _column_type_name(table: str, column: str) -> str | None:
-    op.execute("ALTER TYPE podcast_status RENAME TO podcast_status_old;")
+    row = (
        op.get_bind()
        .execute(
            sa.text(
                "SELECT udt_name "
                "FROM information_schema.columns "
                "WHERE table_schema = current_schema() "
                "AND table_name = :table AND column_name = :column"
            ),
            {"table": table, "column": column},
        )
        .fetchone()
    )
    return str(row[0]) if row else None
 def _ensure_status_enum(
    *,
    desired_labels: tuple[str, ...],
    temporary_type: str,
    create_sql: str,
    alter_sql: str,
    default_value: str,
 ) -> None:
    current_labels = _enum_labels("podcast_status")
    desired = list(desired_labels)
    if current_labels != desired:
        if current_labels is None:
            if _enum_labels(temporary_type) is None:
                raise RuntimeError("podcast_status enum is missing")
        elif _enum_labels(temporary_type) is None:
            op.execute(f"ALTER TYPE podcast_status RENAME TO {temporary_type};")
        else:
            raise RuntimeError(
                "podcast_status and its temporary replacement both exist"
            )
        if _enum_labels("podcast_status") is None:
            op.execute(create_sql)
    if _enum_labels("podcast_status") != desired:
        raise RuntimeError("podcast_status enum is not in the expected shape")
    op.execute("ALTER TABLE podcasts ALTER COLUMN status DROP DEFAULT;")
    if _column_type_name("podcasts", "status") != "podcast_status":
        op.execute(alter_sql)
    op.execute(
-        """
+        f"ALTER TABLE podcasts ALTER COLUMN status SET DEFAULT '{default_value}';"
    )
    if _enum_labels(temporary_type) is not None:
        op.execute(f"DROP TYPE {temporary_type};")
 def _upgrade_status_enum() -> None:
    _ensure_status_enum(
        desired_labels=TARGET_STATUS_LABELS,
        temporary_type="podcast_status_old",
        create_sql="""
        CREATE TYPE podcast_status AS ENUM (
            'pending', 'awaiting_brief', 'drafting', 'awaiting_review',
            'rendering', 'ready', 'failed', 'cancelled'
        );
-        """
+        """,
-    )
+        alter_sql="""
    op.execute("ALTER TABLE podcasts ALTER COLUMN status DROP DEFAULT;")
    op.execute(
        """
        ALTER TABLE podcasts
            ALTER COLUMN status TYPE podcast_status
            USING (
@ -61,10 +146,43 @@ def upgrade() -> None:
                    ELSE status::text
                END
            )::podcast_status;
-        """
+        """,
        default_value="pending",
    )
-    op.execute("ALTER TABLE podcasts ALTER COLUMN status SET DEFAULT 'pending';")
+
-    op.execute("DROP TYPE podcast_status_old;")
+
 def _downgrade_status_enum() -> None:
    """Retype ``podcasts.status`` back onto the four legacy enum labels.

    Delegates to ``_ensure_status_enum`` with ``LEGACY_STATUS_LABELS`` as the
    desired shape and ``podcast_status_new`` as the temporary type name.
    """
    _ensure_status_enum(
        desired_labels=LEGACY_STATUS_LABELS,
        temporary_type="podcast_status_new",
        create_sql=(
            "CREATE TYPE podcast_status AS ENUM "
            "('pending', 'generating', 'ready', 'failed');"
        ),
        # The expanded lifecycle values have no legacy counterpart, so the
        # USING clause folds each onto one of the four legacy labels; values
        # that already exist in the legacy enum pass through via ELSE.
        alter_sql="""
        ALTER TABLE podcasts
            ALTER COLUMN status TYPE podcast_status
            USING (
                CASE status::text
                    WHEN 'awaiting_brief' THEN 'pending'
                    WHEN 'drafting' THEN 'generating'
                    WHEN 'awaiting_review' THEN 'generating'
                    WHEN 'rendering' THEN 'generating'
                    WHEN 'cancelled' THEN 'failed'
                    ELSE status::text
                END
            )::podcast_status;
        """,
        # The downgraded column default is 'ready' (the upgrade path uses 'pending').
        default_value="ready",
    )
 def upgrade() -> None:
    _drop_podcasts_from_publication()
    # Retype the status enum by swapping in a fresh type and casting existing
    # rows. The legacy transient value 'generating' maps onto 'rendering'.
    _upgrade_status_enum()
    op.execute("ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS source_content TEXT;")
    op.execute("ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS spec JSONB;")
@ -83,6 +201,8 @@ def upgrade() -> None:
 def downgrade() -> None:
    _drop_podcasts_from_publication()
    op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS error;")
    op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS duration_seconds;")
    op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS storage_key;")
@ -92,27 +212,4 @@ def downgrade() -> None:
    op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS source_content;")
    # Collapse the expanded lifecycle back onto the original four values.
-    op.execute("ALTER TYPE podcast_status RENAME TO podcast_status_new;")
+    _downgrade_status_enum()
    op.execute(
        "CREATE TYPE podcast_status AS ENUM "
        "('pending', 'generating', 'ready', 'failed');"
    )
    op.execute("ALTER TABLE podcasts ALTER COLUMN status DROP DEFAULT;")
    op.execute(
        """
        ALTER TABLE podcasts
            ALTER COLUMN status TYPE podcast_status
            USING (
                CASE status::text
                    WHEN 'awaiting_brief' THEN 'pending'
                    WHEN 'drafting' THEN 'generating'
                    WHEN 'awaiting_review' THEN 'generating'
                    WHEN 'rendering' THEN 'generating'
                    WHEN 'cancelled' THEN 'failed'
                    ELSE status::text
                END
            )::podcast_status;
        """
    )
    op.execute("ALTER TABLE podcasts ALTER COLUMN status SET DEFAULT 'ready';")
    op.execute("DROP TYPE podcast_status_new;")
--- a/surfsense_backend/alembic/versions/160_add_model_connections.py
+++ b/surfsense_backend/alembic/versions/160_add_model_connections.py
@ -0,0 +1,299 @@
 """add model connections
 Revision ID: 160
 Revises: 159
 """
 from collections.abc import Sequence
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
 from alembic import op
 revision: str = "160"
 down_revision: str | None = "159"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 # PostgreSQL enum types used by the tables in this migration.
 # create_type=False keeps SQLAlchemy from emitting CREATE TYPE as a side effect
 # of table creation; upgrade()/downgrade() call .create()/.drop() on these
 # objects explicitly with checkfirst=True instead.
 connection_scope = postgresql.ENUM(
    "GLOBAL",
    "SEARCH_SPACE",
    "USER",
    name="connectionscope",
    create_type=False,
 )
 # Enum for models.source; the column's server default is "DISCOVERED".
 model_source = postgresql.ENUM(
    "DISCOVERED",
    "MANUAL",
    name="modelsource",
    create_type=False,
 )
 def _table_exists(table_name: str) -> bool:
    """Return True when ``table_name`` is listed by the migration connection's inspector."""
    inspector = sa.inspect(op.get_bind())
    known_tables = inspector.get_table_names()
    return table_name in known_tables
 def _column_exists(table_name: str, column_name: str) -> bool:
    """Return True when ``table_name`` exists and has a column named ``column_name``.

    A missing table yields False rather than an inspection error.
    """
    if _table_exists(table_name):
        reflected = sa.inspect(op.get_bind()).get_columns(table_name)
        present = {entry["name"] for entry in reflected}
        return column_name in present
    return False
 def _index_exists(table_name: str, index_name: str) -> bool:
    """Return True when ``table_name`` exists and carries an index named ``index_name``.

    A missing table yields False rather than an inspection error.
    """
    if _table_exists(table_name):
        reflected = sa.inspect(op.get_bind()).get_indexes(table_name)
        present = {entry["name"] for entry in reflected}
        return index_name in present
    return False
 def _create_index_if_missing(
    index_name: str,
    table_name: str,
    columns: list[str],
 ) -> None:
    """Create a non-unique index on ``columns`` unless ``index_name`` already exists."""
    if _index_exists(table_name, index_name):
        return
    op.create_index(index_name, table_name, columns, unique=False)
 def _add_searchspace_column_if_missing(
    column_name: str,
    *,
    server_default: object | None = None,
 ) -> None:
    """Add a nullable integer column to ``searchspaces`` when it is not already there.

    ``server_default`` is passed straight through to the new column definition.
    """
    if _column_exists("searchspaces", column_name):
        return
    new_column = sa.Column(
        column_name,
        sa.Integer(),
        nullable=True,
        server_default=server_default,
    )
    op.add_column("searchspaces", new_column)
 def _drop_column_if_exists(table_name: str, column_name: str) -> None:
    """Drop ``column_name`` from ``table_name``; do nothing when it is absent."""
    if not _column_exists(table_name, column_name):
        return
    op.drop_column(table_name, column_name)
 def _drop_index_if_exists(table_name: str, index_name: str) -> None:
    """Drop ``index_name`` from ``table_name``; do nothing when it is absent."""
    if not _index_exists(table_name, index_name):
        return
    op.drop_index(index_name, table_name=table_name)
 def upgrade() -> None:
    """Create or reconcile the ``connections`` and ``models`` tables.

    Also adds the three ``*_model_id`` columns to ``searchspaces``. Each step
    inspects the current schema before acting, so tables, columns and indexes
    that already exist are adapted in place rather than recreated.
    """
    bind = op.get_bind()
    # The enums are declared with create_type=False, so create them explicitly;
    # checkfirst makes this a no-op when the type is already present.
    connection_scope.create(bind, checkfirst=True)
    model_source.create(bind, checkfirst=True)
    if _table_exists("connections"):
        # Pre-existing table: converge on a NOT NULL ``provider`` column,
        # renaming an older ``litellm_provider`` / ``native_provider`` column
        # when one is found and ``provider`` is not there yet.
        # NOTE(review): the SET NOT NULL step will fail if existing rows hold
        # NULL in the renamed column — confirm that cannot happen.
        if _column_exists("connections", "litellm_provider") and not _column_exists(
            "connections", "provider"
        ):
            op.alter_column(
                "connections",
                "litellm_provider",
                new_column_name="provider",
                existing_type=sa.String(length=100),
                existing_nullable=True,
            )
            op.alter_column(
                "connections",
                "provider",
                existing_type=sa.String(length=100),
                nullable=False,
            )
        elif _column_exists("connections", "native_provider") and not _column_exists(
            "connections", "provider"
        ):
            op.alter_column(
                "connections",
                "native_provider",
                new_column_name="provider",
                existing_type=sa.String(length=100),
                existing_nullable=True,
            )
            op.alter_column(
                "connections",
                "provider",
                existing_type=sa.String(length=100),
                nullable=False,
            )
        elif not _column_exists("connections", "provider"):
            # NOTE(review): a NOT NULL column without a server default cannot
            # be added to a table that already has rows — confirm this branch
            # is only reachable with an empty table.
            op.add_column(
                "connections",
                sa.Column("provider", sa.String(length=100), nullable=False),
            )
        # The ``protocol`` column and its index are removed from existing tables.
        _drop_index_if_exists("connections", "ix_connections_protocol")
        _drop_column_if_exists("connections", "protocol")
    else:
        op.create_table(
            "connections",
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
            sa.Column("provider", sa.String(length=100), nullable=False),
            sa.Column("base_url", sa.String(length=500), nullable=True),
            sa.Column("api_key", sa.String(), nullable=True),
            sa.Column(
                "extra",
                postgresql.JSONB(astext_type=sa.Text()),
                server_default=sa.text("'{}'::jsonb"),
                nullable=False,
            ),
            sa.Column("scope", connection_scope, nullable=False),
            sa.Column(
                "enabled", sa.Boolean(), server_default=sa.text("true"), nullable=False
            ),
            sa.Column("search_space_id", sa.Integer(), nullable=True),
            sa.Column("user_id", sa.UUID(), nullable=True),
            # Ties the owner columns to the scope: GLOBAL rows have neither
            # owner, SEARCH_SPACE rows need both, USER rows need a user_id.
            sa.CheckConstraint(
                "(scope = 'GLOBAL' AND search_space_id IS NULL AND user_id IS NULL) OR "
                "(scope = 'SEARCH_SPACE' AND search_space_id IS NOT NULL AND user_id IS NOT NULL) OR "
                "(scope = 'USER' AND user_id IS NOT NULL)",
                name="ck_connections_scope_owner",
            ),
            sa.ForeignKeyConstraint(
                ["search_space_id"], ["searchspaces.id"], ondelete="CASCADE"
            ),
            sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
            sa.PrimaryKeyConstraint("id"),
        )
    # Carry an index left under an older column name over to the new name
    # instead of rebuilding it; whichever rename runs first wins.
    if _index_exists(
        "connections", "ix_connections_native_provider"
    ) and not _index_exists("connections", "ix_connections_provider"):
        op.execute(
            "ALTER INDEX ix_connections_native_provider "
            "RENAME TO ix_connections_provider"
        )
    if _index_exists(
        "connections", "ix_connections_litellm_provider"
    ) and not _index_exists("connections", "ix_connections_provider"):
        op.execute(
            "ALTER INDEX ix_connections_litellm_provider "
            "RENAME TO ix_connections_provider"
        )
    _create_index_if_missing("ix_connections_provider", "connections", ["provider"])
    _create_index_if_missing("ix_connections_scope", "connections", ["scope"])
    if not _table_exists("models"):
        op.create_table(
            "models",
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
            sa.Column("connection_id", sa.Integer(), nullable=False),
            sa.Column("model_id", sa.String(length=255), nullable=False),
            sa.Column("display_name", sa.String(length=255), nullable=True),
            sa.Column(
                "source",
                model_source,
                server_default="DISCOVERED",
                nullable=False,
            ),
            sa.Column("supports_chat", sa.Boolean(), nullable=True),
            sa.Column("max_input_tokens", sa.Integer(), nullable=True),
            sa.Column("supports_image_input", sa.Boolean(), nullable=True),
            sa.Column("supports_tools", sa.Boolean(), nullable=True),
            sa.Column("supports_image_generation", sa.Boolean(), nullable=True),
            sa.Column(
                "capabilities_override",
                postgresql.JSONB(astext_type=sa.Text()),
                server_default=sa.text("'{}'::jsonb"),
                nullable=False,
            ),
            sa.Column(
                "enabled", sa.Boolean(), server_default=sa.text("true"), nullable=False
            ),
            sa.Column("billing_tier", sa.String(length=50), nullable=True),
            sa.Column(
                "catalog",
                postgresql.JSONB(astext_type=sa.Text()),
                server_default=sa.text("'{}'::jsonb"),
                nullable=False,
            ),
            sa.ForeignKeyConstraint(
                ["connection_id"], ["connections.id"], ondelete="CASCADE"
            ),
            sa.PrimaryKeyConstraint("id"),
            sa.UniqueConstraint(
                "connection_id", "model_id", name="uq_models_connection_model_id"
            ),
        )
    else:
        # Pre-existing table: add whichever flat capability columns are
        # missing, then drop the older ``capabilities*`` columns.
        if not _column_exists("models", "supports_chat"):
            op.add_column(
                "models", sa.Column("supports_chat", sa.Boolean(), nullable=True)
            )
        if not _column_exists("models", "max_input_tokens"):
            op.add_column(
                "models", sa.Column("max_input_tokens", sa.Integer(), nullable=True)
            )
        if not _column_exists("models", "supports_image_input"):
            op.add_column(
                "models", sa.Column("supports_image_input", sa.Boolean(), nullable=True)
            )
        if not _column_exists("models", "supports_tools"):
            op.add_column(
                "models", sa.Column("supports_tools", sa.Boolean(), nullable=True)
            )
        if not _column_exists("models", "supports_image_generation"):
            op.add_column(
                "models",
                sa.Column("supports_image_generation", sa.Boolean(), nullable=True),
            )
        _drop_column_if_exists("models", "capabilities")
        _drop_column_if_exists("models", "capabilities_declared")
        _drop_column_if_exists("models", "capabilities_verified")
    _create_index_if_missing("ix_models_connection_id", "models", ["connection_id"])
    _create_index_if_missing("ix_models_model_id", "models", ["model_id"])
    _create_index_if_missing("ix_models_billing_tier", "models", ["billing_tier"])
    # Nullable integer model-id columns on searchspaces, each defaulting to 0.
    _add_searchspace_column_if_missing("chat_model_id", server_default=sa.text("0"))
    _add_searchspace_column_if_missing(
        "image_gen_model_id", server_default=sa.text("0")
    )
    _add_searchspace_column_if_missing("vision_model_id", server_default=sa.text("0"))
    # Re-assert the server default so columns that existed before this
    # migration (and were therefore not added above) end up with it too.
    for column_name in ("chat_model_id", "image_gen_model_id", "vision_model_id"):
        op.alter_column(
            "searchspaces",
            column_name,
            existing_type=sa.Integer(),
            existing_nullable=True,
            server_default=sa.text("0"),
        )
    # Backfill rows that still hold NULL in any of the three columns.
    op.execute(
        """
        UPDATE searchspaces
        SET
            chat_model_id = COALESCE(chat_model_id, 0),
            image_gen_model_id = COALESCE(image_gen_model_id, 0),
            vision_model_id = COALESCE(vision_model_id, 0)
        """
    )
    # Presumably the enum that backed the dropped connections.protocol column
    # — verify nothing else references it.
    op.execute("DROP TYPE IF EXISTS connectionprotocol")
 def downgrade() -> None:
    """Remove the schema objects introduced by this revision.

    Drops the three ``searchspaces`` model-id columns, then the ``models`` and
    ``connections`` tables with their indexes, and finally the two enum types.
    Every drop is guarded by an existence check, mirroring the guarded
    ``upgrade()``, so the downgrade can be re-run after a partial failure
    instead of aborting on the first object that is already gone.
    """
    for column_name in ("vision_model_id", "image_gen_model_id", "chat_model_id"):
        _drop_column_if_exists("searchspaces", column_name)
    # ``models`` references ``connections`` via a foreign key, so it goes first.
    if _table_exists("models"):
        for index_name in (
            "ix_models_billing_tier",
            "ix_models_model_id",
            "ix_models_connection_id",
        ):
            _drop_index_if_exists("models", index_name)
        op.drop_table("models")
    if _table_exists("connections"):
        for index_name in ("ix_connections_scope", "ix_connections_provider"):
            _drop_index_if_exists("connections", index_name)
        op.drop_table("connections")
    # The enum types can only be dropped once no table column uses them.
    bind = op.get_bind()
    model_source.drop(bind, checkfirst=True)
    connection_scope.drop(bind, checkfirst=True)
--- a/surfsense_backend/alembic/versions/161_remove_legacy_model_configs.py
+++ b/surfsense_backend/alembic/versions/161_remove_legacy_model_configs.py
@ -0,0 +1,270 @@
 """remove legacy model config tables
 Revision ID: 161
 Revises: 160
 """
 from collections.abc import Sequence
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.types import TypeEngine
 from alembic import op
 revision: str = "161"
 down_revision: str | None = "160"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 # Legacy provider enums. upgrade() drops these types; they are declared here
 # so downgrade() can recreate them and use them as the ``provider`` column type
 # of the restored config tables. create_type=False means the types are only
 # created by the explicit .create(bind, checkfirst=True) calls in downgrade().
 # Enum for new_llm_configs.provider.
 litellm_provider = postgresql.ENUM(
    "OPENAI",
    "ANTHROPIC",
    "GOOGLE",
    "AZURE_OPENAI",
    "BEDROCK",
    "VERTEX_AI",
    "GROQ",
    "COHERE",
    "MISTRAL",
    "DEEPSEEK",
    "XAI",
    "OPENROUTER",
    "TOGETHER_AI",
    "FIREWORKS_AI",
    "REPLICATE",
    "PERPLEXITY",
    "OLLAMA",
    "ALIBABA_QWEN",
    "MOONSHOT",
    "ZHIPU",
    "ANYSCALE",
    "DEEPINFRA",
    "CEREBRAS",
    "SAMBANOVA",
    "AI21",
    "CLOUDFLARE",
    "DATABRICKS",
    "COMETAPI",
    "HUGGINGFACE",
    "GITHUB_MODELS",
    "MINIMAX",
    "CUSTOM",
    name="litellmprovider",
    create_type=False,
 )
 # Enum for image_generation_configs.provider.
 image_gen_provider = postgresql.ENUM(
    "OPENAI",
    "AZURE_OPENAI",
    "GOOGLE",
    "VERTEX_AI",
    "BEDROCK",
    "RECRAFT",
    "OPENROUTER",
    "XINFERENCE",
    "NSCALE",
    name="imagegenprovider",
    create_type=False,
 )
 # Enum for vision_llm_configs.provider.
 vision_provider = postgresql.ENUM(
    "OPENAI",
    "ANTHROPIC",
    "GOOGLE",
    "AZURE_OPENAI",
    "VERTEX_AI",
    "BEDROCK",
    "XAI",
    "OPENROUTER",
    "OLLAMA",
    "GROQ",
    "TOGETHER_AI",
    "FIREWORKS_AI",
    "DEEPSEEK",
    "MISTRAL",
    "CUSTOM",
    name="visionprovider",
    create_type=False,
 )
 def _table_exists(table_name: str) -> bool:
    """Tell whether the inspector for the current bind lists ``table_name``."""
    table_names = sa.inspect(op.get_bind()).get_table_names()
    found = table_name in table_names
    return found
 def _column_exists(table_name: str, column_name: str) -> bool:
    """Tell whether ``table_name`` exists and includes ``column_name``.

    Returns False without inspecting columns when the table itself is missing.
    """
    if _table_exists(table_name):
        column_infos = sa.inspect(op.get_bind()).get_columns(table_name)
        names = {info["name"] for info in column_infos}
        return column_name in names
    return False
 def _drop_column_if_exists(table_name: str, column_name: str) -> None:
    """Remove ``column_name`` from ``table_name`` when present; otherwise do nothing."""
    if not _column_exists(table_name, column_name):
        return
    op.drop_column(table_name, column_name)
 def _rename_column_if_exists(
    table_name: str,
    old_column_name: str,
    new_column_name: str,
    *,
    existing_type: TypeEngine,
    existing_nullable: bool = True,
 ) -> None:
    """Rename a column only when the old name exists and the new name is free.

    ``existing_type`` and ``existing_nullable`` are forwarded to
    ``op.alter_column`` unchanged.
    """
    # Nothing to rename.
    if not _column_exists(table_name, old_column_name):
        return
    # Target name is already taken; leave both columns untouched.
    if _column_exists(table_name, new_column_name):
        return
    op.alter_column(
        table_name,
        old_column_name,
        new_column_name=new_column_name,
        existing_type=existing_type,
        existing_nullable=existing_nullable,
    )
 def upgrade() -> None:
    """Drop the legacy model-config tables, columns and enum types.

    Also renames ``image_generations.image_generation_config_id`` to
    ``image_gen_model_id``. Every step is skipped when its target is absent.
    """
    legacy_tables = (
        "new_llm_configs",
        "vision_llm_configs",
        "image_generation_configs",
    )
    for legacy_table in legacy_tables:
        if not _table_exists(legacy_table):
            continue
        op.drop_table(legacy_table)
    legacy_columns = (
        "agent_llm_id",
        "image_generation_config_id",
        "vision_llm_config_id",
    )
    for legacy_column in legacy_columns:
        _drop_column_if_exists("searchspaces", legacy_column)
    _rename_column_if_exists(
        "image_generations",
        "image_generation_config_id",
        "image_gen_model_id",
        existing_type=sa.Integer(),
    )
    # The enum types go last, once the tables that used them are gone.
    drop_type_statements = (
        "DROP TYPE IF EXISTS litellmprovider",
        "DROP TYPE IF EXISTS imagegenprovider",
        "DROP TYPE IF EXISTS visionprovider",
    )
    for statement in drop_type_statements:
        op.execute(statement)
 def downgrade() -> None:
    """Recreate the legacy enums, columns and config tables dropped by ``upgrade()``.

    Only schema is restored: this function creates the tables and columns but
    inserts no rows. Each step is guarded, so objects that already exist are
    left untouched.
    """
    bind = op.get_bind()
    # The enum types must exist before the tables that use them are created.
    litellm_provider.create(bind, checkfirst=True)
    image_gen_provider.create(bind, checkfirst=True)
    vision_provider.create(bind, checkfirst=True)
    # Undo the column rename performed by upgrade().
    _rename_column_if_exists(
        "image_generations",
        "image_gen_model_id",
        "image_generation_config_id",
        existing_type=sa.Integer(),
    )
    # Restore the three searchspaces columns as plain nullable integers.
    if _table_exists("searchspaces"):
        if not _column_exists("searchspaces", "agent_llm_id"):
            op.add_column(
                "searchspaces",
                sa.Column("agent_llm_id", sa.Integer(), nullable=True),
            )
        if not _column_exists("searchspaces", "image_generation_config_id"):
            op.add_column(
                "searchspaces",
                sa.Column("image_generation_config_id", sa.Integer(), nullable=True),
            )
        if not _column_exists("searchspaces", "vision_llm_config_id"):
            op.add_column(
                "searchspaces",
                sa.Column("vision_llm_config_id", sa.Integer(), nullable=True),
            )
    # Each config table is recreated together with its name index.
    if not _table_exists("image_generation_configs"):
        op.create_table(
            "image_generation_configs",
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
            sa.Column("name", sa.String(length=100), nullable=False),
            sa.Column("description", sa.String(length=500), nullable=True),
            sa.Column("provider", image_gen_provider, nullable=False),
            sa.Column("custom_provider", sa.String(length=100), nullable=True),
            sa.Column("model_name", sa.String(length=100), nullable=False),
            sa.Column("api_key", sa.String(), nullable=False),
            sa.Column("api_base", sa.String(length=500), nullable=True),
            sa.Column("api_version", sa.String(length=50), nullable=True),
            sa.Column("litellm_params", sa.JSON(), nullable=True),
            sa.Column("search_space_id", sa.Integer(), nullable=False),
            sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
            sa.ForeignKeyConstraint(
                ["search_space_id"], ["searchspaces.id"], ondelete="CASCADE"
            ),
            sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
            sa.PrimaryKeyConstraint("id"),
        )
        op.create_index(
            op.f("ix_image_generation_configs_name"),
            "image_generation_configs",
            ["name"],
            unique=False,
        )
    if not _table_exists("vision_llm_configs"):
        op.create_table(
            "vision_llm_configs",
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
            sa.Column("name", sa.String(length=100), nullable=False),
            sa.Column("description", sa.String(length=500), nullable=True),
            sa.Column("provider", vision_provider, nullable=False),
            sa.Column("custom_provider", sa.String(length=100), nullable=True),
            sa.Column("model_name", sa.String(length=100), nullable=False),
            sa.Column("api_key", sa.String(), nullable=False),
            sa.Column("api_base", sa.String(length=500), nullable=True),
            sa.Column("api_version", sa.String(length=50), nullable=True),
            sa.Column("litellm_params", sa.JSON(), nullable=True),
            sa.Column("search_space_id", sa.Integer(), nullable=False),
            sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
            sa.ForeignKeyConstraint(
                ["search_space_id"], ["searchspaces.id"], ondelete="CASCADE"
            ),
            sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
            sa.PrimaryKeyConstraint("id"),
        )
        op.create_index(
            op.f("ix_vision_llm_configs_name"),
            "vision_llm_configs",
            ["name"],
            unique=False,
        )
    # Unlike the two tables above, this one has no api_version column and adds
    # the system-instruction / citation settings.
    if not _table_exists("new_llm_configs"):
        op.create_table(
            "new_llm_configs",
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
            sa.Column("name", sa.String(length=100), nullable=False),
            sa.Column("description", sa.String(length=500), nullable=True),
            sa.Column("provider", litellm_provider, nullable=False),
            sa.Column("custom_provider", sa.String(length=100), nullable=True),
            sa.Column("model_name", sa.String(length=100), nullable=False),
            sa.Column("api_key", sa.String(), nullable=False),
            sa.Column("api_base", sa.String(length=500), nullable=True),
            sa.Column("litellm_params", sa.JSON(), nullable=True),
            sa.Column("system_instructions", sa.Text(), nullable=False),
            sa.Column("use_default_system_instructions", sa.Boolean(), nullable=False),
            sa.Column("citations_enabled", sa.Boolean(), nullable=False),
            sa.Column("search_space_id", sa.Integer(), nullable=False),
            sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
            sa.ForeignKeyConstraint(
                ["search_space_id"], ["searchspaces.id"], ondelete="CASCADE"
            ),
            sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
            sa.PrimaryKeyConstraint("id"),
        )
        op.create_index(
            op.f("ix_new_llm_configs_name"),
            "new_llm_configs",
            ["name"],
            unique=False,
        )
--- a/surfsense_backend/alembic/versions/162_add_etl_cache_parses.py
+++ b/surfsense_backend/alembic/versions/162_add_etl_cache_parses.py
@ -0,0 +1,53 @@
 """add etl_cache_parses table for content-addressed parse reuse
 Revision ID: 162
 Revises: 161
 """
 from collections.abc import Sequence
 from alembic import op
 revision: str = "162"
 down_revision: str | None = "161"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 def upgrade() -> None:
    """Create ``etl_cache_parses`` and its two timestamp indexes.

    All three statements use IF NOT EXISTS, so objects that already exist are
    left as they are.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS etl_cache_parses (
            id SERIAL PRIMARY KEY,
            source_sha256 VARCHAR(64) NOT NULL,
            etl_service VARCHAR(32) NOT NULL,
            mode VARCHAR(16) NOT NULL,
            parser_version INTEGER NOT NULL,
            storage_backend VARCHAR(32) NOT NULL,
            storage_key TEXT NOT NULL,
            size_bytes BIGINT NOT NULL,
            content_type VARCHAR(32) NOT NULL,
            actual_pages INTEGER NOT NULL DEFAULT 0,
            times_reused BIGINT NOT NULL DEFAULT 0,
            last_used_at TIMESTAMP WITH TIME ZONE NOT NULL,
            created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
            CONSTRAINT uq_etl_cache_parses_key
                UNIQUE (source_sha256, etl_service, mode, parser_version)
        );
        """
    last_used_index_sql = (
        "CREATE INDEX IF NOT EXISTS ix_etl_cache_parses_last_used_at "
        "ON etl_cache_parses(last_used_at);"
    )
    created_at_index_sql = (
        "CREATE INDEX IF NOT EXISTS ix_etl_cache_parses_created_at "
        "ON etl_cache_parses(created_at);"
    )
    # Table first, then the indexes that depend on it.
    for statement in (create_table_sql, last_used_index_sql, created_at_index_sql):
        op.execute(statement)
 def downgrade() -> None:
    """Drop the two indexes and then ``etl_cache_parses``, ignoring objects already gone."""
    teardown_statements = (
        "DROP INDEX IF EXISTS ix_etl_cache_parses_created_at;",
        "DROP INDEX IF EXISTS ix_etl_cache_parses_last_used_at;",
        "DROP TABLE IF EXISTS etl_cache_parses;",
    )
    for statement in teardown_statements:
        op.execute(statement)
--- a/surfsense_backend/alembic/versions/163_add_embedding_cache_sets.py
+++ b/surfsense_backend/alembic/versions/163_add_embedding_cache_sets.py
@ -0,0 +1,53 @@
 """add embedding_cache_sets table for content-addressed embedding reuse
 Revision ID: 163
 Revises: 162
 """
 from collections.abc import Sequence
 from alembic import op
 revision: str = "163"
 down_revision: str | None = "162"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 def upgrade() -> None:
    """Create ``embedding_cache_sets`` and its two timestamp indexes.

    All three statements use IF NOT EXISTS, so objects that already exist are
    left as they are.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS embedding_cache_sets (
            id SERIAL PRIMARY KEY,
            markdown_sha256 VARCHAR(64) NOT NULL,
            embedding_model VARCHAR(255) NOT NULL,
            embedding_dim INTEGER NOT NULL,
            chunker_kind VARCHAR(8) NOT NULL,
            chunker_version INTEGER NOT NULL,
            storage_backend VARCHAR(32) NOT NULL,
            storage_key TEXT NOT NULL,
            size_bytes BIGINT NOT NULL,
            chunk_count INTEGER NOT NULL DEFAULT 0,
            times_reused BIGINT NOT NULL DEFAULT 0,
            last_used_at TIMESTAMP WITH TIME ZONE NOT NULL,
            created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
            CONSTRAINT uq_embedding_cache_sets_key
                UNIQUE (markdown_sha256, embedding_model, chunker_kind, chunker_version)
        );
        """
    last_used_index_sql = (
        "CREATE INDEX IF NOT EXISTS ix_embedding_cache_sets_last_used_at "
        "ON embedding_cache_sets(last_used_at);"
    )
    created_at_index_sql = (
        "CREATE INDEX IF NOT EXISTS ix_embedding_cache_sets_created_at "
        "ON embedding_cache_sets(created_at);"
    )
    # Table first, then the indexes that depend on it.
    for statement in (create_table_sql, last_used_index_sql, created_at_index_sql):
        op.execute(statement)
 def downgrade() -> None:
    """Drop the two indexes and then ``embedding_cache_sets``, ignoring objects already gone."""
    teardown_statements = (
        "DROP INDEX IF EXISTS ix_embedding_cache_sets_created_at;",
        "DROP INDEX IF EXISTS ix_embedding_cache_sets_last_used_at;",
        "DROP TABLE IF EXISTS embedding_cache_sets;",
    )
    for statement in teardown_statements:
        op.execute(statement)
--- a/surfsense_backend/alembic/versions/164_remove_inactive_users.py
+++ b/surfsense_backend/alembic/versions/164_remove_inactive_users.py
@ -0,0 +1,219 @@
 """remove users that never logged back in (last_login IS NULL)
 Migration 103 added ``user.last_login``. Any user whose ``last_login`` is still
 NULL has never authenticated since that column existed, i.e. they never logged
 back in. This migration purges those users together with everything that hangs
 off them: the search spaces they own, and (via ON DELETE CASCADE)
 ``searchspaces -> documents -> chunks`` plus all other user/space-scoped rows.
 This runs BEFORE the chunks.position backfill (revision 165) on purpose: it
 removes a large amount of dead chunk data first, so the expensive backfill has
 far fewer rows to rewrite.
 Work is done in committed batches (not one giant cascading DELETE) so that on a
 large table it streams progress to the alembic console, keeps each transaction
 small, bounds WAL/bloat growth, and is resumable if interrupted.
 Revision ID: 164
 Revises: 163
 """
 import logging
 import time
 from collections.abc import Sequence
 import sqlalchemy as sa
 from alembic import op
 # Alembic revision identifiers: this migration is "164" and applies on top of "163".
 revision: str = "164"
 down_revision: str | None = "163"
 # No branch labels and no cross-revision dependencies are declared.
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 # Documents removed per committed batch. Each document delete cascades to its
 # chunks, so keep this modest to bound batch size.
 # NOTE(review): the cascade lookup presumably relies on ix_chunks_document_id;
 # that index is not defined in this file -- confirm it exists.
 DOC_BATCH = 1_000
 # Users removed per committed batch. Each cascades to owned search spaces and
 # the remaining space-/user-scoped rows.
 USER_BATCH = 500
 # Minimum seconds between progress log lines (keeps the console readable).
 LOG_EVERY_SECONDS = 5.0
 # Names of the UNLOGGED scratch tables that queue the ids still to delete.
 # upgrade() drops and rebuilds each one before using it, and drops it again
 # once its phase has finished.
 USER_SCRATCH = "_inactive_user_ids"
 DOC_SCRATCH = "_inactive_doc_ids"
 # Progress is logged under Alembic's own migration logger name.
 logger = logging.getLogger("alembic.runtime.migration")
 def _fmt_duration(seconds: float) -> str:
    seconds = int(seconds)
    h, rem = divmod(seconds, 3600)
    m, s = divmod(rem, 60)
    if h:
        return f"{h}h{m:02d}m{s:02d}s"
    if m:
        return f"{m}m{s:02d}s"
    return f"{s}s"
 def upgrade() -> None:
    """Delete every user whose ``last_login`` is NULL, plus the data they own.

    Work happens inside an autocommit block in two phases, both driven by
    ``_batched_delete``: first the documents living in search spaces owned by
    those users, then the users themselves. The scratch tables that queue the
    ids are dropped and rebuilt on every run.
    """
    conn = op.get_bind()

    def _drop_scratch(table: str) -> None:
        # Remove a scratch table if it is present.
        op.execute(f"DROP TABLE IF EXISTS {table};")

    def _key_and_count(table: str) -> int:
        # Add the primary key on ``id``, then report how many ids are queued.
        op.execute(f"ALTER TABLE {table} ADD PRIMARY KEY (id);")
        queued = conn.execute(sa.text(f"SELECT count(*) FROM {table}")).scalar()
        return queued or 0

    # Step outside the migration's single transaction so that every batch
    # commits independently.
    with op.get_context().autocommit_block():
        # The target user ids are materialized once and rebuilt from scratch on
        # each run: a re-run after an interruption just picks up whoever still
        # has NULL last_login, which keeps the migration idempotent/resumable.
        _drop_scratch(USER_SCRATCH)
        op.execute(
            f"CREATE UNLOGGED TABLE {USER_SCRATCH} AS "
            'SELECT id FROM "user" WHERE last_login IS NULL;'
        )
        total_users = _key_and_count(USER_SCRATCH)
        if not total_users:
            logger.info("no users with NULL last_login; nothing to remove")
            _drop_scratch(USER_SCRATCH)
            return
        logger.info(
            "found %s users with NULL last_login (never logged back in); "
            "removing them and all data in search spaces they own",
            f"{total_users:,}",
        )
        # Queue the documents that live in search spaces owned by those users.
        # Deleting them explicitly, batch by batch, is what bounds the
        # otherwise-unbounded chunks cascade.
        _drop_scratch(DOC_SCRATCH)
        op.execute(
            f"""
            CREATE UNLOGGED TABLE {DOC_SCRATCH} AS
            SELECT d.id
            FROM documents d
            JOIN searchspaces s ON s.id = d.search_space_id
            WHERE s.user_id IN (SELECT id FROM {USER_SCRATCH});
            """
        )
        total_docs = _key_and_count(DOC_SCRATCH)
        # Phase 1 removes documents (cascading chunks, document_versions and
        # document_files); phase 2 removes the users, cascading the now-empty
        # search spaces and all remaining user-/space-scoped rows. Each phase
        # drops its scratch table once it is done.
        phases = (
            (
                "phase 1/2: deleting %s documents (cascades their chunks) "
                "in batches of %s...",
                DOC_SCRATCH,
                "documents",
                DOC_BATCH,
                total_docs,
                "documents",
            ),
            (
                "phase 2/2: deleting %s users (cascades search spaces and "
                "remaining data) in batches of %s...",
                USER_SCRATCH,
                '"user"',
                USER_BATCH,
                total_users,
                "users",
            ),
        )
        for announcement, scratch, target, batch, total, label in phases:
            logger.info(announcement, f"{total:,}", f"{batch:,}")
            _batched_delete(
                conn,
                scratch=scratch,
                target_table=target,
                target_col="id",
                batch_size=batch,
                total=total,
                label=label,
            )
            _drop_scratch(scratch)
        logger.info("migration 164 finished")
 def _batched_delete(
    bind: sa.engine.Connection,
    *,
    scratch: str,
    target_table: str,
    target_col: str,
    batch_size: int,
    total: int,
    label: str,
 ) -> None:
    """Drain ``scratch`` in batches, deleting the matching target rows.

    Each iteration runs one statement that picks up to ``batch_size`` ids from
    ``scratch``, deletes the rows of ``target_table`` whose ``target_col`` is
    among them, and removes those ids from ``scratch``. Because the row delete
    and the scratch pop share a statement, an interrupted run leaves the
    scratch table in sync with what has actually been deleted. Progress is
    logged at most every ``LOG_EVERY_SECONDS`` and once more when ``total`` is
    reached; the loop ends when a batch pops nothing.
    """
    started = time.monotonic()
    last_log = 0.0
    done = 0
    pop_and_delete_sql = f"""
        WITH batch AS (
            SELECT id FROM {scratch} LIMIT :n
        ), deleted AS (
            DELETE FROM {target_table}
            WHERE {target_col} IN (SELECT id FROM batch)
        ), popped AS (
            DELETE FROM {scratch}
            WHERE id IN (SELECT id FROM batch)
            RETURNING id
        )
        SELECT count(*) FROM popped
        """
    pop_and_delete = sa.text(pop_and_delete_sql)
    params = {"n": batch_size}
    # A batch that pops zero ids means the scratch table is empty.
    while batch_rows := (bind.execute(pop_and_delete, params).scalar() or 0):
        done += batch_rows
        now = time.monotonic()
        finished = done >= total
        if not finished and now - last_log < LOG_EVERY_SECONDS:
            continue  # too soon for another progress line
        elapsed = now - started
        pct = (100.0 * done / total) if total else 100.0
        # Linear extrapolation from the share completed so far.
        eta = (elapsed / pct * (100.0 - pct)) if pct > 0 else 0.0
        logger.info(
            "%s deleted: %.1f%% (%s/%s) elapsed %s eta %s",
            label,
            pct,
            f"{done:,}",
            f"{total:,}",
            _fmt_duration(elapsed),
            _fmt_duration(eta),
        )
        last_log = now
    total_elapsed = time.monotonic() - started
    logger.info(
        "deleted %s %s in %s",
        f"{done:,}",
        label,
        _fmt_duration(total_elapsed),
    )
 def downgrade() -> None:
    """Log a warning and do nothing else.

    The upgrade deletes users and their cascaded data, which cannot be
    restored. Staying a no-op lets a downgrade chain still pass through this
    revision.
    """
    notice = (
        "migration 164 (remove_inactive_users) is irreversible; "
        "downgrade is a no-op (deleted users/data are not restored)"
    )
    logger.warning(notice)
--- a/surfsense_backend/alembic/versions/165_add_chunk_position.py
+++ b/surfsense_backend/alembic/versions/165_add_chunk_position.py
@ -0,0 +1,183 @@
 """add chunks.position for explicit document order
 Incremental re-indexing keeps unchanged chunk rows, so auto-increment ids no
 longer reflect document order. Backfill preserves the historical id ordering.
 The backfill is done in committed batches (not one giant UPDATE) so that on a
 large table it: streams progress to the alembic console, keeps each transaction
 small, bounds WAL/bloat growth, and is resumable if interrupted.
 Revision ID: 165
 Revises: 164
 """
 import logging
 import time
 from collections.abc import Sequence
 import sqlalchemy as sa
 from alembic import op
 # Alembic revision identifiers: this migration is "165" and applies on top of "164".
 revision: str = "165"
 down_revision: str | None = "164"
 # No branch labels and no cross-revision dependencies are declared.
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 # Number of chunk ids processed per committed batch. This is the width of each
 # id range [lo, lo + BATCH_SIZE) walked by upgrade(), not a row count: gaps in
 # the id sequence mean a batch can touch fewer rows.
 BATCH_SIZE = 100_000
 # Minimum seconds between progress log lines (keeps the console readable).
 LOG_EVERY_SECONDS = 5.0
 # UNLOGGED scratch table holding the precomputed (id, rn) position mapping;
 # upgrade() rebuilds it before the backfill and drops it afterwards.
 SCRATCH_TABLE = "_chunk_position_backfill"
 # Progress is logged under Alembic's own migration logger name.
 logger = logging.getLogger("alembic.runtime.migration")
 def _fmt_duration(seconds: float) -> str:
    seconds = int(seconds)
    h, rem = divmod(seconds, 3600)
    m, s = divmod(rem, 60)
    if h:
        return f"{h}h{m:02d}m{s:02d}s"
    if m:
        return f"{m}m{s:02d}s"
    return f"{s}s"
 def _index_exists(bind: sa.engine.Connection, name: str) -> bool:
    """Return True when an index called ``name`` is visible on the search path.

    ``pg_class.relname`` is not unique across schemas, so the lookup is limited
    to relations visible through the current ``search_path``. Without that, a
    same-named index in an unrelated schema would be reported as present.
    Both plain (``'i'``) and partitioned (``'I'``) indexes count.

    Args:
        bind: Connection used to query the PostgreSQL catalog.
        name: Unqualified index name to look for.
    """
    return bool(
        bind.execute(
            sa.text(
                "SELECT EXISTS (SELECT 1 FROM pg_class "
                "WHERE relkind IN ('i', 'I') AND relname = :n "
                "AND pg_table_is_visible(oid))"
            ),
            {"n": name},
        ).scalar()
    )
 def upgrade() -> None:
    """Add ``chunks.position``, backfill it per document, then index it.

    The column is added first. If both position indexes already exist the
    migration returns early. Otherwise, inside an autocommit block, it builds
    a scratch mapping of chunk id to zero-based rank within its document
    (ordered by id), copies that rank into ``chunks.position`` one id range at
    a time while logging progress, drops the scratch table, and finally
    creates the two indexes.
    """
    conn = op.get_bind()
    # A NOT NULL column with a constant default is a metadata-only change on
    # PostgreSQL 11+, so this stays fast even on very large tables.
    op.execute(
        "ALTER TABLE chunks ADD COLUMN IF NOT EXISTS position INTEGER NOT NULL DEFAULT 0;"
    )
    # Fast path for re-runs: the indexes are only created once the backfill
    # has fully completed, so finding both is a reliable "already applied"
    # marker and the rest can be skipped cheaply.
    completion_markers = ("ix_chunks_position", "ix_chunks_document_id_position")
    if all(_index_exists(conn, index_name) for index_name in completion_markers):
        logger.info("migration 165 already applied; skipping backfill")
        return
    # Leave the migration's single transaction so every batch commits on its own.
    with op.get_context().autocommit_block():
        # reltuples is only a planner estimate (and -1 on never-analyzed
        # tables). It feeds nothing but the log line below, so anything
        # <= 0 is reported as "unknown".
        estimate = conn.execute(
            sa.text("SELECT reltuples::bigint FROM pg_class WHERE relname = 'chunks'")
        ).scalar()
        total_rows = estimate or 0
        if total_rows > 0:
            total_rows_display = f"~{total_rows:,}"
        else:
            total_rows_display = "an unknown number of"
        min_id, max_id = conn.execute(
            sa.text("SELECT min(id), max(id) FROM chunks")
        ).one()
        if min_id is None:
            logger.info("chunks table is empty; nothing to backfill")
        else:
            # The per-document ordering is computed once into an UNLOGGED
            # scratch table (low WAL). ROW_NUMBER has to see each document in
            # full, so it cannot be evaluated per id-range slice.
            logger.info(
                "building position mapping for %s chunks (this is a single "
                "scan; the batched UPDATE below reports progress)...",
                total_rows_display,
            )
            op.execute(f"DROP TABLE IF EXISTS {SCRATCH_TABLE};")
            op.execute(
                f"""
                CREATE UNLOGGED TABLE {SCRATCH_TABLE} AS
                SELECT id,
                       (ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY id) - 1)::int AS rn
                FROM chunks;
                """
            )
            op.execute(f"ALTER TABLE {SCRATCH_TABLE} ADD PRIMARY KEY (id);")
            end_id = max_id + 1  # exclusive end of the whole id range
            id_span = max(end_id - min_id, 1)
            t0 = time.monotonic()
            last_report = 0.0
            rewritten = 0
            for lo in range(min_id, end_id, BATCH_SIZE):
                hi = lo + BATCH_SIZE  # exclusive upper bound of this batch
                outcome = conn.execute(
                    sa.text(
                        f"""
                        UPDATE chunks c
                        SET position = m.rn
                        FROM {SCRATCH_TABLE} m
                        WHERE c.id = m.id
                          AND c.id >= :lo
                          AND c.id < :hi
                          AND c.position IS DISTINCT FROM m.rn
                        """
                    ),
                    {"lo": lo, "hi": hi},
                )
                rewritten += outcome.rowcount or 0
                now = time.monotonic()
                reached = min(hi, end_id)
                pct = min(100.0, 100.0 * (reached - min_id) / id_span)
                is_last_batch = hi > max_id
                if not is_last_batch and now - last_report < LOG_EVERY_SECONDS:
                    continue  # too soon for another progress line
                elapsed = now - t0
                # Linear extrapolation over the id range covered so far.
                eta = (elapsed / pct * (100.0 - pct)) if pct > 0 else 0.0
                logger.info(
                    "backfill position: %.1f%% (id<%s, %s rows rewritten) "
                    "elapsed %s eta %s",
                    pct,
                    f"{reached:,}",
                    f"{rewritten:,}",
                    _fmt_duration(elapsed),
                    _fmt_duration(eta),
                )
                last_report = now
            logger.info(
                "backfill complete: %s rows rewritten in %s",
                f"{rewritten:,}",
                _fmt_duration(time.monotonic() - t0),
            )
            op.execute(f"DROP TABLE IF EXISTS {SCRATCH_TABLE};")
        # Indexes come last so their presence marks a finished backfill.
        index_steps = (
            (
                "creating index ix_chunks_position...",
                "CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);",
            ),
            (
                "creating index ix_chunks_document_id_position...",
                "CREATE INDEX IF NOT EXISTS ix_chunks_document_id_position "
                "ON chunks(document_id, position);",
            ),
        )
        for notice, ddl in index_steps:
            logger.info(notice)
            op.execute(ddl)
        logger.info("migration 165 finished")
 def downgrade() -> None:
    """Drop the scratch table if one was left behind, both indexes, then the column."""
    rollback_statements = (
        f"DROP TABLE IF EXISTS {SCRATCH_TABLE};",
        "DROP INDEX IF EXISTS ix_chunks_document_id_position;",
        "DROP INDEX IF EXISTS ix_chunks_position;",
        "ALTER TABLE chunks DROP COLUMN IF EXISTS position;",
    )
    for statement in rollback_statements:
        op.execute(statement)
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py
@ -241,8 +241,15 @@ async def _create_document(
        chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
        session.add_all(
            [
-                Chunk(document_id=doc.id, content=text, embedding=embedding)
+                Chunk(
-                for text, embedding in zip(chunks, chunk_embeddings, strict=True)
+                    document_id=doc.id,
                    content=text,
                    embedding=embedding,
                    position=i,
                )
                for i, (text, embedding) in enumerate(
                    zip(chunks, chunk_embeddings, strict=True)
                )
            ]
        )
    return doc
@ -289,8 +296,15 @@ async def _update_document(
        chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
        session.add_all(
            [
-                Chunk(document_id=document.id, content=text, embedding=embedding)
+                Chunk(
-                for text, embedding in zip(chunks, chunk_embeddings, strict=True)
+                    document_id=document.id,
                    content=text,
                    embedding=embedding,
                    position=i,
                )
                for i, (text, embedding) in enumerate(
                    zip(chunks, chunk_embeddings, strict=True)
                )
            ]
        )
    return document
@ -475,7 +489,9 @@ async def _load_chunks_for_snapshot(
    session: AsyncSession, *, doc_id: int
 ) -> list[dict[str, str]]:
    rows = await session.execute(
-        select(Chunk.content).where(Chunk.document_id == doc_id).order_by(Chunk.id)
+        select(Chunk.content)
        .where(Chunk.document_id == doc_id)
        .order_by(Chunk.position, Chunk.id)
    )
    return [{"content": row.content} for row in rows.all() if row.content is not None]
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/runtime/agent_cache.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/runtime/agent_cache.py
@ -57,7 +57,7 @@ async def build_agent_with_cache(
    mcp_tools_by_agent: dict[str, list[BaseTool]],
    disabled_tools: list[str] | None,
    config_id: str | None,
-    image_generation_config_id_override: int | None = None,
+    image_gen_model_id_override: int | None = None,
 ) -> Any:
    """Compile the multi-agent graph, serving from cache when key components are stable."""
@ -121,7 +121,7 @@ async def build_agent_with_cache(
        # Bound into the generate_image subagent tool at construction time, so it
        # must key the compiled-agent cache to avoid leaking one automation's
        # image model into another with the same config_id/search_space.
-        image_generation_config_id_override,
+        image_gen_model_id_override,
    )
    return await get_cache().get_or_build(cache_key, builder=_build)
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/runtime/factory.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/runtime/factory.py
@ -72,11 +72,11 @@ async def create_multi_agent_chat_deep_agent(
    mentioned_document_ids: list[int] | None = None,
    anon_session_id: str | None = None,
    filesystem_selection: FilesystemSelection | None = None,
-    image_generation_config_id: int | None = None,
+    image_gen_model_id: int | None = None,
 ):
    """Deep agent with SurfSense tools/middleware; registry route subagents behind ``task`` when enabled.
-    ``image_generation_config_id`` overrides the search space's image model for
+    ``image_gen_model_id`` overrides the search space's image model for
    this invocation (used by automations to run on their captured model). When
    ``None``, the ``generate_image`` tool resolves the live search-space pref.
    """
@ -147,7 +147,7 @@ async def create_multi_agent_chat_deep_agent(
        "llm": llm,
        # Per-invocation image model override (automations run on their captured
        # model). Reaches the generate_image subagent tool via subagent_dependencies.
-        "image_generation_config_id_override": image_generation_config_id,
+        "image_gen_model_id_override": image_gen_model_id,
    }
    _t0 = time.perf_counter()
@ -303,7 +303,7 @@ async def create_multi_agent_chat_deep_agent(
        mcp_tools_by_agent=mcp_tools_by_agent,
        disabled_tools=disabled_tools,
        config_id=config_id,
-        image_generation_config_id_override=image_generation_config_id,
+        image_gen_model_id_override=image_gen_model_id,
    )
    _perf_log.info(
        "[create_agent] Middleware stack + graph compiled in %.3fs",
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
@ -508,7 +508,7 @@ class KBPostgresBackend(BackendProtocol):
            chunk_rows = await session.execute(
                select(Chunk.id, Chunk.content)
                .where(Chunk.document_id == document.id)
-                .order_by(Chunk.id)
+                .order_by(Chunk.position, Chunk.id)
            )
            chunks = [
                {"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
@ -725,7 +725,7 @@ class KBPostgresBackend(BackendProtocol):
                        .join(Document, Document.id == Chunk.document_id)
                        .where(Document.search_space_id == self.search_space_id)
                        .where(Chunk.content.ilike(f"%{pattern}%"))
-                        .order_by(Chunk.document_id, Chunk.id)
+                        .order_by(Chunk.document_id, Chunk.position, Chunk.id)
                    )
                    chunk_rows = await session.execute(sub)
                    per_doc: dict[int, int] = {}
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py
@ -394,7 +394,10 @@ async def browse_recent_documents(
                Chunk.document_id,
                Chunk.content,
                func.row_number()
-                .over(partition_by=Chunk.document_id, order_by=Chunk.id)
+                .over(
                    partition_by=Chunk.document_id,
                    order_by=(Chunk.position, Chunk.id),
                )
                .label("rn"),
            )
            .where(Chunk.document_id.in_(doc_ids))
@ -404,7 +407,7 @@ async def browse_recent_documents(
        chunk_query = (
            select(numbered.c.chunk_id, numbered.c.document_id, numbered.c.content)
            .where(numbered.c.rn <= _RECENCY_MAX_CHUNKS_PER_DOC)
-            .order_by(numbered.c.document_id, numbered.c.chunk_id)
+            .order_by(numbered.c.document_id, numbered.c.rn)
        )
        chunk_result = await session.execute(chunk_query)
        fetched_chunks = chunk_result.all()
@ -531,7 +534,7 @@ async def fetch_mentioned_documents(
        chunk_result = await session.execute(
            select(Chunk.id, Chunk.content, Chunk.document_id)
            .where(Chunk.document_id.in_(list(docs.keys())))
-            .order_by(Chunk.document_id, Chunk.id)
+            .order_by(Chunk.document_id, Chunk.position, Chunk.id)
        )
        chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs}
        for row in chunk_result.all():
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py
@ -10,70 +10,53 @@ from langgraph.types import Command
 from litellm import aimage_generation
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import selectinload
 from app.agents.chat.multi_agent_chat.shared.receipts.command import with_receipt
 from app.agents.chat.multi_agent_chat.shared.receipts.receipt import make_receipt
 from app.config import config
 from app.db import (
    ImageGeneration,
-    ImageGenerationConfig,
+    Model,
    SearchSpace,
    shielded_async_session,
 )
 from app.services.auto_model_pin_service import (
    auto_model_candidates,
    choose_auto_model_candidate,
 )
 from app.services.image_gen_router_service import (
    IMAGE_GEN_AUTO_MODE_ID,
    ImageGenRouterService,
    is_image_gen_auto_mode,
 )
-from app.services.provider_api_base import resolve_api_base
+from app.services.model_capabilities import has_capability
 from app.services.model_resolver import to_litellm
 from app.utils.signed_image_urls import generate_image_token
 logger = logging.getLogger(__name__)
-# Provider mapping (same as routes)
+
-_PROVIDER_MAP = {
+def _get_global_model(model_id: int) -> dict | None:
-    "OPENAI": "openai",
+    return next((m for m in config.GLOBAL_MODELS if m.get("id") == model_id), None)
    "AZURE_OPENAI": "azure",
    "GOOGLE": "gemini",
    "VERTEX_AI": "vertex_ai",
    "BEDROCK": "bedrock",
    "RECRAFT": "recraft",
    "OPENROUTER": "openrouter",
    "XINFERENCE": "xinference",
    "NSCALE": "nscale",
 }
-def _resolve_provider_prefix(provider: str, custom_provider: str | None) -> str:
+def _get_global_connection(connection_id: int) -> dict | None:
-    if custom_provider:
+    return next(
-        return custom_provider
+        (c for c in config.GLOBAL_CONNECTIONS if c.get("id") == connection_id),
-    return _PROVIDER_MAP.get(provider.upper(), provider.lower())
+        None,
-
+    )
 def _build_model_string(
    provider: str, model_name: str, custom_provider: str | None
 ) -> str:
    return f"{_resolve_provider_prefix(provider, custom_provider)}/{model_name}"
 def _get_global_image_gen_config(config_id: int) -> dict | None:
    """Get a global image gen config by negative ID."""
    for cfg in config.GLOBAL_IMAGE_GEN_CONFIGS:
        if cfg.get("id") == config_id:
            return cfg
    return None
 def create_generate_image_tool(
    search_space_id: int,
    db_session: AsyncSession,
-    image_generation_config_id_override: int | None = None,
+    image_gen_model_id_override: int | None = None,
 ):
    """Create ``generate_image`` with bound search space; DB work uses a per-call session.
-    ``image_generation_config_id_override``: when set (automations running on a
+    ``image_gen_model_id_override``: when set (automations running on a
-    captured model), use this config id instead of reading the search space's
+    captured model), use this model id instead of reading the search space's
-    live ``image_generation_config_id``.
+    live ``image_gen_model_id``.
    """
    del db_session  # tool uses a fresh per-call session instead
@ -118,26 +101,23 @@ def create_generate_image_tool(
            # task's session is shared across every tool; without isolation,
            # autoflushes from a concurrent writer poison this tool too.
            async with shielded_async_session() as session:
-                if image_generation_config_id_override is not None:
+                result = await session.execute(
                    select(SearchSpace).filter(SearchSpace.id == search_space_id)
                )
                search_space = result.scalars().first()
                if not search_space:
                    return _failed(
                        {"error": "Search space not found"},
                        error="Search space not found",
                    )
                if image_gen_model_id_override is not None:
                    # Automation run: use the captured image model, insulated from
                    # later search-space changes. No search-space read needed.
-                    config_id = (
+                    config_id = image_gen_model_id_override or IMAGE_GEN_AUTO_MODE_ID
                        image_generation_config_id_override or IMAGE_GEN_AUTO_MODE_ID
                    )
                else:
                    result = await session.execute(
                        select(SearchSpace).filter(SearchSpace.id == search_space_id)
                    )
                    search_space = result.scalars().first()
                    if not search_space:
                        return _failed(
                            {"error": "Search space not found"},
                            error="Search space not found",
                        )
                    config_id = (
-                        search_space.image_generation_config_id
+                        search_space.image_gen_model_id or IMAGE_GEN_AUTO_MODE_ID
                        or IMAGE_GEN_AUTO_MODE_ID
                    )
                # size/quality/style are intentionally omitted: valid values
@ -147,73 +127,86 @@ def create_generate_image_tool(
                    gen_kwargs["n"] = n
                if is_image_gen_auto_mode(config_id):
-                    if not ImageGenRouterService.is_initialized():
+                    candidates = await auto_model_candidates(
                        session,
                        search_space_id=search_space_id,
                        user_id=search_space.user_id,
                        capability="image_gen",
                    )
                    if not candidates:
                        err = (
-                            "No image generation models configured. "
+                            "No image generation models available. "
                            "Please add an image model in Settings > Image Models."
                        )
                        return _failed({"error": err}, error=err)
-                    response = await ImageGenRouterService.aimage_generation(
+                    config_id = int(
-                        prompt=prompt, model="auto", **gen_kwargs
+                        choose_auto_model_candidate(candidates, search_space_id)["id"]
                    )
-                elif config_id < 0:
+
-                    cfg = _get_global_image_gen_config(config_id)
+                provider_base_url: str | None = None
-                    if not cfg:
+
-                        err = f"Image generation config {config_id} not found"
+                if config_id < 0:
                    global_model = _get_global_model(config_id)
                    if not global_model or not has_capability(
                        global_model, "image_gen"
                    ):
                        err = f"Image generation model {config_id} not found"
                        return _failed({"error": err}, error=err)
                    global_connection = _get_global_connection(
                        global_model["connection_id"]
                    )
                    if not global_connection:
                        err = f"Image generation connection for model {config_id} not found"
                        return _failed({"error": err}, error=err)
-                    provider_prefix = _resolve_provider_prefix(
+                    model_string, resolved_kwargs = to_litellm(
-                        cfg.get("provider", ""), cfg.get("custom_provider")
+                        global_connection,
                        global_model["model_id"],
                    )
-                    model_string = f"{provider_prefix}/{cfg['model_name']}"
+                    gen_kwargs.update(resolved_kwargs)
-                    gen_kwargs["api_key"] = cfg.get("api_key")
+                    provider_base_url = resolved_kwargs.get("api_base")
                    # Defense-in-depth: an empty ``api_base`` must not fall
                    # through to LiteLLM's global ``api_base`` (e.g. Azure).
                    api_base = resolve_api_base(
                        provider=cfg.get("provider"),
                        provider_prefix=provider_prefix,
                        config_api_base=cfg.get("api_base"),
                    )
                    if api_base:
                        gen_kwargs["api_base"] = api_base
                    if cfg.get("api_version"):
                        gen_kwargs["api_version"] = cfg["api_version"]
                    if cfg.get("litellm_params"):
                        gen_kwargs.update(cfg["litellm_params"])
                    response = await aimage_generation(
                        prompt=prompt, model=model_string, **gen_kwargs
                    )
                else:
-                    # Positive ID = user-created ImageGenerationConfig
+                    # Positive ID = Model + Connection
                    cfg_result = await session.execute(
-                        select(ImageGenerationConfig).filter(
+                        select(Model)
-                            ImageGenerationConfig.id == config_id
+                        .options(selectinload(Model.connection))
-                        )
+                        .filter(Model.id == config_id, Model.enabled.is_(True))
                    )
-                    db_cfg = cfg_result.scalars().first()
+                    db_model = cfg_result.scalars().first()
-                    if not db_cfg:
+                    if (
-                        err = f"Image generation config {config_id} not found"
+                        not db_model
                        or not db_model.connection
                        or not db_model.connection.enabled
                    ):
                        err = f"Image generation model {config_id} not found"
                        return _failed({"error": err}, error=err)
                    conn = db_model.connection
                    if (
                        conn.search_space_id is not None
                        and conn.search_space_id != search_space_id
                    ):
                        err = f"Image generation model {config_id} not found"
                        return _failed({"error": err}, error=err)
                    if (
                        conn.user_id is not None
                        and conn.user_id != search_space.user_id
                    ):
                        err = f"Image generation model {config_id} not found"
                        return _failed({"error": err}, error=err)
                    if not has_capability(db_model, "image_gen"):
                        err = f"Model {config_id} is not image-generation capable"
                        return _failed({"error": err}, error=err)
-                    provider_prefix = _resolve_provider_prefix(
+                    model_string, resolved_kwargs = to_litellm(
-                        db_cfg.provider.value, db_cfg.custom_provider
+                        db_model.connection,
                        db_model.model_id,
                    )
-                    model_string = f"{provider_prefix}/{db_cfg.model_name}"
+                    gen_kwargs.update(resolved_kwargs)
-                    gen_kwargs["api_key"] = db_cfg.api_key
+                    provider_base_url = resolved_kwargs.get("api_base")
                    # Defense-in-depth: an empty ``api_base`` must not fall
                    # through to LiteLLM's global ``api_base`` (e.g. Azure).
                    api_base = resolve_api_base(
                        provider=db_cfg.provider.value,
                        provider_prefix=provider_prefix,
                        config_api_base=db_cfg.api_base,
                    )
                    if api_base:
                        gen_kwargs["api_base"] = api_base
                    if db_cfg.api_version:
                        gen_kwargs["api_version"] = db_cfg.api_version
                    if db_cfg.litellm_params:
                        gen_kwargs.update(db_cfg.litellm_params)
                    response = await aimage_generation(
                        prompt=prompt, model=model_string, **gen_kwargs
@ -230,7 +223,7 @@ def create_generate_image_tool(
                    prompt=prompt,
                    model=getattr(response, "_hidden_params", {}).get("model"),
                    n=n,
-                    image_generation_config_id=config_id,
+                    image_gen_model_id=config_id,
                    response_data=response_dict,
                    search_space_id=search_space_id,
                    access_token=access_token,
@ -252,8 +245,19 @@ def create_generate_image_tool(
            # b64_json (e.g. gpt-image-1) is served via our backend endpoint so
            # megabytes of base64 don't bloat the LLM context.
            # Some OpenAI-compatible backends (e.g. Xinference) return a relative
            # URL like /files/image.png. Browsers can't resolve these, so we
            # prepend the provider's base origin when the URL starts with "/".
            if first_image.get("url"):
-                image_url = first_image["url"]
+                raw_url: str = first_image["url"]
                if raw_url.startswith("/") and provider_base_url:
                    from urllib.parse import urlparse
                    parsed = urlparse(provider_base_url)
                    origin = f"{parsed.scheme}://{parsed.netloc}"
                    image_url = f"{origin}{raw_url}"
                else:
                    image_url = raw_url
            elif first_image.get("b64_json"):
                backend_url = config.BACKEND_URL or "http://localhost:8000"
                image_url = (
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/index.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/index.py
@ -51,8 +51,6 @@ def load_tools(
        create_generate_image_tool(
            search_space_id=d["search_space_id"],
            db_session=d["db_session"],
-            image_generation_config_id_override=d.get(
+            image_gen_model_id_override=d.get("image_gen_model_id_override"),
                "image_generation_config_id_override"
            ),
        ),
    ]
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py
@ -122,7 +122,7 @@ async def _browse_recent_documents(
        chunk_query = (
            select(Chunk)
            .where(Chunk.document_id.in_(doc_ids))
-            .order_by(Chunk.document_id, Chunk.id)
+            .order_by(Chunk.document_id, Chunk.position, Chunk.id)
        )
        chunk_result = await session.execute(chunk_query)
        raw_chunks = chunk_result.scalars().all()
--- a/surfsense_backend/app/agents/chat/runtime/llm_config.py
+++ b/surfsense_backend/app/agents/chat/runtime/llm_config.py
@ -2,9 +2,9 @@
 LLM configuration utilities for SurfSense agents.
 This module provides functions for loading LLM configurations from:
-1. Auto mode (ID 0) - Uses LiteLLM Router for load balancing
+1. Auto mode (ID 0) - Resolved by callers to a concrete model-connection model
 2. YAML files (global configs with negative IDs)
-3. Database NewLLMConfig table (user-created configs with positive IDs)
+3. Database model-connections table (user-created configs with positive IDs)
 It also provides utilities for creating ChatLiteLLM instances and
 managing prompt configurations.
@ -24,8 +24,6 @@ from langchain_core.messages import AIMessage, BaseMessage
 from langchain_core.outputs import ChatGenerationChunk, ChatResult
 from langchain_litellm import ChatLiteLLM
 from litellm import get_model_info
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.agents.chat.runtime.prompt_caching import (
    apply_litellm_prompt_caching,
@ -33,10 +31,7 @@ from app.agents.chat.runtime.prompt_caching import (
 from app.services.llm_router_service import (
    AUTO_MODE_ID,
    ChatLiteLLMRouter,
    LLMRouterService,
    _sanitize_content,
    get_auto_mode_llm,
    is_auto_mode,
 )
@ -51,16 +46,19 @@ def _sanitize_messages(messages: list[BaseMessage]) -> list[BaseMessage]:
      reject the blank text.  The OpenAI spec says ``content`` should be
      ``null`` when an assistant message only carries tool calls.
    """
    sanitized: list[BaseMessage] = []
    for msg in messages:
-        if isinstance(msg.content, list):
+        next_msg = msg.model_copy(deep=True)
-            msg.content = _sanitize_content(msg.content)
+        if isinstance(next_msg.content, list):
            next_msg.content = _sanitize_content(next_msg.content)
        if (
-            isinstance(msg, AIMessage)
+            isinstance(next_msg, AIMessage)
-            and (not msg.content or msg.content == "")
+            and (not next_msg.content or next_msg.content == "")
-            and getattr(msg, "tool_calls", None)
+            and getattr(next_msg, "tool_calls", None)
        ):
-            msg.content = None  # type: ignore[assignment]
+            next_msg.content = None  # type: ignore[assignment]
-    return messages
+        sanitized.append(next_msg)
    return sanitized
 class SanitizedChatLiteLLM(ChatLiteLLM):
@ -91,13 +89,21 @@ class SanitizedChatLiteLLM(ChatLiteLLM):
        ):
            yield chunk
-
+    async def _agenerate(
-# Re-exported under the historical name ``PROVIDER_MAP``. Source of truth lives
+        self,
-# in provider_capabilities so the YAML loader can resolve prefixes during
+        messages: list[BaseMessage],
-# app.config init without importing the agent/tools tree.
+        stop: list[str] | None = None,
-from app.services.provider_capabilities import (  # noqa: E402
+        run_manager: AsyncCallbackManagerForLLMRun | None = None,
-    _PROVIDER_PREFIX_MAP as PROVIDER_MAP,
+        stream: bool | None = None,
-)
+        **kwargs: Any,
    ) -> ChatResult:
        return await super()._agenerate(
            _sanitize_messages(messages),
            stop=stop,
            run_manager=run_manager,
            stream=stream,
            **kwargs,
        )
 def _attach_model_profile(llm: ChatLiteLLM, model_string: str) -> None:
@ -121,8 +127,9 @@ class AgentConfig:
    """
    Complete configuration for the SurfSense agent.
-    This combines LLM settings with prompt configuration from NewLLMConfig.
+    This combines resolved model settings with prompt configuration.
-    Supports Auto mode (ID 0) which uses LiteLLM Router for load balancing.
+    Supports Auto mode metadata (ID 0). Runtime callers must resolve Auto to
    a concrete global or BYOK model before constructing ChatLiteLLM.
    """
    # LLM Model Settings
@ -170,7 +177,7 @@ class AgentConfig:
            use_default_system_instructions=True,
            citations_enabled=True,
            config_id=AUTO_MODE_ID,
-            config_name="Auto (Fastest)",
+            config_name="Auto",
            is_auto_mode=True,
            billing_tier="free",
            is_premium=False,
@ -181,64 +188,21 @@ class AgentConfig:
            supports_image_input=True,
        )
    @classmethod
    def from_new_llm_config(cls, config) -> "AgentConfig":
        """Build an AgentConfig from a NewLLMConfig database model."""
        # Lazy import: keeps provider_capabilities (and litellm) out of init order.
        from app.services.provider_capabilities import derive_supports_image_input
        provider_value = (
            config.provider.value
            if hasattr(config.provider, "value")
            else str(config.provider)
        )
        litellm_params = config.litellm_params or {}
        base_model = (
            litellm_params.get("base_model")
            if isinstance(litellm_params, dict)
            else None
        )
        return cls(
            provider=provider_value,
            model_name=config.model_name,
            api_key=config.api_key,
            api_base=config.api_base,
            custom_provider=config.custom_provider,
            litellm_params=config.litellm_params,
            system_instructions=config.system_instructions,
            use_default_system_instructions=config.use_default_system_instructions,
            citations_enabled=config.citations_enabled,
            config_id=config.id,
            config_name=config.name,
            is_auto_mode=False,
            billing_tier="free",
            is_premium=False,
            anonymous_enabled=False,
            quota_reserve_tokens=None,
            # BYOK rows have no curated flag; ask LiteLLM (default-allow on
            # unknown). The streaming safety net still blocks explicit text-only.
            supports_image_input=derive_supports_image_input(
                provider=provider_value,
                model_name=config.model_name,
                base_model=base_model,
                custom_provider=config.custom_provider,
            ),
        )
    @classmethod
    def from_yaml_config(cls, yaml_config: dict) -> "AgentConfig":
        """Build an AgentConfig from a YAML configuration dictionary.
-        Supports the same prompt fields as NewLLMConfig (system_instructions,
+        Supports prompt fields such as system_instructions,
-        use_default_system_instructions, citations_enabled).
+        use_default_system_instructions, and citations_enabled.
        """
        # Lazy import: keeps provider_capabilities (and litellm) out of init order.
        from app.services.provider_capabilities import derive_supports_image_input
        system_instructions = yaml_config.get("system_instructions", "")
-        provider = yaml_config.get("provider", "").upper()
+        provider = yaml_config.get("provider") or yaml_config.get(
            "litellm_provider", ""
        )
        model_name = yaml_config.get("model_name", "")
        custom_provider = yaml_config.get("custom_provider")
        litellm_params = yaml_config.get("litellm_params") or {}
@ -324,93 +288,15 @@ def load_global_llm_config_by_id(llm_config_id: int) -> dict | None:
    return load_llm_config_from_yaml(llm_config_id)
 async def load_new_llm_config_from_db(
    session: AsyncSession,
    config_id: int,
 ) -> "AgentConfig | None":
    """Load a NewLLMConfig from the database by ID."""
    from app.db import NewLLMConfig
    try:
        result = await session.execute(
            select(NewLLMConfig).filter(NewLLMConfig.id == config_id)
        )
        config = result.scalars().first()
        if not config:
            print(f"Error: NewLLMConfig with id {config_id} not found")
            return None
        return AgentConfig.from_new_llm_config(config)
    except Exception as e:
        print(f"Error loading NewLLMConfig from database: {e}")
        return None
 async def load_agent_llm_config_for_search_space(
    session: AsyncSession,
    search_space_id: int,
 ) -> "AgentConfig | None":
    """Load the agent LLM config for a search space via its agent_llm_id.
    Positive id -> DB; negative -> YAML; None -> first global config (-1).
    """
    from app.db import SearchSpace
    try:
        result = await session.execute(
            select(SearchSpace).filter(SearchSpace.id == search_space_id)
        )
        search_space = result.scalars().first()
        if not search_space:
            print(f"Error: SearchSpace with id {search_space_id} not found")
            return None
        config_id = (
            search_space.agent_llm_id if search_space.agent_llm_id is not None else -1
        )
        return await load_agent_config(session, config_id, search_space_id)
    except Exception as e:
        print(f"Error loading agent LLM config for search space {search_space_id}: {e}")
        return None
 async def load_agent_config(
    session: AsyncSession,
    config_id: int,
    search_space_id: int | None = None,
 ) -> "AgentConfig | None":
    """Main config loader: id 0 -> Auto mode; negative -> YAML; positive -> DB."""
    if is_auto_mode(config_id):
        if not LLMRouterService.is_initialized():
            print("Error: Auto mode requested but LLM Router not initialized")
            return None
        return AgentConfig.from_auto_mode()
    if config_id < 0:
        # In-memory covers static YAML + dynamic OpenRouter configs.
        from app.config import config as app_config
        for cfg in app_config.GLOBAL_LLM_CONFIGS:
            if cfg.get("id") == config_id:
                return AgentConfig.from_yaml_config(cfg)
        yaml_config = load_llm_config_from_yaml(config_id)
        if yaml_config:
            return AgentConfig.from_yaml_config(yaml_config)
        return None
    else:
        return await load_new_llm_config_from_db(session, config_id)
 def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
    """Create a ChatLiteLLM instance from a global LLM config dictionary."""
    if llm_config.get("custom_provider"):
        model_string = f"{llm_config['custom_provider']}/{llm_config['model_name']}"
    else:
-        provider = llm_config.get("provider", "").upper()
+        provider = llm_config.get("provider") or llm_config.get(
-        provider_prefix = PROVIDER_MAP.get(provider, provider.lower())
+            "litellm_provider", "openai"
-        model_string = f"{provider_prefix}/{llm_config['model_name']}"
+        )
        model_string = f"{provider}/{llm_config['model_name']}"
    litellm_kwargs = {
        "model": model_string,
@ -433,29 +319,17 @@ def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
 def create_chat_litellm_from_agent_config(
    agent_config: AgentConfig,
 ) -> ChatLiteLLM | ChatLiteLLMRouter | None:
-    """Create a ChatLiteLLM (or, for Auto mode, a load-balancing router) from config."""
+    """Create a ChatLiteLLM from an already resolved concrete model config."""
    if agent_config.is_auto_mode:
-        if not LLMRouterService.is_initialized():
+        print(
-            print("Error: Auto mode requested but LLM Router not initialized")
+            "Error: Auto mode must be resolved to a concrete model before LLM creation"
-            return None
+        )
-        try:
+        return None
            router_llm = get_auto_mode_llm()
            if router_llm is not None:
                # Universal injection points only: auto-mode fans out across
                # providers, so provider-specific kwargs have no known target.
                apply_litellm_prompt_caching(router_llm, agent_config=agent_config)
            return router_llm
        except Exception as e:
            print(f"Error creating ChatLiteLLMRouter: {e}")
            return None
    if agent_config.custom_provider:
        model_string = f"{agent_config.custom_provider}/{agent_config.model_name}"
    else:
-        provider_prefix = PROVIDER_MAP.get(
+        model_string = f"{agent_config.provider}/{agent_config.model_name}"
            agent_config.provider, agent_config.provider.lower()
        )
        model_string = f"{provider_prefix}/{agent_config.model_name}"
    litellm_kwargs = {
        "model": model_string,
--- a/surfsense_backend/app/app.py
+++ b/surfsense_backend/app/app.py
@ -33,7 +33,6 @@ from app.config import (
    initialize_llm_router,
    initialize_openrouter_integration,
    initialize_pricing_registration,
    initialize_vision_llm_router,
 )
 from app.db import User, create_db_and_tables, get_async_session
 from app.exceptions import GENERIC_5XX_MESSAGE, ISSUES_URL, SurfSenseError
@ -622,7 +621,6 @@ async def lifespan(app: FastAPI):
    initialize_pricing_registration()
    initialize_llm_router()
    initialize_image_gen_router()
    initialize_vision_llm_router()
    # Phase 1.7 — JIT warmup. Bounded so a stuck warmup never delays
    # worker readiness. ``shield`` so Uvicorn cancelling startup
--- a/surfsense_backend/app/automations/actions/builtin/agent_task/dependencies.py
+++ b/surfsense_backend/app/automations/actions/builtin/agent_task/dependencies.py
@ -39,31 +39,31 @@ async def build_dependencies(
    *,
    session: AsyncSession,
    search_space_id: int,
-    agent_llm_id: int | None = None,
+    chat_model_id: int | None = None,
-    image_generation_config_id: int | None = None,
+    image_gen_model_id: int | None = None,
-    vision_llm_config_id: int | None = None,
+    vision_model_id: int | None = None,
 ) -> AgentDependencies:
    """Load the LLM bundle, connector service, and a per-invoke in-memory checkpointer.
-    Resolves the agent LLM from the automation's *captured* model snapshot
+    Resolves the chat model from the automation's *captured* model snapshot
-    (``agent_llm_id``) so runs are insulated from later chat/search-space model
+    (``chat_model_id``) so runs are insulated from later chat/search-space model
    changes. The model policy is enforced here as a runtime backstop: a captured
    model that is no longer billable (e.g. a premium global config was removed)
    fails the run clearly instead of silently consuming a free model.
-    When ``agent_llm_id`` is ``None`` (no captured snapshot — defensive fallback),
+    When ``chat_model_id`` is ``None`` (no captured snapshot — defensive fallback),
-    fall back to the live search space's ``agent_llm_id`` and validate that.
+    fall back to the live search space's ``chat_model_id`` and validate that.
    """
-    if agent_llm_id is not None:
+    if chat_model_id is not None:
        try:
            assert_models_billable(
-                agent_llm_id=agent_llm_id,
+                chat_model_id=chat_model_id,
-                image_generation_config_id=image_generation_config_id,
+                image_gen_model_id=image_gen_model_id,
-                vision_llm_config_id=vision_llm_config_id,
+                vision_model_id=vision_model_id,
            )
        except AutomationModelPolicyError as exc:
            raise DependencyError(str(exc)) from exc
-        resolved_agent_llm_id = agent_llm_id or 0
+        resolved_chat_model_id = chat_model_id or 0
    else:
        search_space = await session.get(SearchSpace, search_space_id)
        if search_space is None:
@ -72,15 +72,15 @@ async def build_dependencies(
            assert_automation_models_billable(search_space)
        except AutomationModelPolicyError as exc:
            raise DependencyError(str(exc)) from exc
-        resolved_agent_llm_id = search_space.agent_llm_id or 0
+        resolved_chat_model_id = search_space.chat_model_id or 0
    llm, agent_config, err = await load_llm_bundle(
        session,
-        config_id=resolved_agent_llm_id,
+        config_id=resolved_chat_model_id,
        search_space_id=search_space_id,
    )
    if err is not None or llm is None:
-        raise DependencyError(err or "failed to load agent LLM config")
+        raise DependencyError(err or "failed to load chat model config")
    connector_service, firecrawl_api_key = await setup_connector_and_firecrawl(
        session, search_space_id=search_space_id
--- a/surfsense_backend/app/automations/actions/builtin/agent_task/invoke.py
+++ b/surfsense_backend/app/automations/actions/builtin/agent_task/invoke.py
@ -150,9 +150,9 @@ async def run_agent_task(
        deps = await build_dependencies(
            session=agent_session,
            search_space_id=ctx.search_space_id,
-            agent_llm_id=ctx.agent_llm_id,
+            chat_model_id=ctx.chat_model_id,
-            image_generation_config_id=ctx.image_generation_config_id,
+            image_gen_model_id=ctx.image_gen_model_id,
-            vision_llm_config_id=ctx.vision_llm_config_id,
+            vision_model_id=ctx.vision_model_id,
        )
        agent = await create_multi_agent_chat_deep_agent(
@ -167,7 +167,7 @@ async def run_agent_task(
            firecrawl_api_key=deps.firecrawl_api_key,
            thread_visibility=ChatVisibility.PRIVATE,
            mentioned_document_ids=mentioned_document_ids,
-            image_generation_config_id=ctx.image_generation_config_id,
+            image_gen_model_id=ctx.image_gen_model_id,
        )
        agent_query, runtime_context = await _resolve_mention_context(
--- a/surfsense_backend/app/automations/actions/types.py
+++ b/surfsense_backend/app/automations/actions/types.py
@ -23,9 +23,9 @@ class ActionContext:
    # Captured model snapshot from the automation definition (``definition.models``),
    # resolved per run instead of the live search space. ``None`` falls back to the
    # search space's current prefs (defensive; should not happen post-capture).
-    agent_llm_id: int | None = None
+    chat_model_id: int | None = None
-    image_generation_config_id: int | None = None
+    image_gen_model_id: int | None = None
-    vision_llm_config_id: int | None = None
+    vision_model_id: int | None = None
 ActionHandler = Callable[[dict[str, Any]], Awaitable[Any]]
--- a/surfsense_backend/app/automations/runtime/executor.py
+++ b/surfsense_backend/app/automations/runtime/executor.py
@ -132,9 +132,7 @@ def _build_action_ctx(
        step_id=step.step_id,
        search_space_id=automation.search_space_id,
        creator_user_id=automation.created_by_user_id,
-        agent_llm_id=models.agent_llm_id if models else None,
+        chat_model_id=models.chat_model_id if models else None,
-        image_generation_config_id=(
+        image_gen_model_id=models.image_gen_model_id if models else None,
-            models.image_generation_config_id if models else None
+        vision_model_id=models.vision_model_id if models else None,
        ),
        vision_llm_config_id=models.vision_llm_config_id if models else None,
    )
--- a/surfsense_backend/app/automations/schemas/definition/envelope.py
+++ b/surfsense_backend/app/automations/schemas/definition/envelope.py
@ -14,16 +14,16 @@ from .trigger_spec import TriggerSpec
 class AutomationModels(BaseModel):
    """Captured model profile for an automation.
-    Snapshotted from the search space's preferences at create time so runs are
+    Snapshotted from the search space's model roles at create time so runs are
-    insulated from later chat/search-space model changes. Config-id conventions
+    insulated from later chat/search-space model changes. Model-id conventions
    match the shared scheme (``0`` Auto, ``< 0`` global, ``> 0`` BYOK).
    """
    model_config = ConfigDict(extra="forbid")
-    agent_llm_id: int = 0
+    chat_model_id: int = 0
-    image_generation_config_id: int = 0
+    image_gen_model_id: int = 0
-    vision_llm_config_id: int = 0
+    vision_model_id: int = 0
 class AutomationDefinition(BaseModel):
--- a/surfsense_backend/app/automations/services/automation.py
+++ b/surfsense_backend/app/automations/services/automation.py
@ -57,9 +57,9 @@ class AutomationService:
        else:
            search_space = await self._assert_models_billable(payload.search_space_id)
            payload.definition.models = AutomationModels(
-                agent_llm_id=search_space.agent_llm_id or 0,
+                chat_model_id=search_space.chat_model_id or 0,
-                image_generation_config_id=search_space.image_generation_config_id or 0,
+                image_gen_model_id=search_space.image_gen_model_id or 0,
-                vision_llm_config_id=search_space.vision_llm_config_id or 0,
+                vision_model_id=search_space.vision_model_id or 0,
            )
        automation = Automation(
@ -225,9 +225,9 @@ class AutomationService:
        """
        try:
            assert_models_billable(
-                agent_llm_id=models.agent_llm_id,
+                chat_model_id=models.chat_model_id,
-                image_generation_config_id=models.image_generation_config_id,
+                image_gen_model_id=models.image_gen_model_id,
-                vision_llm_config_id=models.vision_llm_config_id,
+                vision_model_id=models.vision_model_id,
            )
        except AutomationModelPolicyError as exc:
            raise HTTPException(status_code=422, detail=str(exc)) from exc
--- a/surfsense_backend/app/automations/services/model_policy.py
+++ b/surfsense_backend/app/automations/services/model_policy.py
@ -2,11 +2,11 @@
 Automations run unattended, so every run must be **billable**: it may only use
 either a premium global model (``billing_tier == "premium"``) or a user-provided
-BYOK model (a positive config id pointing at a per-user/per-space DB row). Free
+BYOK model (a positive model id pointing at a per-user/per-space DB row). Free
 global models and Auto mode are blocked, because Auto can dispatch to a free
 deployment and free models aren't metered in premium credits.
-Config id conventions (shared across chat / image / vision):
+Model id conventions (shared across chat / image / vision):
 - ``id == 0``  → Auto mode (``AUTO_MODE_ID`` / ``IMAGE_GEN_AUTO_MODE_ID`` /
  ``VISION_AUTO_MODE_ID``). Blocked.
 - ``id < 0``   → global YAML/OpenRouter config. Allowed only if premium.
@ -24,70 +24,45 @@ from typing import TYPE_CHECKING, Literal
 if TYPE_CHECKING:
    from app.db import SearchSpace
-ModelKind = Literal["llm", "image", "vision"]
+ModelKind = Literal["chat", "image", "vision"]
 _KIND_LABEL: dict[ModelKind, str] = {
-    "llm": "agent LLM",
+    "chat": "chat model",
    "image": "image generation model",
    "vision": "vision model",
 }
-def _is_premium_global(kind: ModelKind, config_id: int) -> bool:
+def _is_premium_global(model_id: int) -> bool:
-    """Return True if a negative (global) config id is a premium tier model."""
+    """Return True if a negative (global) model id is a premium tier model."""
    from app.config import config as app_config
-    cfg: dict | None = None
+    model = next((m for m in app_config.GLOBAL_MODELS if m.get("id") == model_id), None)
-    if kind == "llm":
+    if not model:
        from app.agents.chat.runtime.llm_config import (
            load_global_llm_config_by_id,
        )
        cfg = load_global_llm_config_by_id(config_id)
    elif kind == "image":
        cfg = next(
            (
                c
                for c in app_config.GLOBAL_IMAGE_GEN_CONFIGS
                if c.get("id") == config_id
            ),
            None,
        )
    else:  # vision
        cfg = next(
            (
                c
                for c in app_config.GLOBAL_VISION_LLM_CONFIGS
                if c.get("id") == config_id
            ),
            None,
        )
    if not cfg:
        return False
-    return str(cfg.get("billing_tier", "free")).lower() == "premium"
+    return str(model.get("billing_tier", "free")).lower() == "premium"
-def _classify(kind: ModelKind, config_id: int | None) -> tuple[bool, str]:
+def _classify(kind: ModelKind, model_id: int | None) -> tuple[bool, str]:
-    """Classify a resolved config id as allowed or blocked.
+    """Classify a resolved model id as allowed or blocked.
    Returns ``(allowed, reason)``; ``reason`` is empty when allowed.
    """
    label = _KIND_LABEL[kind]
-    if config_id is None or config_id == 0:
+    if model_id is None or model_id == 0:
        return (
            False,
            f"The {label} is set to Auto mode. Automations require an explicit "
            "premium model or your own (BYOK) model so every run is billable.",
        )
-    if config_id > 0:
+    if model_id > 0:
-        # Positive id → user-owned BYOK config. Always allowed.
+        # Positive id -> user/search-space BYOK model. Always allowed.
        return True, ""
-    # Negative id → global config. Allowed only if premium.
+    # Negative id -> global model. Allowed only if premium.
-    if _is_premium_global(kind, config_id):
+    if _is_premium_global(model_id):
        return True, ""
    return (
@ -99,27 +74,27 @@ def _classify(kind: ModelKind, config_id: int | None) -> tuple[bool, str]:
 def get_model_eligibility(
    *,
-    agent_llm_id: int | None,
+    chat_model_id: int | None,
-    image_generation_config_id: int | None,
+    image_gen_model_id: int | None,
-    vision_llm_config_id: int | None,
+    vision_model_id: int | None,
 ) -> dict:
-    """Return ``{"allowed": bool, "violations": [...]}`` for explicit config ids.
+    """Return ``{"allowed": bool, "violations": [...]}`` for explicit model ids.
    The ID-based core shared by both the search-space path (creation/eligibility)
    and the captured-snapshot path (runtime backstop). Each violation is
-    ``{"kind", "config_id", "reason"}``.
+    ``{"kind", "model_id", "reason"}``.
    """
    checks: list[tuple[ModelKind, int | None]] = [
-        ("llm", agent_llm_id),
+        ("chat", chat_model_id),
-        ("image", image_generation_config_id),
+        ("image", image_gen_model_id),
-        ("vision", vision_llm_config_id),
+        ("vision", vision_model_id),
    ]
    violations: list[dict] = []
-    for kind, config_id in checks:
+    for kind, model_id in checks:
-        allowed, reason = _classify(kind, config_id)
+        allowed, reason = _classify(kind, model_id)
        if not allowed:
-            violations.append({"kind": kind, "config_id": config_id, "reason": reason})
+            violations.append({"kind": kind, "model_id": model_id, "reason": reason})
    return {"allowed": not violations, "violations": violations}
@ -131,9 +106,9 @@ def get_automation_model_eligibility(search_space: SearchSpace) -> dict:
    wrapper over :func:`get_model_eligibility`.
    """
    return get_model_eligibility(
-        agent_llm_id=search_space.agent_llm_id,
+        chat_model_id=search_space.chat_model_id,
-        image_generation_config_id=search_space.image_generation_config_id,
+        image_gen_model_id=search_space.image_gen_model_id,
-        vision_llm_config_id=search_space.vision_llm_config_id,
+        vision_model_id=search_space.vision_model_id,
    )
@ -150,9 +125,9 @@ class AutomationModelPolicyError(Exception):
 def assert_models_billable(
    *,
-    agent_llm_id: int | None,
+    chat_model_id: int | None,
-    image_generation_config_id: int | None,
+    image_gen_model_id: int | None,
-    vision_llm_config_id: int | None,
+    vision_model_id: int | None,
 ) -> None:
    """Raise :class:`AutomationModelPolicyError` if any explicit id is not billable.
@ -160,9 +135,9 @@ def assert_models_billable(
    captured model snapshot.
    """
    result = get_model_eligibility(
-        agent_llm_id=agent_llm_id,
+        chat_model_id=chat_model_id,
-        image_generation_config_id=image_generation_config_id,
+        image_gen_model_id=image_gen_model_id,
-        vision_llm_config_id=vision_llm_config_id,
+        vision_model_id=vision_model_id,
    )
    if not result["allowed"]:
        raise AutomationModelPolicyError(result["violations"])
--- a/surfsense_backend/app/celery_app.py
+++ b/surfsense_backend/app/celery_app.py
@ -115,14 +115,12 @@ def init_worker(**kwargs):
        initialize_llm_router,
        initialize_openrouter_integration,
        initialize_pricing_registration,
        initialize_vision_llm_router,
    )
    initialize_openrouter_integration()
    initialize_pricing_registration()
    initialize_llm_router()
    initialize_image_gen_router()
    initialize_vision_llm_router()
 # Celery configuration, sourced from the central Config singleton
@ -192,6 +190,8 @@ celery_app = Celery(
        "app.tasks.celery_tasks.stripe_reconciliation_task",
        "app.tasks.celery_tasks.auto_reload_task",
        "app.tasks.celery_tasks.gateway_tasks",
        "app.etl_pipeline.cache.eviction.task",
        "app.indexing_pipeline.cache.eviction.task",
        "app.automations.tasks.execute_run",
        "app.automations.triggers.builtin.schedule.selector",
        "app.automations.triggers.builtin.event.selector",
@ -306,6 +306,18 @@ celery_app.conf.beat_schedule = {
        "schedule": crontab(hour="3", minute="17"),
        "options": {"expires": 600},
    },
    # Prune the ETL parse cache (TTL + size budget) once daily, off-peak.
    "evict-etl-cache": {
        "task": "evict_etl_cache",
        "schedule": crontab(hour="4", minute="0"),
        "options": {"expires": 600},
    },
    # Prune the embedding cache (chunk+embedding sets) once daily, off-peak.
    "evict-embedding-cache": {
        "task": "evict_embedding_cache",
        "schedule": crontab(hour="4", minute="30"),
        "options": {"expires": 600},
    },
    # Fire due automation schedule triggers (Beat entry owned by the schedule
    # trigger; see app.automations.triggers.builtin.schedule.source).
    **SCHEDULE_BEAT_SCHEDULE,
--- a/surfsense_backend/app/config/init.py
+++ b/surfsense_backend/app/config/init.py
@ -78,8 +78,7 @@ def load_global_llm_configs():
        # stamps) never leak into the cached YAML structure.
        configs = copy.deepcopy(data.get("global_llm_configs", []))
-        # Lazy import keeps the `app.config` -> `app.services` edge one-way
+        # Lazy import keeps the `app.config` -> `app.services` edge one-way.
        # It also matches the `provider_api_base` pattern used elsewhere.
        from app.services.provider_capabilities import derive_supports_image_input
        seen_slugs: dict[str, int] = {}
@ -104,7 +103,7 @@ def load_global_llm_configs():
                    else None
                )
                cfg["supports_image_input"] = derive_supports_image_input(
-                    provider=cfg.get("provider"),
+                    provider=cfg.get("provider") or cfg.get("litellm_provider"),
                    model_name=cfg.get("model_name"),
                    base_model=base_model,
                    custom_provider=cfg.get("custom_provider"),
@ -120,10 +119,10 @@ def load_global_llm_configs():
                else:
                    seen_slugs[slug] = cfg.get("id", 0)
-        # Stamp Auto (Fastest) ranking metadata. YAML configs are always
+        # Stamp Auto ranking metadata. YAML configs are always
        # Tier A — operator-curated, locked first when premium-eligible.
        # The OpenRouter refresh tick later re-stamps health for any cfg
-        # whose provider == "OPENROUTER" via _enrich_health.
+        # whose provider == "openrouter" via _enrich_health.
        try:
            from app.services.quality_score import static_score_yaml
@ -133,7 +132,7 @@ def load_global_llm_configs():
                cfg["quality_score_static"] = static_q
                cfg["quality_score"] = static_q
                cfg["quality_score_health"] = None
-                # YAML cfgs whose provider is OPENROUTER are also subject
+                # YAML cfgs whose provider is openrouter are also subject
                # to health gating against their own /endpoints data — a
                # hand-picked dead OR model is still dead. _enrich_health
                # re-stamps health_gated for them on the next refresh tick.
@ -211,42 +210,6 @@ def load_global_image_gen_configs():
        return []
 def load_global_vision_llm_configs():
    data = _global_config_data()
    if not data:
        return []
    try:
        configs = copy.deepcopy(data.get("global_vision_llm_configs", []) or [])
        for cfg in configs:
            if isinstance(cfg, dict):
                cfg.setdefault("billing_tier", "free")
        return configs
    except Exception as e:
        print(f"Warning: Failed to load global vision LLM configs: {e}")
        return []
 def load_vision_llm_router_settings():
    default_settings = {
        "routing_strategy": "usage-based-routing",
        "num_retries": 3,
        "allowed_fails": 3,
        "cooldown_time": 60,
    }
    data = _global_config_data()
    if not data:
        return default_settings
    try:
        settings = data.get("vision_llm_router_settings", {})
        return {**default_settings, **settings}
    except Exception as e:
        print(f"Warning: Failed to load vision LLM router settings: {e}")
        return default_settings
 def load_image_gen_router_settings():
    """
    Load router settings for image generation Auto mode from YAML file.
@ -363,8 +326,8 @@ def initialize_openrouter_integration():
        else:
            print("Info: OpenRouter integration enabled but no models fetched")
-        # Image generation + vision LLM emissions are opt-in (issue L).
+        # Image generation emissions reuse the catalogue already cached by
-        # Both reuse the catalogue already cached by ``service.initialize``
+        # ``service.initialize``
        # so we don't make additional network calls here.
        if settings.get("image_generation_enabled"):
            try:
@ -378,21 +341,26 @@ def initialize_openrouter_integration():
            except Exception as e:
                print(f"Warning: Failed to inject OpenRouter image-gen configs: {e}")
-        if settings.get("vision_enabled"):
+        refresh_global_model_catalog()
            try:
                vision_configs = service.get_vision_llm_configs()
                if vision_configs:
                    config.GLOBAL_VISION_LLM_CONFIGS.extend(vision_configs)
                    print(
                        f"Info: OpenRouter integration added {len(vision_configs)} "
                        f"vision LLM models"
                    )
            except Exception as e:
                print(f"Warning: Failed to inject OpenRouter vision-LLM configs: {e}")
    except Exception as e:
        print(f"Warning: Failed to initialize OpenRouter integration: {e}")
 def materialize_global_configs():
    from app.services.global_model_catalog import materialize_global_model_catalog
    return materialize_global_model_catalog(
        chat_configs=getattr(config, "GLOBAL_LLM_CONFIGS", []),
        image_configs=getattr(config, "GLOBAL_IMAGE_GEN_CONFIGS", []),
    )
 def refresh_global_model_catalog():
    connections, models = materialize_global_configs()
    config.GLOBAL_CONNECTIONS = connections
    config.GLOBAL_MODELS = models
 def initialize_pricing_registration():
    """
    Teach LiteLLM the per-token cost of every deployment in
@ -430,7 +398,10 @@ def initialize_llm_router():
    router_settings = config.ROUTER_SETTINGS
    if not all_configs:
-        print("Info: No global LLM configs found, Auto mode will not be available")
+        print(
            "Info: No global LLM configs found; global Auto pool is unavailable. "
            "Auto can still use enabled BYOK models."
        )
        return
    try:
@ -475,32 +446,6 @@ def initialize_image_gen_router():
        print(f"Warning: Failed to initialize Image Generation Router: {e}")
 def initialize_vision_llm_router():
    vision_configs = load_global_vision_llm_configs()
    # Reuse the router settings already parsed at Config construction. The
    # *configs* list is intentionally re-read from YAML (it must exclude the
    # OpenRouter-injected dynamic models held in config.GLOBAL_VISION_LLM_CONFIGS).
    router_settings = config.VISION_LLM_ROUTER_SETTINGS
    if not vision_configs:
        print(
            "Info: No global vision LLM configs found, "
            "Vision LLM Auto mode will not be available"
        )
        return
    try:
        from app.services.vision_llm_router_service import VisionLLMRouterService
        VisionLLMRouterService.initialize(vision_configs, router_settings)
        print(
            f"Info: Vision LLM Router initialized with {len(vision_configs)} models "
            f"(strategy: {router_settings.get('routing_strategy', 'usage-based-routing')})"
        )
    except Exception as e:
        print(f"Warning: Failed to initialize Vision LLM Router: {e}")
 class Config:
    # Check if ffmpeg is installed
    if not is_ffmpeg_installed():
@ -612,14 +557,15 @@ class Config:
    # Platform web search (SearXNG)
    SEARXNG_DEFAULT_HOST = os.getenv("SEARXNG_DEFAULT_HOST")
-    NEXT_FRONTEND_URL = os.getenv("NEXT_FRONTEND_URL")
+    SURFSENSE_PUBLIC_URL = os.getenv("SURFSENSE_PUBLIC_URL")
    NEXT_FRONTEND_URL = os.getenv("NEXT_FRONTEND_URL") or SURFSENSE_PUBLIC_URL
    # Backend URL used to override http with https in the OAuth redirect URI
-    BACKEND_URL = os.getenv("BACKEND_URL")
+    BACKEND_URL = os.getenv("BACKEND_URL") or SURFSENSE_PUBLIC_URL
-    # Messaging gateway (Telegram v1)
+    # Messaging gateway
    # Global master switch: when FALSE, no gateway supervisors/workers start and all
-    # gateway HTTP routes return 404, regardless of the per-channel flags below.
+    # gated gateway HTTP routes return 404, regardless of the per-channel flags below.
-    GATEWAY_ENABLED = os.getenv("GATEWAY_ENABLED", "TRUE").upper() == "TRUE"
+    GATEWAY_ENABLED = os.getenv("GATEWAY_ENABLED", "FALSE").upper() == "TRUE"
    TELEGRAM_SHARED_BOT_TOKEN = os.getenv("TELEGRAM_SHARED_BOT_TOKEN")
    TELEGRAM_SHARED_BOT_USERNAME = os.getenv("TELEGRAM_SHARED_BOT_USERNAME")
    TELEGRAM_WEBHOOK_SECRET = os.getenv("TELEGRAM_WEBHOOK_SECRET")
@ -784,7 +730,7 @@ class Config:
        os.getenv("QUOTA_DEFAULT_IMAGE_RESERVE_MICROS", "50000")
    )
-    # Per-podcast reservation (in micro-USD). One agent LLM call generating
+    # Per-podcast reservation (in micro-USD). One chat model call generating
    # a transcript, typically 5k-20k completion tokens. $0.20 covers a long
    # premium-model run. Tune via env.
    QUOTA_DEFAULT_PODCAST_RESERVE_MICROS = int(
@ -890,6 +836,13 @@ class Config:
    # LLM instances are now managed per-user through the LLMConfig system
    # Legacy environment variables removed in favor of user-specific configurations
    # True when an operator-provided global_llm_config.yaml is present.
    # Used to gate the per-search-space LLM onboarding flow: when a global
    # config file exists, search spaces inherit it and onboarding is skipped.
    GLOBAL_LLM_CONFIG_FILE_EXISTS = (
        BASE_DIR / "app" / "config" / "global_llm_config.yaml"
    ).exists()
    # Global LLM Configurations (optional)
    # Load from global_llm_config.yaml if available
    # These can be used as default options for users
@ -904,11 +857,17 @@ class Config:
    # Router settings for Image Generation Auto mode
    IMAGE_GEN_ROUTER_SETTINGS = load_image_gen_router_settings()
-    # Global Vision LLM Configurations (optional)
+    # Virtual GLOBAL connection/model catalog. This is server-only metadata
-    GLOBAL_VISION_LLM_CONFIGS = load_global_vision_llm_configs()
+    # derived from global_llm_config.yaml; GLOBAL keys are not stored in DB.
    from app.services.global_model_catalog import (
        materialize_global_model_catalog as _materialize_global_model_catalog,
    )
-    # Router settings for Vision LLM Auto mode
+    GLOBAL_CONNECTIONS, GLOBAL_MODELS = _materialize_global_model_catalog(
-    VISION_LLM_ROUTER_SETTINGS = load_vision_llm_router_settings()
+        chat_configs=GLOBAL_LLM_CONFIGS,
        image_configs=GLOBAL_IMAGE_GEN_CONFIGS,
    )
    del _materialize_global_model_catalog
    # OpenRouter Integration settings (optional)
    OPENROUTER_INTEGRATION_SETTINGS = load_openrouter_integration_settings()
@ -974,6 +933,47 @@ class Config:
        AZURE_DI_ENDPOINT = os.getenv("AZURE_DI_ENDPOINT")
        AZURE_DI_KEY = os.getenv("AZURE_DI_KEY")
    # ETL parse cache: reuse parser output for identical bytes across workspaces.
    ETL_CACHE_ENABLED = (
        os.getenv("ETL_CACHE_ENABLED", "false").strip().lower() == "true"
    )
    # Bump to invalidate every cached entry after a parser/behaviour change.
    ETL_CACHE_PARSER_VERSION = int(os.getenv("ETL_CACHE_PARSER_VERSION", "1"))
    ETL_CACHE_TTL_DAYS = int(os.getenv("ETL_CACHE_TTL_DAYS", "90"))
    ETL_CACHE_MAX_TOTAL_MB = int(os.getenv("ETL_CACHE_MAX_TOTAL_MB", "5120"))
    ETL_CACHE_EVICTION_BATCH = int(os.getenv("ETL_CACHE_EVICTION_BATCH", "500"))
    # Optional dedicated blob storage; unset reuses the main file_storage backend.
    ETL_CACHE_STORAGE_BACKEND = os.getenv("ETL_CACHE_STORAGE_BACKEND")
    ETL_CACHE_STORAGE_CONTAINER = os.getenv("ETL_CACHE_STORAGE_CONTAINER")
    ETL_CACHE_STORAGE_LOCAL_PATH = os.getenv("ETL_CACHE_STORAGE_LOCAL_PATH")
    # Embedding cache: reuse chunk+embedding output for identical markdown across
    # workspaces. Blobs share the ETL_CACHE_STORAGE_* backend.
    EMBEDDING_CACHE_ENABLED = (
        os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true"
    )
    # Bump to invalidate every cached embedding set after a chunker change.
    EMBEDDING_CACHE_CHUNKER_VERSION = int(
        os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1")
    )
    EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90"))
    EMBEDDING_CACHE_MAX_TOTAL_MB = int(
        os.getenv("EMBEDDING_CACHE_MAX_TOTAL_MB", "5120")
    )
    EMBEDDING_CACHE_EVICTION_BATCH = int(
        os.getenv("EMBEDDING_CACHE_EVICTION_BATCH", "500")
    )
    # Incremental re-indexing: on document edits, keep chunk rows whose text is
    # unchanged (reusing their embeddings) and embed only new/changed chunks.
    # Kill switch -- disabling falls back to delete-all + full re-embed.
    CHUNK_RECONCILE_ENABLED = (
        os.getenv("CHUNK_RECONCILE_ENABLED", "true").strip().lower() == "true"
    )
    INDEXING_CHUNK_INSERT_BATCH_SIZE = int(
        os.getenv("INDEXING_CHUNK_INSERT_BATCH_SIZE", "200")
    )
    # Proxy provider selection. Maps to a ProxyProvider implementation registered
    # in app/utils/proxy/registry.py. Add new vendors there and switch via this var.
    PROXY_PROVIDER = os.getenv("PROXY_PROVIDER", "anonymous_proxies")
--- a/surfsense_backend/app/config/global_llm_config.example.yaml
+++ b/surfsense_backend/app/config/global_llm_config.example.yaml
@ -1,362 +1,236 @@
 # Global LLM Configuration
 #
 # SETUP INSTRUCTIONS:
-# 1. For production: Copy this file to global_llm_config.yaml and add your real API keys
+# 1. Copy this file to global_llm_config.yaml.
-# 2. For testing: The system will use this example file automatically if global_llm_config.yaml doesn't exist
+# 2. Replace placeholder credentials, endpoints, deployment names, and pricing
 #    with values from your own provider accounts.
 #
-# NOTE: The example API keys below are placeholders and won't work.
+# This file is intentionally safe to commit. Do not put real API keys in this
-# Replace them with your actual API keys to enable global configurations.
+# example file.
 #
-# These configurations will be available to all users as a convenient option
+# These YAML entries are materialized at startup as server-owned GLOBAL
-# Users can choose to use these global configs or add their own
+# connections and models:
 #
-# AUTO MODE (Recommended):
+#   global_llm_configs              -> GLOBAL chat models
-# - Auto mode (ID: 0) uses LiteLLM Router to automatically load balance across all global configs
+#   global_image_generation_configs -> GLOBAL image generation models
 # - Auto mode helps avoid rate limits by distributing requests across multiple providers
 # - New users are automatically assigned Auto mode by default
 # - Configure router_settings below to customize the load balancing behavior
 #
-# Structure matches NewLLMConfig:
+# Do not add global_connections or global_models sections here. They are
-# - Model configuration (provider, model_name, api_key, etc.)
+# runtime-derived metadata exposed through the model-connections APIs.
-# - Prompt configuration (system_instructions, citations_enabled)
+#
 # Static config shape:
 # - Connection fields: provider, api_key, api_base, api_version
 # - Model fields: model_name, billing_tier, rpm/tpm, capabilities, litellm_params
 # - Public no-login SEO metadata: seo_title, seo_description
 # - Prompt defaults: system_instructions, use_default_system_instructions,
 #   citations_enabled
 #
 # Provider notes:
 # - Use the canonical provider field.
 # - For Azure, use the bare deployment name in model_name, for example
 #   model_name: "gpt-5.1". The resolver prefixes the LiteLLM model string from
 #   provider: "azure".
 #
 # GLOBAL ID namespace:
 # - ID 0 is reserved for Auto mode.
 # - Negative IDs are server-owned GLOBAL models.
 # - Positive IDs are user/BYOK database models.
 # - Keep static IDs unique across chat and image generation.
 # - Suggested static ranges: chat -1..-999, image -2001..-2999.
 # - Vision is not a separate config/table. Chat models that accept images use
 #   supports_image_input: true.
 #
 # COST-BASED PREMIUM CREDITS:
-# Each premium config bills the user's USD-credit balance based on the
+# Each premium model bills the user's USD-credit balance based on provider cost
-# actual provider cost reported by LiteLLM. For models LiteLLM already
+# reported by LiteLLM. For custom Azure deployments or any model LiteLLM does
-# knows (most OpenAI/Anthropic/etc. names) you don't need to do anything.
+# not know, declare per-token costs inline:
 # For custom Azure deployment names (e.g. an in-house "gpt-5.4" deployment)
 # or any model LiteLLM doesn't have in its built-in pricing table, declare
 # per-token costs inline so they bill correctly:
 #
 #   litellm_params:
-#     base_model: "my-custom-azure-deploy"
+#     base_model: "my-custom-deployment"
-#     # USD per token; e.g. 0.000003 == $3.00 per million input tokens
+#     # USD per token; 0.00000125 == $1.25 per million input tokens.
-#     input_cost_per_token: 0.000003
+#     input_cost_per_token: 0.00000125
-#     output_cost_per_token: 0.000015
+#     output_cost_per_token: 0.00001
 #
-# OpenRouter dynamic models pull pricing automatically from OpenRouter's
+# OpenRouter dynamic chat models pull pricing automatically from OpenRouter's
-# API — no inline declaration needed. Models without resolvable pricing
+# API. Models without resolvable pricing debit $0 and log a warning.
 # The $0 debit is applied to the user's balance and the log entry is a WARNING.
-# Router Settings for Auto Mode
+# =============================================================================
-# These settings control how the LiteLLM Router distributes requests across models
+# Chat Auto Mode Router Settings
 # =============================================================================
 # These settings control how the LiteLLM Router distributes Auto-mode requests
 # across curated router-eligible GLOBAL chat deployments.
 router_settings:
  # Routing strategy options:
-  # - "usage-based-routing": Routes to deployment with lowest current usage (recommended for rate limits)
+  # - "usage-based-routing": Routes to deployment with lowest current usage.
-  # - "simple-shuffle": Random distribution with optional RPM/TPM weighting
+  # - "simple-shuffle": Random distribution with optional RPM/TPM weighting.
-  # - "least-busy": Routes to least busy deployment
+  # - "least-busy": Routes to least busy deployment.
-  # - "latency-based-routing": Routes based on response latency
+  # - "latency-based-routing": Routes based on response latency.
  routing_strategy: "usage-based-routing"
  # Number of retries before failing
  num_retries: 3
  # Number of failures allowed before cooling down a deployment
  allowed_fails: 3
  # Cooldown time in seconds after allowed_fails is exceeded
  cooldown_time: 60
  # Optional fallback map:
  # fallbacks:
  #   - {"azure/gpt-5.1": ["azure/gpt-5.4-mini"]}
-  # Fallback models (optional) - when primary fails, try these
+# =============================================================================
-  # Format: [{"primary_model": ["fallback1", "fallback2"]}]
+# Static GLOBAL Chat Models
-  # fallbacks: []
+# =============================================================================
 global_llm_configs:
-  # Example: OpenAI GPT-4 Turbo with citations enabled
+  # Premium Azure chat model with image input support and explicit custom
  # pricing. This is the current shape to use for hosted GPT 5.x deployments.
  - id: -1
-    name: "Global GPT-4 Turbo"
+    name: "Azure GPT 5.1"
-    description: "OpenAI's GPT-4 Turbo with default prompts and citations"
+    billing_tier: "premium"
-    billing_tier: "free"
+    anonymous_enabled: false
-    anonymous_enabled: true
+    seo_enabled: false
-    seo_enabled: true
+    seo_slug: "azure-gpt-5-1"
    seo_slug: "gpt-4-turbo"
    quota_reserve_tokens: 4000
-    provider: "OPENAI"
+    provider: "azure"
-    model_name: "gpt-4-turbo-preview"
+    model_name: "gpt-5.1"
-    api_key: "sk-your-openai-api-key-here"
+    supports_image_input: true
-    api_base: ""
+    supports_tools: true
-    # Rate limits for load balancing (requests/tokens per minute)
+    max_input_tokens: 400000
-    rpm: 500 # Requests per minute
+    api_key: "your-azure-api-key-here"
-    tpm: 100000 # Tokens per minute
+    api_base: "https://your-resource.openai.azure.com"
    # api_version is optional. Include it if your Azure deployment requires a
    # specific API version.
    # api_version: "2025-04-01-preview"
    rpm: 47500
    tpm: 14750000
    litellm_params:
-      temperature: 0.7
+      max_tokens: 16384
-      max_tokens: 4000
+      base_model: "gpt-5.1"
-    # Prompt Configuration
+      input_cost_per_token: 0.00000125
-    system_instructions: "" # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
+      output_cost_per_token: 0.00001
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true
-  # Example: Anthropic Claude 3 Opus
+  # Larger premium chat model. If your provider prices long-context traffic
  # differently, choose a conservative flat price or document the limitation
  # next to the inline pricing.
  - id: -2
-    name: "Global Claude 3 Opus"
+    name: "Azure GPT 5.4"
-    description: "Anthropic's most capable model with citations"
+    billing_tier: "premium"
-    billing_tier: "free"
+    anonymous_enabled: false
-    anonymous_enabled: true
+    seo_enabled: false
-    seo_enabled: true
+    seo_slug: "azure-gpt-5-4"
    seo_slug: "claude-3-opus"
    quota_reserve_tokens: 4000
-    provider: "ANTHROPIC"
+    provider: "azure"
-    model_name: "claude-3-opus-20240229"
+    model_name: "gpt-5.4"
-    api_key: "sk-ant-your-anthropic-api-key-here"
+    supports_image_input: true
-    api_base: ""
+    supports_tools: true
-    rpm: 1000
+    max_input_tokens: 400000
-    tpm: 100000
+    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    rpm: 150000
    tpm: 15000000
    litellm_params:
-      temperature: 0.7
+      max_tokens: 16384
-      max_tokens: 4000
+      base_model: "gpt-5.4"
      input_cost_per_token: 0.0000025
      output_cost_per_token: 0.000015
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true
-  # Example: Fast model - GPT-3.5 Turbo (citations disabled for speed)
+  # Free/no-login hosted model. Free models are visible to users when
  # anonymous_enabled/seo_enabled are true but do not debit premium credits.
  - id: -3
-    name: "Global GPT-3.5 Turbo (Fast)"
+    name: "Azure GPT 5.4 Mini"
    description: "Fast responses without citations for quick queries"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
-    seo_slug: "gpt-3.5-turbo-fast"
+    seo_slug: "gpt-5-4-mini-no-login"
-    quota_reserve_tokens: 2000
+    seo_title: "Free GPT 5.4 Mini Chat"
-    provider: "OPENAI"
+    seo_description: "Chat with a hosted GPT 5.4 Mini model without signing in."
    model_name: "gpt-3.5-turbo"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 3500 # GPT-3.5 has higher rate limits
    tpm: 200000
    litellm_params:
      temperature: 0.5
      max_tokens: 2000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: false # Disabled for faster responses
  # Example: Chinese LLM - DeepSeek with custom instructions
  - id: -4
    name: "Global DeepSeek Chat (Chinese)"
    description: "DeepSeek optimized for Chinese language responses"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "deepseek-chat-chinese"
    quota_reserve_tokens: 4000
-    provider: "DEEPSEEK"
+    provider: "azure"
-    model_name: "deepseek-chat"
+    model_name: "gpt-5.4-mini"
-    api_key: "your-deepseek-api-key-here"
+    supports_image_input: false
-    api_base: "https://api.deepseek.com/v1"
+    supports_tools: true
-    rpm: 60
+    max_input_tokens: 128000
    tpm: 100000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
    # Custom system instructions for Chinese responses
    system_instructions: |
      <system_instruction>
      You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
      Today's date (UTC): {resolved_today}
      IMPORTANT: Please respond in Chinese (简体中文) unless the user specifically requests another language.
      </system_instruction>
    use_default_system_instructions: false
    citations_enabled: true
  # Example: Azure OpenAI GPT-4o
  # IMPORTANT: For Azure deployments, always include 'base_model' in litellm_params
  # to enable accurate token counting, cost tracking, and max token limits
  - id: -5
    name: "Global Azure GPT-4o"
    description: "Azure OpenAI GPT-4o deployment"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "azure-gpt-4o"
    quota_reserve_tokens: 4000
    provider: "AZURE"
    # model_name format for Azure: azure/<your-deployment-name>
    model_name: "azure/gpt-4o-deployment"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
-    api_version: "2024-02-15-preview" # Azure API version
+    rpm: 15000
-    rpm: 1000
+    tpm: 15000000
    tpm: 150000
    litellm_params:
-      temperature: 0.7
+      max_tokens: 16384
-      max_tokens: 4000
+      base_model: "gpt-5.4-mini"
      # REQUIRED for Azure: Specify the underlying OpenAI model
      # This fixes "Could not identify azure model" warnings
      # Common base_model values: gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini, gpt-3.5-turbo
      base_model: "gpt-4o"
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true
-  # Example: Azure OpenAI GPT-4 Turbo
+  # Planner LLM. This is operator-only and is not shown in the user-facing
-  - id: -6
+  # model selector. Only one global_llm_configs entry should set is_planner.
    name: "Global Azure GPT-4 Turbo"
    description: "Azure OpenAI GPT-4 Turbo deployment"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "azure-gpt-4-turbo"
    quota_reserve_tokens: 4000
    provider: "AZURE"
    model_name: "azure/gpt-4-turbo-deployment"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    api_version: "2024-02-15-preview"
    rpm: 500
    tpm: 100000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
      base_model: "gpt-4-turbo" # Maps to gpt-4-turbo-preview
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true
  # Example: Groq - Fast inference
  - id: -7
    name: "Global Groq Llama 3"
    description: "Ultra-fast Llama 3 70B via Groq"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "groq-llama-3"
    quota_reserve_tokens: 8000
    provider: "GROQ"
    model_name: "llama3-70b-8192"
    api_key: "your-groq-api-key-here"
    api_base: ""
    rpm: 30 # Groq has lower rate limits on free tier
    tpm: 14400
    litellm_params:
      temperature: 0.7
      max_tokens: 8000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true
  # Example: MiniMax M3 - High-performance with 512K context window
  - id: -8
    name: "Global MiniMax M3"
    description: "MiniMax M3 with 512K context window and competitive pricing"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "minimax-m3"
    quota_reserve_tokens: 4000
    provider: "MINIMAX"
    model_name: "MiniMax-M3"
    api_key: "your-minimax-api-key-here"
    api_base: "https://api.minimax.io/v1"
    rpm: 60
    tpm: 100000
    litellm_params:
      temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0], cannot be 0
      max_tokens: 4000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true
  # Example: Planner LLM - small, fast model used for internal utility tasks
  #
  # The PLANNER role handles short, structured internal calls (KB query
  # rewriting, date extraction, recency classification, etc.) that don't
  # need frontier-tier capability. Pointing the planner at a cheap+fast
  # model (gpt-4o-mini, Claude Haiku, Azure gpt-5.x-nano, Groq Llama, ...)
  # typically saves 500ms-1.5s per turn vs. routing those same internal
  # calls through the user's chat model.
  #
  # Activation:
  #   - Mark EXACTLY ONE global config with ``is_planner: true``.
  #   - If multiple are marked, the first one wins and a WARNING is logged.
  #   - If none is marked, every internal call falls back to the user's
  #     chat LLM (same behavior as before this flag existed).
  #
  # This config is operator-only — it is NOT exposed in the user-facing
  # model selector, never billed against premium quota, and the
  # billing_tier / anonymous_enabled fields below are ignored.
  - id: -9
-    name: "Global Planner (GPT-4o mini)"
+    name: "Azure GPT 5.x Nano Planner"
    description: "Internal-only planner LLM for query rewriting and classification"
    is_planner: true
    billing_tier: "free"
    anonymous_enabled: false
    seo_enabled: false
    quota_reserve_tokens: 1000
-    provider: "OPENAI"
+    provider: "azure"
-    model_name: "gpt-4o-mini"
+    model_name: "gpt-5.4-nano"
-    api_key: "sk-your-openai-api-key-here"
+    supports_image_input: false
-    api_base: ""
+    supports_tools: false
-    rpm: 3500
+    router_pool_eligible: false
-    tpm: 200000
+    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    rpm: 20000
    tpm: 4000000
    litellm_params:
      temperature: 0
      max_tokens: 1000
      base_model: "gpt-5.4-nano"
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: false
 # =============================================================================
-# OpenRouter Integration
+# OpenRouter Dynamic Model Integration
 # =============================================================================
-# When enabled, dynamically fetches ALL available models from the OpenRouter API
+# When enabled, SurfSense fetches the OpenRouter catalog at startup and injects
-# and injects them as global configs. This gives premium users access to any model
+# supported models as GLOBAL chat and optionally image-generation models.
-# on OpenRouter (Claude, Gemini, Llama, Mistral, etc.) via their premium token quota,
+# Tier is derived per model from OpenRouter data:
-# while free-tier OpenRouter models show up with a green Free badge and do NOT
+# - model id ends with ":free" -> billing_tier=free
-# consume premium quota.
+# - prompt and completion pricing are zero -> billing_tier=free
-# Models are fetched at startup and refreshed periodically in the background.
+# - otherwise -> billing_tier=premium
-# All calls go through LiteLLM with the openrouter/ prefix.
+#
 # Do not use deprecated openrouter_integration.billing_tier or
 # openrouter_integration.anonymous_enabled. Use the tier-specific anonymous
 # switches below.
 openrouter_integration:
  enabled: false
  api_key: "sk-or-your-openrouter-api-key"
  # Tier is derived PER MODEL from OpenRouter's own API signals:
  #   - id ends with ":free"                         -> billing_tier=free
  #   - pricing.prompt AND pricing.completion == "0" -> billing_tier=free
  #   - otherwise                                    -> billing_tier=premium
  # No global billing_tier knob is honored; any legacy value emits a startup warning.
  # Anonymous access is split by tier so operators can expose only free
  # models to no-login users without leaking paid inference.
  anonymous_enabled_paid: false
  anonymous_enabled_free: false
  seo_enabled: false
  # quota_reserve_tokens: tokens reserved per call for quota enforcement
  quota_reserve_tokens: 4000
-  # id_offset: base negative ID for dynamically generated configs.
+
-  # Model IDs are derived deterministically via BLAKE2b so they survive
+  # Base negative ID namespace for dynamic chat models. IDs are derived
-  # catalogue churn. Must not overlap with your static global_llm_configs IDs.
+  # deterministically so they survive catalog churn. Do not overlap static IDs.
  id_offset: -10000
-  # refresh_interval_hours: how often to re-fetch models from OpenRouter (0 = startup only)
+
  # Separate base negative ID namespace for dynamic image-generation models.
  image_id_offset: -20000
  # How often to refresh the OpenRouter catalog. 0 means startup only.
  refresh_interval_hours: 24
-  # Rate limits for PAID OpenRouter models. These are used by LiteLLM Router
+  # Paid OpenRouter models may join curated router pools when eligible.
  # The rpm/tpm values below are used by LiteLLM Router for per-deployment
  # accounting when OR premium models participate in the shared sub-agent
  # "auto" pool. They do NOT cap OpenRouter itself — your real account limits
  # live at https://openrouter.ai/settings/limits.
  rpm: 200
  tpm: 1000000
-  # Rate limits for FREE OpenRouter models. Informational only: free OR
+  # Free OpenRouter models are available for user-facing selection/pinning but
-  # models are intentionally kept OUT of the LiteLLM Router pool, because
+  # should be treated as a shared-account bucket, not normal router capacity.
  # OpenRouter enforces free-tier limits globally per account (~20 RPM +
  # 50-1000 daily requests across every ":free" model combined) —
  # per-deployment router accounting can't represent a shared bucket
  # correctly. Free OR models stay fully available in the model selector
  # and for user-facing Auto thread pinning.
  free_rpm: 20
  free_tpm: 100000
-  # Image generation + vision LLM emission are OPT-IN. OpenRouter's catalogue
+  # Image generation is opt-in to avoid injecting a large image catalog during
-  # contains hundreds of image- and vision-capable models; turning these on
+  # upgrades. Vision-capable chat models are represented with
-  # injects them into the global Image-Generation / Vision-LLM model
+  # supports_image_input: true.
  # When enabled, these models appear in the model selectors alongside any
  # static configs. Tier (free/premium) is derived
  # per model the same way it is for chat (`:free` suffix or zero pricing).
  # When a user picks a premium image/vision model the call debits the
  # shared $5 USD-cost-based premium credit pool — so leaving these off
  # avoids surprise quota burn on existing deployments. Default: false.
  image_generation_enabled: false
  vision_enabled: false
@ -367,191 +241,80 @@ openrouter_integration:
  citations_enabled: true
 # =============================================================================
-# Image Generation Configuration
+# Image Generation Auto Mode Router Settings
 # =============================================================================
 # These configurations power the image generation feature using litellm.aimage_generation().
 # Supported providers: OpenAI, Azure, Google AI Studio, Vertex AI, AWS Bedrock,
 # Recraft, OpenRouter, Xinference, Nscale
 #
 # Auto mode (ID 0) uses LiteLLM Router for load balancing across all image gen configs.
 # Router Settings for Image Generation Auto Mode
 image_generation_router_settings:
  routing_strategy: "usage-based-routing"
  num_retries: 3
  allowed_fails: 3
  cooldown_time: 60
 # =============================================================================
 # Static GLOBAL Image Generation Models
 # =============================================================================
 global_image_generation_configs:
-  # Example: OpenAI DALL-E 3
+  - id: -2001
-  - id: -1
+    name: "Azure GPT Image 1.5"
-    name: "Global DALL-E 3"
+    billing_tier: "premium"
-    description: "OpenAI's DALL-E 3 for high-quality image generation"
+    provider: "azure"
-    provider: "OPENAI"
+    model_name: "gpt-image-1.5"
    model_name: "dall-e-3"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 50 # Requests per minute (image gen is rate-limited by RPM, not tokens)
    litellm_params: {}
  # Example: OpenAI GPT Image 1
  - id: -2
    name: "Global GPT Image 1"
    description: "OpenAI's GPT Image 1 model"
    provider: "OPENAI"
    model_name: "gpt-image-1"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 50
    litellm_params: {}
  # Example: Azure OpenAI DALL-E 3
  - id: -3
    name: "Global Azure DALL-E 3"
    description: "Azure-hosted DALL-E 3 deployment"
    provider: "AZURE_OPENAI"
    model_name: "azure/dall-e-3-deployment"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
-    api_version: "2024-02-15-preview"
+    # api_version: "2025-04-01-preview"
-    rpm: 50
+    rpm: 60
    litellm_params:
-      base_model: "dall-e-3"
+      base_model: "gpt-image-1.5"
-  # Example: OpenRouter Gemini Image Generation
+  - id: -2002
-  # - id: -4
+    name: "Azure GPT Image 1 Mini"
-  #   name: "Global Gemini Image Gen"
+    billing_tier: "free"
-  #   description: "Google Gemini image generation via OpenRouter"
+    provider: "azure"
-  #   provider: "OPENROUTER"
+    model_name: "gpt-image-1-mini"
-  #   model_name: "google/gemini-2.5-flash-image"
+    api_key: "your-azure-api-key-here"
-  #   api_key: "your-openrouter-api-key-here"
+    api_base: "https://your-resource.openai.azure.com"
-  #   api_base: ""
+    # api_version: "2025-04-01-preview"
-  #   rpm: 30
+    rpm: 120
-  #   litellm_params: {}
+    litellm_params:
      base_model: "gpt-image-1-mini"
 # =============================================================================
-# Vision LLM Configuration
+# Field Notes
 # =============================================================================
-# These configurations power the vision autocomplete feature (screenshot analysis).
+# Common chat/image fields:
-# Only vision-capable models should be used here (e.g. GPT-4o, Gemini Pro, Claude 3).
+# - provider: Canonical provider adapter name. Example: azure, openai,
-# Supported providers: OpenAI, Anthropic, Google, Azure OpenAI, Vertex AI, Bedrock,
+#   anthropic, openrouter, groq, bedrock.
-# xAI, OpenRouter, Ollama, Groq, Together AI, Fireworks AI, DeepSeek, Mistral, Custom
+# - model_name: Provider model or deployment id. For Azure, use the bare
 #   deployment name. The resolver prefixes LiteLLM model strings from provider.
 # - api_base: Provider endpoint/root URL. For OpenAI-compatible providers, the
 #   resolver adds /v1 when needed.
 # - api_version: Optional provider-specific API version, stored on the
 #   materialized connection extra metadata.
 # - litellm_params: Passed to LiteLLM when invoking the model. Also used for
 #   base_model and inline pricing registration.
 #
-# Auto mode (ID 0) uses LiteLLM Router for load balancing across all vision configs.
+# Chat model fields:
-
+# - supports_image_input: true when the chat model can consume image inputs.
-# Router Settings for Vision LLM Auto Mode
+# - supports_tools: true when the model can use tools/function calling.
-vision_llm_router_settings:
+# - max_input_tokens: Optional UI/catalog metadata for context size.
-  routing_strategy: "usage-based-routing"
+# - router_pool_eligible: false keeps a model out of shared router pools while
-  num_retries: 3
+#   still allowing direct selection/pinning.
-  allowed_fails: 3
+# - is_planner: true marks the internal-only planner model. Only one config
-  cooldown_time: 60
+#   should set this flag.
 global_vision_llm_configs:
  # Example: OpenAI GPT-4o (recommended for vision)
  - id: -1
    name: "Global GPT-4o Vision"
    description: "OpenAI's GPT-4o with strong vision capabilities"
    provider: "OPENAI"
    model_name: "gpt-4o"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 500
    tpm: 100000
    litellm_params:
      temperature: 0.3
      max_tokens: 1000
  # Example: Google Gemini 2.0 Flash
  - id: -2
    name: "Global Gemini 2.0 Flash"
    description: "Google's fast vision model with large context"
    provider: "GOOGLE"
    model_name: "gemini-2.0-flash"
    api_key: "your-google-ai-api-key-here"
    api_base: ""
    rpm: 1000
    tpm: 200000
    litellm_params:
      temperature: 0.3
      max_tokens: 1000
  # Example: Anthropic Claude 3.5 Sonnet
  - id: -3
    name: "Global Claude 3.5 Sonnet Vision"
    description: "Anthropic's Claude 3.5 Sonnet with vision support"
    provider: "ANTHROPIC"
    model_name: "claude-3-5-sonnet-20241022"
    api_key: "sk-ant-your-anthropic-api-key-here"
    api_base: ""
    rpm: 1000
    tpm: 100000
    litellm_params:
      temperature: 0.3
      max_tokens: 1000
  # Example: Azure OpenAI GPT-4o
  # - id: -4
  #   name: "Global Azure GPT-4o Vision"
  #   description: "Azure-hosted GPT-4o for vision analysis"
  #   provider: "AZURE_OPENAI"
  #   model_name: "azure/gpt-4o-deployment"
  #   api_key: "your-azure-api-key-here"
  #   api_base: "https://your-resource.openai.azure.com"
  #   api_version: "2024-02-15-preview"
  #   rpm: 500
  #   tpm: 100000
  #   litellm_params:
  #     temperature: 0.3
  #     max_tokens: 1000
  #     base_model: "gpt-4o"
 # Notes:
 # - ID 0 is reserved for "Auto" mode - uses LiteLLM Router for load balancing
 # - Use negative IDs to distinguish global configs from user configs (NewLLMConfig in DB)
 # - IDs should be unique and sequential (e.g., -1, -2, -3, etc.)
 # - The 'api_key' field will not be exposed to users via API
 # - system_instructions: Custom prompt or empty string to use defaults
 # - use_default_system_instructions: true = use SURFSENSE_SYSTEM_INSTRUCTIONS when system_instructions is empty
 # - citations_enabled: true = include citation instructions, false = include anti-citation instructions
 # - All standard LiteLLM providers are supported
 # - rpm/tpm: Optional rate limits for load balancing (requests/tokens per minute)
 #   These help the router distribute load evenly and avoid rate limit errors
 #
 # Catalog and access fields:
 # - billing_tier: "free" or "premium".
 # - anonymous_enabled: Whether the model appears in the public no-login catalog.
 # - seo_enabled: Whether a /free/<seo_slug> landing page is generated.
 # - seo_slug: Stable URL slug for SEO pages. Keep unique and do not change once
 #   public.
 # - seo_title / seo_description: Optional SEO metadata overrides.
 # - quota_reserve_tokens: Tokens reserved before each chat LLM call.
 # - rpm / tpm: Optional rate limits for router accounting and load balancing.
 #
-# IMAGE GENERATION NOTES:
+# Image generation notes:
-# - Image generation configs use the same ID scheme as LLM configs (negative for global)
+# - Image-generation configs use the same GLOBAL ID namespace as chat models.
-# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure),
+# - Only RPM is relevant for most image-generation APIs.
-#   bedrock/* (AWS), vertex_ai/* (Google), recraft/* (Recraft), openrouter/* (OpenRouter)
+# - The runtime uses litellm.aimage_generation().
-# - The router uses litellm.aimage_generation() for async image generation
+# - Image billing currently uses billing_tier and model catalog metadata. Keep
-# - Only RPM (requests per minute) is relevant for image generation rate limiting.
+#   quota reserve tuning in code/catalog unless the materializer copies a YAML
-#   TPM (tokens per minute) does not apply since image APIs are billed/rate-limited per request, not per token.
+#   key for image quota reservation.
 #
 # VISION LLM NOTES:
 # - Vision configs use the same ID scheme (negative for global, positive for user DB)
 # - Only use vision-capable models (GPT-4o, Gemini, Claude 3, etc.)
 # - Lower temperature (0.3) is recommended for accurate screenshot analysis
 # - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions
 #
 # PLANNER LLM NOTES:
 # - is_planner: true marks a config as the internal-only planner LLM (small,
 #   fast model used for KB query rewriting, date extraction, recency
 #   classification, etc.). Only one config may carry this flag — if
 #   multiple do, the first one wins and a startup WARNING is logged.
 # - When no config is marked is_planner, every internal utility call falls
 #   back to the user's chat LLM (the historical behavior).
 # - Planner configs are NOT shown in the user-facing model selector and
 #   are NOT billed against the user's premium quota. Their billing_tier,
 #   anonymous_enabled, seo_* fields are ignored.
 # - Recommended models: gpt-4o-mini, claude-3-5-haiku, gemini-1.5-flash,
 #   azure gpt-5.x-nano, groq llama3-8b — anything <200ms p50 on a 1-2k
 #   prompt. Frontier models here defeat the purpose of the flag.
 #
 # TOKEN QUOTA & ANONYMOUS ACCESS NOTES:
 # - billing_tier: "free" or "premium". Controls whether registered users need premium token quota.
 # - anonymous_enabled: true/false. Whether the model appears in the public no-login catalog.
 # - seo_enabled: true/false. Whether a /free/<seo_slug> landing page is generated.
 # - seo_slug: Stable URL slug for SEO pages. Must be unique. Do NOT change once public.
 # - seo_title: Optional HTML title tag override for the model's /free/<slug> page.
 # - seo_description: Optional meta description override for the model's /free/<slug> page.
 # - quota_reserve_tokens: Tokens reserved before each LLM call for quota enforcement.
 #   Independent of litellm_params.max_tokens. Used by the token quota service.
--- a/surfsense_backend/app/connectors/dropbox/content_extractor.py
+++ b/surfsense_backend/app/connectors/dropbox/content_extractor.py
@ -90,11 +90,12 @@ async def download_and_extract_content(
        if error:
            return None, metadata, error
        from app.etl_pipeline.cache import extract_with_cache
        from app.etl_pipeline.etl_document import EtlRequest
        from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
-        result = await EtlPipelineService(vision_llm=vision_llm).extract(
+        result = await extract_with_cache(
-            EtlRequest(file_path=temp_file_path, filename=file_name)
+            EtlRequest(file_path=temp_file_path, filename=file_name),
            vision_llm=vision_llm,
        )
        markdown = result.markdown_content
        return markdown, metadata, None
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@ -122,12 +122,13 @@ async def download_and_extract_content(
 async def _parse_file_to_markdown(
    file_path: str, filename: str, *, vision_llm=None
 ) -> str:
-    """Parse a local file to markdown using the unified ETL pipeline."""
+    """Parse a local file to markdown via the cache-aware ETL pipeline."""
    from app.etl_pipeline.cache import extract_with_cache
    from app.etl_pipeline.etl_document import EtlRequest
    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
-    result = await EtlPipelineService(vision_llm=vision_llm).extract(
+    result = await extract_with_cache(
-        EtlRequest(file_path=file_path, filename=filename)
+        EtlRequest(file_path=file_path, filename=filename),
        vision_llm=vision_llm,
    )
    return result.markdown_content
--- a/surfsense_backend/app/connectors/onedrive/content_extractor.py
+++ b/surfsense_backend/app/connectors/onedrive/content_extractor.py
@ -84,11 +84,12 @@ async def download_and_extract_content(
 async def _parse_file_to_markdown(
    file_path: str, filename: str, *, vision_llm=None
 ) -> str:
-    """Parse a local file to markdown using the unified ETL pipeline."""
+    """Parse a local file to markdown via the cache-aware ETL pipeline."""
    from app.etl_pipeline.cache import extract_with_cache
    from app.etl_pipeline.etl_document import EtlRequest
    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
-    result = await EtlPipelineService(vision_llm=vision_llm).extract(
+    result = await extract_with_cache(
-        EtlRequest(file_path=file_path, filename=filename)
+        EtlRequest(file_path=file_path, filename=filename),
        vision_llm=vision_llm,
    )
    return result.markdown_content
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@ -201,79 +201,15 @@ class DocumentStatus:
        return None
-class LiteLLMProvider(StrEnum):
+class ConnectionScope(StrEnum):
-    """
+    GLOBAL = "GLOBAL"
-    Enum for LLM providers supported by LiteLLM.
+    SEARCH_SPACE = "SEARCH_SPACE"
-    """
+    USER = "USER"
    OPENAI = "OPENAI"
    ANTHROPIC = "ANTHROPIC"
    GOOGLE = "GOOGLE"
    AZURE_OPENAI = "AZURE_OPENAI"
    BEDROCK = "BEDROCK"
    VERTEX_AI = "VERTEX_AI"
    GROQ = "GROQ"
    COHERE = "COHERE"
    MISTRAL = "MISTRAL"
    DEEPSEEK = "DEEPSEEK"
    XAI = "XAI"
    OPENROUTER = "OPENROUTER"
    TOGETHER_AI = "TOGETHER_AI"
    FIREWORKS_AI = "FIREWORKS_AI"
    REPLICATE = "REPLICATE"
    PERPLEXITY = "PERPLEXITY"
    OLLAMA = "OLLAMA"
    ALIBABA_QWEN = "ALIBABA_QWEN"
    MOONSHOT = "MOONSHOT"
    ZHIPU = "ZHIPU"
    ANYSCALE = "ANYSCALE"
    DEEPINFRA = "DEEPINFRA"
    CEREBRAS = "CEREBRAS"
    SAMBANOVA = "SAMBANOVA"
    AI21 = "AI21"
    CLOUDFLARE = "CLOUDFLARE"
    DATABRICKS = "DATABRICKS"
    COMETAPI = "COMETAPI"
    HUGGINGFACE = "HUGGINGFACE"
    GITHUB_MODELS = "GITHUB_MODELS"
    MINIMAX = "MINIMAX"
    CUSTOM = "CUSTOM"
-class ImageGenProvider(StrEnum):
+class ModelSource(StrEnum):
-    """
+    DISCOVERED = "DISCOVERED"
-    Enum for image generation providers supported by LiteLLM.
+    MANUAL = "MANUAL"
    This is a subset of LLM providers — only those that support image generation.
    See: https://docs.litellm.ai/docs/image_generation#supported-providers
    """
    OPENAI = "OPENAI"
    AZURE_OPENAI = "AZURE_OPENAI"
    GOOGLE = "GOOGLE"  # Google AI Studio
    VERTEX_AI = "VERTEX_AI"
    BEDROCK = "BEDROCK"  # AWS Bedrock
    RECRAFT = "RECRAFT"
    OPENROUTER = "OPENROUTER"
    XINFERENCE = "XINFERENCE"
    NSCALE = "NSCALE"
 class VisionProvider(StrEnum):
    OPENAI = "OPENAI"
    ANTHROPIC = "ANTHROPIC"
    GOOGLE = "GOOGLE"
    AZURE_OPENAI = "AZURE_OPENAI"
    VERTEX_AI = "VERTEX_AI"
    BEDROCK = "BEDROCK"
    XAI = "XAI"
    OPENROUTER = "OPENROUTER"
    OLLAMA = "OLLAMA"
    GROQ = "GROQ"
    TOGETHER_AI = "TOGETHER_AI"
    FIREWORKS_AI = "FIREWORKS_AI"
    DEEPSEEK = "DEEPSEEK"
    MISTRAL = "MISTRAL"
    CUSTOM = "CUSTOM"
 class LogLevel(StrEnum):
@ -702,11 +638,11 @@ class NewChatThread(BaseModel, TimestampMixin):
        default=False,
        server_default="false",
    )
-    # Auto (Fastest) model pin for this thread: concrete resolved global LLM
+    # Auto model pin for this thread: concrete resolved global LLM
    # config id. NULL means no pin; Auto will resolve on the next turn.
    # Single-writer invariant: only app.services.auto_model_pin_service sets
    # or clears this column (plus bulk clears when a search space's
-    # agent_llm_id changes). Unindexed: all reads are by primary key.
+    # chat_model_id changes). Unindexed: all reads are by primary key.
    pinned_llm_config_id = Column(Integer, nullable=True)
    # Surface metadata for first-party SurfSense and external chat threads.
@ -1487,7 +1423,10 @@ class Document(BaseModel, TimestampMixin):
    created_by = relationship("User", back_populates="documents")
    connector = relationship("SearchSourceConnector", back_populates="documents")
    chunks = relationship(
-        "Chunk", back_populates="document", cascade="all, delete-orphan"
+        "Chunk",
        back_populates="document",
        cascade="all, delete-orphan",
        order_by="Chunk.position",
    )
    # Original upload + future derived artifacts (redacted, filled-form).
    # Model lives in app.file_storage.persistence to keep that feature cohesive.
@ -1523,6 +1462,9 @@ class Chunk(BaseModel, TimestampMixin):
    content = Column(Text, nullable=False)
    embedding = Column(Vector(config.embedding_model_instance.dimension))
    # Explicit document order; ids don't follow it since incremental
    # re-indexing keeps unchanged rows across edits.
    position = Column(Integer, nullable=False, server_default="0", index=True)
    document_id = Column(
        Integer,
@ -1604,73 +1546,80 @@ class Report(BaseModel, TimestampMixin):
    thread = relationship("NewChatThread")
-class ImageGenerationConfig(BaseModel, TimestampMixin):
+class Connection(BaseModel, TimestampMixin):
-    """
+    __tablename__ = "connections"
    Dedicated configuration table for image generation models.
-    Separate from NewLLMConfig because image generation models don't need
+    provider = Column(String(100), nullable=False, index=True)
-    system_instructions, citations_enabled, or use_default_system_instructions.
+    base_url = Column(String(500), nullable=True)
-    They only need provider credentials and model parameters.
+    api_key = Column(String, nullable=True)
-    """
+    extra = Column(JSONB, nullable=False, default=dict, server_default="{}")
-
+    scope = Column(SQLAlchemyEnum(ConnectionScope), nullable=False, index=True)
-    __tablename__ = "image_generation_configs"
+    enabled = Column(Boolean, nullable=False, default=True, server_default="true")
    name = Column(String(100), nullable=False, index=True)
    description = Column(String(500), nullable=True)
    # Provider & model (uses ImageGenProvider, NOT LiteLLMProvider)
    provider = Column(SQLAlchemyEnum(ImageGenProvider), nullable=False)
    custom_provider = Column(String(100), nullable=True)
    model_name = Column(String(100), nullable=False)
    # Credentials
    api_key = Column(String, nullable=False)
    api_base = Column(String(500), nullable=True)
    api_version = Column(String(50), nullable=True)  # Azure-specific
    # Additional litellm parameters
    litellm_params = Column(JSON, nullable=True, default={})
    # Relationships
    search_space_id = Column(
        Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
    )
    search_space = relationship(
        "SearchSpace", back_populates="image_generation_configs"
    )
    # User who created this config
    user_id = Column(
        UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False
    )
    user = relationship("User", back_populates="image_generation_configs")
 class VisionLLMConfig(BaseModel, TimestampMixin):
    __tablename__ = "vision_llm_configs"
    name = Column(String(100), nullable=False, index=True)
    description = Column(String(500), nullable=True)
    provider = Column(SQLAlchemyEnum(VisionProvider), nullable=False)
    custom_provider = Column(String(100), nullable=True)
    model_name = Column(String(100), nullable=False)
    api_key = Column(String, nullable=False)
    api_base = Column(String(500), nullable=True)
    api_version = Column(String(50), nullable=True)
    litellm_params = Column(JSON, nullable=True, default={})
    search_space_id = Column(
-        Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
+        Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=True
    )
    search_space = relationship("SearchSpace", back_populates="vision_llm_configs")
    user_id = Column(
-        UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False
+        UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=True
    )
    search_space = relationship("SearchSpace", back_populates="connections")
    user = relationship("User", back_populates="connections")
    models = relationship(
        "Model",
        back_populates="connection",
        order_by="Model.id",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )
    __table_args__ = (
        CheckConstraint(
            "(scope = 'GLOBAL' AND search_space_id IS NULL AND user_id IS NULL) OR "
            "(scope = 'SEARCH_SPACE' AND search_space_id IS NOT NULL AND user_id IS NOT NULL) OR "
            "(scope = 'USER' AND user_id IS NOT NULL)",
            name="ck_connections_scope_owner",
        ),
    )
 class Model(BaseModel, TimestampMixin):
    __tablename__ = "models"
    connection_id = Column(
        Integer,
        ForeignKey("connections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    model_id = Column(String(255), nullable=False)
    display_name = Column(String(255), nullable=True)
    source = Column(
        SQLAlchemyEnum(ModelSource),
        nullable=False,
        default=ModelSource.DISCOVERED,
        server_default=ModelSource.DISCOVERED.value,
    )
    supports_chat = Column(Boolean, nullable=True)
    max_input_tokens = Column(Integer, nullable=True)
    supports_image_input = Column(Boolean, nullable=True)
    supports_tools = Column(Boolean, nullable=True)
    supports_image_generation = Column(Boolean, nullable=True)
    capabilities_override = Column(
        JSONB, nullable=False, default=dict, server_default="{}"
    )
    enabled = Column(Boolean, nullable=False, default=True, server_default="true")
    billing_tier = Column(String(50), nullable=True, index=True)
    catalog = Column(JSONB, nullable=False, default=dict, server_default="{}")
    connection = relationship("Connection", back_populates="models")
    __table_args__ = (
        UniqueConstraint(
            "connection_id", "model_id", name="uq_models_connection_model_id"
        ),
        Index("ix_models_model_id", "model_id"),
    )
    user = relationship("User", back_populates="vision_llm_configs")
 class ImageGeneration(BaseModel, TimestampMixin):
@ -1704,10 +1653,9 @@ class ImageGeneration(BaseModel, TimestampMixin):
    style = Column(String(50), nullable=True)  # Model-specific style parameter
    response_format = Column(String(50), nullable=True)  # "url" or "b64_json"
-    # Image generation config reference
+    # Image generation model provenance.
-    # 0 = Auto mode (router), negative IDs = global configs from YAML,
+    # 0 = Auto mode, negative IDs = GLOBAL models, positive IDs = Model records.
-    # positive IDs = ImageGenerationConfig records in DB
+    image_gen_model_id = Column(Integer, nullable=True)
    image_generation_config_id = Column(Integer, nullable=True)
    # Response data (full litellm response as JSONB) — present on success
    response_data = Column(JSONB, nullable=True)
@ -1749,19 +1697,19 @@ class SearchSpace(BaseModel, TimestampMixin):
    shared_memory_md = Column(Text, nullable=True, server_default="")
-    # Search space-level LLM preferences (shared by all members)
+    # Connection/model role bindings.
-    # Note: ID values:
+    # Note: ID values preserve the existing convention:
-    #   - 0: Auto mode (uses LiteLLM Router for load balancing) - default for new search spaces
+    #   - 0: Auto mode
-    #   - Negative IDs: Global configs from YAML
+    #   - Negative IDs: Global virtual models from global_llm_config.yaml
-    #   - Positive IDs: Custom configs from DB (NewLLMConfig table)
+    #   - Positive IDs: User/search-space models from the models table
-    agent_llm_id = Column(
+    chat_model_id = Column(
-        Integer, nullable=True, default=0
+        Integer, nullable=True, default=0, server_default="0"
    )  # For agent/chat operations, defaults to Auto mode
-    image_generation_config_id = Column(
+    image_gen_model_id = Column(
-        Integer, nullable=True, default=0
+        Integer, nullable=True, default=0, server_default="0"
-    )  # For image generation, defaults to Auto mode
+    )  # For image generation, defaults to Auto mode when eligible
-    vision_llm_config_id = Column(
+    vision_model_id = Column(
-        Integer, nullable=True, default=0
+        Integer, nullable=True, default=0, server_default="0"
    )  # For vision/screenshot analysis, defaults to Auto mode
    ai_file_sort_enabled = Column(
@ -1833,23 +1781,12 @@ class SearchSpace(BaseModel, TimestampMixin):
        order_by="SearchSourceConnector.id",
        cascade="all, delete-orphan",
    )
-    new_llm_configs = relationship(
+    connections = relationship(
-        "NewLLMConfig",
+        "Connection",
        back_populates="search_space",
-        order_by="NewLLMConfig.id",
+        order_by="Connection.id",
        cascade="all, delete-orphan",
    )
    image_generation_configs = relationship(
        "ImageGenerationConfig",
        back_populates="search_space",
        order_by="ImageGenerationConfig.id",
        cascade="all, delete-orphan",
    )
    vision_llm_configs = relationship(
        "VisionLLMConfig",
        back_populates="search_space",
        order_by="VisionLLMConfig.id",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )
    automations = relationship(
@ -1952,64 +1889,6 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
    documents = relationship("Document", back_populates="connector")
 class NewLLMConfig(BaseModel, TimestampMixin):
    """
    New LLM configuration table that combines model settings with prompt configuration.
    This table provides:
    - LLM model configuration (provider, model_name, api_key, etc.)
    - Configurable system instructions (defaults to SURFSENSE_SYSTEM_INSTRUCTIONS)
    - Citation toggle (enable/disable citation instructions)
    Note: Tools instructions are built by get_tools_instructions(thread_visibility) (personal vs shared memory).
    """
    __tablename__ = "new_llm_configs"
    name = Column(String(100), nullable=False, index=True)
    description = Column(String(500), nullable=True)
    # === LLM Model Configuration (from original LLMConfig, excluding 'language') ===
    # Provider from the enum
    provider = Column(SQLAlchemyEnum(LiteLLMProvider), nullable=False)
    # Custom provider name when provider is CUSTOM
    custom_provider = Column(String(100), nullable=True)
    # Just the model name without provider prefix
    model_name = Column(String(100), nullable=False)
    # API Key should be encrypted before storing
    api_key = Column(String, nullable=False)
    api_base = Column(String(500), nullable=True)
    # For any other parameters that litellm supports
    litellm_params = Column(JSON, nullable=True, default={})
    # === Prompt Configuration ===
    # Configurable system instructions (defaults to SURFSENSE_SYSTEM_INSTRUCTIONS)
    # Users can customize this from the UI
    system_instructions = Column(
        Text,
        nullable=False,
        default="",  # Empty string means use default SURFSENSE_SYSTEM_INSTRUCTIONS
    )
    # Whether to use the default system instructions when system_instructions is empty
    use_default_system_instructions = Column(Boolean, nullable=False, default=True)
    # Citation toggle - when enabled, SURFSENSE_CITATION_INSTRUCTIONS is injected
    # When disabled, an anti-citation prompt is injected instead
    citations_enabled = Column(Boolean, nullable=False, default=True)
    # === Relationships ===
    search_space_id = Column(
        Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
    )
    search_space = relationship("SearchSpace", back_populates="new_llm_configs")
    # User who created this config
    user_id = Column(
        UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False
    )
    user = relationship("User", back_populates="new_llm_configs")
 class Log(BaseModel, TimestampMixin):
    __tablename__ = "logs"
@ -2376,22 +2255,8 @@ if config.AUTH_TYPE == "GOOGLE":
            passive_deletes=True,
        )
-        # LLM configs created by this user
+        connections = relationship(
-        new_llm_configs = relationship(
+            "Connection",
            "NewLLMConfig",
            back_populates="user",
            passive_deletes=True,
        )
        # Image generation configs created by this user
        image_generation_configs = relationship(
            "ImageGenerationConfig",
            back_populates="user",
            passive_deletes=True,
        )
        vision_llm_configs = relationship(
            "VisionLLMConfig",
            back_populates="user",
            passive_deletes=True,
        )
@ -2522,22 +2387,8 @@ else:
            passive_deletes=True,
        )
-        # LLM configs created by this user
+        connections = relationship(
-        new_llm_configs = relationship(
+            "Connection",
            "NewLLMConfig",
            back_populates="user",
            passive_deletes=True,
        )
        # Image generation configs created by this user
        image_generation_configs = relationship(
            "ImageGenerationConfig",
            back_populates="user",
            passive_deletes=True,
        )
        vision_llm_configs = relationship(
            "VisionLLMConfig",
            back_populates="user",
            passive_deletes=True,
        )
@ -2867,7 +2718,11 @@ from app.automations.persistence import (  # noqa: E402, F401
    AutomationRun,
    AutomationTrigger,
 )
 from app.etl_pipeline.cache.persistence.models import CachedParse  # noqa: E402, F401
 from app.file_storage.persistence import DocumentFile  # noqa: E402, F401
 from app.indexing_pipeline.cache.persistence.models import (  # noqa: E402, F401
    CachedEmbeddingSet,
 )
 from app.notifications.persistence import Notification  # noqa: E402, F401
 from app.podcasts.persistence import (  # noqa: E402, F401
    Podcast,
--- a/surfsense_backend/app/etl_pipeline/cache/init.py
+++ b/surfsense_backend/app/etl_pipeline/cache/init.py
@ -0,0 +1,11 @@
 """Content-addressed reuse of expensive ETL parser output across workspaces."""
 from __future__ import annotations
 from app.etl_pipeline.cache.cached_extraction import extract_with_cache
 from app.etl_pipeline.cache.service import EtlCacheService
 __all__ = [
    "EtlCacheService",
    "extract_with_cache",
 ]
--- a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py
+++ b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py
@ -0,0 +1,86 @@
 """Entry point: serve ETL parses from cache, parsing only on a miss."""
 from __future__ import annotations
 import asyncio
 import hashlib
 import logging
 from app.config import config
 from app.etl_pipeline.cache.eligibility import is_parse_cacheable
 from app.etl_pipeline.cache.schemas import ParseKey
 from app.etl_pipeline.cache.service import EtlCacheService
 from app.etl_pipeline.cache.settings import load_etl_cache_settings
 from app.etl_pipeline.etl_document import EtlRequest, EtlResult
 from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
 from app.observability import metrics
 logger = logging.getLogger(__name__)
 _HASH_CHUNK = 1024 * 1024
 async def extract_with_cache(request: EtlRequest, *, vision_llm=None) -> EtlResult:
    """Drop-in for ``EtlPipelineService.extract`` that reuses prior parser output.

    Requests that ``is_parse_cacheable`` rejects go straight to the pipeline.
    Eligible requests are keyed by the SHA-256 of the file bytes together with
    the ETL service, processing mode and parser version: a hit returns the
    stored result, a miss parses normally and then stores the outcome.

    Every cache step is best-effort. If the source file cannot be hashed the
    request is handed to the pipeline uncached, so callers see the pipeline's
    own behaviour instead of an ``OSError`` raised by the cache layer.
    """
    settings = load_etl_cache_settings()
    cacheable = is_parse_cacheable(
        filename=request.filename,
        etl_service=config.ETL_SERVICE,
        cache_enabled=settings.enabled,
        has_vision_llm=vision_llm is not None,
    )
    if not cacheable:
        return await EtlPipelineService(vision_llm=vision_llm).extract(request)
    try:
        # Hashing reads the whole file, so keep it off the event loop.
        source_sha256 = await asyncio.to_thread(_hash_file, request.file_path)
    except OSError:
        # Without a digest there is no key; fall back to a plain parse.
        logger.warning(
            "ETL cache hashing failed; parsing without cache", exc_info=True
        )
        return await EtlPipelineService(vision_llm=vision_llm).extract(request)
    key = ParseKey.for_document(
        source_sha256,
        etl_service=config.ETL_SERVICE,
        mode=request.processing_mode.value,
        version=settings.parser_version,
    )
    cached_result = await _recall(key)
    if cached_result is not None:
        metrics.record_etl_cache_lookup(
            etl_service=key.etl_service, mode=key.mode, outcome="hit"
        )
        logger.debug("ETL cache hit for %s", key.source_sha256)
        return cached_result
    metrics.record_etl_cache_lookup(
        etl_service=key.etl_service, mode=key.mode, outcome="miss"
    )
    result = await EtlPipelineService(vision_llm=vision_llm).extract(request)
    await _remember(key, result)
    return result
 async def _recall(key: ParseKey) -> EtlResult | None:
    """Look ``key`` up in the parse cache; any failure is reported as a miss."""
    # Caching is best-effort: any failure falls through to a normal parse.
    try:
        from app.tasks.celery_tasks import get_celery_session_maker
        session_factory = get_celery_session_maker()
        async with session_factory() as session:
            cache = EtlCacheService(session)
            return await cache.recall(key)
    except Exception:
        logger.warning("ETL cache recall failed; parsing fresh", exc_info=True)
    return None
 async def _remember(key: ParseKey, result: EtlResult) -> None:
    """Store ``result`` under ``key``; a failed write is logged and ignored."""
    try:
        from app.tasks.celery_tasks import get_celery_session_maker
        session_factory = get_celery_session_maker()
        async with session_factory() as session:
            cache = EtlCacheService(session)
            await cache.remember(key, result)
    except Exception:
        logger.warning("ETL cache write failed; result not cached", exc_info=True)
 def _hash_file(path: str) -> str:
    """Return the hex SHA-256 of the file at ``path``, read ``_HASH_CHUNK`` bytes at a time."""
    hasher = hashlib.sha256()
    with open(path, "rb") as stream:
        # An empty read marks end of file.
        while piece := stream.read(_HASH_CHUNK):
            hasher.update(piece)
    return hasher.hexdigest()
--- a/surfsense_backend/app/etl_pipeline/cache/eligibility.py
+++ b/surfsense_backend/app/etl_pipeline/cache/eligibility.py
@ -0,0 +1,28 @@
 """Gating rule: may this upload be served from / written to the parse cache?"""
 from __future__ import annotations
 from app.etl_pipeline.file_classifier import FileCategory, classify_file
 def is_parse_cacheable(
    *,
    filename: str,
    etl_service: str | None,
    cache_enabled: bool,
    has_vision_llm: bool,
 ) -> bool:
    """Decide whether an upload may be served from / written to the parse cache.

    Only deterministic document parses are shareable across workspaces.
    Vision-LLM runs append model-generated content not captured by the cache
    key, and a missing ETL service means there is no document parser to key
    against -- both bypass the cache. Non-document categories (plaintext,
    audio, images, direct-convert) are cheap or parser-agnostic and are
    handled outside it.
    """
    bypass = (not cache_enabled) or has_vision_llm or (not etl_service)
    if bypass:
        return False
    # Classify only once the cheap gates have passed.
    category = classify_file(filename)
    return category == FileCategory.DOCUMENT
--- a/surfsense_backend/app/etl_pipeline/cache/eviction/init.py
+++ b/surfsense_backend/app/etl_pipeline/cache/eviction/init.py
@ -0,0 +1,9 @@
 """Background pruning of the parse cache by age and size budget."""
 from __future__ import annotations
 from .task import evict_etl_cache_task
 __all__ = [
    "evict_etl_cache_task",
 ]
--- a/surfsense_backend/app/etl_pipeline/cache/eviction/policy.py
+++ b/surfsense_backend/app/etl_pipeline/cache/eviction/policy.py
@ -0,0 +1,28 @@
 """Pure selection rules for which cached entries to drop."""
 from __future__ import annotations
 from collections.abc import Iterable
 from app.etl_pipeline.cache.schemas import EvictionCandidate
 def select_over_budget(
    coldest_first: Iterable[EvictionCandidate],
    *,
    current_total_bytes: int,
    max_total_bytes: int,
 ) -> list[EvictionCandidate]:
    """Pick entries, coldest first, until the footprint fits the budget.

    Returns an empty list when the cache is already within budget. Otherwise
    candidates are taken in the order given until their combined
    ``size_bytes`` covers the overshoot, or the iterable runs out.
    """
    outstanding = current_total_bytes - max_total_bytes
    if outstanding <= 0:
        return []
    picked: list[EvictionCandidate] = []
    for entry in coldest_first:
        # Stop as soon as the entries already picked cover the overshoot.
        if outstanding <= 0:
            break
        picked.append(entry)
        outstanding -= entry.size_bytes
    return picked
--- a/surfsense_backend/app/etl_pipeline/cache/eviction/task.py
+++ b/surfsense_backend/app/etl_pipeline/cache/eviction/task.py
@ -0,0 +1,68 @@
 """Celery task that prunes the parse cache by TTL, then by size budget."""
 from __future__ import annotations
 import contextlib
 import logging
 from datetime import UTC, datetime, timedelta
 from app.celery_app import celery_app
 from app.etl_pipeline.cache.eviction.policy import select_over_budget
 from app.etl_pipeline.cache.persistence import CachedParseRepository
 from app.etl_pipeline.cache.schemas import EvictionCandidate
 from app.etl_pipeline.cache.settings import load_etl_cache_settings
 from app.etl_pipeline.cache.storage import MarkdownCacheStore
 from app.observability import metrics
 from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
 logger = logging.getLogger(__name__)
@celery_app.task(name="evict_etl_cache")
 def evict_etl_cache_task():
    """Celery entry point registered as ``evict_etl_cache``; hands ``_evict`` to ``run_async_celery_task``."""
    return run_async_celery_task(_evict)
 async def _evict() -> None:
    """Prune the parse cache: expired entries first, then coldest overflow.

    Does nothing when the cache is disabled. The size phase only runs if the
    index still reports more bytes than the budget after the TTL phase, and
    each phase looks at no more than ``eviction_batch`` rows.
    """
    cfg = load_etl_cache_settings()
    if not cfg.enabled:
        return
    blob_store = MarkdownCacheStore()
    session_factory = get_celery_session_maker()
    async with session_factory() as session:
        repo = CachedParseRepository(session)
        # Phase 1: entries not used since the TTL cutoff.
        stale_before = datetime.now(UTC) - timedelta(days=cfg.ttl_days)
        stale = await repo.select_expired(
            cutoff=stale_before, limit=cfg.eviction_batch
        )
        await _drop(repo, blob_store, stale, phase="ttl")
        # Phase 2: only needed when the footprint is still over budget.
        footprint = await repo.total_size_bytes()
        if footprint <= cfg.max_total_bytes:
            return
        ranked = await repo.select_coldest(limit=cfg.eviction_batch)
        overflow = select_over_budget(
            ranked,
            current_total_bytes=footprint,
            max_total_bytes=cfg.max_total_bytes,
        )
        await _drop(repo, blob_store, overflow, phase="size")
 async def _drop(
    index: CachedParseRepository,
    store: MarkdownCacheStore,
    candidates: list[EvictionCandidate],
    *,
    phase: str,
 ) -> None:
    """Delete the blobs and index rows for ``candidates`` and record the eviction.

    ``phase`` labels the metric and the log line (the caller passes ``"ttl"``
    or ``"size"``). A blob that cannot be deleted is logged and skipped; its
    index row is still removed, so the entry stops being served.
    """
    if not candidates:
        return
    for candidate in candidates:
        try:
            await store.delete(candidate.storage_key)
        except Exception:
            # Drop the index row even if the blob delete fails (orphan blob is
            # harmless), but leave a trace so leaked blobs can be found.
            logger.warning(
                "Failed to delete cached parse blob %s; dropping index row anyway",
                candidate.storage_key,
                exc_info=True,
            )
    await index.delete_by_ids([candidate.id for candidate in candidates])
    metrics.record_etl_cache_eviction(len(candidates), phase=phase)
    logger.info("Evicted %d cached parses (%s)", len(candidates), phase)
--- a/surfsense_backend/app/etl_pipeline/cache/persistence/init.py
+++ b/surfsense_backend/app/etl_pipeline/cache/persistence/init.py
@ -0,0 +1,11 @@
 """Database access for cached parse rows."""
 from __future__ import annotations
 from .models import CachedParse
 from .repository import CachedParseRepository
 __all__ = [
    "CachedParse",
    "CachedParseRepository",
 ]
--- a/surfsense_backend/app/etl_pipeline/cache/persistence/models.py
+++ b/surfsense_backend/app/etl_pipeline/cache/persistence/models.py
@ -0,0 +1,49 @@
 """``etl_cache_parses``: one reusable parser result per (bytes + recipe)."""
 from __future__ import annotations
 from sqlalchemy import (
    BigInteger,
    Column,
    DateTime,
    Index,
    Integer,
    String,
    UniqueConstraint,
 )
 from app.db import BaseModel, TimestampMixin
 class CachedParse(BaseModel, TimestampMixin):
    """Index row in ``etl_cache_parses`` for one cached parser result.

    The markdown itself lives in blob storage; the row records where to find
    it, the fields needed to rebuild an ``EtlResult`` on a hit, and the usage
    counters that drive eviction. The four key columns are unique together.
    """
    __tablename__ = "etl_cache_parses"
    # Key: raw bytes + the recipe that produced the markdown.
    source_sha256 = Column(String(64), nullable=False)
    etl_service = Column(String(32), nullable=False)
    mode = Column(String(16), nullable=False)
    parser_version = Column(Integer, nullable=False)
    # Where the markdown blob lives (kept out of the row to stay small).
    storage_backend = Column(String(32), nullable=False)
    storage_key = Column(String, nullable=False)
    size_bytes = Column(BigInteger, nullable=False)
    # Payload needed to rebuild the EtlResult on a hit.
    content_type = Column(String(32), nullable=False)
    actual_pages = Column(Integer, nullable=False, default=0, server_default="0")
    # Drives eviction (popularity + recency).
    times_reused = Column(BigInteger, nullable=False, default=0, server_default="0")
    last_used_at = Column(DateTime(timezone=True), nullable=False)
    __table_args__ = (
        # One row per (bytes, service, mode, parser version); the repository's
        # insert names this constraint in its ON CONFLICT clause.
        UniqueConstraint(
            "source_sha256",
            "etl_service",
            "mode",
            "parser_version",
            name="uq_etl_cache_parses_key",
        ),
        # Supports the eviction queries that filter and sort on last_used_at.
        Index("ix_etl_cache_parses_last_used_at", "last_used_at"),
    )
--- a/surfsense_backend/app/etl_pipeline/cache/persistence/repository.py
+++ b/surfsense_backend/app/etl_pipeline/cache/persistence/repository.py
@ -0,0 +1,121 @@
 """CRUD and eviction selectors for ``etl_cache_parses`` (no business rules)."""
 from __future__ import annotations
 from datetime import UTC, datetime
 from sqlalchemy import delete, func, select, update
 from sqlalchemy.dialects.postgresql import insert as pg_insert
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.etl_pipeline.cache.schemas import EvictionCandidate, ParseKey
 from .models import CachedParse
 # Columns projected by the eviction selectors; they mirror the fields that
 # _as_eviction_candidate copies into an EvictionCandidate.
 _EVICTION_COLUMNS = (
    CachedParse.id,
    CachedParse.storage_key,
    CachedParse.size_bytes,
    CachedParse.last_used_at,
    CachedParse.times_reused,
 )
 def _as_eviction_candidate(row) -> EvictionCandidate:
    """Copy the eviction-relevant attributes of a result row into an ``EvictionCandidate``."""
    field_names = ("id", "storage_key", "size_bytes", "last_used_at", "times_reused")
    return EvictionCandidate(**{name: getattr(row, name) for name in field_names})
 class CachedParseRepository:
    def __init__(self, session: AsyncSession) -> None:
        self._session = session
    async def get(self, key: ParseKey) -> CachedParse | None:
        result = await self._session.execute(
            select(CachedParse).where(
                CachedParse.source_sha256 == key.source_sha256,
                CachedParse.etl_service == key.etl_service,
                CachedParse.mode == key.mode,
                CachedParse.parser_version == key.version,
            )
        )
        return result.scalars().first()
    async def insert(
        self,
        *,
        key: ParseKey,
        content_type: str,
        actual_pages: int,
        storage_backend: str,
        storage_key: str,
        size_bytes: int,
    ) -> None:
        # Concurrent writers parse identical bytes, so a lost race is harmless.
        now = datetime.now(UTC)
        await self._session.execute(
            pg_insert(CachedParse)
            .values(
                source_sha256=key.source_sha256,
                etl_service=key.etl_service,
                mode=key.mode,
                parser_version=key.version,
                content_type=content_type,
                actual_pages=actual_pages,
                storage_backend=storage_backend,
                storage_key=storage_key,
                size_bytes=size_bytes,
                times_reused=0,
                last_used_at=now,
                created_at=now,
            )
            .on_conflict_do_nothing(constraint="uq_etl_cache_parses_key")
        )
        await self._session.commit()
    async def mark_used(self, row_id: int) -> None:
        await self._session.execute(
            update(CachedParse)
            .where(CachedParse.id == row_id)
            .values(
                times_reused=CachedParse.times_reused + 1,
                last_used_at=datetime.now(UTC),
            )
        )
        await self._session.commit()
    async def total_size_bytes(self) -> int:
        result = await self._session.execute(
            select(func.coalesce(func.sum(CachedParse.size_bytes), 0))
        )
        return int(result.scalar() or 0)
    async def select_expired(
        self, *, cutoff: datetime, limit: int
    ) -> list[EvictionCandidate]:
        result = await self._session.execute(
            select(*_EVICTION_COLUMNS)
            .where(CachedParse.last_used_at < cutoff)
            .order_by(CachedParse.last_used_at.asc())
            .limit(limit)
        )
        return [_as_eviction_candidate(row) for row in result]
    async def select_coldest(self, *, limit: int) -> list[EvictionCandidate]:
        result = await self._session.execute(
            select(*_EVICTION_COLUMNS)
            .order_by(CachedParse.times_reused.asc(), CachedParse.last_used_at.asc())
            .limit(limit)
        )
        return [_as_eviction_candidate(row) for row in result]
    async def delete_by_ids(self, ids: list[int]) -> None:
        if not ids:
            return
        await self._session.execute(delete(CachedParse).where(CachedParse.id.in_(ids)))
        await self._session.commit()
--- a/surfsense_backend/app/etl_pipeline/cache/schemas/init.py
+++ b/surfsense_backend/app/etl_pipeline/cache/schemas/init.py
@ -0,0 +1,11 @@
 """Pure value objects for the parse cache."""
 from __future__ import annotations
 from .eviction_candidate import EvictionCandidate
 from .parse_key import ParseKey
 __all__ = [
    "EvictionCandidate",
    "ParseKey",
 ]
--- a/surfsense_backend/app/etl_pipeline/cache/schemas/eviction_candidate.py
+++ b/surfsense_backend/app/etl_pipeline/cache/schemas/eviction_candidate.py
@ -0,0 +1,15 @@
 """Row projection handed to the eviction policy."""
 from __future__ import annotations
 from dataclasses import dataclass
 from datetime import datetime
@dataclass(frozen=True, slots=True)
 class EvictionCandidate:
    """Immutable projection of one cache index row, handed to eviction code."""
    # Primary key of the index row; used to delete the row.
    id: int
    # Blob-store key of the cached payload; used to delete the blob.
    storage_key: str
    # Size recorded for the entry; summed by the size-budget policy.
    size_bytes: int
    # Last-use timestamp and reuse count, copied from the row's ordering columns.
    last_used_at: datetime
    times_reused: int
--- a/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py
+++ b/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py
@ -0,0 +1,28 @@
 """Identity of a cacheable parse: equal keys yield identical markdown."""
 from __future__ import annotations
 from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
 class ParseKey:
    source_sha256: str
    etl_service: str
    mode: str
    version: int
    @classmethod
    def for_document(
        cls, source_sha256: str, *, etl_service: str, mode: str, version: int
    ) -> ParseKey:
        return cls(
            source_sha256=source_sha256,
            etl_service=etl_service,
            mode=mode,
            version=version,
        )
    @property
    def object_suffix(self) -> str:
        return f"{self.etl_service}.{self.mode}.v{self.version}.md"
--- a/surfsense_backend/app/etl_pipeline/cache/service.py
+++ b/surfsense_backend/app/etl_pipeline/cache/service.py
@ -0,0 +1,53 @@
 """Recall and remember parser output, coordinating the index and blob store."""
 from __future__ import annotations
 import logging
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.etl_pipeline.cache.persistence import CachedParseRepository
 from app.etl_pipeline.cache.schemas import ParseKey
 from app.etl_pipeline.cache.storage import MarkdownCacheStore
 from app.etl_pipeline.etl_document import EtlResult
 logger = logging.getLogger(__name__)
 class EtlCacheService:
    """Recall and remember parser output, coordinating the index and blob store."""
    def __init__(self, session: AsyncSession) -> None:
        self._index = CachedParseRepository(session)
        self._store = MarkdownCacheStore()
    async def recall(self, key: ParseKey) -> EtlResult | None:
        """Return the cached result, or None on a miss.

        A row whose blob cannot be loaded counts as a miss. Recording the
        reuse is best-effort: if that update fails the hit is still served.
        """
        row = await self._index.get(key)
        if row is None:
            return None
        storage_key = row.storage_key
        try:
            markdown = await self._store.load(storage_key)
        except Exception:
            # Index points at a blob that is gone; treat as a miss and re-parse.
            logger.warning("Cache blob missing: %s", storage_key, exc_info=True)
            return None
        # Build the result before mark_used() commits: if the session expires
        # ORM attributes on commit (SQLAlchemy's default), reading the row
        # afterwards would need a reload that AsyncSession cannot do implicitly.
        result = EtlResult(
            markdown_content=markdown,
            etl_service=row.etl_service,
            actual_pages=row.actual_pages,
            content_type=row.content_type,
        )
        try:
            await self._index.mark_used(row.id)
        except Exception:
            # Usage stats only steer eviction; losing one update must not
            # turn a valid hit into a full re-parse.
            logger.warning(
                "Cache usage update failed for %s; serving hit anyway",
                storage_key,
                exc_info=True,
            )
        return result
    async def remember(self, key: ParseKey, result: EtlResult) -> None:
        """Store a freshly parsed result for future reuse.

        The blob is written first, then the index row that points at it;
        ``size_bytes`` is the UTF-8 length of the markdown.
        """
        storage_key = await self._store.save(key, result.markdown_content)
        await self._index.insert(
            key=key,
            content_type=result.content_type,
            actual_pages=result.actual_pages,
            storage_backend=self._store.backend_name,
            storage_key=storage_key,
            size_bytes=len(result.markdown_content.encode("utf-8")),
        )
--- a/surfsense_backend/app/etl_pipeline/cache/settings.py
+++ b/surfsense_backend/app/etl_pipeline/cache/settings.py
@ -0,0 +1,33 @@
 """Cache configuration resolved from the central ``Config``."""
 from __future__ import annotations
 from dataclasses import dataclass
@dataclass(frozen=True)
 class EtlCacheSettings:
    """Immutable snapshot of the parse-cache configuration."""
    # Master switch: when False, requests bypass the cache and eviction is a no-op.
    enabled: bool
    # Part of the cache key.
    parser_version: int
    # Age, in days since last use, after which the TTL eviction phase selects an entry.
    ttl_days: int
    # Size budget in bytes (the loader converts from megabytes).
    max_total_bytes: int
    # Row limit passed to each eviction query.
    eviction_batch: int
    # None for any storage_* field means: reuse the main file_storage backend.
    storage_backend: str | None
    storage_container: str | None
    storage_local_root: str | None
 def load_etl_cache_settings() -> EtlCacheSettings:
    """Read the ``ETL_CACHE_*`` values from the central config into an ``EtlCacheSettings``.

    The size budget is converted from megabytes to bytes, and empty storage
    overrides are normalised to None.
    """
    # Imported lazily, as in the rest of this package.
    from app.config import config
    bytes_per_mb = 1024 * 1024
    return EtlCacheSettings(
        enabled=config.ETL_CACHE_ENABLED,
        parser_version=config.ETL_CACHE_PARSER_VERSION,
        ttl_days=config.ETL_CACHE_TTL_DAYS,
        max_total_bytes=config.ETL_CACHE_MAX_TOTAL_MB * bytes_per_mb,
        eviction_batch=config.ETL_CACHE_EVICTION_BATCH,
        storage_backend=config.ETL_CACHE_STORAGE_BACKEND or None,
        storage_container=config.ETL_CACHE_STORAGE_CONTAINER or None,
        storage_local_root=config.ETL_CACHE_STORAGE_LOCAL_PATH or None,
    )
--- a/surfsense_backend/app/etl_pipeline/cache/storage/init.py
+++ b/surfsense_backend/app/etl_pipeline/cache/storage/init.py
@ -0,0 +1,9 @@
 """Blob storage for cached parse markdown."""
 from __future__ import annotations
 from .markdown_store import MarkdownCacheStore
 __all__ = [
    "MarkdownCacheStore",
 ]
--- a/surfsense_backend/app/etl_pipeline/cache/storage/backend.py
+++ b/surfsense_backend/app/etl_pipeline/cache/storage/backend.py
@ -0,0 +1,48 @@
 """Resolve the storage backend for cache blobs: shared main store or a dedicated one."""
 from __future__ import annotations
 from functools import lru_cache
 from app.file_storage.backends.base import StorageBackend
@lru_cache(maxsize=1)
 def resolve_cache_backend() -> StorageBackend:
    from app.etl_pipeline.cache.settings import load_etl_cache_settings
    settings = load_etl_cache_settings()
    if not settings.storage_backend:
        from app.file_storage.factory import get_storage_backend
        return get_storage_backend()
    backend = settings.storage_backend.strip().lower()
    if backend == "azure":
        from app.config import config
        if not settings.storage_container:
            raise ValueError("ETL_CACHE_STORAGE_CONTAINER is required for azure cache.")
        if not config.AZURE_STORAGE_CONNECTION_STRING:
            raise ValueError(
                "AZURE_STORAGE_CONNECTION_STRING is required for azure cache."
            )
        from app.file_storage.backends.azure import AzureBlobBackend
        return AzureBlobBackend(
            connection_string=config.AZURE_STORAGE_CONNECTION_STRING,
            container=settings.storage_container,
        )
    if backend == "local":
        if not settings.storage_local_root:
            raise ValueError(
                "ETL_CACHE_STORAGE_LOCAL_PATH is required for local cache."
            )
        from app.file_storage.backends.local import LocalFileBackend
        return LocalFileBackend(settings.storage_local_root)
    raise ValueError(f"Unknown ETL_CACHE_STORAGE_BACKEND: {settings.storage_backend!r}")
--- a/surfsense_backend/app/etl_pipeline/cache/storage/markdown_store.py
+++ b/surfsense_backend/app/etl_pipeline/cache/storage/markdown_store.py
@ -0,0 +1,35 @@
 """Read and write cached markdown blobs through the resolved backend."""
 from __future__ import annotations
 from app.etl_pipeline.cache.schemas import ParseKey
 from app.etl_pipeline.cache.storage.backend import resolve_cache_backend
 from app.etl_pipeline.cache.storage.object_keys import build_parse_object_key
 _MARKDOWN_CONTENT_TYPE = "text/markdown; charset=utf-8"
 class MarkdownCacheStore:
    """Reads, writes and deletes cached markdown through the resolved cache backend."""
    def __init__(self) -> None:
        self._backend = resolve_cache_backend()
    @property
    def backend_name(self) -> str:
        """Name reported by the underlying backend."""
        return self._backend.backend_name
    async def save(self, key: ParseKey, markdown: str) -> str:
        """Persist the markdown and return its storage key for the index row."""
        object_key = build_parse_object_key(key)
        payload = markdown.encode("utf-8")
        await self._backend.put(
            object_key,
            payload,
            content_type=_MARKDOWN_CONTENT_TYPE,
        )
        return object_key
    async def load(self, storage_key: str) -> str:
        """Read the whole blob at ``storage_key`` and decode it as UTF-8."""
        buffer = bytearray()
        async for piece in self._backend.open_stream(storage_key):
            buffer.extend(piece)
        return buffer.decode("utf-8")
    async def delete(self, storage_key: str) -> None:
        """Delete the blob at ``storage_key`` from the backend."""
        await self._backend.delete(storage_key)
--- a/surfsense_backend/app/etl_pipeline/cache/storage/object_keys.py
+++ b/surfsense_backend/app/etl_pipeline/cache/storage/object_keys.py
@ -0,0 +1,12 @@
 """Object keys for cached markdown, namespaced under a dedicated prefix."""
 from __future__ import annotations
 from app.etl_pipeline.cache.schemas import ParseKey
 CACHE_PREFIX = "etl_cache"
 def build_parse_object_key(key: ParseKey) -> str:
    """Return the blob key ``<CACHE_PREFIX>/<source_sha256>/<object_suffix>`` for ``key``."""
    # Content-addressed: identical bytes + recipe always map to the same key.
    digest_dir = f"{CACHE_PREFIX}/{key.source_sha256}"
    return f"{digest_dir}/{key.object_suffix}"
--- a/surfsense_backend/app/gateway/init.py
+++ b/surfsense_backend/app/gateway/init.py
@ -8,7 +8,7 @@ from app.config import config
 def require_gateway_enabled() -> None:
-    """FastAPI dependency that gates all gateway HTTP routes on the global flag.
+    """FastAPI dependency that gates gateway operational routes on the global flag.
    Returns 404 (rather than 503) when ``GATEWAY_ENABLED`` is FALSE so that
    disabling the gateway makes its webhook/OAuth/pairing surface indistinguishable
--- a/surfsense_backend/app/indexing_pipeline/cache/init.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/init.py
@ -0,0 +1,11 @@
 """Content-addressed reuse of chunk+embedding output across workspaces."""
 from __future__ import annotations
 from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings
 from app.indexing_pipeline.cache.service import EmbeddingCacheService
 __all__ = [
    "EmbeddingCacheService",
    "build_chunk_embeddings",
 ]
--- a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py
@ -0,0 +1,129 @@
 """Entry point: serve chunk embeddings from cache, embedding only on a miss.
 Embeddings are a pure function of the markdown, the embedding model, and the
 chunker -- so identical markdown is chunked and embedded once and reused across
 workspaces, even when it came from different sources.
 """
 from __future__ import annotations
 import asyncio
 import hashlib
 import logging
 import numpy as np
 from app.config import config
 from app.indexing_pipeline.cache.eligibility import is_embedding_cacheable
 from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet
 from app.indexing_pipeline.cache.service import EmbeddingCacheService
 from app.indexing_pipeline.cache.settings import load_embedding_cache_settings
 from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
 from app.indexing_pipeline.document_embedder import embed_texts
 from app.observability import metrics
 logger = logging.getLogger(__name__)
 ChunkPair = tuple[str, np.ndarray]
 async def build_chunk_embeddings(
    markdown: str, *, use_code_chunker: bool
 ) -> tuple[np.ndarray, list[ChunkPair]]:
    """Return the document-level vector and ordered ``(chunk_text, vector)`` pairs.

    Drop-in for the inline chunk+embed step. When caching is eligible the
    result is looked up under a key built from the markdown hash, the
    embedding model and dimension, and the chunker kind and version. A hit is
    returned as stored; a miss is computed and then written back.
    """
    cache_settings = load_embedding_cache_settings()
    dimension = getattr(config.embedding_model_instance, "dimension", None)
    if not is_embedding_cacheable(
        cache_enabled=cache_settings.enabled,
        embedding_model=config.EMBEDDING_MODEL,
        embedding_dim=dimension,
    ):
        return await _compute(markdown, use_code_chunker=use_code_chunker)
    chunker_kind = "code" if use_code_chunker else "hybrid"
    key = EmbeddingKey(
        markdown_sha256=_hash_text(markdown),
        embedding_model=config.EMBEDDING_MODEL,
        embedding_dim=int(dimension),
        chunker_kind=chunker_kind,
        chunker_version=cache_settings.chunker_version,
    )
    hit = await _recall(key)
    # One lookup metric per request, labelled with the outcome.
    metrics.record_embedding_cache_lookup(
        embedding_model=key.embedding_model,
        chunker_kind=chunker_kind,
        outcome="miss" if hit is None else "hit",
    )
    if hit is not None:
        logger.debug("Embedding cache hit for %s", key.markdown_sha256)
        return hit.summary_embedding, [(c.text, c.embedding) for c in hit.chunks]
    summary_embedding, chunk_pairs = await _compute(
        markdown, use_code_chunker=use_code_chunker
    )
    await _remember(key, summary_embedding, chunk_pairs)
    return summary_embedding, chunk_pairs
 async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]:
    """Split markdown into ordered chunk texts in a worker thread, using the selected chunker."""
    if not use_code_chunker:
        # Table-aware hybrid chunker keeps Markdown tables intact (issue #1334).
        return await asyncio.to_thread(chunk_text_hybrid, markdown)
    return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True)
 async def embed_batch(texts: list[str]) -> list[np.ndarray]:
    """Embed ``texts`` with a single ``embed_texts`` call run in a worker thread."""
    vectors = await asyncio.to_thread(embed_texts, texts)
    return vectors
 async def _compute(
    markdown: str, *, use_code_chunker: bool
 ) -> tuple[np.ndarray, list[ChunkPair]]:
    """Chunk ``markdown`` and embed the full text plus every chunk in one batch."""
    pieces = await chunk_markdown(markdown, use_code_chunker=use_code_chunker)
    # The full markdown goes first so its vector comes back first.
    vectors = await embed_batch([markdown, *pieces])
    summary_embedding, *piece_vectors = vectors
    pairs = [(piece, vector) for piece, vector in zip(pieces, piece_vectors, strict=False)]
    return summary_embedding, pairs
 async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
    """Look ``key`` up in the embedding cache; any failure is reported as a miss."""
    # Caching is best-effort: any failure falls through to a normal embed.
    try:
        from app.tasks.celery_tasks import get_celery_session_maker
        session_factory = get_celery_session_maker()
        async with session_factory() as session:
            cache = EmbeddingCacheService(session)
            return await cache.recall(key)
    except Exception:
        logger.warning("Embedding cache recall failed; embedding fresh", exc_info=True)
    return None
 async def _remember(
    key: EmbeddingKey, summary_embedding: np.ndarray, chunk_pairs: list[ChunkPair]
 ) -> None:
    """Store the computed vectors under ``key``; a failed write is logged and ignored."""
    try:
        from app.tasks.celery_tasks import get_celery_session_maker
        cached_chunks = [
            CachedChunk(text=piece, embedding=vector) for piece, vector in chunk_pairs
        ]
        payload = EmbeddingSet(
            summary_embedding=summary_embedding,
            chunks=cached_chunks,
        )
        session_factory = get_celery_session_maker()
        async with session_factory() as session:
            cache = EmbeddingCacheService(session)
            await cache.remember(key, payload)
    except Exception:
        logger.warning("Embedding cache write failed; result not cached", exc_info=True)
 def _hash_text(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
--- a/surfsense_backend/app/indexing_pipeline/cache/eligibility.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/eligibility.py
@ -0,0 +1,21 @@
 """Gating rule: may this document be served from / written to the embedding cache?"""
 from __future__ import annotations
 def is_embedding_cacheable(
    *,
    cache_enabled: bool,
    embedding_model: str | None,
    embedding_dim: int | None,
 ) -> bool:
    """Decide whether embeddings may be served from / written to the cache.

    Caching needs a concrete embedding model and dimension. Without a model
    there is nothing to key against, and without a dimension the blob's
    integrity guard cannot run -- both bypass the cache.
    """
    if cache_enabled and embedding_model:
        return bool(embedding_dim)
    return False
--- a/surfsense_backend/app/indexing_pipeline/cache/eviction/init.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/eviction/init.py
@ -0,0 +1,9 @@
 """Background pruning of the embedding cache by age and size budget."""
 from __future__ import annotations
 from .task import evict_embedding_cache_task
 __all__ = [
    "evict_embedding_cache_task",
 ]
--- a/surfsense_backend/app/indexing_pipeline/cache/eviction/task.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/eviction/task.py
@ -0,0 +1,68 @@
 """Celery task that prunes the embedding cache by TTL, then by size budget."""
 from __future__ import annotations
 import contextlib
 import logging
 from datetime import UTC, datetime, timedelta
 from app.celery_app import celery_app
 from app.etl_pipeline.cache.eviction.policy import select_over_budget
 from app.etl_pipeline.cache.schemas import EvictionCandidate
 from app.indexing_pipeline.cache.persistence import CachedEmbeddingSetRepository
 from app.indexing_pipeline.cache.settings import load_embedding_cache_settings
 from app.indexing_pipeline.cache.storage import EmbeddingCacheStore
 from app.observability import metrics
 from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
 logger = logging.getLogger(__name__)
@celery_app.task(name="evict_embedding_cache")
 def evict_embedding_cache_task():
    """Celery entry point: hand the ``_evict`` coroutine function to ``run_async_celery_task``."""
    return run_async_celery_task(_evict)
 async def _evict() -> None:
    """Run one eviction pass: TTL expiry first, size budget second.

    Does nothing when the embedding cache is disabled. Otherwise drops up to
    one batch of entries last used before the TTL cutoff, re-measures the
    indexed total, and only if that total still exceeds the byte budget asks
    ``select_over_budget`` which of the coldest entries to drop.
    """
    settings = load_embedding_cache_settings()
    if not settings.enabled:
        return
    batch_limit = settings.eviction_batch
    byte_budget = settings.max_total_bytes
    store = EmbeddingCacheStore()
    session_maker = get_celery_session_maker()
    async with session_maker() as session:
        index = CachedEmbeddingSetRepository(session)
        # Phase 1: age-based expiry.
        stale_before = datetime.now(UTC) - timedelta(days=settings.ttl_days)
        expired = await index.select_expired(cutoff=stale_before, limit=batch_limit)
        await _drop(index, store, expired, phase="ttl")
        # Phase 2: size budget, measured after the TTL phase has run.
        total = await index.total_size_bytes()
        if total <= byte_budget:
            return
        coldest = await index.select_coldest(limit=batch_limit)
        over_budget = select_over_budget(
            coldest,
            current_total_bytes=total,
            max_total_bytes=byte_budget,
        )
        await _drop(index, store, over_budget, phase="size")
 async def _drop(
    index: CachedEmbeddingSetRepository,
    store: EmbeddingCacheStore,
    candidates: list[EvictionCandidate],
    *,
    phase: str,
 ) -> None:
    """Delete the candidates' blobs, then their index rows, and record the eviction.

    Blob deletion is best-effort: a failure is logged and the index row is
    still removed, leaving at worst an orphan blob.

    Args:
        index: Repository used to delete the index rows.
        store: Blob store used to delete each candidate's ``storage_key``.
        candidates: Entries to evict; an empty list is a no-op.
        phase: Label attached to the eviction metric and the log lines.
    """
    if not candidates:
        return
    for candidate in candidates:
        try:
            await store.delete(candidate.storage_key)
        except Exception:
            # Drop the index row even if the blob delete fails (orphan blob is
            # harmless), but leave a trace so a failing backend is diagnosable.
            logger.warning(
                "Could not delete cached embedding blob %s (%s)",
                candidate.storage_key,
                phase,
                exc_info=True,
            )
    await index.delete_by_ids([candidate.id for candidate in candidates])
    metrics.record_embedding_cache_eviction(len(candidates), phase=phase)
    logger.info("Evicted %d cached embedding sets (%s)", len(candidates), phase)
--- a/surfsense_backend/app/indexing_pipeline/cache/persistence/__init__.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/persistence/__init__.py
@ -0,0 +1,11 @@
 """Database access for cached embedding sets."""
 from __future__ import annotations
 from .models import CachedEmbeddingSet
 from .repository import CachedEmbeddingSetRepository
 __all__ = [
    "CachedEmbeddingSet",
    "CachedEmbeddingSetRepository",
 ]
--- a/surfsense_backend/app/indexing_pipeline/cache/persistence/models.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/persistence/models.py
@ -0,0 +1,47 @@
 """``embedding_cache_sets``: one reusable chunk+embedding set per markdown."""
 from __future__ import annotations
 from sqlalchemy import (
    BigInteger,
    Column,
    DateTime,
    Index,
    Integer,
    String,
    UniqueConstraint,
 )
 from app.db import BaseModel, TimestampMixin
 class CachedEmbeddingSet(BaseModel, TimestampMixin):
    """Index row for one cached embedding set; the vectors themselves live in a blob.

    A row is unique per (markdown hash, embedding model, chunker kind, chunker
    version). ``embedding_dim`` is stored on the row but is not part of that
    unique constraint.
    """
    __tablename__ = "embedding_cache_sets"
    # Key: markdown text + the recipe that turned it into vectors.
    markdown_sha256 = Column(String(64), nullable=False)
    embedding_model = Column(String(255), nullable=False)
    embedding_dim = Column(Integer, nullable=False)
    chunker_kind = Column(String(8), nullable=False)
    chunker_version = Column(Integer, nullable=False)
    # Where the embedding blob lives (kept out of the row to stay small).
    storage_backend = Column(String(32), nullable=False)
    storage_key = Column(String, nullable=False)
    size_bytes = Column(BigInteger, nullable=False)
    chunk_count = Column(Integer, nullable=False, default=0, server_default="0")
    # Drives eviction (popularity + recency).
    times_reused = Column(BigInteger, nullable=False, default=0, server_default="0")
    last_used_at = Column(DateTime(timezone=True), nullable=False)
    __table_args__ = (
        # The cache key; ``embedding_dim`` is deliberately absent from it here.
        UniqueConstraint(
            "markdown_sha256",
            "embedding_model",
            "chunker_kind",
            "chunker_version",
            name="uq_embedding_cache_sets_key",
        ),
        # Index on the recency column that the eviction selectors filter/sort by.
        Index("ix_embedding_cache_sets_last_used_at", "last_used_at"),
    )
--- a/surfsense_backend/app/indexing_pipeline/cache/persistence/repository.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/persistence/repository.py
@ -0,0 +1,126 @@
 """CRUD and eviction selectors for ``embedding_cache_sets`` (no business rules)."""
 from __future__ import annotations
 from datetime import UTC, datetime
 from sqlalchemy import delete, func, select, update
 from sqlalchemy.dialects.postgresql import insert as pg_insert
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.etl_pipeline.cache.schemas import EvictionCandidate
 from app.indexing_pipeline.cache.schemas import EmbeddingKey
 from .models import CachedEmbeddingSet
 # Columns fetched by the eviction selectors: exactly the fields that
 # ``_as_eviction_candidate`` copies into an ``EvictionCandidate``.
 _EVICTION_COLUMNS = (
    CachedEmbeddingSet.id,
    CachedEmbeddingSet.storage_key,
    CachedEmbeddingSet.size_bytes,
    CachedEmbeddingSet.last_used_at,
    CachedEmbeddingSet.times_reused,
 )
 def _as_eviction_candidate(row) -> EvictionCandidate:
    """Copy the eviction-relevant attributes of a result row into an ``EvictionCandidate``."""
    field_names = ("id", "storage_key", "size_bytes", "last_used_at", "times_reused")
    return EvictionCandidate(**{name: getattr(row, name) for name in field_names})
 class CachedEmbeddingSetRepository:
    """Data access for ``embedding_cache_sets`` rows through one async session.

    The writing methods (``insert``, ``mark_used``, ``delete_by_ids``) commit
    the session themselves before returning.
    """
    def __init__(self, session: AsyncSession) -> None:
        self._session = session
    async def get(self, key: EmbeddingKey) -> CachedEmbeddingSet | None:
        """Return the row for ``key`` or None; ``embedding_dim`` is not part of the match."""
        table = CachedEmbeddingSet
        matches_key = (
            table.markdown_sha256 == key.markdown_sha256,
            table.embedding_model == key.embedding_model,
            table.chunker_kind == key.chunker_kind,
            table.chunker_version == key.chunker_version,
        )
        rows = await self._session.execute(select(table).where(*matches_key))
        return rows.scalars().first()
    async def insert(
        self,
        *,
        key: EmbeddingKey,
        storage_backend: str,
        storage_key: str,
        size_bytes: int,
        chunk_count: int,
    ) -> None:
        """Record a newly stored blob; an existing row for the same key is left untouched."""
        # Concurrent writers embed identical markdown, so a lost race is harmless.
        stamp = datetime.now(UTC)
        new_row = {
            "markdown_sha256": key.markdown_sha256,
            "embedding_model": key.embedding_model,
            "embedding_dim": key.embedding_dim,
            "chunker_kind": key.chunker_kind,
            "chunker_version": key.chunker_version,
            "storage_backend": storage_backend,
            "storage_key": storage_key,
            "size_bytes": size_bytes,
            "chunk_count": chunk_count,
            "times_reused": 0,
            "last_used_at": stamp,
            "created_at": stamp,
        }
        statement = (
            pg_insert(CachedEmbeddingSet)
            .values(**new_row)
            .on_conflict_do_nothing(constraint="uq_embedding_cache_sets_key")
        )
        await self._session.execute(statement)
        await self._session.commit()
    async def mark_used(self, row_id: int) -> None:
        """Increment the reuse counter of ``row_id`` and stamp it as used now."""
        bump = (
            update(CachedEmbeddingSet)
            .where(CachedEmbeddingSet.id == row_id)
            .values(
                times_reused=CachedEmbeddingSet.times_reused + 1,
                last_used_at=datetime.now(UTC),
            )
        )
        await self._session.execute(bump)
        await self._session.commit()
    async def total_size_bytes(self) -> int:
        """Sum ``size_bytes`` over all rows; 0 when the table is empty."""
        summed = func.coalesce(func.sum(CachedEmbeddingSet.size_bytes), 0)
        result = await self._session.execute(select(summed))
        return int(result.scalar() or 0)
    async def select_expired(
        self, *, cutoff: datetime, limit: int
    ) -> list[EvictionCandidate]:
        """Return up to ``limit`` rows last used before ``cutoff``, oldest first."""
        query = (
            select(*_EVICTION_COLUMNS)
            .where(CachedEmbeddingSet.last_used_at < cutoff)
            .order_by(CachedEmbeddingSet.last_used_at.asc())
            .limit(limit)
        )
        rows = await self._session.execute(query)
        return list(map(_as_eviction_candidate, rows))
    async def select_coldest(self, *, limit: int) -> list[EvictionCandidate]:
        """Return up to ``limit`` rows, least reused first, then least recently used."""
        coldness = (
            CachedEmbeddingSet.times_reused.asc(),
            CachedEmbeddingSet.last_used_at.asc(),
        )
        query = select(*_EVICTION_COLUMNS).order_by(*coldness).limit(limit)
        rows = await self._session.execute(query)
        return list(map(_as_eviction_candidate, rows))
    async def delete_by_ids(self, ids: list[int]) -> None:
        """Delete the rows with the given ids; an empty list issues no statement."""
        if ids:
            removal = delete(CachedEmbeddingSet).where(CachedEmbeddingSet.id.in_(ids))
            await self._session.execute(removal)
            await self._session.commit()
--- a/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
@ -0,0 +1,12 @@
 """Pure value objects for the embedding cache."""
 from __future__ import annotations
 from .embedding_key import EmbeddingKey
 from .embedding_set import CachedChunk, EmbeddingSet
 __all__ = [
    "CachedChunk",
    "EmbeddingKey",
    "EmbeddingSet",
 ]
--- a/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
@ -0,0 +1,27 @@
 """Identity of a cacheable embedding set: equal keys yield identical vectors.
 Embeddings depend on the markdown text, the embedding model, and the chunker --
 never on how the markdown was produced. So the key is the markdown's own hash
 plus the model and chunker recipe, not the upstream parse identity.
 """
 from __future__ import annotations
 import hashlib
 from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
 class EmbeddingKey:
    markdown_sha256: str
    embedding_model: str
    embedding_dim: int
    chunker_kind: str
    chunker_version: int
    @property
    def object_suffix(self) -> str:
        # Fingerprint the model so distinct models never share a blob, while the
        # markdown hash (the object's folder) stays human-readable.
        fingerprint = hashlib.sha256(self.embedding_model.encode("utf-8")).hexdigest()
        return f"{fingerprint[:16]}.{self.chunker_kind}.v{self.chunker_version}.emb"
--- a/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py
@ -0,0 +1,29 @@
 """The cached payload: a document's chunk texts paired with their vectors."""
 from __future__ import annotations
 from dataclasses import dataclass
 import numpy as np
@dataclass(frozen=True, slots=True)
 class CachedChunk:
    """One chunk of a cached document: its text paired with its vector."""
    text: str
    embedding: np.ndarray
@dataclass(frozen=True, slots=True)
 class EmbeddingSet:
    """Everything the indexer needs to rebuild a document's chunks without embedding.
    ``summary_embedding`` is the document-level vector; ``chunks`` are the ordered
    chunk texts and their vectors.
    """
    summary_embedding: np.ndarray
    chunks: list[CachedChunk]
    @property
    def chunk_count(self) -> int:
        """Number of chunks in the set; the summary vector is not counted."""
        return len(self.chunks)
--- a/surfsense_backend/app/indexing_pipeline/cache/serialization.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/serialization.py
@ -0,0 +1,75 @@
 """Serialize an EmbeddingSet to a compact, self-describing blob (no pickle).
 Layout: ``MAGIC | uint32 header_len | json header | float32 matrix``. The header
 carries the dim, chunk count, and ordered chunk texts; the matrix holds the
 summary vector followed by one row per chunk, all float32 for compactness.
 """
 from __future__ import annotations
 import json
 import struct
 import numpy as np
 from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingSet
 # Marker at the start of every blob: "SurfSense EMBeddings, version 1" -> SSEMB1.
 # Lets us reject foreign blobs and bump the trailing digit if the layout ever changes.
 _MAGIC = b"SSEMB1"
 # 4-byte big-endian unsigned int written before the variable-length JSON header,
 # so the reader knows where the header ends and the float matrix begins.
 # NOTE: only this length prefix has an explicit byte order; the matrix is
 # written with ``ndarray.tobytes`` using dtype ``np.float32``.
 _HEADER_LEN = struct.Struct(">I")
 def serialize(embedding_set: EmbeddingSet) -> bytes:
    """Pack an embedding set into ``MAGIC | header length | JSON header | float32 matrix``.

    Row 0 of the matrix is the summary vector; each following row is one
    chunk's vector, in chunk order. The JSON header records the dimension,
    the chunk count and the ordered chunk texts.

    Raises:
        ValueError: If a chunk vector's length differs from the summary's.
    """
    def _flat(vector) -> np.ndarray:
        # Every vector is stored as a 1-D float32 row.
        return np.asarray(vector, dtype=np.float32).reshape(-1)
    summary_row = _flat(embedding_set.summary_embedding)
    dim = int(summary_row.shape[0])
    matrix_rows = [summary_row]
    chunk_texts: list[str] = []
    for chunk in embedding_set.chunks:
        row = _flat(chunk.embedding)
        if row.shape[0] != dim:
            raise ValueError(
                "All vectors in an embedding set must share one dimension."
            )
        matrix_rows.append(row)
        chunk_texts.append(chunk.text)
    payload = np.stack(matrix_rows, axis=0).tobytes(order="C")
    header_fields = {"dim": dim, "count": len(chunk_texts), "texts": chunk_texts}
    header = json.dumps(header_fields, ensure_ascii=False).encode("utf-8")
    return _MAGIC + _HEADER_LEN.pack(len(header)) + header + payload
 def deserialize(blob: bytes) -> EmbeddingSet:
    """Rebuild an ``EmbeddingSet`` from a blob laid out as ``serialize`` writes it.

    Every malformed input is reported as ``ValueError`` (never ``struct.error``,
    ``KeyError``, ``TypeError`` or ``IndexError``), so a caller can treat any
    failure uniformly. The returned vectors are rows of one array created with
    ``np.frombuffer`` over ``blob``.

    Args:
        blob: Raw bytes: magic, header length, JSON header, float32 matrix.

    Returns:
        The summary vector (matrix row 0) and one ``CachedChunk`` per text.

    Raises:
        ValueError: Unknown magic, truncated prefix or header, unparsable or
            incomplete header, a ``texts`` list whose length differs from
            ``count``, or a matrix whose size is not ``(count + 1) * dim``.
    """
    corrupt = "Embedding cache blob is truncated or corrupt."
    view = memoryview(blob)
    magic_len = len(_MAGIC)
    if bytes(view[:magic_len]) != _MAGIC:
        raise ValueError("Unrecognized embedding cache blob.")
    header_start = magic_len + _HEADER_LEN.size
    if view.nbytes < header_start:
        # Too short to even hold the header-length field.
        raise ValueError(corrupt)
    (header_len,) = _HEADER_LEN.unpack_from(view, magic_len)
    header_end = header_start + header_len
    if view.nbytes < header_end:
        raise ValueError(corrupt)
    try:
        header = json.loads(bytes(view[header_start:header_end]).decode("utf-8"))
        dim = int(header["dim"])
        count = int(header["count"])
        texts = header["texts"]
    except (KeyError, TypeError, ValueError) as exc:
        # Covers bad UTF-8 / JSON (both ValueError subclasses), a non-object
        # header, and missing or non-numeric fields.
        raise ValueError(corrupt) from exc
    if dim < 0 or not isinstance(texts, list) or len(texts) != count:
        # Without this, a short ``texts`` list fails later with IndexError and
        # a long one is silently cut off.
        raise ValueError(corrupt)
    matrix = np.frombuffer(view[header_end:], dtype=np.float32)
    if matrix.shape[0] != (count + 1) * dim:
        raise ValueError(corrupt)
    matrix = matrix.reshape(count + 1, dim)
    return EmbeddingSet(
        summary_embedding=matrix[0],
        chunks=[
            CachedChunk(text=text, embedding=matrix[row])
            for row, text in enumerate(texts, start=1)
        ],
    )
--- a/surfsense_backend/app/indexing_pipeline/cache/service.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/service.py
@ -0,0 +1,51 @@
 """Recall and remember embedding sets, coordinating the index and blob store."""
 from __future__ import annotations
 import logging
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.indexing_pipeline.cache.persistence import CachedEmbeddingSetRepository
 from app.indexing_pipeline.cache.schemas import EmbeddingKey, EmbeddingSet
 from app.indexing_pipeline.cache.storage import EmbeddingCacheStore
 logger = logging.getLogger(__name__)
 class EmbeddingCacheService:
    """Facade that pairs the cache index (database rows) with the blob store."""
    def __init__(self, session: AsyncSession) -> None:
        self._index = CachedEmbeddingSetRepository(session)
        self._store = EmbeddingCacheStore()
    async def recall(self, key: EmbeddingKey) -> EmbeddingSet | None:
        """Return the cached set for ``key``, or None when it cannot be served.

        None covers three cases: no index row, a blob that fails to load, and
        a blob whose summary vector length differs from ``key.embedding_dim``.
        Only a served hit updates the row's usage statistics.
        """
        row = await self._index.get(key)
        if row is None:
            return None
        try:
            cached = await self._store.load(row.storage_key)
        except Exception:
            # Index points at a blob that is gone; treat as a miss and re-embed.
            logger.warning("Cache blob missing: %s", row.storage_key, exc_info=True)
            return None
        cached_dim = int(cached.summary_embedding.shape[0])
        if cached_dim == key.embedding_dim:
            await self._index.mark_used(row.id)
            return cached
        # A model swapped its dimension under a reused name; never serve it.
        logger.warning("Cached embedding dimension mismatch: %s", row.storage_key)
        return None
    async def remember(self, key: EmbeddingKey, embedding_set: EmbeddingSet) -> None:
        """Write the blob first, then record it in the index."""
        saved = await self._store.save(key, embedding_set)
        storage_key, size_bytes = saved
        await self._index.insert(
            key=key,
            storage_backend=self._store.backend_name,
            storage_key=storage_key,
            size_bytes=size_bytes,
            chunk_count=embedding_set.chunk_count,
        )
--- a/surfsense_backend/app/indexing_pipeline/cache/settings.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/settings.py
@ -0,0 +1,30 @@
 """Embedding-cache configuration resolved from the central ``Config``.
 The blob backend is intentionally not configured here: it is shared with the ETL
 parse cache (see ``ETL_CACHE_STORAGE_*``).
 """
 from __future__ import annotations
 from dataclasses import dataclass
@dataclass(frozen=True)
 class EmbeddingCacheSettings:
    """Immutable snapshot of the embedding-cache options."""
    enabled: bool
    chunker_version: int
    ttl_days: int
    max_total_bytes: int  # bytes; ``load_embedding_cache_settings`` converts from MB
    eviction_batch: int
 def load_embedding_cache_settings() -> EmbeddingCacheSettings:
    """Build an ``EmbeddingCacheSettings`` from the ``EMBEDDING_CACHE_*`` config values."""
    # The config module is imported at call time rather than at module import.
    from app.config import config as app_config
    kib = 1024
    return EmbeddingCacheSettings(
        enabled=app_config.EMBEDDING_CACHE_ENABLED,
        chunker_version=app_config.EMBEDDING_CACHE_CHUNKER_VERSION,
        ttl_days=app_config.EMBEDDING_CACHE_TTL_DAYS,
        # Configured in megabytes, exposed in bytes.
        max_total_bytes=app_config.EMBEDDING_CACHE_MAX_TOTAL_MB * kib * kib,
        eviction_batch=app_config.EMBEDDING_CACHE_EVICTION_BATCH,
    )
--- a/surfsense_backend/app/indexing_pipeline/cache/storage/__init__.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/storage/__init__.py
@ -0,0 +1,9 @@
 """Blob storage for cached embedding sets."""
 from __future__ import annotations
 from .embedding_store import EmbeddingCacheStore
 __all__ = [
    "EmbeddingCacheStore",
 ]
--- a/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py
@ -0,0 +1,39 @@
 """Read and write cached embedding blobs through the shared cache backend.
 The blob backend is shared with the ETL parse cache (same bucket / root), so
 markdown and its embeddings live side by side; only the object prefix differs.
 """
 from __future__ import annotations
 from app.etl_pipeline.cache.storage.backend import resolve_cache_backend
 from app.indexing_pipeline.cache.schemas import EmbeddingKey, EmbeddingSet
 from app.indexing_pipeline.cache.serialization import deserialize, serialize
 from app.indexing_pipeline.cache.storage.object_keys import build_embedding_object_key
 _EMBEDDING_CONTENT_TYPE = "application/octet-stream"
 class EmbeddingCacheStore:
    """Blob-level save/load/delete of embedding sets on the backend from ``resolve_cache_backend``."""
    def __init__(self) -> None:
        self._backend = resolve_cache_backend()
    @property
    def backend_name(self) -> str:
        """Name reported by the underlying backend."""
        backend = self._backend
        return backend.backend_name
    async def save(
        self, key: EmbeddingKey, embedding_set: EmbeddingSet
    ) -> tuple[str, int]:
        """Serialize and upload the set; return ``(storage_key, blob size in bytes)``."""
        payload = serialize(embedding_set)
        object_key = build_embedding_object_key(key)
        await self._backend.put(
            object_key, payload, content_type=_EMBEDDING_CONTENT_TYPE
        )
        return object_key, len(payload)
    async def load(self, storage_key: str) -> EmbeddingSet:
        """Read the whole object stream into memory and deserialize it."""
        parts = []
        async for part in self._backend.open_stream(storage_key):
            parts.append(part)
        return deserialize(b"".join(parts))
    async def delete(self, storage_key: str) -> None:
        """Remove the object stored under ``storage_key``."""
        await self._backend.delete(storage_key)
--- a/surfsense_backend/app/indexing_pipeline/cache/storage/object_keys.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/storage/object_keys.py
@ -0,0 +1,12 @@
 """Object keys for cached embedding sets, namespaced under a dedicated prefix."""
 from __future__ import annotations
 from app.indexing_pipeline.cache.schemas import EmbeddingKey
 CACHE_PREFIX = "embedding_cache"
 def build_embedding_object_key(key: EmbeddingKey) -> str:
    """Return ``<CACHE_PREFIX>/<markdown sha256>/<object suffix>`` for ``key``."""
    # Content-addressed: identical markdown + recipe always map to the same key.
    folder = key.markdown_sha256
    object_name = key.object_suffix
    return f"{CACHE_PREFIX}/{folder}/{object_name}"
--- a/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py
+++ b/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py
@ -0,0 +1,56 @@
 """Diff a document's existing chunk rows against its freshly chunked texts.
 Embeddings are a pure function of chunk text, so a row whose content reappears
 in the new chunking keeps its embedding (and its HNSW/GIN index entries); only
 genuinely new texts are embedded and only vanished rows are deleted. Matching
 is a greedy multiset match on content in document order, so duplicate
 boilerplate chunks pair up one-to-one and reordered chunks become cheap
 position updates instead of delete+reinsert.
 """
 from __future__ import annotations
 from collections import defaultdict, deque
 from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
 class ExistingChunk:
    """Snapshot of a stored chunk row, reduced to what ``reconcile`` reads."""
    id: int  # id of the stored chunk row
    content: str  # chunk text; the key rows are matched on
    position: int  # position currently stored for the row
@dataclass(frozen=True, slots=True)
 class ChunkPlan:
    """The minimal set of writes that turns the stored chunks into the new ones.
    ``reused`` holds only kept rows whose position actually changed; rows that
    match in place need no write at all. Kept-row count (for metrics) is
    ``len(existing) - len(to_delete)``.
    """
    reused: list[tuple[int, int]]  # (existing_chunk_id, new_position), moved rows only
    to_embed: list[tuple[int, str]]  # (new_position, text) for texts with no stored match
    to_delete: list[int]  # ids of existing chunks that matched no new text
 def reconcile(existing: list[ExistingChunk], new_texts: list[str]) -> ChunkPlan:
    """Match stored chunks to new texts by content and plan the resulting writes.

    Stored rows are pooled per content in position order. Each new text, taken
    in order, consumes the earliest unused row with the same content; a
    consumed row is reported in ``reused`` only if its position changed. Texts
    with no row left are reported in ``to_embed``; rows never consumed are
    reported in ``to_delete``.
    """
    pool: dict[str, deque[ExistingChunk]] = {}
    for stored in sorted(existing, key=lambda c: c.position):
        pool.setdefault(stored.content, deque()).append(stored)
    moved: list[tuple[int, int]] = []
    missing: list[tuple[int, str]] = []
    for position, text in enumerate(new_texts):
        candidates = pool.get(text)
        if not candidates:
            # No stored row (left) with this content: it must be embedded.
            missing.append((position, text))
            continue
        kept = candidates.popleft()
        if kept.position != position:
            moved.append((kept.id, position))
    leftovers = [row.id for rows in pool.values() for row in rows]
    return ChunkPlan(reused=moved, to_embed=missing, to_delete=leftovers)
--- a/surfsense_backend/app/indexing_pipeline/document_persistence.py
+++ b/surfsense_backend/app/indexing_pipeline/document_persistence.py
@ -1,12 +1,12 @@
 import contextlib
 import logging
 import time
 from datetime import UTC, datetime
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import object_session
 from sqlalchemy.orm.attributes import set_committed_value
-from app.db import Document, DocumentStatus
+from app.db import Chunk, Document, DocumentStatus
 logger = logging.getLogger(__name__)
@ -22,7 +22,6 @@ async def rollback_and_persist_failure(
    try:
        await session.rollback()
    except Exception:
        # Session is completely dead; surface it but never raise.
        logger.warning(
            "Rollback failed; cannot persist failed status for document %s",
            getattr(document, "id", "unknown"),
@ -35,8 +34,6 @@ async def rollback_and_persist_failure(
        document.status = DocumentStatus.failed(message)
        await session.commit()
    except Exception:
        # Best-effort: the document stays non-ready and is retried next sync.
        # Log it so a permanently-stuck document is at least traceable.
        logger.warning(
            "Could not persist failed status for document %s; will retry next sync",
            getattr(document, "id", "unknown"),
@ -46,12 +43,60 @@ async def rollback_and_persist_failure(
            await session.rollback()
-def attach_chunks_to_document(document: Document, chunks: list) -> None:
+async def persist_scratch_index(
-    """Assign chunks to a document without triggering SQLAlchemy async lazy loading."""
+    session: AsyncSession,
    document: Document,
    content: str,
    chunks: list[Chunk],
    *,
    batch_size: int,
    perf: logging.Logger,
 ) -> None:
    """Commit document content first, then chunk rows in batches, then mark ready."""
    if document.id is None:
        raise ValueError("document.id is required to persist chunks")
    document.content = content
    document.updated_at = datetime.now(UTC)
    await session.commit()
    t_persist = time.perf_counter()
    total = len(chunks)
    if total == 0:
        set_committed_value(document, "chunks", [])
        document.status = DocumentStatus.ready()
        document.updated_at = datetime.now(UTC)
        await session.commit()
        return
    effective_batch = total if batch_size <= 0 else batch_size
    num_batches = (total + effective_batch - 1) // effective_batch
    doc_id = document.id
    for batch_idx, start in enumerate(range(0, total, effective_batch), start=1):
        batch = chunks[start : start + effective_batch]
        t_batch = time.perf_counter()
        for chunk in batch:
            chunk.document_id = doc_id
        session.add_all(batch)
        await session.commit()
        perf.info(
            "[indexing] chunk batch doc=%d batch=%d/%d rows=%d in %.3fs",
            doc_id,
            batch_idx,
            num_batches,
            len(batch),
            time.perf_counter() - t_batch,
        )
    set_committed_value(document, "chunks", chunks)
-    session = object_session(document)
+    document.status = DocumentStatus.ready()
-    if session is not None:
+    document.updated_at = datetime.now(UTC)
-        if document.id is not None:
+    await session.commit()
-            for chunk in chunks:
+    perf.info(
-                chunk.document_id = document.id
+        "[indexing] chunk persist doc=%d chunks=%d batches=%d in %.3fs",
-        session.add_all(chunks)
+        doc_id,
        total,
        num_batches,
        time.perf_counter() - t_persist,
    )
--- a/surfsense_backend/app/indexing_pipeline/exceptions.py
+++ b/surfsense_backend/app/indexing_pipeline/exceptions.py
@ -14,6 +14,8 @@ from litellm.exceptions import (
 )
 from sqlalchemy.exc import IntegrityError as IntegrityError
 from app.services.llm_error_adapter import LLMErrorCategory, adapt_llm_exception
 # Tuples for use directly in except clauses.
 RETRYABLE_LLM_ERRORS = (
    RateLimitError,
@ -97,38 +99,20 @@ def safe_exception_message(exc: Exception) -> str:
 def llm_retryable_message(exc: Exception) -> str:
    try:
-        if isinstance(exc, RateLimitError):
+        adapted = adapt_llm_exception(exc)
-            return PipelineMessages.RATE_LIMIT
+        if adapted.category is LLMErrorCategory.UNKNOWN:
-        if isinstance(exc, Timeout):
+            return safe_exception_message(exc)
-            return PipelineMessages.LLM_TIMEOUT
+        return adapted.user_message
        if isinstance(exc, ServiceUnavailableError):
            return PipelineMessages.LLM_UNAVAILABLE
        if isinstance(exc, BadGatewayError):
            return PipelineMessages.LLM_BAD_GATEWAY
        if isinstance(exc, InternalServerError):
            return PipelineMessages.LLM_SERVER_ERROR
        if isinstance(exc, APIConnectionError):
            return PipelineMessages.LLM_CONNECTION
        return safe_exception_message(exc)
    except Exception:
        return "Something went wrong when calling the LLM."
 def llm_permanent_message(exc: Exception) -> str:
    try:
-        if isinstance(exc, AuthenticationError):
+        adapted = adapt_llm_exception(exc)
-            return PipelineMessages.LLM_AUTH
+        if adapted.category is LLMErrorCategory.UNKNOWN:
-        if isinstance(exc, PermissionDeniedError):
+            return safe_exception_message(exc)
-            return PipelineMessages.LLM_PERMISSION
+        return adapted.user_message
        if isinstance(exc, NotFoundError):
            return PipelineMessages.LLM_NOT_FOUND
        if isinstance(exc, BadRequestError):
            return PipelineMessages.LLM_BAD_REQUEST
        if isinstance(exc, UnprocessableEntityError):
            return PipelineMessages.LLM_UNPROCESSABLE
        if isinstance(exc, APIResponseValidationError):
            return PipelineMessages.LLM_RESPONSE
        return safe_exception_message(exc)
    except Exception:
        return "Something went wrong when calling the LLM."
--- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
+++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
@ -8,7 +8,7 @@ from collections.abc import Awaitable, Callable
 from dataclasses import dataclass, field
 from datetime import UTC, datetime
-from sqlalchemy import delete, select
+from sqlalchemy import delete, select, update
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.asyncio import AsyncSession
@ -19,16 +19,17 @@ from app.db import (
    DocumentStatus,
    DocumentType,
 )
 from app.indexing_pipeline.cache import build_chunk_embeddings
 from app.indexing_pipeline.cache.cached_indexing import chunk_markdown, embed_batch
 from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile
 from app.indexing_pipeline.connector_document import ConnectorDocument
 from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
 from app.indexing_pipeline.document_embedder import embed_texts
 from app.indexing_pipeline.document_hashing import (
    compute_content_hash,
    compute_identifier_hash,
    compute_unique_identifier_hash,
 )
 from app.indexing_pipeline.document_persistence import (
-    attach_chunks_to_document,
+    persist_scratch_index,
    rollback_and_persist_failure,
 )
 from app.indexing_pipeline.exceptions import (
@ -380,53 +381,50 @@ class IndexingPipelineService:
            content = connector_doc.source_markdown
            await self.session.execute(
                delete(Chunk).where(Chunk.document_id == document.id)
            )
            t_step = time.perf_counter()
-            if connector_doc.should_use_code_chunker:
+            existing = await self._load_existing_chunks(document.id)
-                chunk_texts = await asyncio.to_thread(
+            if existing and self._reconcile_enabled():
-                    chunk_text,
+                chunk_count = await self._reindex_incrementally(
-                    connector_doc.source_markdown,
+                    document, content, connector_doc, existing
                    use_code_chunker=True,
                )
                perf.info(
                    "[indexing] chunk+embed doc=%d chunks=%d in %.3fs",
                    document.id,
                    chunk_count,
                    time.perf_counter() - t_step,
                )
                document.content = content
                document.updated_at = datetime.now(UTC)
                document.status = DocumentStatus.ready()
                await self.session.commit()
            else:
-                # Use the table-aware hybrid chunker so Markdown tables are not
+                from app.config import config
-                # split mid-row (see issue #1334).
+
-                chunk_texts = await asyncio.to_thread(
+                chunks = await self._reindex_from_scratch(
-                    chunk_text_hybrid,
+                    document, content, connector_doc
-                    connector_doc.source_markdown,
+                )
                chunk_count = len(chunks)
                perf.info(
                    "[indexing] chunk+embed doc=%d chunks=%d in %.3fs",
                    document.id,
                    chunk_count,
                    time.perf_counter() - t_step,
                )
                await persist_scratch_index(
                    self.session,
                    document,
                    content,
                    chunks,
                    batch_size=config.INDEXING_CHUNK_INSERT_BATCH_SIZE,
                    perf=perf,
                )
            texts_to_embed = [content, *chunk_texts]
            embeddings = await asyncio.to_thread(embed_texts, texts_to_embed)
            summary_embedding, *chunk_embeddings = embeddings
            chunks = [
                Chunk(content=text, embedding=emb)
                for text, emb in zip(chunk_texts, chunk_embeddings, strict=False)
            ]
            perf.info(
                "[indexing] chunk+embed doc=%d chunks=%d in %.3fs",
                document.id,
                len(chunks),
                time.perf_counter() - t_step,
            )
            document.content = content
            document.embedding = summary_embedding
            attach_chunks_to_document(document, chunks)
            document.updated_at = datetime.now(UTC)
            document.status = DocumentStatus.ready()
            await self.session.commit()
            perf.info(
                "[indexing] index TOTAL doc=%d chunks=%d in %.3fs",
                document.id,
-                len(chunks),
+                chunk_count,
                time.perf_counter() - t_index,
            )
-            log_index_success(ctx, chunk_count=len(chunks))
+            log_index_success(ctx, chunk_count=chunk_count)
            outcome_status = "success"
            await self._enqueue_ai_sort_if_enabled(document)
@ -483,6 +481,89 @@ class IndexingPipelineService:
        persist_span_cm.__exit__(*sys.exc_info())
        return document
    @staticmethod
    def _reconcile_enabled() -> bool:
        """Report whether incremental chunk reconciliation is switched on.

        The value is read from ``config.CHUNK_RECONCILE_ENABLED`` on every
        call; the config module is imported inside the method body.
        """
        from app.config import config as app_settings

        enabled = app_settings.CHUNK_RECONCILE_ENABLED
        return enabled
    async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]:
        """Fetch the stored chunks of one document as lightweight records.

        Only the ``id``, ``content`` and ``position`` columns are selected.

        Args:
            document_id: Value matched against ``Chunk.document_id``.

        Returns:
            One ``ExistingChunk`` per matching row, in the order the database
            returned them (the query carries no ``ORDER BY``).
        """
        stmt = select(Chunk.id, Chunk.content, Chunk.position).where(
            Chunk.document_id == document_id
        )
        rows = await self.session.execute(stmt)

        snapshot: list[ExistingChunk] = []
        for row in rows:
            snapshot.append(
                ExistingChunk(id=row.id, content=row.content, position=row.position)
            )
        return snapshot
    async def _reindex_from_scratch(
        self, document: Document, content: str, connector_doc: ConnectorDocument
    ) -> list[Chunk]:
        """Drop every stored chunk of ``document`` and build a fresh set.

        Issues a ``DELETE`` for the document's chunk rows, then chunks and
        embeds ``content`` through ``build_chunk_embeddings`` and stores the
        returned summary embedding on ``document.embedding``.

        Args:
            document: Document being re-indexed; its ``embedding`` is updated.
            content: Text to chunk and embed.
            connector_doc: Supplies ``should_use_code_chunker`` for the chunker.

        Returns:
            New ``Chunk`` objects whose ``position`` counts up from 0 in the
            order the chunk/embedding pairs were produced. They are not added
            to the session here.
        """
        purge_stmt = delete(Chunk).where(Chunk.document_id == document.id)
        await self.session.execute(purge_stmt)

        summary_vector, pairs = await build_chunk_embeddings(
            content,
            use_code_chunker=connector_doc.should_use_code_chunker,
        )
        document.embedding = summary_vector

        fresh_chunks: list[Chunk] = []
        for index, (chunk_text, vector) in enumerate(pairs):
            fresh_chunks.append(
                Chunk(content=chunk_text, embedding=vector, position=index)
            )
        return fresh_chunks
    async def _reindex_incrementally(
        self,
        document: Document,
        content: str,
        connector_doc: ConnectorDocument,
        existing: list[ExistingChunk],
    ) -> int:
        """Edit path: keep rows whose text survived, embed only new texts.

        Unchanged rows keep their embedding and their HNSW/GIN index entries;
        moved rows get a position-only UPDATE, which touches neither index.

        Args:
            document: Document being re-indexed; its ``embedding`` is replaced
                with the freshly computed summary vector.
            content: New full text of the document.
            connector_doc: Supplies ``should_use_code_chunker`` for chunking.
            existing: Chunk rows currently stored for the document.

        Returns:
            The number of chunks the new content was split into.

        Raises:
            ValueError: From ``zip(..., strict=True)`` if the number of chunk
                embeddings returned differs from the number of texts in
                ``plan.to_embed``.
        """
        new_texts = await chunk_markdown(
            content, use_code_chunker=connector_doc.should_use_code_chunker
        )
        plan = reconcile(existing, new_texts)
        # One batch: the document-level summary vector plus the missing chunks.
        # ``plan.to_embed`` is unpacked as (position, text) pairs; only the text
        # is sent for embedding.
        embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]])
        # First vector belongs to the whole document, the rest line up with
        # ``plan.to_embed`` in order.
        summary_embedding, *new_embeddings = embeddings
        if plan.reused:
            # ``plan.reused`` is unpacked as (chunk id, position) pairs. Passing
            # a list of dicts alongside ``update(Chunk)`` is presumably
            # SQLAlchemy's bulk UPDATE keyed on ``id`` -- confirm against the
            # ORM docs.
            await self.session.execute(
                update(Chunk),
                [{"id": cid, "position": pos} for cid, pos in plan.reused],
            )
        if plan.to_delete:
            await self.session.execute(
                delete(Chunk).where(Chunk.id.in_(plan.to_delete))
            )
        # The generator is consumed inside ``add_all``; ``strict=True`` makes a
        # length mismatch between texts and embeddings raise instead of
        # silently dropping chunks.
        self.session.add_all(
            Chunk(
                content=text,
                embedding=emb,
                position=pos,
                document_id=document.id,
            )
            for (pos, text), emb in zip(plan.to_embed, new_embeddings, strict=True)
        )
        document.embedding = summary_embedding
        # "reused" is reported as every pre-existing row that was not deleted,
        # not as ``len(plan.reused)``.
        ot_metrics.record_chunk_reconcile(
            reused=len(existing) - len(plan.to_delete),
            embedded=len(plan.to_embed),
            deleted=len(plan.to_delete),
        )
        return len(new_texts)
    async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
        """Fire-and-forget: enqueue incremental AI sort if the search space has it enabled."""
        try:
--- a/surfsense_backend/app/notifications/constants.py
+++ b/surfsense_backend/app/notifications/constants.py
@ -2,6 +2,9 @@
 from __future__ import annotations
 # Matches notifications.title VARCHAR(200).
 TITLE_MAX_LENGTH = 200
 # Notifications newer than this are live-synced; older ones load via the list endpoint.
 SYNC_WINDOW_DAYS = 14
--- a/surfsense_backend/app/notifications/service/handlers/document_processing.py
+++ b/surfsense_backend/app/notifications/service/handlers/document_processing.py
@ -28,7 +28,7 @@ class DocumentProcessingNotificationHandler(BaseNotificationHandler):
    ) -> Notification:
        """Open the notification when document processing is queued."""
        operation_id = msg.operation_id(document_type, document_name, search_space_id)
-        title = f"Processing: {document_name}"
+        title = msg.started_title(document_name)
        message = "Waiting in queue"
        metadata = {
--- a/surfsense_backend/app/notifications/service/messages/document_processing.py
+++ b/surfsense_backend/app/notifications/service/messages/document_processing.py
@ -6,6 +6,8 @@ import hashlib
 from datetime import UTC, datetime
 from typing import Any
 from app.notifications.service.messages.text import format_title
 def operation_id(document_type: str, filename: str, search_space_id: int) -> str:
    """Build a unique id for a document processing run."""
@ -14,6 +16,11 @@ def operation_id(document_type: str, filename: str, search_space_id: int) -> str
    return f"doc_{document_type}_{search_space_id}_{timestamp}_{filename_hash}"
 def started_title(document_name: str) -> str:
    """Return the notification title used while a document waits in the queue.

    Delegates to ``format_title`` with the ``"Processing: "`` prefix.
    """
    queued_prefix = "Processing: "
    return format_title(queued_prefix, document_name)
 def progress(
    stage: str,
    stage_message: str | None = None,
@ -44,11 +51,11 @@ def completion(
 ) -> tuple[str, str, str, dict[str, Any]]:
    """Compute the final title, message, status, and metadata for a finished run."""
    if error_message:
-        title = f"Failed: {document_name}"
+        title = format_title("Failed: ", document_name)
        message = f"Processing failed: {error_message}"
        status = "failed"
    else:
-        title = f"Ready: {document_name}"
+        title = format_title("Ready: ", document_name)
        message = "Now searchable!"
        status = "completed"
--- a/surfsense_backend/app/notifications/service/messages/text.py
+++ b/surfsense_backend/app/notifications/service/messages/text.py
@ -2,7 +2,21 @@
 from __future__ import annotations
 from app.notifications.constants import TITLE_MAX_LENGTH
 def truncate(text: str, limit: int) -> str:
    """Cut ``text`` to its first ``limit`` characters, marking the cut with ``...``.

    Text no longer than ``limit`` is returned unchanged. The ellipsis is
    appended after the cut, so a shortened result can be longer than
    ``limit``.
    """
    if len(text) <= limit:
        return text
    shortened = text[:limit]
    return f"{shortened}..."
 def format_title(prefix: str, text: str, *, max_length: int = TITLE_MAX_LENGTH) -> str:
    """Compose ``prefix`` + ``text`` so the result fits in ``max_length`` chars.

    Args:
        prefix: Leading label; it is counted against ``max_length``.
        text: Variable part of the title, shortened when it does not fit.
        max_length: Total character budget for the returned title.

    Returns:
        The full title when it fits. Otherwise ``text`` is cut to the
        remaining room, ending in ``...`` when more than three characters of
        room are left. If the prefix alone uses up the budget, the prefix
        sliced to ``max_length`` is returned.
    """
    room = max_length - len(prefix)
    if room <= 0:
        # The prefix leaves no space for any of the text.
        return prefix[:max_length]
    if len(text) > room:
        # With three or fewer characters of room an ellipsis would crowd out
        # the text entirely, so the text is cut without one.
        clipped = text[:room] if room <= 3 else text[: room - 3] + "..."
        return f"{prefix}{clipped}"
    return f"{prefix}{text}"
--- a/surfsense_backend/app/observability/metrics.py
+++ b/surfsense_backend/app/observability/metrics.py
@ -289,6 +289,49 @@ def _etl_extract_outcome():
    )
@lru_cache(maxsize=1)
def _etl_cache_lookups():
    """Return the ETL parse-cache lookup counter.

    ``lru_cache(maxsize=1)`` memoizes the instrument, so it is created on the
    first call and reused afterwards.
    """
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.etl.cache.lookups",
        description="Count of ETL parse-cache lookups by outcome (hit/miss).",
    )
@lru_cache(maxsize=1)
def _etl_cache_evictions():
    """Return the ETL parse-cache eviction counter.

    ``lru_cache(maxsize=1)`` memoizes the instrument, so it is created on the
    first call and reused afterwards.
    """
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.etl.cache.evictions",
        description="Count of ETL parse-cache entries evicted, by phase.",
    )
@lru_cache(maxsize=1)
def _embedding_cache_lookups():
    """Return the embedding-cache lookup counter.

    ``lru_cache(maxsize=1)`` memoizes the instrument, so it is created on the
    first call and reused afterwards.
    """
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.embedding.cache.lookups",
        description="Count of embedding (chunk+embedding) cache lookups by outcome (hit/miss).",
    )
@lru_cache(maxsize=1)
def _embedding_cache_evictions():
    """Return the embedding-cache eviction counter.

    ``lru_cache(maxsize=1)`` memoizes the instrument, so it is created on the
    first call and reused afterwards.
    """
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.embedding.cache.evictions",
        description="Count of embedding cache entries evicted, by phase.",
    )
@lru_cache(maxsize=1)
def _chunk_reconcile_chunks():
    """Return the counter for chunks handled by incremental re-indexing.

    ``lru_cache(maxsize=1)`` memoizes the instrument, so it is created on the
    first call and reused afterwards.
    """
    summary = (
        "Chunks handled by incremental re-indexing, by outcome "
        "(reused/embedded/deleted)."
    )
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.indexing.reconcile.chunks",
        description=summary,
    )
@lru_cache(maxsize=1)
 def _celery_heartbeat_refreshes():
    return _get_meter().create_counter(
@ -670,6 +713,61 @@ def record_etl_extract_outcome(
    )
 def record_etl_cache_lookup(
    *, etl_service: str | None, mode: str | None, outcome: str
 ) -> None:
    """Count one ETL parse-cache lookup.

    Args:
        etl_service: ETL service label; a falsy value is reported as
            ``"unknown"``.
        mode: Mode label; a falsy value is reported as ``"unknown"``.
        outcome: Lookup result, ``hit`` or ``miss``.
    """
    counter = _etl_cache_lookups()
    attributes = {
        "etl.service": etl_service if etl_service else "unknown",
        "mode": mode if mode else "unknown",
        "outcome": outcome,
    }
    _add(counter, 1, attributes)
 def record_etl_cache_eviction(count: int, *, phase: str) -> None:
    """Add ``count`` evicted ETL parse-cache entries to the eviction counter.

    Nothing is recorded when ``count`` is zero or negative.

    Args:
        count: Number of entries evicted.
        phase: Eviction phase, ``ttl`` or ``size``.
    """
    if count > 0:
        _add(_etl_cache_evictions(), count, {"phase": phase})
 def record_embedding_cache_lookup(
    *, embedding_model: str | None, chunker_kind: str | None, outcome: str
 ) -> None:
    """Count one embedding-cache lookup.

    Args:
        embedding_model: Embedding model label; a falsy value is reported as
            ``"unknown"``.
        chunker_kind: Chunker label; a falsy value is reported as
            ``"unknown"``.
        outcome: Lookup result, ``hit`` or ``miss``.
    """
    counter = _embedding_cache_lookups()
    attributes = {
        "embedding.model": embedding_model if embedding_model else "unknown",
        "chunker.kind": chunker_kind if chunker_kind else "unknown",
        "outcome": outcome,
    }
    _add(counter, 1, attributes)
 def record_embedding_cache_eviction(count: int, *, phase: str) -> None:
    """Add ``count`` evicted embedding-cache entries to the eviction counter.

    Nothing is recorded when ``count`` is zero or negative.

    Args:
        count: Number of entries evicted.
        phase: Eviction phase, ``ttl`` or ``size``.
    """
    if count > 0:
        _add(_embedding_cache_evictions(), count, {"phase": phase})
 def record_chunk_reconcile(*, reused: int, embedded: int, deleted: int) -> None:
    """Record the chunk counts of one incremental re-index.

    Each positive count is added to the reconcile counter under its own
    ``outcome`` attribute; zero or negative counts are skipped.

    Args:
        reused: Chunks kept from the previous index.
        embedded: Chunks that were newly embedded.
        deleted: Chunks that were removed.
    """
    tallies = {"reused": reused, "embedded": embedded, "deleted": deleted}
    for outcome, count in tallies.items():
        if count <= 0:
            continue
        _add(_chunk_reconcile_chunks(), count, {"outcome": outcome})
 def record_celery_heartbeat_refresh(*, heartbeat_type: str) -> None:
    """Count one Celery heartbeat refresh, tagged with ``heartbeat_type``."""
    counter = _celery_heartbeat_refreshes()
    attributes = {"heartbeat.type": heartbeat_type}
    _add(counter, 1, attributes)
@ -863,9 +961,14 @@ __all__ = [
    "record_celery_queue_latency",
    "record_chat_request_duration",
    "record_chat_request_outcome",
    "record_chunk_reconcile",
    "record_compaction_run",
    "record_connector_sync_duration",
    "record_connector_sync_outcome",
    "record_embedding_cache_eviction",
    "record_embedding_cache_lookup",
    "record_etl_cache_eviction",
    "record_etl_cache_lookup",
    "record_etl_extract_duration",
    "record_etl_extract_outcome",
    "record_indexing_document_duration",
--- a/surfsense_backend/app/podcasts/api/routes.py
+++ b/surfsense_backend/app/podcasts/api/routes.py
@ -47,6 +47,7 @@ from app.utils.rbac import check_permission
 from .schemas import (
    CreatePodcastRequest,
    LanguageOptions,
    PodcastDetail,
    PodcastSummary,
    UpdateSpecRequest,
@ -114,6 +115,20 @@ async def list_voices(language: str | None = None):
    ]
@router.get("/podcasts/languages", response_model=LanguageOptions)
async def list_languages():
    """List the languages the active TTS provider can offer the brief editor.

    Returns:
        A ``LanguageOptions`` built from the voice catalog's offering for the
        configured provider.

    Raises:
        HTTPException: 503 when ``app_config.TTS_SERVICE`` is not set.
    """
    tts_service = app_config.TTS_SERVICE
    if not tts_service:
        raise HTTPException(status_code=503, detail="No TTS provider configured")

    active_provider = provider_from_service(tts_service)
    catalog = get_voice_catalog()
    offering = catalog.offerable_languages(active_provider)
    return LanguageOptions(
        languages=offering.languages,
        allows_custom=offering.allows_custom,
    )
@router.get("/podcasts/voices/{voice_id}/preview")
 async def preview_voice(
    voice_id: str,
--- a/surfsense_backend/app/podcasts/api/schemas.py
+++ b/surfsense_backend/app/podcasts/api/schemas.py
@ -63,6 +63,17 @@ class VoiceOption(BaseModel):
    gender: str
 class LanguageOptions(BaseModel):
    """The languages the brief editor may offer for the active provider.

    When ``allows_custom`` is true the list is a curated starting point and
    the editor accepts any BCP-47 tag beyond it.
    """
    # Language tags to present up front.
    languages: list[str]
    # True when tags outside ``languages`` are also accepted.
    allows_custom: bool
 class PodcastSummary(BaseModel):
    """Lightweight list item."""
--- a/surfsense_backend/app/podcasts/voices/init.py
+++ b/surfsense_backend/app/podcasts/voices/init.py
@ -6,7 +6,7 @@ configured provider via :func:`provider_from_service`.
 from __future__ import annotations
-from .catalog import VoiceCatalog, get_voice_catalog
+from .catalog import LanguageOffering, VoiceCatalog, get_voice_catalog
 from .preview import render_voice_preview
 from .provider import TtsProvider, provider_from_service
 from .voice import ANY_LANGUAGE, CatalogVoice, VoiceGender
@ -14,6 +14,7 @@ from .voice import ANY_LANGUAGE, CatalogVoice, VoiceGender
 __all__ = [
    "ANY_LANGUAGE",
    "CatalogVoice",
    "LanguageOffering",
    "TtsProvider",
    "VoiceCatalog",
    "VoiceGender",
--- a/surfsense_backend/app/podcasts/voices/catalog.py
+++ b/surfsense_backend/app/podcasts/voices/catalog.py
@ -9,11 +9,26 @@ provider-native reference.
 from __future__ import annotations
 from collections.abc import Iterable
 from dataclasses import dataclass
 from functools import lru_cache
 from .data import AZURE_VOICES, KOKORO_VOICES, OPENAI_VOICES, VERTEX_VOICES
 from .data.languages import COMMON_LANGUAGES
 from .provider import TtsProvider
-from .voice import CatalogVoice
+from .voice import ANY_LANGUAGE, CatalogVoice
@dataclass(frozen=True, slots=True)
 class LanguageOffering:
    """The languages a provider's roster can offer the brief form.

    ``allows_custom`` is true when the roster has wildcard voices: the listed
    languages are then a curated starting point, not a limit, and any BCP-47
    tag may be entered.
    """
    # Language tags offered up front.
    languages: list[str]
    # True when the roster contains at least one wildcard voice.
    allows_custom: bool
 class VoiceCatalog:
@ -44,6 +59,20 @@ class VoiceCatalog:
        """Whether ``provider`` has at least one voice for ``language``."""
        return any(v.speaks(language) for v in self.for_provider(provider))
    def offerable_languages(self, provider: TtsProvider) -> LanguageOffering:
        """Work out which languages ``provider`` can present up front.

        Voices bound to a language contribute that language's tag. A voice
        whose language is ``ANY_LANGUAGE`` cannot name its languages, so when
        one is present the curated ``COMMON_LANGUAGES`` are merged in and
        custom entry is allowed.

        Args:
            provider: The TTS provider whose roster is inspected.

        Returns:
            A ``LanguageOffering`` with the sorted, de-duplicated tags and the
            wildcard flag as ``allows_custom``.
        """
        roster = self.for_provider(provider)

        concrete_tags = set()
        for voice in roster:
            if voice.language != ANY_LANGUAGE:
                concrete_tags.add(voice.language)

        wildcard_present = False
        for voice in roster:
            if voice.language == ANY_LANGUAGE:
                wildcard_present = True
                break

        if wildcard_present:
            concrete_tags.update(COMMON_LANGUAGES)

        return LanguageOffering(
            languages=sorted(concrete_tags),
            allows_custom=wildcard_present,
        )
@lru_cache(maxsize=1)
 def get_voice_catalog() -> VoiceCatalog:
--- a/surfsense_backend/app/podcasts/voices/data/languages.py
+++ b/surfsense_backend/app/podcasts/voices/data/languages.py
@ -0,0 +1,33 @@
 """Curated languages offered when a roster has wildcard (any-language) voices.
 OpenAI-style multilingual voices speak whatever language the text is in, so
 there is no provider list to enumerate. This is the set the brief form offers
 up front for such providers; it is an offering, not a limit — the API flags
 ``allows_custom`` so users can enter any BCP-47 tag beyond it.
 """
 from __future__ import annotations
 # Two-letter language tags, listed alphabetically.
 COMMON_LANGUAGES: tuple[str, ...] = (
    "ar",
    "bn",
    "de",
    "en",
    "es",
    "fr",
    "hi",
    "id",
    "it",
    "ja",
    "ko",
    "nl",
    "pl",
    "pt",
    "ru",
    "sw",
    "th",
    "tr",
    "uk",
    "vi",
    "zh",
 )
--- a/surfsense_backend/app/prompts/default_system_instructions.py
+++ b/surfsense_backend/app/prompts/default_system_instructions.py
@ -82,7 +82,7 @@ def build_configurable_system_prompt(
    *,
    model_name: str | None = None,
 ) -> str:
-    """Build a configurable SurfSense system prompt (NewLLMConfig path).
+    """Build a configurable SurfSense system prompt.
    See :func:`app.prompts.system_prompt_composer.composer.compose_system_prompt`
    for full parameter docs.
@ -104,7 +104,7 @@ def build_configurable_system_prompt(
 def get_default_system_instructions() -> str:
    """Return the default ``<system_instruction>`` block (no tools / citations).
-    Useful for populating the UI when seeding ``NewLLMConfig.system_instructions``.
+    Useful for populating the UI when editing custom system instructions.
    The output reflects the current fragment tree, not a baked-in constant.
    """
    resolved_today = datetime.now(UTC).date().isoformat()
--- a/surfsense_backend/app/prompts/system_prompt_composer/composer.py
+++ b/surfsense_backend/app/prompts/system_prompt_composer/composer.py
@ -348,8 +348,7 @@ def compose_system_prompt(
        mcp_connector_tools: ``{server_name: [tool_names...]}`` to inject
            an explicit MCP routing block.
        custom_system_instructions: Free-form instructions that override
-            the default ``<system_instruction>`` block (legacy support
+            the default ``<system_instruction>`` block.
            (Legacy support for ``NewLLMConfig.system_instructions``.)
        use_default_system_instructions: When ``custom_system_instructions``
            is empty/None, fall back to defaults (legacy semantics).
        citations_enabled: Include ``citations_on.md`` (true) or
--- a/surfsense_backend/app/retriever/chunks_hybrid_search.py
+++ b/surfsense_backend/app/retriever/chunks_hybrid_search.py
@ -420,7 +420,10 @@ class ChucksHybridSearchRetriever:
            select(
                Chunk.id.label("chunk_id"),
                func.row_number()
-                .over(partition_by=Chunk.document_id, order_by=Chunk.id)
+                .over(
                    partition_by=Chunk.document_id,
                    order_by=(Chunk.position, Chunk.id),
                )
                .label("rn"),
            )
            .where(Chunk.document_id.in_(doc_ids))
@ -441,7 +444,7 @@ class ChucksHybridSearchRetriever:
            select(Chunk.id, Chunk.content, Chunk.document_id)
            .join(numbered, Chunk.id == numbered.c.chunk_id)
            .where(chunk_filter)
-            .order_by(Chunk.document_id, Chunk.id)
+            .order_by(Chunk.document_id, Chunk.position, Chunk.id)
        )
        t_fetch = time.perf_counter()
--- a/surfsense_backend/app/retriever/documents_hybrid_search.py
+++ b/surfsense_backend/app/retriever/documents_hybrid_search.py
@ -357,7 +357,10 @@ class DocumentHybridSearchRetriever:
            select(
                Chunk.id.label("chunk_id"),
                func.row_number()
-                .over(partition_by=Chunk.document_id, order_by=Chunk.id)
+                .over(
                    partition_by=Chunk.document_id,
                    order_by=(Chunk.position, Chunk.id),
                )
                .label("rn"),
            )
            .where(Chunk.document_id.in_(doc_ids))
@ -369,7 +372,7 @@ class DocumentHybridSearchRetriever:
            select(Chunk.id, Chunk.content, Chunk.document_id)
            .join(numbered, Chunk.id == numbered.c.chunk_id)
            .where(numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC)
-            .order_by(Chunk.document_id, Chunk.id)
+            .order_by(Chunk.document_id, Chunk.position, Chunk.id)
        )
        t_fetch = time.perf_counter()
--- a/surfsense_backend/app/routes/init.py
+++ b/surfsense_backend/app/routes/init.py
@ -24,7 +24,10 @@ from .dropbox_add_connector_route import router as dropbox_add_connector_router
 from .editor_routes import router as editor_router
 from .export_routes import router as export_router
 from .folders_routes import router as folders_router
-from .gateway_webhook_routes import router as gateway_router
+from .gateway_webhook_routes import (
    config_router as gateway_config_router,
    router as gateway_router,
 )
 from .gateway_whatsapp_baileys_routes import router as gateway_whatsapp_baileys_router
 from .gateway_whatsapp_webhook_routes import router as gateway_whatsapp_webhook_router
 from .google_calendar_add_connector_route import (
@ -44,9 +47,9 @@ from .logs_routes import router as logs_router
 from .luma_add_connector_route import router as luma_add_connector_router
 from .mcp_oauth_route import router as mcp_oauth_router
 from .memory_routes import router as memory_router
 from .model_connections_routes import router as model_connections_router
 from .model_list_routes import router as model_list_router
 from .new_chat_routes import router as new_chat_router
 from .new_llm_config_routes import router as new_llm_config_router
 from .notes_routes import router as notes_router
 from .notion_add_connector_route import router as notion_add_connector_router
 from .obsidian_plugin_routes import router as obsidian_plugin_router
@ -63,7 +66,6 @@ from .stripe_routes import router as stripe_router
 from .team_memory_routes import router as team_memory_router
 from .teams_add_connector_route import router as teams_add_connector_router
 from .video_presentations_routes import router as video_presentations_router
 from .vision_llm_routes import router as vision_llm_router
 from .youtube_routes import router as youtube_router
 router = APIRouter()
@ -75,6 +77,7 @@ router.include_router(export_router)
 router.include_router(documents_router)
 router.include_router(folders_router)
 _gateway_enabled_dep = [Depends(require_gateway_enabled)]
 router.include_router(gateway_config_router)
 router.include_router(gateway_router, dependencies=_gateway_enabled_dep)
 router.include_router(
    gateway_whatsapp_webhook_router, dependencies=_gateway_enabled_dep
@ -98,7 +101,6 @@ router.include_router(
 )  # Video presentation status and streaming
 router.include_router(reports_router)  # Report CRUD and multi-format export
 router.include_router(image_generation_router)  # Image generation via litellm
 router.include_router(vision_llm_router)  # Vision LLM configs for screenshot analysis
 router.include_router(search_source_connectors_router)
 router.include_router(google_calendar_add_connector_router)
 router.include_router(google_gmail_add_connector_router)
@ -116,7 +118,7 @@ router.include_router(jira_add_connector_router)
 router.include_router(confluence_add_connector_router)
 router.include_router(clickup_add_connector_router)
 router.include_router(dropbox_add_connector_router)
-router.include_router(new_llm_config_router)  # LLM configs with prompt configuration
+router.include_router(model_connections_router)  # Connection-centric model catalog
 router.include_router(model_list_router)  # Dynamic model catalogue from OpenRouter
 router.include_router(logs_router)
 router.include_router(circleback_webhook_router)  # Circleback meeting webhooks
--- a/surfsense_backend/app/routes/anonymous_chat_routes.py
+++ b/surfsense_backend/app/routes/anonymous_chat_routes.py
@ -18,6 +18,7 @@ from app.etl_pipeline.file_classifier import (
    PLAINTEXT_EXTENSIONS,
 )
 from app.rate_limiter import limiter
 from app.tasks.chat.streaming.errors.classifier import classify_stream_exception
 logger = logging.getLogger(__name__)
@ -98,7 +99,6 @@ class AnonQuotaResponse(BaseModel):
 class AnonModelResponse(BaseModel):
    id: int
    name: str
    description: str | None = None
    provider: str
    model_name: str
    billing_tier: str = "free"
@ -131,8 +131,7 @@ async def list_anonymous_models():
                AnonModelResponse(
                    id=cfg.get("id", 0),
                    name=cfg.get("name", ""),
-                    description=cfg.get("description"),
+                    provider=cfg.get("provider") or cfg.get("litellm_provider", ""),
                    provider=cfg.get("provider", ""),
                    model_name=cfg.get("model_name", ""),
                    billing_tier=cfg.get("billing_tier", "free"),
                    is_premium=cfg.get("billing_tier", "free") == "premium",
@ -160,8 +159,7 @@ async def get_anonymous_model(slug: str):
            return AnonModelResponse(
                id=cfg.get("id", 0),
                name=cfg.get("name", ""),
-                description=cfg.get("description"),
+                provider=cfg.get("provider") or cfg.get("litellm_provider", ""),
                provider=cfg.get("provider", ""),
                model_name=cfg.get("model_name", ""),
                billing_tier=cfg.get("billing_tier", "free"),
                is_premium=cfg.get("billing_tier", "free") == "premium",
@ -474,7 +472,15 @@ async def stream_anonymous_chat(
        except Exception as e:
            logger.exception("Anonymous chat stream error")
            await TokenQuotaService.anon_release(session_key, ip_key, request_id)
-            yield streaming_service.format_error(f"Error during chat: {e!s}")
+            _, error_code, _, _, user_message, extra = classify_stream_exception(
                e,
                flow_label="chat",
            )
            yield streaming_service.format_error(
                user_message,
                error_code=error_code,
                extra=extra,
            )
            yield streaming_service.format_done()
        finally:
            await TokenQuotaService.anon_release_stream_slot(client_ip)
--- a/Show more
+++ b/Show more