Merge pull request #1509 from MODSetter/dev

feat(release: 0.0.29): ETL/embedding caches, unified model connections, reverse-proxy support, podcast & indexing improvements
This commit is contained in:
Rohan Verma 2026-06-17 23:46:24 -07:00 committed by GitHub
commit c941907448
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
408 changed files with 15877 additions and 16310 deletions

View file

@ -95,10 +95,12 @@ jobs:
run: pnpm build run: pnpm build
working-directory: surfsense_web working-directory: surfsense_web
env: env:
NEXT_PUBLIC_FASTAPI_BACKEND_URL: ${{ vars.NEXT_PUBLIC_FASTAPI_BACKEND_URL }} NEXT_PUBLIC_FASTAPI_BACKEND_URL: ${{ vars.HOSTED_BACKEND_URL }}
SURFSENSE_BACKEND_INTERNAL_URL: ${{ vars.HOSTED_BACKEND_URL }}
NEXT_PUBLIC_ZERO_CACHE_URL: ${{ vars.NEXT_PUBLIC_ZERO_CACHE_URL }} NEXT_PUBLIC_ZERO_CACHE_URL: ${{ vars.NEXT_PUBLIC_ZERO_CACHE_URL }}
NEXT_PUBLIC_DEPLOYMENT_MODE: ${{ vars.NEXT_PUBLIC_DEPLOYMENT_MODE }} NEXT_PUBLIC_DEPLOYMENT_MODE: ${{ vars.NEXT_PUBLIC_DEPLOYMENT_MODE }}
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: ${{ vars.NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE }} NEXT_PUBLIC_AUTH_TYPE: ${{ vars.NEXT_PUBLIC_AUTH_TYPE }}
NEXT_PUBLIC_ETL_SERVICE: ${{ vars.NEXT_PUBLIC_ETL_SERVICE }}
NEXT_PUBLIC_POSTHOG_KEY: ${{ secrets.NEXT_PUBLIC_POSTHOG_KEY }} NEXT_PUBLIC_POSTHOG_KEY: ${{ secrets.NEXT_PUBLIC_POSTHOG_KEY }}
- name: Install desktop dependencies - name: Install desktop dependencies
@ -109,6 +111,7 @@ jobs:
run: pnpm build run: pnpm build
working-directory: surfsense_desktop working-directory: surfsense_desktop
env: env:
HOSTED_BACKEND_URL: ${{ vars.HOSTED_BACKEND_URL }}
HOSTED_FRONTEND_URL: ${{ vars.HOSTED_FRONTEND_URL }} HOSTED_FRONTEND_URL: ${{ vars.HOSTED_FRONTEND_URL }}
POSTHOG_KEY: ${{ secrets.POSTHOG_KEY }} POSTHOG_KEY: ${{ secrets.POSTHOG_KEY }}
POSTHOG_HOST: ${{ vars.POSTHOG_HOST }} POSTHOG_HOST: ${{ vars.POSTHOG_HOST }}

View file

@ -199,11 +199,6 @@ jobs:
build-args: | build-args: |
${{ matrix.image == 'backend' && format('USE_CUDA={0}', matrix.use_cuda) || '' }} ${{ matrix.image == 'backend' && format('USE_CUDA={0}', matrix.use_cuda) || '' }}
${{ matrix.image == 'backend' && format('CUDA_EXTRA={0}', matrix.cuda_extra) || '' }} ${{ matrix.image == 'backend' && format('CUDA_EXTRA={0}', matrix.cuda_extra) || '' }}
${{ matrix.image == 'web' && 'NEXT_PUBLIC_FASTAPI_BACKEND_URL=__NEXT_PUBLIC_FASTAPI_BACKEND_URL__' || '' }}
${{ matrix.image == 'web' && 'NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=__NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE__' || '' }}
${{ matrix.image == 'web' && 'NEXT_PUBLIC_ETL_SERVICE=__NEXT_PUBLIC_ETL_SERVICE__' || '' }}
${{ matrix.image == 'web' && 'NEXT_PUBLIC_ZERO_CACHE_URL=__NEXT_PUBLIC_ZERO_CACHE_URL__' || '' }}
${{ matrix.image == 'web' && 'NEXT_PUBLIC_DEPLOYMENT_MODE=__NEXT_PUBLIC_DEPLOYMENT_MODE__' || '' }}
- name: Export digest - name: Export digest
run: | run: |

View file

@ -27,9 +27,10 @@ jobs:
PLAYWRIGHT_TEST_EMAIL: e2e-test@surfsense.net PLAYWRIGHT_TEST_EMAIL: e2e-test@surfsense.net
PLAYWRIGHT_TEST_PASSWORD: E2eTestPassword123! PLAYWRIGHT_TEST_PASSWORD: E2eTestPassword123!
# Frontend env: Playwright's webServer (surfsense_web/playwright.config.ts) # Frontend env: Playwright's webServer (surfsense_web/playwright.config.ts)
# spawns `pnpm build && pnpm start` in CI; these get baked into the build. # spawns `pnpm build && pnpm start` in CI.
NEXT_PUBLIC_FASTAPI_BACKEND_URL: http://localhost:8000 NEXT_PUBLIC_FASTAPI_BACKEND_URL: http://localhost:8000
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: LOCAL SURFSENSE_BACKEND_INTERNAL_URL: http://localhost:8000
AUTH_TYPE: LOCAL
# Shared secret for the test-only POST /__e2e__/auth/token endpoint. # Shared secret for the test-only POST /__e2e__/auth/token endpoint.
# Must match docker-compose.e2e.yml's backend env (x-backend-env). # Must match docker-compose.e2e.yml's backend env (x-backend-env).
E2E_MINT_SECRET: e2e-mint-secret-not-for-production E2E_MINT_SECRET: e2e-mint-secret-not-for-production

View file

@ -1 +1 @@
0.0.28 0.0.29

View file

@ -30,6 +30,9 @@ SECRET_KEY=replace_me_with_a_random_string
# Auth type: LOCAL (email/password) or GOOGLE (OAuth) # Auth type: LOCAL (email/password) or GOOGLE (OAuth)
AUTH_TYPE=LOCAL AUTH_TYPE=LOCAL
# Deployment mode: self-hosted enables local filesystem connectors; cloud hides them.
DEPLOYMENT_MODE=self-hosted
# Allow new user registrations (TRUE or FALSE) # Allow new user registrations (TRUE or FALSE)
# REGISTRATION_ENABLED=TRUE # REGISTRATION_ENABLED=TRUE
@ -43,51 +46,47 @@ ETL_SERVICE=DOCLING
EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2 EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Ports (change to avoid conflicts with other services on your machine) # How You Access SurfSense
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# One public URL. Browser traffic stays same-origin and Caddy routes internally.
# BACKEND_PORT=8929 SURFSENSE_PUBLIC_URL=http://localhost:3929
# FRONTEND_PORT=3929
# ZERO_CACHE_PORT=5929
# SEARXNG_PORT=8888
# FLOWER_PORT=5555
# ==============================================================================
# DEV COMPOSE ONLY (docker-compose.dev.yml)
# You only need them only if you are running `docker-compose.dev.yml`.
# ==============================================================================
# -- pgAdmin (database GUI) --
# PGADMIN_PORT=5050
# PGADMIN_DEFAULT_EMAIL=admin@surfsense.com
# PGADMIN_DEFAULT_PASSWORD=surfsense
# -- Redis exposed port (dev only; Redis is internal-only in prod) --
# REDIS_PORT=6379
# -- WhatsApp bridge exposed port (dev/hybrid only; prod keeps it Docker-internal) --
# WHATSAPP_BRIDGE_PORT=9929
# -- Frontend Build Args --
# In dev, the frontend is built from source and these are passed as build args.
# In prod, they are automatically derived from AUTH_TYPE, ETL_SERVICE, and the port settings above.
# NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL
# NEXT_PUBLIC_ETL_SERVICE=DOCLING
# NEXT_PUBLIC_DEPLOYMENT_MODE=self-hosted
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Custom Domain / Reverse Proxy # Public Ports
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# ONLY set these if you are serving SurfSense on a real domain via a reverse # Production Docker exposes only Caddy to your machine. Caddy then routes
# proxy (e.g. Caddy, Nginx, Cloudflare Tunnel). # frontend, backend, and zero-cache traffic internally.
# For standard localhost deployments, leave all of these commented out.
# they are automatically derived from the port settings above.
# #
# NEXT_FRONTEND_URL=https://app.yourdomain.com # Local default: LISTEN_HTTP_PORT=3929
# BACKEND_URL=https://api.yourdomain.com # Domain default: LISTEN_HTTP_PORT=80 and LISTEN_HTTPS_PORT=443
# NEXT_PUBLIC_FASTAPI_BACKEND_URL=https://api.yourdomain.com LISTEN_HTTP_PORT=3929
# NEXT_PUBLIC_ZERO_CACHE_URL=https://zero.yourdomain.com LISTEN_HTTPS_PORT=443
# FASTAPI_BACKEND_INTERNAL_URL=http://backend:8000
# ------------------------------------------------------------------------------
# Custom Domain / HTTPS
# ------------------------------------------------------------------------------
# Leave SURFSENSE_SITE_ADDRESS as :80 for local HTTP.
# Set it to your domain to enable automatic HTTPS:
# SURFSENSE_SITE_ADDRESS=surf.example.com
# CERT_EMAIL=you@example.com
SURFSENSE_SITE_ADDRESS=:80
CERT_EMAIL=
# ------------------------------------------------------------------------------
# Advanced Reverse Proxy Settings
# ------------------------------------------------------------------------------
# Usually do not change these. They are for custom certificate setup, CDNs/load
# balancers, trusted proxy IPs, or changing upload limits.
#
# CERT_ACME_CA=https://acme-v02.api.letsencrypt.org/directory
# CERT_ACME_DNS=
# If a CDN/load balancer sits in front of Caddy, narrow this to that proxy's CIDRs.
# TRUSTED_PROXIES=0.0.0.0/0
# SURFSENSE_MAX_BODY_SIZE=5GB
#
# Browser API and Zero URLs are same-origin relative behind bundled Caddy.
# Next.js server-side calls use Docker DNS through SURFSENSE_BACKEND_INTERNAL_URL
# set internally by docker-compose.yml. Usually do not override it.
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Zero-cache (real-time sync) # Zero-cache (real-time sync)
@ -108,10 +107,9 @@ EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
# Sync worker tuning. zero-cache defaults ZERO_NUM_SYNC_WORKERS to the number # Sync worker tuning. zero-cache defaults ZERO_NUM_SYNC_WORKERS to the number
# of CPU cores, which can exceed the connection pool limits on high-core machines. # of CPU cores, which can exceed the connection pool limits on high-core machines.
# Each sync worker needs at least 1 connection from both the UPSTREAM and CVR # Each sync worker needs at least 1 connection from both the UPSTREAM and CVR pools.
# pools, so these constraints must hold: # Keep ZERO_UPSTREAM_MAX_CONNS and ZERO_CVR_MAX_CONNS greater than or equal to
# ZERO_UPSTREAM_MAX_CONNS >= ZERO_NUM_SYNC_WORKERS # ZERO_NUM_SYNC_WORKERS.
# ZERO_CVR_MAX_CONNS >= ZERO_NUM_SYNC_WORKERS
# Default of 4 workers is sufficient for self-hosted / personal use. # Default of 4 workers is sufficient for self-hosted / personal use.
# ZERO_NUM_SYNC_WORKERS=4 # ZERO_NUM_SYNC_WORKERS=4
# ZERO_UPSTREAM_MAX_CONNS=20 # ZERO_UPSTREAM_MAX_CONNS=20
@ -125,16 +123,16 @@ EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
# ZERO_QUERY_URL: where zero-cache forwards query requests for resolution. # ZERO_QUERY_URL: where zero-cache forwards query requests for resolution.
# ZERO_MUTATE_URL: required by zero-cache when auth tokens are used, even though # ZERO_MUTATE_URL: required by zero-cache when auth tokens are used, even though
# SurfSense does not use Zero mutators. Setting both URLs tells zero-cache to # SurfSense does not use Zero mutators. Setting both URLs tells zero-cache to
# skip its own JWT verification and let the app endpoints handle auth instead. # skip its own JWT verification and let the app endpoints handle auth instead.
# The mutate endpoint is a no-op that returns an empty response. # The mutate endpoint is a no-op that returns an empty response.
# Default: Docker service networking (http://frontend:3000/api/zero/...). # Default: Docker service networking (http://frontend:3000/api/zero/...).
# Override when running the frontend outside Docker: # Override when running the frontend outside Docker:
# ZERO_QUERY_URL=http://host.docker.internal:3000/api/zero/query # ZERO_QUERY_URL=http://host.docker.internal:3000/api/zero/query
# ZERO_MUTATE_URL=http://host.docker.internal:3000/api/zero/mutate # ZERO_MUTATE_URL=http://host.docker.internal:3000/api/zero/mutate
# Override for custom domain: # Override for custom domain only when zero-cache is not in the bundled Docker network:
# ZERO_QUERY_URL=https://app.yourdomain.com/api/zero/query # ZERO_QUERY_URL=https://surf.example.com/api/zero/query
# ZERO_MUTATE_URL=https://app.yourdomain.com/api/zero/mutate # ZERO_MUTATE_URL=https://surf.example.com/api/zero/mutate
# ZERO_QUERY_URL=http://frontend:3000/api/zero/query # ZERO_QUERY_URL=http://frontend:3000/api/zero/query
# ZERO_MUTATE_URL=http://frontend:3000/api/zero/mutate # ZERO_MUTATE_URL=http://frontend:3000/api/zero/mutate
@ -222,73 +220,74 @@ STT_SERVICE=local/base
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# -- Google Connectors -- # -- Google Connectors --
# GOOGLE_CALENDAR_REDIRECT_URI=http://localhost:8000/api/v1/auth/google/calendar/connector/callback # GOOGLE_CALENDAR_REDIRECT_URI=http://localhost:3929/api/v1/auth/google/calendar/connector/callback
# GOOGLE_GMAIL_REDIRECT_URI=http://localhost:8000/api/v1/auth/google/gmail/connector/callback # GOOGLE_GMAIL_REDIRECT_URI=http://localhost:3929/api/v1/auth/google/gmail/connector/callback
# GOOGLE_DRIVE_REDIRECT_URI=http://localhost:8000/api/v1/auth/google/drive/connector/callback # GOOGLE_DRIVE_REDIRECT_URI=http://localhost:3929/api/v1/auth/google/drive/connector/callback
# -- Notion -- # -- Notion --
# NOTION_CLIENT_ID= # NOTION_CLIENT_ID=
# NOTION_CLIENT_SECRET= # NOTION_CLIENT_SECRET=
# NOTION_REDIRECT_URI=http://localhost:8000/api/v1/auth/notion/connector/callback # NOTION_REDIRECT_URI=http://localhost:3929/api/v1/auth/notion/connector/callback
# -- Slack -- # -- Slack --
# SLACK_CLIENT_ID= # SLACK_CLIENT_ID=
# SLACK_CLIENT_SECRET= # SLACK_CLIENT_SECRET=
# SLACK_REDIRECT_URI=http://localhost:8000/api/v1/auth/slack/connector/callback # SLACK_REDIRECT_URI=http://localhost:3929/api/v1/auth/slack/connector/callback
# -- Discord -- # -- Discord --
# DISCORD_CLIENT_ID= # DISCORD_CLIENT_ID=
# DISCORD_CLIENT_SECRET= # DISCORD_CLIENT_SECRET=
# DISCORD_REDIRECT_URI=http://localhost:8000/api/v1/auth/discord/connector/callback # DISCORD_REDIRECT_URI=http://localhost:3929/api/v1/auth/discord/connector/callback
# DISCORD_BOT_TOKEN= # DISCORD_BOT_TOKEN=
# -- Atlassian (Jira & Confluence) -- # -- Atlassian (Jira & Confluence) --
# ATLASSIAN_CLIENT_ID= # ATLASSIAN_CLIENT_ID=
# ATLASSIAN_CLIENT_SECRET= # ATLASSIAN_CLIENT_SECRET=
# JIRA_REDIRECT_URI=http://localhost:8000/api/v1/auth/jira/connector/callback # JIRA_REDIRECT_URI=http://localhost:3929/api/v1/auth/jira/connector/callback
# CONFLUENCE_REDIRECT_URI=http://localhost:8000/api/v1/auth/confluence/connector/callback # CONFLUENCE_REDIRECT_URI=http://localhost:3929/api/v1/auth/confluence/connector/callback
# -- Linear -- # -- Linear --
# LINEAR_CLIENT_ID= # LINEAR_CLIENT_ID=
# LINEAR_CLIENT_SECRET= # LINEAR_CLIENT_SECRET=
# LINEAR_REDIRECT_URI=http://localhost:8000/api/v1/auth/linear/connector/callback # LINEAR_REDIRECT_URI=http://localhost:3929/api/v1/auth/linear/connector/callback
# -- ClickUp -- # -- ClickUp --
# CLICKUP_CLIENT_ID= # CLICKUP_CLIENT_ID=
# CLICKUP_CLIENT_SECRET= # CLICKUP_CLIENT_SECRET=
# CLICKUP_REDIRECT_URI=http://localhost:8000/api/v1/auth/clickup/connector/callback # CLICKUP_REDIRECT_URI=http://localhost:3929/api/v1/auth/clickup/connector/callback
# -- Airtable -- # -- Airtable --
# AIRTABLE_CLIENT_ID= # AIRTABLE_CLIENT_ID=
# AIRTABLE_CLIENT_SECRET= # AIRTABLE_CLIENT_SECRET=
# AIRTABLE_REDIRECT_URI=http://localhost:8000/api/v1/auth/airtable/connector/callback # AIRTABLE_REDIRECT_URI=http://localhost:3929/api/v1/auth/airtable/connector/callback
# -- Microsoft OAuth (Teams & OneDrive) -- # -- Microsoft OAuth (Teams & OneDrive) --
# MICROSOFT_CLIENT_ID= # MICROSOFT_CLIENT_ID=
# MICROSOFT_CLIENT_SECRET= # MICROSOFT_CLIENT_SECRET=
# TEAMS_REDIRECT_URI=http://localhost:8000/api/v1/auth/teams/connector/callback # TEAMS_REDIRECT_URI=http://localhost:3929/api/v1/auth/teams/connector/callback
# ONEDRIVE_REDIRECT_URI=http://localhost:8000/api/v1/auth/onedrive/connector/callback # ONEDRIVE_REDIRECT_URI=http://localhost:3929/api/v1/auth/onedrive/connector/callback
# -- Dropbox -- # -- Dropbox --
# DROPBOX_APP_KEY= # DROPBOX_APP_KEY=
# DROPBOX_APP_SECRET= # DROPBOX_APP_SECRET=
# DROPBOX_REDIRECT_URI=http://localhost:8000/api/v1/auth/dropbox/connector/callback # DROPBOX_REDIRECT_URI=http://localhost:3929/api/v1/auth/dropbox/connector/callback
# -- Composio -- # -- Composio --
# COMPOSIO_API_KEY= # COMPOSIO_API_KEY=
# COMPOSIO_ENABLED=TRUE # COMPOSIO_ENABLED=TRUE
# COMPOSIO_REDIRECT_URI=http://localhost:8000/api/v1/auth/composio/connector/callback # COMPOSIO_REDIRECT_URI=http://localhost:3929/api/v1/auth/composio/connector/callback
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Messaging Channels (optional) # Messaging Channels (optional)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Configure only the external chat channels you want to use. # Configure only the external chat channels you want to use.
# GATEWAY_ENABLED=TRUE
# -- Telegram -- # -- Telegram --
# TELEGRAM_SHARED_BOT_TOKEN= # TELEGRAM_SHARED_BOT_TOKEN=
# TELEGRAM_SHARED_BOT_USERNAME= # TELEGRAM_SHARED_BOT_USERNAME=
# TELEGRAM_WEBHOOK_SECRET= # TELEGRAM_WEBHOOK_SECRET=
# GATEWAY_BASE_URL=http://localhost:8929 # GATEWAY_BASE_URL=http://localhost:3929
# GATEWAY_TELEGRAM_INTAKE_MODE=webhook # GATEWAY_TELEGRAM_INTAKE_MODE=webhook
# -- WhatsApp -- # -- WhatsApp --
@ -307,20 +306,20 @@ STT_SERVICE=local/base
# #
# GATEWAY_SLACK_ENABLED=FALSE # GATEWAY_SLACK_ENABLED=FALSE
# GATEWAY_SLACK_SIGNING_SECRET= # GATEWAY_SLACK_SIGNING_SECRET=
# GATEWAY_SLACK_REDIRECT_URI=http://localhost:8929/api/v1/gateway/slack/callback # GATEWAY_SLACK_REDIRECT_URI=http://localhost:3929/api/v1/gateway/slack/callback
# -- Discord -- # -- Discord --
# Uses DISCORD_CLIENT_ID, DISCORD_CLIENT_SECRET, and DISCORD_BOT_TOKEN from the # Uses DISCORD_CLIENT_ID, DISCORD_CLIENT_SECRET, and DISCORD_BOT_TOKEN from the
# Discord connector section. # Discord connector section.
# #
# GATEWAY_DISCORD_ENABLED=FALSE # GATEWAY_DISCORD_ENABLED=FALSE
# GATEWAY_DISCORD_REDIRECT_URI=http://localhost:8929/api/v1/gateway/discord/callback # GATEWAY_DISCORD_REDIRECT_URI=http://localhost:3929/api/v1/gateway/discord/callback
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# SearXNG (bundled web search, works out of the box with no config needed) # SearXNG (bundled web search, works out of the box with no config needed)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# SearXNG provides web search to all search spaces automatically. # SearXNG provides web search to all search spaces automatically.
# To access the SearXNG UI directly: http://localhost:8888 # To access the SearXNG UI directly in dev/deps-only compose: http://localhost:8888
# To disable the service entirely: docker compose up --scale searxng=0 # To disable the service entirely: docker compose up --scale searxng=0
# To point at your own SearXNG instance instead of the bundled one: # To point at your own SearXNG instance instead of the bundled one:
# SEARXNG_DEFAULT_HOST=http://your-searxng:8080 # SEARXNG_DEFAULT_HOST=http://your-searxng:8080
@ -457,3 +456,36 @@ NOLOGIN_MODE_ENABLED=FALSE
# RESIDENTIAL_PROXY_HOSTNAME= # RESIDENTIAL_PROXY_HOSTNAME=
# RESIDENTIAL_PROXY_LOCATION= # RESIDENTIAL_PROXY_LOCATION=
# RESIDENTIAL_PROXY_TYPE=1 # RESIDENTIAL_PROXY_TYPE=1
# ==============================================================================
# DEV / DEPS-ONLY COMPOSE OVERRIDES
# These are only needed for docker-compose.dev.yml or docker-compose.deps-only.yml.
# Production Docker exposes Caddy only; raw app ports below do not affect
# docker-compose.yml.
# ==============================================================================
# -- pgAdmin (database GUI; dev/deps-only compose only) --
# PGADMIN_PORT=5050
# PGADMIN_DEFAULT_EMAIL=admin@surfsense.com
# PGADMIN_DEFAULT_PASSWORD=surfsense
# -- Redis exposed port (dev/deps-only compose only; Redis is internal-only in prod) --
# REDIS_PORT=6379
# -- SearXNG exposed port (dev/deps-only compose only; internal-only in prod) --
# SEARXNG_PORT=8888
# -- WhatsApp bridge exposed port (dev/hybrid only; prod keeps it Docker-internal) --
# WHATSAPP_BRIDGE_PORT=9929
# -- Raw app ports (dev/deps-only compose only; prod exposes Caddy instead) --
# BACKEND_PORT=8000
# FRONTEND_PORT=3000
# ZERO_CACHE_PORT=4848
# -- Frontend runtime flags (prod and dev compose) --
# The frontend reads these at request time in Docker; no NEXT_PUBLIC_* rebuild
# or startup substitution is required.
# AUTH_TYPE=LOCAL
# ETL_SERVICE=DOCLING
# DEPLOYMENT_MODE=self-hosted

View file

@ -257,16 +257,15 @@ services:
frontend: frontend:
build: build:
context: ../surfsense_web context: ../surfsense_web
args:
NEXT_PUBLIC_FASTAPI_BACKEND_URL: ${NEXT_PUBLIC_FASTAPI_BACKEND_URL:-http://localhost:8000}
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: ${NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE:-LOCAL}
NEXT_PUBLIC_ETL_SERVICE: ${NEXT_PUBLIC_ETL_SERVICE:-DOCLING}
NEXT_PUBLIC_ZERO_CACHE_URL: ${NEXT_PUBLIC_ZERO_CACHE_URL:-http://localhost:${ZERO_CACHE_PORT:-4848}}
NEXT_PUBLIC_DEPLOYMENT_MODE: ${NEXT_PUBLIC_DEPLOYMENT_MODE:-self-hosted}
ports: ports:
- "${FRONTEND_PORT:-3000}:3000" - "${FRONTEND_PORT:-3000}:3000"
env_file: env_file:
- ../surfsense_web/.env - ../surfsense_web/.env
environment:
AUTH_TYPE: ${AUTH_TYPE:-LOCAL}
ETL_SERVICE: ${ETL_SERVICE:-DOCLING}
DEPLOYMENT_MODE: ${DEPLOYMENT_MODE:-self-hosted}
SURFSENSE_BACKEND_INTERNAL_URL: http://backend:8000
depends_on: depends_on:
backend: backend:
condition: service_healthy condition: service_healthy

View file

@ -0,0 +1,54 @@
# =============================================================================
# SurfSense — Optional Caddy reverse-proxy overlay
# =============================================================================
# Usage (from docker/):
# PROXY_HTTP_PORT=8080 SURFSENSE_PUBLIC_URL=http://localhost:8080 \
# docker compose -f docker-compose.yml -f docker-compose.proxy.yml up -d
#
# This overlay is for validation and custom deployments. The production
# docker-compose.yml includes Caddy by default.
# =============================================================================
services:
backend:
ports:
- "${BACKEND_PORT:-8929}:8000"
zero-cache:
ports:
- "${ZERO_CACHE_PORT:-5929}:4848"
frontend:
ports:
- "${FRONTEND_PORT:-3929}:3000"
proxy:
image: caddy:2-alpine
restart: unless-stopped
ports:
- "${PROXY_HTTP_PORT:-8080}:80"
- "${PROXY_HTTPS_PORT:-8443}:443"
volumes:
- ./proxy/Caddyfile:/etc/caddy/Caddyfile:ro
- caddy_data:/data
- caddy_config:/config
environment:
SURFSENSE_SITE_ADDRESS: ${SURFSENSE_SITE_ADDRESS:-:80}
CERT_EMAIL: ${CERT_EMAIL:-}
CERT_ACME_CA: ${CERT_ACME_CA:-https://acme-v02.api.letsencrypt.org/directory}
CERT_ACME_DNS: ${CERT_ACME_DNS:-}
TRUSTED_PROXIES: ${TRUSTED_PROXIES:-0.0.0.0/0}
SURFSENSE_MAX_BODY_SIZE: ${SURFSENSE_MAX_BODY_SIZE:-5GB}
depends_on:
frontend:
condition: service_started
backend:
condition: service_healthy
zero-cache:
condition: service_healthy
volumes:
caddy_data:
name: surfsense-caddy-data
caddy_config:
name: surfsense-caddy-config

View file

@ -94,10 +94,39 @@ services:
timeout: 5s timeout: 5s
retries: 5 retries: 5
# Single public entry point for the Docker stack. Comment this service out
# only if you front SurfSense with your own reverse proxy.
proxy:
image: caddy:2-alpine
# For DNS-01/wildcard certificates, replace image with:
# build: ./proxy
restart: unless-stopped
ports:
- "${LISTEN_HTTP_PORT:-3929}:80"
- "${LISTEN_HTTPS_PORT:-443}:443"
volumes:
- ./proxy/Caddyfile:/etc/caddy/Caddyfile:ro
- caddy_data:/data
- caddy_config:/config
environment:
SURFSENSE_SITE_ADDRESS: ${SURFSENSE_SITE_ADDRESS:-:80}
CERT_EMAIL: ${CERT_EMAIL:-}
CERT_ACME_CA: ${CERT_ACME_CA:-https://acme-v02.api.letsencrypt.org/directory}
CERT_ACME_DNS: ${CERT_ACME_DNS:-}
TRUSTED_PROXIES: ${TRUSTED_PROXIES:-0.0.0.0/0}
SURFSENSE_MAX_BODY_SIZE: ${SURFSENSE_MAX_BODY_SIZE:-5GB}
depends_on:
frontend:
condition: service_started
backend:
condition: service_healthy
zero-cache:
condition: service_healthy
backend: backend:
image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest}${SURFSENSE_VARIANT:+-${SURFSENSE_VARIANT}} image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest}${SURFSENSE_VARIANT:+-${SURFSENSE_VARIANT}}
ports: expose:
- "${BACKEND_PORT:-8929}:8000" - "8000"
volumes: volumes:
- shared_temp:/shared_tmp - shared_temp:/shared_tmp
- object_store:/app/.local_object_store - object_store:/app/.local_object_store
@ -115,7 +144,8 @@ services:
UVICORN_LOOP: asyncio UVICORN_LOOP: asyncio
UNSTRUCTURED_HAS_PATCHED_LOOP: "1" UNSTRUCTURED_HAS_PATCHED_LOOP: "1"
FILE_STORAGE_LOCAL_PATH: /app/.local_object_store FILE_STORAGE_LOCAL_PATH: /app/.local_object_store
NEXT_FRONTEND_URL: ${NEXT_FRONTEND_URL:-http://localhost:${FRONTEND_PORT:-3929}} NEXT_FRONTEND_URL: ${NEXT_FRONTEND_URL:-${SURFSENSE_PUBLIC_URL:-http://localhost:${LISTEN_HTTP_PORT:-3929}}}
BACKEND_URL: ${BACKEND_URL:-${SURFSENSE_PUBLIC_URL:-http://localhost:${LISTEN_HTTP_PORT:-3929}}}
SEARXNG_DEFAULT_HOST: ${SEARXNG_DEFAULT_HOST:-http://searxng:8080} SEARXNG_DEFAULT_HOST: ${SEARXNG_DEFAULT_HOST:-http://searxng:8080}
WHATSAPP_BRIDGE_URL: ${WHATSAPP_BRIDGE_URL:-http://whatsapp-bridge:9929} WHATSAPP_BRIDGE_URL: ${WHATSAPP_BRIDGE_URL:-http://whatsapp-bridge:9929}
# Daytona Sandbox: uncomment and set credentials to enable cloud code execution # Daytona Sandbox: uncomment and set credentials to enable cloud code execution
@ -221,8 +251,8 @@ services:
zero-cache: zero-cache:
image: rocicorp/zero:1.4.0 image: rocicorp/zero:1.4.0
ports: expose:
- "${ZERO_CACHE_PORT:-5929}:4848" - "4848"
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
environment: environment:
@ -256,16 +286,13 @@ services:
frontend: frontend:
image: ghcr.io/modsetter/surfsense-web:${SURFSENSE_VERSION:-latest} image: ghcr.io/modsetter/surfsense-web:${SURFSENSE_VERSION:-latest}
ports: expose:
- "${FRONTEND_PORT:-3929}:3000" - "3000"
environment: environment:
NEXT_PUBLIC_FASTAPI_BACKEND_URL: ${NEXT_PUBLIC_FASTAPI_BACKEND_URL:-http://localhost:${BACKEND_PORT:-8929}} AUTH_TYPE: ${AUTH_TYPE:-LOCAL}
NEXT_PUBLIC_ZERO_CACHE_URL: ${NEXT_PUBLIC_ZERO_CACHE_URL:-http://localhost:${ZERO_CACHE_PORT:-5929}} ETL_SERVICE: ${ETL_SERVICE:-DOCLING}
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: ${AUTH_TYPE:-LOCAL} DEPLOYMENT_MODE: ${DEPLOYMENT_MODE:-self-hosted}
NEXT_PUBLIC_ETL_SERVICE: ${ETL_SERVICE:-DOCLING} SURFSENSE_BACKEND_INTERNAL_URL: http://backend:8000
NEXT_PUBLIC_DEPLOYMENT_MODE: ${DEPLOYMENT_MODE:-self-hosted}
NEXT_PUBLIC_WHATSAPP_DISPLAY_PHONE_NUMBER: ${WHATSAPP_SHARED_DISPLAY_PHONE_NUMBER:-}
FASTAPI_BACKEND_INTERNAL_URL: ${FASTAPI_BACKEND_INTERNAL_URL:-http://backend:8000}
labels: labels:
- "com.centurylinklabs.watchtower.enable=true" - "com.centurylinklabs.watchtower.enable=true"
depends_on: depends_on:
@ -286,5 +313,9 @@ volumes:
name: surfsense-object-store name: surfsense-object-store
zero_cache_data: zero_cache_data:
name: surfsense-zero-cache name: surfsense-zero-cache
caddy_data:
name: surfsense-caddy-data
caddy_config:
name: surfsense-caddy-config
whatsapp_sessions: whatsapp_sessions:
name: surfsense-whatsapp-sessions name: surfsense-whatsapp-sessions

45
docker/proxy/Caddyfile Normal file
View file

@ -0,0 +1,45 @@
{
# Optional ACME/global settings. These are harmless in the default :80
# localhost mode and become active when SURFSENSE_SITE_ADDRESS is a domain.
{$CERT_EMAIL}
acme_ca {$CERT_ACME_CA:https://acme-v02.api.letsencrypt.org/directory}
{$CERT_ACME_DNS}
servers {
client_ip_headers X-Forwarded-For X-Real-IP
trusted_proxies static {$TRUSTED_PROXIES:0.0.0.0/0}
}
}
(surfsense_proxy) {
request_body {
max_size {$SURFSENSE_MAX_BODY_SIZE:5GB}
}
# Frontend-owned auth page (the post-login token handler). More specific than
# /auth/*, so Caddy's matcher-specificity sort routes it here, not to backend.
reverse_proxy /auth/callback* frontend:3000
# Backend auth routes (FastAPI Users + OAuth helpers).
reverse_proxy /auth/* backend:8000
# Backend user profile routes (FastAPI Users users router, mounted at /users).
reverse_proxy /users/* backend:8000
# Backend REST, streaming, connector OAuth, and messaging gateway endpoints.
# FastAPI already serves /api/v1, so the path is forwarded unchanged.
reverse_proxy /api/v1/* backend:8000 {
flush_interval -1
}
# Zero accepts a single path-component base URL (Zero >= 0.6).
# Preserve /zero so browser cacheURL can be ${SURFSENSE_PUBLIC_URL}/zero.
reverse_proxy /zero/* zero-cache:4848
# Next.js app and frontend-owned API routes:
# /api/zero/*, /api/search, /api/contact, etc.
reverse_proxy /* frontend:3000
}
{$SURFSENSE_SITE_ADDRESS::80} {
import surfsense_proxy
}

10
docker/proxy/Dockerfile Normal file
View file

@ -0,0 +1,10 @@
FROM caddy:2-builder-alpine AS builder
RUN xcaddy build \
--with github.com/caddy-dns/cloudflare \
--with github.com/caddy-dns/digitalocean
FROM caddy:2-alpine
COPY --from=builder /usr/bin/caddy /usr/bin/caddy
COPY Caddyfile /etc/caddy/Caddyfile

View file

@ -333,11 +333,13 @@ step "Downloading SurfSense files"
info "Installation directory: ${INSTALL_DIR}" info "Installation directory: ${INSTALL_DIR}"
mkdir -p "${INSTALL_DIR}/scripts" mkdir -p "${INSTALL_DIR}/scripts"
mkdir -p "${INSTALL_DIR}/searxng" mkdir -p "${INSTALL_DIR}/searxng"
mkdir -p "${INSTALL_DIR}/proxy"
FILES=( FILES=(
"docker/docker-compose.yml:docker-compose.yml" "docker/docker-compose.yml:docker-compose.yml"
"docker/docker-compose.gpu.yml:docker-compose.gpu.yml" "docker/docker-compose.gpu.yml:docker-compose.gpu.yml"
"docker/.env.example:.env.example" "docker/.env.example:.env.example"
"docker/proxy/Caddyfile:proxy/Caddyfile"
"docker/postgresql.conf:postgresql.conf" "docker/postgresql.conf:postgresql.conf"
"docker/scripts/migrate-database.sh:scripts/migrate-database.sh" "docker/scripts/migrate-database.sh:scripts/migrate-database.sh"
"docker/searxng/settings.yml:searxng/settings.yml" "docker/searxng/settings.yml:searxng/settings.yml"
@ -532,9 +534,12 @@ _variant_display=$(grep '^SURFSENSE_VARIANT=' "${INSTALL_DIR}/.env" 2>/dev/null
_variant_display="${_variant_display:-cpu}" _variant_display="${_variant_display:-cpu}"
step "SurfSense is now installed [${_version_display}]" step "SurfSense is now installed [${_version_display}]"
info " Frontend: http://localhost:3929" _public_url=$(grep '^SURFSENSE_PUBLIC_URL=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2- | tr -d '"' | head -1 || true)
info " Backend: http://localhost:8929" _public_url="${_public_url:-http://localhost:3929}"
info " API Docs: http://localhost:8929/docs"
info " SurfSense: ${_public_url}"
info " Backend: ${_public_url}/api/v1"
info " Zero sync: ${_public_url}/zero"
info "" info ""
info " Config: ${INSTALL_DIR}/.env" info " Config: ${INSTALL_DIR}/.env"
info " Variant: ${_variant_display}" info " Variant: ${_variant_display}"

View file

@ -30,12 +30,9 @@ CELERY_TASK_DEFAULT_QUEUE=surfsense
# Optional: TTL in seconds for connector indexing lock key # Optional: TTL in seconds for connector indexing lock key
# CONNECTOR_INDEXING_LOCK_TTL_SECONDS=28800 # CONNECTOR_INDEXING_LOCK_TTL_SECONDS=28800
# Messaging Gateway (global) # Messaging Gateway: disabled by default; set TRUE to enable chat integrations.
# GATEWAY_ENABLED: master switch for ALL messaging gateway channels (Telegram, WhatsApp, # Supported messaging gateways: WhatsApp, Telegram, Discord, Slack
# Slack, Discord). When FALSE, no gateway background workers/supervisors start and all # GATEWAY_ENABLED=TRUE
# gateway HTTP routes (webhooks, OAuth callbacks, pairing) return 404. Set per-channel
# flags below to control individual platforms once the gateway is enabled.
GATEWAY_ENABLED=TRUE
# Telegram Gateway # Telegram Gateway
# TELEGRAM_WEBHOOK_SECRET must be 1-256 chars and contain only A-Z, a-z, 0-9, _ or - # TELEGRAM_WEBHOOK_SECRET must be 1-256 chars and contain only A-Z, a-z, 0-9, _ or -
@ -326,6 +323,42 @@ FILE_STORAGE_BACKEND=local
# AZURE_STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...;EndpointSuffix=core.windows.net # AZURE_STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...;EndpointSuffix=core.windows.net
# AZURE_STORAGE_CONTAINER=surfsense-documents # AZURE_STORAGE_CONTAINER=surfsense-documents
# ETL Parse Cache
# Reuse parser output for identical file bytes across workspaces (skips paid
# re-parsing on LlamaCloud / Azure DI / Unstructured). Off by default.
ETL_CACHE_ENABLED=false
# Bump to invalidate all cached entries after a parser/behaviour change.
# ETL_CACHE_PARSER_VERSION=1
# Prune entries unused for this many days.
# ETL_CACHE_TTL_DAYS=90
# Soft cap on total cached markdown; coldest entries are evicted past it.
# ETL_CACHE_MAX_TOTAL_MB=5120
# Rows deleted per eviction pass.
# ETL_CACHE_EVICTION_BATCH=500
# Optional dedicated blob storage; unset reuses the main file storage backend.
# ETL_CACHE_STORAGE_BACKEND=azure
# ETL_CACHE_STORAGE_CONTAINER=surfsense-etl-cache
# ETL_CACHE_STORAGE_LOCAL_PATH=/var/lib/surfsense/etl-cache
# Embedding Cache
# Reuse chunk+embedding output for identical markdown across workspaces (skips
# re-chunking and re-embedding). Blobs share the ETL_CACHE_STORAGE_* backend.
# Off by default.
EMBEDDING_CACHE_ENABLED=false
# Bump to invalidate all cached embedding sets after a chunker change.
# EMBEDDING_CACHE_CHUNKER_VERSION=1
# Prune entries unused for this many days.
# EMBEDDING_CACHE_TTL_DAYS=90
# Soft cap on total cached embeddings; coldest entries are evicted past it.
# EMBEDDING_CACHE_MAX_TOTAL_MB=5120
# Rows deleted per eviction pass.
# EMBEDDING_CACHE_EVICTION_BATCH=500
# Incremental re-indexing: on document edits, keep chunks whose text is
# unchanged (reusing their embeddings) and embed only new/changed ones.
# Set to false to fall back to delete-all + full re-embed (kill switch).
# CHUNK_RECONCILE_ENABLED=true
# Daytona Sandbox (isolated code execution) # Daytona Sandbox (isolated code execution)
# DAYTONA_SANDBOX_ENABLED=FALSE # DAYTONA_SANDBOX_ENABLED=FALSE
# DAYTONA_API_KEY=your-daytona-api-key # DAYTONA_API_KEY=your-daytona-api-key
@ -365,7 +398,9 @@ LANGSMITH_PROJECT=surfsense
# SURFSENSE_ENABLE_LLM_TOOL_SELECTOR=false # adds a per-turn LLM call # SURFSENSE_ENABLE_LLM_TOOL_SELECTOR=false # adds a per-turn LLM call
# Observability - OTel # Observability - OTel
# SURFSENSE_ENABLE_OTEL=false # Disabled by default. Uncomment to enable OpenTelemetry.
# SURFSENSE_ENABLE_OTEL=true
# OpenTelemetry - endpoint enables export; absent = no-op. # OpenTelemetry - endpoint enables export; absent = no-op.
# Production should point at an OTel Collector. For local docker-compose.dev.yml, # Production should point at an OTel Collector. For local docker-compose.dev.yml,
# use http://otel-lgtm:4317 instead. # use http://otel-lgtm:4317 instead.

View file

@ -4,7 +4,7 @@ Revision ID: 138
Revises: 137 Revises: 137
Create Date: 2026-04-30 Create Date: 2026-04-30
Add a single thread-level column to persist the Auto (Fastest) model pin: Add a single thread-level column to persist the Auto model pin:
- pinned_llm_config_id: concrete resolved global LLM config id used for this - pinned_llm_config_id: concrete resolved global LLM config id used for this
thread. NULL means "no pin; Auto will resolve on next turn". thread. NULL means "no pin; Auto will resolve on next turn".

View file

@ -15,6 +15,19 @@ down_revision: str | None = "157"
branch_labels: str | Sequence[str] | None = None branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None
PUBLICATION_NAME = "zero_publication"
TARGET_STATUS_LABELS = (
"pending",
"awaiting_brief",
"drafting",
"awaiting_review",
"rendering",
"ready",
"failed",
"cancelled",
)
LEGACY_STATUS_LABELS = ("pending", "generating", "ready", "failed")
def _drop_podcasts_from_publication() -> None: def _drop_podcasts_from_publication() -> None:
"""Detach podcasts from zero_publication so status can be retyped. """Detach podcasts from zero_publication so status can be retyped.
@ -28,31 +41,103 @@ def _drop_podcasts_from_publication() -> None:
published = conn.execute( published = conn.execute(
sa.text( sa.text(
"SELECT 1 FROM pg_publication_tables " "SELECT 1 FROM pg_publication_tables "
"WHERE pubname = 'zero_publication' " "WHERE pubname = :publication "
"AND schemaname = current_schema() AND tablename = 'podcasts'" "AND schemaname = current_schema() AND tablename = 'podcasts'"
) ),
{"publication": PUBLICATION_NAME},
).fetchone() ).fetchone()
if published: if published:
op.execute('ALTER PUBLICATION "zero_publication" DROP TABLE "podcasts";') op.execute(f'ALTER PUBLICATION "{PUBLICATION_NAME}" DROP TABLE "podcasts";')
def upgrade() -> None: def _enum_labels(type_name: str) -> list[str] | None:
_drop_podcasts_from_publication() rows = (
op.get_bind()
.execute(
sa.text(
"SELECT e.enumlabel "
"FROM pg_type t "
"JOIN pg_namespace n ON n.oid = t.typnamespace "
"JOIN pg_enum e ON e.enumtypid = t.oid "
"WHERE n.nspname = current_schema() AND t.typname = :type_name "
"ORDER BY e.enumsortorder"
),
{"type_name": type_name},
)
.fetchall()
)
if not rows:
return None
return [str(row[0]) for row in rows]
# Retype the status enum by swapping in a fresh type and casting existing
# rows. The legacy transient value 'generating' maps onto 'rendering'. def _column_type_name(table: str, column: str) -> str | None:
op.execute("ALTER TYPE podcast_status RENAME TO podcast_status_old;") row = (
op.get_bind()
.execute(
sa.text(
"SELECT udt_name "
"FROM information_schema.columns "
"WHERE table_schema = current_schema() "
"AND table_name = :table AND column_name = :column"
),
{"table": table, "column": column},
)
.fetchone()
)
return str(row[0]) if row else None
def _ensure_status_enum(
*,
desired_labels: tuple[str, ...],
temporary_type: str,
create_sql: str,
alter_sql: str,
default_value: str,
) -> None:
current_labels = _enum_labels("podcast_status")
desired = list(desired_labels)
if current_labels != desired:
if current_labels is None:
if _enum_labels(temporary_type) is None:
raise RuntimeError("podcast_status enum is missing")
elif _enum_labels(temporary_type) is None:
op.execute(f"ALTER TYPE podcast_status RENAME TO {temporary_type};")
else:
raise RuntimeError(
"podcast_status and its temporary replacement both exist"
)
if _enum_labels("podcast_status") is None:
op.execute(create_sql)
if _enum_labels("podcast_status") != desired:
raise RuntimeError("podcast_status enum is not in the expected shape")
op.execute("ALTER TABLE podcasts ALTER COLUMN status DROP DEFAULT;")
if _column_type_name("podcasts", "status") != "podcast_status":
op.execute(alter_sql)
op.execute( op.execute(
""" f"ALTER TABLE podcasts ALTER COLUMN status SET DEFAULT '{default_value}';"
)
if _enum_labels(temporary_type) is not None:
op.execute(f"DROP TYPE {temporary_type};")
def _upgrade_status_enum() -> None:
_ensure_status_enum(
desired_labels=TARGET_STATUS_LABELS,
temporary_type="podcast_status_old",
create_sql="""
CREATE TYPE podcast_status AS ENUM ( CREATE TYPE podcast_status AS ENUM (
'pending', 'awaiting_brief', 'drafting', 'awaiting_review', 'pending', 'awaiting_brief', 'drafting', 'awaiting_review',
'rendering', 'ready', 'failed', 'cancelled' 'rendering', 'ready', 'failed', 'cancelled'
); );
""" """,
) alter_sql="""
op.execute("ALTER TABLE podcasts ALTER COLUMN status DROP DEFAULT;")
op.execute(
"""
ALTER TABLE podcasts ALTER TABLE podcasts
ALTER COLUMN status TYPE podcast_status ALTER COLUMN status TYPE podcast_status
USING ( USING (
@ -61,10 +146,43 @@ def upgrade() -> None:
ELSE status::text ELSE status::text
END END
)::podcast_status; )::podcast_status;
""" """,
default_value="pending",
) )
op.execute("ALTER TABLE podcasts ALTER COLUMN status SET DEFAULT 'pending';")
op.execute("DROP TYPE podcast_status_old;")
def _downgrade_status_enum() -> None:
_ensure_status_enum(
desired_labels=LEGACY_STATUS_LABELS,
temporary_type="podcast_status_new",
create_sql=(
"CREATE TYPE podcast_status AS ENUM "
"('pending', 'generating', 'ready', 'failed');"
),
alter_sql="""
ALTER TABLE podcasts
ALTER COLUMN status TYPE podcast_status
USING (
CASE status::text
WHEN 'awaiting_brief' THEN 'pending'
WHEN 'drafting' THEN 'generating'
WHEN 'awaiting_review' THEN 'generating'
WHEN 'rendering' THEN 'generating'
WHEN 'cancelled' THEN 'failed'
ELSE status::text
END
)::podcast_status;
""",
default_value="ready",
)
def upgrade() -> None:
_drop_podcasts_from_publication()
# Retype the status enum by swapping in a fresh type and casting existing
# rows. The legacy transient value 'generating' maps onto 'rendering'.
_upgrade_status_enum()
op.execute("ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS source_content TEXT;") op.execute("ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS source_content TEXT;")
op.execute("ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS spec JSONB;") op.execute("ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS spec JSONB;")
@ -83,6 +201,8 @@ def upgrade() -> None:
def downgrade() -> None: def downgrade() -> None:
_drop_podcasts_from_publication()
op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS error;") op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS error;")
op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS duration_seconds;") op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS duration_seconds;")
op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS storage_key;") op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS storage_key;")
@ -92,27 +212,4 @@ def downgrade() -> None:
op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS source_content;") op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS source_content;")
# Collapse the expanded lifecycle back onto the original four values. # Collapse the expanded lifecycle back onto the original four values.
op.execute("ALTER TYPE podcast_status RENAME TO podcast_status_new;") _downgrade_status_enum()
op.execute(
"CREATE TYPE podcast_status AS ENUM "
"('pending', 'generating', 'ready', 'failed');"
)
op.execute("ALTER TABLE podcasts ALTER COLUMN status DROP DEFAULT;")
op.execute(
"""
ALTER TABLE podcasts
ALTER COLUMN status TYPE podcast_status
USING (
CASE status::text
WHEN 'awaiting_brief' THEN 'pending'
WHEN 'drafting' THEN 'generating'
WHEN 'awaiting_review' THEN 'generating'
WHEN 'rendering' THEN 'generating'
WHEN 'cancelled' THEN 'failed'
ELSE status::text
END
)::podcast_status;
"""
)
op.execute("ALTER TABLE podcasts ALTER COLUMN status SET DEFAULT 'ready';")
op.execute("DROP TYPE podcast_status_new;")

View file

@ -0,0 +1,299 @@
"""add model connections
Revision ID: 160
Revises: 159
"""
from collections.abc import Sequence
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from alembic import op
# Alembic revision identifiers: this revision (160) applies on top of 159.
revision: str = "160"
down_revision: str | None = "159"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None

# PostgreSQL enum used by the ``connections.scope`` column.
# ``create_type=False`` keeps table creation from emitting CREATE TYPE on its
# own; upgrade() creates the type explicitly with ``checkfirst=True`` instead.
connection_scope = postgresql.ENUM(
    "GLOBAL",
    "SEARCH_SPACE",
    "USER",
    name="connectionscope",
    create_type=False,
)

# PostgreSQL enum used by the ``models.source`` column; created explicitly in
# upgrade() in the same way as ``connection_scope``.
model_source = postgresql.ENUM(
    "DISCOVERED",
    "MANUAL",
    name="modelsource",
    create_type=False,
)
def _table_exists(table_name: str) -> bool:
    """Return True when the inspector lists ``table_name`` among the tables."""
    inspector = sa.inspect(op.get_bind())
    known_tables = inspector.get_table_names()
    return table_name in known_tables
def _column_exists(table_name: str, column_name: str) -> bool:
    """Return True when ``table_name`` exists and has a column ``column_name``.

    A missing table yields False rather than an inspection error.
    """
    if _table_exists(table_name):
        inspector = sa.inspect(op.get_bind())
        present = {entry["name"] for entry in inspector.get_columns(table_name)}
        return column_name in present
    return False
def _index_exists(table_name: str, index_name: str) -> bool:
    """Return True when ``table_name`` exists and has an index ``index_name``.

    A missing table yields False rather than an inspection error.
    """
    if _table_exists(table_name):
        inspector = sa.inspect(op.get_bind())
        present = {entry["name"] for entry in inspector.get_indexes(table_name)}
        return index_name in present
    return False
def _create_index_if_missing(
    index_name: str,
    table_name: str,
    columns: list[str],
) -> None:
    """Create a non-unique index on ``columns`` unless ``index_name`` already exists."""
    already_there = _index_exists(table_name, index_name)
    if already_there:
        return
    op.create_index(index_name, table_name, columns, unique=False)
def _add_searchspace_column_if_missing(
    column_name: str,
    *,
    server_default: object | None = None,
) -> None:
    """Add a nullable Integer column to ``searchspaces`` unless it is already present.

    ``server_default`` is forwarded unchanged to the new column definition.
    """
    if _column_exists("searchspaces", column_name):
        return
    new_column = sa.Column(
        column_name,
        sa.Integer(),
        nullable=True,
        server_default=server_default,
    )
    op.add_column("searchspaces", new_column)
def _drop_column_if_exists(table_name: str, column_name: str) -> None:
    """Drop ``column_name`` from ``table_name``; do nothing when it is absent."""
    if not _column_exists(table_name, column_name):
        return
    op.drop_column(table_name, column_name)
def _drop_index_if_exists(table_name: str, index_name: str) -> None:
    """Drop ``index_name`` from ``table_name``; do nothing when it is absent."""
    if not _index_exists(table_name, index_name):
        return
    op.drop_index(index_name, table_name=table_name)
def upgrade() -> None:
    """Create or reconcile the ``connections`` and ``models`` tables.

    Every step is guarded by an existence check, so the migration can run
    against a database where parts of this schema are already present:

    1. Create the ``connectionscope`` / ``modelsource`` enum types if absent.
    2. ``connections``: create the table, or normalise an existing one so it
       has a NOT NULL ``provider`` column and no ``protocol`` column/index.
    3. ``models``: create the table, or add the ``supports_*`` /
       ``max_input_tokens`` columns and drop the old ``capabilities*`` ones.
    4. ``searchspaces``: ensure ``chat_model_id``, ``image_gen_model_id`` and
       ``vision_model_id`` exist with server default 0 and no NULL values.
    5. Drop the ``connectionprotocol`` enum type if it exists.
    """
    bind = op.get_bind()
    # The enum objects are declared with create_type=False, so create the
    # types explicitly; checkfirst=True skips types that already exist.
    connection_scope.create(bind, checkfirst=True)
    model_source.create(bind, checkfirst=True)
    if _table_exists("connections"):
        # Existing table: converge on a single ``provider`` column. The column
        # may currently be named ``litellm_provider`` or ``native_provider``.
        if _column_exists("connections", "litellm_provider") and not _column_exists(
            "connections", "provider"
        ):
            op.alter_column(
                "connections",
                "litellm_provider",
                new_column_name="provider",
                existing_type=sa.String(length=100),
                existing_nullable=True,
            )
            # NOTE(review): SET NOT NULL fails if any existing row still has a
            # NULL provider - confirm no such rows can exist.
            op.alter_column(
                "connections",
                "provider",
                existing_type=sa.String(length=100),
                nullable=False,
            )
        elif _column_exists("connections", "native_provider") and not _column_exists(
            "connections", "provider"
        ):
            op.alter_column(
                "connections",
                "native_provider",
                new_column_name="provider",
                existing_type=sa.String(length=100),
                existing_nullable=True,
            )
            # NOTE(review): same NOT NULL caveat as the branch above.
            op.alter_column(
                "connections",
                "provider",
                existing_type=sa.String(length=100),
                nullable=False,
            )
        elif not _column_exists("connections", "provider"):
            # NOTE(review): adding a NOT NULL column without a default fails
            # on a non-empty table - confirm this branch only sees empty ones.
            op.add_column(
                "connections",
                sa.Column("provider", sa.String(length=100), nullable=False),
            )
        # The ``protocol`` column and its index are removed from existing tables.
        _drop_index_if_exists("connections", "ix_connections_protocol")
        _drop_column_if_exists("connections", "protocol")
    else:
        op.create_table(
            "connections",
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
            sa.Column("provider", sa.String(length=100), nullable=False),
            sa.Column("base_url", sa.String(length=500), nullable=True),
            sa.Column("api_key", sa.String(), nullable=True),
            sa.Column(
                "extra",
                postgresql.JSONB(astext_type=sa.Text()),
                server_default=sa.text("'{}'::jsonb"),
                nullable=False,
            ),
            sa.Column("scope", connection_scope, nullable=False),
            sa.Column(
                "enabled", sa.Boolean(), server_default=sa.text("true"), nullable=False
            ),
            sa.Column("search_space_id", sa.Integer(), nullable=True),
            sa.Column("user_id", sa.UUID(), nullable=True),
            # Ownership must match the scope: GLOBAL rows have no owner,
            # SEARCH_SPACE rows need both a search space and a user, and USER
            # rows need a user.
            sa.CheckConstraint(
                "(scope = 'GLOBAL' AND search_space_id IS NULL AND user_id IS NULL) OR "
                "(scope = 'SEARCH_SPACE' AND search_space_id IS NOT NULL AND user_id IS NOT NULL) OR "
                "(scope = 'USER' AND user_id IS NOT NULL)",
                name="ck_connections_scope_owner",
            ),
            sa.ForeignKeyConstraint(
                ["search_space_id"], ["searchspaces.id"], ondelete="CASCADE"
            ),
            sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
            sa.PrimaryKeyConstraint("id"),
        )
    # Rename an index left over from either old column name, but only when
    # the target name is still free; otherwise the index is created below.
    if _index_exists(
        "connections", "ix_connections_native_provider"
    ) and not _index_exists("connections", "ix_connections_provider"):
        op.execute(
            "ALTER INDEX ix_connections_native_provider "
            "RENAME TO ix_connections_provider"
        )
    if _index_exists(
        "connections", "ix_connections_litellm_provider"
    ) and not _index_exists("connections", "ix_connections_provider"):
        op.execute(
            "ALTER INDEX ix_connections_litellm_provider "
            "RENAME TO ix_connections_provider"
        )
    _create_index_if_missing("ix_connections_provider", "connections", ["provider"])
    _create_index_if_missing("ix_connections_scope", "connections", ["scope"])
    if not _table_exists("models"):
        op.create_table(
            "models",
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
            sa.Column("connection_id", sa.Integer(), nullable=False),
            sa.Column("model_id", sa.String(length=255), nullable=False),
            sa.Column("display_name", sa.String(length=255), nullable=True),
            sa.Column(
                "source",
                model_source,
                server_default="DISCOVERED",
                nullable=False,
            ),
            sa.Column("supports_chat", sa.Boolean(), nullable=True),
            sa.Column("max_input_tokens", sa.Integer(), nullable=True),
            sa.Column("supports_image_input", sa.Boolean(), nullable=True),
            sa.Column("supports_tools", sa.Boolean(), nullable=True),
            sa.Column("supports_image_generation", sa.Boolean(), nullable=True),
            sa.Column(
                "capabilities_override",
                postgresql.JSONB(astext_type=sa.Text()),
                server_default=sa.text("'{}'::jsonb"),
                nullable=False,
            ),
            sa.Column(
                "enabled", sa.Boolean(), server_default=sa.text("true"), nullable=False
            ),
            sa.Column("billing_tier", sa.String(length=50), nullable=True),
            sa.Column(
                "catalog",
                postgresql.JSONB(astext_type=sa.Text()),
                server_default=sa.text("'{}'::jsonb"),
                nullable=False,
            ),
            sa.ForeignKeyConstraint(
                ["connection_id"], ["connections.id"], ondelete="CASCADE"
            ),
            sa.PrimaryKeyConstraint("id"),
            # A given model id may appear only once per connection.
            sa.UniqueConstraint(
                "connection_id", "model_id", name="uq_models_connection_model_id"
            ),
        )
    else:
        # Existing table: add whichever capability columns are still missing.
        if not _column_exists("models", "supports_chat"):
            op.add_column(
                "models", sa.Column("supports_chat", sa.Boolean(), nullable=True)
            )
        if not _column_exists("models", "max_input_tokens"):
            op.add_column(
                "models", sa.Column("max_input_tokens", sa.Integer(), nullable=True)
            )
        if not _column_exists("models", "supports_image_input"):
            op.add_column(
                "models", sa.Column("supports_image_input", sa.Boolean(), nullable=True)
            )
        if not _column_exists("models", "supports_tools"):
            op.add_column(
                "models", sa.Column("supports_tools", sa.Boolean(), nullable=True)
            )
        if not _column_exists("models", "supports_image_generation"):
            op.add_column(
                "models",
                sa.Column("supports_image_generation", sa.Boolean(), nullable=True),
            )
        # Remove the old capability columns if they are present.
        _drop_column_if_exists("models", "capabilities")
        _drop_column_if_exists("models", "capabilities_declared")
        _drop_column_if_exists("models", "capabilities_verified")
    _create_index_if_missing("ix_models_connection_id", "models", ["connection_id"])
    _create_index_if_missing("ix_models_model_id", "models", ["model_id"])
    _create_index_if_missing("ix_models_billing_tier", "models", ["billing_tier"])
    # Per-search-space model selections, added as nullable integers with
    # server default 0.
    _add_searchspace_column_if_missing("chat_model_id", server_default=sa.text("0"))
    _add_searchspace_column_if_missing(
        "image_gen_model_id", server_default=sa.text("0")
    )
    _add_searchspace_column_if_missing("vision_model_id", server_default=sa.text("0"))
    # Re-assert the server default so columns that already existed (and were
    # therefore skipped above) end up with the same default.
    for column_name in ("chat_model_id", "image_gen_model_id", "vision_model_id"):
        op.alter_column(
            "searchspaces",
            column_name,
            existing_type=sa.Integer(),
            existing_nullable=True,
            server_default=sa.text("0"),
        )
    # Backfill any remaining NULLs to 0 in a single pass.
    op.execute(
        """
UPDATE searchspaces
SET
chat_model_id = COALESCE(chat_model_id, 0),
image_gen_model_id = COALESCE(image_gen_model_id, 0),
vision_model_id = COALESCE(vision_model_id, 0)
"""
    )
    # NOTE(review): presumably the enum that backed the dropped ``protocol``
    # column - confirm nothing else references it.
    op.execute("DROP TYPE IF EXISTS connectionprotocol")
def downgrade() -> None:
    """Remove everything this revision manages, tolerating objects already gone.

    upgrade() is fully guarded by existence checks, so the downgrade uses the
    same helpers: a re-run, or a database that only ever had part of this
    schema, no longer aborts on the first missing column, index or table.

    Note that ``connections`` and ``models`` are dropped outright, even when
    they existed before this revision and upgrade() only reconciled them.
    """
    for column_name in ("vision_model_id", "image_gen_model_id", "chat_model_id"):
        _drop_column_if_exists("searchspaces", column_name)

    # ``models`` references ``connections``, so it is removed first.
    for index_name in (
        "ix_models_billing_tier",
        "ix_models_model_id",
        "ix_models_connection_id",
    ):
        _drop_index_if_exists("models", index_name)
    if _table_exists("models"):
        op.drop_table("models")

    for index_name in ("ix_connections_scope", "ix_connections_provider"):
        _drop_index_if_exists("connections", index_name)
    if _table_exists("connections"):
        op.drop_table("connections")

    # The enum types can only go once the tables using them are gone.
    bind = op.get_bind()
    model_source.drop(bind, checkfirst=True)
    connection_scope.drop(bind, checkfirst=True)

View file

@ -0,0 +1,270 @@
"""remove legacy model config tables
Revision ID: 161
Revises: 160
"""
from collections.abc import Sequence
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from sqlalchemy.types import TypeEngine
from alembic import op
# Alembic revision identifiers: this revision (161) applies on top of 160.
revision: str = "161"
down_revision: str | None = "160"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None

# Enum definitions used only by downgrade(), which recreates the types that
# upgrade() drops by name and uses them as the ``provider`` column types.
# ``create_type=False`` keeps table creation from emitting CREATE TYPE; the
# downgrade creates each type explicitly with ``checkfirst=True``.
# NOTE(review): the label lists presumably match the pre-migration types -
# confirm against the models that originally declared them.

# Provider enum for ``new_llm_configs.provider``.
litellm_provider = postgresql.ENUM(
    "OPENAI",
    "ANTHROPIC",
    "GOOGLE",
    "AZURE_OPENAI",
    "BEDROCK",
    "VERTEX_AI",
    "GROQ",
    "COHERE",
    "MISTRAL",
    "DEEPSEEK",
    "XAI",
    "OPENROUTER",
    "TOGETHER_AI",
    "FIREWORKS_AI",
    "REPLICATE",
    "PERPLEXITY",
    "OLLAMA",
    "ALIBABA_QWEN",
    "MOONSHOT",
    "ZHIPU",
    "ANYSCALE",
    "DEEPINFRA",
    "CEREBRAS",
    "SAMBANOVA",
    "AI21",
    "CLOUDFLARE",
    "DATABRICKS",
    "COMETAPI",
    "HUGGINGFACE",
    "GITHUB_MODELS",
    "MINIMAX",
    "CUSTOM",
    name="litellmprovider",
    create_type=False,
)

# Provider enum for ``image_generation_configs.provider``.
image_gen_provider = postgresql.ENUM(
    "OPENAI",
    "AZURE_OPENAI",
    "GOOGLE",
    "VERTEX_AI",
    "BEDROCK",
    "RECRAFT",
    "OPENROUTER",
    "XINFERENCE",
    "NSCALE",
    name="imagegenprovider",
    create_type=False,
)

# Provider enum for ``vision_llm_configs.provider``.
vision_provider = postgresql.ENUM(
    "OPENAI",
    "ANTHROPIC",
    "GOOGLE",
    "AZURE_OPENAI",
    "VERTEX_AI",
    "BEDROCK",
    "XAI",
    "OPENROUTER",
    "OLLAMA",
    "GROQ",
    "TOGETHER_AI",
    "FIREWORKS_AI",
    "DEEPSEEK",
    "MISTRAL",
    "CUSTOM",
    name="visionprovider",
    create_type=False,
)
def _table_exists(table_name: str) -> bool:
    """Return True when the inspector lists ``table_name`` among the tables."""
    inspector = sa.inspect(op.get_bind())
    known_tables = inspector.get_table_names()
    return table_name in known_tables
def _column_exists(table_name: str, column_name: str) -> bool:
    """Return True when ``table_name`` exists and has a column ``column_name``.

    A missing table yields False rather than an inspection error.
    """
    if _table_exists(table_name):
        inspector = sa.inspect(op.get_bind())
        present = {entry["name"] for entry in inspector.get_columns(table_name)}
        return column_name in present
    return False
def _drop_column_if_exists(table_name: str, column_name: str) -> None:
    """Drop ``column_name`` from ``table_name``; do nothing when it is absent."""
    if not _column_exists(table_name, column_name):
        return
    op.drop_column(table_name, column_name)
def _rename_column_if_exists(
    table_name: str,
    old_column_name: str,
    new_column_name: str,
    *,
    existing_type: TypeEngine,
    existing_nullable: bool = True,
) -> None:
    """Rename a column only when the old name exists and the new name is free.

    ``existing_type`` and ``existing_nullable`` are passed straight through to
    ``op.alter_column``. In every other situation the call is a no-op.
    """
    if not _column_exists(table_name, old_column_name):
        return
    if _column_exists(table_name, new_column_name):
        return
    op.alter_column(
        table_name,
        old_column_name,
        new_column_name=new_column_name,
        existing_type=existing_type,
        existing_nullable=existing_nullable,
    )
def upgrade() -> None:
    """Drop the legacy config tables, their searchspace columns and enum types.

    Also renames ``image_generations.image_generation_config_id`` to
    ``image_gen_model_id``. Every step is skipped when its target is absent.
    """
    legacy_tables = (
        "new_llm_configs",
        "vision_llm_configs",
        "image_generation_configs",
    )
    for legacy_table in legacy_tables:
        if not _table_exists(legacy_table):
            continue
        op.drop_table(legacy_table)

    legacy_columns = (
        "agent_llm_id",
        "image_generation_config_id",
        "vision_llm_config_id",
    )
    for legacy_column in legacy_columns:
        _drop_column_if_exists("searchspaces", legacy_column)

    _rename_column_if_exists(
        "image_generations",
        "image_generation_config_id",
        "image_gen_model_id",
        existing_type=sa.Integer(),
    )

    # The enum types go last, after the tables that used them.
    drop_type_statements = (
        "DROP TYPE IF EXISTS litellmprovider",
        "DROP TYPE IF EXISTS imagegenprovider",
        "DROP TYPE IF EXISTS visionprovider",
    )
    for statement in drop_type_statements:
        op.execute(statement)
def downgrade() -> None:
    """Recreate the legacy config tables, columns and enum types.

    Restores the schema only: the rows removed by upgrade() are not brought
    back, and the re-added ``searchspaces`` columns are plain nullable
    integers. Each step is skipped when its target already exists.
    """
    bind = op.get_bind()
    # The enum types must exist before the tables that use them are created.
    litellm_provider.create(bind, checkfirst=True)
    image_gen_provider.create(bind, checkfirst=True)
    vision_provider.create(bind, checkfirst=True)
    # Undo the column rename performed by upgrade().
    _rename_column_if_exists(
        "image_generations",
        "image_gen_model_id",
        "image_generation_config_id",
        existing_type=sa.Integer(),
    )
    if _table_exists("searchspaces"):
        # Re-add the per-search-space config references as nullable integers.
        if not _column_exists("searchspaces", "agent_llm_id"):
            op.add_column(
                "searchspaces",
                sa.Column("agent_llm_id", sa.Integer(), nullable=True),
            )
        if not _column_exists("searchspaces", "image_generation_config_id"):
            op.add_column(
                "searchspaces",
                sa.Column("image_generation_config_id", sa.Integer(), nullable=True),
            )
        if not _column_exists("searchspaces", "vision_llm_config_id"):
            op.add_column(
                "searchspaces",
                sa.Column("vision_llm_config_id", sa.Integer(), nullable=True),
            )
    # Each table below is created together with its ``name`` index, and only
    # when the table itself is missing.
    if not _table_exists("image_generation_configs"):
        op.create_table(
            "image_generation_configs",
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
            sa.Column("name", sa.String(length=100), nullable=False),
            sa.Column("description", sa.String(length=500), nullable=True),
            sa.Column("provider", image_gen_provider, nullable=False),
            sa.Column("custom_provider", sa.String(length=100), nullable=True),
            sa.Column("model_name", sa.String(length=100), nullable=False),
            sa.Column("api_key", sa.String(), nullable=False),
            sa.Column("api_base", sa.String(length=500), nullable=True),
            sa.Column("api_version", sa.String(length=50), nullable=True),
            sa.Column("litellm_params", sa.JSON(), nullable=True),
            sa.Column("search_space_id", sa.Integer(), nullable=False),
            sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
            sa.ForeignKeyConstraint(
                ["search_space_id"], ["searchspaces.id"], ondelete="CASCADE"
            ),
            sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
            sa.PrimaryKeyConstraint("id"),
        )
        op.create_index(
            op.f("ix_image_generation_configs_name"),
            "image_generation_configs",
            ["name"],
            unique=False,
        )
    if not _table_exists("vision_llm_configs"):
        op.create_table(
            "vision_llm_configs",
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
            sa.Column("name", sa.String(length=100), nullable=False),
            sa.Column("description", sa.String(length=500), nullable=True),
            sa.Column("provider", vision_provider, nullable=False),
            sa.Column("custom_provider", sa.String(length=100), nullable=True),
            sa.Column("model_name", sa.String(length=100), nullable=False),
            sa.Column("api_key", sa.String(), nullable=False),
            sa.Column("api_base", sa.String(length=500), nullable=True),
            sa.Column("api_version", sa.String(length=50), nullable=True),
            sa.Column("litellm_params", sa.JSON(), nullable=True),
            sa.Column("search_space_id", sa.Integer(), nullable=False),
            sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
            sa.ForeignKeyConstraint(
                ["search_space_id"], ["searchspaces.id"], ondelete="CASCADE"
            ),
            sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
            sa.PrimaryKeyConstraint("id"),
        )
        op.create_index(
            op.f("ix_vision_llm_configs_name"),
            "vision_llm_configs",
            ["name"],
            unique=False,
        )
    if not _table_exists("new_llm_configs"):
        # Unlike the two tables above, this one has no ``api_version`` column
        # and adds the system-instruction / citation settings.
        op.create_table(
            "new_llm_configs",
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
            sa.Column("name", sa.String(length=100), nullable=False),
            sa.Column("description", sa.String(length=500), nullable=True),
            sa.Column("provider", litellm_provider, nullable=False),
            sa.Column("custom_provider", sa.String(length=100), nullable=True),
            sa.Column("model_name", sa.String(length=100), nullable=False),
            sa.Column("api_key", sa.String(), nullable=False),
            sa.Column("api_base", sa.String(length=500), nullable=True),
            sa.Column("litellm_params", sa.JSON(), nullable=True),
            sa.Column("system_instructions", sa.Text(), nullable=False),
            sa.Column("use_default_system_instructions", sa.Boolean(), nullable=False),
            sa.Column("citations_enabled", sa.Boolean(), nullable=False),
            sa.Column("search_space_id", sa.Integer(), nullable=False),
            sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
            sa.ForeignKeyConstraint(
                ["search_space_id"], ["searchspaces.id"], ondelete="CASCADE"
            ),
            sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
            sa.PrimaryKeyConstraint("id"),
        )
        op.create_index(
            op.f("ix_new_llm_configs_name"),
            "new_llm_configs",
            ["name"],
            unique=False,
        )

View file

@ -0,0 +1,53 @@
"""add etl_cache_parses table for content-addressed parse reuse
Revision ID: 162
Revises: 161
"""
from collections.abc import Sequence
from alembic import op
# Alembic revision identifiers: this revision (162) applies on top of 161.
revision: str = "162"
down_revision: str | None = "161"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create ``etl_cache_parses`` plus indexes on its two timestamp columns.

    All three statements use ``IF NOT EXISTS``, so running this against a
    database that already has the objects is a no-op.
    """
    create_table_sql = """
CREATE TABLE IF NOT EXISTS etl_cache_parses (
id SERIAL PRIMARY KEY,
source_sha256 VARCHAR(64) NOT NULL,
etl_service VARCHAR(32) NOT NULL,
mode VARCHAR(16) NOT NULL,
parser_version INTEGER NOT NULL,
storage_backend VARCHAR(32) NOT NULL,
storage_key TEXT NOT NULL,
size_bytes BIGINT NOT NULL,
content_type VARCHAR(32) NOT NULL,
actual_pages INTEGER NOT NULL DEFAULT 0,
times_reused BIGINT NOT NULL DEFAULT 0,
last_used_at TIMESTAMP WITH TIME ZONE NOT NULL,
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
CONSTRAINT uq_etl_cache_parses_key
UNIQUE (source_sha256, etl_service, mode, parser_version)
);
"""
    last_used_index_sql = (
        "CREATE INDEX IF NOT EXISTS ix_etl_cache_parses_last_used_at "
        "ON etl_cache_parses(last_used_at);"
    )
    created_at_index_sql = (
        "CREATE INDEX IF NOT EXISTS ix_etl_cache_parses_created_at "
        "ON etl_cache_parses(created_at);"
    )
    # Table first, then the indexes that depend on it.
    for statement in (create_table_sql, last_used_index_sql, created_at_index_sql):
        op.execute(statement)
def downgrade() -> None:
    """Drop the ``etl_cache_parses`` indexes and then the table, if present."""
    teardown_statements = (
        "DROP INDEX IF EXISTS ix_etl_cache_parses_created_at;",
        "DROP INDEX IF EXISTS ix_etl_cache_parses_last_used_at;",
        "DROP TABLE IF EXISTS etl_cache_parses;",
    )
    for statement in teardown_statements:
        op.execute(statement)

View file

@ -0,0 +1,53 @@
"""add embedding_cache_sets table for content-addressed embedding reuse
Revision ID: 163
Revises: 162
"""
from collections.abc import Sequence
from alembic import op
# Alembic revision identifiers: this revision (163) applies on top of 162.
revision: str = "163"
down_revision: str | None = "162"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create ``embedding_cache_sets`` plus indexes on its two timestamp columns.

    All three statements use ``IF NOT EXISTS``, so running this against a
    database that already has the objects is a no-op.
    """
    create_table_sql = """
CREATE TABLE IF NOT EXISTS embedding_cache_sets (
id SERIAL PRIMARY KEY,
markdown_sha256 VARCHAR(64) NOT NULL,
embedding_model VARCHAR(255) NOT NULL,
embedding_dim INTEGER NOT NULL,
chunker_kind VARCHAR(8) NOT NULL,
chunker_version INTEGER NOT NULL,
storage_backend VARCHAR(32) NOT NULL,
storage_key TEXT NOT NULL,
size_bytes BIGINT NOT NULL,
chunk_count INTEGER NOT NULL DEFAULT 0,
times_reused BIGINT NOT NULL DEFAULT 0,
last_used_at TIMESTAMP WITH TIME ZONE NOT NULL,
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
CONSTRAINT uq_embedding_cache_sets_key
UNIQUE (markdown_sha256, embedding_model, chunker_kind, chunker_version)
);
"""
    last_used_index_sql = (
        "CREATE INDEX IF NOT EXISTS ix_embedding_cache_sets_last_used_at "
        "ON embedding_cache_sets(last_used_at);"
    )
    created_at_index_sql = (
        "CREATE INDEX IF NOT EXISTS ix_embedding_cache_sets_created_at "
        "ON embedding_cache_sets(created_at);"
    )
    # Table first, then the indexes that depend on it.
    for statement in (create_table_sql, last_used_index_sql, created_at_index_sql):
        op.execute(statement)
def downgrade() -> None:
    """Drop the ``embedding_cache_sets`` indexes and then the table, if present."""
    teardown_statements = (
        "DROP INDEX IF EXISTS ix_embedding_cache_sets_created_at;",
        "DROP INDEX IF EXISTS ix_embedding_cache_sets_last_used_at;",
        "DROP TABLE IF EXISTS embedding_cache_sets;",
    )
    for statement in teardown_statements:
        op.execute(statement)

View file

@ -0,0 +1,219 @@
"""remove users that never logged back in (last_login IS NULL)
Migration 103 added ``user.last_login``. Any user whose ``last_login`` is still
NULL has never authenticated since that column existed, i.e. they never logged
back in. This migration purges those users together with everything that hangs
off them: the search spaces they own, and (via ON DELETE CASCADE)
``searchspaces -> documents -> chunks`` plus all other user/space-scoped rows.
This runs BEFORE the chunks.position backfill (revision 165) on purpose: it
removes a large amount of dead chunk data first, so the expensive backfill has
far fewer rows to rewrite.
Work is done in committed batches (not one giant cascading DELETE) so that on a
large table it streams progress to the alembic console, keeps each transaction
small, bounds WAL/bloat growth, and is resumable if interrupted.
Revision ID: 164
Revises: 163
"""
import logging
import time
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers: this revision (164) applies on top of 163.
revision: str = "164"
down_revision: str | None = "163"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None

# Documents removed per committed batch. Each document delete cascades to its
# chunks (via ix_chunks_document_id), so keep this modest to bound batch size.
DOC_BATCH = 1_000
# Users removed per committed batch. Each cascades to owned search spaces and
# the remaining space-/user-scoped rows.
USER_BATCH = 500
# Minimum seconds between progress log lines (keeps the console readable).
LOG_EVERY_SECONDS = 5.0

# Names of the UNLOGGED scratch tables that upgrade() builds to hold the ids
# of the users and documents to delete; both are dropped and rebuilt per run.
USER_SCRATCH = "_inactive_user_ids"
DOC_SCRATCH = "_inactive_doc_ids"

# Log through alembic's own migration logger so progress lines appear in the
# alembic console output.
logger = logging.getLogger("alembic.runtime.migration")
def _fmt_duration(seconds: float) -> str:
seconds = int(seconds)
h, rem = divmod(seconds, 3600)
m, s = divmod(rem, 60)
if h:
return f"{h}h{m:02d}m{s:02d}s"
if m:
return f"{m}m{s:02d}s"
return f"{s}s"
def upgrade() -> None:
"""Delete every user whose ``last_login`` is NULL, and the documents they own.

Steps, in order:

1. Snapshot the ids of all ``"user"`` rows with ``last_login IS NULL`` into
   the ``USER_SCRATCH`` table. If there are none, drop the scratch table and
   return.
2. Snapshot into ``DOC_SCRATCH`` the ids of all documents that live in a
   search space whose ``user_id`` is one of those users.
3. Phase 1: delete those documents in batches of ``DOC_BATCH`` via
   ``_batched_delete``, then drop ``DOC_SCRATCH``.
4. Phase 2: delete the users in batches of ``USER_BATCH`` via
   ``_batched_delete``, then drop ``USER_SCRATCH``.

Both scratch tables are dropped and rebuilt at the start of a run, so a
re-run after an interruption works from whichever users still match.
"""
# Connection used below for the count(*) queries and handed to
# _batched_delete for the batched statements.
bind = op.get_bind()
# Run the heavy work outside the migration's single transaction so each
# batch can commit on its own.
with op.get_context().autocommit_block():
# Materialize the target user ids once. Rebuilt from scratch on every
# run, so a re-run after an interruption simply picks up whoever still
# has NULL last_login -> the migration is idempotent and resumable.
op.execute(f"DROP TABLE IF EXISTS {USER_SCRATCH};")
op.execute(
f"CREATE UNLOGGED TABLE {USER_SCRATCH} AS "
'SELECT id FROM "user" WHERE last_login IS NULL;'
)
# NOTE(review): the primary key presumably exists so the id lookups done
# by _batched_delete against the scratch table are indexed -- confirm.
op.execute(f"ALTER TABLE {USER_SCRATCH} ADD PRIMARY KEY (id);")
# ``or 0`` turns a falsy scalar() result into the integer 0.
total_users = (
bind.execute(sa.text(f"SELECT count(*) FROM {USER_SCRATCH}")).scalar() or 0
)
if total_users == 0:
logger.info("no users with NULL last_login; nothing to remove")
# Clean up the (empty) scratch table before the early exit.
op.execute(f"DROP TABLE IF EXISTS {USER_SCRATCH};")
return
# The count is pre-formatted with thousands separators for the log line.
logger.info(
"found %s users with NULL last_login (never logged back in); "
"removing them and all data in search spaces they own",
f"{total_users:,}",
)
# Documents living in search spaces owned by those users. Deleting these
# explicitly (in batches) is what bounds the otherwise-unbounded
# chunks cascade.
op.execute(f"DROP TABLE IF EXISTS {DOC_SCRATCH};")
op.execute(
f"""
CREATE UNLOGGED TABLE {DOC_SCRATCH} AS
SELECT d.id
FROM documents d
JOIN searchspaces s ON s.id = d.search_space_id
WHERE s.user_id IN (SELECT id FROM {USER_SCRATCH});
"""
)
op.execute(f"ALTER TABLE {DOC_SCRATCH} ADD PRIMARY KEY (id);")
# Used as the denominator for phase 1 progress reporting.
total_docs = (
bind.execute(sa.text(f"SELECT count(*) FROM {DOC_SCRATCH}")).scalar() or 0
)
# Phase 1: delete documents (cascades chunks, document_versions,
# document_files) in committed batches.
logger.info(
"phase 1/2: deleting %s documents (cascades their chunks) "
"in batches of %s...",
f"{total_docs:,}",
f"{DOC_BATCH:,}",
)
_batched_delete(
bind,
scratch=DOC_SCRATCH,
target_table="documents",
target_col="id",
batch_size=DOC_BATCH,
total=total_docs,
label="documents",
)
# The document id list is no longer needed once phase 1 is done.
op.execute(f"DROP TABLE IF EXISTS {DOC_SCRATCH};")
# Phase 2: delete the users themselves. This cascades the now-empty
# search spaces plus all remaining user-/space-scoped rows.
logger.info(
"phase 2/2: deleting %s users (cascades search spaces and "
"remaining data) in batches of %s...",
f"{total_users:,}",
f"{USER_BATCH:,}",
)
# The table name is passed pre-quoted ('"user"') because it is
# interpolated verbatim into the DELETE statement by _batched_delete.
_batched_delete(
bind,
scratch=USER_SCRATCH,
target_table='"user"',
target_col="id",
batch_size=USER_BATCH,
total=total_users,
label="users",
)
op.execute(f"DROP TABLE IF EXISTS {USER_SCRATCH};")
logger.info("migration 164 finished")
def _batched_delete(
    bind: sa.engine.Connection,
    *,
    scratch: str,
    target_table: str,
    target_col: str,
    batch_size: int,
    total: int,
    label: str,
) -> None:
    """Drain ``scratch`` in batches, deleting the matching ``target_table`` rows.

    Each loop iteration executes one SQL statement that selects up to
    ``batch_size`` ids from ``scratch``, deletes the ``target_table`` rows
    whose ``target_col`` is among them, and removes those same ids from
    ``scratch``. Because the row delete and the scratch pop are a single
    statement, the scratch table always reflects what is still left to do.
    The loop stops as soon as a statement pops no ids.

    Each statement commits on its own only when the caller runs this inside
    alembic's autocommit block, as ``upgrade()`` does.

    Args:
        bind: Connection the statements are executed on.
        scratch: Name of the scratch table holding the pending ``id`` values.
        target_table: Table to delete from; interpolated verbatim into the SQL.
        target_col: Column of ``target_table`` matched against the popped ids.
        batch_size: Maximum number of ids handled per statement.
        total: Number of ids expected overall; used only for progress output.
        label: Human-readable noun used in the log lines.
    """
    t_start = time.monotonic()
    # SQL text is kept flush-left so the statement string itself is unchanged.
    pop_stmt = sa.text(
        f"""
WITH batch AS (
SELECT id FROM {scratch} LIMIT :n
), deleted AS (
DELETE FROM {target_table}
WHERE {target_col} IN (SELECT id FROM batch)
), popped AS (
DELETE FROM {scratch}
WHERE id IN (SELECT id FROM batch)
RETURNING id
)
SELECT count(*) FROM popped
"""
    )
    previous_log_at = 0.0
    removed = 0
    # A batch that pops nothing (count 0 / no result) ends the loop.
    while popped_now := (
        bind.execute(pop_stmt, {"n": batch_size}).scalar() or 0
    ):
        removed += popped_now
        tick = time.monotonic()
        # Throttle progress output, but never suppress the line for the
        # batch that reaches the expected total.
        if tick - previous_log_at < LOG_EVERY_SECONDS and removed < total:
            continue
        spent = tick - t_start
        if total:
            percent = 100.0 * removed / total
        else:
            percent = 100.0
        # Linear extrapolation from the work done so far.
        if percent > 0:
            remaining = spent / percent * (100.0 - percent)
        else:
            remaining = 0.0
        logger.info(
            "%s deleted: %.1f%% (%s/%s) elapsed %s eta %s",
            label,
            percent,
            f"{removed:,}",
            f"{total:,}",
            _fmt_duration(spent),
            _fmt_duration(remaining),
        )
        previous_log_at = tick
    total_spent = time.monotonic() - t_start
    logger.info(
        "deleted %s %s in %s",
        f"{removed:,}",
        label,
        _fmt_duration(total_spent),
    )
def downgrade() -> None:
    """Log that revision 164 cannot be undone; makes no database change."""
    # The purge is destructive, so there is nothing to restore. Staying a
    # no-op (rather than raising) lets a downgrade chain pass through this
    # revision.
    irreversible_notice = (
        "migration 164 (remove_inactive_users) is irreversible; "
        "downgrade is a no-op (deleted users/data are not restored)"
    )
    logger.warning(irreversible_notice)

View file

@ -0,0 +1,183 @@
"""add chunks.position for explicit document order
Incremental re-indexing keeps unchanged chunk rows, so auto-increment ids no
longer reflect document order. Backfill preserves the historical id ordering.
The backfill is done in committed batches (not one giant UPDATE) so that on a
large table it: streams progress to the alembic console, keeps each transaction
small, bounds WAL/bloat growth, and is resumable if interrupted.
Revision ID: 165
Revises: 164
"""
import logging
import time
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers: this revision and the revision it follows.
revision: str = "165"
down_revision: str | None = "164"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Number of chunk ids processed per committed batch.
BATCH_SIZE = 100_000
# Minimum seconds between progress log lines (keeps the console readable).
LOG_EVERY_SECONDS = 5.0
# Name of the UNLOGGED scratch table that upgrade() fills with the
# chunk id -> position mapping. Both upgrade() and downgrade() drop it.
SCRATCH_TABLE = "_chunk_position_backfill"
# Progress messages are emitted through the "alembic.runtime.migration" logger.
logger = logging.getLogger("alembic.runtime.migration")
def _fmt_duration(seconds: float) -> str:
seconds = int(seconds)
h, rem = divmod(seconds, 3600)
m, s = divmod(rem, 60)
if h:
return f"{h}h{m:02d}m{s:02d}s"
if m:
return f"{m}m{s:02d}s"
return f"{s}s"
def _index_exists(bind: sa.engine.Connection, name: str) -> bool:
    """Return True when an index called ``name`` is visible on the search path.

    The lookup is restricted with ``pg_table_is_visible`` so that an index of
    the same name living in an unrelated schema is not mistaken for ours. A
    bare ``relname`` match against ``pg_class`` spans every schema in the
    database, which could wrongly trigger the "already applied" fast path in
    ``upgrade()`` and skip the backfill.

    Args:
        bind: Connection used to query the PostgreSQL catalog.
        name: Unqualified index name to look for.

    Returns:
        True if a regular index (``relkind = 'i'``) with that name can be
        referenced without schema qualification, False otherwise.
    """
    query = sa.text(
        "SELECT EXISTS (SELECT 1 FROM pg_catalog.pg_class "
        "WHERE relkind = 'i' AND relname = :n "
        "AND pg_catalog.pg_table_is_visible(oid))"
    )
    return bool(bind.execute(query, {"n": name}).scalar())
def upgrade() -> None:
"""Add ``chunks.position`` and backfill it with each chunk's per-document rank.

Steps, in order:

1. Add the ``position`` column (``INTEGER NOT NULL DEFAULT 0``) if missing.
2. Return early when both ``ix_chunks_position`` and
   ``ix_chunks_document_id_position`` already exist.
3. Read a row-count estimate (for logging only) and the min/max chunk id.
4. Unless the table is empty, build a scratch table mapping every chunk id
   to its zero-based rank within its document (ordered by id), then copy
   those ranks into ``chunks.position`` one id window of ``BATCH_SIZE`` at
   a time, logging progress.
5. Drop the scratch table and create the two indexes.
"""
# Connection used for the catalog lookups and the batched UPDATE statements.
bind = op.get_bind()
# Adding a NOT NULL column with a constant default is metadata-only on
# PostgreSQL 11+, so this is fast even on very large tables.
op.execute(
"ALTER TABLE chunks ADD COLUMN IF NOT EXISTS position INTEGER NOT NULL DEFAULT 0;"
)
# Idempotent fast path: both indexes are created only after the backfill
# has fully completed, so their presence is a reliable "already applied"
# marker. This makes re-running the migration a cheap no-op.
if _index_exists(bind, "ix_chunks_position") and _index_exists(
bind, "ix_chunks_document_id_position"
):
logger.info("migration 165 already applied; skipping backfill")
return
# Run the heavy work outside the migration's single transaction so each
# batch can commit on its own.
with op.get_context().autocommit_block():
# reltuples is a planner estimate and is -1 on never-analyzed tables;
# it is only used for the log line below, so treat <= 0 as "unknown".
total_rows = (
bind.execute(
sa.text(
"SELECT reltuples::bigint FROM pg_class WHERE relname = 'chunks'"
)
).scalar()
or 0
)
total_rows_display = (
f"~{total_rows:,}" if total_rows > 0 else "an unknown number of"
)
# min(id)/max(id) are both NULL (None) when the table has no rows.
bounds = bind.execute(sa.text("SELECT min(id), max(id) FROM chunks")).one()
min_id, max_id = bounds[0], bounds[1]
if min_id is None:
logger.info("chunks table is empty; nothing to backfill")
else:
# Precompute per-document ordering once into an UNLOGGED scratch
# table (low WAL). ROW_NUMBER must see each whole document, so it
# cannot be computed per id-range slice.
logger.info(
"building position mapping for %s chunks (this is a single "
"scan; the batched UPDATE below reports progress)...",
total_rows_display,
)
op.execute(f"DROP TABLE IF EXISTS {SCRATCH_TABLE};")
op.execute(
f"""
CREATE UNLOGGED TABLE {SCRATCH_TABLE} AS
SELECT id,
(ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY id) - 1)::int AS rn
FROM chunks;
"""
)
# NOTE(review): the primary key presumably gives the UPDATE's join on
# id an index to use -- confirm.
op.execute(f"ALTER TABLE {SCRATCH_TABLE} ADD PRIMARY KEY (id);")
# Width of the id range being walked, clamped to at least 1. It is only
# used to turn progress into a percentage.
id_span = max(max_id - min_id + 1, 1)
started = time.monotonic()
last_log = 0.0
updated_total = 0
lo = min_id
# Walk [min_id, max_id] in half-open id windows [lo, hi). Gaps in the id
# sequence simply yield windows that update fewer (possibly zero) rows.
while lo <= max_id:
hi = lo + BATCH_SIZE # exclusive upper bound
# Rows whose position already equals the computed rank are excluded by
# the IS DISTINCT FROM predicate, so they are not rewritten.
result = bind.execute(
sa.text(
f"""
UPDATE chunks c
SET position = m.rn
FROM {SCRATCH_TABLE} m
WHERE c.id = m.id
AND c.id >= :lo
AND c.id < :hi
AND c.position IS DISTINCT FROM m.rn
"""
),
{"lo": lo, "hi": hi},
)
updated_total += result.rowcount or 0
now = time.monotonic()
# Cap at max_id + 1 so the last, partially filled window cannot push
# the reported progress past the end of the id range.
processed_ids = min(hi, max_id + 1) - min_id
pct = min(100.0, 100.0 * processed_ids / id_span)
# Throttle progress output, but always log the final window.
if now - last_log >= LOG_EVERY_SECONDS or hi > max_id:
elapsed = now - started
# Linear extrapolation from the share of the id range covered so far.
eta = (elapsed / pct * (100.0 - pct)) if pct > 0 else 0.0
logger.info(
"backfill position: %.1f%% (id<%s, %s rows rewritten) "
"elapsed %s eta %s",
pct,
f"{min(hi, max_id + 1):,}",
f"{updated_total:,}",
_fmt_duration(elapsed),
_fmt_duration(eta),
)
last_log = now
# The next window starts where this one ended.
lo = hi
logger.info(
"backfill complete: %s rows rewritten in %s",
f"{updated_total:,}",
_fmt_duration(time.monotonic() - started),
)
op.execute(f"DROP TABLE IF EXISTS {SCRATCH_TABLE};")
# The indexes double as the "already applied" marker checked at the top of
# this function, so they are created last.
logger.info("creating index ix_chunks_position...")
op.execute("CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);")
logger.info("creating index ix_chunks_document_id_position...")
op.execute(
"CREATE INDEX IF NOT EXISTS ix_chunks_document_id_position "
"ON chunks(document_id, position);"
)
logger.info("migration 165 finished")
def downgrade() -> None:
    """Undo revision 165: drop the scratch table, both indexes and the column.

    Every statement uses ``IF EXISTS``, so the downgrade also works when the
    upgrade was only partially applied.
    """
    teardown_statements = (
        f"DROP TABLE IF EXISTS {SCRATCH_TABLE};",
        "DROP INDEX IF EXISTS ix_chunks_document_id_position;",
        "DROP INDEX IF EXISTS ix_chunks_position;",
        "ALTER TABLE chunks DROP COLUMN IF EXISTS position;",
    )
    # Executed strictly in the order listed above.
    for statement in teardown_statements:
        op.execute(statement)

View file

@ -241,8 +241,15 @@ async def _create_document(
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks) chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
session.add_all( session.add_all(
[ [
Chunk(document_id=doc.id, content=text, embedding=embedding) Chunk(
for text, embedding in zip(chunks, chunk_embeddings, strict=True) document_id=doc.id,
content=text,
embedding=embedding,
position=i,
)
for i, (text, embedding) in enumerate(
zip(chunks, chunk_embeddings, strict=True)
)
] ]
) )
return doc return doc
@ -289,8 +296,15 @@ async def _update_document(
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks) chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
session.add_all( session.add_all(
[ [
Chunk(document_id=document.id, content=text, embedding=embedding) Chunk(
for text, embedding in zip(chunks, chunk_embeddings, strict=True) document_id=document.id,
content=text,
embedding=embedding,
position=i,
)
for i, (text, embedding) in enumerate(
zip(chunks, chunk_embeddings, strict=True)
)
] ]
) )
return document return document
@ -475,7 +489,9 @@ async def _load_chunks_for_snapshot(
session: AsyncSession, *, doc_id: int session: AsyncSession, *, doc_id: int
) -> list[dict[str, str]]: ) -> list[dict[str, str]]:
rows = await session.execute( rows = await session.execute(
select(Chunk.content).where(Chunk.document_id == doc_id).order_by(Chunk.id) select(Chunk.content)
.where(Chunk.document_id == doc_id)
.order_by(Chunk.position, Chunk.id)
) )
return [{"content": row.content} for row in rows.all() if row.content is not None] return [{"content": row.content} for row in rows.all() if row.content is not None]

View file

@ -57,7 +57,7 @@ async def build_agent_with_cache(
mcp_tools_by_agent: dict[str, list[BaseTool]], mcp_tools_by_agent: dict[str, list[BaseTool]],
disabled_tools: list[str] | None, disabled_tools: list[str] | None,
config_id: str | None, config_id: str | None,
image_generation_config_id_override: int | None = None, image_gen_model_id_override: int | None = None,
) -> Any: ) -> Any:
"""Compile the multi-agent graph, serving from cache when key components are stable.""" """Compile the multi-agent graph, serving from cache when key components are stable."""
@ -121,7 +121,7 @@ async def build_agent_with_cache(
# Bound into the generate_image subagent tool at construction time, so it # Bound into the generate_image subagent tool at construction time, so it
# must key the compiled-agent cache to avoid leaking one automation's # must key the compiled-agent cache to avoid leaking one automation's
# image model into another with the same config_id/search_space. # image model into another with the same config_id/search_space.
image_generation_config_id_override, image_gen_model_id_override,
) )
return await get_cache().get_or_build(cache_key, builder=_build) return await get_cache().get_or_build(cache_key, builder=_build)

View file

@ -72,11 +72,11 @@ async def create_multi_agent_chat_deep_agent(
mentioned_document_ids: list[int] | None = None, mentioned_document_ids: list[int] | None = None,
anon_session_id: str | None = None, anon_session_id: str | None = None,
filesystem_selection: FilesystemSelection | None = None, filesystem_selection: FilesystemSelection | None = None,
image_generation_config_id: int | None = None, image_gen_model_id: int | None = None,
): ):
"""Deep agent with SurfSense tools/middleware; registry route subagents behind ``task`` when enabled. """Deep agent with SurfSense tools/middleware; registry route subagents behind ``task`` when enabled.
``image_generation_config_id`` overrides the search space's image model for ``image_gen_model_id`` overrides the search space's image model for
this invocation (used by automations to run on their captured model). When this invocation (used by automations to run on their captured model). When
``None``, the ``generate_image`` tool resolves the live search-space pref. ``None``, the ``generate_image`` tool resolves the live search-space pref.
""" """
@ -147,7 +147,7 @@ async def create_multi_agent_chat_deep_agent(
"llm": llm, "llm": llm,
# Per-invocation image model override (automations run on their captured # Per-invocation image model override (automations run on their captured
# model). Reaches the generate_image subagent tool via subagent_dependencies. # model). Reaches the generate_image subagent tool via subagent_dependencies.
"image_generation_config_id_override": image_generation_config_id, "image_gen_model_id_override": image_gen_model_id,
} }
_t0 = time.perf_counter() _t0 = time.perf_counter()
@ -303,7 +303,7 @@ async def create_multi_agent_chat_deep_agent(
mcp_tools_by_agent=mcp_tools_by_agent, mcp_tools_by_agent=mcp_tools_by_agent,
disabled_tools=disabled_tools, disabled_tools=disabled_tools,
config_id=config_id, config_id=config_id,
image_generation_config_id_override=image_generation_config_id, image_gen_model_id_override=image_gen_model_id,
) )
_perf_log.info( _perf_log.info(
"[create_agent] Middleware stack + graph compiled in %.3fs", "[create_agent] Middleware stack + graph compiled in %.3fs",

View file

@ -508,7 +508,7 @@ class KBPostgresBackend(BackendProtocol):
chunk_rows = await session.execute( chunk_rows = await session.execute(
select(Chunk.id, Chunk.content) select(Chunk.id, Chunk.content)
.where(Chunk.document_id == document.id) .where(Chunk.document_id == document.id)
.order_by(Chunk.id) .order_by(Chunk.position, Chunk.id)
) )
chunks = [ chunks = [
{"chunk_id": row.id, "content": row.content} for row in chunk_rows.all() {"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
@ -725,7 +725,7 @@ class KBPostgresBackend(BackendProtocol):
.join(Document, Document.id == Chunk.document_id) .join(Document, Document.id == Chunk.document_id)
.where(Document.search_space_id == self.search_space_id) .where(Document.search_space_id == self.search_space_id)
.where(Chunk.content.ilike(f"%{pattern}%")) .where(Chunk.content.ilike(f"%{pattern}%"))
.order_by(Chunk.document_id, Chunk.id) .order_by(Chunk.document_id, Chunk.position, Chunk.id)
) )
chunk_rows = await session.execute(sub) chunk_rows = await session.execute(sub)
per_doc: dict[int, int] = {} per_doc: dict[int, int] = {}

View file

@ -394,7 +394,10 @@ async def browse_recent_documents(
Chunk.document_id, Chunk.document_id,
Chunk.content, Chunk.content,
func.row_number() func.row_number()
.over(partition_by=Chunk.document_id, order_by=Chunk.id) .over(
partition_by=Chunk.document_id,
order_by=(Chunk.position, Chunk.id),
)
.label("rn"), .label("rn"),
) )
.where(Chunk.document_id.in_(doc_ids)) .where(Chunk.document_id.in_(doc_ids))
@ -404,7 +407,7 @@ async def browse_recent_documents(
chunk_query = ( chunk_query = (
select(numbered.c.chunk_id, numbered.c.document_id, numbered.c.content) select(numbered.c.chunk_id, numbered.c.document_id, numbered.c.content)
.where(numbered.c.rn <= _RECENCY_MAX_CHUNKS_PER_DOC) .where(numbered.c.rn <= _RECENCY_MAX_CHUNKS_PER_DOC)
.order_by(numbered.c.document_id, numbered.c.chunk_id) .order_by(numbered.c.document_id, numbered.c.rn)
) )
chunk_result = await session.execute(chunk_query) chunk_result = await session.execute(chunk_query)
fetched_chunks = chunk_result.all() fetched_chunks = chunk_result.all()
@ -531,7 +534,7 @@ async def fetch_mentioned_documents(
chunk_result = await session.execute( chunk_result = await session.execute(
select(Chunk.id, Chunk.content, Chunk.document_id) select(Chunk.id, Chunk.content, Chunk.document_id)
.where(Chunk.document_id.in_(list(docs.keys()))) .where(Chunk.document_id.in_(list(docs.keys())))
.order_by(Chunk.document_id, Chunk.id) .order_by(Chunk.document_id, Chunk.position, Chunk.id)
) )
chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs} chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs}
for row in chunk_result.all(): for row in chunk_result.all():

View file

@ -10,70 +10,53 @@ from langgraph.types import Command
from litellm import aimage_generation from litellm import aimage_generation
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from app.agents.chat.multi_agent_chat.shared.receipts.command import with_receipt from app.agents.chat.multi_agent_chat.shared.receipts.command import with_receipt
from app.agents.chat.multi_agent_chat.shared.receipts.receipt import make_receipt from app.agents.chat.multi_agent_chat.shared.receipts.receipt import make_receipt
from app.config import config from app.config import config
from app.db import ( from app.db import (
ImageGeneration, ImageGeneration,
ImageGenerationConfig, Model,
SearchSpace, SearchSpace,
shielded_async_session, shielded_async_session,
) )
from app.services.auto_model_pin_service import (
auto_model_candidates,
choose_auto_model_candidate,
)
from app.services.image_gen_router_service import ( from app.services.image_gen_router_service import (
IMAGE_GEN_AUTO_MODE_ID, IMAGE_GEN_AUTO_MODE_ID,
ImageGenRouterService,
is_image_gen_auto_mode, is_image_gen_auto_mode,
) )
from app.services.provider_api_base import resolve_api_base from app.services.model_capabilities import has_capability
from app.services.model_resolver import to_litellm
from app.utils.signed_image_urls import generate_image_token from app.utils.signed_image_urls import generate_image_token
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Provider mapping (same as routes)
_PROVIDER_MAP = { def _get_global_model(model_id: int) -> dict | None:
"OPENAI": "openai", return next((m for m in config.GLOBAL_MODELS if m.get("id") == model_id), None)
"AZURE_OPENAI": "azure",
"GOOGLE": "gemini",
"VERTEX_AI": "vertex_ai",
"BEDROCK": "bedrock",
"RECRAFT": "recraft",
"OPENROUTER": "openrouter",
"XINFERENCE": "xinference",
"NSCALE": "nscale",
}
def _resolve_provider_prefix(provider: str, custom_provider: str | None) -> str: def _get_global_connection(connection_id: int) -> dict | None:
if custom_provider: return next(
return custom_provider (c for c in config.GLOBAL_CONNECTIONS if c.get("id") == connection_id),
return _PROVIDER_MAP.get(provider.upper(), provider.lower()) None,
)
def _build_model_string(
provider: str, model_name: str, custom_provider: str | None
) -> str:
return f"{_resolve_provider_prefix(provider, custom_provider)}/{model_name}"
def _get_global_image_gen_config(config_id: int) -> dict | None:
"""Get a global image gen config by negative ID."""
for cfg in config.GLOBAL_IMAGE_GEN_CONFIGS:
if cfg.get("id") == config_id:
return cfg
return None
def create_generate_image_tool( def create_generate_image_tool(
search_space_id: int, search_space_id: int,
db_session: AsyncSession, db_session: AsyncSession,
image_generation_config_id_override: int | None = None, image_gen_model_id_override: int | None = None,
): ):
"""Create ``generate_image`` with bound search space; DB work uses a per-call session. """Create ``generate_image`` with bound search space; DB work uses a per-call session.
``image_generation_config_id_override``: when set (automations running on a ``image_gen_model_id_override``: when set (automations running on a
captured model), use this config id instead of reading the search space's captured model), use this model id instead of reading the search space's
live ``image_generation_config_id``. live ``image_gen_model_id``.
""" """
del db_session # tool uses a fresh per-call session instead del db_session # tool uses a fresh per-call session instead
@ -118,26 +101,23 @@ def create_generate_image_tool(
# task's session is shared across every tool; without isolation, # task's session is shared across every tool; without isolation,
# autoflushes from a concurrent writer poison this tool too. # autoflushes from a concurrent writer poison this tool too.
async with shielded_async_session() as session: async with shielded_async_session() as session:
if image_generation_config_id_override is not None: result = await session.execute(
select(SearchSpace).filter(SearchSpace.id == search_space_id)
)
search_space = result.scalars().first()
if not search_space:
return _failed(
{"error": "Search space not found"},
error="Search space not found",
)
if image_gen_model_id_override is not None:
# Automation run: use the captured image model, insulated from # Automation run: use the captured image model, insulated from
# later search-space changes. No search-space read needed. # later search-space changes. No search-space read needed.
config_id = ( config_id = image_gen_model_id_override or IMAGE_GEN_AUTO_MODE_ID
image_generation_config_id_override or IMAGE_GEN_AUTO_MODE_ID
)
else: else:
result = await session.execute(
select(SearchSpace).filter(SearchSpace.id == search_space_id)
)
search_space = result.scalars().first()
if not search_space:
return _failed(
{"error": "Search space not found"},
error="Search space not found",
)
config_id = ( config_id = (
search_space.image_generation_config_id search_space.image_gen_model_id or IMAGE_GEN_AUTO_MODE_ID
or IMAGE_GEN_AUTO_MODE_ID
) )
# size/quality/style are intentionally omitted: valid values # size/quality/style are intentionally omitted: valid values
@ -147,73 +127,86 @@ def create_generate_image_tool(
gen_kwargs["n"] = n gen_kwargs["n"] = n
if is_image_gen_auto_mode(config_id): if is_image_gen_auto_mode(config_id):
if not ImageGenRouterService.is_initialized(): candidates = await auto_model_candidates(
session,
search_space_id=search_space_id,
user_id=search_space.user_id,
capability="image_gen",
)
if not candidates:
err = ( err = (
"No image generation models configured. " "No image generation models available. "
"Please add an image model in Settings > Image Models." "Please add an image model in Settings > Image Models."
) )
return _failed({"error": err}, error=err) return _failed({"error": err}, error=err)
response = await ImageGenRouterService.aimage_generation( config_id = int(
prompt=prompt, model="auto", **gen_kwargs choose_auto_model_candidate(candidates, search_space_id)["id"]
) )
elif config_id < 0:
cfg = _get_global_image_gen_config(config_id) provider_base_url: str | None = None
if not cfg:
err = f"Image generation config {config_id} not found" if config_id < 0:
global_model = _get_global_model(config_id)
if not global_model or not has_capability(
global_model, "image_gen"
):
err = f"Image generation model {config_id} not found"
return _failed({"error": err}, error=err)
global_connection = _get_global_connection(
global_model["connection_id"]
)
if not global_connection:
err = f"Image generation connection for model {config_id} not found"
return _failed({"error": err}, error=err) return _failed({"error": err}, error=err)
provider_prefix = _resolve_provider_prefix( model_string, resolved_kwargs = to_litellm(
cfg.get("provider", ""), cfg.get("custom_provider") global_connection,
global_model["model_id"],
) )
model_string = f"{provider_prefix}/{cfg['model_name']}" gen_kwargs.update(resolved_kwargs)
gen_kwargs["api_key"] = cfg.get("api_key") provider_base_url = resolved_kwargs.get("api_base")
# Defense-in-depth: an empty ``api_base`` must not fall
# through to LiteLLM's global ``api_base`` (e.g. Azure).
api_base = resolve_api_base(
provider=cfg.get("provider"),
provider_prefix=provider_prefix,
config_api_base=cfg.get("api_base"),
)
if api_base:
gen_kwargs["api_base"] = api_base
if cfg.get("api_version"):
gen_kwargs["api_version"] = cfg["api_version"]
if cfg.get("litellm_params"):
gen_kwargs.update(cfg["litellm_params"])
response = await aimage_generation( response = await aimage_generation(
prompt=prompt, model=model_string, **gen_kwargs prompt=prompt, model=model_string, **gen_kwargs
) )
else: else:
# Positive ID = user-created ImageGenerationConfig # Positive ID = Model + Connection
cfg_result = await session.execute( cfg_result = await session.execute(
select(ImageGenerationConfig).filter( select(Model)
ImageGenerationConfig.id == config_id .options(selectinload(Model.connection))
) .filter(Model.id == config_id, Model.enabled.is_(True))
) )
db_cfg = cfg_result.scalars().first() db_model = cfg_result.scalars().first()
if not db_cfg: if (
err = f"Image generation config {config_id} not found" not db_model
or not db_model.connection
or not db_model.connection.enabled
):
err = f"Image generation model {config_id} not found"
return _failed({"error": err}, error=err)
conn = db_model.connection
if (
conn.search_space_id is not None
and conn.search_space_id != search_space_id
):
err = f"Image generation model {config_id} not found"
return _failed({"error": err}, error=err)
if (
conn.user_id is not None
and conn.user_id != search_space.user_id
):
err = f"Image generation model {config_id} not found"
return _failed({"error": err}, error=err)
if not has_capability(db_model, "image_gen"):
err = f"Model {config_id} is not image-generation capable"
return _failed({"error": err}, error=err) return _failed({"error": err}, error=err)
provider_prefix = _resolve_provider_prefix( model_string, resolved_kwargs = to_litellm(
db_cfg.provider.value, db_cfg.custom_provider db_model.connection,
db_model.model_id,
) )
model_string = f"{provider_prefix}/{db_cfg.model_name}" gen_kwargs.update(resolved_kwargs)
gen_kwargs["api_key"] = db_cfg.api_key provider_base_url = resolved_kwargs.get("api_base")
# Defense-in-depth: an empty ``api_base`` must not fall
# through to LiteLLM's global ``api_base`` (e.g. Azure).
api_base = resolve_api_base(
provider=db_cfg.provider.value,
provider_prefix=provider_prefix,
config_api_base=db_cfg.api_base,
)
if api_base:
gen_kwargs["api_base"] = api_base
if db_cfg.api_version:
gen_kwargs["api_version"] = db_cfg.api_version
if db_cfg.litellm_params:
gen_kwargs.update(db_cfg.litellm_params)
response = await aimage_generation( response = await aimage_generation(
prompt=prompt, model=model_string, **gen_kwargs prompt=prompt, model=model_string, **gen_kwargs
@ -230,7 +223,7 @@ def create_generate_image_tool(
prompt=prompt, prompt=prompt,
model=getattr(response, "_hidden_params", {}).get("model"), model=getattr(response, "_hidden_params", {}).get("model"),
n=n, n=n,
image_generation_config_id=config_id, image_gen_model_id=config_id,
response_data=response_dict, response_data=response_dict,
search_space_id=search_space_id, search_space_id=search_space_id,
access_token=access_token, access_token=access_token,
@ -252,8 +245,19 @@ def create_generate_image_tool(
# b64_json (e.g. gpt-image-1) is served via our backend endpoint so # b64_json (e.g. gpt-image-1) is served via our backend endpoint so
# megabytes of base64 don't bloat the LLM context. # megabytes of base64 don't bloat the LLM context.
# Some OpenAI-compatible backends (e.g. Xinference) return a relative
# URL like /files/image.png. Browsers can't resolve these, so we
# prepend the provider's base origin when the URL starts with "/".
if first_image.get("url"): if first_image.get("url"):
image_url = first_image["url"] raw_url: str = first_image["url"]
if raw_url.startswith("/") and provider_base_url:
from urllib.parse import urlparse
parsed = urlparse(provider_base_url)
origin = f"{parsed.scheme}://{parsed.netloc}"
image_url = f"{origin}{raw_url}"
else:
image_url = raw_url
elif first_image.get("b64_json"): elif first_image.get("b64_json"):
backend_url = config.BACKEND_URL or "http://localhost:8000" backend_url = config.BACKEND_URL or "http://localhost:8000"
image_url = ( image_url = (

View file

@ -51,8 +51,6 @@ def load_tools(
create_generate_image_tool( create_generate_image_tool(
search_space_id=d["search_space_id"], search_space_id=d["search_space_id"],
db_session=d["db_session"], db_session=d["db_session"],
image_generation_config_id_override=d.get( image_gen_model_id_override=d.get("image_gen_model_id_override"),
"image_generation_config_id_override"
),
), ),
] ]

View file

@ -122,7 +122,7 @@ async def _browse_recent_documents(
chunk_query = ( chunk_query = (
select(Chunk) select(Chunk)
.where(Chunk.document_id.in_(doc_ids)) .where(Chunk.document_id.in_(doc_ids))
.order_by(Chunk.document_id, Chunk.id) .order_by(Chunk.document_id, Chunk.position, Chunk.id)
) )
chunk_result = await session.execute(chunk_query) chunk_result = await session.execute(chunk_query)
raw_chunks = chunk_result.scalars().all() raw_chunks = chunk_result.scalars().all()

View file

@ -2,9 +2,9 @@
LLM configuration utilities for SurfSense agents. LLM configuration utilities for SurfSense agents.
This module provides functions for loading LLM configurations from: This module provides functions for loading LLM configurations from:
1. Auto mode (ID 0) - Uses LiteLLM Router for load balancing 1. Auto mode (ID 0) - Resolved by callers to a concrete model-connection model
2. YAML files (global configs with negative IDs) 2. YAML files (global configs with negative IDs)
3. Database NewLLMConfig table (user-created configs with positive IDs) 3. Database model-connections table (user-created configs with positive IDs)
It also provides utilities for creating ChatLiteLLM instances and It also provides utilities for creating ChatLiteLLM instances and
managing prompt configurations. managing prompt configurations.
@ -24,8 +24,6 @@ from langchain_core.messages import AIMessage, BaseMessage
from langchain_core.outputs import ChatGenerationChunk, ChatResult from langchain_core.outputs import ChatGenerationChunk, ChatResult
from langchain_litellm import ChatLiteLLM from langchain_litellm import ChatLiteLLM
from litellm import get_model_info from litellm import get_model_info
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat.runtime.prompt_caching import ( from app.agents.chat.runtime.prompt_caching import (
apply_litellm_prompt_caching, apply_litellm_prompt_caching,
@ -33,10 +31,7 @@ from app.agents.chat.runtime.prompt_caching import (
from app.services.llm_router_service import ( from app.services.llm_router_service import (
AUTO_MODE_ID, AUTO_MODE_ID,
ChatLiteLLMRouter, ChatLiteLLMRouter,
LLMRouterService,
_sanitize_content, _sanitize_content,
get_auto_mode_llm,
is_auto_mode,
) )
@ -51,16 +46,19 @@ def _sanitize_messages(messages: list[BaseMessage]) -> list[BaseMessage]:
reject the blank text. The OpenAI spec says ``content`` should be reject the blank text. The OpenAI spec says ``content`` should be
``null`` when an assistant message only carries tool calls. ``null`` when an assistant message only carries tool calls.
""" """
sanitized: list[BaseMessage] = []
for msg in messages: for msg in messages:
if isinstance(msg.content, list): next_msg = msg.model_copy(deep=True)
msg.content = _sanitize_content(msg.content) if isinstance(next_msg.content, list):
next_msg.content = _sanitize_content(next_msg.content)
if ( if (
isinstance(msg, AIMessage) isinstance(next_msg, AIMessage)
and (not msg.content or msg.content == "") and (not next_msg.content or next_msg.content == "")
and getattr(msg, "tool_calls", None) and getattr(next_msg, "tool_calls", None)
): ):
msg.content = None # type: ignore[assignment] next_msg.content = None # type: ignore[assignment]
return messages sanitized.append(next_msg)
return sanitized
class SanitizedChatLiteLLM(ChatLiteLLM): class SanitizedChatLiteLLM(ChatLiteLLM):
@ -91,13 +89,21 @@ class SanitizedChatLiteLLM(ChatLiteLLM):
): ):
yield chunk yield chunk
async def _agenerate(
# Re-exported under the historical name ``PROVIDER_MAP``. Source of truth lives self,
# in provider_capabilities so the YAML loader can resolve prefixes during messages: list[BaseMessage],
# app.config init without importing the agent/tools tree. stop: list[str] | None = None,
from app.services.provider_capabilities import ( # noqa: E402 run_manager: AsyncCallbackManagerForLLMRun | None = None,
_PROVIDER_PREFIX_MAP as PROVIDER_MAP, stream: bool | None = None,
) **kwargs: Any,
) -> ChatResult:
return await super()._agenerate(
_sanitize_messages(messages),
stop=stop,
run_manager=run_manager,
stream=stream,
**kwargs,
)
def _attach_model_profile(llm: ChatLiteLLM, model_string: str) -> None: def _attach_model_profile(llm: ChatLiteLLM, model_string: str) -> None:
@ -121,8 +127,9 @@ class AgentConfig:
""" """
Complete configuration for the SurfSense agent. Complete configuration for the SurfSense agent.
This combines LLM settings with prompt configuration from NewLLMConfig. This combines resolved model settings with prompt configuration.
Supports Auto mode (ID 0) which uses LiteLLM Router for load balancing. Supports Auto mode metadata (ID 0). Runtime callers must resolve Auto to
a concrete global or BYOK model before constructing ChatLiteLLM.
""" """
# LLM Model Settings # LLM Model Settings
@ -170,7 +177,7 @@ class AgentConfig:
use_default_system_instructions=True, use_default_system_instructions=True,
citations_enabled=True, citations_enabled=True,
config_id=AUTO_MODE_ID, config_id=AUTO_MODE_ID,
config_name="Auto (Fastest)", config_name="Auto",
is_auto_mode=True, is_auto_mode=True,
billing_tier="free", billing_tier="free",
is_premium=False, is_premium=False,
@ -181,64 +188,21 @@ class AgentConfig:
supports_image_input=True, supports_image_input=True,
) )
@classmethod
def from_new_llm_config(cls, config) -> "AgentConfig":
"""Build an AgentConfig from a NewLLMConfig database model."""
# Lazy import: keeps provider_capabilities (and litellm) out of init order.
from app.services.provider_capabilities import derive_supports_image_input
provider_value = (
config.provider.value
if hasattr(config.provider, "value")
else str(config.provider)
)
litellm_params = config.litellm_params or {}
base_model = (
litellm_params.get("base_model")
if isinstance(litellm_params, dict)
else None
)
return cls(
provider=provider_value,
model_name=config.model_name,
api_key=config.api_key,
api_base=config.api_base,
custom_provider=config.custom_provider,
litellm_params=config.litellm_params,
system_instructions=config.system_instructions,
use_default_system_instructions=config.use_default_system_instructions,
citations_enabled=config.citations_enabled,
config_id=config.id,
config_name=config.name,
is_auto_mode=False,
billing_tier="free",
is_premium=False,
anonymous_enabled=False,
quota_reserve_tokens=None,
# BYOK rows have no curated flag; ask LiteLLM (default-allow on
# unknown). The streaming safety net still blocks explicit text-only.
supports_image_input=derive_supports_image_input(
provider=provider_value,
model_name=config.model_name,
base_model=base_model,
custom_provider=config.custom_provider,
),
)
@classmethod @classmethod
def from_yaml_config(cls, yaml_config: dict) -> "AgentConfig": def from_yaml_config(cls, yaml_config: dict) -> "AgentConfig":
"""Build an AgentConfig from a YAML configuration dictionary. """Build an AgentConfig from a YAML configuration dictionary.
Supports the same prompt fields as NewLLMConfig (system_instructions, Supports prompt fields such as system_instructions,
use_default_system_instructions, citations_enabled). use_default_system_instructions, and citations_enabled.
""" """
# Lazy import: keeps provider_capabilities (and litellm) out of init order. # Lazy import: keeps provider_capabilities (and litellm) out of init order.
from app.services.provider_capabilities import derive_supports_image_input from app.services.provider_capabilities import derive_supports_image_input
system_instructions = yaml_config.get("system_instructions", "") system_instructions = yaml_config.get("system_instructions", "")
provider = yaml_config.get("provider", "").upper() provider = yaml_config.get("provider") or yaml_config.get(
"litellm_provider", ""
)
model_name = yaml_config.get("model_name", "") model_name = yaml_config.get("model_name", "")
custom_provider = yaml_config.get("custom_provider") custom_provider = yaml_config.get("custom_provider")
litellm_params = yaml_config.get("litellm_params") or {} litellm_params = yaml_config.get("litellm_params") or {}
@ -324,93 +288,15 @@ def load_global_llm_config_by_id(llm_config_id: int) -> dict | None:
return load_llm_config_from_yaml(llm_config_id) return load_llm_config_from_yaml(llm_config_id)
async def load_new_llm_config_from_db(
session: AsyncSession,
config_id: int,
) -> "AgentConfig | None":
"""Load a NewLLMConfig from the database by ID."""
from app.db import NewLLMConfig
try:
result = await session.execute(
select(NewLLMConfig).filter(NewLLMConfig.id == config_id)
)
config = result.scalars().first()
if not config:
print(f"Error: NewLLMConfig with id {config_id} not found")
return None
return AgentConfig.from_new_llm_config(config)
except Exception as e:
print(f"Error loading NewLLMConfig from database: {e}")
return None
async def load_agent_llm_config_for_search_space(
session: AsyncSession,
search_space_id: int,
) -> "AgentConfig | None":
"""Load the agent LLM config for a search space via its agent_llm_id.
Positive id -> DB; negative -> YAML; None -> first global config (-1).
"""
from app.db import SearchSpace
try:
result = await session.execute(
select(SearchSpace).filter(SearchSpace.id == search_space_id)
)
search_space = result.scalars().first()
if not search_space:
print(f"Error: SearchSpace with id {search_space_id} not found")
return None
config_id = (
search_space.agent_llm_id if search_space.agent_llm_id is not None else -1
)
return await load_agent_config(session, config_id, search_space_id)
except Exception as e:
print(f"Error loading agent LLM config for search space {search_space_id}: {e}")
return None
async def load_agent_config(
session: AsyncSession,
config_id: int,
search_space_id: int | None = None,
) -> "AgentConfig | None":
"""Main config loader: id 0 -> Auto mode; negative -> YAML; positive -> DB."""
if is_auto_mode(config_id):
if not LLMRouterService.is_initialized():
print("Error: Auto mode requested but LLM Router not initialized")
return None
return AgentConfig.from_auto_mode()
if config_id < 0:
# In-memory covers static YAML + dynamic OpenRouter configs.
from app.config import config as app_config
for cfg in app_config.GLOBAL_LLM_CONFIGS:
if cfg.get("id") == config_id:
return AgentConfig.from_yaml_config(cfg)
yaml_config = load_llm_config_from_yaml(config_id)
if yaml_config:
return AgentConfig.from_yaml_config(yaml_config)
return None
else:
return await load_new_llm_config_from_db(session, config_id)
def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None: def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
"""Create a ChatLiteLLM instance from a global LLM config dictionary.""" """Create a ChatLiteLLM instance from a global LLM config dictionary."""
if llm_config.get("custom_provider"): if llm_config.get("custom_provider"):
model_string = f"{llm_config['custom_provider']}/{llm_config['model_name']}" model_string = f"{llm_config['custom_provider']}/{llm_config['model_name']}"
else: else:
provider = llm_config.get("provider", "").upper() provider = llm_config.get("provider") or llm_config.get(
provider_prefix = PROVIDER_MAP.get(provider, provider.lower()) "litellm_provider", "openai"
model_string = f"{provider_prefix}/{llm_config['model_name']}" )
model_string = f"{provider}/{llm_config['model_name']}"
litellm_kwargs = { litellm_kwargs = {
"model": model_string, "model": model_string,
@ -433,29 +319,17 @@ def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
def create_chat_litellm_from_agent_config( def create_chat_litellm_from_agent_config(
agent_config: AgentConfig, agent_config: AgentConfig,
) -> ChatLiteLLM | ChatLiteLLMRouter | None: ) -> ChatLiteLLM | ChatLiteLLMRouter | None:
"""Create a ChatLiteLLM (or, for Auto mode, a load-balancing router) from config.""" """Create a ChatLiteLLM from an already resolved concrete model config."""
if agent_config.is_auto_mode: if agent_config.is_auto_mode:
if not LLMRouterService.is_initialized(): print(
print("Error: Auto mode requested but LLM Router not initialized") "Error: Auto mode must be resolved to a concrete model before LLM creation"
return None )
try: return None
router_llm = get_auto_mode_llm()
if router_llm is not None:
# Universal injection points only: auto-mode fans out across
# providers, so provider-specific kwargs have no known target.
apply_litellm_prompt_caching(router_llm, agent_config=agent_config)
return router_llm
except Exception as e:
print(f"Error creating ChatLiteLLMRouter: {e}")
return None
if agent_config.custom_provider: if agent_config.custom_provider:
model_string = f"{agent_config.custom_provider}/{agent_config.model_name}" model_string = f"{agent_config.custom_provider}/{agent_config.model_name}"
else: else:
provider_prefix = PROVIDER_MAP.get( model_string = f"{agent_config.provider}/{agent_config.model_name}"
agent_config.provider, agent_config.provider.lower()
)
model_string = f"{provider_prefix}/{agent_config.model_name}"
litellm_kwargs = { litellm_kwargs = {
"model": model_string, "model": model_string,

View file

@ -33,7 +33,6 @@ from app.config import (
initialize_llm_router, initialize_llm_router,
initialize_openrouter_integration, initialize_openrouter_integration,
initialize_pricing_registration, initialize_pricing_registration,
initialize_vision_llm_router,
) )
from app.db import User, create_db_and_tables, get_async_session from app.db import User, create_db_and_tables, get_async_session
from app.exceptions import GENERIC_5XX_MESSAGE, ISSUES_URL, SurfSenseError from app.exceptions import GENERIC_5XX_MESSAGE, ISSUES_URL, SurfSenseError
@ -622,7 +621,6 @@ async def lifespan(app: FastAPI):
initialize_pricing_registration() initialize_pricing_registration()
initialize_llm_router() initialize_llm_router()
initialize_image_gen_router() initialize_image_gen_router()
initialize_vision_llm_router()
# Phase 1.7 — JIT warmup. Bounded so a stuck warmup never delays # Phase 1.7 — JIT warmup. Bounded so a stuck warmup never delays
# worker readiness. ``shield`` so Uvicorn cancelling startup # worker readiness. ``shield`` so Uvicorn cancelling startup

View file

@ -39,31 +39,31 @@ async def build_dependencies(
*, *,
session: AsyncSession, session: AsyncSession,
search_space_id: int, search_space_id: int,
agent_llm_id: int | None = None, chat_model_id: int | None = None,
image_generation_config_id: int | None = None, image_gen_model_id: int | None = None,
vision_llm_config_id: int | None = None, vision_model_id: int | None = None,
) -> AgentDependencies: ) -> AgentDependencies:
"""Load the LLM bundle, connector service, and a per-invoke in-memory checkpointer. """Load the LLM bundle, connector service, and a per-invoke in-memory checkpointer.
Resolves the agent LLM from the automation's *captured* model snapshot Resolves the chat model from the automation's *captured* model snapshot
(``agent_llm_id``) so runs are insulated from later chat/search-space model (``chat_model_id``) so runs are insulated from later chat/search-space model
changes. The model policy is enforced here as a runtime backstop: a captured changes. The model policy is enforced here as a runtime backstop: a captured
model that is no longer billable (e.g. a premium global config was removed) model that is no longer billable (e.g. a premium global config was removed)
fails the run clearly instead of silently consuming a free model. fails the run clearly instead of silently consuming a free model.
When ``agent_llm_id`` is ``None`` (no captured snapshot defensive fallback), When ``chat_model_id`` is ``None`` (no captured snapshot defensive fallback),
fall back to the live search space's ``agent_llm_id`` and validate that. fall back to the live search space's ``chat_model_id`` and validate that.
""" """
if agent_llm_id is not None: if chat_model_id is not None:
try: try:
assert_models_billable( assert_models_billable(
agent_llm_id=agent_llm_id, chat_model_id=chat_model_id,
image_generation_config_id=image_generation_config_id, image_gen_model_id=image_gen_model_id,
vision_llm_config_id=vision_llm_config_id, vision_model_id=vision_model_id,
) )
except AutomationModelPolicyError as exc: except AutomationModelPolicyError as exc:
raise DependencyError(str(exc)) from exc raise DependencyError(str(exc)) from exc
resolved_agent_llm_id = agent_llm_id or 0 resolved_chat_model_id = chat_model_id or 0
else: else:
search_space = await session.get(SearchSpace, search_space_id) search_space = await session.get(SearchSpace, search_space_id)
if search_space is None: if search_space is None:
@ -72,15 +72,15 @@ async def build_dependencies(
assert_automation_models_billable(search_space) assert_automation_models_billable(search_space)
except AutomationModelPolicyError as exc: except AutomationModelPolicyError as exc:
raise DependencyError(str(exc)) from exc raise DependencyError(str(exc)) from exc
resolved_agent_llm_id = search_space.agent_llm_id or 0 resolved_chat_model_id = search_space.chat_model_id or 0
llm, agent_config, err = await load_llm_bundle( llm, agent_config, err = await load_llm_bundle(
session, session,
config_id=resolved_agent_llm_id, config_id=resolved_chat_model_id,
search_space_id=search_space_id, search_space_id=search_space_id,
) )
if err is not None or llm is None: if err is not None or llm is None:
raise DependencyError(err or "failed to load agent LLM config") raise DependencyError(err or "failed to load chat model config")
connector_service, firecrawl_api_key = await setup_connector_and_firecrawl( connector_service, firecrawl_api_key = await setup_connector_and_firecrawl(
session, search_space_id=search_space_id session, search_space_id=search_space_id

View file

@ -150,9 +150,9 @@ async def run_agent_task(
deps = await build_dependencies( deps = await build_dependencies(
session=agent_session, session=agent_session,
search_space_id=ctx.search_space_id, search_space_id=ctx.search_space_id,
agent_llm_id=ctx.agent_llm_id, chat_model_id=ctx.chat_model_id,
image_generation_config_id=ctx.image_generation_config_id, image_gen_model_id=ctx.image_gen_model_id,
vision_llm_config_id=ctx.vision_llm_config_id, vision_model_id=ctx.vision_model_id,
) )
agent = await create_multi_agent_chat_deep_agent( agent = await create_multi_agent_chat_deep_agent(
@ -167,7 +167,7 @@ async def run_agent_task(
firecrawl_api_key=deps.firecrawl_api_key, firecrawl_api_key=deps.firecrawl_api_key,
thread_visibility=ChatVisibility.PRIVATE, thread_visibility=ChatVisibility.PRIVATE,
mentioned_document_ids=mentioned_document_ids, mentioned_document_ids=mentioned_document_ids,
image_generation_config_id=ctx.image_generation_config_id, image_gen_model_id=ctx.image_gen_model_id,
) )
agent_query, runtime_context = await _resolve_mention_context( agent_query, runtime_context = await _resolve_mention_context(

View file

@ -23,9 +23,9 @@ class ActionContext:
# Captured model snapshot from the automation definition (``definition.models``), # Captured model snapshot from the automation definition (``definition.models``),
# resolved per run instead of the live search space. ``None`` falls back to the # resolved per run instead of the live search space. ``None`` falls back to the
# search space's current prefs (defensive; should not happen post-capture). # search space's current prefs (defensive; should not happen post-capture).
agent_llm_id: int | None = None chat_model_id: int | None = None
image_generation_config_id: int | None = None image_gen_model_id: int | None = None
vision_llm_config_id: int | None = None vision_model_id: int | None = None
ActionHandler = Callable[[dict[str, Any]], Awaitable[Any]] ActionHandler = Callable[[dict[str, Any]], Awaitable[Any]]

View file

@ -132,9 +132,7 @@ def _build_action_ctx(
step_id=step.step_id, step_id=step.step_id,
search_space_id=automation.search_space_id, search_space_id=automation.search_space_id,
creator_user_id=automation.created_by_user_id, creator_user_id=automation.created_by_user_id,
agent_llm_id=models.agent_llm_id if models else None, chat_model_id=models.chat_model_id if models else None,
image_generation_config_id=( image_gen_model_id=models.image_gen_model_id if models else None,
models.image_generation_config_id if models else None vision_model_id=models.vision_model_id if models else None,
),
vision_llm_config_id=models.vision_llm_config_id if models else None,
) )

View file

@ -14,16 +14,16 @@ from .trigger_spec import TriggerSpec
class AutomationModels(BaseModel): class AutomationModels(BaseModel):
"""Captured model profile for an automation. """Captured model profile for an automation.
Snapshotted from the search space's preferences at create time so runs are Snapshotted from the search space's model roles at create time so runs are
insulated from later chat/search-space model changes. Config-id conventions insulated from later chat/search-space model changes. Model-id conventions
match the shared scheme (``0`` Auto, ``< 0`` global, ``> 0`` BYOK). match the shared scheme (``0`` Auto, ``< 0`` global, ``> 0`` BYOK).
""" """
model_config = ConfigDict(extra="forbid") model_config = ConfigDict(extra="forbid")
agent_llm_id: int = 0 chat_model_id: int = 0
image_generation_config_id: int = 0 image_gen_model_id: int = 0
vision_llm_config_id: int = 0 vision_model_id: int = 0
class AutomationDefinition(BaseModel): class AutomationDefinition(BaseModel):

View file

@ -57,9 +57,9 @@ class AutomationService:
else: else:
search_space = await self._assert_models_billable(payload.search_space_id) search_space = await self._assert_models_billable(payload.search_space_id)
payload.definition.models = AutomationModels( payload.definition.models = AutomationModels(
agent_llm_id=search_space.agent_llm_id or 0, chat_model_id=search_space.chat_model_id or 0,
image_generation_config_id=search_space.image_generation_config_id or 0, image_gen_model_id=search_space.image_gen_model_id or 0,
vision_llm_config_id=search_space.vision_llm_config_id or 0, vision_model_id=search_space.vision_model_id or 0,
) )
automation = Automation( automation = Automation(
@ -225,9 +225,9 @@ class AutomationService:
""" """
try: try:
assert_models_billable( assert_models_billable(
agent_llm_id=models.agent_llm_id, chat_model_id=models.chat_model_id,
image_generation_config_id=models.image_generation_config_id, image_gen_model_id=models.image_gen_model_id,
vision_llm_config_id=models.vision_llm_config_id, vision_model_id=models.vision_model_id,
) )
except AutomationModelPolicyError as exc: except AutomationModelPolicyError as exc:
raise HTTPException(status_code=422, detail=str(exc)) from exc raise HTTPException(status_code=422, detail=str(exc)) from exc

View file

@ -2,11 +2,11 @@
Automations run unattended, so every run must be **billable**: it may only use Automations run unattended, so every run must be **billable**: it may only use
either a premium global model (``billing_tier == "premium"``) or a user-provided either a premium global model (``billing_tier == "premium"``) or a user-provided
BYOK model (a positive config id pointing at a per-user/per-space DB row). Free BYOK model (a positive model id pointing at a per-user/per-space DB row). Free
global models and Auto mode are blocked, because Auto can dispatch to a free global models and Auto mode are blocked, because Auto can dispatch to a free
deployment and free models aren't metered in premium credits. deployment and free models aren't metered in premium credits.
Config id conventions (shared across chat / image / vision): Model id conventions (shared across chat / image / vision):
- ``id == 0`` Auto mode (``AUTO_MODE_ID`` / ``IMAGE_GEN_AUTO_MODE_ID`` / - ``id == 0`` Auto mode (``AUTO_MODE_ID`` / ``IMAGE_GEN_AUTO_MODE_ID`` /
``VISION_AUTO_MODE_ID``). Blocked. ``VISION_AUTO_MODE_ID``). Blocked.
- ``id < 0`` global YAML/OpenRouter config. Allowed only if premium. - ``id < 0`` global YAML/OpenRouter config. Allowed only if premium.
@ -24,70 +24,45 @@ from typing import TYPE_CHECKING, Literal
if TYPE_CHECKING: if TYPE_CHECKING:
from app.db import SearchSpace from app.db import SearchSpace
ModelKind = Literal["llm", "image", "vision"] ModelKind = Literal["chat", "image", "vision"]
_KIND_LABEL: dict[ModelKind, str] = { _KIND_LABEL: dict[ModelKind, str] = {
"llm": "agent LLM", "chat": "chat model",
"image": "image generation model", "image": "image generation model",
"vision": "vision model", "vision": "vision model",
} }
def _is_premium_global(kind: ModelKind, config_id: int) -> bool: def _is_premium_global(model_id: int) -> bool:
"""Return True if a negative (global) config id is a premium tier model.""" """Return True if a negative (global) model id is a premium tier model."""
from app.config import config as app_config from app.config import config as app_config
cfg: dict | None = None model = next((m for m in app_config.GLOBAL_MODELS if m.get("id") == model_id), None)
if kind == "llm": if not model:
from app.agents.chat.runtime.llm_config import (
load_global_llm_config_by_id,
)
cfg = load_global_llm_config_by_id(config_id)
elif kind == "image":
cfg = next(
(
c
for c in app_config.GLOBAL_IMAGE_GEN_CONFIGS
if c.get("id") == config_id
),
None,
)
else: # vision
cfg = next(
(
c
for c in app_config.GLOBAL_VISION_LLM_CONFIGS
if c.get("id") == config_id
),
None,
)
if not cfg:
return False return False
return str(cfg.get("billing_tier", "free")).lower() == "premium" return str(model.get("billing_tier", "free")).lower() == "premium"
def _classify(kind: ModelKind, config_id: int | None) -> tuple[bool, str]: def _classify(kind: ModelKind, model_id: int | None) -> tuple[bool, str]:
"""Classify a resolved config id as allowed or blocked. """Classify a resolved model id as allowed or blocked.
Returns ``(allowed, reason)``; ``reason`` is empty when allowed. Returns ``(allowed, reason)``; ``reason`` is empty when allowed.
""" """
label = _KIND_LABEL[kind] label = _KIND_LABEL[kind]
if config_id is None or config_id == 0: if model_id is None or model_id == 0:
return ( return (
False, False,
f"The {label} is set to Auto mode. Automations require an explicit " f"The {label} is set to Auto mode. Automations require an explicit "
"premium model or your own (BYOK) model so every run is billable.", "premium model or your own (BYOK) model so every run is billable.",
) )
if config_id > 0: if model_id > 0:
# Positive id → user-owned BYOK config. Always allowed. # Positive id -> user/search-space BYOK model. Always allowed.
return True, "" return True, ""
# Negative id → global config. Allowed only if premium. # Negative id -> global model. Allowed only if premium.
if _is_premium_global(kind, config_id): if _is_premium_global(model_id):
return True, "" return True, ""
return ( return (
@ -99,27 +74,27 @@ def _classify(kind: ModelKind, config_id: int | None) -> tuple[bool, str]:
def get_model_eligibility( def get_model_eligibility(
*, *,
agent_llm_id: int | None, chat_model_id: int | None,
image_generation_config_id: int | None, image_gen_model_id: int | None,
vision_llm_config_id: int | None, vision_model_id: int | None,
) -> dict: ) -> dict:
"""Return ``{"allowed": bool, "violations": [...]}`` for explicit config ids. """Return ``{"allowed": bool, "violations": [...]}`` for explicit model ids.
The ID-based core shared by both the search-space path (creation/eligibility) The ID-based core shared by both the search-space path (creation/eligibility)
and the captured-snapshot path (runtime backstop). Each violation is and the captured-snapshot path (runtime backstop). Each violation is
``{"kind", "config_id", "reason"}``. ``{"kind", "model_id", "reason"}``.
""" """
checks: list[tuple[ModelKind, int | None]] = [ checks: list[tuple[ModelKind, int | None]] = [
("llm", agent_llm_id), ("chat", chat_model_id),
("image", image_generation_config_id), ("image", image_gen_model_id),
("vision", vision_llm_config_id), ("vision", vision_model_id),
] ]
violations: list[dict] = [] violations: list[dict] = []
for kind, config_id in checks: for kind, model_id in checks:
allowed, reason = _classify(kind, config_id) allowed, reason = _classify(kind, model_id)
if not allowed: if not allowed:
violations.append({"kind": kind, "config_id": config_id, "reason": reason}) violations.append({"kind": kind, "model_id": model_id, "reason": reason})
return {"allowed": not violations, "violations": violations} return {"allowed": not violations, "violations": violations}
@ -131,9 +106,9 @@ def get_automation_model_eligibility(search_space: SearchSpace) -> dict:
wrapper over :func:`get_model_eligibility`. wrapper over :func:`get_model_eligibility`.
""" """
return get_model_eligibility( return get_model_eligibility(
agent_llm_id=search_space.agent_llm_id, chat_model_id=search_space.chat_model_id,
image_generation_config_id=search_space.image_generation_config_id, image_gen_model_id=search_space.image_gen_model_id,
vision_llm_config_id=search_space.vision_llm_config_id, vision_model_id=search_space.vision_model_id,
) )
@ -150,9 +125,9 @@ class AutomationModelPolicyError(Exception):
def assert_models_billable( def assert_models_billable(
*, *,
agent_llm_id: int | None, chat_model_id: int | None,
image_generation_config_id: int | None, image_gen_model_id: int | None,
vision_llm_config_id: int | None, vision_model_id: int | None,
) -> None: ) -> None:
"""Raise :class:`AutomationModelPolicyError` if any explicit id is not billable. """Raise :class:`AutomationModelPolicyError` if any explicit id is not billable.
@ -160,9 +135,9 @@ def assert_models_billable(
captured model snapshot. captured model snapshot.
""" """
result = get_model_eligibility( result = get_model_eligibility(
agent_llm_id=agent_llm_id, chat_model_id=chat_model_id,
image_generation_config_id=image_generation_config_id, image_gen_model_id=image_gen_model_id,
vision_llm_config_id=vision_llm_config_id, vision_model_id=vision_model_id,
) )
if not result["allowed"]: if not result["allowed"]:
raise AutomationModelPolicyError(result["violations"]) raise AutomationModelPolicyError(result["violations"])

View file

@ -115,14 +115,12 @@ def init_worker(**kwargs):
initialize_llm_router, initialize_llm_router,
initialize_openrouter_integration, initialize_openrouter_integration,
initialize_pricing_registration, initialize_pricing_registration,
initialize_vision_llm_router,
) )
initialize_openrouter_integration() initialize_openrouter_integration()
initialize_pricing_registration() initialize_pricing_registration()
initialize_llm_router() initialize_llm_router()
initialize_image_gen_router() initialize_image_gen_router()
initialize_vision_llm_router()
# Celery configuration, sourced from the central Config singleton # Celery configuration, sourced from the central Config singleton
@ -192,6 +190,8 @@ celery_app = Celery(
"app.tasks.celery_tasks.stripe_reconciliation_task", "app.tasks.celery_tasks.stripe_reconciliation_task",
"app.tasks.celery_tasks.auto_reload_task", "app.tasks.celery_tasks.auto_reload_task",
"app.tasks.celery_tasks.gateway_tasks", "app.tasks.celery_tasks.gateway_tasks",
"app.etl_pipeline.cache.eviction.task",
"app.indexing_pipeline.cache.eviction.task",
"app.automations.tasks.execute_run", "app.automations.tasks.execute_run",
"app.automations.triggers.builtin.schedule.selector", "app.automations.triggers.builtin.schedule.selector",
"app.automations.triggers.builtin.event.selector", "app.automations.triggers.builtin.event.selector",
@ -306,6 +306,18 @@ celery_app.conf.beat_schedule = {
"schedule": crontab(hour="3", minute="17"), "schedule": crontab(hour="3", minute="17"),
"options": {"expires": 600}, "options": {"expires": 600},
}, },
# Prune the ETL parse cache (TTL + size budget) once daily, off-peak.
"evict-etl-cache": {
"task": "evict_etl_cache",
"schedule": crontab(hour="4", minute="0"),
"options": {"expires": 600},
},
# Prune the embedding cache (chunk+embedding sets) once daily, off-peak.
"evict-embedding-cache": {
"task": "evict_embedding_cache",
"schedule": crontab(hour="4", minute="30"),
"options": {"expires": 600},
},
# Fire due automation schedule triggers (Beat entry owned by the schedule # Fire due automation schedule triggers (Beat entry owned by the schedule
# trigger; see app.automations.triggers.builtin.schedule.source). # trigger; see app.automations.triggers.builtin.schedule.source).
**SCHEDULE_BEAT_SCHEDULE, **SCHEDULE_BEAT_SCHEDULE,

View file

@ -78,8 +78,7 @@ def load_global_llm_configs():
# stamps) never leak into the cached YAML structure. # stamps) never leak into the cached YAML structure.
configs = copy.deepcopy(data.get("global_llm_configs", [])) configs = copy.deepcopy(data.get("global_llm_configs", []))
# Lazy import keeps the `app.config` -> `app.services` edge one-way # Lazy import keeps the `app.config` -> `app.services` edge one-way.
# and matches the `provider_api_base` pattern used elsewhere.
from app.services.provider_capabilities import derive_supports_image_input from app.services.provider_capabilities import derive_supports_image_input
seen_slugs: dict[str, int] = {} seen_slugs: dict[str, int] = {}
@ -104,7 +103,7 @@ def load_global_llm_configs():
else None else None
) )
cfg["supports_image_input"] = derive_supports_image_input( cfg["supports_image_input"] = derive_supports_image_input(
provider=cfg.get("provider"), provider=cfg.get("provider") or cfg.get("litellm_provider"),
model_name=cfg.get("model_name"), model_name=cfg.get("model_name"),
base_model=base_model, base_model=base_model,
custom_provider=cfg.get("custom_provider"), custom_provider=cfg.get("custom_provider"),
@ -120,10 +119,10 @@ def load_global_llm_configs():
else: else:
seen_slugs[slug] = cfg.get("id", 0) seen_slugs[slug] = cfg.get("id", 0)
# Stamp Auto (Fastest) ranking metadata. YAML configs are always # Stamp Auto ranking metadata. YAML configs are always
# Tier A — operator-curated, locked first when premium-eligible. # Tier A — operator-curated, locked first when premium-eligible.
# The OpenRouter refresh tick later re-stamps health for any cfg # The OpenRouter refresh tick later re-stamps health for any cfg
# whose provider == "OPENROUTER" via _enrich_health. # whose provider == "openrouter" via _enrich_health.
try: try:
from app.services.quality_score import static_score_yaml from app.services.quality_score import static_score_yaml
@ -133,7 +132,7 @@ def load_global_llm_configs():
cfg["quality_score_static"] = static_q cfg["quality_score_static"] = static_q
cfg["quality_score"] = static_q cfg["quality_score"] = static_q
cfg["quality_score_health"] = None cfg["quality_score_health"] = None
# YAML cfgs whose provider is OPENROUTER are also subject # YAML cfgs whose provider is openrouter are also subject
# to health gating against their own /endpoints data — a # to health gating against their own /endpoints data — a
# hand-picked dead OR model is still dead. _enrich_health # hand-picked dead OR model is still dead. _enrich_health
# re-stamps health_gated for them on the next refresh tick. # re-stamps health_gated for them on the next refresh tick.
@ -211,42 +210,6 @@ def load_global_image_gen_configs():
return [] return []
def load_global_vision_llm_configs():
data = _global_config_data()
if not data:
return []
try:
configs = copy.deepcopy(data.get("global_vision_llm_configs", []) or [])
for cfg in configs:
if isinstance(cfg, dict):
cfg.setdefault("billing_tier", "free")
return configs
except Exception as e:
print(f"Warning: Failed to load global vision LLM configs: {e}")
return []
def load_vision_llm_router_settings():
    """Return the router settings used for Vision LLM Auto mode.

    Built-in defaults are overlaid with the ``vision_llm_router_settings``
    mapping from the global config data; keys declared there win.

    Returns:
        dict: The merged settings. The defaults alone are returned when no
        global config data is available, when the section is missing or
        empty, or when the section cannot be merged.
    """
    default_settings = {
        "routing_strategy": "usage-based-routing",
        "num_retries": 3,
        "allowed_fails": 3,
        "cooldown_time": 60,
    }
    data = _global_config_data()
    if not data:
        return default_settings
    try:
        # A key that is present but left empty comes back as None rather than
        # the ``.get`` default. Coalesce it to an empty mapping so an empty
        # section means "use defaults" instead of failing the merge below and
        # printing a misleading warning.
        settings = data.get("vision_llm_router_settings") or {}
        return {**default_settings, **settings}
    except Exception as e:
        # Best-effort loader: a malformed section must not break startup.
        print(f"Warning: Failed to load vision LLM router settings: {e}")
        return default_settings
def load_image_gen_router_settings(): def load_image_gen_router_settings():
""" """
Load router settings for image generation Auto mode from YAML file. Load router settings for image generation Auto mode from YAML file.
@ -363,8 +326,8 @@ def initialize_openrouter_integration():
else: else:
print("Info: OpenRouter integration enabled but no models fetched") print("Info: OpenRouter integration enabled but no models fetched")
# Image generation + vision LLM emissions are opt-in (issue L). # Image generation emissions reuse the catalogue already cached by
# Both reuse the catalogue already cached by ``service.initialize`` # ``service.initialize``
# so we don't make additional network calls here. # so we don't make additional network calls here.
if settings.get("image_generation_enabled"): if settings.get("image_generation_enabled"):
try: try:
@ -378,21 +341,26 @@ def initialize_openrouter_integration():
except Exception as e: except Exception as e:
print(f"Warning: Failed to inject OpenRouter image-gen configs: {e}") print(f"Warning: Failed to inject OpenRouter image-gen configs: {e}")
if settings.get("vision_enabled"): refresh_global_model_catalog()
try:
vision_configs = service.get_vision_llm_configs()
if vision_configs:
config.GLOBAL_VISION_LLM_CONFIGS.extend(vision_configs)
print(
f"Info: OpenRouter integration added {len(vision_configs)} "
f"vision LLM models"
)
except Exception as e:
print(f"Warning: Failed to inject OpenRouter vision-LLM configs: {e}")
except Exception as e: except Exception as e:
print(f"Warning: Failed to initialize OpenRouter integration: {e}") print(f"Warning: Failed to initialize OpenRouter integration: {e}")
def materialize_global_configs():
    """Build the global model catalog from the currently loaded global configs.

    Passes the chat and image-generation config lists held on ``config``
    (each defaulting to an empty list when the attribute is absent) to
    ``materialize_global_model_catalog`` and returns its result unchanged.
    """
    # Imported at call time rather than at module import time.
    from app.services.global_model_catalog import materialize_global_model_catalog

    chat_entries = getattr(config, "GLOBAL_LLM_CONFIGS", [])
    image_entries = getattr(config, "GLOBAL_IMAGE_GEN_CONFIGS", [])
    return materialize_global_model_catalog(
        chat_configs=chat_entries,
        image_configs=image_entries,
    )
def refresh_global_model_catalog():
    """Recompute the global catalog and publish it on ``config``.

    Stores the two values returned by ``materialize_global_configs`` as
    ``config.GLOBAL_CONNECTIONS`` and ``config.GLOBAL_MODELS`` respectively.
    """
    # The result is unpacked in full before either attribute is written.
    config.GLOBAL_CONNECTIONS, config.GLOBAL_MODELS = materialize_global_configs()
def initialize_pricing_registration(): def initialize_pricing_registration():
""" """
Teach LiteLLM the per-token cost of every deployment in Teach LiteLLM the per-token cost of every deployment in
@ -430,7 +398,10 @@ def initialize_llm_router():
router_settings = config.ROUTER_SETTINGS router_settings = config.ROUTER_SETTINGS
if not all_configs: if not all_configs:
print("Info: No global LLM configs found, Auto mode will not be available") print(
"Info: No global LLM configs found; global Auto pool is unavailable. "
"Auto can still use enabled BYOK models."
)
return return
try: try:
@ -475,32 +446,6 @@ def initialize_image_gen_router():
print(f"Warning: Failed to initialize Image Generation Router: {e}") print(f"Warning: Failed to initialize Image Generation Router: {e}")
def initialize_vision_llm_router():
    """Initialize the Vision LLM router from the global vision configs.

    Prints an informational message and returns early when no vision configs
    are available. Any failure while initializing the router is reported as a
    warning instead of being raised.
    """
    static_configs = load_global_vision_llm_configs()
    # Router settings were already parsed at Config construction, so they are
    # reused here. The config list, however, is deliberately re-read from YAML:
    # it must exclude the OpenRouter-injected dynamic models held in
    # config.GLOBAL_VISION_LLM_CONFIGS.
    settings = config.VISION_LLM_ROUTER_SETTINGS

    if not static_configs:
        print(
            "Info: No global vision LLM configs found, "
            "Vision LLM Auto mode will not be available"
        )
        return

    try:
        from app.services.vision_llm_router_service import VisionLLMRouterService

        VisionLLMRouterService.initialize(static_configs, settings)
        model_count = len(static_configs)
        strategy = settings.get("routing_strategy", "usage-based-routing")
        print(
            f"Info: Vision LLM Router initialized with {model_count} models "
            f"(strategy: {strategy})"
        )
    except Exception as exc:
        print(f"Warning: Failed to initialize Vision LLM Router: {exc}")
class Config: class Config:
# Check if ffmpeg is installed # Check if ffmpeg is installed
if not is_ffmpeg_installed(): if not is_ffmpeg_installed():
@ -612,14 +557,15 @@ class Config:
# Platform web search (SearXNG) # Platform web search (SearXNG)
SEARXNG_DEFAULT_HOST = os.getenv("SEARXNG_DEFAULT_HOST") SEARXNG_DEFAULT_HOST = os.getenv("SEARXNG_DEFAULT_HOST")
NEXT_FRONTEND_URL = os.getenv("NEXT_FRONTEND_URL") SURFSENSE_PUBLIC_URL = os.getenv("SURFSENSE_PUBLIC_URL")
NEXT_FRONTEND_URL = os.getenv("NEXT_FRONTEND_URL") or SURFSENSE_PUBLIC_URL
# Backend URL to override the http to https in the OAuth redirect URI # Backend URL to override the http to https in the OAuth redirect URI
BACKEND_URL = os.getenv("BACKEND_URL") BACKEND_URL = os.getenv("BACKEND_URL") or SURFSENSE_PUBLIC_URL
# Messaging gateway (Telegram v1) # Messaging gateway
# Global master switch: when FALSE, no gateway supervisors/workers start and all # Global master switch: when FALSE, no gateway supervisors/workers start and all
# gateway HTTP routes return 404, regardless of the per-channel flags below. # gated gateway HTTP routes return 404, regardless of the per-channel flags below.
GATEWAY_ENABLED = os.getenv("GATEWAY_ENABLED", "TRUE").upper() == "TRUE" GATEWAY_ENABLED = os.getenv("GATEWAY_ENABLED", "FALSE").upper() == "TRUE"
TELEGRAM_SHARED_BOT_TOKEN = os.getenv("TELEGRAM_SHARED_BOT_TOKEN") TELEGRAM_SHARED_BOT_TOKEN = os.getenv("TELEGRAM_SHARED_BOT_TOKEN")
TELEGRAM_SHARED_BOT_USERNAME = os.getenv("TELEGRAM_SHARED_BOT_USERNAME") TELEGRAM_SHARED_BOT_USERNAME = os.getenv("TELEGRAM_SHARED_BOT_USERNAME")
TELEGRAM_WEBHOOK_SECRET = os.getenv("TELEGRAM_WEBHOOK_SECRET") TELEGRAM_WEBHOOK_SECRET = os.getenv("TELEGRAM_WEBHOOK_SECRET")
@ -784,7 +730,7 @@ class Config:
os.getenv("QUOTA_DEFAULT_IMAGE_RESERVE_MICROS", "50000") os.getenv("QUOTA_DEFAULT_IMAGE_RESERVE_MICROS", "50000")
) )
# Per-podcast reservation (in micro-USD). One agent LLM call generating # Per-podcast reservation (in micro-USD). One chat model call generating
# a transcript, typically 5k-20k completion tokens. $0.20 covers a long # a transcript, typically 5k-20k completion tokens. $0.20 covers a long
# premium-model run. Tune via env. # premium-model run. Tune via env.
QUOTA_DEFAULT_PODCAST_RESERVE_MICROS = int( QUOTA_DEFAULT_PODCAST_RESERVE_MICROS = int(
@ -890,6 +836,13 @@ class Config:
# LLM instances are now managed per-user through the LLMConfig system # LLM instances are now managed per-user through the LLMConfig system
# Legacy environment variables removed in favor of user-specific configurations # Legacy environment variables removed in favor of user-specific configurations
# True when an operator-provided global_llm_config.yaml is present.
# Used to gate the per-search-space LLM onboarding flow: when a global
# config file exists, search spaces inherit it and onboarding is skipped.
GLOBAL_LLM_CONFIG_FILE_EXISTS = (
BASE_DIR / "app" / "config" / "global_llm_config.yaml"
).exists()
# Global LLM Configurations (optional) # Global LLM Configurations (optional)
# Load from global_llm_config.yaml if available # Load from global_llm_config.yaml if available
# These can be used as default options for users # These can be used as default options for users
@ -904,11 +857,17 @@ class Config:
# Router settings for Image Generation Auto mode # Router settings for Image Generation Auto mode
IMAGE_GEN_ROUTER_SETTINGS = load_image_gen_router_settings() IMAGE_GEN_ROUTER_SETTINGS = load_image_gen_router_settings()
# Global Vision LLM Configurations (optional) # Virtual GLOBAL connection/model catalog. This is server-only metadata
GLOBAL_VISION_LLM_CONFIGS = load_global_vision_llm_configs() # derived from global_llm_config.yaml; GLOBAL keys are not stored in DB.
from app.services.global_model_catalog import (
materialize_global_model_catalog as _materialize_global_model_catalog,
)
# Router settings for Vision LLM Auto mode GLOBAL_CONNECTIONS, GLOBAL_MODELS = _materialize_global_model_catalog(
VISION_LLM_ROUTER_SETTINGS = load_vision_llm_router_settings() chat_configs=GLOBAL_LLM_CONFIGS,
image_configs=GLOBAL_IMAGE_GEN_CONFIGS,
)
del _materialize_global_model_catalog
# OpenRouter Integration settings (optional) # OpenRouter Integration settings (optional)
OPENROUTER_INTEGRATION_SETTINGS = load_openrouter_integration_settings() OPENROUTER_INTEGRATION_SETTINGS = load_openrouter_integration_settings()
@ -974,6 +933,47 @@ class Config:
AZURE_DI_ENDPOINT = os.getenv("AZURE_DI_ENDPOINT") AZURE_DI_ENDPOINT = os.getenv("AZURE_DI_ENDPOINT")
AZURE_DI_KEY = os.getenv("AZURE_DI_KEY") AZURE_DI_KEY = os.getenv("AZURE_DI_KEY")
# ETL parse cache: reuse parser output for identical bytes across workspaces.
ETL_CACHE_ENABLED = (
os.getenv("ETL_CACHE_ENABLED", "false").strip().lower() == "true"
)
# Bump to invalidate every cached entry after a parser/behaviour change.
ETL_CACHE_PARSER_VERSION = int(os.getenv("ETL_CACHE_PARSER_VERSION", "1"))
ETL_CACHE_TTL_DAYS = int(os.getenv("ETL_CACHE_TTL_DAYS", "90"))
ETL_CACHE_MAX_TOTAL_MB = int(os.getenv("ETL_CACHE_MAX_TOTAL_MB", "5120"))
ETL_CACHE_EVICTION_BATCH = int(os.getenv("ETL_CACHE_EVICTION_BATCH", "500"))
# Optional dedicated blob storage; unset reuses the main file_storage backend.
ETL_CACHE_STORAGE_BACKEND = os.getenv("ETL_CACHE_STORAGE_BACKEND")
ETL_CACHE_STORAGE_CONTAINER = os.getenv("ETL_CACHE_STORAGE_CONTAINER")
ETL_CACHE_STORAGE_LOCAL_PATH = os.getenv("ETL_CACHE_STORAGE_LOCAL_PATH")
# Embedding cache: reuse chunk+embedding output for identical markdown across
# workspaces. Blobs share the ETL_CACHE_STORAGE_* backend.
EMBEDDING_CACHE_ENABLED = (
os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true"
)
# Bump to invalidate every cached embedding set after a chunker change.
EMBEDDING_CACHE_CHUNKER_VERSION = int(
os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1")
)
EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90"))
EMBEDDING_CACHE_MAX_TOTAL_MB = int(
os.getenv("EMBEDDING_CACHE_MAX_TOTAL_MB", "5120")
)
EMBEDDING_CACHE_EVICTION_BATCH = int(
os.getenv("EMBEDDING_CACHE_EVICTION_BATCH", "500")
)
# Incremental re-indexing: on document edits, keep chunk rows whose text is
# unchanged (reusing their embeddings) and embed only new/changed chunks.
# Kill switch -- disabling falls back to delete-all + full re-embed.
CHUNK_RECONCILE_ENABLED = (
os.getenv("CHUNK_RECONCILE_ENABLED", "true").strip().lower() == "true"
)
INDEXING_CHUNK_INSERT_BATCH_SIZE = int(
os.getenv("INDEXING_CHUNK_INSERT_BATCH_SIZE", "200")
)
# Proxy provider selection. Maps to a ProxyProvider implementation registered # Proxy provider selection. Maps to a ProxyProvider implementation registered
# in app/utils/proxy/registry.py. Add new vendors there and switch via this var. # in app/utils/proxy/registry.py. Add new vendors there and switch via this var.
PROXY_PROVIDER = os.getenv("PROXY_PROVIDER", "anonymous_proxies") PROXY_PROVIDER = os.getenv("PROXY_PROVIDER", "anonymous_proxies")

View file

@ -1,362 +1,236 @@
# Global LLM Configuration # Global LLM Configuration
# #
# SETUP INSTRUCTIONS: # SETUP INSTRUCTIONS:
# 1. For production: Copy this file to global_llm_config.yaml and add your real API keys # 1. Copy this file to global_llm_config.yaml.
# 2. For testing: The system will use this example file automatically if global_llm_config.yaml doesn't exist # 2. Replace placeholder credentials, endpoints, deployment names, and pricing
# with values from your own provider accounts.
# #
# NOTE: The example API keys below are placeholders and won't work. # This file is intentionally safe to commit. Do not put real API keys in this
# Replace them with your actual API keys to enable global configurations. # example file.
# #
# These configurations will be available to all users as a convenient option # These YAML entries are materialized at startup as server-owned GLOBAL
# Users can choose to use these global configs or add their own # connections and models:
# #
# AUTO MODE (Recommended): # global_llm_configs -> GLOBAL chat models
# - Auto mode (ID: 0) uses LiteLLM Router to automatically load balance across all global configs # global_image_generation_configs -> GLOBAL image generation models
# - This helps avoid rate limits by distributing requests across multiple providers
# - New users are automatically assigned Auto mode by default
# - Configure router_settings below to customize the load balancing behavior
# #
# Structure matches NewLLMConfig: # Do not add global_connections or global_models sections here. They are
# - Model configuration (provider, model_name, api_key, etc.) # runtime-derived metadata exposed through the model-connections APIs.
# - Prompt configuration (system_instructions, citations_enabled) #
# Static config shape:
# - Connection fields: provider, api_key, api_base, api_version
# - Model fields: model_name, billing_tier, rpm/tpm, capabilities, litellm_params
# - Public no-login SEO metadata: seo_title, seo_description
# - Prompt defaults: system_instructions, use_default_system_instructions,
# citations_enabled
#
# Provider notes:
# - Use the canonical provider field.
# - For Azure, use the bare deployment name in model_name, for example
# model_name: "gpt-5.1". The resolver prefixes the LiteLLM model string from
# provider: "azure".
#
# GLOBAL ID namespace:
# - ID 0 is reserved for Auto mode.
# - Negative IDs are server-owned GLOBAL models.
# - Positive IDs are user/BYOK database models.
# - Keep static IDs unique across chat and image generation.
# - Suggested static ranges: chat -1..-999, image -2001..-2999.
# - Vision is not a separate config/table. Chat models that accept images use
# supports_image_input: true.
# #
# COST-BASED PREMIUM CREDITS: # COST-BASED PREMIUM CREDITS:
# Each premium config bills the user's USD-credit balance based on the # Each premium model bills the user's USD-credit balance based on provider cost
# actual provider cost reported by LiteLLM. For models LiteLLM already # reported by LiteLLM. For custom Azure deployments or any model LiteLLM does
# knows (most OpenAI/Anthropic/etc. names) you don't need to do anything. # not know, declare per-token costs inline:
# For custom Azure deployment names (e.g. an in-house "gpt-5.4" deployment)
# or any model LiteLLM doesn't have in its built-in pricing table, declare
# per-token costs inline so they bill correctly:
# #
# litellm_params: # litellm_params:
# base_model: "my-custom-azure-deploy" # base_model: "my-custom-deployment"
# # USD per token; e.g. 0.000003 == $3.00 per million input tokens # # USD per token; 0.00000125 == $1.25 per million input tokens.
# input_cost_per_token: 0.000003 # input_cost_per_token: 0.00000125
# output_cost_per_token: 0.000015 # output_cost_per_token: 0.00001
# #
# OpenRouter dynamic models pull pricing automatically from OpenRouter's # OpenRouter dynamic chat models pull pricing automatically from OpenRouter's
# API — no inline declaration needed. Models without resolvable pricing # API. Models without resolvable pricing debit $0 and log a warning.
# debit $0 from the user's balance and log a WARNING.
# Router Settings for Auto Mode # =============================================================================
# These settings control how the LiteLLM Router distributes requests across models # Chat Auto Mode Router Settings
# =============================================================================
# These settings control how the LiteLLM Router distributes Auto-mode requests
# across curated router-eligible GLOBAL chat deployments.
router_settings: router_settings:
# Routing strategy options: # Routing strategy options:
# - "usage-based-routing": Routes to deployment with lowest current usage (recommended for rate limits) # - "usage-based-routing": Routes to deployment with lowest current usage.
# - "simple-shuffle": Random distribution with optional RPM/TPM weighting # - "simple-shuffle": Random distribution with optional RPM/TPM weighting.
# - "least-busy": Routes to least busy deployment # - "least-busy": Routes to least busy deployment.
# - "latency-based-routing": Routes based on response latency # - "latency-based-routing": Routes based on response latency.
routing_strategy: "usage-based-routing" routing_strategy: "usage-based-routing"
# Number of retries before failing
num_retries: 3 num_retries: 3
# Number of failures allowed before cooling down a deployment
allowed_fails: 3 allowed_fails: 3
# Cooldown time in seconds after allowed_fails is exceeded
cooldown_time: 60 cooldown_time: 60
# Optional fallback map:
# fallbacks:
# - {"azure/gpt-5.1": ["azure/gpt-5.4-mini"]}
# Fallback models (optional) - when primary fails, try these # =============================================================================
# Format: [{"primary_model": ["fallback1", "fallback2"]}] # Static GLOBAL Chat Models
# fallbacks: [] # =============================================================================
global_llm_configs: global_llm_configs:
# Example: OpenAI GPT-4 Turbo with citations enabled # Premium Azure chat model with image input support and explicit custom
# pricing. This is the current shape to use for hosted GPT 5.x deployments.
- id: -1 - id: -1
name: "Global GPT-4 Turbo" name: "Azure GPT 5.1"
description: "OpenAI's GPT-4 Turbo with default prompts and citations" billing_tier: "premium"
billing_tier: "free" anonymous_enabled: false
anonymous_enabled: true seo_enabled: false
seo_enabled: true seo_slug: "azure-gpt-5-1"
seo_slug: "gpt-4-turbo"
quota_reserve_tokens: 4000 quota_reserve_tokens: 4000
provider: "OPENAI" provider: "azure"
model_name: "gpt-4-turbo-preview" model_name: "gpt-5.1"
api_key: "sk-your-openai-api-key-here" supports_image_input: true
api_base: "" supports_tools: true
# Rate limits for load balancing (requests/tokens per minute) max_input_tokens: 400000
rpm: 500 # Requests per minute api_key: "your-azure-api-key-here"
tpm: 100000 # Tokens per minute api_base: "https://your-resource.openai.azure.com"
# api_version is optional. Include it if your Azure deployment requires a
# specific API version.
# api_version: "2025-04-01-preview"
rpm: 47500
tpm: 14750000
litellm_params: litellm_params:
temperature: 0.7 max_tokens: 16384
max_tokens: 4000 base_model: "gpt-5.1"
# Prompt Configuration input_cost_per_token: 0.00000125
system_instructions: "" # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS output_cost_per_token: 0.00001
system_instructions: ""
use_default_system_instructions: true use_default_system_instructions: true
citations_enabled: true citations_enabled: true
# Example: Anthropic Claude 3 Opus # Larger premium chat model. If your provider prices long-context traffic
# differently, choose a conservative flat price or document the limitation
# next to the inline pricing.
- id: -2 - id: -2
name: "Global Claude 3 Opus" name: "Azure GPT 5.4"
description: "Anthropic's most capable model with citations" billing_tier: "premium"
billing_tier: "free" anonymous_enabled: false
anonymous_enabled: true seo_enabled: false
seo_enabled: true seo_slug: "azure-gpt-5-4"
seo_slug: "claude-3-opus"
quota_reserve_tokens: 4000 quota_reserve_tokens: 4000
provider: "ANTHROPIC" provider: "azure"
model_name: "claude-3-opus-20240229" model_name: "gpt-5.4"
api_key: "sk-ant-your-anthropic-api-key-here" supports_image_input: true
api_base: "" supports_tools: true
rpm: 1000 max_input_tokens: 400000
tpm: 100000 api_key: "your-azure-api-key-here"
api_base: "https://your-resource.openai.azure.com"
rpm: 150000
tpm: 15000000
litellm_params: litellm_params:
temperature: 0.7 max_tokens: 16384
max_tokens: 4000 base_model: "gpt-5.4"
input_cost_per_token: 0.0000025
output_cost_per_token: 0.000015
system_instructions: "" system_instructions: ""
use_default_system_instructions: true use_default_system_instructions: true
citations_enabled: true citations_enabled: true
# Example: Fast model - GPT-3.5 Turbo (citations disabled for speed) # Free/no-login hosted model. Free models are visible to users when
# anonymous_enabled/seo_enabled are true but do not debit premium credits.
- id: -3 - id: -3
name: "Global GPT-3.5 Turbo (Fast)" name: "Azure GPT 5.4 Mini"
description: "Fast responses without citations for quick queries"
billing_tier: "free" billing_tier: "free"
anonymous_enabled: true anonymous_enabled: true
seo_enabled: true seo_enabled: true
seo_slug: "gpt-3.5-turbo-fast" seo_slug: "gpt-5-4-mini-no-login"
quota_reserve_tokens: 2000 seo_title: "Free GPT 5.4 Mini Chat"
provider: "OPENAI" seo_description: "Chat with a hosted GPT 5.4 Mini model without signing in."
model_name: "gpt-3.5-turbo"
api_key: "sk-your-openai-api-key-here"
api_base: ""
rpm: 3500 # GPT-3.5 has higher rate limits
tpm: 200000
litellm_params:
temperature: 0.5
max_tokens: 2000
system_instructions: ""
use_default_system_instructions: true
citations_enabled: false # Disabled for faster responses
# Example: Chinese LLM - DeepSeek with custom instructions
- id: -4
name: "Global DeepSeek Chat (Chinese)"
description: "DeepSeek optimized for Chinese language responses"
billing_tier: "free"
anonymous_enabled: true
seo_enabled: true
seo_slug: "deepseek-chat-chinese"
quota_reserve_tokens: 4000 quota_reserve_tokens: 4000
provider: "DEEPSEEK" provider: "azure"
model_name: "deepseek-chat" model_name: "gpt-5.4-mini"
api_key: "your-deepseek-api-key-here" supports_image_input: false
api_base: "https://api.deepseek.com/v1" supports_tools: true
rpm: 60 max_input_tokens: 128000
tpm: 100000
litellm_params:
temperature: 0.7
max_tokens: 4000
# Custom system instructions for Chinese responses
system_instructions: |
<system_instruction>
You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
Today's date (UTC): {resolved_today}
IMPORTANT: Please respond in Chinese (简体中文) unless the user specifically requests another language.
</system_instruction>
use_default_system_instructions: false
citations_enabled: true
# Example: Azure OpenAI GPT-4o
# IMPORTANT: For Azure deployments, always include 'base_model' in litellm_params
# to enable accurate token counting, cost tracking, and max token limits
- id: -5
name: "Global Azure GPT-4o"
description: "Azure OpenAI GPT-4o deployment"
billing_tier: "free"
anonymous_enabled: true
seo_enabled: true
seo_slug: "azure-gpt-4o"
quota_reserve_tokens: 4000
provider: "AZURE"
# model_name format for Azure: azure/<your-deployment-name>
model_name: "azure/gpt-4o-deployment"
api_key: "your-azure-api-key-here" api_key: "your-azure-api-key-here"
api_base: "https://your-resource.openai.azure.com" api_base: "https://your-resource.openai.azure.com"
api_version: "2024-02-15-preview" # Azure API version rpm: 15000
rpm: 1000 tpm: 15000000
tpm: 150000
litellm_params: litellm_params:
temperature: 0.7 max_tokens: 16384
max_tokens: 4000 base_model: "gpt-5.4-mini"
# REQUIRED for Azure: Specify the underlying OpenAI model
# This fixes "Could not identify azure model" warnings
# Common base_model values: gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini, gpt-3.5-turbo
base_model: "gpt-4o"
system_instructions: "" system_instructions: ""
use_default_system_instructions: true use_default_system_instructions: true
citations_enabled: true citations_enabled: true
# Example: Azure OpenAI GPT-4 Turbo # Planner LLM. This is operator-only and is not shown in the user-facing
- id: -6 # model selector. Only one global_llm_configs entry should set is_planner.
name: "Global Azure GPT-4 Turbo"
description: "Azure OpenAI GPT-4 Turbo deployment"
billing_tier: "free"
anonymous_enabled: true
seo_enabled: true
seo_slug: "azure-gpt-4-turbo"
quota_reserve_tokens: 4000
provider: "AZURE"
model_name: "azure/gpt-4-turbo-deployment"
api_key: "your-azure-api-key-here"
api_base: "https://your-resource.openai.azure.com"
api_version: "2024-02-15-preview"
rpm: 500
tpm: 100000
litellm_params:
temperature: 0.7
max_tokens: 4000
base_model: "gpt-4-turbo" # Maps to gpt-4-turbo-preview
system_instructions: ""
use_default_system_instructions: true
citations_enabled: true
# Example: Groq - Fast inference
- id: -7
name: "Global Groq Llama 3"
description: "Ultra-fast Llama 3 70B via Groq"
billing_tier: "free"
anonymous_enabled: true
seo_enabled: true
seo_slug: "groq-llama-3"
quota_reserve_tokens: 8000
provider: "GROQ"
model_name: "llama3-70b-8192"
api_key: "your-groq-api-key-here"
api_base: ""
rpm: 30 # Groq has lower rate limits on free tier
tpm: 14400
litellm_params:
temperature: 0.7
max_tokens: 8000
system_instructions: ""
use_default_system_instructions: true
citations_enabled: true
# Example: MiniMax M3 - High-performance with 512K context window
- id: -8
name: "Global MiniMax M3"
description: "MiniMax M3 with 512K context window and competitive pricing"
billing_tier: "free"
anonymous_enabled: true
seo_enabled: true
seo_slug: "minimax-m3"
quota_reserve_tokens: 4000
provider: "MINIMAX"
model_name: "MiniMax-M3"
api_key: "your-minimax-api-key-here"
api_base: "https://api.minimax.io/v1"
rpm: 60
tpm: 100000
litellm_params:
temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0], cannot be 0
max_tokens: 4000
system_instructions: ""
use_default_system_instructions: true
citations_enabled: true
# Example: Planner LLM - small, fast model used for internal utility tasks
#
# The PLANNER role handles short, structured internal calls (KB query
# rewriting, date extraction, recency classification, etc.) that don't
# need frontier-tier capability. Pointing the planner at a cheap+fast
# model (gpt-4o-mini, Claude Haiku, Azure gpt-5.x-nano, Groq Llama, ...)
# typically saves 500ms-1.5s per turn vs. routing those same internal
# calls through the user's chat model.
#
# Activation:
# - Mark EXACTLY ONE global config with ``is_planner: true``.
# - If multiple are marked, the first one wins and a WARNING is logged.
# - If none is marked, every internal call falls back to the user's
# chat LLM (same behavior as before this flag existed).
#
# This config is operator-only — it is NOT exposed in the user-facing
# model selector, never billed against premium quota, and the
# billing_tier / anonymous_enabled fields below are ignored.
- id: -9 - id: -9
name: "Global Planner (GPT-4o mini)" name: "Azure GPT 5.x Nano Planner"
description: "Internal-only planner LLM for query rewriting and classification"
is_planner: true is_planner: true
billing_tier: "free" billing_tier: "free"
anonymous_enabled: false anonymous_enabled: false
seo_enabled: false seo_enabled: false
quota_reserve_tokens: 1000 quota_reserve_tokens: 1000
provider: "OPENAI" provider: "azure"
model_name: "gpt-4o-mini" model_name: "gpt-5.4-nano"
api_key: "sk-your-openai-api-key-here" supports_image_input: false
api_base: "" supports_tools: false
rpm: 3500 router_pool_eligible: false
tpm: 200000 api_key: "your-azure-api-key-here"
api_base: "https://your-resource.openai.azure.com"
rpm: 20000
tpm: 4000000
litellm_params: litellm_params:
temperature: 0 temperature: 0
max_tokens: 1000 max_tokens: 1000
base_model: "gpt-5.4-nano"
system_instructions: "" system_instructions: ""
use_default_system_instructions: true use_default_system_instructions: true
citations_enabled: false citations_enabled: false
# ============================================================================= # =============================================================================
# OpenRouter Integration # OpenRouter Dynamic Model Integration
# ============================================================================= # =============================================================================
# When enabled, dynamically fetches ALL available models from the OpenRouter API # When enabled, SurfSense fetches the OpenRouter catalog at startup and injects
# and injects them as global configs. This gives premium users access to any model # supported models as GLOBAL chat and optionally image-generation models.
# on OpenRouter (Claude, Gemini, Llama, Mistral, etc.) via their premium token quota, # Tier is derived per model from OpenRouter data:
# while free-tier OpenRouter models show up with a green Free badge and do NOT # - model id ends with ":free" -> billing_tier=free
# consume premium quota. # - prompt and completion pricing are zero -> billing_tier=free
# Models are fetched at startup and refreshed periodically in the background. # - otherwise -> billing_tier=premium
# All calls go through LiteLLM with the openrouter/ prefix. #
# Do not use deprecated openrouter_integration.billing_tier or
# openrouter_integration.anonymous_enabled. Use the tier-specific anonymous
# switches below.
openrouter_integration: openrouter_integration:
enabled: false enabled: false
api_key: "sk-or-your-openrouter-api-key" api_key: "sk-or-your-openrouter-api-key"
# Tier is derived PER MODEL from OpenRouter's own API signals:
# - id ends with ":free" -> billing_tier=free
# - pricing.prompt AND pricing.completion == "0" -> billing_tier=free
# - otherwise -> billing_tier=premium
# No global billing_tier knob is honored; any legacy value emits a startup warning.
# Anonymous access is split by tier so operators can expose only free
# models to no-login users without leaking paid inference.
anonymous_enabled_paid: false anonymous_enabled_paid: false
anonymous_enabled_free: false anonymous_enabled_free: false
seo_enabled: false seo_enabled: false
# quota_reserve_tokens: tokens reserved per call for quota enforcement
quota_reserve_tokens: 4000 quota_reserve_tokens: 4000
# id_offset: base negative ID for dynamically generated configs.
# Model IDs are derived deterministically via BLAKE2b so they survive # Base negative ID namespace for dynamic chat models. IDs are derived
# catalogue churn. Must not overlap with your static global_llm_configs IDs. # deterministically so they survive catalog churn. Do not overlap static IDs.
id_offset: -10000 id_offset: -10000
# refresh_interval_hours: how often to re-fetch models from OpenRouter (0 = startup only)
# Separate base negative ID namespace for dynamic image-generation models.
image_id_offset: -20000
# How often to refresh the OpenRouter catalog. 0 means startup only.
refresh_interval_hours: 24 refresh_interval_hours: 24
# Rate limits for PAID OpenRouter models. These are used by LiteLLM Router # Paid OpenRouter models may join curated router pools when eligible.
# for per-deployment accounting when OR premium models participate in the
# shared sub-agent "auto" pool. They do NOT cap OpenRouter itself — your
# real account limits live at https://openrouter.ai/settings/limits.
rpm: 200 rpm: 200
tpm: 1000000 tpm: 1000000
# Rate limits for FREE OpenRouter models. Informational only: free OR # Free OpenRouter models are available for user-facing selection/pinning but
# models are intentionally kept OUT of the LiteLLM Router pool, because # should be treated as a shared-account bucket, not normal router capacity.
# OpenRouter enforces free-tier limits globally per account (~20 RPM +
# 50-1000 daily requests across every ":free" model combined) —
# per-deployment router accounting can't represent a shared bucket
# correctly. Free OR models stay fully available in the model selector
# and for user-facing Auto thread pinning.
free_rpm: 20 free_rpm: 20
free_tpm: 100000 free_tpm: 100000
# Image generation + vision LLM emission are OPT-IN. OpenRouter's catalogue # Image generation is opt-in to avoid injecting a large image catalog during
# contains hundreds of image- and vision-capable models; turning these on # upgrades. Vision-capable chat models are represented with
# injects them into the global Image-Generation / Vision-LLM model # supports_image_input: true.
# selectors alongside any static configs. Tier (free/premium) is derived
# per model the same way it is for chat (`:free` suffix or zero pricing).
# When a user picks a premium image/vision model the call debits the
# shared $5 USD-cost-based premium credit pool — so leaving these off
# avoids surprise quota burn on existing deployments. Default: false.
image_generation_enabled: false image_generation_enabled: false
vision_enabled: false vision_enabled: false
@ -367,191 +241,80 @@ openrouter_integration:
citations_enabled: true citations_enabled: true
# ============================================================================= # =============================================================================
# Image Generation Configuration # Image Generation Auto Mode Router Settings
# ============================================================================= # =============================================================================
# These configurations power the image generation feature using litellm.aimage_generation().
# Supported providers: OpenAI, Azure, Google AI Studio, Vertex AI, AWS Bedrock,
# Recraft, OpenRouter, Xinference, Nscale
#
# Auto mode (ID 0) uses LiteLLM Router for load balancing across all image gen configs.
# Router Settings for Image Generation Auto Mode
image_generation_router_settings: image_generation_router_settings:
routing_strategy: "usage-based-routing" routing_strategy: "usage-based-routing"
num_retries: 3 num_retries: 3
allowed_fails: 3 allowed_fails: 3
cooldown_time: 60 cooldown_time: 60
# =============================================================================
# Static GLOBAL Image Generation Models
# =============================================================================
global_image_generation_configs: global_image_generation_configs:
# Example: OpenAI DALL-E 3 - id: -2001
- id: -1 name: "Azure GPT Image 1.5"
name: "Global DALL-E 3" billing_tier: "premium"
description: "OpenAI's DALL-E 3 for high-quality image generation" provider: "azure"
provider: "OPENAI" model_name: "gpt-image-1.5"
model_name: "dall-e-3"
api_key: "sk-your-openai-api-key-here"
api_base: ""
rpm: 50 # Requests per minute (image gen is rate-limited by RPM, not tokens)
litellm_params: {}
# Example: OpenAI GPT Image 1
- id: -2
name: "Global GPT Image 1"
description: "OpenAI's GPT Image 1 model"
provider: "OPENAI"
model_name: "gpt-image-1"
api_key: "sk-your-openai-api-key-here"
api_base: ""
rpm: 50
litellm_params: {}
# Example: Azure OpenAI DALL-E 3
- id: -3
name: "Global Azure DALL-E 3"
description: "Azure-hosted DALL-E 3 deployment"
provider: "AZURE_OPENAI"
model_name: "azure/dall-e-3-deployment"
api_key: "your-azure-api-key-here" api_key: "your-azure-api-key-here"
api_base: "https://your-resource.openai.azure.com" api_base: "https://your-resource.openai.azure.com"
api_version: "2024-02-15-preview" # api_version: "2025-04-01-preview"
rpm: 50 rpm: 60
litellm_params: litellm_params:
base_model: "dall-e-3" base_model: "gpt-image-1.5"
# Example: OpenRouter Gemini Image Generation - id: -2002
# - id: -4 name: "Azure GPT Image 1 Mini"
# name: "Global Gemini Image Gen" billing_tier: "free"
# description: "Google Gemini image generation via OpenRouter" provider: "azure"
# provider: "OPENROUTER" model_name: "gpt-image-1-mini"
# model_name: "google/gemini-2.5-flash-image" api_key: "your-azure-api-key-here"
# api_key: "your-openrouter-api-key-here" api_base: "https://your-resource.openai.azure.com"
# api_base: "" # api_version: "2025-04-01-preview"
# rpm: 30 rpm: 120
# litellm_params: {} litellm_params:
base_model: "gpt-image-1-mini"
# ============================================================================= # =============================================================================
# Vision LLM Configuration # Field Notes
# ============================================================================= # =============================================================================
# These configurations power the vision autocomplete feature (screenshot analysis). # Common chat/image fields:
# Only vision-capable models should be used here (e.g. GPT-4o, Gemini Pro, Claude 3). # - provider: Canonical provider adapter name. Example: azure, openai,
# Supported providers: OpenAI, Anthropic, Google, Azure OpenAI, Vertex AI, Bedrock, # anthropic, openrouter, groq, bedrock.
# xAI, OpenRouter, Ollama, Groq, Together AI, Fireworks AI, DeepSeek, Mistral, Custom # - model_name: Provider model or deployment id. For Azure, use the bare
# deployment name. The resolver prefixes LiteLLM model strings from provider.
# - api_base: Provider endpoint/root URL. For OpenAI-compatible providers, the
# resolver adds /v1 when needed.
# - api_version: Optional provider-specific API version, stored on the
# materialized connection extra metadata.
# - litellm_params: Passed to LiteLLM when invoking the model. Also used for
# base_model and inline pricing registration.
# #
# Auto mode (ID 0) uses LiteLLM Router for load balancing across all vision configs. # Chat model fields:
# - supports_image_input: true when the chat model can consume image inputs.
# Router Settings for Vision LLM Auto Mode # - supports_tools: true when the model can use tools/function calling.
vision_llm_router_settings: # - max_input_tokens: Optional UI/catalog metadata for context size.
routing_strategy: "usage-based-routing" # - router_pool_eligible: false keeps a model out of shared router pools while
num_retries: 3 # still allowing direct selection/pinning.
allowed_fails: 3 # - is_planner: true marks the internal-only planner model. Only one config
cooldown_time: 60 # should set this flag.
global_vision_llm_configs:
# Example: OpenAI GPT-4o (recommended for vision)
- id: -1
name: "Global GPT-4o Vision"
description: "OpenAI's GPT-4o with strong vision capabilities"
provider: "OPENAI"
model_name: "gpt-4o"
api_key: "sk-your-openai-api-key-here"
api_base: ""
rpm: 500
tpm: 100000
litellm_params:
temperature: 0.3
max_tokens: 1000
# Example: Google Gemini 2.0 Flash
- id: -2
name: "Global Gemini 2.0 Flash"
description: "Google's fast vision model with large context"
provider: "GOOGLE"
model_name: "gemini-2.0-flash"
api_key: "your-google-ai-api-key-here"
api_base: ""
rpm: 1000
tpm: 200000
litellm_params:
temperature: 0.3
max_tokens: 1000
# Example: Anthropic Claude 3.5 Sonnet
- id: -3
name: "Global Claude 3.5 Sonnet Vision"
description: "Anthropic's Claude 3.5 Sonnet with vision support"
provider: "ANTHROPIC"
model_name: "claude-3-5-sonnet-20241022"
api_key: "sk-ant-your-anthropic-api-key-here"
api_base: ""
rpm: 1000
tpm: 100000
litellm_params:
temperature: 0.3
max_tokens: 1000
# Example: Azure OpenAI GPT-4o
# - id: -4
# name: "Global Azure GPT-4o Vision"
# description: "Azure-hosted GPT-4o for vision analysis"
# provider: "AZURE_OPENAI"
# model_name: "azure/gpt-4o-deployment"
# api_key: "your-azure-api-key-here"
# api_base: "https://your-resource.openai.azure.com"
# api_version: "2024-02-15-preview"
# rpm: 500
# tpm: 100000
# litellm_params:
# temperature: 0.3
# max_tokens: 1000
# base_model: "gpt-4o"
# Notes:
# - ID 0 is reserved for "Auto" mode - uses LiteLLM Router for load balancing
# - Use negative IDs to distinguish global configs from user configs (NewLLMConfig in DB)
# - IDs should be unique and sequential (e.g., -1, -2, -3, etc.)
# - The 'api_key' field will not be exposed to users via API
# - system_instructions: Custom prompt or empty string to use defaults
# - use_default_system_instructions: true = use SURFSENSE_SYSTEM_INSTRUCTIONS when system_instructions is empty
# - citations_enabled: true = include citation instructions, false = include anti-citation instructions
# - All standard LiteLLM providers are supported
# - rpm/tpm: Optional rate limits for load balancing (requests/tokens per minute)
# These help the router distribute load evenly and avoid rate limit errors
# #
# Catalog and access fields:
# - billing_tier: "free" or "premium".
# - anonymous_enabled: Whether the model appears in the public no-login catalog.
# - seo_enabled: Whether a /free/<seo_slug> landing page is generated.
# - seo_slug: Stable URL slug for SEO pages. Keep unique and do not change once
# public.
# - seo_title / seo_description: Optional SEO metadata overrides.
# - quota_reserve_tokens: Tokens reserved before each chat LLM call.
# - rpm / tpm: Optional rate limits for router accounting and load balancing.
# #
# IMAGE GENERATION NOTES: # Image generation notes:
# - Image generation configs use the same ID scheme as LLM configs (negative for global) # - Image-generation configs use the same GLOBAL ID namespace as chat models.
# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure), # - Only RPM is relevant for most image-generation APIs.
# bedrock/* (AWS), vertex_ai/* (Google), recraft/* (Recraft), openrouter/* (OpenRouter) # - The runtime uses litellm.aimage_generation().
# - The router uses litellm.aimage_generation() for async image generation # - Image billing currently uses billing_tier and model catalog metadata. Keep
# - Only RPM (requests per minute) is relevant for image generation rate limiting. # quota reserve tuning in code/catalog unless the materializer copies a YAML
# TPM (tokens per minute) does not apply since image APIs are billed/rate-limited per request, not per token. # key for image quota reservation.
#
# VISION LLM NOTES:
# - Vision configs use the same ID scheme (negative for global, positive for user DB)
# - Only use vision-capable models (GPT-4o, Gemini, Claude 3, etc.)
# - Lower temperature (0.3) is recommended for accurate screenshot analysis
# - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions
#
# PLANNER LLM NOTES:
# - is_planner: true marks a config as the internal-only planner LLM (small,
# fast model used for KB query rewriting, date extraction, recency
# classification, etc.). Only one config may carry this flag — if
# multiple do, the first one wins and a startup WARNING is logged.
# - When no config is marked is_planner, every internal utility call falls
# back to the user's chat LLM (the historical behavior).
# - Planner configs are NOT shown in the user-facing model selector and
# are NOT billed against the user's premium quota. Their billing_tier,
# anonymous_enabled, seo_* fields are ignored.
# - Recommended models: gpt-4o-mini, claude-3-5-haiku, gemini-1.5-flash,
# azure gpt-5.x-nano, groq llama3-8b — anything <200ms p50 on a 1-2k
# prompt. Frontier models here defeat the purpose of the flag.
#
# TOKEN QUOTA & ANONYMOUS ACCESS NOTES:
# - billing_tier: "free" or "premium". Controls whether registered users need premium token quota.
# - anonymous_enabled: true/false. Whether the model appears in the public no-login catalog.
# - seo_enabled: true/false. Whether a /free/<seo_slug> landing page is generated.
# - seo_slug: Stable URL slug for SEO pages. Must be unique. Do NOT change once public.
# - seo_title: Optional HTML title tag override for the model's /free/<slug> page.
# - seo_description: Optional meta description override for the model's /free/<slug> page.
# - quota_reserve_tokens: Tokens reserved before each LLM call for quota enforcement.
# Independent of litellm_params.max_tokens. Used by the token quota service.

View file

@ -90,11 +90,12 @@ async def download_and_extract_content(
if error: if error:
return None, metadata, error return None, metadata, error
from app.etl_pipeline.cache import extract_with_cache
from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
result = await EtlPipelineService(vision_llm=vision_llm).extract( result = await extract_with_cache(
EtlRequest(file_path=temp_file_path, filename=file_name) EtlRequest(file_path=temp_file_path, filename=file_name),
vision_llm=vision_llm,
) )
markdown = result.markdown_content markdown = result.markdown_content
return markdown, metadata, None return markdown, metadata, None

View file

@ -122,12 +122,13 @@ async def download_and_extract_content(
async def _parse_file_to_markdown( async def _parse_file_to_markdown(
file_path: str, filename: str, *, vision_llm=None file_path: str, filename: str, *, vision_llm=None
) -> str: ) -> str:
"""Parse a local file to markdown using the unified ETL pipeline.""" """Parse a local file to markdown via the cache-aware ETL pipeline."""
from app.etl_pipeline.cache import extract_with_cache
from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
result = await EtlPipelineService(vision_llm=vision_llm).extract( result = await extract_with_cache(
EtlRequest(file_path=file_path, filename=filename) EtlRequest(file_path=file_path, filename=filename),
vision_llm=vision_llm,
) )
return result.markdown_content return result.markdown_content

View file

@ -84,11 +84,12 @@ async def download_and_extract_content(
async def _parse_file_to_markdown( async def _parse_file_to_markdown(
file_path: str, filename: str, *, vision_llm=None file_path: str, filename: str, *, vision_llm=None
) -> str: ) -> str:
"""Parse a local file to markdown using the unified ETL pipeline.""" """Parse a local file to markdown via the cache-aware ETL pipeline."""
from app.etl_pipeline.cache import extract_with_cache
from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
result = await EtlPipelineService(vision_llm=vision_llm).extract( result = await extract_with_cache(
EtlRequest(file_path=file_path, filename=filename) EtlRequest(file_path=file_path, filename=filename),
vision_llm=vision_llm,
) )
return result.markdown_content return result.markdown_content

View file

@ -201,79 +201,15 @@ class DocumentStatus:
return None return None
class LiteLLMProvider(StrEnum): class ConnectionScope(StrEnum):
""" GLOBAL = "GLOBAL"
Enum for LLM providers supported by LiteLLM. SEARCH_SPACE = "SEARCH_SPACE"
""" USER = "USER"
OPENAI = "OPENAI"
ANTHROPIC = "ANTHROPIC"
GOOGLE = "GOOGLE"
AZURE_OPENAI = "AZURE_OPENAI"
BEDROCK = "BEDROCK"
VERTEX_AI = "VERTEX_AI"
GROQ = "GROQ"
COHERE = "COHERE"
MISTRAL = "MISTRAL"
DEEPSEEK = "DEEPSEEK"
XAI = "XAI"
OPENROUTER = "OPENROUTER"
TOGETHER_AI = "TOGETHER_AI"
FIREWORKS_AI = "FIREWORKS_AI"
REPLICATE = "REPLICATE"
PERPLEXITY = "PERPLEXITY"
OLLAMA = "OLLAMA"
ALIBABA_QWEN = "ALIBABA_QWEN"
MOONSHOT = "MOONSHOT"
ZHIPU = "ZHIPU"
ANYSCALE = "ANYSCALE"
DEEPINFRA = "DEEPINFRA"
CEREBRAS = "CEREBRAS"
SAMBANOVA = "SAMBANOVA"
AI21 = "AI21"
CLOUDFLARE = "CLOUDFLARE"
DATABRICKS = "DATABRICKS"
COMETAPI = "COMETAPI"
HUGGINGFACE = "HUGGINGFACE"
GITHUB_MODELS = "GITHUB_MODELS"
MINIMAX = "MINIMAX"
CUSTOM = "CUSTOM"
class ImageGenProvider(StrEnum): class ModelSource(StrEnum):
""" DISCOVERED = "DISCOVERED"
Enum for image generation providers supported by LiteLLM. MANUAL = "MANUAL"
This is a subset of LLM providers only those that support image generation.
See: https://docs.litellm.ai/docs/image_generation#supported-providers
"""
OPENAI = "OPENAI"
AZURE_OPENAI = "AZURE_OPENAI"
GOOGLE = "GOOGLE" # Google AI Studio
VERTEX_AI = "VERTEX_AI"
BEDROCK = "BEDROCK" # AWS Bedrock
RECRAFT = "RECRAFT"
OPENROUTER = "OPENROUTER"
XINFERENCE = "XINFERENCE"
NSCALE = "NSCALE"
class VisionProvider(StrEnum):
OPENAI = "OPENAI"
ANTHROPIC = "ANTHROPIC"
GOOGLE = "GOOGLE"
AZURE_OPENAI = "AZURE_OPENAI"
VERTEX_AI = "VERTEX_AI"
BEDROCK = "BEDROCK"
XAI = "XAI"
OPENROUTER = "OPENROUTER"
OLLAMA = "OLLAMA"
GROQ = "GROQ"
TOGETHER_AI = "TOGETHER_AI"
FIREWORKS_AI = "FIREWORKS_AI"
DEEPSEEK = "DEEPSEEK"
MISTRAL = "MISTRAL"
CUSTOM = "CUSTOM"
class LogLevel(StrEnum): class LogLevel(StrEnum):
@ -702,11 +638,11 @@ class NewChatThread(BaseModel, TimestampMixin):
default=False, default=False,
server_default="false", server_default="false",
) )
# Auto (Fastest) model pin for this thread: concrete resolved global LLM # Auto model pin for this thread: concrete resolved global LLM
# config id. NULL means no pin; Auto will resolve on the next turn. # config id. NULL means no pin; Auto will resolve on the next turn.
# Single-writer invariant: only app.services.auto_model_pin_service sets # Single-writer invariant: only app.services.auto_model_pin_service sets
# or clears this column (plus bulk clears when a search space's # or clears this column (plus bulk clears when a search space's
# agent_llm_id changes). Unindexed: all reads are by primary key. # chat_model_id changes). Unindexed: all reads are by primary key.
pinned_llm_config_id = Column(Integer, nullable=True) pinned_llm_config_id = Column(Integer, nullable=True)
# Surface metadata for first-party SurfSense and external chat threads. # Surface metadata for first-party SurfSense and external chat threads.
@ -1487,7 +1423,10 @@ class Document(BaseModel, TimestampMixin):
created_by = relationship("User", back_populates="documents") created_by = relationship("User", back_populates="documents")
connector = relationship("SearchSourceConnector", back_populates="documents") connector = relationship("SearchSourceConnector", back_populates="documents")
chunks = relationship( chunks = relationship(
"Chunk", back_populates="document", cascade="all, delete-orphan" "Chunk",
back_populates="document",
cascade="all, delete-orphan",
order_by="Chunk.position",
) )
# Original upload + future derived artifacts (redacted, filled-form). # Original upload + future derived artifacts (redacted, filled-form).
# Model lives in app.file_storage.persistence to keep that feature cohesive. # Model lives in app.file_storage.persistence to keep that feature cohesive.
@ -1523,6 +1462,9 @@ class Chunk(BaseModel, TimestampMixin):
content = Column(Text, nullable=False) content = Column(Text, nullable=False)
embedding = Column(Vector(config.embedding_model_instance.dimension)) embedding = Column(Vector(config.embedding_model_instance.dimension))
# Explicit document order; ids don't follow it since incremental
# re-indexing keeps unchanged rows across edits.
position = Column(Integer, nullable=False, server_default="0", index=True)
document_id = Column( document_id = Column(
Integer, Integer,
@ -1604,73 +1546,80 @@ class Report(BaseModel, TimestampMixin):
thread = relationship("NewChatThread") thread = relationship("NewChatThread")
class ImageGenerationConfig(BaseModel, TimestampMixin): class Connection(BaseModel, TimestampMixin):
""" __tablename__ = "connections"
Dedicated configuration table for image generation models.
Separate from NewLLMConfig because image generation models don't need provider = Column(String(100), nullable=False, index=True)
system_instructions, citations_enabled, or use_default_system_instructions. base_url = Column(String(500), nullable=True)
They only need provider credentials and model parameters. api_key = Column(String, nullable=True)
""" extra = Column(JSONB, nullable=False, default=dict, server_default="{}")
scope = Column(SQLAlchemyEnum(ConnectionScope), nullable=False, index=True)
__tablename__ = "image_generation_configs" enabled = Column(Boolean, nullable=False, default=True, server_default="true")
name = Column(String(100), nullable=False, index=True)
description = Column(String(500), nullable=True)
# Provider & model (uses ImageGenProvider, NOT LiteLLMProvider)
provider = Column(SQLAlchemyEnum(ImageGenProvider), nullable=False)
custom_provider = Column(String(100), nullable=True)
model_name = Column(String(100), nullable=False)
# Credentials
api_key = Column(String, nullable=False)
api_base = Column(String(500), nullable=True)
api_version = Column(String(50), nullable=True) # Azure-specific
# Additional litellm parameters
litellm_params = Column(JSON, nullable=True, default={})
# Relationships
search_space_id = Column(
Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
)
search_space = relationship(
"SearchSpace", back_populates="image_generation_configs"
)
# User who created this config
user_id = Column(
UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False
)
user = relationship("User", back_populates="image_generation_configs")
class VisionLLMConfig(BaseModel, TimestampMixin):
__tablename__ = "vision_llm_configs"
name = Column(String(100), nullable=False, index=True)
description = Column(String(500), nullable=True)
provider = Column(SQLAlchemyEnum(VisionProvider), nullable=False)
custom_provider = Column(String(100), nullable=True)
model_name = Column(String(100), nullable=False)
api_key = Column(String, nullable=False)
api_base = Column(String(500), nullable=True)
api_version = Column(String(50), nullable=True)
litellm_params = Column(JSON, nullable=True, default={})
search_space_id = Column( search_space_id = Column(
Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=True
) )
search_space = relationship("SearchSpace", back_populates="vision_llm_configs")
user_id = Column( user_id = Column(
UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=True
)
search_space = relationship("SearchSpace", back_populates="connections")
user = relationship("User", back_populates="connections")
models = relationship(
"Model",
back_populates="connection",
order_by="Model.id",
cascade="all, delete-orphan",
passive_deletes=True,
)
__table_args__ = (
CheckConstraint(
"(scope = 'GLOBAL' AND search_space_id IS NULL AND user_id IS NULL) OR "
"(scope = 'SEARCH_SPACE' AND search_space_id IS NOT NULL AND user_id IS NOT NULL) OR "
"(scope = 'USER' AND user_id IS NOT NULL)",
name="ck_connections_scope_owner",
),
)
class Model(BaseModel, TimestampMixin):
__tablename__ = "models"
connection_id = Column(
Integer,
ForeignKey("connections.id", ondelete="CASCADE"),
nullable=False,
index=True,
)
model_id = Column(String(255), nullable=False)
display_name = Column(String(255), nullable=True)
source = Column(
SQLAlchemyEnum(ModelSource),
nullable=False,
default=ModelSource.DISCOVERED,
server_default=ModelSource.DISCOVERED.value,
)
supports_chat = Column(Boolean, nullable=True)
max_input_tokens = Column(Integer, nullable=True)
supports_image_input = Column(Boolean, nullable=True)
supports_tools = Column(Boolean, nullable=True)
supports_image_generation = Column(Boolean, nullable=True)
capabilities_override = Column(
JSONB, nullable=False, default=dict, server_default="{}"
)
enabled = Column(Boolean, nullable=False, default=True, server_default="true")
billing_tier = Column(String(50), nullable=True, index=True)
catalog = Column(JSONB, nullable=False, default=dict, server_default="{}")
connection = relationship("Connection", back_populates="models")
__table_args__ = (
UniqueConstraint(
"connection_id", "model_id", name="uq_models_connection_model_id"
),
Index("ix_models_model_id", "model_id"),
) )
user = relationship("User", back_populates="vision_llm_configs")
class ImageGeneration(BaseModel, TimestampMixin): class ImageGeneration(BaseModel, TimestampMixin):
@ -1704,10 +1653,9 @@ class ImageGeneration(BaseModel, TimestampMixin):
style = Column(String(50), nullable=True) # Model-specific style parameter style = Column(String(50), nullable=True) # Model-specific style parameter
response_format = Column(String(50), nullable=True) # "url" or "b64_json" response_format = Column(String(50), nullable=True) # "url" or "b64_json"
# Image generation config reference # Image generation model provenance.
# 0 = Auto mode (router), negative IDs = global configs from YAML, # 0 = Auto mode, negative IDs = GLOBAL models, positive IDs = Model records.
# positive IDs = ImageGenerationConfig records in DB image_gen_model_id = Column(Integer, nullable=True)
image_generation_config_id = Column(Integer, nullable=True)
# Response data (full litellm response as JSONB) — present on success # Response data (full litellm response as JSONB) — present on success
response_data = Column(JSONB, nullable=True) response_data = Column(JSONB, nullable=True)
@ -1749,19 +1697,19 @@ class SearchSpace(BaseModel, TimestampMixin):
shared_memory_md = Column(Text, nullable=True, server_default="") shared_memory_md = Column(Text, nullable=True, server_default="")
# Search space-level LLM preferences (shared by all members) # Connection/model role bindings.
# Note: ID values: # Note: ID values preserve the existing convention:
# - 0: Auto mode (uses LiteLLM Router for load balancing) - default for new search spaces # - 0: Auto mode
# - Negative IDs: Global configs from YAML # - Negative IDs: Global virtual models from global_llm_config.yaml
# - Positive IDs: Custom configs from DB (NewLLMConfig table) # - Positive IDs: User/search-space models from the models table
agent_llm_id = Column( chat_model_id = Column(
Integer, nullable=True, default=0 Integer, nullable=True, default=0, server_default="0"
) # For agent/chat operations, defaults to Auto mode ) # For agent/chat operations, defaults to Auto mode
image_generation_config_id = Column( image_gen_model_id = Column(
Integer, nullable=True, default=0 Integer, nullable=True, default=0, server_default="0"
) # For image generation, defaults to Auto mode ) # For image generation, defaults to Auto mode when eligible
vision_llm_config_id = Column( vision_model_id = Column(
Integer, nullable=True, default=0 Integer, nullable=True, default=0, server_default="0"
) # For vision/screenshot analysis, defaults to Auto mode ) # For vision/screenshot analysis, defaults to Auto mode
ai_file_sort_enabled = Column( ai_file_sort_enabled = Column(
@ -1833,23 +1781,12 @@ class SearchSpace(BaseModel, TimestampMixin):
order_by="SearchSourceConnector.id", order_by="SearchSourceConnector.id",
cascade="all, delete-orphan", cascade="all, delete-orphan",
) )
new_llm_configs = relationship( connections = relationship(
"NewLLMConfig", "Connection",
back_populates="search_space", back_populates="search_space",
order_by="NewLLMConfig.id", order_by="Connection.id",
cascade="all, delete-orphan",
)
image_generation_configs = relationship(
"ImageGenerationConfig",
back_populates="search_space",
order_by="ImageGenerationConfig.id",
cascade="all, delete-orphan",
)
vision_llm_configs = relationship(
"VisionLLMConfig",
back_populates="search_space",
order_by="VisionLLMConfig.id",
cascade="all, delete-orphan", cascade="all, delete-orphan",
passive_deletes=True,
) )
automations = relationship( automations = relationship(
@ -1952,64 +1889,6 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
documents = relationship("Document", back_populates="connector") documents = relationship("Document", back_populates="connector")
class NewLLMConfig(BaseModel, TimestampMixin):
"""
New LLM configuration table that combines model settings with prompt configuration.
This table provides:
- LLM model configuration (provider, model_name, api_key, etc.)
- Configurable system instructions (defaults to SURFSENSE_SYSTEM_INSTRUCTIONS)
- Citation toggle (enable/disable citation instructions)
Note: Tools instructions are built by get_tools_instructions(thread_visibility) (personal vs shared memory).
"""
__tablename__ = "new_llm_configs"
name = Column(String(100), nullable=False, index=True)
description = Column(String(500), nullable=True)
# === LLM Model Configuration (from original LLMConfig, excluding 'language') ===
# Provider from the enum
provider = Column(SQLAlchemyEnum(LiteLLMProvider), nullable=False)
# Custom provider name when provider is CUSTOM
custom_provider = Column(String(100), nullable=True)
# Just the model name without provider prefix
model_name = Column(String(100), nullable=False)
# API Key should be encrypted before storing
api_key = Column(String, nullable=False)
api_base = Column(String(500), nullable=True)
# For any other parameters that litellm supports
litellm_params = Column(JSON, nullable=True, default={})
# === Prompt Configuration ===
# Configurable system instructions (defaults to SURFSENSE_SYSTEM_INSTRUCTIONS)
# Users can customize this from the UI
system_instructions = Column(
Text,
nullable=False,
default="", # Empty string means use default SURFSENSE_SYSTEM_INSTRUCTIONS
)
# Whether to use the default system instructions when system_instructions is empty
use_default_system_instructions = Column(Boolean, nullable=False, default=True)
# Citation toggle - when enabled, SURFSENSE_CITATION_INSTRUCTIONS is injected
# When disabled, an anti-citation prompt is injected instead
citations_enabled = Column(Boolean, nullable=False, default=True)
# === Relationships ===
search_space_id = Column(
Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
)
search_space = relationship("SearchSpace", back_populates="new_llm_configs")
# User who created this config
user_id = Column(
UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False
)
user = relationship("User", back_populates="new_llm_configs")
class Log(BaseModel, TimestampMixin): class Log(BaseModel, TimestampMixin):
__tablename__ = "logs" __tablename__ = "logs"
@ -2376,22 +2255,8 @@ if config.AUTH_TYPE == "GOOGLE":
passive_deletes=True, passive_deletes=True,
) )
# LLM configs created by this user connections = relationship(
new_llm_configs = relationship( "Connection",
"NewLLMConfig",
back_populates="user",
passive_deletes=True,
)
# Image generation configs created by this user
image_generation_configs = relationship(
"ImageGenerationConfig",
back_populates="user",
passive_deletes=True,
)
vision_llm_configs = relationship(
"VisionLLMConfig",
back_populates="user", back_populates="user",
passive_deletes=True, passive_deletes=True,
) )
@ -2522,22 +2387,8 @@ else:
passive_deletes=True, passive_deletes=True,
) )
# LLM configs created by this user connections = relationship(
new_llm_configs = relationship( "Connection",
"NewLLMConfig",
back_populates="user",
passive_deletes=True,
)
# Image generation configs created by this user
image_generation_configs = relationship(
"ImageGenerationConfig",
back_populates="user",
passive_deletes=True,
)
vision_llm_configs = relationship(
"VisionLLMConfig",
back_populates="user", back_populates="user",
passive_deletes=True, passive_deletes=True,
) )
@ -2867,7 +2718,11 @@ from app.automations.persistence import ( # noqa: E402, F401
AutomationRun, AutomationRun,
AutomationTrigger, AutomationTrigger,
) )
from app.etl_pipeline.cache.persistence.models import CachedParse # noqa: E402, F401
from app.file_storage.persistence import DocumentFile # noqa: E402, F401 from app.file_storage.persistence import DocumentFile # noqa: E402, F401
from app.indexing_pipeline.cache.persistence.models import ( # noqa: E402, F401
CachedEmbeddingSet,
)
from app.notifications.persistence import Notification # noqa: E402, F401 from app.notifications.persistence import Notification # noqa: E402, F401
from app.podcasts.persistence import ( # noqa: E402, F401 from app.podcasts.persistence import ( # noqa: E402, F401
Podcast, Podcast,

View file

@ -0,0 +1,11 @@
"""Content-addressed reuse of expensive ETL parser output across workspaces."""
from __future__ import annotations
from app.etl_pipeline.cache.cached_extraction import extract_with_cache
from app.etl_pipeline.cache.service import EtlCacheService
# Names re-exported from the submodules imported above; these form the
# package's public surface for `from app.etl_pipeline.cache import ...`.
__all__ = [
    "EtlCacheService",
    "extract_with_cache",
]

View file

@ -0,0 +1,86 @@
"""Entry point: serve ETL parses from cache, parsing only on a miss."""
from __future__ import annotations
import asyncio
import hashlib
import logging
from app.config import config
from app.etl_pipeline.cache.eligibility import is_parse_cacheable
from app.etl_pipeline.cache.schemas import ParseKey
from app.etl_pipeline.cache.service import EtlCacheService
from app.etl_pipeline.cache.settings import load_etl_cache_settings
from app.etl_pipeline.etl_document import EtlRequest, EtlResult
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
from app.observability import metrics
logger = logging.getLogger(__name__)
_HASH_CHUNK = 1024 * 1024
async def extract_with_cache(request: EtlRequest, *, vision_llm=None) -> EtlResult:
    """Extract ``request`` like ``EtlPipelineService.extract``, consulting the parse cache.

    Uploads rejected by ``is_parse_cacheable`` go straight to the parser. For
    the rest, the source file is hashed off the event loop, a cached result is
    returned when one exists, and a fresh parse is stored for later reuse
    otherwise. Each cache lookup records a ``hit`` or ``miss`` metric.
    """

    async def parse_fresh() -> EtlResult:
        # The parser service is only built when a parse is actually needed.
        return await EtlPipelineService(vision_llm=vision_llm).extract(request)

    cache_settings = load_etl_cache_settings()
    if not is_parse_cacheable(
        filename=request.filename,
        etl_service=config.ETL_SERVICE,
        cache_enabled=cache_settings.enabled,
        has_vision_llm=vision_llm is not None,
    ):
        return await parse_fresh()

    # Hashing reads the whole file, so it runs in a worker thread.
    source_sha256 = await asyncio.to_thread(_hash_file, request.file_path)
    key = ParseKey.for_document(
        source_sha256,
        etl_service=config.ETL_SERVICE,
        mode=request.processing_mode.value,
        version=cache_settings.parser_version,
    )

    hit = await _recall(key)
    metrics.record_etl_cache_lookup(
        etl_service=key.etl_service,
        mode=key.mode,
        outcome="miss" if hit is None else "hit",
    )
    if hit is not None:
        logger.debug("ETL cache hit for %s", key.source_sha256)
        return hit

    fresh = await parse_fresh()
    await _remember(key, fresh)
    return fresh
async def _recall(key: ParseKey) -> EtlResult | None:
    """Look ``key`` up in the parse cache; return ``None`` on a miss or on any failure."""
    # Best-effort: the lazy import sits inside the guard too, so even an import
    # or session failure degrades to a normal parse instead of an error.
    try:
        from app.tasks.celery_tasks import get_celery_session_maker

        session_maker = get_celery_session_maker()
        async with session_maker() as session:
            cache = EtlCacheService(session)
            return await cache.recall(key)
    except Exception:
        logger.warning("ETL cache recall failed; parsing fresh", exc_info=True)
    return None
async def _remember(key: ParseKey, result: EtlResult) -> None:
    """Store ``result`` under ``key``; a failed write is logged and otherwise ignored."""
    try:
        from app.tasks.celery_tasks import get_celery_session_maker

        session_maker = get_celery_session_maker()
        async with session_maker() as session:
            cache = EtlCacheService(session)
            await cache.remember(key, result)
    except Exception:
        logger.warning("ETL cache write failed; result not cached", exc_info=True)
def _hash_file(path: str) -> str:
digest = hashlib.sha256()
with open(path, "rb") as handle:
for chunk in iter(lambda: handle.read(_HASH_CHUNK), b""):
digest.update(chunk)
return digest.hexdigest()

View file

@ -0,0 +1,28 @@
"""Gating rule: may this upload be served from / written to the parse cache?"""
from __future__ import annotations
from app.etl_pipeline.file_classifier import FileCategory, classify_file
def is_parse_cacheable(
    *,
    filename: str,
    etl_service: str | None,
    cache_enabled: bool,
    has_vision_llm: bool,
) -> bool:
    """Decide whether an upload may be read from / written to the parse cache.

    The cache is bypassed when it is disabled, when a vision LLM is attached
    (its model-generated additions are not captured by the cache key), or when
    no ETL service is configured (there is no parser to key against). Past
    those guards, only files classified as ``FileCategory.DOCUMENT`` qualify.
    """
    bypass = (not cache_enabled) or has_vision_llm or (not etl_service)
    if bypass:
        return False
    # The classifier is only consulted once every cheap guard has passed.
    category = classify_file(filename)
    return category == FileCategory.DOCUMENT

View file

@ -0,0 +1,9 @@
"""Background pruning of the parse cache by age and size budget."""
from __future__ import annotations
from .task import evict_etl_cache_task
__all__ = [
"evict_etl_cache_task",
]

View file

@ -0,0 +1,28 @@
"""Pure selection rules for which cached entries to drop."""
from __future__ import annotations
from collections.abc import Iterable
from app.etl_pipeline.cache.schemas import EvictionCandidate
def select_over_budget(
    coldest_first: Iterable[EvictionCandidate],
    *,
    current_total_bytes: int,
    max_total_bytes: int,
) -> list[EvictionCandidate]:
    """Take candidates in the given order until enough bytes would be freed.

    Returns an empty list when the footprint is already within budget. The
    candidate that crosses the threshold is included, so the selection frees at
    least the overshoot whenever enough candidates are supplied.
    """
    remaining = current_total_bytes - max_total_bytes
    if remaining <= 0:
        return []

    victims: list[EvictionCandidate] = []
    for entry in coldest_first:
        if remaining <= 0:
            break
        victims.append(entry)
        remaining -= entry.size_bytes
    return victims

View file

@ -0,0 +1,68 @@
"""Celery task that prunes the parse cache by TTL, then by size budget."""
from __future__ import annotations
import contextlib
import logging
from datetime import UTC, datetime, timedelta
from app.celery_app import celery_app
from app.etl_pipeline.cache.eviction.policy import select_over_budget
from app.etl_pipeline.cache.persistence import CachedParseRepository
from app.etl_pipeline.cache.schemas import EvictionCandidate
from app.etl_pipeline.cache.settings import load_etl_cache_settings
from app.etl_pipeline.cache.storage import MarkdownCacheStore
from app.observability import metrics
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
logger = logging.getLogger(__name__)
@celery_app.task(name="evict_etl_cache")
def evict_etl_cache_task():
    """Celery task ``evict_etl_cache``: run the async ``_evict`` routine via ``run_async_celery_task``."""
    return run_async_celery_task(_evict)
async def _evict() -> None:
    """Prune the parse cache: first by age, then by size if still over budget.

    Does nothing when the cache is disabled. Each phase handles at most
    ``eviction_batch`` rows per run.
    """
    cache_settings = load_etl_cache_settings()
    if not cache_settings.enabled:
        return

    blob_store = MarkdownCacheStore()
    session_maker = get_celery_session_maker()
    async with session_maker() as session:
        repo = CachedParseRepository(session)
        batch = cache_settings.eviction_batch

        # Phase 1: entries not used within the TTL window.
        oldest_allowed = datetime.now(UTC) - timedelta(days=cache_settings.ttl_days)
        stale = await repo.select_expired(cutoff=oldest_allowed, limit=batch)
        await _drop(repo, blob_store, stale, phase="ttl")

        # Phase 2: only if the remaining footprint still exceeds the budget.
        budget = cache_settings.max_total_bytes
        footprint = await repo.total_size_bytes()
        if footprint <= budget:
            return
        coldest = await repo.select_coldest(limit=batch)
        overflow = select_over_budget(
            coldest,
            current_total_bytes=footprint,
            max_total_bytes=budget,
        )
        await _drop(repo, blob_store, overflow, phase="size")
async def _drop(
    index: CachedParseRepository,
    store: MarkdownCacheStore,
    candidates: list[EvictionCandidate],
    *,
    phase: str,
) -> None:
    """Delete the blobs and index rows for ``candidates`` and record the eviction.

    ``phase`` labels the metric and the log line. An empty list is a no-op.
    """
    if not candidates:
        return

    row_ids = []
    for entry in candidates:
        # A failed blob delete must not keep the index row alive; the leftover
        # blob is merely an orphan.
        with contextlib.suppress(Exception):
            await store.delete(entry.storage_key)
        row_ids.append(entry.id)

    await index.delete_by_ids(row_ids)
    evicted = len(row_ids)
    metrics.record_etl_cache_eviction(evicted, phase=phase)
    logger.info("Evicted %d cached parses (%s)", evicted, phase)

View file

@ -0,0 +1,11 @@
"""Database access for cached parse rows."""
from __future__ import annotations
from .models import CachedParse
from .repository import CachedParseRepository
__all__ = [
"CachedParse",
"CachedParseRepository",
]

View file

@ -0,0 +1,49 @@
"""``etl_cache_parses``: one reusable parser result per (bytes + recipe)."""
from __future__ import annotations
from sqlalchemy import (
BigInteger,
Column,
DateTime,
Index,
Integer,
String,
UniqueConstraint,
)
from app.db import BaseModel, TimestampMixin
class CachedParse(BaseModel, TimestampMixin):
    """Index row for one cached parser result.

    A row is unique per (source_sha256, etl_service, mode, parser_version) and
    points at the markdown blob through ``storage_backend`` / ``storage_key``.
    ``last_used_at`` is indexed because eviction filters and sorts on it.
    """

    __tablename__ = "etl_cache_parses"
    # Key: raw bytes + the recipe that produced the markdown.
    source_sha256 = Column(String(64), nullable=False)
    etl_service = Column(String(32), nullable=False)
    mode = Column(String(16), nullable=False)
    parser_version = Column(Integer, nullable=False)
    # Where the markdown blob lives (kept out of the row to stay small).
    storage_backend = Column(String(32), nullable=False)
    storage_key = Column(String, nullable=False)
    size_bytes = Column(BigInteger, nullable=False)
    # Payload needed to rebuild the EtlResult on a hit.
    content_type = Column(String(32), nullable=False)
    actual_pages = Column(Integer, nullable=False, default=0, server_default="0")
    # Drives eviction (popularity + recency).
    times_reused = Column(BigInteger, nullable=False, default=0, server_default="0")
    last_used_at = Column(DateTime(timezone=True), nullable=False)
    __table_args__ = (
        # The constraint name is referenced by the repository's ON CONFLICT clause.
        UniqueConstraint(
            "source_sha256",
            "etl_service",
            "mode",
            "parser_version",
            name="uq_etl_cache_parses_key",
        ),
        Index("ix_etl_cache_parses_last_used_at", "last_used_at"),
    )

View file

@ -0,0 +1,121 @@
"""CRUD and eviction selectors for ``etl_cache_parses`` (no business rules)."""
from __future__ import annotations
from datetime import UTC, datetime
from sqlalchemy import delete, func, select, update
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.ext.asyncio import AsyncSession
from app.etl_pipeline.cache.schemas import EvictionCandidate, ParseKey
from .models import CachedParse
# Columns projected by the eviction selectors; they match the fields that
# ``_as_eviction_candidate`` copies into an ``EvictionCandidate``.
_EVICTION_COLUMNS = (
    CachedParse.id,
    CachedParse.storage_key,
    CachedParse.size_bytes,
    CachedParse.last_used_at,
    CachedParse.times_reused,
)
def _as_eviction_candidate(row) -> EvictionCandidate:
    """Copy the eviction columns of a result ``row`` into an ``EvictionCandidate``."""
    wanted = ("id", "storage_key", "size_bytes", "last_used_at", "times_reused")
    values = {name: getattr(row, name) for name in wanted}
    return EvictionCandidate(**values)
class CachedParseRepository:
    """Data access for ``etl_cache_parses`` bound to a single ``AsyncSession``.

    Every mutating method (``insert``, ``mark_used``, ``delete_by_ids``) commits
    the session itself.
    """

    def __init__(self, session: AsyncSession) -> None:
        self._session = session

    async def get(self, key: ParseKey) -> CachedParse | None:
        """Return the row matching all four key fields, or ``None`` when absent."""
        stmt = select(CachedParse).where(
            CachedParse.source_sha256 == key.source_sha256,
            CachedParse.etl_service == key.etl_service,
            CachedParse.mode == key.mode,
            CachedParse.parser_version == key.version,
        )
        rows = await self._session.execute(stmt)
        return rows.scalars().first()

    async def insert(
        self,
        *,
        key: ParseKey,
        content_type: str,
        actual_pages: int,
        storage_backend: str,
        storage_key: str,
        size_bytes: int,
    ) -> None:
        """Insert a fresh row for ``key``; an existing row with the same key is left untouched."""
        # Concurrent writers parse identical bytes, so a lost race is harmless.
        stamp = datetime.now(UTC)
        values = {
            "source_sha256": key.source_sha256,
            "etl_service": key.etl_service,
            "mode": key.mode,
            "parser_version": key.version,
            "content_type": content_type,
            "actual_pages": actual_pages,
            "storage_backend": storage_backend,
            "storage_key": storage_key,
            "size_bytes": size_bytes,
            "times_reused": 0,
            "last_used_at": stamp,
            "created_at": stamp,
        }
        stmt = (
            pg_insert(CachedParse)
            .values(**values)
            .on_conflict_do_nothing(constraint="uq_etl_cache_parses_key")
        )
        await self._session.execute(stmt)
        await self._session.commit()

    async def mark_used(self, row_id: int) -> None:
        """Increment ``times_reused`` and refresh ``last_used_at`` for ``row_id``."""
        stmt = (
            update(CachedParse)
            .where(CachedParse.id == row_id)
            .values(
                times_reused=CachedParse.times_reused + 1,
                last_used_at=datetime.now(UTC),
            )
        )
        await self._session.execute(stmt)
        await self._session.commit()

    async def total_size_bytes(self) -> int:
        """Return the sum of ``size_bytes`` over all rows (0 for an empty table)."""
        stmt = select(func.coalesce(func.sum(CachedParse.size_bytes), 0))
        outcome = await self._session.execute(stmt)
        return int(outcome.scalar() or 0)

    async def select_expired(
        self, *, cutoff: datetime, limit: int
    ) -> list[EvictionCandidate]:
        """Return up to ``limit`` rows last used before ``cutoff``, oldest first."""
        stmt = (
            select(*_EVICTION_COLUMNS)
            .where(CachedParse.last_used_at < cutoff)
            .order_by(CachedParse.last_used_at.asc())
            .limit(limit)
        )
        rows = await self._session.execute(stmt)
        return list(map(_as_eviction_candidate, rows))

    async def select_coldest(self, *, limit: int) -> list[EvictionCandidate]:
        """Return up to ``limit`` rows ordered by fewest reuses, then least recently used."""
        stmt = (
            select(*_EVICTION_COLUMNS)
            .order_by(CachedParse.times_reused.asc(), CachedParse.last_used_at.asc())
            .limit(limit)
        )
        rows = await self._session.execute(stmt)
        return list(map(_as_eviction_candidate, rows))

    async def delete_by_ids(self, ids: list[int]) -> None:
        """Delete the rows whose primary keys are in ``ids``; an empty list is a no-op."""
        if not ids:
            return
        stmt = delete(CachedParse).where(CachedParse.id.in_(ids))
        await self._session.execute(stmt)
        await self._session.commit()

View file

@ -0,0 +1,11 @@
"""Pure value objects for the parse cache."""
from __future__ import annotations
from .eviction_candidate import EvictionCandidate
from .parse_key import ParseKey
__all__ = [
"EvictionCandidate",
"ParseKey",
]

View file

@ -0,0 +1,15 @@
"""Row projection handed to the eviction policy."""
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
@dataclass(frozen=True, slots=True)
class EvictionCandidate:
    """Immutable projection of a cache row: just the fields eviction needs."""

    # Primary key of the index row; used to delete the row.
    id: int
    # Blob location; used to delete the stored payload.
    storage_key: str
    # Payload size; summed by the over-budget policy.
    size_bytes: int
    # Recency and popularity signals the selectors order by.
    last_used_at: datetime
    times_reused: int

View file

@ -0,0 +1,28 @@
"""Identity of a cacheable parse: equal keys yield identical markdown."""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
class ParseKey:
source_sha256: str
etl_service: str
mode: str
version: int
@classmethod
def for_document(
cls, source_sha256: str, *, etl_service: str, mode: str, version: int
) -> ParseKey:
return cls(
source_sha256=source_sha256,
etl_service=etl_service,
mode=mode,
version=version,
)
@property
def object_suffix(self) -> str:
return f"{self.etl_service}.{self.mode}.v{self.version}.md"

View file

@ -0,0 +1,53 @@
"""Recall and remember parser output, coordinating the index and blob store."""
from __future__ import annotations
import logging
from sqlalchemy.ext.asyncio import AsyncSession
from app.etl_pipeline.cache.persistence import CachedParseRepository
from app.etl_pipeline.cache.schemas import ParseKey
from app.etl_pipeline.cache.storage import MarkdownCacheStore
from app.etl_pipeline.etl_document import EtlResult
logger = logging.getLogger(__name__)
class EtlCacheService:
    """Coordinates the parse-cache index (database rows) with the markdown blob store."""

    def __init__(self, session: AsyncSession) -> None:
        self._index = CachedParseRepository(session)
        self._store = MarkdownCacheStore()

    async def recall(self, key: ParseKey) -> EtlResult | None:
        """Return the cached result for ``key``, or ``None`` on a miss.

        A row whose blob cannot be loaded is also reported as a miss so the
        caller re-parses. A successful hit bumps the row's usage counters.
        """
        row = await self._index.get(key)
        if row is None:
            return None
        try:
            markdown = await self._store.load(row.storage_key)
        except Exception:
            # Index points at a blob that is gone; treat as a miss and re-parse.
            logger.warning("Cache blob missing: %s", row.storage_key, exc_info=True)
            return None

        # Read everything needed from the ORM row BEFORE mark_used(): that call
        # commits the session, and on a session created with
        # expire_on_commit=True the row would be expired afterwards, so any
        # attribute access would attempt implicit I/O, which AsyncSession
        # does not permit.
        row_id = row.id
        result = EtlResult(
            markdown_content=markdown,
            etl_service=row.etl_service,
            actual_pages=row.actual_pages,
            content_type=row.content_type,
        )
        await self._index.mark_used(row_id)
        return result

    async def remember(self, key: ParseKey, result: EtlResult) -> None:
        """Store a freshly parsed result for future reuse.

        The markdown blob is written first, then the index row that points at
        it; ``size_bytes`` is the UTF-8 encoded length of the markdown.
        """
        storage_key = await self._store.save(key, result.markdown_content)
        await self._index.insert(
            key=key,
            content_type=result.content_type,
            actual_pages=result.actual_pages,
            storage_backend=self._store.backend_name,
            storage_key=storage_key,
            size_bytes=len(result.markdown_content.encode("utf-8")),
        )

View file

@ -0,0 +1,33 @@
"""Cache configuration resolved from the central ``Config``."""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True)
class EtlCacheSettings:
    """Immutable snapshot of the parse-cache configuration."""

    # Master switch; when False the cache is bypassed and eviction is a no-op.
    enabled: bool
    # Part of the cache key, so changing it stops older entries from matching.
    parser_version: int
    # Entries whose last use is older than this many days are expired.
    ttl_days: int
    # Total size budget for cached blobs, in bytes.
    max_total_bytes: int
    # Maximum number of rows each eviction query may select.
    eviction_batch: int
    # None for any storage_* field means: reuse the main file_storage backend.
    storage_backend: str | None
    storage_container: str | None
    storage_local_root: str | None
def load_etl_cache_settings() -> EtlCacheSettings:
    """Build an ``EtlCacheSettings`` snapshot from the ``ETL_CACHE_*`` values on ``config``.

    The size budget is configured in MiB and converted to bytes here; empty
    storage overrides are normalized to ``None``.
    """
    from app.config import config

    budget_bytes = config.ETL_CACHE_MAX_TOTAL_MB * 1024 * 1024
    backend_override = config.ETL_CACHE_STORAGE_BACKEND or None
    container_override = config.ETL_CACHE_STORAGE_CONTAINER or None
    local_root_override = config.ETL_CACHE_STORAGE_LOCAL_PATH or None
    return EtlCacheSettings(
        enabled=config.ETL_CACHE_ENABLED,
        parser_version=config.ETL_CACHE_PARSER_VERSION,
        ttl_days=config.ETL_CACHE_TTL_DAYS,
        max_total_bytes=budget_bytes,
        eviction_batch=config.ETL_CACHE_EVICTION_BATCH,
        storage_backend=backend_override,
        storage_container=container_override,
        storage_local_root=local_root_override,
    )

View file

@ -0,0 +1,9 @@
"""Blob storage for cached parse markdown."""
from __future__ import annotations
from .markdown_store import MarkdownCacheStore
__all__ = [
"MarkdownCacheStore",
]

View file

@ -0,0 +1,48 @@
"""Resolve the storage backend for cache blobs: shared main store or a dedicated one."""
from __future__ import annotations
from functools import lru_cache
from app.file_storage.backends.base import StorageBackend
@lru_cache(maxsize=1)
def resolve_cache_backend() -> StorageBackend:
from app.etl_pipeline.cache.settings import load_etl_cache_settings
settings = load_etl_cache_settings()
if not settings.storage_backend:
from app.file_storage.factory import get_storage_backend
return get_storage_backend()
backend = settings.storage_backend.strip().lower()
if backend == "azure":
from app.config import config
if not settings.storage_container:
raise ValueError("ETL_CACHE_STORAGE_CONTAINER is required for azure cache.")
if not config.AZURE_STORAGE_CONNECTION_STRING:
raise ValueError(
"AZURE_STORAGE_CONNECTION_STRING is required for azure cache."
)
from app.file_storage.backends.azure import AzureBlobBackend
return AzureBlobBackend(
connection_string=config.AZURE_STORAGE_CONNECTION_STRING,
container=settings.storage_container,
)
if backend == "local":
if not settings.storage_local_root:
raise ValueError(
"ETL_CACHE_STORAGE_LOCAL_PATH is required for local cache."
)
from app.file_storage.backends.local import LocalFileBackend
return LocalFileBackend(settings.storage_local_root)
raise ValueError(f"Unknown ETL_CACHE_STORAGE_BACKEND: {settings.storage_backend!r}")

View file

@ -0,0 +1,35 @@
"""Read and write cached markdown blobs through the resolved backend."""
from __future__ import annotations
from app.etl_pipeline.cache.schemas import ParseKey
from app.etl_pipeline.cache.storage.backend import resolve_cache_backend
from app.etl_pipeline.cache.storage.object_keys import build_parse_object_key
_MARKDOWN_CONTENT_TYPE = "text/markdown; charset=utf-8"
class MarkdownCacheStore:
    """Stores cached markdown as UTF-8 blobs on the backend from ``resolve_cache_backend``."""

    def __init__(self) -> None:
        self._backend = resolve_cache_backend()

    @property
    def backend_name(self) -> str:
        """Name reported by the underlying backend."""
        backend = self._backend
        return backend.backend_name

    async def save(self, key: ParseKey, markdown: str) -> str:
        """Write ``markdown`` under the object key derived from ``key`` and return that key."""
        object_key = build_parse_object_key(key)
        payload = markdown.encode("utf-8")
        await self._backend.put(
            object_key,
            payload,
            content_type=_MARKDOWN_CONTENT_TYPE,
        )
        return object_key

    async def load(self, storage_key: str) -> str:
        """Read the whole blob at ``storage_key`` and decode it as UTF-8."""
        pieces = []
        async for piece in self._backend.open_stream(storage_key):
            pieces.append(piece)
        return b"".join(pieces).decode("utf-8")

    async def delete(self, storage_key: str) -> None:
        """Remove the blob at ``storage_key`` from the backend."""
        await self._backend.delete(storage_key)

View file

@ -0,0 +1,12 @@
"""Object keys for cached markdown, namespaced under a dedicated prefix."""
from __future__ import annotations
from app.etl_pipeline.cache.schemas import ParseKey
CACHE_PREFIX = "etl_cache"
def build_parse_object_key(key: ParseKey) -> str:
    """Return ``<CACHE_PREFIX>/<source_sha256>/<object_suffix>`` for ``key``."""
    # Content-addressed: identical bytes + recipe always map to the same key.
    segments = (CACHE_PREFIX, key.source_sha256, key.object_suffix)
    return "/".join(segments)

View file

@ -8,7 +8,7 @@ from app.config import config
def require_gateway_enabled() -> None: def require_gateway_enabled() -> None:
"""FastAPI dependency that gates all gateway HTTP routes on the global flag. """FastAPI dependency that gates gateway operational routes on the global flag.
Returns 404 (rather than 503) when ``GATEWAY_ENABLED`` is FALSE so that Returns 404 (rather than 503) when ``GATEWAY_ENABLED`` is FALSE so that
disabling the gateway makes its webhook/OAuth/pairing surface indistinguishable disabling the gateway makes its webhook/OAuth/pairing surface indistinguishable

View file

@ -0,0 +1,11 @@
"""Content-addressed reuse of chunk+embedding output across workspaces."""
from __future__ import annotations
from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings
from app.indexing_pipeline.cache.service import EmbeddingCacheService
__all__ = [
"EmbeddingCacheService",
"build_chunk_embeddings",
]

View file

@ -0,0 +1,129 @@
"""Entry point: serve chunk embeddings from cache, embedding only on a miss.
Embeddings are a pure function of the markdown, the embedding model, and the
chunker -- so identical markdown is chunked and embedded once and reused across
workspaces, even when it came from different sources.
"""
from __future__ import annotations
import asyncio
import hashlib
import logging
import numpy as np
from app.config import config
from app.indexing_pipeline.cache.eligibility import is_embedding_cacheable
from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet
from app.indexing_pipeline.cache.service import EmbeddingCacheService
from app.indexing_pipeline.cache.settings import load_embedding_cache_settings
from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
from app.indexing_pipeline.document_embedder import embed_texts
from app.observability import metrics
logger = logging.getLogger(__name__)
ChunkPair = tuple[str, np.ndarray]
async def build_chunk_embeddings(
    markdown: str, *, use_code_chunker: bool
) -> tuple[np.ndarray, list[ChunkPair]]:
    """Return the document-level vector and the ordered ``(chunk_text, vector)`` pairs.

    When the embedding cache is usable, output previously stored for the same
    markdown, embedding model and chunker recipe is returned instead of
    re-embedding; a fresh computation is stored for later reuse. Each cache
    lookup records a ``hit`` or ``miss`` metric.
    """
    cache_settings = load_embedding_cache_settings()
    chunker_kind = "code" if use_code_chunker else "hybrid"
    embedding_dim = getattr(config.embedding_model_instance, "dimension", None)

    if not is_embedding_cacheable(
        cache_enabled=cache_settings.enabled,
        embedding_model=config.EMBEDDING_MODEL,
        embedding_dim=embedding_dim,
    ):
        return await _compute(markdown, use_code_chunker=use_code_chunker)

    key = EmbeddingKey(
        markdown_sha256=_hash_text(markdown),
        embedding_model=config.EMBEDDING_MODEL,
        embedding_dim=int(embedding_dim),
        chunker_kind=chunker_kind,
        chunker_version=cache_settings.chunker_version,
    )

    hit = await _recall(key)
    metrics.record_embedding_cache_lookup(
        embedding_model=key.embedding_model,
        chunker_kind=chunker_kind,
        outcome="miss" if hit is None else "hit",
    )
    if hit is not None:
        logger.debug("Embedding cache hit for %s", key.markdown_sha256)
        pairs = [(chunk.text, chunk.embedding) for chunk in hit.chunks]
        return hit.summary_embedding, pairs

    summary_vector, fresh_pairs = await _compute(
        markdown, use_code_chunker=use_code_chunker
    )
    await _remember(key, summary_vector, fresh_pairs)
    return summary_vector, fresh_pairs
async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]:
    """Split ``markdown`` into ordered chunk texts in a worker thread.

    ``use_code_chunker`` selects ``chunk_text`` in code mode; otherwise the
    hybrid chunker is used.
    """
    if not use_code_chunker:
        # Table-aware hybrid chunker keeps Markdown tables intact (issue #1334).
        return await asyncio.to_thread(chunk_text_hybrid, markdown)
    return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True)
async def embed_batch(texts: list[str]) -> list[np.ndarray]:
    """Embed ``texts`` with a single ``embed_texts`` call run in a worker thread."""
    vectors = await asyncio.to_thread(embed_texts, texts)
    return vectors
async def _compute(
    markdown: str, *, use_code_chunker: bool
) -> tuple[np.ndarray, list[ChunkPair]]:
    """Chunk ``markdown`` and embed the whole document plus every chunk in one batch.

    Returns:
        The document-level vector and the ordered ``(chunk_text, vector)`` pairs.

    Raises:
        ValueError: if the embedder returns a different number of vectors than
            the number of texts submitted.
    """
    chunk_texts = await chunk_markdown(markdown, use_code_chunker=use_code_chunker)
    # The full markdown is submitted first, so its vector comes back first.
    embeddings = await embed_batch([markdown, *chunk_texts])
    summary_embedding, *chunk_embeddings = embeddings
    # strict=True: with a count mismatch, a lenient zip would silently drop
    # chunks, and the truncated set would then be written to the shared cache.
    return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=True))
async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
    """Look ``key`` up in the embedding cache; return ``None`` on a miss or on any failure."""
    # Best-effort: the lazy import sits inside the guard too, so any failure
    # here degrades to a normal embed instead of an error.
    try:
        from app.tasks.celery_tasks import get_celery_session_maker

        session_maker = get_celery_session_maker()
        async with session_maker() as session:
            cache = EmbeddingCacheService(session)
            return await cache.recall(key)
    except Exception:
        logger.warning("Embedding cache recall failed; embedding fresh", exc_info=True)
    return None
async def _remember(
    key: EmbeddingKey, summary_embedding: np.ndarray, chunk_pairs: list[ChunkPair]
) -> None:
    """Store the computed vectors under ``key``; a failed write is logged and otherwise ignored."""
    try:
        from app.tasks.celery_tasks import get_celery_session_maker

        cached_chunks = [
            CachedChunk(text=body, embedding=vector) for body, vector in chunk_pairs
        ]
        payload = EmbeddingSet(
            summary_embedding=summary_embedding,
            chunks=cached_chunks,
        )
        session_maker = get_celery_session_maker()
        async with session_maker() as session:
            cache = EmbeddingCacheService(session)
            await cache.remember(key, payload)
    except Exception:
        logger.warning("Embedding cache write failed; result not cached", exc_info=True)
def _hash_text(text: str) -> str:
return hashlib.sha256(text.encode("utf-8")).hexdigest()

View file

@ -0,0 +1,21 @@
"""Gating rule: may this document be served from / written to the embedding cache?"""
from __future__ import annotations
def is_embedding_cacheable(
    *,
    cache_enabled: bool,
    embedding_model: str | None,
    embedding_dim: int | None,
) -> bool:
    """Return True only when the cache is on and both model and dimension are set.

    A missing model leaves nothing to key against, and a missing (or zero)
    dimension means the blob's integrity guard cannot run, so either one
    bypasses the cache.
    """
    return bool(cache_enabled and embedding_model and embedding_dim)

View file

@ -0,0 +1,9 @@
"""Background pruning of the embedding cache by age and size budget."""
from __future__ import annotations
from .task import evict_embedding_cache_task
__all__ = [
"evict_embedding_cache_task",
]

View file

@ -0,0 +1,68 @@
"""Celery task that prunes the embedding cache by TTL, then by size budget."""
from __future__ import annotations
import contextlib
import logging
from datetime import UTC, datetime, timedelta
from app.celery_app import celery_app
from app.etl_pipeline.cache.eviction.policy import select_over_budget
from app.etl_pipeline.cache.schemas import EvictionCandidate
from app.indexing_pipeline.cache.persistence import CachedEmbeddingSetRepository
from app.indexing_pipeline.cache.settings import load_embedding_cache_settings
from app.indexing_pipeline.cache.storage import EmbeddingCacheStore
from app.observability import metrics
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
logger = logging.getLogger(__name__)
@celery_app.task(name="evict_embedding_cache")
def evict_embedding_cache_task():
    """Celery task ``evict_embedding_cache``: run the async ``_evict`` routine via ``run_async_celery_task``."""
    return run_async_celery_task(_evict)
async def _evict() -> None:
    """Prune the embedding cache: first by age, then by size if still over budget.

    Does nothing when the cache is disabled. Each phase handles at most
    ``eviction_batch`` rows per run.
    """
    cache_settings = load_embedding_cache_settings()
    if not cache_settings.enabled:
        return

    blob_store = EmbeddingCacheStore()
    session_maker = get_celery_session_maker()
    async with session_maker() as session:
        repo = CachedEmbeddingSetRepository(session)
        batch = cache_settings.eviction_batch

        # Phase 1: entries not used within the TTL window.
        oldest_allowed = datetime.now(UTC) - timedelta(days=cache_settings.ttl_days)
        stale = await repo.select_expired(cutoff=oldest_allowed, limit=batch)
        await _drop(repo, blob_store, stale, phase="ttl")

        # Phase 2: only if the remaining footprint still exceeds the budget.
        budget = cache_settings.max_total_bytes
        footprint = await repo.total_size_bytes()
        if footprint <= budget:
            return
        coldest = await repo.select_coldest(limit=batch)
        overflow = select_over_budget(
            coldest,
            current_total_bytes=footprint,
            max_total_bytes=budget,
        )
        await _drop(repo, blob_store, overflow, phase="size")
async def _drop(
    index: CachedEmbeddingSetRepository,
    store: EmbeddingCacheStore,
    candidates: list[EvictionCandidate],
    *,
    phase: str,
) -> None:
    """Delete the blobs and index rows for ``candidates`` and record the eviction.

    ``phase`` labels the metric and the log line. An empty list is a no-op.
    """
    if not candidates:
        return

    row_ids = []
    for entry in candidates:
        # A failed blob delete must not keep the index row alive; the leftover
        # blob is merely an orphan.
        with contextlib.suppress(Exception):
            await store.delete(entry.storage_key)
        row_ids.append(entry.id)

    await index.delete_by_ids(row_ids)
    evicted = len(row_ids)
    metrics.record_embedding_cache_eviction(evicted, phase=phase)
    logger.info("Evicted %d cached embedding sets (%s)", evicted, phase)

View file

@ -0,0 +1,11 @@
"""Database access for cached embedding sets."""
from __future__ import annotations
from .models import CachedEmbeddingSet
from .repository import CachedEmbeddingSetRepository
__all__ = [
"CachedEmbeddingSet",
"CachedEmbeddingSetRepository",
]

View file

@ -0,0 +1,47 @@
"""``embedding_cache_sets``: one reusable chunk+embedding set per markdown."""
from __future__ import annotations
from sqlalchemy import (
BigInteger,
Column,
DateTime,
Index,
Integer,
String,
UniqueConstraint,
)
from app.db import BaseModel, TimestampMixin
class CachedEmbeddingSet(BaseModel, TimestampMixin):
    """Index row for one cached chunk+embedding set.

    A row is unique per (markdown_sha256, embedding_model, chunker_kind,
    chunker_version) and points at the embedding blob through
    ``storage_backend`` / ``storage_key``. ``last_used_at`` is indexed because
    eviction filters and sorts on it.
    """

    __tablename__ = "embedding_cache_sets"
    # Key: markdown text + the recipe that turned it into vectors.
    markdown_sha256 = Column(String(64), nullable=False)
    embedding_model = Column(String(255), nullable=False)
    # NOTE(review): stored on the row but not part of the unique constraint
    # below -- confirm the dimension is validated elsewhere on recall.
    embedding_dim = Column(Integer, nullable=False)
    chunker_kind = Column(String(8), nullable=False)
    chunker_version = Column(Integer, nullable=False)
    # Where the embedding blob lives (kept out of the row to stay small).
    storage_backend = Column(String(32), nullable=False)
    storage_key = Column(String, nullable=False)
    size_bytes = Column(BigInteger, nullable=False)
    chunk_count = Column(Integer, nullable=False, default=0, server_default="0")
    # Drives eviction (popularity + recency).
    times_reused = Column(BigInteger, nullable=False, default=0, server_default="0")
    last_used_at = Column(DateTime(timezone=True), nullable=False)
    __table_args__ = (
        # The constraint name is referenced by the repository's ON CONFLICT clause.
        UniqueConstraint(
            "markdown_sha256",
            "embedding_model",
            "chunker_kind",
            "chunker_version",
            name="uq_embedding_cache_sets_key",
        ),
        Index("ix_embedding_cache_sets_last_used_at", "last_used_at"),
    )

View file

@ -0,0 +1,126 @@
"""CRUD and eviction selectors for ``embedding_cache_sets`` (no business rules)."""
from __future__ import annotations
from datetime import UTC, datetime
from sqlalchemy import delete, func, select, update
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.ext.asyncio import AsyncSession
from app.etl_pipeline.cache.schemas import EvictionCandidate
from app.indexing_pipeline.cache.schemas import EmbeddingKey
from .models import CachedEmbeddingSet
# Columns projected by the eviction selectors; they match the fields that
# ``_as_eviction_candidate`` copies into an ``EvictionCandidate``.
_EVICTION_COLUMNS = (
    CachedEmbeddingSet.id,
    CachedEmbeddingSet.storage_key,
    CachedEmbeddingSet.size_bytes,
    CachedEmbeddingSet.last_used_at,
    CachedEmbeddingSet.times_reused,
)
def _as_eviction_candidate(row) -> EvictionCandidate:
    """Copy the eviction columns of a result ``row`` into an ``EvictionCandidate``."""
    wanted = ("id", "storage_key", "size_bytes", "last_used_at", "times_reused")
    values = {name: getattr(row, name) for name in wanted}
    return EvictionCandidate(**values)
class CachedEmbeddingSetRepository:
    """Data access for ``embedding_cache_sets`` bound to a single ``AsyncSession``.

    Every mutating method (``insert``, ``mark_used``, ``delete_by_ids``) commits
    the session itself.
    """

    def __init__(self, session: AsyncSession) -> None:
        self._session = session

    async def get(self, key: EmbeddingKey) -> CachedEmbeddingSet | None:
        """Return the row matching the markdown hash, model and chunker recipe, or ``None``."""
        stmt = select(CachedEmbeddingSet).where(
            CachedEmbeddingSet.markdown_sha256 == key.markdown_sha256,
            CachedEmbeddingSet.embedding_model == key.embedding_model,
            CachedEmbeddingSet.chunker_kind == key.chunker_kind,
            CachedEmbeddingSet.chunker_version == key.chunker_version,
        )
        rows = await self._session.execute(stmt)
        return rows.scalars().first()

    async def insert(
        self,
        *,
        key: EmbeddingKey,
        storage_backend: str,
        storage_key: str,
        size_bytes: int,
        chunk_count: int,
    ) -> None:
        """Insert a fresh row for ``key``; an existing row with the same key is left untouched."""
        # Concurrent writers embed identical markdown, so a lost race is harmless.
        stamp = datetime.now(UTC)
        values = {
            "markdown_sha256": key.markdown_sha256,
            "embedding_model": key.embedding_model,
            "embedding_dim": key.embedding_dim,
            "chunker_kind": key.chunker_kind,
            "chunker_version": key.chunker_version,
            "storage_backend": storage_backend,
            "storage_key": storage_key,
            "size_bytes": size_bytes,
            "chunk_count": chunk_count,
            "times_reused": 0,
            "last_used_at": stamp,
            "created_at": stamp,
        }
        stmt = (
            pg_insert(CachedEmbeddingSet)
            .values(**values)
            .on_conflict_do_nothing(constraint="uq_embedding_cache_sets_key")
        )
        await self._session.execute(stmt)
        await self._session.commit()

    async def mark_used(self, row_id: int) -> None:
        """Increment ``times_reused`` and refresh ``last_used_at`` for ``row_id``."""
        stmt = (
            update(CachedEmbeddingSet)
            .where(CachedEmbeddingSet.id == row_id)
            .values(
                times_reused=CachedEmbeddingSet.times_reused + 1,
                last_used_at=datetime.now(UTC),
            )
        )
        await self._session.execute(stmt)
        await self._session.commit()

    async def total_size_bytes(self) -> int:
        """Return the sum of ``size_bytes`` over all rows (0 for an empty table)."""
        stmt = select(func.coalesce(func.sum(CachedEmbeddingSet.size_bytes), 0))
        outcome = await self._session.execute(stmt)
        return int(outcome.scalar() or 0)

    async def select_expired(
        self, *, cutoff: datetime, limit: int
    ) -> list[EvictionCandidate]:
        """Return up to ``limit`` rows last used before ``cutoff``, oldest first."""
        stmt = (
            select(*_EVICTION_COLUMNS)
            .where(CachedEmbeddingSet.last_used_at < cutoff)
            .order_by(CachedEmbeddingSet.last_used_at.asc())
            .limit(limit)
        )
        rows = await self._session.execute(stmt)
        return list(map(_as_eviction_candidate, rows))

    async def select_coldest(self, *, limit: int) -> list[EvictionCandidate]:
        """Return up to ``limit`` rows ordered by fewest reuses, then least recently used."""
        stmt = (
            select(*_EVICTION_COLUMNS)
            .order_by(
                CachedEmbeddingSet.times_reused.asc(),
                CachedEmbeddingSet.last_used_at.asc(),
            )
            .limit(limit)
        )
        rows = await self._session.execute(stmt)
        return list(map(_as_eviction_candidate, rows))

    async def delete_by_ids(self, ids: list[int]) -> None:
        """Delete the rows whose primary keys are in ``ids``; an empty list is a no-op."""
        if not ids:
            return
        stmt = delete(CachedEmbeddingSet).where(CachedEmbeddingSet.id.in_(ids))
        await self._session.execute(stmt)
        await self._session.commit()

View file

@ -0,0 +1,12 @@
"""Pure value objects for the embedding cache."""
from __future__ import annotations
from .embedding_key import EmbeddingKey
from .embedding_set import CachedChunk, EmbeddingSet
__all__ = [
"CachedChunk",
"EmbeddingKey",
"EmbeddingSet",
]

View file

@ -0,0 +1,27 @@
"""Identity of a cacheable embedding set: equal keys yield identical vectors.
Embeddings depend on the markdown text, the embedding model, and the chunker --
never on how the markdown was produced. So the key is the markdown's own hash
plus the model and chunker recipe, not the upstream parse identity.
"""
from __future__ import annotations
import hashlib
from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
class EmbeddingKey:
markdown_sha256: str
embedding_model: str
embedding_dim: int
chunker_kind: str
chunker_version: int
@property
def object_suffix(self) -> str:
# Fingerprint the model so distinct models never share a blob, while the
# markdown hash (the object's folder) stays human-readable.
fingerprint = hashlib.sha256(self.embedding_model.encode("utf-8")).hexdigest()
return f"{fingerprint[:16]}.{self.chunker_kind}.v{self.chunker_version}.emb"

View file

@ -0,0 +1,29 @@
"""The cached payload: a document's chunk texts paired with their vectors."""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
@dataclass(frozen=True, slots=True)
class CachedChunk:
text: str
embedding: np.ndarray
@dataclass(frozen=True, slots=True)
class EmbeddingSet:
"""Everything the indexer needs to rebuild a document's chunks without embedding.
``summary_embedding`` is the document-level vector; ``chunks`` are the ordered
chunk texts and their vectors.
"""
summary_embedding: np.ndarray
chunks: list[CachedChunk]
@property
def chunk_count(self) -> int:
return len(self.chunks)

View file

@ -0,0 +1,75 @@
"""Serialize an EmbeddingSet to a compact, self-describing blob (no pickle).
Layout: ``MAGIC | uint32 header_len | json header | float32 matrix``. The header
carries the dim, chunk count, and ordered chunk texts; the matrix holds the
summary vector followed by one row per chunk, all float32 for compactness.
"""
from __future__ import annotations
import json
import struct
import numpy as np
from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingSet
# Marker at the start of every blob: "SurfSense EMBeddings, version 1" -> SSEMB1.
# It lets us reject foreign blobs; bump the trailing digit if the layout changes.
_MAGIC = b"SSEMB1"
# 4-byte big-endian unsigned int written before the variable-length JSON header,
# so the reader knows where the header ends and the float matrix begins.
_HEADER_LEN = struct.Struct(">I")
def serialize(embedding_set: EmbeddingSet) -> bytes:
    """Encode ``embedding_set`` as ``MAGIC | header length | JSON header | matrix``.

    The matrix is float32 with the summary vector in row 0 and one row per
    chunk after it, in chunk order. The JSON header stores the dimension, the
    chunk count and the chunk texts.

    Raises:
        ValueError: if a chunk vector's length differs from the summary's.
    """
    summary_row = np.asarray(
        embedding_set.summary_embedding, dtype=np.float32
    ).reshape(-1)
    width = int(summary_row.shape[0])

    vectors = [summary_row]
    chunk_texts: list[str] = []
    for item in embedding_set.chunks:
        row = np.asarray(item.embedding, dtype=np.float32).reshape(-1)
        if row.shape[0] != width:
            raise ValueError(
                "All vectors in an embedding set must share one dimension."
            )
        vectors.append(row)
        chunk_texts.append(item.text)

    payload = np.stack(vectors, axis=0)
    meta = {"dim": width, "count": len(chunk_texts), "texts": chunk_texts}
    # ensure_ascii=False keeps non-ASCII chunk text as raw UTF-8 in the header.
    meta_bytes = json.dumps(meta, ensure_ascii=False).encode("utf-8")
    length_prefix = _HEADER_LEN.pack(len(meta_bytes))
    return b"".join([_MAGIC, length_prefix, meta_bytes, payload.tobytes(order="C")])
def deserialize(blob: bytes) -> EmbeddingSet:
    """Decode a blob produced by ``serialize`` back into an ``EmbeddingSet``.

    The returned vectors are rows of one array built with ``np.frombuffer``
    over ``blob``, so they are views on the blob rather than copies.

    Raises:
        ValueError: if the magic marker is wrong, the blob is too short for
            its header, the header is not valid JSON with consistent
            ``dim``/``count``/``texts`` fields, or the float payload does not
            match the size the header announces.
    """
    corrupt = "Embedding cache blob is truncated or corrupt."
    view = memoryview(blob)
    magic_len = len(_MAGIC)
    if bytes(view[:magic_len]) != _MAGIC:
        raise ValueError("Unrecognized embedding cache blob.")

    # Check lengths up front: unpacking a short slice would raise
    # struct.error instead of the ValueError used for corrupt blobs.
    header_start = magic_len + _HEADER_LEN.size
    if len(view) < header_start:
        raise ValueError(corrupt)
    (header_len,) = _HEADER_LEN.unpack_from(view, magic_len)
    matrix_start = header_start + header_len
    if len(view) < matrix_start:
        raise ValueError(corrupt)

    try:
        header = json.loads(bytes(view[header_start:matrix_start]).decode("utf-8"))
        dim = int(header["dim"])
        count = int(header["count"])
        texts: list[str] = header["texts"]
    except (KeyError, TypeError, ValueError) as exc:
        raise ValueError(corrupt) from exc
    # A count that disagrees with the texts would otherwise surface as an
    # IndexError (or silently drop texts) when the chunks are rebuilt below.
    if dim < 0 or count < 0 or not isinstance(texts, list) or len(texts) != count:
        raise ValueError(corrupt)

    # frombuffer itself raises ValueError when the payload is not a whole
    # number of float32 values.
    matrix = np.frombuffer(view[matrix_start:], dtype=np.float32)
    if matrix.shape[0] != (count + 1) * dim:
        raise ValueError(corrupt)
    matrix = matrix.reshape(count + 1, dim)
    return EmbeddingSet(
        summary_embedding=matrix[0],
        chunks=[
            CachedChunk(text=text, embedding=matrix[row])
            for row, text in enumerate(texts, start=1)
        ],
    )

View file

@ -0,0 +1,51 @@
"""Recall and remember embedding sets, coordinating the index and blob store."""
from __future__ import annotations
import logging
from sqlalchemy.ext.asyncio import AsyncSession
from app.indexing_pipeline.cache.persistence import CachedEmbeddingSetRepository
from app.indexing_pipeline.cache.schemas import EmbeddingKey, EmbeddingSet
from app.indexing_pipeline.cache.storage import EmbeddingCacheStore
logger = logging.getLogger(__name__)
class EmbeddingCacheService:
    """Looks up and stores embedding sets using a DB index plus a blob store."""

    def __init__(self, session: AsyncSession) -> None:
        self._index = CachedEmbeddingSetRepository(session)
        self._store = EmbeddingCacheStore()

    async def recall(self, key: EmbeddingKey) -> EmbeddingSet | None:
        """Fetch the embedding set cached under ``key``; ``None`` means a miss.

        A miss is reported when the index has no row, when loading the blob
        fails for any reason, or when the loaded summary vector's length
        differs from ``key.embedding_dim``. On a hit the row is marked used.
        """
        entry = await self._index.get(key)
        if entry is None:
            return None

        try:
            cached = await self._store.load(entry.storage_key)
        except Exception:
            # The index row has no loadable blob behind it: report a miss so
            # the caller re-embeds.
            logger.warning("Cache blob missing: %s", entry.storage_key, exc_info=True)
            return None

        cached_dim = int(cached.summary_embedding.shape[0])
        if cached_dim == key.embedding_dim:
            await self._index.mark_used(entry.id)
            return cached

        # Same model name but a different vector size: refuse to serve it.
        logger.warning("Cached embedding dimension mismatch: %s", entry.storage_key)
        return None

    async def remember(self, key: EmbeddingKey, embedding_set: EmbeddingSet) -> None:
        """Write ``embedding_set`` to the blob store, then index it under ``key``."""
        saved_key, saved_size = await self._store.save(key, embedding_set)
        await self._index.insert(
            key=key,
            storage_backend=self._store.backend_name,
            storage_key=saved_key,
            size_bytes=saved_size,
            chunk_count=embedding_set.chunk_count,
        )

View file

@ -0,0 +1,30 @@
"""Embedding-cache configuration resolved from the central ``Config``.
The blob backend is intentionally not configured here: it is shared with the ETL
parse cache (see ``ETL_CACHE_STORAGE_*``).
"""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True)
class EmbeddingCacheSettings:
    """Immutable snapshot of the embedding-cache configuration values."""

    # Value of EMBEDDING_CACHE_ENABLED.
    enabled: bool
    # Value of EMBEDDING_CACHE_CHUNKER_VERSION.
    chunker_version: int
    # Value of EMBEDDING_CACHE_TTL_DAYS.
    ttl_days: int
    # EMBEDDING_CACHE_MAX_TOTAL_MB converted to bytes by the loader.
    max_total_bytes: int
    # Value of EMBEDDING_CACHE_EVICTION_BATCH.
    eviction_batch: int
def load_embedding_cache_settings() -> EmbeddingCacheSettings:
    """Build ``EmbeddingCacheSettings`` from the application's central config."""
    # The config module is imported at call time, not at module import time.
    from app.config import config as app_config

    bytes_per_megabyte = 1024 * 1024
    size_cap = app_config.EMBEDDING_CACHE_MAX_TOTAL_MB * bytes_per_megabyte
    return EmbeddingCacheSettings(
        enabled=app_config.EMBEDDING_CACHE_ENABLED,
        chunker_version=app_config.EMBEDDING_CACHE_CHUNKER_VERSION,
        ttl_days=app_config.EMBEDDING_CACHE_TTL_DAYS,
        max_total_bytes=size_cap,
        eviction_batch=app_config.EMBEDDING_CACHE_EVICTION_BATCH,
    )

View file

@ -0,0 +1,9 @@
"""Blob storage for cached embedding sets."""
from __future__ import annotations
from .embedding_store import EmbeddingCacheStore
__all__ = [
"EmbeddingCacheStore",
]

View file

@ -0,0 +1,39 @@
"""Read and write cached embedding blobs through the shared cache backend.
The blob backend is shared with the ETL parse cache (same bucket / root), so
markdown and its embeddings live side by side; only the object prefix differs.
"""
from __future__ import annotations
from app.etl_pipeline.cache.storage.backend import resolve_cache_backend
from app.indexing_pipeline.cache.schemas import EmbeddingKey, EmbeddingSet
from app.indexing_pipeline.cache.serialization import deserialize, serialize
from app.indexing_pipeline.cache.storage.object_keys import build_embedding_object_key
_EMBEDDING_CONTENT_TYPE = "application/octet-stream"
class EmbeddingCacheStore:
    """Saves, loads and deletes serialized embedding sets via the cache backend."""

    def __init__(self) -> None:
        self._backend = resolve_cache_backend()

    @property
    def backend_name(self) -> str:
        """Name reported by the underlying backend."""
        return self._backend.backend_name

    async def save(
        self, key: EmbeddingKey, embedding_set: EmbeddingSet
    ) -> tuple[str, int]:
        """Write the serialized set; return ``(storage key, size in bytes)``."""
        object_key = build_embedding_object_key(key)
        payload = serialize(embedding_set)
        await self._backend.put(
            object_key, payload, content_type=_EMBEDDING_CONTENT_TYPE
        )
        return object_key, len(payload)

    async def load(self, storage_key: str) -> EmbeddingSet:
        """Read the whole blob at ``storage_key`` and deserialize it."""
        pieces = []
        async for piece in self._backend.open_stream(storage_key):
            pieces.append(piece)
        return deserialize(b"".join(pieces))

    async def delete(self, storage_key: str) -> None:
        """Remove the blob stored at ``storage_key``."""
        await self._backend.delete(storage_key)

View file

@ -0,0 +1,12 @@
"""Object keys for cached embedding sets, namespaced under a dedicated prefix."""
from __future__ import annotations
from app.indexing_pipeline.cache.schemas import EmbeddingKey
CACHE_PREFIX = "embedding_cache"


def build_embedding_object_key(key: EmbeddingKey) -> str:
    """Return ``<prefix>/<markdown hash>/<object suffix>`` for ``key``."""
    # The path is derived only from the key, so equal keys give equal paths.
    folder = key.markdown_sha256
    file_name = key.object_suffix
    return f"{CACHE_PREFIX}/{folder}/{file_name}"

View file

@ -0,0 +1,56 @@
"""Diff a document's existing chunk rows against its freshly chunked texts.
Embeddings are a pure function of chunk text, so a row whose content reappears
in the new chunking keeps its embedding (and its HNSW/GIN index entries); only
genuinely new texts are embedded and only vanished rows are deleted. Matching
is a greedy multiset match on content in document order, so duplicate
boilerplate chunks pair up one-to-one and reordered chunks become cheap
position updates instead of delete+reinsert.
"""
from __future__ import annotations
from collections import defaultdict, deque
from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
class ExistingChunk:
id: int
content: str
position: int
@dataclass(frozen=True, slots=True)
class ChunkPlan:
"""The minimal set of writes that turns the stored chunks into the new ones.
``reused`` holds only kept rows whose position actually changed; rows that
match in place need no write at all. Kept-row count (for metrics) is
``len(existing) - len(to_delete)``.
"""
reused: list[tuple[int, int]] # (existing_chunk_id, new_position)
to_embed: list[tuple[int, str]] # (new_position, text)
to_delete: list[int] # existing chunk ids
def reconcile(existing: list[ExistingChunk], new_texts: list[str]) -> ChunkPlan:
    """Match stored chunks to ``new_texts`` by content and plan the writes.

    Each new text, taken in order, claims the lowest-positioned unclaimed
    stored row with identical content. A claimed row whose position differs
    from its new slot goes into ``reused``; a text with no row left to claim
    goes into ``to_embed``; rows never claimed go into ``to_delete``.
    """
    # Per content, a queue of stored rows in ascending position order.
    pool: dict[str, deque[ExistingChunk]] = defaultdict(deque)
    ordered = list(existing)
    ordered.sort(key=lambda row: row.position)
    for row in ordered:
        pool[row.content].append(row)

    moves: list[tuple[int, int]] = []
    fresh: list[tuple[int, str]] = []
    for slot, text in enumerate(new_texts):
        # .get() avoids creating empty queues for unseen texts.
        candidates = pool.get(text)
        if not candidates:
            fresh.append((slot, text))
            continue
        kept = candidates.popleft()
        if kept.position != slot:
            moves.append((kept.id, slot))

    orphans: list[int] = []
    for leftovers in pool.values():
        orphans.extend(row.id for row in leftovers)
    return ChunkPlan(reused=moves, to_embed=fresh, to_delete=orphans)

View file

@ -1,12 +1,12 @@
import contextlib import contextlib
import logging import logging
import time
from datetime import UTC, datetime from datetime import UTC, datetime
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import object_session
from sqlalchemy.orm.attributes import set_committed_value from sqlalchemy.orm.attributes import set_committed_value
from app.db import Document, DocumentStatus from app.db import Chunk, Document, DocumentStatus
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -22,7 +22,6 @@ async def rollback_and_persist_failure(
try: try:
await session.rollback() await session.rollback()
except Exception: except Exception:
# Session is completely dead; surface it but never raise.
logger.warning( logger.warning(
"Rollback failed; cannot persist failed status for document %s", "Rollback failed; cannot persist failed status for document %s",
getattr(document, "id", "unknown"), getattr(document, "id", "unknown"),
@ -35,8 +34,6 @@ async def rollback_and_persist_failure(
document.status = DocumentStatus.failed(message) document.status = DocumentStatus.failed(message)
await session.commit() await session.commit()
except Exception: except Exception:
# Best-effort: the document stays non-ready and is retried next sync.
# Log it so a permanently-stuck document is at least traceable.
logger.warning( logger.warning(
"Could not persist failed status for document %s; will retry next sync", "Could not persist failed status for document %s; will retry next sync",
getattr(document, "id", "unknown"), getattr(document, "id", "unknown"),
@ -46,12 +43,60 @@ async def rollback_and_persist_failure(
await session.rollback() await session.rollback()
def attach_chunks_to_document(document: Document, chunks: list) -> None: async def persist_scratch_index(
"""Assign chunks to a document without triggering SQLAlchemy async lazy loading.""" session: AsyncSession,
document: Document,
content: str,
chunks: list[Chunk],
*,
batch_size: int,
perf: logging.Logger,
) -> None:
"""Commit document content first, then chunk rows in batches, then mark ready."""
if document.id is None:
raise ValueError("document.id is required to persist chunks")
document.content = content
document.updated_at = datetime.now(UTC)
await session.commit()
t_persist = time.perf_counter()
total = len(chunks)
if total == 0:
set_committed_value(document, "chunks", [])
document.status = DocumentStatus.ready()
document.updated_at = datetime.now(UTC)
await session.commit()
return
effective_batch = total if batch_size <= 0 else batch_size
num_batches = (total + effective_batch - 1) // effective_batch
doc_id = document.id
for batch_idx, start in enumerate(range(0, total, effective_batch), start=1):
batch = chunks[start : start + effective_batch]
t_batch = time.perf_counter()
for chunk in batch:
chunk.document_id = doc_id
session.add_all(batch)
await session.commit()
perf.info(
"[indexing] chunk batch doc=%d batch=%d/%d rows=%d in %.3fs",
doc_id,
batch_idx,
num_batches,
len(batch),
time.perf_counter() - t_batch,
)
set_committed_value(document, "chunks", chunks) set_committed_value(document, "chunks", chunks)
session = object_session(document) document.status = DocumentStatus.ready()
if session is not None: document.updated_at = datetime.now(UTC)
if document.id is not None: await session.commit()
for chunk in chunks: perf.info(
chunk.document_id = document.id "[indexing] chunk persist doc=%d chunks=%d batches=%d in %.3fs",
session.add_all(chunks) doc_id,
total,
num_batches,
time.perf_counter() - t_persist,
)

View file

@ -14,6 +14,8 @@ from litellm.exceptions import (
) )
from sqlalchemy.exc import IntegrityError as IntegrityError from sqlalchemy.exc import IntegrityError as IntegrityError
from app.services.llm_error_adapter import LLMErrorCategory, adapt_llm_exception
# Tuples for use directly in except clauses. # Tuples for use directly in except clauses.
RETRYABLE_LLM_ERRORS = ( RETRYABLE_LLM_ERRORS = (
RateLimitError, RateLimitError,
@ -97,38 +99,20 @@ def safe_exception_message(exc: Exception) -> str:
def llm_retryable_message(exc: Exception) -> str: def llm_retryable_message(exc: Exception) -> str:
try: try:
if isinstance(exc, RateLimitError): adapted = adapt_llm_exception(exc)
return PipelineMessages.RATE_LIMIT if adapted.category is LLMErrorCategory.UNKNOWN:
if isinstance(exc, Timeout): return safe_exception_message(exc)
return PipelineMessages.LLM_TIMEOUT return adapted.user_message
if isinstance(exc, ServiceUnavailableError):
return PipelineMessages.LLM_UNAVAILABLE
if isinstance(exc, BadGatewayError):
return PipelineMessages.LLM_BAD_GATEWAY
if isinstance(exc, InternalServerError):
return PipelineMessages.LLM_SERVER_ERROR
if isinstance(exc, APIConnectionError):
return PipelineMessages.LLM_CONNECTION
return safe_exception_message(exc)
except Exception: except Exception:
return "Something went wrong when calling the LLM." return "Something went wrong when calling the LLM."
def llm_permanent_message(exc: Exception) -> str: def llm_permanent_message(exc: Exception) -> str:
try: try:
if isinstance(exc, AuthenticationError): adapted = adapt_llm_exception(exc)
return PipelineMessages.LLM_AUTH if adapted.category is LLMErrorCategory.UNKNOWN:
if isinstance(exc, PermissionDeniedError): return safe_exception_message(exc)
return PipelineMessages.LLM_PERMISSION return adapted.user_message
if isinstance(exc, NotFoundError):
return PipelineMessages.LLM_NOT_FOUND
if isinstance(exc, BadRequestError):
return PipelineMessages.LLM_BAD_REQUEST
if isinstance(exc, UnprocessableEntityError):
return PipelineMessages.LLM_UNPROCESSABLE
if isinstance(exc, APIResponseValidationError):
return PipelineMessages.LLM_RESPONSE
return safe_exception_message(exc)
except Exception: except Exception:
return "Something went wrong when calling the LLM." return "Something went wrong when calling the LLM."

View file

@ -8,7 +8,7 @@ from collections.abc import Awaitable, Callable
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import UTC, datetime from datetime import UTC, datetime
from sqlalchemy import delete, select from sqlalchemy import delete, select, update
from sqlalchemy.exc import IntegrityError from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
@ -19,16 +19,17 @@ from app.db import (
DocumentStatus, DocumentStatus,
DocumentType, DocumentType,
) )
from app.indexing_pipeline.cache import build_chunk_embeddings
from app.indexing_pipeline.cache.cached_indexing import chunk_markdown, embed_batch
from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile
from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
from app.indexing_pipeline.document_embedder import embed_texts
from app.indexing_pipeline.document_hashing import ( from app.indexing_pipeline.document_hashing import (
compute_content_hash, compute_content_hash,
compute_identifier_hash, compute_identifier_hash,
compute_unique_identifier_hash, compute_unique_identifier_hash,
) )
from app.indexing_pipeline.document_persistence import ( from app.indexing_pipeline.document_persistence import (
attach_chunks_to_document, persist_scratch_index,
rollback_and_persist_failure, rollback_and_persist_failure,
) )
from app.indexing_pipeline.exceptions import ( from app.indexing_pipeline.exceptions import (
@ -380,53 +381,50 @@ class IndexingPipelineService:
content = connector_doc.source_markdown content = connector_doc.source_markdown
await self.session.execute(
delete(Chunk).where(Chunk.document_id == document.id)
)
t_step = time.perf_counter() t_step = time.perf_counter()
if connector_doc.should_use_code_chunker: existing = await self._load_existing_chunks(document.id)
chunk_texts = await asyncio.to_thread( if existing and self._reconcile_enabled():
chunk_text, chunk_count = await self._reindex_incrementally(
connector_doc.source_markdown, document, content, connector_doc, existing
use_code_chunker=True,
) )
perf.info(
"[indexing] chunk+embed doc=%d chunks=%d in %.3fs",
document.id,
chunk_count,
time.perf_counter() - t_step,
)
document.content = content
document.updated_at = datetime.now(UTC)
document.status = DocumentStatus.ready()
await self.session.commit()
else: else:
# Use the table-aware hybrid chunker so Markdown tables are not from app.config import config
# split mid-row (see issue #1334).
chunk_texts = await asyncio.to_thread( chunks = await self._reindex_from_scratch(
chunk_text_hybrid, document, content, connector_doc
connector_doc.source_markdown, )
chunk_count = len(chunks)
perf.info(
"[indexing] chunk+embed doc=%d chunks=%d in %.3fs",
document.id,
chunk_count,
time.perf_counter() - t_step,
)
await persist_scratch_index(
self.session,
document,
content,
chunks,
batch_size=config.INDEXING_CHUNK_INSERT_BATCH_SIZE,
perf=perf,
) )
texts_to_embed = [content, *chunk_texts]
embeddings = await asyncio.to_thread(embed_texts, texts_to_embed)
summary_embedding, *chunk_embeddings = embeddings
chunks = [
Chunk(content=text, embedding=emb)
for text, emb in zip(chunk_texts, chunk_embeddings, strict=False)
]
perf.info(
"[indexing] chunk+embed doc=%d chunks=%d in %.3fs",
document.id,
len(chunks),
time.perf_counter() - t_step,
)
document.content = content
document.embedding = summary_embedding
attach_chunks_to_document(document, chunks)
document.updated_at = datetime.now(UTC)
document.status = DocumentStatus.ready()
await self.session.commit()
perf.info( perf.info(
"[indexing] index TOTAL doc=%d chunks=%d in %.3fs", "[indexing] index TOTAL doc=%d chunks=%d in %.3fs",
document.id, document.id,
len(chunks), chunk_count,
time.perf_counter() - t_index, time.perf_counter() - t_index,
) )
log_index_success(ctx, chunk_count=len(chunks)) log_index_success(ctx, chunk_count=chunk_count)
outcome_status = "success" outcome_status = "success"
await self._enqueue_ai_sort_if_enabled(document) await self._enqueue_ai_sort_if_enabled(document)
@ -483,6 +481,89 @@ class IndexingPipelineService:
persist_span_cm.__exit__(*sys.exc_info()) persist_span_cm.__exit__(*sys.exc_info())
return document return document
@staticmethod
def _reconcile_enabled() -> bool:
    """Return the current value of the ``CHUNK_RECONCILE_ENABLED`` setting."""
    # The config module is imported at call time, not at module import time.
    from app.config import config as app_config

    return app_config.CHUNK_RECONCILE_ENABLED
async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]:
    """Fetch id, content and position of every chunk row of ``document_id``."""
    # Only these three columns are selected.
    chunk_query = select(Chunk.id, Chunk.content, Chunk.position).where(
        Chunk.document_id == document_id
    )
    rows = await self.session.execute(chunk_query)
    snapshot: list[ExistingChunk] = []
    for record in rows:
        snapshot.append(
            ExistingChunk(
                id=record.id, content=record.content, position=record.position
            )
        )
    return snapshot
async def _reindex_from_scratch(
    self, document: Document, content: str, connector_doc: ConnectorDocument
) -> list[Chunk]:
    """Delete the document's chunk rows and build new ``Chunk`` objects.

    Sets ``document.embedding`` to the summary vector. The returned chunks
    carry their position but are not added to the session here.
    """
    purge = delete(Chunk).where(Chunk.document_id == document.id)
    await self.session.execute(purge)

    summary_embedding, chunk_pairs = await build_chunk_embeddings(
        content,
        use_code_chunker=connector_doc.should_use_code_chunker,
    )
    document.embedding = summary_embedding

    fresh_rows: list[Chunk] = []
    for position, (text, vector) in enumerate(chunk_pairs):
        fresh_rows.append(Chunk(content=text, embedding=vector, position=position))
    return fresh_rows
async def _reindex_incrementally(
    self,
    document: Document,
    content: str,
    connector_doc: ConnectorDocument,
    existing: list[ExistingChunk],
) -> int:
    """Re-index an edited document, embedding only texts with no stored row.

    Rows whose text survives keep their embedding and their HNSW/GIN index
    entries; rows that moved get a position-only UPDATE, which touches
    neither index. Returns the number of chunks in the new chunking.
    """
    new_texts = await chunk_markdown(
        content, use_code_chunker=connector_doc.should_use_code_chunker
    )
    plan = reconcile(existing, new_texts)

    # Single embedding call: document summary first, then the missing chunks.
    missing_texts = [text for _, text in plan.to_embed]
    vectors = await embed_batch([content, *missing_texts])
    summary_embedding, *new_embeddings = vectors

    if plan.reused:
        # Bulk UPDATE keyed by primary key: one parameter dict per moved row.
        moves = [{"id": chunk_id, "position": slot} for chunk_id, slot in plan.reused]
        await self.session.execute(update(Chunk), moves)

    if plan.to_delete:
        removal = delete(Chunk).where(Chunk.id.in_(plan.to_delete))
        await self.session.execute(removal)

    # strict=True raises if the embedder returned a different number of vectors.
    self.session.add_all(
        Chunk(
            content=text,
            embedding=vector,
            position=slot,
            document_id=document.id,
        )
        for (slot, text), vector in zip(plan.to_embed, new_embeddings, strict=True)
    )

    document.embedding = summary_embedding
    deleted_count = len(plan.to_delete)
    ot_metrics.record_chunk_reconcile(
        reused=len(existing) - deleted_count,
        embedded=len(plan.to_embed),
        deleted=deleted_count,
    )
    return len(new_texts)
async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None: async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
"""Fire-and-forget: enqueue incremental AI sort if the search space has it enabled.""" """Fire-and-forget: enqueue incremental AI sort if the search space has it enabled."""
try: try:

View file

@ -2,6 +2,9 @@
from __future__ import annotations from __future__ import annotations
# Matches notifications.title VARCHAR(200).
TITLE_MAX_LENGTH = 200
# Notifications newer than this are live-synced; older ones load via the list endpoint. # Notifications newer than this are live-synced; older ones load via the list endpoint.
SYNC_WINDOW_DAYS = 14 SYNC_WINDOW_DAYS = 14

View file

@ -28,7 +28,7 @@ class DocumentProcessingNotificationHandler(BaseNotificationHandler):
) -> Notification: ) -> Notification:
"""Open the notification when document processing is queued.""" """Open the notification when document processing is queued."""
operation_id = msg.operation_id(document_type, document_name, search_space_id) operation_id = msg.operation_id(document_type, document_name, search_space_id)
title = f"Processing: {document_name}" title = msg.started_title(document_name)
message = "Waiting in queue" message = "Waiting in queue"
metadata = { metadata = {

View file

@ -6,6 +6,8 @@ import hashlib
from datetime import UTC, datetime from datetime import UTC, datetime
from typing import Any from typing import Any
from app.notifications.service.messages.text import format_title
def operation_id(document_type: str, filename: str, search_space_id: int) -> str: def operation_id(document_type: str, filename: str, search_space_id: int) -> str:
"""Build a unique id for a document processing run.""" """Build a unique id for a document processing run."""
@ -14,6 +16,11 @@ def operation_id(document_type: str, filename: str, search_space_id: int) -> str
return f"doc_{document_type}_{search_space_id}_{timestamp}_{filename_hash}" return f"doc_{document_type}_{search_space_id}_{timestamp}_{filename_hash}"
def started_title(document_name: str) -> str:
    """Return the notification title used when processing is queued."""
    prefix = "Processing: "
    return format_title(prefix, document_name)
def progress( def progress(
stage: str, stage: str,
stage_message: str | None = None, stage_message: str | None = None,
@ -44,11 +51,11 @@ def completion(
) -> tuple[str, str, str, dict[str, Any]]: ) -> tuple[str, str, str, dict[str, Any]]:
"""Compute the final title, message, status, and metadata for a finished run.""" """Compute the final title, message, status, and metadata for a finished run."""
if error_message: if error_message:
title = f"Failed: {document_name}" title = format_title("Failed: ", document_name)
message = f"Processing failed: {error_message}" message = f"Processing failed: {error_message}"
status = "failed" status = "failed"
else: else:
title = f"Ready: {document_name}" title = format_title("Ready: ", document_name)
message = "Now searchable!" message = "Now searchable!"
status = "completed" status = "completed"

View file

@ -2,7 +2,21 @@
from __future__ import annotations from __future__ import annotations
from app.notifications.constants import TITLE_MAX_LENGTH
def truncate(text: str, limit: int) -> str: def truncate(text: str, limit: int) -> str:
"""Return ``text`` capped at ``limit`` chars, appending an ellipsis if cut.""" """Return ``text`` capped at ``limit`` chars, appending an ellipsis if cut."""
return text[:limit] + "..." if len(text) > limit else text return text[:limit] + "..." if len(text) > limit else text
def format_title(prefix: str, text: str, *, max_length: int = TITLE_MAX_LENGTH) -> str:
    """Join ``prefix`` and ``text`` into a title of at most ``max_length`` chars.

    ``text`` is shortened when the two do not fit. It ends in ``...`` when more
    than three characters remain for it, and is cut without an ellipsis
    otherwise. When the prefix alone fills the limit, only the (possibly cut)
    prefix is returned.
    """
    room = max_length - len(prefix)
    if room <= 0:
        return prefix[:max_length]
    if len(text) > room:
        # Too little room for an ellipsis to be useful: hard-cut instead.
        text = text[:room] if room <= 3 else f"{text[: room - 3]}..."
    return f"{prefix}{text}"

View file

@ -289,6 +289,49 @@ def _etl_extract_outcome():
) )
@lru_cache(maxsize=1)
def _etl_cache_lookups():
    """Create, once, the counter for ETL parse-cache lookups."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.etl.cache.lookups",
        description="Count of ETL parse-cache lookups by outcome (hit/miss).",
    )
@lru_cache(maxsize=1)
def _etl_cache_evictions():
    """Create, once, the counter for evicted ETL parse-cache entries."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.etl.cache.evictions",
        description="Count of ETL parse-cache entries evicted, by phase.",
    )
@lru_cache(maxsize=1)
def _embedding_cache_lookups():
    """Create, once, the counter for embedding-cache lookups."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.embedding.cache.lookups",
        description="Count of embedding (chunk+embedding) cache lookups by outcome (hit/miss).",
    )
@lru_cache(maxsize=1)
def _embedding_cache_evictions():
    """Create, once, the counter for evicted embedding-cache entries."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.embedding.cache.evictions",
        description="Count of embedding cache entries evicted, by phase.",
    )
@lru_cache(maxsize=1)
def _chunk_reconcile_chunks():
    """Create, once, the counter for chunks handled by incremental re-indexing."""
    summary = (
        "Chunks handled by incremental re-indexing, by outcome "
        "(reused/embedded/deleted)."
    )
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.indexing.reconcile.chunks",
        description=summary,
    )
@lru_cache(maxsize=1) @lru_cache(maxsize=1)
def _celery_heartbeat_refreshes(): def _celery_heartbeat_refreshes():
return _get_meter().create_counter( return _get_meter().create_counter(
@ -670,6 +713,61 @@ def record_etl_extract_outcome(
) )
def record_etl_cache_lookup(
    *, etl_service: str | None, mode: str | None, outcome: str
) -> None:
    """Count one parse-cache lookup; ``outcome`` is ``hit`` or ``miss``.

    A falsy ``etl_service`` or ``mode`` is reported as ``"unknown"``.
    """
    attributes = {
        "etl.service": etl_service or "unknown",
        "mode": mode or "unknown",
        "outcome": outcome,
    }
    _add(_etl_cache_lookups(), 1, attributes)
def record_etl_cache_eviction(count: int, *, phase: str) -> None:
    """Count ``count`` evicted parse-cache entries; ``phase`` is ``ttl`` or ``size``.

    Nothing is recorded when ``count`` is zero or negative.
    """
    if count > 0:
        _add(_etl_cache_evictions(), count, {"phase": phase})
def record_embedding_cache_lookup(
    *, embedding_model: str | None, chunker_kind: str | None, outcome: str
) -> None:
    """Count one embedding-cache lookup; ``outcome`` is ``hit`` or ``miss``.

    A falsy ``embedding_model`` or ``chunker_kind`` is reported as ``"unknown"``.
    """
    attributes = {
        "embedding.model": embedding_model or "unknown",
        "chunker.kind": chunker_kind or "unknown",
        "outcome": outcome,
    }
    _add(_embedding_cache_lookups(), 1, attributes)
def record_embedding_cache_eviction(count: int, *, phase: str) -> None:
    """Count ``count`` evicted embedding-cache entries; ``phase`` is ``ttl`` or ``size``.

    Nothing is recorded when ``count`` is zero or negative.
    """
    if count > 0:
        _add(_embedding_cache_evictions(), count, {"phase": phase})
def record_chunk_reconcile(*, reused: int, embedded: int, deleted: int) -> None:
    """Count the chunks of one incremental re-index, split by outcome.

    Outcomes with a count of zero or less are not recorded.
    """
    tallies = {"reused": reused, "embedded": embedded, "deleted": deleted}
    for outcome, count in tallies.items():
        if count <= 0:
            continue
        _add(_chunk_reconcile_chunks(), count, {"outcome": outcome})
def record_celery_heartbeat_refresh(*, heartbeat_type: str) -> None: def record_celery_heartbeat_refresh(*, heartbeat_type: str) -> None:
_add(_celery_heartbeat_refreshes(), 1, {"heartbeat.type": heartbeat_type}) _add(_celery_heartbeat_refreshes(), 1, {"heartbeat.type": heartbeat_type})
@ -863,9 +961,14 @@ __all__ = [
"record_celery_queue_latency", "record_celery_queue_latency",
"record_chat_request_duration", "record_chat_request_duration",
"record_chat_request_outcome", "record_chat_request_outcome",
"record_chunk_reconcile",
"record_compaction_run", "record_compaction_run",
"record_connector_sync_duration", "record_connector_sync_duration",
"record_connector_sync_outcome", "record_connector_sync_outcome",
"record_embedding_cache_eviction",
"record_embedding_cache_lookup",
"record_etl_cache_eviction",
"record_etl_cache_lookup",
"record_etl_extract_duration", "record_etl_extract_duration",
"record_etl_extract_outcome", "record_etl_extract_outcome",
"record_indexing_document_duration", "record_indexing_document_duration",

View file

@ -47,6 +47,7 @@ from app.utils.rbac import check_permission
from .schemas import ( from .schemas import (
CreatePodcastRequest, CreatePodcastRequest,
LanguageOptions,
PodcastDetail, PodcastDetail,
PodcastSummary, PodcastSummary,
UpdateSpecRequest, UpdateSpecRequest,
@ -114,6 +115,20 @@ async def list_voices(language: str | None = None):
] ]
@router.get("/podcasts/languages", response_model=LanguageOptions)
async def list_languages():
    """Return the languages the configured TTS provider can offer.

    Responds with HTTP 503 when no TTS service is configured.
    """
    tts_service = app_config.TTS_SERVICE
    if not tts_service:
        raise HTTPException(status_code=503, detail="No TTS provider configured")

    active_provider = provider_from_service(tts_service)
    offering = get_voice_catalog().offerable_languages(active_provider)
    return LanguageOptions(
        languages=offering.languages,
        allows_custom=offering.allows_custom,
    )
@router.get("/podcasts/voices/{voice_id}/preview") @router.get("/podcasts/voices/{voice_id}/preview")
async def preview_voice( async def preview_voice(
voice_id: str, voice_id: str,

View file

@ -63,6 +63,17 @@ class VoiceOption(BaseModel):
gender: str gender: str
class LanguageOptions(BaseModel):
    """Languages the brief editor may present for the active provider.

    If ``allows_custom`` is true, ``languages`` is only a curated starting
    point: the editor also accepts any other BCP-47 tag.
    """

    # The languages offered up front.
    languages: list[str]
    # Whether tags outside ``languages`` are accepted as well.
    allows_custom: bool
class PodcastSummary(BaseModel): class PodcastSummary(BaseModel):
"""Lightweight list item.""" """Lightweight list item."""

View file

@ -6,7 +6,7 @@ configured provider via :func:`provider_from_service`.
from __future__ import annotations from __future__ import annotations
from .catalog import VoiceCatalog, get_voice_catalog from .catalog import LanguageOffering, VoiceCatalog, get_voice_catalog
from .preview import render_voice_preview from .preview import render_voice_preview
from .provider import TtsProvider, provider_from_service from .provider import TtsProvider, provider_from_service
from .voice import ANY_LANGUAGE, CatalogVoice, VoiceGender from .voice import ANY_LANGUAGE, CatalogVoice, VoiceGender
@ -14,6 +14,7 @@ from .voice import ANY_LANGUAGE, CatalogVoice, VoiceGender
__all__ = [ __all__ = [
"ANY_LANGUAGE", "ANY_LANGUAGE",
"CatalogVoice", "CatalogVoice",
"LanguageOffering",
"TtsProvider", "TtsProvider",
"VoiceCatalog", "VoiceCatalog",
"VoiceGender", "VoiceGender",

View file

@ -9,11 +9,26 @@ provider-native reference.
from __future__ import annotations from __future__ import annotations
from collections.abc import Iterable from collections.abc import Iterable
from dataclasses import dataclass
from functools import lru_cache from functools import lru_cache
from .data import AZURE_VOICES, KOKORO_VOICES, OPENAI_VOICES, VERTEX_VOICES from .data import AZURE_VOICES, KOKORO_VOICES, OPENAI_VOICES, VERTEX_VOICES
from .data.languages import COMMON_LANGUAGES
from .provider import TtsProvider from .provider import TtsProvider
from .voice import CatalogVoice from .voice import ANY_LANGUAGE, CatalogVoice
@dataclass(frozen=True, slots=True)
class LanguageOffering:
"""The languages a provider's roster can offer the brief form.
``allows_custom`` is true when the roster has wildcard voices: the listed
languages are then a curated starting point, not a limit, and any BCP-47
tag may be entered.
"""
languages: list[str]
allows_custom: bool
class VoiceCatalog: class VoiceCatalog:
@ -44,6 +59,20 @@ class VoiceCatalog:
"""Whether ``provider`` has at least one voice for ``language``.""" """Whether ``provider`` has at least one voice for ``language``."""
return any(v.speaks(language) for v in self.for_provider(provider)) return any(v.speaks(language) for v in self.for_provider(provider))
def offerable_languages(self, provider: TtsProvider) -> LanguageOffering:
    """Return the languages ``provider`` can offer up front.

    Language-bound voices contribute their concrete tags. Wildcard voices
    (those whose language is ``ANY_LANGUAGE``) cannot enumerate languages,
    so when at least one is present the curated ``COMMON_LANGUAGES`` are
    merged in and free entry is opened via ``allows_custom``.

    Args:
        provider: The TTS provider whose roster is inspected.

    Returns:
        A ``LanguageOffering`` with sorted, de-duplicated language tags and
        ``allows_custom`` set when the roster has a wildcard voice.
    """
    tags: set[str] = set()
    has_wildcard = False
    # Classify every voice in a single pass so the result does not depend on
    # ``for_provider`` returning something that can be iterated twice.
    for voice in self.for_provider(provider):
        if voice.language == ANY_LANGUAGE:
            has_wildcard = True
        else:
            tags.add(voice.language)
    if has_wildcard:
        tags.update(COMMON_LANGUAGES)
    return LanguageOffering(languages=sorted(tags), allows_custom=has_wildcard)
@lru_cache(maxsize=1) @lru_cache(maxsize=1)
def get_voice_catalog() -> VoiceCatalog: def get_voice_catalog() -> VoiceCatalog:

View file

@ -0,0 +1,33 @@
"""Curated languages offered when a roster has wildcard (any-language) voices.
OpenAI-style multilingual voices speak whatever language the text is in, so
there is no provider list to enumerate. This is the set the brief form offers
up front for such providers; it is an offering, not a limit: the API flags
``allows_custom`` so users can enter any BCP-47 tag beyond it.
"""
from __future__ import annotations
# Two-letter language tags, listed alphabetically.
COMMON_LANGUAGES: tuple[str, ...] = (
    "ar",
    "bn",
    "de",
    "en",
    "es",
    "fr",
    "hi",
    "id",
    "it",
    "ja",
    "ko",
    "nl",
    "pl",
    "pt",
    "ru",
    "sw",
    "th",
    "tr",
    "uk",
    "vi",
    "zh",
)

View file

@ -82,7 +82,7 @@ def build_configurable_system_prompt(
*, *,
model_name: str | None = None, model_name: str | None = None,
) -> str: ) -> str:
"""Build a configurable SurfSense system prompt (NewLLMConfig path). """Build a configurable SurfSense system prompt.
See :func:`app.prompts.system_prompt_composer.composer.compose_system_prompt` See :func:`app.prompts.system_prompt_composer.composer.compose_system_prompt`
for full parameter docs. for full parameter docs.
@ -104,7 +104,7 @@ def build_configurable_system_prompt(
def get_default_system_instructions() -> str: def get_default_system_instructions() -> str:
"""Return the default ``<system_instruction>`` block (no tools / citations). """Return the default ``<system_instruction>`` block (no tools / citations).
Useful for populating the UI when seeding ``NewLLMConfig.system_instructions``. Useful for populating the UI when editing custom system instructions.
The output reflects the current fragment tree, not a baked-in constant. The output reflects the current fragment tree, not a baked-in constant.
""" """
resolved_today = datetime.now(UTC).date().isoformat() resolved_today = datetime.now(UTC).date().isoformat()

View file

@ -348,8 +348,7 @@ def compose_system_prompt(
mcp_connector_tools: ``{server_name: [tool_names...]}`` to inject mcp_connector_tools: ``{server_name: [tool_names...]}`` to inject
an explicit MCP routing block. an explicit MCP routing block.
custom_system_instructions: Free-form instructions that override custom_system_instructions: Free-form instructions that override
the default ``<system_instruction>`` block (legacy support the default ``<system_instruction>`` block.
for ``NewLLMConfig.system_instructions``).
use_default_system_instructions: When ``custom_system_instructions`` use_default_system_instructions: When ``custom_system_instructions``
is empty/None, fall back to defaults (legacy semantics). is empty/None, fall back to defaults (legacy semantics).
citations_enabled: Include ``citations_on.md`` (true) or citations_enabled: Include ``citations_on.md`` (true) or

View file

@ -420,7 +420,10 @@ class ChucksHybridSearchRetriever:
select( select(
Chunk.id.label("chunk_id"), Chunk.id.label("chunk_id"),
func.row_number() func.row_number()
.over(partition_by=Chunk.document_id, order_by=Chunk.id) .over(
partition_by=Chunk.document_id,
order_by=(Chunk.position, Chunk.id),
)
.label("rn"), .label("rn"),
) )
.where(Chunk.document_id.in_(doc_ids)) .where(Chunk.document_id.in_(doc_ids))
@ -441,7 +444,7 @@ class ChucksHybridSearchRetriever:
select(Chunk.id, Chunk.content, Chunk.document_id) select(Chunk.id, Chunk.content, Chunk.document_id)
.join(numbered, Chunk.id == numbered.c.chunk_id) .join(numbered, Chunk.id == numbered.c.chunk_id)
.where(chunk_filter) .where(chunk_filter)
.order_by(Chunk.document_id, Chunk.id) .order_by(Chunk.document_id, Chunk.position, Chunk.id)
) )
t_fetch = time.perf_counter() t_fetch = time.perf_counter()

View file

@ -357,7 +357,10 @@ class DocumentHybridSearchRetriever:
select( select(
Chunk.id.label("chunk_id"), Chunk.id.label("chunk_id"),
func.row_number() func.row_number()
.over(partition_by=Chunk.document_id, order_by=Chunk.id) .over(
partition_by=Chunk.document_id,
order_by=(Chunk.position, Chunk.id),
)
.label("rn"), .label("rn"),
) )
.where(Chunk.document_id.in_(doc_ids)) .where(Chunk.document_id.in_(doc_ids))
@ -369,7 +372,7 @@ class DocumentHybridSearchRetriever:
select(Chunk.id, Chunk.content, Chunk.document_id) select(Chunk.id, Chunk.content, Chunk.document_id)
.join(numbered, Chunk.id == numbered.c.chunk_id) .join(numbered, Chunk.id == numbered.c.chunk_id)
.where(numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC) .where(numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC)
.order_by(Chunk.document_id, Chunk.id) .order_by(Chunk.document_id, Chunk.position, Chunk.id)
) )
t_fetch = time.perf_counter() t_fetch = time.perf_counter()

View file

@ -24,7 +24,10 @@ from .dropbox_add_connector_route import router as dropbox_add_connector_router
from .editor_routes import router as editor_router from .editor_routes import router as editor_router
from .export_routes import router as export_router from .export_routes import router as export_router
from .folders_routes import router as folders_router from .folders_routes import router as folders_router
from .gateway_webhook_routes import router as gateway_router from .gateway_webhook_routes import (
config_router as gateway_config_router,
router as gateway_router,
)
from .gateway_whatsapp_baileys_routes import router as gateway_whatsapp_baileys_router from .gateway_whatsapp_baileys_routes import router as gateway_whatsapp_baileys_router
from .gateway_whatsapp_webhook_routes import router as gateway_whatsapp_webhook_router from .gateway_whatsapp_webhook_routes import router as gateway_whatsapp_webhook_router
from .google_calendar_add_connector_route import ( from .google_calendar_add_connector_route import (
@ -44,9 +47,9 @@ from .logs_routes import router as logs_router
from .luma_add_connector_route import router as luma_add_connector_router from .luma_add_connector_route import router as luma_add_connector_router
from .mcp_oauth_route import router as mcp_oauth_router from .mcp_oauth_route import router as mcp_oauth_router
from .memory_routes import router as memory_router from .memory_routes import router as memory_router
from .model_connections_routes import router as model_connections_router
from .model_list_routes import router as model_list_router from .model_list_routes import router as model_list_router
from .new_chat_routes import router as new_chat_router from .new_chat_routes import router as new_chat_router
from .new_llm_config_routes import router as new_llm_config_router
from .notes_routes import router as notes_router from .notes_routes import router as notes_router
from .notion_add_connector_route import router as notion_add_connector_router from .notion_add_connector_route import router as notion_add_connector_router
from .obsidian_plugin_routes import router as obsidian_plugin_router from .obsidian_plugin_routes import router as obsidian_plugin_router
@ -63,7 +66,6 @@ from .stripe_routes import router as stripe_router
from .team_memory_routes import router as team_memory_router from .team_memory_routes import router as team_memory_router
from .teams_add_connector_route import router as teams_add_connector_router from .teams_add_connector_route import router as teams_add_connector_router
from .video_presentations_routes import router as video_presentations_router from .video_presentations_routes import router as video_presentations_router
from .vision_llm_routes import router as vision_llm_router
from .youtube_routes import router as youtube_router from .youtube_routes import router as youtube_router
router = APIRouter() router = APIRouter()
@ -75,6 +77,7 @@ router.include_router(export_router)
router.include_router(documents_router) router.include_router(documents_router)
router.include_router(folders_router) router.include_router(folders_router)
_gateway_enabled_dep = [Depends(require_gateway_enabled)] _gateway_enabled_dep = [Depends(require_gateway_enabled)]
router.include_router(gateway_config_router)
router.include_router(gateway_router, dependencies=_gateway_enabled_dep) router.include_router(gateway_router, dependencies=_gateway_enabled_dep)
router.include_router( router.include_router(
gateway_whatsapp_webhook_router, dependencies=_gateway_enabled_dep gateway_whatsapp_webhook_router, dependencies=_gateway_enabled_dep
@ -98,7 +101,6 @@ router.include_router(
) # Video presentation status and streaming ) # Video presentation status and streaming
router.include_router(reports_router) # Report CRUD and multi-format export router.include_router(reports_router) # Report CRUD and multi-format export
router.include_router(image_generation_router) # Image generation via litellm router.include_router(image_generation_router) # Image generation via litellm
router.include_router(vision_llm_router) # Vision LLM configs for screenshot analysis
router.include_router(search_source_connectors_router) router.include_router(search_source_connectors_router)
router.include_router(google_calendar_add_connector_router) router.include_router(google_calendar_add_connector_router)
router.include_router(google_gmail_add_connector_router) router.include_router(google_gmail_add_connector_router)
@ -116,7 +118,7 @@ router.include_router(jira_add_connector_router)
router.include_router(confluence_add_connector_router) router.include_router(confluence_add_connector_router)
router.include_router(clickup_add_connector_router) router.include_router(clickup_add_connector_router)
router.include_router(dropbox_add_connector_router) router.include_router(dropbox_add_connector_router)
router.include_router(new_llm_config_router) # LLM configs with prompt configuration router.include_router(model_connections_router) # Connection-centric model catalog
router.include_router(model_list_router) # Dynamic model catalogue from OpenRouter router.include_router(model_list_router) # Dynamic model catalogue from OpenRouter
router.include_router(logs_router) router.include_router(logs_router)
router.include_router(circleback_webhook_router) # Circleback meeting webhooks router.include_router(circleback_webhook_router) # Circleback meeting webhooks

View file

@ -18,6 +18,7 @@ from app.etl_pipeline.file_classifier import (
PLAINTEXT_EXTENSIONS, PLAINTEXT_EXTENSIONS,
) )
from app.rate_limiter import limiter from app.rate_limiter import limiter
from app.tasks.chat.streaming.errors.classifier import classify_stream_exception
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -98,7 +99,6 @@ class AnonQuotaResponse(BaseModel):
class AnonModelResponse(BaseModel): class AnonModelResponse(BaseModel):
id: int id: int
name: str name: str
description: str | None = None
provider: str provider: str
model_name: str model_name: str
billing_tier: str = "free" billing_tier: str = "free"
@ -131,8 +131,7 @@ async def list_anonymous_models():
AnonModelResponse( AnonModelResponse(
id=cfg.get("id", 0), id=cfg.get("id", 0),
name=cfg.get("name", ""), name=cfg.get("name", ""),
description=cfg.get("description"), provider=cfg.get("provider") or cfg.get("litellm_provider", ""),
provider=cfg.get("provider", ""),
model_name=cfg.get("model_name", ""), model_name=cfg.get("model_name", ""),
billing_tier=cfg.get("billing_tier", "free"), billing_tier=cfg.get("billing_tier", "free"),
is_premium=cfg.get("billing_tier", "free") == "premium", is_premium=cfg.get("billing_tier", "free") == "premium",
@ -160,8 +159,7 @@ async def get_anonymous_model(slug: str):
return AnonModelResponse( return AnonModelResponse(
id=cfg.get("id", 0), id=cfg.get("id", 0),
name=cfg.get("name", ""), name=cfg.get("name", ""),
description=cfg.get("description"), provider=cfg.get("provider") or cfg.get("litellm_provider", ""),
provider=cfg.get("provider", ""),
model_name=cfg.get("model_name", ""), model_name=cfg.get("model_name", ""),
billing_tier=cfg.get("billing_tier", "free"), billing_tier=cfg.get("billing_tier", "free"),
is_premium=cfg.get("billing_tier", "free") == "premium", is_premium=cfg.get("billing_tier", "free") == "premium",
@ -474,7 +472,15 @@ async def stream_anonymous_chat(
except Exception as e: except Exception as e:
logger.exception("Anonymous chat stream error") logger.exception("Anonymous chat stream error")
await TokenQuotaService.anon_release(session_key, ip_key, request_id) await TokenQuotaService.anon_release(session_key, ip_key, request_id)
yield streaming_service.format_error(f"Error during chat: {e!s}") _, error_code, _, _, user_message, extra = classify_stream_exception(
e,
flow_label="chat",
)
yield streaming_service.format_error(
user_message,
error_code=error_code,
extra=extra,
)
yield streaming_service.format_done() yield streaming_service.format_done()
finally: finally:
await TokenQuotaService.anon_release_stream_slot(client_ip) await TokenQuotaService.anon_release_stream_slot(client_ip)

Some files were not shown because too many files have changed in this diff Show more