cloud: added openrouter integration with global configs

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-04-15 23:46:29 -07:00
parent ff4e0f9b62
commit 4a51ccdc2c
26 changed files with 911 additions and 178 deletions

View file

@ -184,17 +184,17 @@ VIDEO_PRESENTATION_DEFAULT_DURATION_IN_FRAMES=300
# (Optional) Maximum pages limit per user for ETL services (default: `999999999` for unlimited in OSS version)
PAGES_LIMIT=500
# Premium token quota per registered user (default: 5,000,000)
# Premium token quota per registered user (default: 3,000,000)
# Applies only to models with billing_tier=premium in global_llm_config.yaml
PREMIUM_TOKEN_LIMIT=5000000
PREMIUM_TOKEN_LIMIT=3000000
# No-login (anonymous) mode — allows public users to chat without an account
# Set TRUE to enable /free pages and anonymous chat API
NOLOGIN_MODE_ENABLED=FALSE
# Total tokens allowed per anonymous session before requiring account creation
ANON_TOKEN_LIMIT=1000000
ANON_TOKEN_LIMIT=500000
# Token count at which the UI shows a soft warning
ANON_TOKEN_WARNING_THRESHOLD=800000
ANON_TOKEN_WARNING_THRESHOLD=400000
# Days before anonymous quota tracking expires in Redis
ANON_TOKEN_QUOTA_TTL_DAYS=30
# Max document upload size for anonymous users (MB)

View file

@ -22,7 +22,11 @@ from .chat_deepagent import create_surfsense_deep_agent
from .context import SurfSenseContextSchema
# LLM config
from .llm_config import create_chat_litellm_from_config, load_llm_config_from_yaml
from .llm_config import (
create_chat_litellm_from_config,
load_global_llm_config_by_id,
load_llm_config_from_yaml,
)
# Middleware
from .middleware import (
@ -81,6 +85,7 @@ __all__ = [
"get_all_tool_names",
"get_default_enabled_tools",
"get_tool_by_name",
"load_global_llm_config_by_id",
"load_llm_config_from_yaml",
"search_knowledge_base_async",
]

View file

@ -10,10 +10,18 @@ It also provides utilities for creating ChatLiteLLM instances and
managing prompt configurations.
"""
from collections.abc import AsyncIterator
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import yaml
from langchain_core.callbacks import (
AsyncCallbackManagerForLLMRun,
CallbackManagerForLLMRun,
)
from langchain_core.messages import AIMessage, BaseMessage
from langchain_core.outputs import ChatGenerationChunk, ChatResult
from langchain_litellm import ChatLiteLLM
from litellm import get_model_info
from sqlalchemy import select
@ -23,10 +31,64 @@ from app.services.llm_router_service import (
AUTO_MODE_ID,
ChatLiteLLMRouter,
LLMRouterService,
_sanitize_content,
get_auto_mode_llm,
is_auto_mode,
)
def _sanitize_messages(messages: list[BaseMessage]) -> list[BaseMessage]:
    """Sanitize content on every message so it is safe for any provider.

    Handles three cross-provider incompatibilities:

    - List content with provider-specific blocks (e.g. ``thinking``)
    - List content with bare strings or empty text blocks
    - AI messages with empty content + tool calls: some providers (Bedrock)
      convert ``""`` to ``[{"type":"text","text":""}]`` server-side then
      reject the blank text. The OpenAI spec says ``content`` should be
      ``null`` when an assistant message only carries tool calls.

    Note: the messages are mutated in place; the same list object is
    returned for convenient chaining.
    """
    for msg in messages:
        if isinstance(msg.content, list):
            msg.content = _sanitize_content(msg.content)
        # `not msg.content` already covers None, "" and [] — the previous
        # extra `msg.content == ""` comparison was redundant.
        if (
            isinstance(msg, AIMessage)
            and not msg.content
            and getattr(msg, "tool_calls", None)
        ):
            msg.content = None  # type: ignore[assignment]
    return messages
class SanitizedChatLiteLLM(ChatLiteLLM):
    """Drop-in replacement for ``ChatLiteLLM`` that cleans the outgoing
    conversation first: provider-specific content blocks (such as
    ``thinking`` emitted by reasoning models) are removed and bare strings
    inside content arrays are normalised before the provider is called."""

    def _generate(
        self,
        messages: list[BaseMessage],
        stop: list[str] | None = None,
        run_manager: CallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> ChatResult:
        # Clean the history, then defer entirely to the parent implementation.
        cleaned = _sanitize_messages(messages)
        return super()._generate(cleaned, stop, run_manager, **kwargs)

    async def _astream(
        self,
        messages: list[BaseMessage],
        stop: list[str] | None = None,
        run_manager: AsyncCallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> AsyncIterator[ChatGenerationChunk]:
        # Same sanitation for the streaming path; chunks pass through untouched.
        cleaned = _sanitize_messages(messages)
        async for chunk in super()._astream(cleaned, stop, run_manager, **kwargs):
            yield chunk
# Provider mapping for LiteLLM model string construction
PROVIDER_MAP = {
"OPENAI": "openai",
@ -252,6 +314,28 @@ def load_llm_config_from_yaml(llm_config_id: int = -1) -> dict | None:
return None
def load_global_llm_config_by_id(llm_config_id: int) -> dict | None:
    """
    Resolve a global LLM config by its (negative) ID.

    The in-memory ``config.GLOBAL_LLM_CONFIGS`` list is consulted first so
    that dynamically injected entries (e.g. OpenRouter integration models
    that never touch the YAML file) can be found; the YAML file is read
    only as a last resort.

    Args:
        llm_config_id: The negative ID of the global config to load

    Returns:
        LLM config dict or None if not found
    """
    from app.config import config as app_config

    matched = next(
        (
            entry
            for entry in app_config.GLOBAL_LLM_CONFIGS
            if entry.get("id") == llm_config_id
        ),
        None,
    )
    if matched is not None:
        return matched
    # Last resort: re-read the YAML file (covers edge cases like hot-reload).
    return load_llm_config_from_yaml(llm_config_id)
async def load_new_llm_config_from_db(
session: AsyncSession,
config_id: int,
@ -359,7 +443,13 @@ async def load_agent_config(
return AgentConfig.from_auto_mode()
if config_id < 0:
# Load from YAML (global configs have negative IDs)
# Check in-memory configs first (includes static YAML + dynamic OpenRouter)
from app.config import config as app_config
for cfg in app_config.GLOBAL_LLM_CONFIGS:
if cfg.get("id") == config_id:
return AgentConfig.from_yaml_config(cfg)
# Fallback to YAML file read for safety
yaml_config = load_llm_config_from_yaml(config_id)
if yaml_config:
return AgentConfig.from_yaml_config(yaml_config)
@ -402,7 +492,7 @@ def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
if llm_config.get("litellm_params"):
litellm_kwargs.update(llm_config["litellm_params"])
llm = ChatLiteLLM(**litellm_kwargs)
llm = SanitizedChatLiteLLM(**litellm_kwargs)
_attach_model_profile(llm, model_string)
return llm
@ -457,6 +547,6 @@ def create_chat_litellm_from_agent_config(
if agent_config.litellm_params:
litellm_kwargs.update(agent_config.litellm_params)
llm = ChatLiteLLM(**litellm_kwargs)
llm = SanitizedChatLiteLLM(**litellm_kwargs)
_attach_model_profile(llm, model_string)
return llm

View file

@ -13,6 +13,7 @@ from litellm import aspeech
from app.config import config as app_config
from app.services.kokoro_tts_service import get_kokoro_tts_service
from app.services.llm_service import get_agent_llm
from app.utils.content_utils import extract_text_content, strip_markdown_fences
from .configuration import Configuration
from .prompts import get_podcast_generation_prompt
@ -53,43 +54,32 @@ async def create_podcast_transcript(
# Generate the podcast transcript
llm_response = await llm.ainvoke(messages)
# First try the direct approach
# Reasoning models (e.g. Kimi K2.5) may return content as a list of
# blocks including 'reasoning' entries. Normalise to a plain string.
content = strip_markdown_fences(extract_text_content(llm_response.content))
try:
podcast_transcript = PodcastTranscripts.model_validate(
json.loads(llm_response.content)
)
except (json.JSONDecodeError, ValueError) as e:
podcast_transcript = PodcastTranscripts.model_validate(json.loads(content))
except (json.JSONDecodeError, TypeError, ValueError) as e:
print(f"Direct JSON parsing failed, trying fallback approach: {e!s}")
# Fallback: Parse the JSON response manually
try:
# Extract JSON content from the response
content = llm_response.content
# Find the JSON in the content (handle case where LLM might add additional text)
json_start = content.find("{")
json_end = content.rfind("}") + 1
if json_start >= 0 and json_end > json_start:
json_str = content[json_start:json_end]
# Parse the JSON string
parsed_data = json.loads(json_str)
# Convert to Pydantic model
podcast_transcript = PodcastTranscripts.model_validate(parsed_data)
print("Successfully parsed podcast transcript using fallback approach")
else:
# If JSON structure not found, raise a clear error
error_message = f"Could not find valid JSON in LLM response. Raw response: {content}"
print(error_message)
raise ValueError(error_message)
except (json.JSONDecodeError, ValueError) as e2:
# Log the error and re-raise it
except (json.JSONDecodeError, TypeError, ValueError) as e2:
error_message = f"Error parsing LLM response (fallback also failed): {e2!s}"
print(f"Error parsing LLM response: {e2!s}")
print(f"Raw response: {llm_response.content}")
print(f"Raw response: {content}")
raise
return {"podcast_transcript": podcast_transcript.podcast_transcripts}

View file

@ -16,6 +16,7 @@ from litellm import aspeech
from app.config import config as app_config
from app.services.kokoro_tts_service import get_kokoro_tts_service
from app.services.llm_service import get_agent_llm
from app.utils.content_utils import extract_text_content, strip_markdown_fences
from .configuration import Configuration
from .prompts import (
@ -67,16 +68,14 @@ async def create_presentation_slides(
]
llm_response = await llm.ainvoke(messages)
content = strip_markdown_fences(extract_text_content(llm_response.content))
try:
presentation = PresentationSlides.model_validate(
json.loads(llm_response.content)
)
except (json.JSONDecodeError, ValueError) as e:
presentation = PresentationSlides.model_validate(json.loads(content))
except (json.JSONDecodeError, TypeError, ValueError) as e:
print(f"Direct JSON parsing failed, trying fallback approach: {e!s}")
try:
content = llm_response.content
json_start = content.find("{")
json_end = content.rfind("}") + 1
if json_start >= 0 and json_end > json_start:
@ -89,10 +88,10 @@ async def create_presentation_slides(
print(error_message)
raise ValueError(error_message)
except (json.JSONDecodeError, ValueError) as e2:
except (json.JSONDecodeError, TypeError, ValueError) as e2:
error_message = f"Error parsing LLM response (fallback also failed): {e2!s}"
print(f"Error parsing LLM response: {e2!s}")
print(f"Raw response: {llm_response.content}")
print(f"Raw response: {content}")
raise
return {"slides": presentation.slides}
@ -308,12 +307,7 @@ async def _assign_themes_with_llm(
]
)
text = response.content.strip()
if text.startswith("```"):
lines = text.split("\n")
text = "\n".join(
line for line in lines if not line.strip().startswith("```")
).strip()
text = strip_markdown_fences(extract_text_content(response.content))
assignments = json.loads(text)
valid_themes = set(THEME_PRESETS)
@ -424,7 +418,9 @@ async def generate_slide_scene_codes(
)
llm_response = await llm.ainvoke(messages)
code, scene_title = _extract_code_and_title(llm_response.content)
code, scene_title = _extract_code_and_title(
extract_text_content(llm_response.content)
)
code = await _refine_if_needed(llm, code, slide.slide_number)
@ -452,7 +448,7 @@ def _extract_code_and_title(content: str) -> tuple[str, str | None]:
Returns (code, title) where title may be None.
"""
text = content.strip()
text = strip_markdown_fences(content)
if text.startswith("{"):
try:
@ -472,18 +468,7 @@ def _extract_code_and_title(content: str) -> tuple[str, str | None]:
except (json.JSONDecodeError, ValueError):
pass
code = text
if code.startswith("```"):
lines = code.split("\n")
start = 1
end = len(lines)
for i in range(len(lines) - 1, 0, -1):
if lines[i].strip().startswith("```"):
end = i
break
code = "\n".join(lines[start:end]).strip()
return code, None
return text, None
async def _refine_if_needed(llm, code: str, slide_number: int) -> str:
@ -512,7 +497,7 @@ async def _refine_if_needed(llm, code: str, slide_number: int) -> str:
]
response = await llm.ainvoke(messages)
code, _ = _extract_code_and_title(response.content)
code, _ = _extract_code_and_title(extract_text_content(response.content))
error = _basic_syntax_check(code)
if error is None:

View file

@ -30,6 +30,7 @@ from app.config import (
config,
initialize_image_gen_router,
initialize_llm_router,
initialize_openrouter_integration,
initialize_vision_llm_router,
)
from app.db import User, create_db_and_tables, get_async_session
@ -368,6 +369,26 @@ def _enable_slow_callback_logging(threshold_sec: float = 0.5) -> None:
)
def _start_openrouter_background_refresh() -> None:
    """Start periodic OpenRouter model refresh if integration is enabled."""
    from app.services.openrouter_integration_service import OpenRouterIntegrationService

    # Nothing to do unless the integration was initialized at startup.
    if not OpenRouterIntegrationService.is_initialized():
        return
    settings = config.OPENROUTER_INTEGRATION_SETTINGS
    if not settings:
        return
    hours = settings.get("refresh_interval_hours", 24)
    OpenRouterIntegrationService.get_instance().start_background_refresh(hours)
def _stop_openrouter_background_refresh() -> None:
    """Cancel the periodic OpenRouter refresh task on shutdown."""
    from app.services.openrouter_integration_service import OpenRouterIntegrationService

    # Only a previously initialized service owns a refresh task to cancel.
    if not OpenRouterIntegrationService.is_initialized():
        return
    OpenRouterIntegrationService.get_instance().stop_background_refresh()
@asynccontextmanager
async def lifespan(app: FastAPI):
# Tune GC: lower gen-2 threshold so long-lived garbage is collected
@ -378,6 +399,8 @@ async def lifespan(app: FastAPI):
_enable_slow_callback_logging(threshold_sec=0.5)
await create_db_and_tables()
await setup_checkpointer_tables()
initialize_openrouter_integration()
_start_openrouter_background_refresh()
initialize_llm_router()
initialize_image_gen_router()
initialize_vision_llm_router()
@ -393,6 +416,7 @@ async def lifespan(app: FastAPI):
yield
_stop_openrouter_background_refresh()
await close_checkpointer()

View file

@ -21,9 +21,11 @@ def init_worker(**kwargs):
from app.config import (
initialize_image_gen_router,
initialize_llm_router,
initialize_openrouter_integration,
initialize_vision_llm_router,
)
initialize_openrouter_integration()
initialize_llm_router()
initialize_image_gen_router()
initialize_vision_llm_router()

View file

@ -187,24 +187,82 @@ def load_image_gen_router_settings():
return default_settings
def load_openrouter_integration_settings() -> dict | None:
    """
    Load OpenRouter integration settings from the YAML config.

    Returns:
        dict with settings if present and enabled, None otherwise
    """
    global_config_file = BASE_DIR / "app" / "config" / "global_llm_config.yaml"
    if not global_config_file.exists():
        return None
    try:
        with open(global_config_file, encoding="utf-8") as f:
            # An empty YAML file parses to None — normalise to {} so the
            # .get() below cannot raise AttributeError (which was previously
            # swallowed by the broad except and logged as a load failure).
            data = yaml.safe_load(f) or {}
        settings = data.get("openrouter_integration")
        if settings and settings.get("enabled"):
            return settings
        return None
    except Exception as e:
        print(f"Warning: Failed to load OpenRouter integration settings: {e}")
        return None
def initialize_openrouter_integration():
    """
    If enabled, fetch all OpenRouter models and append them to
    config.GLOBAL_LLM_CONFIGS as dynamic premium entries.

    Should be called BEFORE initialize_llm_router() so the router
    correctly excludes premium models from Auto mode.
    """
    settings = load_openrouter_integration_settings()
    if settings is None:
        return
    try:
        from app.services.openrouter_integration_service import (
            OpenRouterIntegrationService,
        )

        fetched = OpenRouterIntegrationService.get_instance().initialize(settings)
        if not fetched:
            print("Info: OpenRouter integration enabled but no models fetched")
            return
        config.GLOBAL_LLM_CONFIGS.extend(fetched)
        tier = settings.get("billing_tier", "premium")
        print(
            f"Info: OpenRouter integration added {len(fetched)} models "
            f"(billing_tier={tier})"
        )
    except Exception as e:
        print(f"Warning: Failed to initialize OpenRouter integration: {e}")
def initialize_llm_router():
"""
Initialize the LLM Router service for Auto mode.
This should be called during application startup.
This should be called during application startup, AFTER
initialize_openrouter_integration() so dynamic models are included.
Uses config.GLOBAL_LLM_CONFIGS (in-memory) which includes both
static YAML configs and dynamic OpenRouter models.
"""
global_configs = load_global_llm_configs()
all_configs = config.GLOBAL_LLM_CONFIGS
router_settings = load_router_settings()
if not global_configs:
if not all_configs:
print("Info: No global LLM configs found, Auto mode will not be available")
return
try:
from app.services.llm_router_service import LLMRouterService
LLMRouterService.initialize(global_configs, router_settings)
LLMRouterService.initialize(all_configs, router_settings)
print(
f"Info: LLM Router initialized with {len(global_configs)} models "
f"Info: LLM Router initialized with {len(all_configs)} models "
f"(strategy: {router_settings.get('routing_strategy', 'usage-based-routing')})"
)
except Exception as e:
@ -326,7 +384,7 @@ class Config:
)
# Premium token quota settings
PREMIUM_TOKEN_LIMIT = int(os.getenv("PREMIUM_TOKEN_LIMIT", "5000000"))
PREMIUM_TOKEN_LIMIT = int(os.getenv("PREMIUM_TOKEN_LIMIT", "3000000"))
STRIPE_PREMIUM_TOKEN_PRICE_ID = os.getenv("STRIPE_PREMIUM_TOKEN_PRICE_ID")
STRIPE_TOKENS_PER_UNIT = int(os.getenv("STRIPE_TOKENS_PER_UNIT", "1000000"))
STRIPE_TOKEN_BUYING_ENABLED = (
@ -335,9 +393,9 @@ class Config:
# Anonymous / no-login mode settings
NOLOGIN_MODE_ENABLED = os.getenv("NOLOGIN_MODE_ENABLED", "FALSE").upper() == "TRUE"
ANON_TOKEN_LIMIT = int(os.getenv("ANON_TOKEN_LIMIT", "1000000"))
ANON_TOKEN_LIMIT = int(os.getenv("ANON_TOKEN_LIMIT", "500000"))
ANON_TOKEN_WARNING_THRESHOLD = int(
os.getenv("ANON_TOKEN_WARNING_THRESHOLD", "800000")
os.getenv("ANON_TOKEN_WARNING_THRESHOLD", "400000")
)
ANON_TOKEN_QUOTA_TTL_DAYS = int(os.getenv("ANON_TOKEN_QUOTA_TTL_DAYS", "30"))
ANON_MAX_UPLOAD_SIZE_MB = int(os.getenv("ANON_MAX_UPLOAD_SIZE_MB", "5"))
@ -450,6 +508,9 @@ class Config:
# Router settings for Vision LLM Auto mode
VISION_LLM_ROUTER_SETTINGS = load_vision_llm_router_settings()
# OpenRouter Integration settings (optional)
OPENROUTER_INTEGRATION_SETTINGS = load_openrouter_integration_settings()
# Chonkie Configuration | Edit this to your needs
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
# Azure OpenAI credentials from environment variables

View file

@ -58,13 +58,13 @@ global_llm_configs:
api_key: "sk-your-openai-api-key-here"
api_base: ""
# Rate limits for load balancing (requests/tokens per minute)
rpm: 500 # Requests per minute
tpm: 100000 # Tokens per minute
rpm: 500 # Requests per minute
tpm: 100000 # Tokens per minute
litellm_params:
temperature: 0.7
max_tokens: 4000
# Prompt Configuration
system_instructions: "" # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
system_instructions: "" # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
use_default_system_instructions: true
citations_enabled: true
@ -103,14 +103,14 @@ global_llm_configs:
model_name: "gpt-3.5-turbo"
api_key: "sk-your-openai-api-key-here"
api_base: ""
rpm: 3500 # GPT-3.5 has higher rate limits
rpm: 3500 # GPT-3.5 has higher rate limits
tpm: 200000
litellm_params:
temperature: 0.5
max_tokens: 2000
system_instructions: ""
use_default_system_instructions: true
citations_enabled: false # Disabled for faster responses
citations_enabled: false # Disabled for faster responses
# Example: Chinese LLM - DeepSeek with custom instructions
- id: -4
@ -158,7 +158,7 @@ global_llm_configs:
model_name: "azure/gpt-4o-deployment"
api_key: "your-azure-api-key-here"
api_base: "https://your-resource.openai.azure.com"
api_version: "2024-02-15-preview" # Azure API version
api_version: "2024-02-15-preview" # Azure API version
rpm: 1000
tpm: 150000
litellm_params:
@ -191,7 +191,7 @@ global_llm_configs:
litellm_params:
temperature: 0.7
max_tokens: 4000
base_model: "gpt-4-turbo" # Maps to gpt-4-turbo-preview
base_model: "gpt-4-turbo" # Maps to gpt-4-turbo-preview
system_instructions: ""
use_default_system_instructions: true
citations_enabled: true
@ -209,7 +209,7 @@ global_llm_configs:
model_name: "llama3-70b-8192"
api_key: "your-groq-api-key-here"
api_base: ""
rpm: 30 # Groq has lower rate limits on free tier
rpm: 30 # Groq has lower rate limits on free tier
tpm: 14400
litellm_params:
temperature: 0.7
@ -234,12 +234,48 @@ global_llm_configs:
rpm: 60
tpm: 100000
litellm_params:
temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0], cannot be 0
temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0], cannot be 0
max_tokens: 4000
system_instructions: ""
use_default_system_instructions: true
citations_enabled: true
# =============================================================================
# OpenRouter Integration
# =============================================================================
# When enabled, dynamically fetches ALL available models from the OpenRouter API
# and injects them as global configs. This gives premium users access to any model
# on OpenRouter (Claude, Gemini, Llama, Mistral, etc.) via their premium token quota.
# Models are fetched at startup and refreshed periodically in the background.
# All calls go through LiteLLM with the openrouter/ prefix.
openrouter_integration:
enabled: false
api_key: "sk-or-your-openrouter-api-key"
# billing_tier: "premium" or "free". Controls whether users need premium tokens.
billing_tier: "premium"
# anonymous_enabled: set true to also show OpenRouter models to no-login users
anonymous_enabled: false
seo_enabled: false
# quota_reserve_tokens: tokens reserved per call for quota enforcement
quota_reserve_tokens: 4000
# id_offset: starting negative ID for dynamically generated configs.
# Must not overlap with your static global_llm_configs IDs above.
id_offset: -10000
# refresh_interval_hours: how often to re-fetch models from OpenRouter (0 = startup only)
refresh_interval_hours: 24
# rpm/tpm: Applied uniformly to all OpenRouter models for LiteLLM Router load balancing.
# OpenRouter doesn't expose per-model rate limits via API; actual throttling is handled
# upstream by OpenRouter itself (your account limits are at https://openrouter.ai/settings/limits).
# These values only matter if you set billing_tier to "free" (adding them to Auto mode).
# For premium-only models they are cosmetic. Set conservatively or match your account tier.
rpm: 200
tpm: 1000000
litellm_params:
max_tokens: 16384
system_instructions: ""
use_default_system_instructions: true
citations_enabled: true
# =============================================================================
# Image Generation Configuration
# =============================================================================
@ -265,7 +301,7 @@ global_image_generation_configs:
model_name: "dall-e-3"
api_key: "sk-your-openai-api-key-here"
api_base: ""
rpm: 50 # Requests per minute (image gen is rate-limited by RPM, not tokens)
rpm: 50 # Requests per minute (image gen is rate-limited by RPM, not tokens)
litellm_params: {}
# Example: OpenAI GPT Image 1

View file

@ -49,6 +49,49 @@ def _is_context_overflow_error(exc: LiteLLMBadRequestError) -> bool:
return bool(_CONTEXT_OVERFLOW_PATTERNS.search(str(exc)))
_UNIVERSAL_CONTENT_TYPES = {
"text",
"image_url",
"input_audio",
"refusal",
"audio",
"file",
}
def _sanitize_content(content: Any) -> Any:
"""Normalise a LangChain message ``content`` field so it is safe for any
downstream provider (Azure, OpenAI, OpenRouter, etc.).
* Strips provider-specific block types (e.g. ``thinking`` from reasoning models).
* Removes text blocks with blank text (Bedrock rejects ``{"type":"text","text":""}``)
* Converts bare strings inside a list to ``{"type": "text", "text": ...}`` objects
(Azure rejects raw strings in a content array).
* Collapses a single-text-block list to a plain string for maximum compatibility.
"""
if not isinstance(content, list):
return content
filtered: list[dict] = []
for block in content:
if isinstance(block, str):
if block:
filtered.append({"type": "text", "text": block})
elif isinstance(block, dict):
block_type = block.get("type", "text")
if block_type not in _UNIVERSAL_CONTENT_TYPES:
continue
if block_type == "text" and not block.get("text"):
continue
filtered.append(block)
if not filtered:
return ""
if len(filtered) == 1 and filtered[0].get("type") == "text":
return filtered[0].get("text", "")
return filtered
# Special ID for Auto mode - uses router for load balancing
AUTO_MODE_ID = 0
@ -103,6 +146,7 @@ class LLMRouterService:
_model_list: list[dict] = []
_router_settings: dict = {}
_initialized: bool = False
_premium_model_strings: set[str] = set()
def __new__(cls):
if cls._instance is None:
@ -135,22 +179,28 @@ class LLMRouterService:
logger.debug("LLM Router already initialized, skipping")
return
auto_configs = [
c for c in global_configs if c.get("billing_tier", "free") != "premium"
]
model_list = []
for config in auto_configs:
premium_models: set[str] = set()
for config in global_configs:
deployment = cls._config_to_deployment(config)
if deployment:
model_list.append(deployment)
if config.get("billing_tier") == "premium":
model_string = deployment["litellm_params"]["model"]
premium_models.add(model_string)
if not model_list:
logger.warning("No valid LLM configs found for router initialization")
return
instance._model_list = model_list
instance._premium_model_strings = premium_models
instance._router_settings = router_settings or {}
logger.info(
"Router pool: %d deployments (%d premium)",
len(model_list),
len(premium_models),
)
# Default router settings optimized for rate limit handling
default_settings = {
@ -197,6 +247,21 @@ class LLMRouterService:
logger.error(f"Failed to initialize LLM Router: {e}")
instance._router = None
@classmethod
def is_premium_model(cls, model_string: str) -> bool:
    """True when *model_string* (the model name LiteLLM reports for a call)
    matches a premium-tier deployment registered in the router pool."""
    return model_string in cls.get_instance()._premium_model_strings
@classmethod
def compute_premium_tokens(cls, calls: list) -> int:
    """Sum ``total_tokens`` across the calls that hit a premium model."""
    premium = cls.get_instance()._premium_model_strings
    return sum(call.total_tokens for call in calls if call.model in premium)
@classmethod
def _build_context_fallback_groups(
cls, model_list: list[dict]
@ -1044,10 +1109,12 @@ class ChatLiteLLMRouter(BaseChatModel):
result.append({"role": "user", "content": msg.content})
elif isinstance(msg, AIMsg):
ai_msg: dict[str, Any] = {"role": "assistant"}
if msg.content:
ai_msg["content"] = msg.content
# Handle tool calls
if hasattr(msg, "tool_calls") and msg.tool_calls:
has_tool_calls = hasattr(msg, "tool_calls") and msg.tool_calls
sanitized = _sanitize_content(msg.content) if msg.content else ""
ai_msg["content"] = sanitized if sanitized else ""
if has_tool_calls:
ai_msg["tool_calls"] = [
{
"id": tc.get("id", ""),

View file

@ -6,6 +6,7 @@ from langchain_litellm import ChatLiteLLM
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.agents.new_chat.llm_config import SanitizedChatLiteLLM
from app.config import config
from app.db import NewLLMConfig, SearchSpace
from app.services.llm_router_service import (
@ -150,7 +151,7 @@ async def validate_llm_config(
if litellm_params:
litellm_kwargs.update(litellm_params)
llm = ChatLiteLLM(**litellm_kwargs)
llm = SanitizedChatLiteLLM(**litellm_kwargs)
# Make a simple test call
test_message = HumanMessage(content="Hello")
@ -302,7 +303,7 @@ async def get_search_space_llm_instance(
if disable_streaming:
litellm_kwargs["disable_streaming"] = True
return ChatLiteLLM(**litellm_kwargs)
return SanitizedChatLiteLLM(**litellm_kwargs)
# Get the LLM configuration from database (NewLLMConfig)
result = await session.execute(
@ -379,7 +380,7 @@ async def get_search_space_llm_instance(
if disable_streaming:
litellm_kwargs["disable_streaming"] = True
return ChatLiteLLM(**litellm_kwargs)
return SanitizedChatLiteLLM(**litellm_kwargs)
except Exception as e:
logger.error(
@ -480,7 +481,7 @@ async def get_vision_llm(
if global_cfg.get("litellm_params"):
litellm_kwargs.update(global_cfg["litellm_params"])
return ChatLiteLLM(**litellm_kwargs)
return SanitizedChatLiteLLM(**litellm_kwargs)
result = await session.execute(
select(VisionLLMConfig).where(
@ -513,7 +514,7 @@ async def get_vision_llm(
if vision_cfg.litellm_params:
litellm_kwargs.update(vision_cfg.litellm_params)
return ChatLiteLLM(**litellm_kwargs)
return SanitizedChatLiteLLM(**litellm_kwargs)
except Exception as e:
logger.error(

View file

@ -86,12 +86,34 @@ def _is_text_output_model(model: dict) -> bool:
return output_mods == ["text"]
def _supports_tool_calling(model: dict) -> bool:
"""Return True if the model supports function/tool calling."""
supported = model.get("supported_parameters") or []
return "tools" in supported
MIN_CONTEXT_LENGTH = 100_000
def _has_sufficient_context(model: dict) -> bool:
"""Return True if the model's context window is at least MIN_CONTEXT_LENGTH."""
ctx = model.get("context_length") or 0
return ctx >= MIN_CONTEXT_LENGTH
def _is_allowed_model(model: dict) -> bool:
    """Delegate to the OpenRouter integration service's exclusion list so
    both modules filter with the same rules."""
    from app.services.openrouter_integration_service import (
        _is_allowed_model as _service_filter,
    )

    return _service_filter(model)
def _process_models(raw_models: list[dict]) -> list[dict]:
"""
Transform raw OpenRouter model entries into a flat list of
{value, label, provider, context_window} dicts.
Only text-output models are included (audio/image generators are skipped).
Only text-output models with tool-calling support are included.
Each OpenRouter model is emitted once for OPENROUTER (full id) and,
when the slug maps to a native provider, once more with just the
@ -110,6 +132,15 @@ def _process_models(raw_models: list[dict]) -> list[dict]:
if not _is_text_output_model(model):
continue
if not _supports_tool_calling(model):
continue
if not _has_sufficient_context(model):
continue
if not _is_allowed_model(model):
continue
provider_slug, model_name = model_id.split("/", 1)
context_window = _format_context_length(context_length)

View file

@ -0,0 +1,291 @@
"""
OpenRouter Integration Service
Dynamically fetches all available models from the OpenRouter public API
and generates virtual global LLM config entries. These entries are injected
into config.GLOBAL_LLM_CONFIGS so they appear alongside static YAML configs
in the model selector.
All actual LLM calls go through LiteLLM with the ``openrouter/`` prefix --
this service only manages the catalogue, not the inference path.
"""
import asyncio
import logging
import threading
from typing import Any
import httpx
logger = logging.getLogger(__name__)
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/models"
# Sentinel value stored on each generated config so we can distinguish
# dynamic OpenRouter entries from hand-written YAML entries during refresh.
_OPENROUTER_DYNAMIC_MARKER = "__openrouter_dynamic__"
def _is_text_output_model(model: dict) -> bool:
"""Return True if the model produces text output only (skip image/audio generators)."""
output_mods = model.get("architecture", {}).get("output_modalities", [])
return output_mods == ["text"]
def _supports_tool_calling(model: dict) -> bool:
"""Return True if the model supports function/tool calling."""
supported = model.get("supported_parameters") or []
return "tools" in supported
# Minimum `context_length` a model must report to be injected as a config.
MIN_CONTEXT_LENGTH = 100_000
# Provider slugs whose backend is fundamentally incompatible with our agent's
# tool-call message flow (e.g. Amazon Bedrock requires toolConfig alongside
# tool history which OpenRouter doesn't relay).
_EXCLUDED_PROVIDER_SLUGS = {"amazon"}
# Specific model IDs to skip regardless of provider; see per-entry reasons.
_EXCLUDED_MODEL_IDS: set[str] = {
# Deprecated / removed upstream
"openai/gpt-4-1106-preview",
"openai/gpt-4-turbo-preview",
# Permanently no-capacity variant
"openai/gpt-4o:extended",
# Non-serverless model that requires a dedicated endpoint
"arcee-ai/virtuoso-large",
# Deep-research models reject standard params (temperature, etc.)
"openai/o3-deep-research",
"openai/o4-mini-deep-research",
}
# Matched against the model ID with any ":variant" suffix stripped first.
_EXCLUDED_MODEL_SUFFIXES: tuple[str, ...] = ("-deep-research",)
def _has_sufficient_context(model: dict) -> bool:
    """Return True if the model's context window is at least MIN_CONTEXT_LENGTH."""
    return (model.get("context_length") or 0) >= MIN_CONTEXT_LENGTH
def _is_compatible_provider(model: dict) -> bool:
    """Return False for models from providers known to be incompatible."""
    model_id = model.get("id", "")
    # Provider slug is everything before the first "/"; IDs without a slash
    # have no slug at all.
    slug, sep, _ = model_id.partition("/")
    if not sep:
        slug = ""
    return slug not in _EXCLUDED_PROVIDER_SLUGS
def _is_allowed_model(model: dict) -> bool:
    """Return False for specific model IDs known to be broken or incompatible.

    Checks the exact ID against the block list first, then the base ID
    (variant tag after ":" stripped) against the excluded suffixes.
    """
    model_id = model.get("id", "")
    if model_id in _EXCLUDED_MODEL_IDS:
        return False
    base_id, _sep, _variant = model_id.partition(":")
    return not base_id.endswith(_EXCLUDED_MODEL_SUFFIXES)
def _fetch_models_sync() -> list[dict] | None:
    """Blocking fetch of the OpenRouter model catalogue.

    Used during startup, before any event loop is running (e.g. Celery
    workers). Returns the raw model list, or None on any failure.
    """
    try:
        with httpx.Client(timeout=20) as client:
            resp = client.get(OPENROUTER_API_URL)
            resp.raise_for_status()
            return resp.json().get("data", [])
    except Exception as exc:
        logger.warning("Failed to fetch OpenRouter models (sync): %s", exc)
        return None
async def _fetch_models_async() -> list[dict] | None:
    """Non-blocking fetch of the OpenRouter model catalogue (background refresh).

    Returns the raw model list, or None on any failure.
    """
    try:
        async with httpx.AsyncClient(timeout=20) as client:
            resp = await client.get(OPENROUTER_API_URL)
            resp.raise_for_status()
            return resp.json().get("data", [])
    except Exception as exc:
        logger.warning("Failed to fetch OpenRouter models (async): %s", exc)
        return None
def _generate_configs(
    raw_models: list[dict],
    settings: dict[str, Any],
) -> list[dict]:
    """
    Turn raw OpenRouter catalogue entries into global LLM config dicts.

    Only text-output, tool-capable, large-context models from compatible
    providers are kept. Sorting by model ID makes the synthetic negative ID
    assignment deterministic and stable across restarts and refreshes.
    """
    # Resolve all settings once, up front.
    base_id: int = settings.get("id_offset", -10000)
    key: str = settings.get("api_key", "")
    tier: str = settings.get("billing_tier", "premium")
    anon: bool = settings.get("anonymous_enabled", False)
    seo: bool = settings.get("seo_enabled", False)
    reserve: int = settings.get("quota_reserve_tokens", 4000)
    rpm_limit: int = settings.get("rpm", 200)
    tpm_limit: int = settings.get("tpm", 1000000)
    extra_params: dict = settings.get("litellm_params") or {}
    instructions: str = settings.get("system_instructions", "")
    default_instructions: bool = settings.get("use_default_system_instructions", True)
    citations: bool = settings.get("citations_enabled", True)

    def _eligible(entry: dict) -> bool:
        # Every filter must pass; entries without a "provider/model" ID are dropped.
        return (
            _is_text_output_model(entry)
            and _supports_tool_calling(entry)
            and _has_sufficient_context(entry)
            and _is_compatible_provider(entry)
            and _is_allowed_model(entry)
            and "/" in entry.get("id", "")
        )

    eligible = sorted(filter(_eligible, raw_models), key=lambda entry: entry["id"])

    configs: list[dict] = []
    for offset, model in enumerate(eligible):
        model_id: str = model["id"]
        display_name: str = model.get("name", model_id)
        configs.append(
            {
                # IDs count downward from the offset (negative range).
                "id": base_id - offset,
                "name": display_name,
                "description": f"{display_name} via OpenRouter",
                "billing_tier": tier,
                "anonymous_enabled": anon,
                "seo_enabled": seo,
                "seo_slug": None,
                "quota_reserve_tokens": reserve,
                "provider": "OPENROUTER",
                "model_name": model_id,
                "api_key": key,
                "api_base": "",
                "rpm": rpm_limit,
                "tpm": tpm_limit,
                # Fresh copy so per-config mutation can't leak across entries.
                "litellm_params": dict(extra_params),
                "system_instructions": instructions,
                "use_default_system_instructions": default_instructions,
                "citations_enabled": citations,
                _OPENROUTER_DYNAMIC_MARKER: True,
            }
        )
    return configs
class OpenRouterIntegrationService:
"""Singleton that manages the dynamic OpenRouter model catalogue."""
_instance: "OpenRouterIntegrationService | None" = None
_lock = threading.Lock()
def __init__(self) -> None:
self._settings: dict[str, Any] = {}
self._configs: list[dict] = []
self._configs_by_id: dict[int, dict] = {}
self._initialized = False
self._refresh_task: asyncio.Task | None = None
@classmethod
def get_instance(cls) -> "OpenRouterIntegrationService":
if cls._instance is None:
with cls._lock:
if cls._instance is None:
cls._instance = cls()
return cls._instance
@classmethod
def is_initialized(cls) -> bool:
return cls._instance is not None and cls._instance._initialized
# ------------------------------------------------------------------
# Initialisation (called at startup, before event loop for Celery)
# ------------------------------------------------------------------
def initialize(self, settings: dict[str, Any]) -> list[dict]:
"""
Fetch models synchronously and generate configs.
Returns the generated configs list.
"""
self._settings = settings
raw_models = _fetch_models_sync()
if raw_models is None:
logger.warning("OpenRouter integration: could not fetch models at startup")
self._initialized = True
return []
self._configs = _generate_configs(raw_models, settings)
self._configs_by_id = {c["id"]: c for c in self._configs}
self._initialized = True
logger.info(
"OpenRouter integration: loaded %d models (IDs %d to %d)",
len(self._configs),
self._configs[0]["id"] if self._configs else 0,
self._configs[-1]["id"] if self._configs else 0,
)
return self._configs
# ------------------------------------------------------------------
# Background refresh
# ------------------------------------------------------------------
async def refresh(self) -> None:
"""Re-fetch from OpenRouter and atomically swap configs in GLOBAL_LLM_CONFIGS."""
raw_models = await _fetch_models_async()
if raw_models is None:
logger.warning("OpenRouter refresh: fetch failed, keeping stale list")
return
new_configs = _generate_configs(raw_models, self._settings)
new_by_id = {c["id"]: c for c in new_configs}
from app.config import config as app_config
static_configs = [
c
for c in app_config.GLOBAL_LLM_CONFIGS
if not c.get(_OPENROUTER_DYNAMIC_MARKER)
]
app_config.GLOBAL_LLM_CONFIGS = static_configs + new_configs
self._configs = new_configs
self._configs_by_id = new_by_id
logger.info("OpenRouter refresh: updated to %d models", len(new_configs))
async def _refresh_loop(self, interval_hours: float) -> None:
interval_sec = interval_hours * 3600
while True:
await asyncio.sleep(interval_sec)
try:
await self.refresh()
except Exception:
logger.exception("OpenRouter background refresh failed")
def start_background_refresh(self, interval_hours: float) -> None:
if interval_hours <= 0:
return
loop = asyncio.get_event_loop()
self._refresh_task = loop.create_task(self._refresh_loop(interval_hours))
logger.info(
"OpenRouter background refresh started (every %.1fh)", interval_hours
)
def stop_background_refresh(self) -> None:
if self._refresh_task is not None and not self._refresh_task.done():
self._refresh_task.cancel()
self._refresh_task = None
logger.info("OpenRouter background refresh stopped")
# ------------------------------------------------------------------
# Accessors
# ------------------------------------------------------------------
def get_configs(self) -> list[dict]:
return self._configs
def get_config_by_id(self, config_id: int) -> dict | None:
return self._configs_by_id.get(config_id)

View file

@ -35,7 +35,7 @@ from app.agents.new_chat.llm_config import (
create_chat_litellm_from_agent_config,
create_chat_litellm_from_config,
load_agent_config,
load_llm_config_from_yaml,
load_global_llm_config_by_id,
)
from app.agents.new_chat.memory_extraction import (
extract_and_save_memory,
@ -1205,8 +1205,8 @@ async def stream_new_chat(
# Create ChatLiteLLM from AgentConfig
llm = create_chat_litellm_from_agent_config(agent_config)
else:
# Negative ID: Load from YAML (global configs)
llm_config = load_llm_config_from_yaml(llm_config_id=llm_config_id)
# Negative ID: Load from in-memory global configs (includes dynamic OpenRouter models)
llm_config = load_global_llm_config_by_id(llm_config_id)
if not llm_config:
yield streaming_service.format_error(
f"Failed to load LLM config with id {llm_config_id}"
@ -1214,9 +1214,8 @@ async def stream_new_chat(
yield streaming_service.format_done()
return
# Create ChatLiteLLM from YAML config dict
# Create ChatLiteLLM from global config dict
llm = create_chat_litellm_from_config(llm_config)
# Create AgentConfig from YAML for consistency (uses defaults for prompt settings)
agent_config = AgentConfig.from_yaml_config(llm_config)
_perf_log.info(
"[stream_new_chat] LLM config loaded in %.3fs (config_id=%s)",
@ -1224,8 +1223,14 @@ async def stream_new_chat(
llm_config_id,
)
# Premium quota reservation
if agent_config and agent_config.is_premium and user_id:
# Premium quota reservation — applies to explicitly premium configs
# AND Auto mode (which may route to premium models).
_needs_premium_quota = (
agent_config is not None
and user_id
and (agent_config.is_premium or agent_config.is_auto_mode)
)
if _needs_premium_quota:
import uuid as _uuid
from app.config import config as _app_config
@ -1246,11 +1251,16 @@ async def stream_new_chat(
)
_premium_reserved = reserve_amount
if not quota_result.allowed:
yield streaming_service.format_error(
"Premium token quota exceeded. Please purchase more tokens to continue using premium models."
)
yield streaming_service.format_done()
return
if agent_config.is_premium:
yield streaming_service.format_error(
"Premium token quota exceeded. Please purchase more tokens to continue using premium models."
)
yield streaming_service.format_done()
return
# Auto mode: quota exhausted but we can still proceed
# (the router may pick a free model). Reset reservation.
_premium_request_id = None
_premium_reserved = 0
if not llm:
yield streaming_service.format_error("Failed to create LLM instance")
@ -1658,17 +1668,27 @@ async def stream_new_chat(
chat_id, generated_title
)
# Finalize premium quota with actual tokens
# Finalize premium quota with actual tokens.
# For Auto mode, only count tokens from calls that used premium models.
if _premium_request_id and user_id:
try:
from app.services.token_quota_service import TokenQuotaService
if agent_config and agent_config.is_auto_mode:
from app.services.llm_router_service import LLMRouterService
actual_premium_tokens = LLMRouterService.compute_premium_tokens(
accumulator.calls
)
else:
actual_premium_tokens = accumulator.grand_total
async with shielded_async_session() as quota_session:
await TokenQuotaService.premium_finalize(
db_session=quota_session,
user_id=UUID(user_id),
request_id=_premium_request_id,
actual_tokens=accumulator.grand_total,
actual_tokens=actual_premium_tokens,
reserved_tokens=_premium_reserved,
)
except Exception:
@ -1856,7 +1876,7 @@ async def stream_resume_chat(
return
llm = create_chat_litellm_from_agent_config(agent_config)
else:
llm_config = load_llm_config_from_yaml(llm_config_id=llm_config_id)
llm_config = load_global_llm_config_by_id(llm_config_id)
if not llm_config:
yield streaming_service.format_error(
f"Failed to load LLM config with id {llm_config_id}"
@ -1869,6 +1889,44 @@ async def stream_resume_chat(
"[stream_resume] LLM config loaded in %.3fs", time.perf_counter() - _t0
)
# Premium quota reservation (same logic as stream_new_chat)
_resume_premium_reserved = 0
_resume_premium_request_id: str | None = None
_resume_needs_premium = (
agent_config is not None
and user_id
and (agent_config.is_premium or agent_config.is_auto_mode)
)
if _resume_needs_premium:
import uuid as _uuid
from app.config import config as _app_config
from app.services.token_quota_service import TokenQuotaService
_resume_premium_request_id = _uuid.uuid4().hex[:16]
reserve_amount = min(
agent_config.quota_reserve_tokens
or _app_config.QUOTA_MAX_RESERVE_PER_CALL,
_app_config.QUOTA_MAX_RESERVE_PER_CALL,
)
async with shielded_async_session() as quota_session:
quota_result = await TokenQuotaService.premium_reserve(
db_session=quota_session,
user_id=UUID(user_id),
request_id=_resume_premium_request_id,
reserve_tokens=reserve_amount,
)
_resume_premium_reserved = reserve_amount
if not quota_result.allowed:
if agent_config.is_premium:
yield streaming_service.format_error(
"Premium token quota exceeded. Please purchase more tokens to continue using premium models."
)
yield streaming_service.format_done()
return
_resume_premium_request_id = None
_resume_premium_reserved = 0
if not llm:
yield streaming_service.format_error("Failed to create LLM instance")
yield streaming_service.format_done()
@ -1982,6 +2040,35 @@ async def stream_resume_chat(
yield streaming_service.format_done()
return
# Finalize premium quota for resume path
if _resume_premium_request_id and user_id:
try:
from app.services.token_quota_service import TokenQuotaService
if agent_config and agent_config.is_auto_mode:
from app.services.llm_router_service import LLMRouterService
actual_premium_tokens = LLMRouterService.compute_premium_tokens(
accumulator.calls
)
else:
actual_premium_tokens = accumulator.grand_total
async with shielded_async_session() as quota_session:
await TokenQuotaService.premium_finalize(
db_session=quota_session,
user_id=UUID(user_id),
request_id=_resume_premium_request_id,
actual_tokens=actual_premium_tokens,
reserved_tokens=_resume_premium_reserved,
)
except Exception:
logging.getLogger(__name__).warning(
"Failed to finalize premium quota for user %s (resume)",
user_id,
exc_info=True,
)
usage_summary = accumulator.per_message_summary()
_perf_log.info(
"[token_usage] normal resume_chat: calls=%d total=%d summary=%s",
@ -2018,6 +2105,23 @@ async def stream_resume_chat(
finally:
with anyio.CancelScope(shield=True):
# Release premium reservation if not finalized
if _resume_premium_request_id and _resume_premium_reserved > 0 and user_id:
try:
from app.services.token_quota_service import TokenQuotaService
async with shielded_async_session() as quota_session:
await TokenQuotaService.premium_release(
db_session=quota_session,
user_id=UUID(user_id),
reserved_tokens=_resume_premium_reserved,
)
_resume_premium_reserved = 0
except Exception:
logging.getLogger(__name__).warning(
"Failed to release premium quota for user %s (resume)", user_id
)
try:
await session.rollback()
await clear_ai_responding(session, chat_id)

View file

@ -22,6 +22,20 @@ if TYPE_CHECKING:
from app.db import ChatVisibility
import re
_FENCE_RE = re.compile(
r"^```(?:\w+)?\s*\n(.*?)```\s*$",
re.DOTALL,
)
def strip_markdown_fences(text: str) -> str:
"""Remove a single markdown code fence (```json ... ```) wrapper if present."""
m = _FENCE_RE.match(text.strip())
return m.group(1).strip() if m else text
def extract_text_content(content: str | dict | list) -> str:
"""Extract plain text content from various message formats."""
if isinstance(content, str):

View file

@ -57,7 +57,7 @@ function buildModelFaq(model: AnonModel) {
},
{
question: `Is ${model.name} really free on SurfSense?`,
answer: `Yes! You can use ${model.name} completely free without login or sign-up. SurfSense gives you 1 million free tokens to use across any model, including ${model.name}.`,
answer: `Yes! You can use ${model.name} completely free without login or sign-up. SurfSense gives you 500,000 free tokens to use across any model, including ${model.name}.`,
},
{
question: `How do I use ${model.name} with no login?`,

View file

@ -107,12 +107,12 @@ const FAQ_ITEMS = [
{
question: "Can I use ChatGPT without login?",
answer:
"Yes. SurfSense lets you use ChatGPT without login or any sign-up. Just pick a model and start chatting. No email, no password, no account needed. You get 1 million free tokens to use across ChatGPT, Claude AI, Gemini, and other models.",
"Yes. SurfSense lets you use ChatGPT without login or any sign-up. Just pick a model and start chatting. No email, no password, no account needed. You get 500,000 free tokens to use across ChatGPT, Claude AI, Gemini, and other models.",
},
{
question: "Is ChatGPT really free on SurfSense?",
answer:
"Yes. SurfSense gives you free access to ChatGPT (GPT-4), Claude AI, Gemini, and other models without login. You get 1 million free tokens across any model with no sign-up required.",
"Yes. SurfSense gives you free access to ChatGPT (GPT-4), Claude AI, Gemini, and other models without login. You get 500,000 free tokens across any model with no sign-up required.",
},
{
question: "How do I use ChatGPT no login?",
@ -125,9 +125,9 @@ const FAQ_ITEMS = [
"SurfSense offers free access without login to models from OpenAI (GPT-4, GPT-4 Turbo), Anthropic (Claude 3, Claude free), Google (Gemini), DeepSeek, Mistral, Llama, and more. All available as a free ChatGPT alternative online with no login required.",
},
{
question: "What happens after I use 1 million free tokens?",
question: "What happens after I use my free tokens?",
answer:
"After your free tokens, create a free SurfSense account to unlock 5 million more. Premium model tokens can be purchased at $1 per million tokens. Non-premium models remain unlimited for registered users.",
"After your free tokens, create a free SurfSense account to unlock 3 million more premium tokens. Additional tokens can be purchased at $1 per million. Non-premium models remain unlimited for registered users.",
},
{
question: "Is Claude AI available without login?",
@ -203,7 +203,7 @@ export default async function FreeHubPage() {
No login required
</Badge>
<Badge variant="secondary" className="px-3 py-1.5 text-sm">
1M free tokens
500K free tokens
</Badge>
<Badge variant="secondary" className="px-3 py-1.5 text-sm">
{seoModels.length} AI models
@ -329,7 +329,7 @@ export default async function FreeHubPage() {
<section className="max-w-3xl mx-auto text-center">
<h2 className="text-2xl font-bold mb-3">Want More Features?</h2>
<p className="text-muted-foreground mb-6 leading-relaxed">
Create a free SurfSense account to unlock 5 million tokens, document uploads with
Create a free SurfSense account to unlock 3 million tokens, document uploads with
citations, team collaboration, and integrations with Slack, Google Drive, Notion, and
30+ more tools.
</p>

View file

@ -5,7 +5,7 @@ import { BreadcrumbNav } from "@/components/seo/breadcrumb-nav";
export const metadata: Metadata = {
title: "Pricing | SurfSense - Free AI Search Plans",
description:
"Explore SurfSense plans and pricing. Use ChatGPT, Claude AI, and any AI model free. Open source NotebookLM alternative for teams.",
"Explore SurfSense plans and pricing. Start free with 500 pages & 3M premium tokens. Use ChatGPT, Claude AI, and premium AI models. Pay-as-you-go tokens at $1 per million.",
alternates: {
canonical: "https://surfsense.com/pricing",
},

View file

@ -13,7 +13,7 @@ const changelogSource = loader({
source: changelog.toFumadocsSource(),
});
const BASE_URL = "https://surfsense.com";
const BASE_URL = "https://www.surfsense.com";
const BACKEND_URL = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
async function getFreeModelSlugs(): Promise<string[]> {

View file

@ -1,38 +0,0 @@
---
title: "Why Enterprise Search Matters More Than Ever"
description: "As organizations generate more data across more tools, finding the right information at the right time has become a critical competitive advantage. Here's why enterprise search is no longer optional."
date: "2026-04-10"
image: "/og-image.png"
author: "SurfSense Team"
authorAvatar: "/logo.png"
tags: ["Enterprise Search", "Productivity", "Knowledge Management"]
---
## The Information Overload Problem
The average knowledge worker switches between **11 different applications** per day and spends nearly 20% of their time searching for information. That's an entire day each week lost to context-switching and hunting for answers buried in Slack threads, Google Docs, Notion pages, and email chains.
Enterprise search solves this by providing a **single, unified interface** to query across all your knowledge sources simultaneously.
## What Makes Modern Enterprise Search Different
Traditional enterprise search was little more than a keyword matcher slapped onto a file server. Modern solutions like SurfSense take a fundamentally different approach:
- **Semantic Understanding**: AI-powered search understands the *meaning* behind your query, not just the keywords
- **Federated Architecture**: Connect dozens of data sources without migrating data out of where it lives
- **Contextual Ranking**: Results are ranked by relevance to your role, recent activity, and team context
- **Real-time Indexing**: New content becomes searchable within minutes, not days
## The ROI of Getting Search Right
Organizations that invest in proper enterprise search see measurable improvements:
- **30% reduction** in time spent searching for information
- **25% faster** onboarding for new team members
- **40% fewer** duplicate documents and redundant work
## Getting Started
The best time to implement enterprise search was when your team hit 20 people. The second best time is now. Start by auditing which tools your team uses daily, then look for a solution that connects to all of them natively.
SurfSense connects to Slack, Google Drive, Notion, Confluence, GitHub, and dozens more — all with a single search bar.

View file

@ -222,7 +222,10 @@ export const FreeComposer: FC = () => {
<Tooltip>
<TooltipTrigger asChild>
<label htmlFor="free-web-search-toggle" className="flex items-center gap-1.5 cursor-pointer select-none rounded-md px-2 py-1 text-xs text-muted-foreground hover:text-foreground hover:bg-accent/50 transition-colors">
<label
htmlFor="free-web-search-toggle"
className="flex items-center gap-1.5 cursor-pointer select-none rounded-md px-2 py-1 text-xs text-muted-foreground hover:text-foreground hover:bg-accent/50 transition-colors"
>
<Globe className="size-3.5" />
<span className="hidden sm:inline">Web</span>
<Switch

View file

@ -40,16 +40,21 @@ export function FreeModelSelector({ className }: { className?: string }) {
[models, currentSlug]
);
const sortedModels = useMemo(
() => [...models].sort((a, b) => Number(a.is_premium) - Number(b.is_premium)),
[models]
);
const filteredModels = useMemo(() => {
if (!searchQuery.trim()) return models;
if (!searchQuery.trim()) return sortedModels;
const q = searchQuery.toLowerCase();
return models.filter(
return sortedModels.filter(
(m) =>
m.name.toLowerCase().includes(q) ||
m.model_name.toLowerCase().includes(q) ||
m.provider.toLowerCase().includes(q)
);
}, [models, searchQuery]);
}, [sortedModels, searchQuery]);
const handleSelect = useCallback(
(model: AnonModel) => {
@ -170,13 +175,20 @@ export function FreeModelSelector({ className }: { className?: string }) {
<div className="flex-1 min-w-0">
<div className="flex items-center gap-1.5">
<span className="font-medium text-sm truncate">{model.name}</span>
{model.is_premium && (
{model.is_premium ? (
<Badge
variant="secondary"
className="text-[9px] px-1 py-0 h-3.5 bg-purple-100 text-purple-700 dark:bg-purple-900/50 dark:text-purple-300 border-0"
>
Premium
</Badge>
) : (
<Badge
variant="secondary"
className="text-[9px] px-1 py-0 h-3.5 bg-emerald-100 text-emerald-700 dark:bg-emerald-900/50 dark:text-emerald-300 border-0"
>
Free
</Badge>
)}
</div>
<span className="text-xs text-muted-foreground truncate block">

View file

@ -40,7 +40,7 @@ export function QuotaWarningBanner({
</p>
<p className="text-xs text-red-600 dark:text-red-300">
You&apos;ve used all {limit.toLocaleString()} free tokens. Create a free account to
get 5 million tokens and access to all models.
get 3 million tokens and access to all models.
</p>
<Link
href="/register"

View file

@ -433,13 +433,31 @@ export function ModelSelector({
isGlobal && "is_auto_mode" in c && !!(c as Record<string, unknown>).is_auto_mode,
}));
const sortGlobalItems = (items: DisplayItem[]): DisplayItem[] =>
[...items].sort((a, b) => {
if (a.isAutoMode !== b.isAutoMode) return a.isAutoMode ? -1 : 1;
const aPremium = !!(a.config as Record<string, unknown>).is_premium;
const bPremium = !!(b.config as Record<string, unknown>).is_premium;
if (aPremium !== bPremium) return aPremium ? 1 : -1;
return 0;
});
switch (activeTab) {
case "llm":
return [...toItems(filteredLLMGlobal, true), ...toItems(filteredLLMUser, false)];
return [
...sortGlobalItems(toItems(filteredLLMGlobal, true)),
...toItems(filteredLLMUser, false),
];
case "image":
return [...toItems(filteredImageGlobal, true), ...toItems(filteredImageUser, false)];
return [
...sortGlobalItems(toItems(filteredImageGlobal, true)),
...toItems(filteredImageUser, false),
];
case "vision":
return [...toItems(filteredVisionGlobal, true), ...toItems(filteredVisionUser, false)];
return [
...sortGlobalItems(toItems(filteredVisionGlobal, true)),
...toItems(filteredVisionUser, false),
];
}
}, [
activeTab,
@ -859,14 +877,23 @@ export function ModelSelector({
Recommended
</Badge>
)}
{"is_premium" in config && (config as Record<string, unknown>).is_premium && (
{"is_premium" in config && (config as Record<string, unknown>).is_premium ? (
<Badge
variant="secondary"
className="text-[9px] px-1 py-0 h-3.5 bg-purple-100 text-purple-700 dark:bg-purple-900/50 dark:text-purple-300 border-0"
>
Premium
</Badge>
)}
) : "is_premium" in config &&
!(config as Record<string, unknown>).is_premium &&
!isAutoMode ? (
<Badge
variant="secondary"
className="text-[9px] px-1 py-0 h-3.5 bg-emerald-100 text-emerald-700 dark:bg-emerald-900/50 dark:text-emerald-300 border-0"
>
Free
</Badge>
) : null}
</div>
<div className="flex items-center gap-1.5 mt-0.5">
<span className="text-xs text-muted-foreground truncate">

View file

@ -12,10 +12,11 @@ const demoPlans = [
price: "0",
yearlyPrice: "0",
period: "",
billingText: "500 pages included",
billingText: "500 pages + 3M premium tokens included",
features: [
"Self Hostable",
"500 pages included to start",
"3 million premium tokens to start",
"Earn up to 3,000+ bonus pages for free",
"Includes access to OpenAI text, audio and image models",
"Realtime Collaborative Group Chats with teammates",
@ -30,11 +31,13 @@ const demoPlans = [
name: "PAY AS YOU GO",
price: "1",
yearlyPrice: "1",
period: "1,000 pages",
period: "pack",
billingText: "No subscription, buy only when you need more",
features: [
"Everything in Free",
"Buy 1,000-page packs at $1 each",
"Buy 1M premium token packs at $1 each",
"Use premium AI models like GPT-5.4, Claude Sonnet 4.6, Gemini 2.5 Pro & 100+ more via OpenRouter",
"Priority support on Discord",
],
description: "",
@ -77,7 +80,7 @@ interface FAQSection {
const faqData: FAQSection[] = [
{
title: "Pages & Billing",
title: "Pages & Document Billing",
items: [
{
question: 'What exactly is a "page" in SurfSense?',
@ -126,13 +129,38 @@ const faqData: FAQSection[] = [
},
],
},
{
title: "Premium Tokens",
items: [
{
question: 'What are "premium tokens"?',
answer:
"Premium tokens are the billing unit for using premium AI models like GPT-5.4, Claude Sonnet 4.6, and Gemini 2.5 Pro in SurfSense. Each AI request consumes tokens based on the length of your conversation. Non-premium models (such as free-tier models available without login) do not consume premium tokens.",
},
{
question: "How many premium tokens do I get for free?",
answer:
"Every registered SurfSense account starts with 3 million premium tokens at no cost. Anonymous users (no login) get 500,000 free tokens across all models. Once your free tokens are used up, you can purchase more at any time.",
},
{
question: "How does purchasing premium tokens work?",
answer:
"Just like pages, there's no subscription. You buy 1-million-token packs at $1 each whenever you need more. Purchased tokens are added to your account immediately. You can buy up to 100 packs at a time.",
},
{
question: "What happens if I run out of premium tokens?",
answer:
"When your premium token balance runs low (below 20%), you'll see a warning. Once you run out, premium model requests are paused until you purchase more tokens. You can always switch to non-premium models which don't consume premium tokens.",
},
],
},
{
title: "Self-Hosting",
items: [
{
question: "Can I self-host SurfSense with unlimited pages?",
question: "Can I self-host SurfSense with unlimited pages and tokens?",
answer:
"Yes! When self-hosting, you have full control over your page limits. The default self-hosted setup gives you effectively unlimited pages, so you can index as much data as your infrastructure supports.",
"Yes! When self-hosting, you have full control over your page and token limits. The default self-hosted setup gives you effectively unlimited pages and tokens, so you can index as much data and use as many AI queries as your infrastructure supports.",
},
],
},
@ -223,8 +251,8 @@ function PricingFAQ() {
Frequently Asked Questions
</h2>
<p className="mx-auto mt-4 max-w-2xl text-lg text-muted-foreground">
Everything you need to know about SurfSense pages and billing. Can&apos;t find what you
need? Reach out at{" "}
Everything you need to know about SurfSense pages, premium tokens, and billing. Can&apos;t
find what you need? Reach out at{" "}
<a href="mailto:rohan@surfsense.com" className="text-blue-500 underline">
rohan@surfsense.com
</a>
@ -308,7 +336,7 @@ function PricingBasic() {
<Pricing
plans={demoPlans}
title="SurfSense Pricing"
description="Start free with 500 pages and pay as you go."
description="Start free with 500 pages & 3M premium tokens. Pay as you go."
/>
<PricingFAQ />
</>

View file

@ -44,7 +44,7 @@ export function LoginGateProvider({ children }: { children: ReactNode }) {
<DialogHeader>
<DialogTitle>Create a free account to {feature}</DialogTitle>
<DialogDescription>
Get 5 million tokens, save chat history, upload documents, use all AI tools, and
Get 3 million tokens, save chat history, upload documents, use all AI tools, and
connect 30+ integrations.
</DialogDescription>
</DialogHeader>