feat(database-migrations): add migration to remove legacy model config tables and remove stale model connection code

This commit is contained in:
Anish Sarkar 2026-06-13 12:45:43 +05:30
parent 50668775f8
commit bd4a04f2e7
93 changed files with 956 additions and 11442 deletions

View file

@ -1,13 +1,13 @@
"""Resolve and persist Auto (Fastest) model pins per chat thread.
"""Resolve and persist Auto model pins per chat thread.
Auto (Fastest) is represented by ``agent_llm_id == 0``. For chat threads we
resolve that virtual mode to one concrete global LLM config exactly once and
Auto is represented by ``chat_model_id == 0``. For chat threads we
resolve that virtual mode to one concrete global model exactly once and
persist the chosen config id on ``new_chat_threads.pinned_llm_config_id`` so
subsequent turns are stable.
Single-writer invariant: this module is the only writer of
``NewChatThread.pinned_llm_config_id`` (aside from the bulk clear in
``search_spaces_routes`` when a search space's ``agent_llm_id`` changes).
``model_connections_routes`` when a search space's ``chat_model_id`` changes).
Therefore a non-NULL value unambiguously means "this thread has an
Auto-resolved pin"; no separate source/policy column is needed.
"""
@ -33,8 +33,10 @@ from app.services.token_quota_service import TokenQuotaService
logger = logging.getLogger(__name__)
AUTO_FASTEST_ID = 0
AUTO_FASTEST_MODE = "auto_fastest"
AUTO_MODE_ID = 0
# Stable internal hash namespace for deterministic per-thread selection.
# Do not rename: changing this rebalances Auto's model choice for new pins.
AUTO_PIN_HASH_NAMESPACE = "auto_fastest"
_RUNTIME_COOLDOWN_SECONDS = 600
_HEALTHY_TTL_SECONDS = 45
@ -383,7 +385,7 @@ def _select_pin(eligible: list[dict], thread_id: int) -> tuple[dict, int]:
pool = tier_a if tier_a else eligible
pool = sorted(pool, key=lambda c: -int(c.get("quality_score") or 0))
top_k = pool[:_QUALITY_TOP_K]
digest = hashlib.sha256(f"{AUTO_FASTEST_MODE}:{thread_id}".encode()).digest()
digest = hashlib.sha256(f"{AUTO_PIN_HASH_NAMESPACE}:{thread_id}".encode()).digest()
idx = int.from_bytes(digest[:8], "big") % len(top_k)
return top_k[idx], len(top_k)
@ -425,7 +427,7 @@ async def resolve_or_get_pinned_llm_config_id(
exclude_config_ids: set[int] | None = None,
requires_image_input: bool = False,
) -> AutoPinResolution:
"""Resolve Auto (Fastest) to one concrete config id and persist the pin.
"""Resolve Auto to one concrete config id and persist the pin.
For non-auto selections, this function clears any existing pin and returns
the selected id as-is.
@ -457,7 +459,7 @@ async def resolve_or_get_pinned_llm_config_id(
)
# Explicit model selected: clear any stale pin.
if selected_llm_config_id != AUTO_FASTEST_ID:
if selected_llm_config_id != AUTO_MODE_ID:
if thread.pinned_llm_config_id is not None:
thread.pinned_llm_config_id = None
await session.commit()

View file

@ -450,10 +450,10 @@ async def _resolve_agent_billing_for_search_space(
Used by Celery tasks (podcast generation, video presentation) to bill the
search-space owner's premium credit pool when the chat model is premium.
Resolution rules mirror chat at ``stream_new_chat.py:2294-2351``:
Resolution rules mirror the chat model role resolver:
- Search space not found / no ``agent_llm_id``: raise ``ValueError``.
- **Auto mode** (``id == AUTO_FASTEST_ID == 0``):
- Search space not found / no ``chat_model_id``: raise ``ValueError``.
- **Auto mode** (``id == AUTO_MODE_ID == 0``):
* ``thread_id`` is set: delegate to
``resolve_or_get_pinned_llm_config_id`` (the same call chat uses) and
recurse into the resolved id. Reuses chat's existing pin if present
@ -469,9 +469,8 @@ async def _resolve_agent_billing_for_search_space(
(defaults to ``"free"`` via ``app/config/__init__.py:52`` setdefault),
``base_model = litellm_params.get("base_model") or model_name``
NOT provider-prefixed, matching chat's cost-map lookup convention.
- **Positive id** (user BYOK ``NewLLMConfig``): always free (matches
``AgentConfig.from_new_llm_config`` which hard-codes ``billing_tier="free"``);
``base_model`` from ``litellm_params`` or ``model_name``.
- **Positive id** (user BYOK ``Model``): always free; ``base_model`` from
the model catalog override or the upstream ``model_id``.
Note on imports: ``llm_service``, ``auto_model_pin_service``, and
``llm_router_service`` are imported lazily inside the function body to
@ -480,8 +479,9 @@ async def _resolve_agent_billing_for_search_space(
``billable_calls.py``'s module load path.
"""
from sqlalchemy import select
from sqlalchemy.orm import selectinload
from app.db import NewLLMConfig, SearchSpace
from app.db import Model, SearchSpace
result = await session.execute(
select(SearchSpace).where(SearchSpace.id == search_space_id)
@ -490,20 +490,20 @@ async def _resolve_agent_billing_for_search_space(
if search_space is None:
raise ValueError(f"Search space {search_space_id} not found")
agent_llm_id = search_space.agent_llm_id
if agent_llm_id is None:
chat_model_id = search_space.chat_model_id
if chat_model_id is None:
raise ValueError(
f"Search space {search_space_id} has no agent_llm_id configured"
f"Search space {search_space_id} has no chat_model_id configured"
)
owner_user_id: UUID = search_space.user_id
from app.services.auto_model_pin_service import (
AUTO_FASTEST_ID,
AUTO_MODE_ID,
resolve_or_get_pinned_llm_config_id,
)
if agent_llm_id == AUTO_FASTEST_ID:
if chat_model_id == AUTO_MODE_ID:
if thread_id is None:
return owner_user_id, "free", "auto"
try:
@ -512,7 +512,7 @@ async def _resolve_agent_billing_for_search_space(
thread_id=thread_id,
search_space_id=search_space_id,
user_id=str(owner_user_id),
selected_llm_config_id=AUTO_FASTEST_ID,
selected_llm_config_id=AUTO_MODE_ID,
)
except ValueError:
logger.warning(
@ -523,28 +523,35 @@ async def _resolve_agent_billing_for_search_space(
exc_info=True,
)
return owner_user_id, "free", "auto"
agent_llm_id = resolution.resolved_llm_config_id
chat_model_id = resolution.resolved_llm_config_id
if agent_llm_id < 0:
if chat_model_id < 0:
from app.services.llm_service import get_global_llm_config
cfg = get_global_llm_config(agent_llm_id) or {}
cfg = get_global_llm_config(chat_model_id) or {}
billing_tier = str(cfg.get("billing_tier", "free")).lower()
litellm_params = cfg.get("litellm_params") or {}
base_model = litellm_params.get("base_model") or cfg.get("model_name") or ""
return owner_user_id, billing_tier, base_model
nlc_result = await session.execute(
select(NewLLMConfig).where(
NewLLMConfig.id == agent_llm_id,
NewLLMConfig.search_space_id == search_space_id,
)
model_result = await session.execute(
select(Model)
.options(selectinload(Model.connection))
.where(Model.id == chat_model_id, Model.enabled.is_(True))
)
nlc = nlc_result.scalars().first()
model = model_result.scalars().first()
base_model = ""
if nlc is not None:
litellm_params = nlc.litellm_params or {}
base_model = litellm_params.get("base_model") or nlc.model_name or ""
if (
model is not None
and model.connection is not None
and model.connection.enabled
and (
model.connection.search_space_id in (None, search_space_id)
and model.connection.user_id in (None, owner_user_id)
)
):
catalog = model.catalog or {}
base_model = catalog.get("base_model") or model.model_id or ""
return owner_user_id, "free", base_model

View file

@ -14,7 +14,11 @@ from app.services.auto_model_pin_service import (
auto_model_candidates,
choose_auto_model_candidate,
)
from app.services.llm_router_service import AUTO_MODE_ID, ChatLiteLLMRouter, is_auto_mode
from app.services.llm_router_service import (
AUTO_MODE_ID,
ChatLiteLLMRouter,
is_auto_mode,
)
from app.services.model_capabilities import has_capability
from app.services.model_resolver import native_connection_from_config, to_litellm
from app.services.token_tracking_service import token_tracker
@ -96,26 +100,16 @@ class LLMRole:
def get_global_llm_config(llm_config_id: int) -> dict | None:
"""
Get a global LLM configuration by ID.
Global configs have negative IDs. ID 0 is reserved for Auto mode.
Global configs have negative IDs. Auto mode (ID 0) is resolved through the
model-candidate pipeline, not this legacy config lookup.
Args:
llm_config_id: The ID of the global config (should be negative or 0 for Auto)
llm_config_id: The ID of the global config (must be negative)
Returns:
dict: Global config dictionary or None if not found
"""
# Auto mode (ID 0) is handled separately via the router
if llm_config_id == AUTO_MODE_ID:
return {
"id": AUTO_MODE_ID,
"name": "Auto (Fastest)",
"description": "Automatically routes requests across available LLM providers for optimal performance and rate limit handling",
"provider": "AUTO",
"model_name": "auto",
"is_auto_mode": True,
}
if llm_config_id > 0:
if llm_config_id >= 0:
return None
for cfg in config.GLOBAL_LLM_CONFIGS:

View file

@ -24,7 +24,7 @@ CACHE_TTL_SECONDS = 86400 # 24 hours
_cache: list[dict] | None = None
_cache_timestamp: float = 0
# Maps OpenRouter provider slug → our LiteLLMProvider enum value.
# Maps OpenRouter provider slug to native LiteLLM provider prefixes.
# Only providers where the model-name part (after the slash) can be
# used directly with the native provider's litellm prefix are listed.
#

View file

@ -281,7 +281,7 @@ def _generate_configs(
OpenRouter's own ``openrouter/free`` meta-router is filtered out upstream
via ``_EXCLUDED_MODEL_IDS``; we don't expose a redundant auto-select layer
because our own Auto (Fastest) pin + 24 h refresh + repair logic already
because our own Auto pin + 24 h refresh + repair logic already
cover the catalogue-churn case.
"""
id_offset: int = settings.get("id_offset", -10000)
@ -346,7 +346,7 @@ def _generate_configs(
# ``"No endpoints found that support image input"``.
"supports_image_input": bool(normalized.get("supports_image_input")),
_OPENROUTER_DYNAMIC_MARKER: True,
# Auto (Fastest) ranking metadata. ``quality_score`` is initialised
# Auto ranking metadata. ``quality_score`` is initialised
# to the static score and gets re-blended with health on the next
# ``_enrich_health`` pass (synchronous on refresh, deferred on cold
# start so startup latency is unchanged).
@ -361,11 +361,7 @@ def _generate_configs(
return configs
# ID-offset bands used to keep dynamic OpenRouter configs in their own
# namespace per surface. Image / vision get separate bands so a single
# Postgres-INTEGER cfg ID is unambiguous about which selector it belongs to.
_OPENROUTER_IMAGE_ID_OFFSET_DEFAULT = -20000
_OPENROUTER_VISION_ID_OFFSET_DEFAULT = -30000
def _generate_image_gen_configs(
@ -431,89 +427,6 @@ def _generate_image_gen_configs(
return configs
def _generate_vision_llm_configs(
raw_models: list[dict], settings: dict[str, Any]
) -> list[dict]:
"""Convert OpenRouter vision-capable LLMs into global vision-LLM config
dicts (matches the YAML shape consumed by ``vision_llm_routes``).
Filter:
- architecture.input_modalities contains "image"
- architecture.output_modalities contains "text"
- compatible provider (excluded slugs blocked)
- allowed model id (excluded list blocked)
Vision-LLM is invoked from the indexer (image extraction during
document upload) via ``langchain_litellm.ChatLiteLLM.ainvoke``, so
the chat-only ``_supports_tool_calling`` and ``_has_sufficient_context``
filters do not apply: a small-context vision model that doesn't
advertise tool-calling is still perfectly viable for "describe this
image" prompts.
"""
id_offset: int = int(
settings.get("vision_id_offset") or _OPENROUTER_VISION_ID_OFFSET_DEFAULT
)
api_key: str = settings.get("api_key", "")
rpm: int = settings.get("rpm", 200)
tpm: int = settings.get("tpm", 1_000_000)
free_rpm: int = settings.get("free_rpm", 20)
free_tpm: int = settings.get("free_tpm", 100_000)
quota_reserve_tokens: int = settings.get("quota_reserve_tokens", 4000)
litellm_params: dict = settings.get("litellm_params") or {}
vision_models = [
m
for m in raw_models
if supports_image_input(m)
and _shared_is_compatible_provider(m)
and _shared_is_allowed_model(m)
and "/" in m.get("id", "")
]
configs: list[dict] = []
taken: set[int] = set()
for model in vision_models:
model_id: str = model["id"]
name: str = model.get("name", model_id)
tier = _openrouter_tier(model)
pricing = model.get("pricing") or {}
# Capture per-token prices so ``pricing_registration`` can
# register them with LiteLLM at startup (and so the cost
# estimator in ``estimate_call_reserve_micros`` can resolve
# them at reserve time).
try:
input_cost = float(pricing.get("prompt", 0) or 0)
except (TypeError, ValueError):
input_cost = 0.0
try:
output_cost = float(pricing.get("completion", 0) or 0)
except (TypeError, ValueError):
output_cost = 0.0
cfg: dict[str, Any] = {
"id": _stable_config_id(model_id, id_offset, taken),
"name": name,
"description": f"{name} via OpenRouter (vision)",
"provider": "openrouter",
"model_name": model_id,
"api_key": api_key,
"api_base": "https://openrouter.ai/api/v1",
"api_version": None,
"rpm": free_rpm if tier == "free" else rpm,
"tpm": free_tpm if tier == "free" else tpm,
"litellm_params": dict(litellm_params),
"billing_tier": tier,
"quota_reserve_tokens": quota_reserve_tokens,
"input_cost_per_token": input_cost or None,
"output_cost_per_token": output_cost or None,
_OPENROUTER_DYNAMIC_MARKER: True,
}
configs.append(cfg)
return configs
class OpenRouterIntegrationService:
"""Singleton that manages the dynamic OpenRouter model catalogue."""
@ -724,7 +637,7 @@ class OpenRouterIntegrationService:
return counts
# ------------------------------------------------------------------
# Auto (Fastest) health enrichment
# Auto health enrichment
# ------------------------------------------------------------------
async def _enrich_health_safely(

View file

@ -154,10 +154,8 @@ def _register_chat_shape_configs(
input_cost = _safe_float(entry.get("prompt"))
output_cost = _safe_float(entry.get("completion"))
else:
# Vision configs from ``_generate_vision_llm_configs``
# carry their pricing inline because the OpenRouter
# raw-pricing cache is keyed by chat-catalogue model_id;
# vision flows pick up the inline values here.
# Some dynamically materialized configs can carry pricing
# inline when the raw OpenRouter cache has no matching entry.
input_cost = _safe_float(cfg.get("input_cost_per_token"))
output_cost = _safe_float(cfg.get("output_cost_per_token"))
if input_cost == 0.0 and output_cost == 0.0:

View file

@ -1,4 +1,4 @@
"""Pure-function quality scoring for Auto (Fastest) model selection.
"""Pure-function quality scoring for Auto model selection.
This module is import-free of any service / request-path dependencies. All
numbers are computed once during the OpenRouter refresh tick (or YAML load)

View file

@ -1,160 +0,0 @@
import logging
from typing import Any
from litellm import Router
from app.services.model_resolver import native_connection_from_config, to_litellm
logger = logging.getLogger(__name__)
# Reserved config id for the virtual "Auto" vision selection. It is compared
# against by ``is_vision_auto_mode`` and ``get_global_vision_llm_config`` below.
VISION_AUTO_MODE_ID = 0
class VisionLLMRouterService:
    """Singleton holder for a LiteLLM ``Router`` over the global vision LLM configs.

    Every deployment is registered under the shared model name ``"auto"``.
    All state lives on the single instance returned by ``get_instance`` and
    is populated once by ``initialize``.
    """

    _instance = None
    _router: Router | None = None
    # These mutable class-level defaults are rebound on the instance inside
    # ``initialize``; they are never mutated in place.
    _model_list: list[dict] = []
    _router_settings: dict = {}
    _initialized: bool = False

    def __new__(cls):
        # Always hand back the same object so direct construction and
        # ``get_instance`` agree.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    @classmethod
    def get_instance(cls) -> "VisionLLMRouterService":
        """Return the singleton instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    @classmethod
    def initialize(
        cls,
        global_configs: list[dict],
        router_settings: dict | None = None,
    ) -> None:
        """Build the router from ``global_configs`` (no-op if already initialized).

        Args:
            global_configs: Config dicts; entries that cannot be converted to a
                deployment are skipped.
            router_settings: Optional overrides merged over the built-in
                defaults (``routing_strategy``, ``num_retries``,
                ``allowed_fails``, ``cooldown_time``, ``retry_after``).

        If no config yields a deployment, a warning is logged and the service
        stays uninitialized. If ``Router`` construction raises, the error is
        logged and the router is left as ``None``.
        """
        instance = cls.get_instance()
        if instance._initialized:
            logger.debug("Vision LLM Router already initialized, skipping")
            return

        model_list = [
            deployment
            for cfg in global_configs
            if (deployment := cls._config_to_deployment(cfg))
        ]
        if not model_list:
            logger.warning(
                "No valid vision LLM configs found for router initialization"
            )
            return

        instance._model_list = model_list
        instance._router_settings = router_settings or {}

        default_settings = {
            "routing_strategy": "usage-based-routing",
            "num_retries": 3,
            "allowed_fails": 3,
            "cooldown_time": 60,
            "retry_after": 5,
        }
        # Caller-supplied settings win over the defaults. Every key below is
        # guaranteed to exist after the merge, so plain indexing is safe.
        final_settings = {**default_settings, **instance._router_settings}

        try:
            instance._router = Router(
                model_list=model_list,
                routing_strategy=final_settings["routing_strategy"],
                num_retries=final_settings["num_retries"],
                allowed_fails=final_settings["allowed_fails"],
                cooldown_time=final_settings["cooldown_time"],
                # Previously merged into the settings but never forwarded, so
                # the configured minimum wait between retries was ignored.
                retry_after=final_settings["retry_after"],
                set_verbose=False,
            )
            instance._initialized = True
            logger.info(
                "Vision LLM Router initialized with %d deployments, strategy: %s",
                len(model_list),
                final_settings.get("routing_strategy"),
            )
        except Exception as e:
            # Best-effort startup: keep the process alive, but record the
            # traceback so the failure is diagnosable.
            logger.exception("Failed to initialize Vision LLM Router: %s", e)
            instance._router = None

    @classmethod
    def _config_to_deployment(cls, config: dict) -> dict | None:
        """Convert one config dict into a Router deployment entry.

        Returns ``None`` when ``model_name`` or ``api_key`` is missing or
        falsy, or when resolving the LiteLLM model string raises. ``rpm`` and
        ``tpm`` are copied onto the deployment only when they are truthy.
        """
        try:
            if not config.get("model_name") or not config.get("api_key"):
                return None

            model_string, resolved_kwargs = to_litellm(
                native_connection_from_config(config),
                config["model_name"],
            )
            litellm_params: dict[str, Any] = {"model": model_string, **resolved_kwargs}

            deployment: dict[str, Any] = {
                "model_name": "auto",
                "litellm_params": litellm_params,
            }
            for limit_key in ("rpm", "tpm"):
                if config.get(limit_key):
                    deployment[limit_key] = config[limit_key]
            return deployment
        except Exception as e:
            logger.warning("Failed to convert vision config to deployment: %s", e)
            return None

    @classmethod
    def get_router(cls) -> Router | None:
        """Return the underlying router, or ``None`` if none was built."""
        return cls.get_instance()._router

    @classmethod
    def is_initialized(cls) -> bool:
        """Return True only when initialization succeeded and a router exists."""
        instance = cls.get_instance()
        return instance._initialized and instance._router is not None

    @classmethod
    def get_model_count(cls) -> int:
        """Return the number of deployments registered with the router."""
        return len(cls.get_instance()._model_list)
def is_vision_auto_mode(config_id: int | None) -> bool:
    """Return True when ``config_id`` equals the reserved Auto-mode id."""
    matches_auto = config_id == VISION_AUTO_MODE_ID
    return matches_auto
def build_vision_model_string(
litellm_provider: str, model_name: str, custom_provider: str | None
) -> str:
if custom_provider:
return f"{custom_provider}/{model_name}"
return f"{litellm_provider}/{model_name}"
def get_global_vision_llm_config(config_id: int) -> dict | None:
    """Look up a global vision LLM config by id.

    The reserved Auto-mode id yields a synthetic descriptor dict, positive
    ids always yield ``None``, and any other id is matched against
    ``config.GLOBAL_VISION_LLM_CONFIGS`` by its ``"id"`` entry.
    """
    # Imported lazily, exactly as the original does.
    from app.config import config

    if config_id == VISION_AUTO_MODE_ID:
        auto_descriptor = {
            "id": VISION_AUTO_MODE_ID,
            "name": "Auto (Fastest)",
            "provider": "AUTO",
            "model_name": "auto",
            "is_auto_mode": True,
        }
        return auto_descriptor

    if config_id > 0:
        return None

    matches = (
        candidate
        for candidate in config.GLOBAL_VISION_LLM_CONFIGS
        if candidate.get("id") == config_id
    )
    return next(matches, None)

View file

@ -1,134 +0,0 @@
"""
Service for fetching and caching the vision-capable model list.
Reuses the same OpenRouter public API and local fallback as the LLM model
list service, but filters for models that accept image input.
"""
import json
import logging
import time
from pathlib import Path
import httpx
logger = logging.getLogger(__name__)
# Endpoint queried by ``_fetch_from_openrouter`` for the model catalogue.
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/models"
# JSON file read by ``_load_fallback`` when the live fetch returns nothing.
FALLBACK_FILE = (
Path(__file__).parent.parent / "config" / "vision_model_list_fallback.json"
)
CACHE_TTL_SECONDS = 86400 # 24 hours
# Module-level cache of the processed list and the ``time.time()`` stamp at
# which it was stored; both are rebound by ``get_vision_model_list``.
_cache: list[dict] | None = None
_cache_timestamp: float = 0
# Maps the provider slug (the part of an OpenRouter model id before the first
# "/") to the provider label used for the extra entry that
# ``_process_vision_models`` emits for that model.
OPENROUTER_SLUG_TO_VISION_PROVIDER: dict[str, str] = {
"openai": "OPENAI",
"anthropic": "ANTHROPIC",
"google": "GOOGLE",
"mistralai": "MISTRAL",
"x-ai": "XAI",
}
def _format_context_length(length: int | None) -> str | None:
if not length:
return None
if length >= 1_000_000:
return f"{length / 1_000_000:g}M"
if length >= 1_000:
return f"{length / 1_000:g}K"
return str(length)
async def _fetch_from_openrouter() -> list[dict] | None:
    """Fetch the raw model list from the OpenRouter models endpoint.

    Returns the payload's ``"data"`` entry (an empty list when that key is
    absent), or ``None`` after logging a warning if anything raises.
    """
    try:
        async with httpx.AsyncClient(timeout=15) as http_client:
            reply = await http_client.get(OPENROUTER_API_URL)
            reply.raise_for_status()
            payload = reply.json()
            models = payload.get("data", [])
        return models
    except Exception as exc:
        logger.warning("Failed to fetch from OpenRouter API for vision models: %s", exc)
        return None
def _load_fallback() -> list[dict]:
    """Load the bundled fallback model list from ``FALLBACK_FILE``.

    Returns the parsed JSON content, or an empty list (after logging an
    error) if the file cannot be read or parsed.
    """
    try:
        raw_text = FALLBACK_FILE.read_text(encoding="utf-8")
        return json.loads(raw_text)
    except Exception as exc:
        logger.error("Failed to load vision model fallback list: %s", exc)
        return []
def _is_vision_model(model: dict) -> bool:
"""Return True if the model accepts image input and outputs text."""
arch = model.get("architecture", {})
input_mods = arch.get("input_modalities", [])
output_mods = arch.get("output_modalities", [])
return "image" in input_mods and "text" in output_mods
def _process_vision_models(raw_models: list[dict]) -> list[dict]:
    """Turn raw catalogue entries into selector options for vision models.

    Entries whose id has no ``"/"`` or that fail ``_is_vision_model`` are
    dropped. Each remaining model yields an ``OPENROUTER`` option keyed by
    the full id. When its provider slug is in
    ``OPENROUTER_SLUG_TO_VISION_PROVIDER``, it also yields a second option
    keyed by the bare model name, except that ``GOOGLE`` options are only
    emitted for names starting with ``"gemini-"``.
    """
    options: list[dict] = []
    for raw in raw_models:
        full_id: str = raw.get("id", "")
        label: str = raw.get("name", "")
        if "/" not in full_id or not _is_vision_model(raw):
            continue

        # The guard above guarantees a "/" exists, so partition matches
        # split("/", 1).
        slug, _, bare_name = full_id.partition("/")
        window = _format_context_length(raw.get("context_length"))

        options.append(
            {
                "value": full_id,
                "label": label,
                "provider": "OPENROUTER",
                "context_window": window,
            }
        )

        native_provider = OPENROUTER_SLUG_TO_VISION_PROVIDER.get(slug)
        if not native_provider:
            continue
        if native_provider == "GOOGLE" and not bare_name.startswith("gemini-"):
            continue
        options.append(
            {
                "value": bare_name,
                "label": label,
                "provider": native_provider,
                "context_window": window,
            }
        )
    return options
async def get_vision_model_list() -> list[dict]:
    """Return the processed vision model list, using the module-level cache.

    A cached list younger than ``CACHE_TTL_SECONDS`` is returned as-is.
    Otherwise the list is fetched and processed, then cached with a fresh
    timestamp. When the fetch returns ``None``, the fallback file content is
    returned and the cache is left untouched.
    """
    global _cache, _cache_timestamp

    if _cache is not None:
        age_seconds = time.time() - _cache_timestamp
        if age_seconds < CACHE_TTL_SECONDS:
            return _cache

    fetched = await _fetch_from_openrouter()
    if fetched is None:
        logger.info("Using fallback vision model list")
        return _load_fallback()

    fresh_list = _process_vision_models(fetched)
    _cache, _cache_timestamp = fresh_list, time.time()
    return fresh_list