feat(openrouter): derive billing tier per-model and stabilize config IDs

This commit is contained in:
Anish Sarkar 2026-05-01 17:42:21 +05:30
parent 5dd45a5740
commit ccd7caf99f

View file

@ -11,6 +11,7 @@ this service only manages the catalogue, not the inference path.
"""
import asyncio
import hashlib
import logging
import threading
from typing import Any
@ -25,6 +26,56 @@ OPENROUTER_API_URL = "https://openrouter.ai/api/v1/models"
# dynamic OpenRouter entries from hand-written YAML entries during refresh.
_OPENROUTER_DYNAMIC_MARKER = "__openrouter_dynamic__"
# Fixed negative ID for the virtual ``openrouter/free`` auto-select entry.
# ``_stable_config_id`` hashes per-model IDs into the window
# ``[id_offset - _STABLE_ID_HASH_WIDTH + 1, id_offset]``; with the default
# ``id_offset`` of -10000 that is [-9_009_999, -10_000], so this value sits
# well below every hashed slot. For an ``id_offset`` at or below roughly
# -1_000_000 the window would reach this value, which is why
# ``_generate_configs`` also seeds its ``taken`` set with it whenever the
# free-router entry is emitted.
_FREE_ROUTER_ID = -9_999_999
# Width of the hash space used by ``_stable_config_id``. 9_000_000 provides
# enough headroom to avoid frequent collisions for OpenRouter's catalogue
# (~300 models) while keeping IDs comfortably within Postgres INTEGER range.
# Collision probing steps downwards one slot at a time, so an ID can land
# slightly below the nominal window when slots are already taken.
_STABLE_ID_HASH_WIDTH = 9_000_000
def _stable_config_id(model_id: str, offset: int, taken: set[int]) -> int:
    """Map ``model_id`` to a deterministic config ID at or below ``offset``.

    The ID is ``offset`` minus a BLAKE2b-derived bucket in
    ``[0, _STABLE_ID_HASH_WIDTH)``, so a given ``model_id`` always starts
    from the same slot no matter which other models are in the catalogue.
    That keeps thread pins valid when models appear, disappear or reorder
    between refreshes, unlike an index-based scheme where one change
    shifts every subsequent ID.

    Args:
        model_id: OpenRouter model identifier to hash.
        offset: Upper bound of the ID window (the largest ID handed out).
        taken: IDs already assigned during this generation pass. The
            returned ID is added to this set before returning.

    Returns:
        The first ID at or below the hashed slot that is not in ``taken``.
    """
    fingerprint = hashlib.blake2b(model_id.encode("utf-8"), digest_size=6)
    bucket = int.from_bytes(fingerprint.digest(), "big") % _STABLE_ID_HASH_WIDTH
    candidate = offset - bucket

    # Linear probe downwards: whoever claimed a slot first keeps it, and
    # only the colliding newcomer is shifted to the next free ID below.
    while True:
        if candidate not in taken:
            taken.add(candidate)
            return candidate
        candidate -= 1
def _openrouter_tier(model: dict) -> str:
    """Classify an OpenRouter model as ``"free"`` or ``"premium"``.

    A model is free if either:

    - its ``id`` is a string ending in ``:free`` (OpenRouter's free-variant
      naming convention), or
    - both ``pricing.prompt`` and ``pricing.completion`` are numerically
      zero. The comparison is numeric rather than textual, so ``"0"``,
      ``"0.0"``, ``"0.000000"`` and a numeric ``0`` / ``0.0`` all count.

    Everything else falls through to ``"premium"`` so users are never
    under-charged: missing or malformed pricing, a non-dict ``pricing``
    value, non-zero prices, and negative sentinels such as ``"-1"``.

    Args:
        model: One raw entry from the OpenRouter models payload.

    Returns:
        ``"free"`` or ``"premium"``.
    """
    # Imported locally so this helper does not depend on a module-level import.
    from decimal import Decimal, InvalidOperation

    model_id = model.get("id")
    # Guard the type: an explicit ``"id": None`` must not raise here.
    if isinstance(model_id, str) and model_id.endswith(":free"):
        return "free"

    pricing = model.get("pricing")
    if not isinstance(pricing, dict):
        return "premium"

    for field in ("prompt", "completion"):
        raw = pricing.get(field)
        # Missing values and booleans are not prices; stay conservative.
        if raw is None or isinstance(raw, bool):
            return "premium"
        try:
            # Decimal is exact, so a tiny non-zero price never rounds to 0
            # the way it could with float parsing. NaN compares unequal to
            # zero and therefore also lands on "premium".
            if Decimal(str(raw).strip()) != 0:
                return "premium"
        except (InvalidOperation, ValueError):
            return "premium"

    return "free"
def _is_text_output_model(model: dict) -> bool:
"""Return True if the model produces text output only (skip image/audio generators)."""
@ -109,24 +160,77 @@ async def _fetch_models_async() -> list[dict] | None:
return None
def _build_free_router_config(settings: dict[str, Any]) -> dict[str, Any]:
    """Assemble the config dict for the virtual ``openrouter/free`` entry.

    The entry presents OpenRouter's Free Models Router as one selectable
    option: LiteLLM forwards ``openrouter/openrouter/free`` and OpenRouter
    chooses a capable free model for each request. Availability varies and
    the account-wide rate limit is about 20 requests per minute.

    Args:
        settings: OpenRouter integration settings. Keys read here are
            ``api_key``, ``free_rpm``, ``free_tpm``,
            ``anonymous_enabled_free``, ``quota_reserve_tokens``,
            ``litellm_params``, ``system_instructions``,
            ``use_default_system_instructions`` and ``citations_enabled``;
            each falls back to a built-in default when absent.

    Returns:
        A config dict with the fixed ID ``_FREE_ROUTER_ID``, billed as
        ``"free"``, excluded from the router pool and tagged with the
        dynamic-entry marker.
    """
    opt = settings.get

    # Who the entry is and how it is reached.
    identity: dict[str, Any] = {
        "id": _FREE_ROUTER_ID,
        "name": "OpenRouter Free (Auto-Select)",
        "description": (
            "OpenRouter picks a capable free model per request. "
            "~20 req/min account-wide; availability varies."
        ),
        "provider": "OPENROUTER",
        "model_name": "openrouter/free",
        "api_key": opt("api_key", ""),
        "api_base": "",
    }

    # Billing, rate limits and exposure flags for the free tier.
    limits: dict[str, Any] = {
        "billing_tier": "free",
        "rpm": opt("free_rpm", 20),
        "tpm": opt("free_tpm", 100_000),
        "anonymous_enabled": opt("anonymous_enabled_free", False),
        "seo_enabled": False,
        "seo_slug": None,
        "quota_reserve_tokens": opt("quota_reserve_tokens", 4000),
    }

    # Prompting behaviour plus pool/marker bookkeeping. ``litellm_params``
    # is copied so the caller's settings dict is never shared.
    behaviour: dict[str, Any] = {
        "litellm_params": dict(opt("litellm_params") or {}),
        "system_instructions": opt("system_instructions", ""),
        "use_default_system_instructions": opt(
            "use_default_system_instructions", True
        ),
        "citations_enabled": opt("citations_enabled", True),
        "router_pool_eligible": False,
        _OPENROUTER_DYNAMIC_MARKER: True,
    }

    return {**identity, **limits, **behaviour}
def _generate_configs(
raw_models: list[dict],
settings: dict[str, Any],
) -> list[dict]:
"""
Convert raw OpenRouter model entries into global LLM config dicts.
"""Convert raw OpenRouter model entries into global LLM config dicts.
Models are sorted by ID for deterministic, stable ID assignment across
restarts and refreshes.
Tier (``billing_tier``) is derived per-model from OpenRouter's own API
signals via ``_openrouter_tier`` — there is no longer a uniform YAML
override. Config IDs are derived via ``_stable_config_id`` so they
survive catalogue churn across refreshes.
Router-pool membership is tier-aware:
- Premium OR models join the LiteLLM router pool (``router_pool_eligible=True``)
so sub-agent ``model="auto"`` flows benefit from load balancing and
failover across the curated YAML configs and the OR premium passthrough.
- Free OR models and the virtual ``openrouter/free`` entry stay excluded
(``router_pool_eligible=False``). LiteLLM Router tracks rate limits per
deployment, but OpenRouter enforces a single global free-tier quota
(~20 RPM + 50-1000 daily requests account-wide across every ``:free``
model), so rotating across many free deployments would only burn the
shared bucket faster. Free OR models remain fully available for user-
facing Auto-mode thread pinning via ``auto_model_pin_service``.
"""
id_offset: int = settings.get("id_offset", -10000)
api_key: str = settings.get("api_key", "")
billing_tier: str = settings.get("billing_tier", "premium")
anonymous_enabled: bool = settings.get("anonymous_enabled", False)
seo_enabled: bool = settings.get("seo_enabled", False)
quota_reserve_tokens: int = settings.get("quota_reserve_tokens", 4000)
rpm: int = settings.get("rpm", 200)
tpm: int = settings.get("tpm", 1000000)
tpm: int = settings.get("tpm", 1_000_000)
free_rpm: int = settings.get("free_rpm", 20)
free_tpm: int = settings.get("free_tpm", 100_000)
anon_paid: bool = settings.get("anonymous_enabled_paid", False)
anon_free: bool = settings.get("anonymous_enabled_free", False)
litellm_params: dict = settings.get("litellm_params") or {}
system_instructions: str = settings.get("system_instructions", "")
use_default: bool = settings.get("use_default_system_instructions", True)
@ -142,19 +246,27 @@ def _generate_configs(
and _is_allowed_model(m)
and "/" in m.get("id", "")
]
text_models.sort(key=lambda m: m["id"])
configs: list[dict] = []
for idx, model in enumerate(text_models):
if settings.get("free_router_enabled", True) and api_key:
configs.append(_build_free_router_config(settings))
taken: set[int] = set()
if configs:
taken.add(_FREE_ROUTER_ID)
for model in text_models:
model_id: str = model["id"]
name: str = model.get("name", model_id)
tier = _openrouter_tier(model)
cfg: dict[str, Any] = {
"id": id_offset - idx,
"id": _stable_config_id(model_id, id_offset, taken),
"name": name,
"description": f"{name} via OpenRouter",
"billing_tier": billing_tier,
"anonymous_enabled": anonymous_enabled,
"billing_tier": tier,
"anonymous_enabled": anon_free if tier == "free" else anon_paid,
"seo_enabled": seo_enabled,
"seo_slug": None,
"quota_reserve_tokens": quota_reserve_tokens,
@ -162,12 +274,18 @@ def _generate_configs(
"model_name": model_id,
"api_key": api_key,
"api_base": "",
"rpm": rpm,
"tpm": tpm,
"rpm": free_rpm if tier == "free" else rpm,
"tpm": free_tpm if tier == "free" else tpm,
"litellm_params": dict(litellm_params),
"system_instructions": system_instructions,
"use_default_system_instructions": use_default,
"citations_enabled": citations_enabled,
# Premium OR deployments join the LiteLLM router pool so sub-agent
# model="auto" flows can load-balance / fail over across them.
# Free OR deployments stay out: OpenRouter's free tier is a single
# account-wide quota, so per-deployment routing can't spread load
# there — it just drains the shared bucket faster.
"router_pool_eligible": tier == "premium",
_OPENROUTER_DYNAMIC_MARKER: True,
}
configs.append(cfg)
@ -220,11 +338,12 @@ class OpenRouterIntegrationService:
self._configs_by_id = {c["id"]: c for c in self._configs}
self._initialized = True
tier_counts = self._tier_counts(self._configs)
logger.info(
"OpenRouter integration: loaded %d models (IDs %d to %d)",
"OpenRouter integration: loaded %d models (free=%d, premium=%d)",
len(self._configs),
self._configs[0]["id"] if self._configs else 0,
self._configs[-1]["id"] if self._configs else 0,
tier_counts["free"],
tier_counts["premium"],
)
return self._configs
@ -254,7 +373,43 @@ class OpenRouterIntegrationService:
self._configs = new_configs
self._configs_by_id = new_by_id
logger.info("OpenRouter refresh: updated to %d models", len(new_configs))
tier_counts = self._tier_counts(new_configs)
logger.info(
"OpenRouter refresh: updated to %d models (free=%d, premium=%d)",
len(new_configs),
tier_counts["free"],
tier_counts["premium"],
)
# Rebuild the LiteLLM router so freshly fetched configs flow through:
# premium OR entries are generated with ``router_pool_eligible=True``
# and free ones with ``False``, and a refresh also needs to pick up
# any static-config edits and reset cached context-window profiles.
try:
from app.config import config as _app_config
from app.services.llm_router_service import LLMRouterService
from app.services.llm_router_service import (
_router_instance_cache as _chat_router_cache,
)
LLMRouterService.rebuild(
_app_config.GLOBAL_LLM_CONFIGS,
getattr(_app_config, "ROUTER_SETTINGS", None),
)
_chat_router_cache.clear()
except Exception as exc:
logger.warning(
"OpenRouter refresh: router rebuild skipped (%s)", exc
)
@staticmethod
def _tier_counts(configs: list[dict]) -> dict[str, int]:
    """Tally how many configs fall into each known billing tier.

    Each config's ``billing_tier`` is stringified and lower-cased before
    matching, so ``"Free"`` and ``"PREMIUM"`` count. Configs with a
    missing or unrecognised tier are ignored.

    Args:
        configs: Config dicts to inspect.

    Returns:
        A dict with exactly the keys ``"free"`` and ``"premium"``.
    """
    tally = dict.fromkeys(("free", "premium"), 0)
    labels = (str(entry.get("billing_tier", "")).lower() for entry in configs)
    for label in labels:
        if label not in tally:
            continue
        tally[label] += 1
    return tally
async def _refresh_loop(self, interval_hours: float) -> None:
interval_sec = interval_hours * 3600