diff --git a/surfsense_backend/app/services/quality_score.py b/surfsense_backend/app/services/quality_score.py
new file mode 100644
index 000000000..8f6c75d56
--- /dev/null
+++ b/surfsense_backend/app/services/quality_score.py
@@ -0,0 +1,382 @@
+"""Pure-function quality scoring for Auto (Fastest) model selection.
+
+This module has no imports from any service / request-path dependencies. All
+numbers are computed once during the OpenRouter refresh tick (or YAML load)
+and cached on the cfg dict, so the chat hot path only does a precomputed
+sort and a SHA256 pick.
+
+Score components (0-100 scale, higher is better):
+
+* ``static_score_or`` – derived from the bulk ``/api/v1/models`` payload
+  (provider prestige + ``created`` recency + pricing band + context window
+  + capabilities + narrow tiny/legacy slug penalty).
+* ``static_score_yaml`` – same shape for hand-curated YAML configs, plus
+  an operator-trust bonus (the operator deliberately picked this model).
+* ``aggregate_health`` – run on per-model ``/api/v1/models/{id}/endpoints``
+  responses; returns ``(gated, score_or_none)``.
+
+The blended ``quality_score`` (0.5 * static + 0.5 * health) is computed in
+:mod:`app.services.openrouter_integration_service` because that's the only
+caller that sees both halves.
+"""
+
+from __future__ import annotations
+
+# ---------------------------------------------------------------------------
+# Tunables (constants, not flags)
+# ---------------------------------------------------------------------------
+
+# Top-K size for deterministic spread inside the locked tier.
+_QUALITY_TOP_K: int = 5
+
+# Hard health gate: any cfg whose best non-null uptime is below this %
+# is excluded from Auto-mode selection entirely.
+_HEALTH_GATE_UPTIME_PCT: float = 90.0
+
+# Health/static blend weight when a cfg has fresh /endpoints data.
+_HEALTH_BLEND_WEIGHT: float = 0.5
+
+# Static bonus applied to YAML cfgs because the operator hand-picked them.
+_OPERATOR_TRUST_BONUS: int = 20
+
+# /endpoints fan-out is bounded per refresh tick.
+_HEALTH_ENRICH_TOP_N_PREMIUM: int = 50
+_HEALTH_ENRICH_TOP_N_FREE: int = 30
+_HEALTH_ENRICH_CONCURRENCY: int = 15
+_HEALTH_FETCH_TIMEOUT_SEC: float = 5.0
+
+# If at least this fraction of /endpoints fetches fail in a refresh cycle,
+# fall back to the previous cycle's last-good cache instead of writing
+# partial / stale health values.
+_HEALTH_FAIL_RATIO_FALLBACK: float = 0.25
+
+# Narrow tiny/legacy slug penalties only. We deliberately do NOT penalise
+# ``-nano`` / ``-mini`` / ``-lite`` because modern frontier models ship with
+# those naming patterns (``gpt-5-mini``, ``gemini-2.5-flash-lite`` etc.) and
+# blanket-penalising them suppresses high-quality picks.
+_TINY_LEGACY_PENALTY_PATTERNS: tuple[str, ...] = (
+    "-1b-",
+    "-1.2b-",
+    "-1.5b-",
+    "-2b-",
+    "-3b-",
+    "gemma-3n",
+    "lfm-",
+    "-base",
+    "-distill",
+    ":nitro",
+    "-preview",
+)
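+
+# A minimal sketch (assumed caller behaviour, not implemented in this module)
+# of the "precomputed sort and SHA256 pick" described in the module docstring;
+# ``cfgs`` and ``session_id`` are hypothetical names:
+#
+#     import hashlib
+#     top = sorted(cfgs, key=lambda c: c["quality_score"], reverse=True)
+#     top = top[:_QUALITY_TOP_K]
+#     idx = int(hashlib.sha256(session_id.encode()).hexdigest(), 16) % len(top)
+#     picked = top[idx]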
+
+
+# ---------------------------------------------------------------------------
+# Provider prestige tables
+# ---------------------------------------------------------------------------
+
+# OpenRouter-side provider slug (the prefix before ``/`` in the model id).
+# Tiers are coarse: frontier labs > strong open / fast-moving labs >
+# specialist labs > everything else.
+PROVIDER_PRESTIGE_OR: dict[str, int] = {
+    # Frontier labs
+    "openai": 50,
+    "anthropic": 50,
+    "google": 50,
+    "x-ai": 50,
+    # Strong open / fast-moving labs
+    "deepseek": 38,
+    "qwen": 38,
+    "meta-llama": 38,
+    "mistralai": 38,
+    "cohere": 38,
+    "nvidia": 38,
+    "alibaba": 38,
+    # Specialist / regional / strong second-tier
+    "microsoft": 28,
+    "01-ai": 28,
+    "minimax": 28,
+    "moonshot": 28,
+    "z-ai": 28,
+    "nousresearch": 28,
+    "ai21": 28,
+    "perplexity": 28,
+    # Smaller / niche providers
+    "liquid": 18,
+    "cognitivecomputations": 18,
+    "venice": 18,
+    "inflection": 18,
+}
+
+# YAML provider field (the upstream API shape the operator selected).
+PROVIDER_PRESTIGE_YAML: dict[str, int] = {
+    "AZURE_OPENAI": 50,
+    "OPENAI": 50,
+    "ANTHROPIC": 50,
+    "GOOGLE": 50,
+    "VERTEX_AI": 50,
+    "GEMINI": 50,
+    "XAI": 50,
+    "MISTRAL": 38,
+    "DEEPSEEK": 38,
+    "COHERE": 38,
+    "GROQ": 30,
+    "TOGETHER_AI": 28,
+    "FIREWORKS_AI": 28,
+    "PERPLEXITY": 28,
+    "MINIMAX": 28,
+    "BEDROCK": 28,
+    "OPENROUTER": 25,
+    "OLLAMA": 12,
+    "CUSTOM": 12,
+}
+
+
+# ---------------------------------------------------------------------------
+# Pure scoring helpers
+# ---------------------------------------------------------------------------
+
+# Calibrated against the live /api/v1/models bulk dump. Frontier models
+# released in the last ~6 months (GPT-5 family, Claude 4.x, Gemini 2.5,
+# Grok 4) score in the 18-20 band; mid-2024 models in the 8-12 band;
+# anything older trails off.
+_RECENCY_BANDS_DAYS: tuple[tuple[int, int], ...] = (
+    (60, 20),
+    (180, 16),
+    (365, 12),
+    (540, 9),
+    (730, 6),
+    (1095, 3),
+)
+
+
+def created_recency_signal(created_ts: int | None, now_ts: int) -> int:
+    """Return 0-20 based on how recently the model was published.
+
+    Uses the OpenRouter ``created`` Unix timestamp (or any equivalent for
+    YAML cfgs). Models without a usable timestamp get 0 (we don't penalise,
+    we just don't reward).
+    """
+    if created_ts is None or created_ts <= 0 or now_ts <= 0:
+        return 0
+    age_days = max(0, (now_ts - int(created_ts)) // 86_400)
+    for cutoff, score in _RECENCY_BANDS_DAYS:
+        if age_days <= cutoff:
+            return score
+    return 0
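+
+# Worked example (hypothetical timestamps): a model published 90 days ago
+# falls through the 60-day band into the 180-day band:
+#
+#     >>> now = 1_750_000_000
+#     >>> created_recency_signal(now - 90 * 86_400, now)
+#     16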
+ """ + + def _to_float(value) -> float: + if value is None: + return 0.0 + try: + return float(value) + except (TypeError, ValueError): + return 0.0 + + p = _to_float(prompt) + c = _to_float(completion) + total_per_million = (p + c) * 1_000_000 + + if total_per_million >= 20.0: + return 15 + if total_per_million >= 5.0: + return 12 + if total_per_million >= 1.0: + return 9 + if total_per_million >= 0.3: + return 6 + if total_per_million >= 0.05: + return 4 + if total_per_million > 0.0: + return 2 + return 0 + + +def context_signal(ctx: int | None) -> int: + """Return 0-10 based on the model's context window.""" + if not ctx or ctx <= 0: + return 0 + if ctx >= 1_000_000: + return 10 + if ctx >= 400_000: + return 8 + if ctx >= 200_000: + return 6 + if ctx >= 128_000: + return 4 + if ctx >= 100_000: + return 2 + return 0 + + +def capabilities_signal(supported_parameters: list[str] | None) -> int: + """Return 0-5 for capabilities that matter for our agent flows.""" + if not supported_parameters: + return 0 + params = set(supported_parameters) + score = 0 + if "tools" in params: + score += 2 + if "structured_outputs" in params or "response_format" in params: + score += 2 + if "reasoning" in params or "include_reasoning" in params: + score += 1 + return min(score, 5) + + +def slug_penalty(model_id: str) -> int: + """Return a non-positive number; matches the narrow tiny/legacy patterns.""" + if not model_id: + return 0 + needle = model_id.lower() + for pattern in _TINY_LEGACY_PENALTY_PATTERNS: + if pattern in needle: + return -10 + return 0 + + +def _provider_prestige_or(model_id: str) -> int: + if "/" not in model_id: + return 0 + slug = model_id.split("/", 1)[0].lower() + return PROVIDER_PRESTIGE_OR.get(slug, 15) + + +def static_score_or(or_model: dict, *, now_ts: int) -> int: + """Score a raw OpenRouter ``/api/v1/models`` entry on a 0-100 scale.""" + model_id = str(or_model.get("id", "")) + pricing = or_model.get("pricing") or {} + + score = ( + _provider_prestige_or(model_id) + + created_recency_signal(or_model.get("created"), now_ts) + + pricing_band(pricing.get("prompt"), pricing.get("completion")) + + context_signal(or_model.get("context_length")) + + capabilities_signal(or_model.get("supported_parameters")) + + slug_penalty(model_id) + ) + return max(0, min(100, int(score))) + + +def static_score_yaml(cfg: dict) -> int: + """Score a YAML-curated cfg on a 0-100 scale. + + Includes ``_OPERATOR_TRUST_BONUS`` because the operator deliberately + listed this model. Pricing / context fall through to lazy ``litellm`` + lookups; failures are silent (we just lose those sub-points). + """ + provider = str(cfg.get("provider", "")).upper() + base = PROVIDER_PRESTIGE_YAML.get(provider, 15) + + model_name = cfg.get("model_name") or "" + litellm_params = cfg.get("litellm_params") or {} + lookup_name = ( + litellm_params.get("base_model") + or litellm_params.get("model") + or model_name + ) + + ctx = 0 + p_cost: float = 0.0 + c_cost: float = 0.0 + try: + from litellm import get_model_info # lazy: avoid cold-import cost + + info = get_model_info(lookup_name) or {} + ctx = int(info.get("max_input_tokens") or info.get("max_tokens") or 0) + p_cost = float(info.get("input_cost_per_token") or 0.0) + c_cost = float(info.get("output_cost_per_token") or 0.0) + except Exception: + # Unknown to litellm — that's fine for prestige+operator-bonus weighting. 
+
+
+def static_score_yaml(cfg: dict) -> int:
+    """Score a YAML-curated cfg on a 0-100 scale.
+
+    Includes ``_OPERATOR_TRUST_BONUS`` because the operator deliberately
+    listed this model. Pricing / context fall through to lazy ``litellm``
+    lookups; failures are silent (we just lose those sub-points).
+    """
+    provider = str(cfg.get("provider", "")).upper()
+    base = PROVIDER_PRESTIGE_YAML.get(provider, 15)
+
+    model_name = cfg.get("model_name") or ""
+    litellm_params = cfg.get("litellm_params") or {}
+    lookup_name = (
+        litellm_params.get("base_model")
+        or litellm_params.get("model")
+        or model_name
+    )
+
+    ctx = 0
+    p_cost: float = 0.0
+    c_cost: float = 0.0
+    try:
+        from litellm import get_model_info  # lazy: avoid cold-import cost
+
+        info = get_model_info(lookup_name) or {}
+        ctx = int(info.get("max_input_tokens") or info.get("max_tokens") or 0)
+        p_cost = float(info.get("input_cost_per_token") or 0.0)
+        c_cost = float(info.get("output_cost_per_token") or 0.0)
+    except Exception:
+        # Unknown to litellm — that's fine for prestige+operator-bonus weighting.
+        pass
+
+    score = (
+        base
+        + _OPERATOR_TRUST_BONUS
+        + pricing_band(p_cost, c_cost)
+        + context_signal(ctx)
+        + slug_penalty(str(model_name))
+    )
+    return max(0, min(100, int(score)))
+
+
+# ---------------------------------------------------------------------------
+# Health aggregation
+# ---------------------------------------------------------------------------
+
+
+def _coerce_pct(value) -> float | None:
+    try:
+        if value is None:
+            return None
+        f = float(value)
+    except (TypeError, ValueError):
+        return None
+    if f < 0:
+        return None
+    # OpenRouter reports uptime as a 0-1 fraction; some endpoints surface it
+    # as a 0-100 percentage. Normalise.
+    return f * 100.0 if f <= 1.0 else f
+
+
+def _best_uptime(endpoints: list[dict]) -> tuple[float | None, str | None]:
+    """Pick the best (highest) non-null uptime across all endpoints.
+
+    Window preference: ``uptime_last_30m`` > ``uptime_last_1d`` >
+    ``uptime_last_5m``. Returns ``(uptime_pct, window_used)``.
+    """
+    for window in ("uptime_last_30m", "uptime_last_1d", "uptime_last_5m"):
+        values = [_coerce_pct(ep.get(window)) for ep in endpoints]
+        values = [v for v in values if v is not None]
+        if values:
+            return max(values), window
+    return None, None
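+
+# Example of the normalisation and the best-endpoint pick:
+#
+#     >>> _coerce_pct(0.97)
+#     97.0
+#     >>> _coerce_pct(97.0)
+#     97.0
+#     >>> _best_uptime([{"uptime_last_30m": 0.55}, {"uptime_last_30m": 0.97}])
+#     (97.0, 'uptime_last_30m')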
+
+
+def aggregate_health(endpoints: list[dict]) -> tuple[bool, float | None]:
+    """Aggregate a model's per-endpoint health into ``(gated, score_or_none)``.
+
+    Hard gate (returns ``(True, None)``):
+    * ``endpoints`` empty,
+    * no endpoint reports ``status == 0`` (OK), or
+    * best non-null uptime below ``_HEALTH_GATE_UPTIME_PCT``.
+
+    On a pass, returns a 0-100 health score blending uptime, status, and a
+    freshness-weighted recent uptime sample.
+    """
+    if not endpoints:
+        return True, None
+
+    any_ok = any(int(ep.get("status", 1)) == 0 for ep in endpoints)
+    if not any_ok:
+        return True, None
+
+    best_uptime, _ = _best_uptime(endpoints)
+    if best_uptime is None or best_uptime < _HEALTH_GATE_UPTIME_PCT:
+        return True, None
+
+    # Freshness term: prefer 5m, fall through to 30m / 1d if 5m is missing.
+    freshness = None
+    for window in ("uptime_last_5m", "uptime_last_30m", "uptime_last_1d"):
+        values = [_coerce_pct(ep.get(window)) for ep in endpoints]
+        values = [v for v in values if v is not None]
+        if values:
+            freshness = max(values)
+            break
+
+    uptime_term = best_uptime
+    status_term = 100.0 if any_ok else 0.0
+    freshness_term = freshness if freshness is not None else best_uptime
+
+    score = 0.50 * uptime_term + 0.30 * status_term + 0.20 * freshness_term
+    return False, max(0.0, min(100.0, score))
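+
+# Downstream blend sketch (lives in openrouter_integration_service; shown
+# here for orientation only, variable names assumed): with fresh /endpoints
+# data,
+#
+#     gated, health = aggregate_health(endpoints)
+#     if not gated and health is not None:
+#         quality = (
+#             (1 - _HEALTH_BLEND_WEIGHT) * static
+#             + _HEALTH_BLEND_WEIGHT * health
+#         )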
diff --git a/surfsense_backend/tests/unit/services/test_quality_score.py b/surfsense_backend/tests/unit/services/test_quality_score.py
new file mode 100644
index 000000000..fbc91521d
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_quality_score.py
@@ -0,0 +1,342 @@
+"""Unit tests for the Auto (Fastest) quality scoring module."""
+
+from __future__ import annotations
+
+import time
+
+import pytest
+
+from app.services.quality_score import (
+    _HEALTH_GATE_UPTIME_PCT,
+    _OPERATOR_TRUST_BONUS,
+    aggregate_health,
+    capabilities_signal,
+    context_signal,
+    created_recency_signal,
+    pricing_band,
+    slug_penalty,
+    static_score_or,
+    static_score_yaml,
+)
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# created_recency_signal
+# ---------------------------------------------------------------------------
+
+
+def test_created_recency_signal_recent_model_scores_high():
+    now = 1_750_000_000  # ~mid-2025
+    one_month_ago = now - (30 * 86_400)
+    assert created_recency_signal(one_month_ago, now) == 20
+
+
+def test_created_recency_signal_old_model_scores_zero():
+    now = 1_750_000_000
+    five_years_ago = now - (5 * 365 * 86_400)
+    assert created_recency_signal(five_years_ago, now) == 0
+
+
+def test_created_recency_signal_missing_timestamp_is_neutral():
+    now = 1_750_000_000
+    assert created_recency_signal(None, now) == 0
+    assert created_recency_signal(0, now) == 0
+
+
+def test_created_recency_signal_monotonic_decay():
+    now = 1_750_000_000
+    scores = [
+        created_recency_signal(now - days * 86_400, now)
+        for days in (30, 120, 300, 500, 700, 1000, 1500)
+    ]
+    assert scores == sorted(scores, reverse=True)
+
+
+# ---------------------------------------------------------------------------
+# pricing_band
+# ---------------------------------------------------------------------------
+
+
+def test_pricing_band_free_returns_zero():
+    assert pricing_band("0", "0") == 0
+    assert pricing_band(0.0, 0.0) == 0
+    assert pricing_band(None, None) == 0
+
+
+def test_pricing_band_handles_unparseable():
+    assert pricing_band("not-a-number", "0") == 0
+    assert pricing_band({}, []) == 0  # type: ignore[arg-type]
+
+
+def test_pricing_band_premium_tiers_increase_with_price():
+    cheap = pricing_band("0.0000003", "0.0000005")
+    mid = pricing_band("0.000003", "0.000015")
+    flagship = pricing_band("0.00001", "0.00005")
+    assert 0 < cheap < mid < flagship
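+
+# The per-1M arithmetic behind those fixtures: "0.0000003" + "0.0000005"
+# per token is $0.80 per 1M tokens (band 6); "0.000003" + "0.000015" is
+# $18/1M (band 12); "0.00001" + "0.00005" is $60/1M (band 15).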
+
+
+# ---------------------------------------------------------------------------
+# context_signal
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "ctx,expected",
+    [
+        (1_500_000, 10),
+        (1_000_000, 10),
+        (500_000, 8),
+        (200_000, 6),
+        (128_000, 4),
+        (100_000, 2),
+        (50_000, 0),
+        (0, 0),
+        (None, 0),
+    ],
+)
+def test_context_signal_bands(ctx, expected):
+    assert context_signal(ctx) == expected
+
+
+# ---------------------------------------------------------------------------
+# capabilities_signal
+# ---------------------------------------------------------------------------
+
+
+def test_capabilities_signal_caps_at_five():
+    assert capabilities_signal(
+        ["tools", "structured_outputs", "reasoning", "include_reasoning"]
+    ) <= 5
+
+
+def test_capabilities_signal_tools_only():
+    assert capabilities_signal(["tools"]) == 2
+
+
+def test_capabilities_signal_empty():
+    assert capabilities_signal(None) == 0
+    assert capabilities_signal([]) == 0
+
+
+# ---------------------------------------------------------------------------
+# slug_penalty
+# ---------------------------------------------------------------------------
+
+
+def test_slug_penalty_demotes_tiny_models():
+    assert slug_penalty("meta-llama/llama-3.2-1b-instruct") < 0
+    assert slug_penalty("liquid/lfm-7b") < 0
+    assert slug_penalty("google/gemma-3n-e4b-it") < 0
+
+
+def test_slug_penalty_skips_capable_mini_nano_lite_models():
+    """Critical Option C+ regression: don't penalise modern frontier
+    models named ``-nano`` / ``-mini`` / ``-lite`` (gpt-5-mini, etc.)."""
+    assert slug_penalty("openai/gpt-5-mini") == 0
+    assert slug_penalty("openai/gpt-5-nano") == 0
+    assert slug_penalty("google/gemini-2.5-flash-lite") == 0
+    assert slug_penalty("anthropic/claude-haiku-4.5") == 0
+
+
+def test_slug_penalty_demotes_legacy_variants():
+    assert slug_penalty("openai/o1-preview") < 0
+    assert slug_penalty("foo/bar-base") < 0
+    assert slug_penalty("foo/bar-distill") < 0
+
+
+def test_slug_penalty_empty_input():
+    assert slug_penalty("") == 0
+
+
+# ---------------------------------------------------------------------------
+# static_score_or
+# ---------------------------------------------------------------------------
+
+
+def _or_model(
+    *,
+    model_id: str,
+    created: int | None = None,
+    prompt: str = "0.000003",
+    completion: str = "0.000015",
+    context: int = 200_000,
+    params: list[str] | None = None,
+) -> dict:
+    return {
+        "id": model_id,
+        "created": created,
+        "pricing": {"prompt": prompt, "completion": completion},
+        "context_length": context,
+        "supported_parameters": params if params is not None else ["tools"],
+    }
+
+
+def test_static_score_or_frontier_premium_beats_free_tiny():
+    now = 1_750_000_000
+    frontier = _or_model(
+        model_id="openai/gpt-5",
+        created=now - (60 * 86_400),
+        prompt="0.000005",
+        completion="0.000020",
+        context=400_000,
+        params=["tools", "structured_outputs", "reasoning"],
+    )
+    tiny_free = _or_model(
+        model_id="meta-llama/llama-3.2-1b-instruct:free",
+        created=now - (5 * 365 * 86_400),
+        prompt="0",
+        completion="0",
+        context=128_000,
+        params=["tools"],
+    )
+    assert static_score_or(frontier, now_ts=now) > static_score_or(
+        tiny_free, now_ts=now
+    )
+
+
+def test_static_score_or_score_is_clamped_0_to_100():
+    now = int(time.time())
+    score = static_score_or(_or_model(model_id="openai/gpt-4o"), now_ts=now)
+    assert 0 <= score <= 100
+
+
+def test_static_score_or_unknown_provider_is_neutral_not_zero():
+    now = int(time.time())
+    score = static_score_or(
+        _or_model(model_id="some-new-lab/some-model"),
+        now_ts=now,
+    )
+    assert score > 0
+
+
+def test_static_score_or_recent_release_beats_year_old_same_provider():
+    now = 1_750_000_000
+    fresh = _or_model(model_id="openai/gpt-5", created=now - (60 * 86_400))
+    old = _or_model(model_id="openai/gpt-4-turbo", created=now - (700 * 86_400))
+    assert static_score_or(fresh, now_ts=now) > static_score_or(old, now_ts=now)
+
+
+# ---------------------------------------------------------------------------
+# static_score_yaml
+# ---------------------------------------------------------------------------
+
+
+def test_static_score_yaml_includes_operator_bonus():
+    cfg = {
+        "provider": "AZURE_OPENAI",
+        "model_name": "gpt-5",
+        "litellm_params": {"base_model": "azure/gpt-5"},
+    }
+    score = static_score_yaml(cfg)
+    assert score >= _OPERATOR_TRUST_BONUS
+
+
+def test_static_score_yaml_unknown_provider_still_carries_bonus():
+    cfg = {
+        "provider": "SOME_NEW_PROVIDER",
+        "model_name": "weird-model",
+    }
+    score = static_score_yaml(cfg)
+    assert score >= _OPERATOR_TRUST_BONUS
+
+
+def test_static_score_yaml_clamped_0_to_100():
+    cfg = {
+        "provider": "AZURE_OPENAI",
+        "model_name": "gpt-5",
+        "litellm_params": {"base_model": "azure/gpt-5"},
+    }
+    assert 0 <= static_score_yaml(cfg) <= 100
+
+
+# ---------------------------------------------------------------------------
+# aggregate_health
+# ---------------------------------------------------------------------------
+
+
+def test_aggregate_health_gates_when_uptime_below_threshold():
+    """Live data showed Venice-routed cfgs at 53-68%; this guards that the
+    90% gate excludes them."""
+    venice_endpoints = [
+        {
+            "status": 0,
+            "uptime_last_30m": 0.55,
+            "uptime_last_1d": 0.60,
+            "uptime_last_5m": 0.50,
+        },
+        {
+            "status": 0,
+            "uptime_last_30m": 0.65,
+            "uptime_last_1d": 0.68,
+            "uptime_last_5m": 0.62,
+        },
+    ]
+    gated, score = aggregate_health(venice_endpoints)
+    assert gated is True
+    assert score is None
+
+
+def test_aggregate_health_passes_for_healthy_provider():
+    healthy = [
+        {
+            "status": 0,
+            "uptime_last_30m": 0.99,
+            "uptime_last_1d": 0.995,
+            "uptime_last_5m": 0.99,
+        },
+    ]
+    gated, score = aggregate_health(healthy)
+    assert gated is False
+    assert score is not None
+    assert score >= _HEALTH_GATE_UPTIME_PCT
+
+
+def test_aggregate_health_picks_best_endpoint_across_multiple():
+    """Multi-endpoint aggregation should reward the best non-null uptime."""
+    mixed = [
+        {"status": 0, "uptime_last_30m": 0.55},
+        {"status": 0, "uptime_last_30m": 0.97},  # this one passes the gate
+    ]
+    gated, score = aggregate_health(mixed)
+    assert gated is False
+    assert score is not None
+
+
+def test_aggregate_health_empty_endpoints_gated():
+    gated, score = aggregate_health([])
+    assert gated is True
+    assert score is None
+
+
+def test_aggregate_health_no_status_zero_gated():
+    """Even with high uptime, no OK status means the cfg is broken upstream."""
+    endpoints = [
+        {"status": 1, "uptime_last_30m": 0.99},
+        {"status": 2, "uptime_last_30m": 0.98},
+    ]
+    gated, score = aggregate_health(endpoints)
+    assert gated is True
+    assert score is None
+
+
+def test_aggregate_health_all_uptime_null_gated():
+    endpoints = [
+        {"status": 0, "uptime_last_30m": None, "uptime_last_1d": None},
+    ]
+    gated, score = aggregate_health(endpoints)
+    assert gated is True
+    assert score is None
+
+
+def test_aggregate_health_pct_normalisation():
+    """OpenRouter returns 0-1 fractions; some endpoints surface 0-100%
+    percentages. Both should reach the same gate decision."""
+    fraction_form = [{"status": 0, "uptime_last_30m": 0.95}]
+    pct_form = [{"status": 0, "uptime_last_30m": 95.0}]
+    g1, s1 = aggregate_health(fraction_form)
+    g2, s2 = aggregate_health(pct_form)
+    assert g1 == g2 == False  # noqa: E712
+    assert s1 is not None and s2 is not None
+    assert abs(s1 - s2) < 0.5
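+
+
+def test_aggregate_health_score_clamped_0_to_100():
+    """Illustrative sketch: the blended health score stays in the 0-100
+    range even with every term maxed (perfect uptime, OK status, perfect
+    freshness)."""
+    perfect = [{"status": 0, "uptime_last_5m": 1.0, "uptime_last_30m": 1.0}]
+    gated, score = aggregate_health(perfect)
+    assert gated is False
+    assert score is not None
+    assert 0.0 <= score <= 100.0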