mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
feat(auto_pin): add pure-function quality scoring module
This commit is contained in:
parent
421a4d7d08
commit
d9058b73f5
2 changed files with 724 additions and 0 deletions
382
surfsense_backend/app/services/quality_score.py
Normal file
382
surfsense_backend/app/services/quality_score.py
Normal file
|
|
@ -0,0 +1,382 @@
|
|||
"""Pure-function quality scoring for Auto (Fastest) model selection.
|
||||
|
||||
This module is import-free of any service / request-path dependencies. All
|
||||
numbers are computed once during the OpenRouter refresh tick (or YAML load)
|
||||
and cached on the cfg dict, so the chat hot path only does a precomputed
|
||||
sort and a SHA256 pick.
|
||||
|
||||
Score components (0-100 scale, higher is better):
|
||||
|
||||
* ``static_score_or`` – derived from the bulk ``/api/v1/models`` payload
|
||||
(provider prestige + ``created`` recency + pricing band + context window
|
||||
+ capabilities + narrow tiny/legacy slug penalty).
|
||||
* ``static_score_yaml`` – same shape for hand-curated YAML configs, plus
|
||||
an operator-trust bonus (the operator deliberately picked this model).
|
||||
* ``aggregate_health`` – run on per-model ``/api/v1/models/{id}/endpoints``
|
||||
responses; returns ``(gated, score_or_none)``.
|
||||
|
||||
The blended ``quality_score`` (0.5 * static + 0.5 * health) is computed in
|
||||
:mod:`app.services.openrouter_integration_service` because that's the only
|
||||
caller that sees both halves.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tunables (constants, not flags)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# How many top-ranked cfgs the deterministic spread picks from inside the
# locked tier.
_QUALITY_TOP_K: int = 5

# Hard health gate, in percent: a cfg whose best non-null uptime falls below
# this value is excluded from Auto-mode selection entirely.
_HEALTH_GATE_UPTIME_PCT: float = 90.0

# Weight of the health half in the health/static blend, used when a cfg has
# fresh /endpoints data.
_HEALTH_BLEND_WEIGHT: float = 0.5

# Static bonus granted to YAML cfgs because the operator hand-picked them.
_OPERATOR_TRUST_BONUS: int = 20

# Bounds on the /endpoints fan-out performed per refresh tick.
_HEALTH_ENRICH_TOP_N_PREMIUM: int = 50
_HEALTH_ENRICH_TOP_N_FREE: int = 30
_HEALTH_ENRICH_CONCURRENCY: int = 15
_HEALTH_FETCH_TIMEOUT_SEC: float = 5.0

# When at least this fraction of /endpoints fetches fail in one refresh
# cycle, fall back to the previous cycle's last-good cache instead of
# writing partial / stale health values.
_HEALTH_FAIL_RATIO_FALLBACK: float = 0.25
|
||||
|
||||
# Substrings that mark a model id as tiny or legacy. The list is kept
# deliberately narrow: ``-nano`` / ``-mini`` / ``-lite`` are NOT included
# because modern frontier models ship under those names (``gpt-5-mini``,
# ``gemini-2.5-flash-lite`` etc.) and a blanket penalty would suppress
# high-quality picks.
_TINY_LEGACY_PENALTY_PATTERNS: tuple[str, ...] = (
    # Parameter-count markers (matched only when followed by another "-").
    "-1b-",
    "-1.2b-",
    "-1.5b-",
    "-2b-",
    "-3b-",
    # Specific model-family prefixes.
    "gemma-3n",
    "lfm-",
    # Variant suffixes / routing tags.
    "-base",
    "-distill",
    ":nitro",
    "-preview",
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider prestige tables
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Prestige points keyed by the OpenRouter provider slug (the part of the
# model id before ``/``). Tiers are coarse: frontier labs > strong open /
# fast-moving labs > specialist labs > everything else.
PROVIDER_PRESTIGE_OR: dict[str, int] = {
    # Frontier labs
    **dict.fromkeys(("openai", "anthropic", "google", "x-ai"), 50),
    # Strong open / fast-moving labs
    **dict.fromkeys(
        (
            "deepseek",
            "qwen",
            "meta-llama",
            "mistralai",
            "cohere",
            "nvidia",
            "alibaba",
        ),
        38,
    ),
    # Specialist / regional / strong second-tier
    **dict.fromkeys(
        (
            "microsoft",
            "01-ai",
            "minimax",
            "moonshot",
            "z-ai",
            "nousresearch",
            "ai21",
            "perplexity",
        ),
        28,
    ),
    # Smaller / niche providers
    **dict.fromkeys(
        ("liquid", "cognitivecomputations", "venice", "inflection"),
        18,
    ),
}
|
||||
|
||||
# Prestige points keyed by the YAML ``provider`` field (the upstream API
# shape the operator selected). Keys are upper-case.
PROVIDER_PRESTIGE_YAML: dict[str, int] = {
    **dict.fromkeys(
        (
            "AZURE_OPENAI",
            "OPENAI",
            "ANTHROPIC",
            "GOOGLE",
            "VERTEX_AI",
            "GEMINI",
            "XAI",
        ),
        50,
    ),
    **dict.fromkeys(("MISTRAL", "DEEPSEEK", "COHERE"), 38),
    "GROQ": 30,
    **dict.fromkeys(
        ("TOGETHER_AI", "FIREWORKS_AI", "PERPLEXITY", "MINIMAX", "BEDROCK"),
        28,
    ),
    "OPENROUTER": 25,
    **dict.fromkeys(("OLLAMA", "CUSTOM"), 12),
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pure scoring helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Calibrated against the live /api/v1/models bulk dump. Each entry is
# ``(max_age_days, score)``: a model at most 60 days old scores 20, at most
# 180 days old scores 16, and so on down to 3 for anything up to three
# years old. Older models score 0. Entries must stay sorted by ascending
# cutoff because the first band that covers the age wins.
_RECENCY_BANDS_DAYS: tuple[tuple[int, int], ...] = (
    (60, 20),
    (180, 16),
    (365, 12),
    (540, 9),
    (730, 6),
    (1095, 3),
)


def created_recency_signal(created_ts: int | None, now_ts: int) -> int:
    """Return 0-20 based on how recently the model was published.

    Uses the OpenRouter ``created`` Unix timestamp (or any equivalent for
    YAML cfgs). Models without a usable timestamp get 0 (we don't penalise,
    we just don't reward).

    Args:
        created_ts: Publication time in Unix seconds. ``None``, non-positive
            and uncoercible values (e.g. a non-numeric string, NaN) count as
            "no usable timestamp".
        now_ts: Reference "now" in Unix seconds; non-positive or uncoercible
            values also yield 0.

    Returns:
        The score of the first band in ``_RECENCY_BANDS_DAYS`` whose cutoff
        covers the model's age in whole days, or 0 when no band matches.
    """
    # The value comes straight from an upstream JSON payload, so coerce
    # defensively instead of letting one malformed entry raise.
    try:
        created = int(created_ts)  # type: ignore[arg-type]
        now = int(now_ts)
    except (TypeError, ValueError, OverflowError):
        return 0
    if created <= 0 or now <= 0:
        return 0

    # A timestamp in the future clamps to age 0 and lands in the top band.
    age_days = max(0, (now - created) // 86_400)
    for cutoff, score in _RECENCY_BANDS_DAYS:
        if age_days <= cutoff:
            return score
    return 0
|
||||
|
||||
|
||||
def pricing_band(
    prompt: str | float | int | None,
    completion: str | float | int | None,
) -> int:
    """Return 0-15 based on combined prompt+completion cost per 1M tokens.

    Higher-priced models tend to be the larger / more capable ones. A free
    model returns 0 (we use other signals to rank free-vs-free instead).
    Unusable inputs are treated as 0 rather than raising.

    Args:
        prompt: Per-token prompt price; may be a number or a numeric string.
        completion: Per-token completion price, same conventions.

    Returns:
        15 / 12 / 9 / 6 / 4 for a combined per-million price of at least
        20 / 5 / 1 / 0.3 / 0.05 respectively, 2 for any other positive
        price, and 0 when the combined price is zero.
    """

    def _usable_price(value: object) -> float:
        """Coerce one price to a finite, non-negative float (else 0.0)."""
        try:
            price = float(value)  # type: ignore[arg-type]
        except (TypeError, ValueError, OverflowError):
            return 0.0
        # One range check covers NaN (every comparison is false), negative
        # values (e.g. a sentinel for "variable pricing") and +inf. Each
        # side is cleaned on its own so a bad value on one side cannot
        # cancel out a real price on the other.
        return price if 0.0 <= price < float("inf") else 0.0

    raw_per_million = (_usable_price(prompt) + _usable_price(completion)) * 1_000_000
    # Round away binary floating-point noise so a price sitting exactly on
    # a threshold (e.g. 0.000001 + 0.000004 -> 5.0) lands in its band.
    per_million = round(raw_per_million, 6)

    for floor, score in ((20.0, 15), (5.0, 12), (1.0, 9), (0.3, 6), (0.05, 4)):
        if per_million >= floor:
            return score
    # Positivity is checked on the unrounded value so that a tiny but
    # non-zero price still counts as "paid".
    return 2 if raw_per_million > 0.0 else 0
|
||||
|
||||
|
||||
def context_signal(ctx: int | None) -> int:
    """Return 0-10 based on the model's context window.

    Args:
        ctx: Context window size in tokens. ``None``, non-positive and
            uncoercible values (e.g. a non-numeric string) score 0 instead
            of raising, matching how ``pricing_band`` treats bad input.

    Returns:
        10 / 8 / 6 / 4 / 2 for windows of at least 1M / 400k / 200k / 128k /
        100k tokens respectively, otherwise 0.
    """
    try:
        tokens = int(ctx)  # type: ignore[arg-type]
    except (TypeError, ValueError, OverflowError):
        return 0

    # Ordered from largest floor to smallest; the first floor reached wins.
    # Zero and negative sizes fall through every floor and score 0.
    bands = (
        (1_000_000, 10),
        (400_000, 8),
        (200_000, 6),
        (128_000, 4),
        (100_000, 2),
    )
    for floor, score in bands:
        if tokens >= floor:
            return score
    return 0
|
||||
|
||||
|
||||
def capabilities_signal(supported_parameters: list[str] | None) -> int:
    """Return 0-5 for capabilities that matter for our agent flows.

    Scoring: 2 points for ``tools``, 2 points for either
    ``structured_outputs`` or ``response_format``, and 1 point for either
    ``reasoning`` or ``include_reasoning``. A missing or empty list scores 0.
    """
    advertised = set(supported_parameters or ())
    if not advertised:
        return 0

    # Each group earns its weight once if any of its names is advertised.
    weighted_groups = (
        (("tools",), 2),
        (("structured_outputs", "response_format"), 2),
        (("reasoning", "include_reasoning"), 1),
    )
    total = sum(
        weight
        for names, weight in weighted_groups
        if not advertised.isdisjoint(names)
    )
    return min(total, 5)
|
||||
|
||||
|
||||
def slug_penalty(model_id: str) -> int:
    """Return -10 when the id contains a tiny/legacy marker, otherwise 0.

    Matching is a case-insensitive substring test against
    ``_TINY_LEGACY_PENALTY_PATTERNS``. The penalty is flat: several matches
    still cost only -10. An empty id is never penalised.
    """
    if not model_id:
        return 0
    lowered = model_id.lower()
    is_flagged = any(marker in lowered for marker in _TINY_LEGACY_PENALTY_PATTERNS)
    return -10 if is_flagged else 0
|
||||
|
||||
|
||||
def _provider_prestige_or(model_id: str) -> int:
    """Look up prestige points for the provider slug of an OpenRouter id.

    The slug is the text before the first ``/``, lower-cased. Ids without a
    ``/`` score 0; slugs missing from ``PROVIDER_PRESTIGE_OR`` get a neutral
    15.
    """
    provider, separator, _model = model_id.partition("/")
    if not separator:
        return 0
    return PROVIDER_PRESTIGE_OR.get(provider.lower(), 15)
|
||||
|
||||
|
||||
def static_score_or(or_model: dict, *, now_ts: int) -> int:
    """Score a raw OpenRouter ``/api/v1/models`` entry on a 0-100 scale.

    The score is the sum of provider prestige, ``created`` recency, pricing
    band, context window, capabilities and the (non-positive) slug penalty,
    clamped to ``[0, 100]``.

    Args:
        or_model: One model entry; the keys read are ``id``, ``created``,
            ``pricing`` (``prompt`` / ``completion``), ``context_length``
            and ``supported_parameters``.
        now_ts: Reference "now" in Unix seconds for the recency signal.
    """
    slug = str(or_model.get("id", ""))
    price_info = or_model.get("pricing") or {}

    components = (
        _provider_prestige_or(slug),
        created_recency_signal(or_model.get("created"), now_ts),
        pricing_band(price_info.get("prompt"), price_info.get("completion")),
        context_signal(or_model.get("context_length")),
        capabilities_signal(or_model.get("supported_parameters")),
        slug_penalty(slug),
    )
    total = int(sum(components))
    return min(100, max(0, total))
|
||||
|
||||
|
||||
def static_score_yaml(cfg: dict) -> int:
    """Score a YAML-curated cfg on a 0-100 scale.

    The score is provider prestige (neutral 15 for unknown providers) plus
    ``_OPERATOR_TRUST_BONUS`` (the operator deliberately listed this model),
    plus pricing and context signals, plus the slug penalty on
    ``model_name``, clamped to ``[0, 100]``.

    Pricing and context come from a lazy ``litellm.get_model_info`` lookup
    keyed by ``litellm_params.base_model``, then ``litellm_params.model``,
    then ``model_name``. Any failure in that lookup is swallowed on purpose:
    the cfg simply loses those sub-points.
    """
    provider_key = str(cfg.get("provider", "")).upper()
    prestige = PROVIDER_PRESTIGE_YAML.get(provider_key, 15)

    model_name = cfg.get("model_name") or ""
    params = cfg.get("litellm_params") or {}
    lookup_name = params.get("base_model") or params.get("model") or model_name

    context_tokens = 0
    prompt_cost: float = 0.0
    completion_cost: float = 0.0
    try:
        from litellm import get_model_info  # lazy: avoid cold-import cost

        info = get_model_info(lookup_name) or {}
        context_tokens = int(
            info.get("max_input_tokens") or info.get("max_tokens") or 0
        )
        prompt_cost = float(info.get("input_cost_per_token") or 0.0)
        completion_cost = float(info.get("output_cost_per_token") or 0.0)
    except Exception:
        # Unknown to litellm — that's fine for prestige+operator-bonus weighting.
        pass

    components = (
        prestige,
        _OPERATOR_TRUST_BONUS,
        pricing_band(prompt_cost, completion_cost),
        context_signal(context_tokens),
        slug_penalty(str(model_name)),
    )
    total = int(sum(components))
    return min(100, max(0, total))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Health aggregation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _coerce_pct(value) -> float | None:
    """Normalise one raw uptime reading to a 0-100 percentage.

    OpenRouter reports uptime as a 0-1 fraction; some endpoints surface it
    as a 0-100 percentage. Readings of at most 1.0 are treated as fractions
    and scaled by 100; larger readings are taken as percentages already.

    Args:
        value: Raw reading; may be a number, a numeric string or ``None``.

    Returns:
        The percentage, or ``None`` when the reading is missing, not
        numeric, NaN, negative or above 100.
    """
    if value is None:
        return None
    try:
        reading = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    # NaN fails every comparison, so this single range check rejects NaN,
    # negatives, +inf and any other impossible uptime above 100. Without it
    # a NaN reading would slip past a "< threshold" health gate downstream.
    if not 0.0 <= reading <= 100.0:
        return None
    return reading * 100.0 if reading <= 1.0 else reading
|
||||
|
||||
|
||||
def _best_uptime(endpoints: list[dict]) -> tuple[float | None, str | None]:
    """Return the highest usable uptime across all endpoints.

    Windows are tried in order ``uptime_last_30m``, ``uptime_last_1d``,
    ``uptime_last_5m``. The first window for which at least one endpoint has
    a usable reading is used.

    Returns:
        ``(uptime_pct, window_used)``, or ``(None, None)`` when no endpoint
        has a usable reading in any window.
    """
    for window_key in ("uptime_last_30m", "uptime_last_1d", "uptime_last_5m"):
        coerced = [_coerce_pct(endpoint.get(window_key)) for endpoint in endpoints]
        readings = [pct for pct in coerced if pct is not None]
        if not readings:
            continue
        return max(readings), window_key
    return None, None
|
||||
|
||||
|
||||
def _endpoint_is_ok(endpoint: dict) -> bool:
    """Return True when the endpoint's ``status`` coerces to 0 (OK).

    A missing status defaults to 1 (not OK). A status that cannot be
    coerced to an int counts as not OK instead of raising.
    """
    try:
        return int(endpoint.get("status", 1)) == 0
    except (TypeError, ValueError, OverflowError):
        return False


def aggregate_health(endpoints: list[dict]) -> tuple[bool, float | None]:
    """Aggregate a model's per-endpoint health into ``(gated, score_or_none)``.

    Hard gate (returns ``(True, None)``):
      * ``endpoints`` empty,
      * no endpoint reports ``status == 0`` (OK), or
      * best non-null uptime missing or below ``_HEALTH_GATE_UPTIME_PCT``.

    On a pass, returns ``(False, score)`` where ``score`` is a 0-100 blend:
    50% best uptime, 30% status (always 100 once the gate is passed) and
    20% a freshness term taken from the most recent window with data.

    Args:
        endpoints: Endpoint dicts; the keys read are ``status`` and the
            ``uptime_last_5m`` / ``uptime_last_30m`` / ``uptime_last_1d``
            readings.
    """
    if not endpoints:
        return True, None

    if not any(_endpoint_is_ok(endpoint) for endpoint in endpoints):
        return True, None

    # NOTE(review): the best uptime is taken across ALL endpoints, including
    # ones whose status is not OK — confirm that is intended.
    best_uptime, _ = _best_uptime(endpoints)
    # "not >=" instead of "<" so a NaN reading is gated rather than waved
    # through (every comparison against NaN is false).
    if best_uptime is None or not best_uptime >= _HEALTH_GATE_UPTIME_PCT:
        return True, None

    # Freshness term: prefer 5m, fall through to 30m / 1d if 5m is missing.
    # Defaults to the best uptime if no window yields a reading.
    freshness_term = best_uptime
    for window in ("uptime_last_5m", "uptime_last_30m", "uptime_last_1d"):
        coerced = [_coerce_pct(endpoint.get(window)) for endpoint in endpoints]
        readings = [pct for pct in coerced if pct is not None]
        if readings:
            freshness_term = max(readings)
            break

    # At least one endpoint is OK here, so the status term is a constant 100.
    score = 0.50 * best_uptime + 0.30 * 100.0 + 0.20 * freshness_term
    return False, max(0.0, min(100.0, score))
|
||||
342
surfsense_backend/tests/unit/services/test_quality_score.py
Normal file
342
surfsense_backend/tests/unit/services/test_quality_score.py
Normal file
|
|
@ -0,0 +1,342 @@
|
|||
"""Unit tests for the Auto (Fastest) quality scoring module."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
from app.services.quality_score import (
|
||||
_HEALTH_GATE_UPTIME_PCT,
|
||||
_OPERATOR_TRUST_BONUS,
|
||||
aggregate_health,
|
||||
capabilities_signal,
|
||||
context_signal,
|
||||
created_recency_signal,
|
||||
pricing_band,
|
||||
slug_penalty,
|
||||
static_score_or,
|
||||
static_score_yaml,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# created_recency_signal
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_created_recency_signal_recent_model_scores_high():
    """A model published 30 days ago lands in the top recency band."""
    reference_now = 1_750_000_000  # ~mid-2025
    published = reference_now - 30 * 86_400
    assert created_recency_signal(published, reference_now) == 20
|
||||
|
||||
|
||||
def test_created_recency_signal_old_model_scores_zero():
    """A five-year-old model is past every recency band."""
    reference_now = 1_750_000_000
    published = reference_now - 5 * 365 * 86_400
    assert created_recency_signal(published, reference_now) == 0
|
||||
|
||||
|
||||
def test_created_recency_signal_missing_timestamp_is_neutral():
    """``None`` and ``0`` both count as "no usable timestamp"."""
    reference_now = 1_750_000_000
    for missing in (None, 0):
        assert created_recency_signal(missing, reference_now) == 0
|
||||
|
||||
|
||||
def test_created_recency_signal_monotonic_decay():
    """The recency score never increases as the model gets older."""
    reference_now = 1_750_000_000
    ages_in_days = (30, 120, 300, 500, 700, 1000, 1500)
    observed = []
    for age in ages_in_days:
        observed.append(
            created_recency_signal(reference_now - age * 86_400, reference_now)
        )
    assert observed == sorted(observed, reverse=True)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pricing_band
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pricing_band_free_returns_zero():
    """Zero-cost and missing prices all land in the bottom band."""
    free_inputs = (("0", "0"), (0.0, 0.0), (None, None))
    for prompt_price, completion_price in free_inputs:
        assert pricing_band(prompt_price, completion_price) == 0
|
||||
|
||||
|
||||
def test_pricing_band_handles_unparseable():
    """Values that cannot be coerced to float are scored as free."""
    non_numeric_string = pricing_band("not-a-number", "0")
    wrong_types = pricing_band({}, [])  # type: ignore[arg-type]
    assert non_numeric_string == 0
    assert wrong_types == 0
|
||||
|
||||
|
||||
def test_pricing_band_premium_tiers_increase_with_price():
    """Pricier models land in strictly higher bands."""
    band_cheap = pricing_band("0.0000003", "0.0000005")
    band_mid = pricing_band("0.000003", "0.000015")
    band_flagship = pricing_band("0.00001", "0.00005")
    assert band_cheap > 0
    assert band_cheap < band_mid
    assert band_mid < band_flagship
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# context_signal
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("ctx", "expected"),
    [
        # At or above the 1M floor.
        (1_500_000, 10),
        (1_000_000, 10),
        # Intermediate floors.
        (500_000, 8),
        (200_000, 6),
        (128_000, 4),
        (100_000, 2),
        # Below every floor, or no usable value.
        (50_000, 0),
        (0, 0),
        (None, 0),
    ],
)
def test_context_signal_bands(ctx, expected):
    """Each context size maps to the score of the highest floor it reaches."""
    observed = context_signal(ctx)
    assert observed == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# capabilities_signal
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_capabilities_signal_caps_at_five():
    """Every capability group present yields the maximum of exactly 5.

    ``reasoning`` and ``include_reasoning`` belong to the same group, so
    listing both must not push the score past the cap. Asserting the exact
    value (rather than ``<= 5``) also catches a regression to a lower score.
    """
    everything = ["tools", "structured_outputs", "reasoning", "include_reasoning"]
    assert capabilities_signal(everything) == 5
|
||||
|
||||
|
||||
def test_capabilities_signal_tools_only():
    """Tool support alone is worth 2 points."""
    only_tools = ["tools"]
    assert capabilities_signal(only_tools) == 2
|
||||
|
||||
|
||||
def test_capabilities_signal_empty():
    """A missing or empty parameter list scores 0."""
    for nothing in (None, []):
        assert capabilities_signal(nothing) == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# slug_penalty
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_slug_penalty_demotes_tiny_models():
    """Ids carrying a tiny-model marker receive a negative penalty."""
    tiny_ids = (
        "meta-llama/llama-3.2-1b-instruct",
        "liquid/lfm-7b",
        "google/gemma-3n-e4b-it",
    )
    for model_id in tiny_ids:
        assert slug_penalty(model_id) < 0
|
||||
|
||||
|
||||
def test_slug_penalty_skips_capable_mini_nano_lite_models():
    """Critical Option C+ regression: modern frontier models named
    ``-nano`` / ``-mini`` / ``-lite`` (gpt-5-mini, etc.) must not be
    penalised."""
    capable_ids = (
        "openai/gpt-5-mini",
        "openai/gpt-5-nano",
        "google/gemini-2.5-flash-lite",
        "anthropic/claude-haiku-4.5",
    )
    for model_id in capable_ids:
        assert slug_penalty(model_id) == 0
|
||||
|
||||
|
||||
def test_slug_penalty_demotes_legacy_variants():
    """``-preview`` / ``-base`` / ``-distill`` variants are penalised."""
    legacy_ids = ("openai/o1-preview", "foo/bar-base", "foo/bar-distill")
    for model_id in legacy_ids:
        assert slug_penalty(model_id) < 0
|
||||
|
||||
|
||||
def test_slug_penalty_empty_input():
    """An empty model id is never penalised."""
    no_id = ""
    assert slug_penalty(no_id) == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# static_score_or
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _or_model(
    *,
    model_id: str,
    created: int | None = None,
    prompt: str = "0.000003",
    completion: str = "0.000015",
    context: int = 200_000,
    params: list[str] | None = None,
) -> dict:
    """Build a minimal OpenRouter ``/api/v1/models`` entry for the tests.

    ``params`` defaults to ``["tools"]`` only when it is ``None``; an
    explicit empty list is kept as-is.
    """
    if params is None:
        params = ["tools"]
    entry: dict = {}
    entry["id"] = model_id
    entry["created"] = created
    entry["pricing"] = {"prompt": prompt, "completion": completion}
    entry["context_length"] = context
    entry["supported_parameters"] = params
    return entry
|
||||
|
||||
|
||||
def test_static_score_or_frontier_premium_beats_free_tiny():
    """A recent, paid frontier model outranks an old, free, tiny one."""
    reference_now = 1_750_000_000
    frontier_entry = _or_model(
        model_id="openai/gpt-5",
        created=reference_now - (60 * 86_400),
        prompt="0.000005",
        completion="0.000020",
        context=400_000,
        params=["tools", "structured_outputs", "reasoning"],
    )
    tiny_free_entry = _or_model(
        model_id="meta-llama/llama-3.2-1b-instruct:free",
        created=reference_now - (5 * 365 * 86_400),
        prompt="0",
        completion="0",
        context=128_000,
        params=["tools"],
    )
    frontier_score = static_score_or(frontier_entry, now_ts=reference_now)
    tiny_free_score = static_score_or(tiny_free_entry, now_ts=reference_now)
    assert frontier_score > tiny_free_score
|
||||
|
||||
|
||||
def test_static_score_or_score_is_clamped_0_to_100():
    """The static score always stays inside the 0-100 scale."""
    entry = _or_model(model_id="openai/gpt-4o")
    observed = static_score_or(entry, now_ts=int(time.time()))
    assert observed >= 0
    assert observed <= 100
|
||||
|
||||
|
||||
def test_static_score_or_unknown_provider_is_neutral_not_zero():
    """A provider missing from the prestige table still scores above 0."""
    entry = _or_model(model_id="some-new-lab/some-model")
    observed = static_score_or(entry, now_ts=int(time.time()))
    assert observed > 0
|
||||
|
||||
|
||||
def test_static_score_or_recent_release_beats_year_old_same_provider():
    """With everything else equal, the newer release ranks higher."""
    reference_now = 1_750_000_000
    fresh_entry = _or_model(
        model_id="openai/gpt-5",
        created=reference_now - (60 * 86_400),
    )
    old_entry = _or_model(
        model_id="openai/gpt-4-turbo",
        created=reference_now - (700 * 86_400),
    )
    fresh_score = static_score_or(fresh_entry, now_ts=reference_now)
    old_score = static_score_or(old_entry, now_ts=reference_now)
    assert fresh_score > old_score
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# static_score_yaml
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_static_score_yaml_includes_operator_bonus():
    """A known-provider YAML cfg scores at least the operator bonus."""
    azure_cfg = dict(
        provider="AZURE_OPENAI",
        model_name="gpt-5",
        litellm_params={"base_model": "azure/gpt-5"},
    )
    observed = static_score_yaml(azure_cfg)
    assert observed >= _OPERATOR_TRUST_BONUS
|
||||
|
||||
|
||||
def test_static_score_yaml_unknown_provider_still_carries_bonus():
    """An unrecognised provider does not cost the cfg its operator bonus."""
    unknown_cfg = dict(provider="SOME_NEW_PROVIDER", model_name="weird-model")
    observed = static_score_yaml(unknown_cfg)
    assert observed >= _OPERATOR_TRUST_BONUS
|
||||
|
||||
|
||||
def test_static_score_yaml_clamped_0_to_100():
    """The YAML static score always stays inside the 0-100 scale."""
    azure_cfg = dict(
        provider="AZURE_OPENAI",
        model_name="gpt-5",
        litellm_params={"base_model": "azure/gpt-5"},
    )
    observed = static_score_yaml(azure_cfg)
    assert observed >= 0
    assert observed <= 100
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# aggregate_health
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_aggregate_health_gates_when_uptime_below_threshold():
    """Live data showed Venice-routed cfgs at 53-68%; this guards that the
    90% gate excludes them."""
    first_endpoint = {
        "status": 0,
        "uptime_last_30m": 0.55,
        "uptime_last_1d": 0.60,
        "uptime_last_5m": 0.50,
    }
    second_endpoint = {
        "status": 0,
        "uptime_last_30m": 0.65,
        "uptime_last_1d": 0.68,
        "uptime_last_5m": 0.62,
    }
    is_gated, health = aggregate_health([first_endpoint, second_endpoint])
    assert is_gated is True
    assert health is None
|
||||
|
||||
|
||||
def test_aggregate_health_passes_for_healthy_provider():
    """A single OK endpoint at ~99% uptime passes the gate with a high score."""
    healthy_endpoint = {
        "status": 0,
        "uptime_last_30m": 0.99,
        "uptime_last_1d": 0.995,
        "uptime_last_5m": 0.99,
    }
    is_gated, health = aggregate_health([healthy_endpoint])
    assert is_gated is False
    assert health is not None
    assert health >= _HEALTH_GATE_UPTIME_PCT
|
||||
|
||||
|
||||
def test_aggregate_health_picks_best_endpoint_across_multiple():
    """Multi-endpoint aggregation should reward the best non-null uptime."""
    weak_endpoint = {"status": 0, "uptime_last_30m": 0.55}
    strong_endpoint = {"status": 0, "uptime_last_30m": 0.97}  # passes the gate
    is_gated, health = aggregate_health([weak_endpoint, strong_endpoint])
    assert is_gated is False
    assert health is not None
|
||||
|
||||
|
||||
def test_aggregate_health_empty_endpoints_gated():
    """No endpoints at all means the cfg is gated with no score."""
    outcome = aggregate_health([])
    assert outcome[0] is True
    assert outcome[1] is None
|
||||
|
||||
|
||||
def test_aggregate_health_no_status_zero_gated():
    """Even with high uptime, no OK status means the cfg is broken upstream."""
    not_ok_endpoints = [
        {"status": status_code, "uptime_last_30m": uptime}
        for status_code, uptime in ((1, 0.99), (2, 0.98))
    ]
    is_gated, health = aggregate_health(not_ok_endpoints)
    assert is_gated is True
    assert health is None
|
||||
|
||||
|
||||
def test_aggregate_health_all_uptime_null_gated():
    """An OK endpoint with no uptime readings at all is still gated."""
    silent_endpoint = {
        "status": 0,
        "uptime_last_30m": None,
        "uptime_last_1d": None,
    }
    is_gated, health = aggregate_health([silent_endpoint])
    assert is_gated is True
    assert health is None
|
||||
|
||||
|
||||
def test_aggregate_health_pct_normalisation():
    """OpenRouter returns 0-1 fractions; some endpoints surface 0-100%
    percentages. Both should reach the same gate decision and score."""
    fraction_form = [{"status": 0, "uptime_last_30m": 0.95}]
    pct_form = [{"status": 0, "uptime_last_30m": 95.0}]

    gated_fraction, score_fraction = aggregate_health(fraction_form)
    gated_pct, score_pct = aggregate_health(pct_form)

    # Identity checks pin the exact ``False`` result and make the lint
    # suppression that ``== False`` needed unnecessary.
    assert gated_fraction is False
    assert gated_pct is False
    assert score_fraction is not None
    assert score_pct is not None
    assert abs(score_fraction - score_pct) < 0.5
|
||||
Loading…
Add table
Add a link
Reference in a new issue