diff --git a/router.py b/router.py
index e2ce8d5..a7f6a75 100644
--- a/router.py
+++ b/router.py
@@ -1099,6 +1099,13 @@ async def _make_chat_request(model: str, messages: list, tools=None, stream: boo
         err_detail = err_body.get("error", {}) if isinstance(err_body, dict) else {}
         n_ctx_limit = err_detail.get("n_ctx", 0)
         actual_tokens = err_detail.get("n_prompt_tokens", 0)
+        if not n_ctx_limit:
+            _m = re.search(r"'n_ctx':\s*(\d+)", _e_str)
+            if _m:
+                n_ctx_limit = int(_m.group(1))
+            _m = re.search(r"'n_prompt_tokens':\s*(\d+)", _e_str)
+            if _m:
+                actual_tokens = int(_m.group(1))
         if not n_ctx_limit:
             raise
         msgs_to_trim = params.get("messages", [])
@@ -2005,6 +2012,13 @@ async def chat_proxy(request: Request):
         err_detail = err_body.get("error", {}) if isinstance(err_body, dict) else {}
         n_ctx_limit = err_detail.get("n_ctx", 0)
         actual_tokens = err_detail.get("n_prompt_tokens", 0)
+        if not n_ctx_limit:
+            _m = re.search(r"'n_ctx':\s*(\d+)", _e_str)
+            if _m:
+                n_ctx_limit = int(_m.group(1))
+            _m = re.search(r"'n_prompt_tokens':\s*(\d+)", _e_str)
+            if _m:
+                actual_tokens = int(_m.group(1))
         if not n_ctx_limit:
             await decrement_usage(endpoint, tracking_model)
             raise
@@ -3143,6 +3157,15 @@ async def openai_chat_completions_proxy(request: Request):
         err_detail = err_body.get("error", {}) if isinstance(err_body, dict) else {}
         n_ctx_limit = err_detail.get("n_ctx", 0)
         actual_tokens = err_detail.get("n_prompt_tokens", 0)
+        # Fallback: parse from string if body parsing yielded nothing (SDK may not parse llama-server errors)
+        if not n_ctx_limit:
+            import re as _re
+            _m = _re.search(r"'n_ctx':\s*(\d+)", _e_str)
+            if _m:
+                n_ctx_limit = int(_m.group(1))
+            _m = _re.search(r"'n_prompt_tokens':\s*(\d+)", _e_str)
+            if _m:
+                actual_tokens = int(_m.group(1))
         print(f"[ctx-trim] n_ctx={n_ctx_limit} actual={actual_tokens}", flush=True)
         if not n_ctx_limit:
             await decrement_usage(endpoint, tracking_model)