feat(error-handling): implement LLM error adaptation and classification for chat streaming

- Introduced LLMErrorCategory and adapt_llm_exception to normalize LLM exceptions.
- Updated llm_retryable_message and llm_permanent_message to utilize the new adaptation logic.
- Enhanced classify_stream_exception to classify provider errors and return user-friendly messages.
- Added tests for error classification and adaptation to ensure robustness.
- Updated frontend error handling to display appropriate messages based on new classifications.
This commit is contained in:
Anish Sarkar 2026-06-12 05:03:14 +05:30
parent 203ef78346
commit 8e8cf96faa
9 changed files with 533 additions and 38 deletions

View file

@ -12,6 +12,7 @@ from app.agents.chat.multi_agent_chat.main_agent.middleware.busy_mutex import (
is_cancel_requested,
)
from app.agents.chat.runtime.errors import BusyError
from app.services.llm_error_adapter import LLMErrorCategory, adapt_llm_exception
TURN_CANCELLING_INITIAL_DELAY_MS = 200
TURN_CANCELLING_BACKOFF_FACTOR = 2
@ -102,6 +103,9 @@ def _extract_provider_error_code(parsed: dict[str, Any] | None) -> int | None:
def is_provider_rate_limited(exc: BaseException) -> bool:
"""Return True if the exception looks like an upstream HTTP 429 / rate limit."""
if adapt_llm_exception(exc).category is LLMErrorCategory.RATE_LIMITED:
return True
raw = str(exc)
lowered = raw.lower()
if "ratelimit" in type(exc).__name__.lower():
@ -131,6 +135,84 @@ def is_provider_rate_limited(exc: BaseException) -> bool:
)
def _provider_error_extra(adapted: Any) -> dict[str, Any] | None:
extra: dict[str, Any] = {"provider_error_category": adapted.category.value}
if adapted.provider_status_code is not None:
extra["provider_status_code"] = adapted.provider_status_code
if adapted.provider_error_type:
extra["provider_error_type"] = adapted.provider_error_type
return extra
def _classify_provider_exception(
exc: Exception,
) -> tuple[
str, str, Literal["info", "warn", "error"], bool, str, dict[str, Any] | None
] | None:
adapted = adapt_llm_exception(exc)
if adapted.category is LLMErrorCategory.RATE_LIMITED:
return (
"rate_limited",
"RATE_LIMITED",
"warn",
True,
"This model is temporarily rate-limited. Please try again in a few seconds or switch models.",
_provider_error_extra(adapted),
)
if adapted.category in {
LLMErrorCategory.AUTH_FAILED,
LLMErrorCategory.PERMISSION_DENIED,
}:
return (
"model_auth_failed",
"MODEL_AUTH_FAILED",
"warn",
True,
"This model's API key is invalid or expired. Switch models, or update the API key.",
_provider_error_extra(adapted),
)
if adapted.category is LLMErrorCategory.MODEL_NOT_FOUND:
return (
"model_not_found",
"MODEL_NOT_FOUND",
"warn",
True,
"The selected model is unavailable or no longer exists. Switch to another model and try again.",
_provider_error_extra(adapted),
)
if adapted.category is LLMErrorCategory.CONTEXT_LIMIT:
return (
"model_context_limit",
"MODEL_CONTEXT_LIMIT",
"warn",
True,
"This request is too large for the selected model. Try a model with a larger context window or reduce the input.",
_provider_error_extra(adapted),
)
if adapted.category in {
LLMErrorCategory.TIMEOUT,
LLMErrorCategory.PROVIDER_UNAVAILABLE,
LLMErrorCategory.BAD_GATEWAY,
LLMErrorCategory.CONNECTION_FAILED,
LLMErrorCategory.SERVER_ERROR,
}:
return (
"model_provider_unavailable",
"MODEL_PROVIDER_UNAVAILABLE",
"warn",
True,
"The selected model provider is temporarily unavailable. Please try again or switch models.",
_provider_error_extra(adapted),
)
return None
def classify_stream_exception(
exc: Exception,
*,
@ -167,15 +249,9 @@ def classify_stream_exception(
None,
)
if is_provider_rate_limited(exc):
return (
"rate_limited",
"RATE_LIMITED",
"warn",
True,
"This model is temporarily rate-limited. Please try again in a few seconds or switch models.",
None,
)
provider_classification = _classify_provider_exception(exc)
if provider_classification is not None:
return provider_classification
return (
"server_error",