feat: add performance logging middleware and enhance performance tracking across services

- Introduced RequestPerfMiddleware to log per-request performance metrics, with a threshold for identifying slow requests.
- Updated various services and retrievers to utilize the new performance logging utility for better tracking of execution times.
- Enhanced existing methods with detailed performance logs for operations such as embedding, searching, and indexing.
- Removed deprecated logging setup in stream_new_chat and replaced it with the new performance logger.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-02-27 16:32:30 -08:00
parent 68bb196d45
commit 664c43ca13
11 changed files with 430 additions and 36 deletions

View file

@ -13,6 +13,7 @@ synchronous ChatLiteLLM-like interface and async methods.
import logging
import re
import time
from typing import Any
from langchain_core.callbacks import CallbackManagerForLLMRun
@ -26,6 +27,8 @@ from litellm.exceptions import (
ContextWindowExceededError,
)
from app.utils.perf import get_perf_logger
logger = logging.getLogger(__name__)
_CONTEXT_OVERFLOW_PATTERNS = re.compile(
@ -410,6 +413,10 @@ class ChatLiteLLMRouter(BaseChatModel):
if not self._router:
raise ValueError("Router not initialized")
perf = get_perf_logger()
t0 = time.perf_counter()
msg_count = len(messages)
# Convert LangChain messages to OpenAI format
formatted_messages = self._convert_messages(messages)
@ -428,12 +435,28 @@ class ChatLiteLLMRouter(BaseChatModel):
**call_kwargs,
)
except ContextWindowExceededError as e:
perf.warning(
"[llm_router] _generate CONTEXT_OVERFLOW msgs=%d in %.3fs",
msg_count, time.perf_counter() - t0,
)
raise ContextOverflowError(str(e)) from e
except LiteLLMBadRequestError as e:
if _is_context_overflow_error(e):
perf.warning(
"[llm_router] _generate CONTEXT_OVERFLOW msgs=%d in %.3fs",
msg_count, time.perf_counter() - t0,
)
raise ContextOverflowError(str(e)) from e
raise
elapsed = time.perf_counter() - t0
perf.info(
"[llm_router] _generate completed msgs=%d tools=%d in %.3fs",
msg_count,
len(self._bound_tools) if self._bound_tools else 0,
elapsed,
)
# Convert response to ChatResult with potential tool calls
message = self._convert_response_to_message(response.choices[0].message)
generation = ChatGeneration(message=message)
@ -453,6 +476,10 @@ class ChatLiteLLMRouter(BaseChatModel):
if not self._router:
raise ValueError("Router not initialized")
perf = get_perf_logger()
t0 = time.perf_counter()
msg_count = len(messages)
# Convert LangChain messages to OpenAI format
formatted_messages = self._convert_messages(messages)
@ -471,12 +498,28 @@ class ChatLiteLLMRouter(BaseChatModel):
**call_kwargs,
)
except ContextWindowExceededError as e:
perf.warning(
"[llm_router] _agenerate CONTEXT_OVERFLOW msgs=%d in %.3fs",
msg_count, time.perf_counter() - t0,
)
raise ContextOverflowError(str(e)) from e
except LiteLLMBadRequestError as e:
if _is_context_overflow_error(e):
perf.warning(
"[llm_router] _agenerate CONTEXT_OVERFLOW msgs=%d in %.3fs",
msg_count, time.perf_counter() - t0,
)
raise ContextOverflowError(str(e)) from e
raise
elapsed = time.perf_counter() - t0
perf.info(
"[llm_router] _agenerate completed msgs=%d tools=%d in %.3fs",
msg_count,
len(self._bound_tools) if self._bound_tools else 0,
elapsed,
)
# Convert response to ChatResult with potential tool calls
message = self._convert_response_to_message(response.choices[0].message)
generation = ChatGeneration(message=message)
@ -541,6 +584,10 @@ class ChatLiteLLMRouter(BaseChatModel):
if not self._router:
raise ValueError("Router not initialized")
perf = get_perf_logger()
t0 = time.perf_counter()
msg_count = len(messages)
formatted_messages = self._convert_messages(messages)
# Add tools if bound
@ -559,20 +606,48 @@ class ChatLiteLLMRouter(BaseChatModel):
**call_kwargs,
)
except ContextWindowExceededError as e:
perf.warning(
"[llm_router] _astream CONTEXT_OVERFLOW msgs=%d in %.3fs",
msg_count, time.perf_counter() - t0,
)
raise ContextOverflowError(str(e)) from e
except LiteLLMBadRequestError as e:
if _is_context_overflow_error(e):
perf.warning(
"[llm_router] _astream CONTEXT_OVERFLOW msgs=%d in %.3fs",
msg_count, time.perf_counter() - t0,
)
raise ContextOverflowError(str(e)) from e
raise
# Yield chunks asynchronously
t_first_chunk = time.perf_counter()
perf.info(
"[llm_router] _astream connection established msgs=%d in %.3fs",
msg_count, t_first_chunk - t0,
)
chunk_count = 0
first_chunk_logged = False
async for chunk in response:
if hasattr(chunk, "choices") and chunk.choices:
delta = chunk.choices[0].delta
chunk_msg = self._convert_delta_to_chunk(delta)
if chunk_msg:
chunk_count += 1
if not first_chunk_logged:
perf.info(
"[llm_router] _astream first chunk in %.3fs (total %.3fs from start)",
time.perf_counter() - t_first_chunk,
time.perf_counter() - t0,
)
first_chunk_logged = True
yield ChatGenerationChunk(message=chunk_msg)
perf.info(
"[llm_router] _astream completed chunks=%d total=%.3fs",
chunk_count, time.perf_counter() - t0,
)
def _convert_messages(self, messages: list[BaseMessage]) -> list[dict]:
"""Convert LangChain messages to OpenAI format."""
from langchain_core.messages import (