feat: enhance knowledge base search and document retrieval

- Introduced a check that flags degenerate queries lacking any meaningful search signal, improving search accuracy (see the first sketch after this list).
- Implemented a fallback that browses recent documents when a query is degenerate, so such queries still return relevant results.
- Added a limit on the number of chunks fetched per document, reducing data loaded per search and improving performance.
- Updated the ConnectorService to accept a reusable query embedding, so a query is embedded once rather than once per search operation (see the second sketch below).
- Enhanced the LLM router service to support context-window fallbacks, so requests that exceed a deployment's context window can retry on a larger-context deployment (the diff below).
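The search-side changes are not part of the hunk shown below, so as a rough illustration of the first two bullets: a minimal sketch of degenerate-query detection, assuming a stopword-based heuristic. The function name, stopword set, and threshold are hypothetical stand-ins, not this repo's actual code.

import re

# Hypothetical stopword set; the real service's heuristic is not shown here.
_STOPWORDS = {"the", "a", "an", "and", "or", "of", "show", "me",
              "my", "all", "everything", "latest", "recent"}

def is_degenerate_query(query: str) -> bool:
    """True when a query carries no meaningful search signal:
    empty, punctuation-only, or made up entirely of stopwords."""
    tokens = re.findall(r"[a-z0-9]+", query.lower())
    return all(t in _STOPWORDS for t in tokens)  # vacuously True when empty

for q in ("", "???", "show me everything", "quarterly revenue forecast"):
    print(f"{q!r} -> degenerate={is_degenerate_query(q)}")

A query that trips this check would then be routed to the recent-documents fallback instead of a vector search over a signal-free embedding.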
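Likewise for the third and fourth bullets, a sketch of embedding the query once and reusing it across connectors, with a per-document chunk cap. Connector, embed_query, and MAX_CHUNKS_PER_DOC are invented for illustration; the actual ConnectorService API may differ.

import asyncio

MAX_CHUNKS_PER_DOC = 5  # hypothetical cap on chunks fetched per document

async def embed_query(text: str) -> list[float]:
    # Stand-in embedder; the real service calls an embedding model.
    return [float(len(text))]

class Connector:
    def __init__(self, name: str):
        self.name = name

    async def search(self, query: str, query_embedding: list[float]) -> list[str]:
        # A real connector would run a vector search here, fetching at most
        # MAX_CHUNKS_PER_DOC chunks from any single document.
        return [f"{self.name}: chunk for {query!r}"]

async def search_all(query: str, connectors: list[Connector]) -> list[str]:
    # Embed once, pass the vector to every connector: one embedding call
    # per search instead of one per connector.
    query_embedding = await embed_query(query)
    results: list[str] = []
    for c in connectors:
        results.extend(await c.search(query, query_embedding=query_embedding))
    return results

print(asyncio.run(search_all("revenue forecast", [Connector("docs"), Connector("wiki")])))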
Author: DESKTOP-RTLN3BA\$punk
Date:   2026-02-28 19:40:24 -08:00
parent b08e8da40c
commit 40a091f8cc
7 changed files with 476 additions and 100 deletions

@@ -159,26 +159,95 @@ class LLMRouterService:
         # Merge with provided settings
         final_settings = {**default_settings, **instance._router_settings}
 
+        # Build a "auto-large" fallback group with deployments whose context
+        # window exceeds the smallest deployment. This lets the router
+        # automatically fall back to a bigger-context model when gpt-4o (128K)
+        # hits ContextWindowExceededError.
+        full_model_list, ctx_fallbacks = cls._build_context_fallback_groups(model_list)
+
         try:
-            instance._router = Router(
-                model_list=model_list,
-                routing_strategy=final_settings.get(
+            router_kwargs: dict[str, Any] = {
+                "model_list": full_model_list,
+                "routing_strategy": final_settings.get(
                     "routing_strategy", "usage-based-routing"
                 ),
-                num_retries=final_settings.get("num_retries", 3),
-                allowed_fails=final_settings.get("allowed_fails", 3),
-                cooldown_time=final_settings.get("cooldown_time", 60),
-                set_verbose=False,  # Disable verbose logging in production
-            )
+                "num_retries": final_settings.get("num_retries", 3),
+                "allowed_fails": final_settings.get("allowed_fails", 3),
+                "cooldown_time": final_settings.get("cooldown_time", 60),
+                "set_verbose": False,  # Disable verbose logging in production
+            }
+            if ctx_fallbacks:
+                router_kwargs["context_window_fallbacks"] = ctx_fallbacks
+            instance._router = Router(**router_kwargs)
             instance._initialized = True
             logger.info(
-                f"LLM Router initialized with {len(model_list)} deployments, "
-                f"strategy: {final_settings.get('routing_strategy')}"
+                "LLM Router initialized with %d deployments, "
+                "strategy: %s, context_window_fallbacks: %s",
+                len(model_list),
+                final_settings.get("routing_strategy"),
+                ctx_fallbacks or "none",
             )
         except Exception as e:
             logger.error(f"Failed to initialize LLM Router: {e}")
             instance._router = None
 
+    @classmethod
+    def _build_context_fallback_groups(
+        cls, model_list: list[dict]
+    ) -> tuple[list[dict], list[dict[str, list[str]]] | None]:
+        """Create an ``auto-large`` model group for context-window fallbacks.
+
+        Uses ``litellm.get_model_info`` to discover the context window of each
+        deployment. Deployments whose ``max_input_tokens`` exceeds the smallest
+        window are duplicated into an ``auto-large`` group. The returned
+        fallback config tells the Router: on ``ContextWindowExceededError`` for
+        ``auto``, retry with ``auto-large``.
+
+        Returns:
+            (full_model_list, context_window_fallbacks). ``full_model_list``
+            contains the original entries plus any ``auto-large`` duplicates.
+            ``context_window_fallbacks`` is ``None`` when every deployment has
+            the same context size (no useful fallback).
+        """
+        from litellm import get_model_info
+
+        ctx_map: dict[str, int] = {}
+        for dep in model_list:
+            params = dep.get("litellm_params", {})
+            base_model = params.get("base_model") or params.get("model", "")
+            try:
+                info = get_model_info(base_model)
+                ctx = info.get("max_input_tokens")
+                if isinstance(ctx, int) and ctx > 0:
+                    ctx_map[base_model] = ctx
+            except Exception:
+                continue
+
+        if not ctx_map:
+            return model_list, None
+
+        min_ctx = min(ctx_map.values())
+        large_deployments: list[dict] = []
+        for dep in model_list:
+            params = dep.get("litellm_params", {})
+            base_model = params.get("base_model") or params.get("model", "")
+            if ctx_map.get(base_model, 0) > min_ctx:
+                dup = {**dep, "model_name": "auto-large"}
+                large_deployments.append(dup)
+
+        if not large_deployments:
+            return model_list, None
+
+        logger.info(
+            "Context-window fallback: %d large-context deployments "
+            "(min_ctx=%d) added to 'auto-large' group",
+            len(large_deployments),
+            min_ctx,
+        )
+        return model_list + large_deployments, [{"auto": ["auto-large"]}]
+
     @classmethod
     def _config_to_deployment(cls, config: dict) -> dict | None:
         """