feat: enhance knowledge base search and document retrieval

- Introduced a check that flags degenerate queries lacking any meaningful search signal, improving search accuracy (see the first sketch after this list).
- Implemented a fallback that browses recent documents when a query is degenerate, so such queries still return relevant results.
- Added a limit on the number of chunks fetched per document, reducing data loaded per search and improving performance.
- Updated the ConnectorService to accept a reusable query embedding, so a query is embedded once rather than once per search operation (see the second sketch below).
- Enhanced the LLM router service to support context-window fallbacks, so requests that exceed a deployment's context window can retry on a larger-context deployment (the diff below).
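The search-side changes are not part of the hunk shown below, so as a rough illustration of the first two bullets: a minimal sketch of degenerate-query detection, assuming a stopword-based heuristic. The function name, stopword set, and threshold are hypothetical stand-ins, not this repo's actual code.

import re

# Hypothetical stopword set; the real service's heuristic is not shown here.
_STOPWORDS = {"the", "a", "an", "and", "or", "of", "show", "me",
              "my", "all", "everything", "latest", "recent"}

def is_degenerate_query(query: str) -> bool:
    """True when a query carries no meaningful search signal:
    empty, punctuation-only, or made up entirely of stopwords."""
    tokens = re.findall(r"[a-z0-9]+", query.lower())
    return all(t in _STOPWORDS for t in tokens)  # vacuously True when empty

for q in ("", "???", "show me everything", "quarterly revenue forecast"):
    print(f"{q!r} -> degenerate={is_degenerate_query(q)}")

A query that trips this check would then be routed to the recent-documents fallback instead of a vector search over a signal-free embedding.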
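Likewise for the third and fourth bullets, a sketch of embedding the query once and reusing it across connectors, with a per-document chunk cap. Connector, embed_query, and MAX_CHUNKS_PER_DOC are invented for illustration; the actual ConnectorService API may differ.

import asyncio

MAX_CHUNKS_PER_DOC = 5  # hypothetical cap on chunks fetched per document

async def embed_query(text: str) -> list[float]:
    # Stand-in embedder; the real service calls an embedding model.
    return [float(len(text))]

class Connector:
    def __init__(self, name: str):
        self.name = name

    async def search(self, query: str, query_embedding: list[float]) -> list[str]:
        # A real connector would run a vector search here, fetching at most
        # MAX_CHUNKS_PER_DOC chunks from any single document.
        return [f"{self.name}: chunk for {query!r}"]

async def search_all(query: str, connectors: list[Connector]) -> list[str]:
    # Embed once, pass the vector to every connector: one embedding call
    # per search instead of one per connector.
    query_embedding = await embed_query(query)
    results: list[str] = []
    for c in connectors:
        results.extend(await c.search(query, query_embedding=query_embedding))
    return results

print(asyncio.run(search_all("revenue forecast", [Connector("docs"), Connector("wiki")])))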
Author: DESKTOP-RTLN3BA\$punk
Date:   2026-02-28 19:40:24 -08:00
parent b08e8da40c
commit 40a091f8cc
7 changed files with 476 additions and 100 deletions

@@ -159,26 +159,95 @@ class LLMRouterService:
         # Merge with provided settings
         final_settings = {**default_settings, **instance._router_settings}
 
+        # Build a "auto-large" fallback group with deployments whose context
+        # window exceeds the smallest deployment. This lets the router
+        # automatically fall back to a bigger-context model when gpt-4o (128K)
+        # hits ContextWindowExceededError.
+        full_model_list, ctx_fallbacks = cls._build_context_fallback_groups(model_list)
+
         try:
-            instance._router = Router(
-                model_list=model_list,
-                routing_strategy=final_settings.get(
+            router_kwargs: dict[str, Any] = {
+                "model_list": full_model_list,
+                "routing_strategy": final_settings.get(
                     "routing_strategy", "usage-based-routing"
                 ),
-                num_retries=final_settings.get("num_retries", 3),
-                allowed_fails=final_settings.get("allowed_fails", 3),
-                cooldown_time=final_settings.get("cooldown_time", 60),
-                set_verbose=False,  # Disable verbose logging in production
-            )
+                "num_retries": final_settings.get("num_retries", 3),
+                "allowed_fails": final_settings.get("allowed_fails", 3),
+                "cooldown_time": final_settings.get("cooldown_time", 60),
+                "set_verbose": False,  # Disable verbose logging in production
+            }
+            if ctx_fallbacks:
+                router_kwargs["context_window_fallbacks"] = ctx_fallbacks
+            instance._router = Router(**router_kwargs)
             instance._initialized = True
             logger.info(
-                f"LLM Router initialized with {len(model_list)} deployments, "
-                f"strategy: {final_settings.get('routing_strategy')}"
+                "LLM Router initialized with %d deployments, "
+                "strategy: %s, context_window_fallbacks: %s",
+                len(model_list),
+                final_settings.get("routing_strategy"),
+                ctx_fallbacks or "none",
             )
         except Exception as e:
             logger.error(f"Failed to initialize LLM Router: {e}")
             instance._router = None
 
+    @classmethod
+    def _build_context_fallback_groups(
+        cls, model_list: list[dict]
+    ) -> tuple[list[dict], list[dict[str, list[str]]] | None]:
+        """Create an ``auto-large`` model group for context-window fallbacks.
+
+        Uses ``litellm.get_model_info`` to discover the context window of each
+        deployment. Deployments whose ``max_input_tokens`` exceeds the smallest
+        window are duplicated into an ``auto-large`` group. The returned
+        fallback config tells the Router: on ``ContextWindowExceededError`` for
+        ``auto``, retry with ``auto-large``.
+
+        Returns:
+            (full_model_list, context_window_fallbacks). ``full_model_list``
+            contains the original entries plus any ``auto-large`` duplicates.
+            ``context_window_fallbacks`` is ``None`` when every deployment has
+            the same context size (no useful fallback).
+        """
+        from litellm import get_model_info
+
+        ctx_map: dict[str, int] = {}
+        for dep in model_list:
+            params = dep.get("litellm_params", {})
+            base_model = params.get("base_model") or params.get("model", "")
+            try:
+                info = get_model_info(base_model)
+                ctx = info.get("max_input_tokens")
+                if isinstance(ctx, int) and ctx > 0:
+                    ctx_map[base_model] = ctx
+            except Exception:
+                continue
+
+        if not ctx_map:
+            return model_list, None
+
+        min_ctx = min(ctx_map.values())
+        large_deployments: list[dict] = []
+        for dep in model_list:
+            params = dep.get("litellm_params", {})
+            base_model = params.get("base_model") or params.get("model", "")
+            if ctx_map.get(base_model, 0) > min_ctx:
+                dup = {**dep, "model_name": "auto-large"}
+                large_deployments.append(dup)
+
+        if not large_deployments:
+            return model_list, None
+
+        logger.info(
+            "Context-window fallback: %d large-context deployments "
+            "(min_ctx=%d) added to 'auto-large' group",
+            len(large_deployments),
+            min_ctx,
+        )
+        return model_list + large_deployments, [{"auto": ["auto-large"]}]
+
     @classmethod
     def _config_to_deployment(cls, config: dict) -> dict | None:
         """