diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py index fcdb1c61e..27cee8b37 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py +++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py @@ -6,6 +6,7 @@ from langchain_core.language_models import BaseChatModel from app.agents.new_chat.filesystem_selection import FilesystemMode from app.agents.new_chat.middleware import KnowledgePriorityMiddleware +from app.services.llm_service import get_planner_llm def build_knowledge_priority_mw( @@ -19,6 +20,7 @@ def build_knowledge_priority_mw( ) -> KnowledgePriorityMiddleware: return KnowledgePriorityMiddleware( llm=llm, + planner_llm=get_planner_llm(), search_space_id=search_space_id, filesystem_mode=filesystem_mode, available_connectors=available_connectors, diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py index 605c31416..f8db333ba 100644 --- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py +++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py @@ -102,6 +102,7 @@ from app.agents.new_chat.tools.registry import ( ) from app.db import ChatVisibility from app.services.connector_service import ConnectorService +from app.services.llm_service import get_planner_llm from app.utils.perf import get_perf_logger _perf_log = get_perf_logger() @@ -1077,6 +1078,7 @@ def _build_compiled_agent_blocking( else None, KnowledgePriorityMiddleware( llm=llm, + planner_llm=get_planner_llm(), search_space_id=search_space_id, filesystem_mode=filesystem_mode, available_connectors=available_connectors, diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py index 98bbf3bd7..77b413940 
100644 --- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py +++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py @@ -579,6 +579,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] self, *, llm: BaseChatModel | None = None, + planner_llm: BaseChatModel | None = None, search_space_id: int, filesystem_mode: FilesystemMode = FilesystemMode.CLOUD, available_connectors: list[str] | None = None, @@ -588,6 +589,15 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] inject_system_message: bool = True, # For backwards compatibility ) -> None: self.llm = llm + # The planner LLM handles short, structured internal tasks (query + # rewriting, date extraction, recency classification). When an + # operator marks a global config ``is_planner: true`` we route + # those calls to a cheap/fast model (e.g. gpt-4o-mini, Haiku, Azure + # gpt-5.x-nano) instead of the user's chat LLM — those classification + # tasks don't need frontier-tier capability. Falls back to the chat + # LLM when no planner config is wired up so deployments without one + # keep working unchanged. + self.planner_llm = planner_llm or llm self.search_space_id = search_space_id self.filesystem_mode = filesystem_mode self.available_connectors = available_connectors @@ -598,7 +608,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] # Build the kb-planner private Runnable ONCE here so we don't pay # the ``create_agent`` compile cost (50-200ms) on every turn. # Disabled by default behind ``enable_kb_planner_runnable``; when - # off the planner falls back to the legacy ``self.llm.ainvoke`` + # off the planner falls back to the legacy ``planner_llm.ainvoke`` # path. 
self._planner: Runnable | None = None self._planner_compile_failed = False @@ -608,7 +618,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] Returns ``None`` when the feature flag is disabled, when the LLM is unavailable, or when ``create_agent`` raises (we fall back to the - legacy ``self.llm.ainvoke`` path in that case). Compilation happens + legacy ``planner_llm.ainvoke`` path in that case). Compilation happens lazily on first call, then memoized via ``self._planner``. The compiled agent is constructed without tools — the planner's @@ -618,7 +628,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] """ if self._planner is not None or self._planner_compile_failed: return self._planner - if self.llm is None: + if self.planner_llm is None: return None flags = get_flags() if not flags.enable_kb_planner_runnable or flags.disable_new_agent_stack: @@ -628,13 +638,13 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] try: self._planner = create_agent( - self.llm, + self.planner_llm, tools=[], middleware=[RetryAfterMiddleware(max_retries=2)], ) except Exception as exc: # pragma: no cover - defensive logger.warning( - "kb-planner Runnable compile failed; falling back to llm.ainvoke: %s", + "kb-planner Runnable compile failed; falling back to planner_llm.ainvoke: %s", exc, ) self._planner_compile_failed = True @@ -647,12 +657,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] messages: Sequence[BaseMessage], user_text: str, ) -> tuple[str, datetime | None, datetime | None, bool]: - if self.llm is None: + if self.planner_llm is None: return user_text, None, None, False recent_conversation = _render_recent_conversation( messages, - llm=self.llm, + llm=self.planner_llm, user_text=user_text, ) prompt = _build_kb_planner_prompt( @@ -663,8 +673,8 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] t0 = loop.time() # Prefer the 
compiled-once planner Runnable when enabled; otherwise - # fall back to ``self.llm.ainvoke``. The ``surfsense:internal`` tag - # is preserved on both paths so ``_stream_agent_events`` still + # fall back to ``planner_llm.ainvoke``. The ``surfsense:internal`` + # tag is preserved on both paths so ``_stream_agent_events`` still # suppresses the planner's intermediate events from the UI. planner = self._build_kb_planner_runnable() try: @@ -684,7 +694,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] else AIMessage(content="") ) else: - response = await self.llm.ainvoke( + response = await self.planner_llm.ainvoke( [HumanMessage(content=prompt)], config={"tags": ["surfsense:internal"]}, ) diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 448818e88..5643c048b 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -110,6 +110,19 @@ def load_global_llm_configs(): except Exception as e: print(f"Warning: Failed to score global LLM configs: {e}") + # Planner LLM is a singleton role. If an operator accidentally + # marks multiple configs ``is_planner: true``, only the first one + # is used at runtime — surface the others at startup so the + # mistake is caught before traffic, not silently buried. 
+ planner_cfgs = [c for c in configs if c.get("is_planner") is True] + if len(planner_cfgs) > 1: + extra_ids = [c.get("id") for c in planner_cfgs[1:]] + print( + "Warning: Multiple global LLM configs marked is_planner=true " + f"(ids {[c.get('id') for c in planner_cfgs]}); using id " + f"{planner_cfgs[0].get('id')} and ignoring {extra_ids}" + ) + return configs except Exception as e: print(f"Warning: Failed to load global LLM configs: {e}") diff --git a/surfsense_backend/app/config/global_llm_config.example.yaml b/surfsense_backend/app/config/global_llm_config.example.yaml index d92640c8d..83d556754 100644 --- a/surfsense_backend/app/config/global_llm_config.example.yaml +++ b/surfsense_backend/app/config/global_llm_config.example.yaml @@ -258,6 +258,45 @@ global_llm_configs: use_default_system_instructions: true citations_enabled: true + # Example: Planner LLM - small, fast model used for internal utility tasks + # + # The PLANNER role handles short, structured internal calls (KB query + # rewriting, date extraction, recency classification, etc.) that don't + # need frontier-tier capability. Pointing the planner at a cheap+fast + # model (gpt-4o-mini, Claude Haiku, Azure gpt-5.x-nano, Groq Llama, ...) + # typically saves 500ms-1.5s per turn vs. routing those same internal + # calls through the user's chat model. + # + # Activation: + # - Mark EXACTLY ONE global config with ``is_planner: true``. + # - If multiple are marked, the first one wins and a WARNING is logged. + # - If none is marked, every internal call falls back to the user's + # chat LLM (same behavior as before this flag existed). + # + # This config is operator-only — it is NOT exposed in the user-facing + # model selector, never billed against premium quota, and the + # billing_tier / anonymous_enabled fields below are ignored. 
+ - id: -9 + name: "Global Planner (GPT-4o mini)" + description: "Internal-only planner LLM for query rewriting and classification" + is_planner: true + billing_tier: "free" + anonymous_enabled: false + seo_enabled: false + quota_reserve_tokens: 1000 + provider: "OPENAI" + model_name: "gpt-4o-mini" + api_key: "sk-your-openai-api-key-here" + api_base: "" + rpm: 3500 + tpm: 200000 + litellm_params: + temperature: 0 + max_tokens: 1000 + system_instructions: "" + use_default_system_instructions: true + citations_enabled: false + # ============================================================================= # OpenRouter Integration # ============================================================================= @@ -493,6 +532,20 @@ global_vision_llm_configs: # - Lower temperature (0.3) is recommended for accurate screenshot analysis # - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions # +# PLANNER LLM NOTES: +# - is_planner: true marks a config as the internal-only planner LLM (small, +# fast model used for KB query rewriting, date extraction, recency +# classification, etc.). Only one config may carry this flag — if +# multiple do, the first one wins and a startup WARNING is logged. +# - When no config is marked is_planner, every internal utility call falls +# back to the user's chat LLM (the historical behavior). +# - Planner configs are NOT shown in the user-facing model selector and +# are NOT billed against the user's premium quota. Their billing_tier, +# anonymous_enabled, seo_* fields are ignored. +# - Recommended models: gpt-4o-mini, claude-3-5-haiku, gemini-1.5-flash, +# azure gpt-5.x-nano, groq llama3-8b — anything <200ms p50 on a 1-2k +# prompt. Frontier models here defeat the purpose of the flag. +# # TOKEN QUOTA & ANONYMOUS ACCESS NOTES: # - billing_tier: "free" or "premium". Controls whether registered users need premium token quota. # - anonymous_enabled: true/false. 
def get_planner_llm() -> ChatLiteLLM | None:
    """Return the operator-configured planner LLM, or ``None``.

    The planner role handles short, structured internal tasks (KB search
    planning: query rewriting, date extraction, recency classification).
    Those are well served by a small/fast model, so an operator can mark
    one global config ``is_planner: true`` to route them away from the
    user's chat LLM.

    The lookup reads ``config.GLOBAL_LLM_CONFIGS`` only — there is no
    database access — so this helper can be called synchronously from
    middleware/factory code. The first config whose ``is_planner`` value
    is literally ``True`` wins; later ones are ignored.

    Returns:
        The model built by ``create_chat_litellm_from_config`` for the
        planner config, or ``None`` when no config is marked as the
        planner or when building the model from that config fails.
        Callers MUST fall back to their chat LLM on ``None`` so
        deployments without (or with a broken) planner config keep
        working.
    """
    # Tolerate an empty/None config list instead of raising TypeError.
    global_cfgs = config.GLOBAL_LLM_CONFIGS or []
    planner_cfg = next(
        (cfg for cfg in global_cfgs if cfg.get("is_planner") is True),
        None,
    )
    if not planner_cfg:
        return None

    # Function-scope import kept from the original (presumably to avoid an
    # import cycle with the agents package — confirm). It is deferred until
    # a planner config actually exists so the common "no planner" path
    # never touches the agents package.
    from app.agents.new_chat.llm_config import create_chat_litellm_from_config

    try:
        return create_chat_litellm_from_config(planner_cfg)
    except Exception as exc:
        # The planner is an optional optimisation: a misconfigured planner
        # entry must degrade to the chat-LLM fallback rather than abort
        # every agent build that calls this helper. Only the config id is
        # logged so credentials in the config never reach the logs.
        import logging

        logging.getLogger(__name__).warning(
            "Failed to build planner LLM from global config id=%s; "
            "falling back to the chat LLM: %s",
            planner_cfg.get("id"),
            exc,
        )
        return None