mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
perf(kb-planner): route internal planner calls to dedicated small/fast LLM
Adds an optional planner LLM role wired through KnowledgePriorityMiddleware so KB query rewriting, date extraction, and recency classification run on a cheap model (e.g. gpt-4o-mini, Haiku, Azure nano) instead of the user's chat LLM. Operators opt in by setting is_planner: true on exactly one global config; without it, behavior is unchanged.
This commit is contained in:
parent
c3db25302b
commit
71dead0406
6 changed files with 123 additions and 10 deletions
|
|
@ -6,6 +6,7 @@ from langchain_core.language_models import BaseChatModel
|
||||||
|
|
||||||
from app.agents.new_chat.filesystem_selection import FilesystemMode
|
from app.agents.new_chat.filesystem_selection import FilesystemMode
|
||||||
from app.agents.new_chat.middleware import KnowledgePriorityMiddleware
|
from app.agents.new_chat.middleware import KnowledgePriorityMiddleware
|
||||||
|
from app.services.llm_service import get_planner_llm
|
||||||
|
|
||||||
|
|
||||||
def build_knowledge_priority_mw(
|
def build_knowledge_priority_mw(
|
||||||
|
|
@ -19,6 +20,7 @@ def build_knowledge_priority_mw(
|
||||||
) -> KnowledgePriorityMiddleware:
|
) -> KnowledgePriorityMiddleware:
|
||||||
return KnowledgePriorityMiddleware(
|
return KnowledgePriorityMiddleware(
|
||||||
llm=llm,
|
llm=llm,
|
||||||
|
planner_llm=get_planner_llm(),
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
filesystem_mode=filesystem_mode,
|
filesystem_mode=filesystem_mode,
|
||||||
available_connectors=available_connectors,
|
available_connectors=available_connectors,
|
||||||
|
|
|
||||||
|
|
@ -102,6 +102,7 @@ from app.agents.new_chat.tools.registry import (
|
||||||
)
|
)
|
||||||
from app.db import ChatVisibility
|
from app.db import ChatVisibility
|
||||||
from app.services.connector_service import ConnectorService
|
from app.services.connector_service import ConnectorService
|
||||||
|
from app.services.llm_service import get_planner_llm
|
||||||
from app.utils.perf import get_perf_logger
|
from app.utils.perf import get_perf_logger
|
||||||
|
|
||||||
_perf_log = get_perf_logger()
|
_perf_log = get_perf_logger()
|
||||||
|
|
@ -1077,6 +1078,7 @@ def _build_compiled_agent_blocking(
|
||||||
else None,
|
else None,
|
||||||
KnowledgePriorityMiddleware(
|
KnowledgePriorityMiddleware(
|
||||||
llm=llm,
|
llm=llm,
|
||||||
|
planner_llm=get_planner_llm(),
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
filesystem_mode=filesystem_mode,
|
filesystem_mode=filesystem_mode,
|
||||||
available_connectors=available_connectors,
|
available_connectors=available_connectors,
|
||||||
|
|
|
||||||
|
|
@ -579,6 +579,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
llm: BaseChatModel | None = None,
|
llm: BaseChatModel | None = None,
|
||||||
|
planner_llm: BaseChatModel | None = None,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
filesystem_mode: FilesystemMode = FilesystemMode.CLOUD,
|
filesystem_mode: FilesystemMode = FilesystemMode.CLOUD,
|
||||||
available_connectors: list[str] | None = None,
|
available_connectors: list[str] | None = None,
|
||||||
|
|
@ -588,6 +589,15 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
inject_system_message: bool = True, # For backwards compatibility
|
inject_system_message: bool = True, # For backwards compatibility
|
||||||
) -> None:
|
) -> None:
|
||||||
self.llm = llm
|
self.llm = llm
|
||||||
|
# The planner LLM handles short, structured internal tasks (query
|
||||||
|
# rewriting, date extraction, recency classification). When an
|
||||||
|
# operator marks a global config ``is_planner: true`` we route
|
||||||
|
# those calls to a cheap/fast model (e.g. gpt-4o-mini, Haiku, Azure
|
||||||
|
# gpt-5.x-nano) instead of the user's chat LLM — those classification
|
||||||
|
# tasks don't need frontier-tier capability. Falls back to the chat
|
||||||
|
# LLM when no planner config is wired up so deployments without one
|
||||||
|
# keep working unchanged.
|
||||||
|
self.planner_llm = planner_llm or llm
|
||||||
self.search_space_id = search_space_id
|
self.search_space_id = search_space_id
|
||||||
self.filesystem_mode = filesystem_mode
|
self.filesystem_mode = filesystem_mode
|
||||||
self.available_connectors = available_connectors
|
self.available_connectors = available_connectors
|
||||||
|
|
@ -598,7 +608,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
# Build the kb-planner private Runnable ONCE here so we don't pay
|
# Build the kb-planner private Runnable ONCE here so we don't pay
|
||||||
# the ``create_agent`` compile cost (50-200ms) on every turn.
|
# the ``create_agent`` compile cost (50-200ms) on every turn.
|
||||||
# Disabled by default behind ``enable_kb_planner_runnable``; when
|
# Disabled by default behind ``enable_kb_planner_runnable``; when
|
||||||
# off the planner falls back to the legacy ``self.llm.ainvoke``
|
# off the planner falls back to the legacy ``planner_llm.ainvoke``
|
||||||
# path.
|
# path.
|
||||||
self._planner: Runnable | None = None
|
self._planner: Runnable | None = None
|
||||||
self._planner_compile_failed = False
|
self._planner_compile_failed = False
|
||||||
|
|
@ -608,7 +618,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
|
|
||||||
Returns ``None`` when the feature flag is disabled, when the LLM is
|
Returns ``None`` when the feature flag is disabled, when the LLM is
|
||||||
unavailable, or when ``create_agent`` raises (we fall back to the
|
unavailable, or when ``create_agent`` raises (we fall back to the
|
||||||
legacy ``self.llm.ainvoke`` path in that case). Compilation happens
|
legacy ``planner_llm.ainvoke`` path in that case). Compilation happens
|
||||||
lazily on first call, then memoized via ``self._planner``.
|
lazily on first call, then memoized via ``self._planner``.
|
||||||
|
|
||||||
The compiled agent is constructed without tools — the planner's
|
The compiled agent is constructed without tools — the planner's
|
||||||
|
|
@ -618,7 +628,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
"""
|
"""
|
||||||
if self._planner is not None or self._planner_compile_failed:
|
if self._planner is not None or self._planner_compile_failed:
|
||||||
return self._planner
|
return self._planner
|
||||||
if self.llm is None:
|
if self.planner_llm is None:
|
||||||
return None
|
return None
|
||||||
flags = get_flags()
|
flags = get_flags()
|
||||||
if not flags.enable_kb_planner_runnable or flags.disable_new_agent_stack:
|
if not flags.enable_kb_planner_runnable or flags.disable_new_agent_stack:
|
||||||
|
|
@ -628,13 +638,13 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._planner = create_agent(
|
self._planner = create_agent(
|
||||||
self.llm,
|
self.planner_llm,
|
||||||
tools=[],
|
tools=[],
|
||||||
middleware=[RetryAfterMiddleware(max_retries=2)],
|
middleware=[RetryAfterMiddleware(max_retries=2)],
|
||||||
)
|
)
|
||||||
except Exception as exc: # pragma: no cover - defensive
|
except Exception as exc: # pragma: no cover - defensive
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"kb-planner Runnable compile failed; falling back to llm.ainvoke: %s",
|
"kb-planner Runnable compile failed; falling back to planner_llm.ainvoke: %s",
|
||||||
exc,
|
exc,
|
||||||
)
|
)
|
||||||
self._planner_compile_failed = True
|
self._planner_compile_failed = True
|
||||||
|
|
@ -647,12 +657,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
messages: Sequence[BaseMessage],
|
messages: Sequence[BaseMessage],
|
||||||
user_text: str,
|
user_text: str,
|
||||||
) -> tuple[str, datetime | None, datetime | None, bool]:
|
) -> tuple[str, datetime | None, datetime | None, bool]:
|
||||||
if self.llm is None:
|
if self.planner_llm is None:
|
||||||
return user_text, None, None, False
|
return user_text, None, None, False
|
||||||
|
|
||||||
recent_conversation = _render_recent_conversation(
|
recent_conversation = _render_recent_conversation(
|
||||||
messages,
|
messages,
|
||||||
llm=self.llm,
|
llm=self.planner_llm,
|
||||||
user_text=user_text,
|
user_text=user_text,
|
||||||
)
|
)
|
||||||
prompt = _build_kb_planner_prompt(
|
prompt = _build_kb_planner_prompt(
|
||||||
|
|
@ -663,8 +673,8 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
t0 = loop.time()
|
t0 = loop.time()
|
||||||
|
|
||||||
# Prefer the compiled-once planner Runnable when enabled; otherwise
|
# Prefer the compiled-once planner Runnable when enabled; otherwise
|
||||||
# fall back to ``self.llm.ainvoke``. The ``surfsense:internal`` tag
|
# fall back to ``planner_llm.ainvoke``. The ``surfsense:internal``
|
||||||
# is preserved on both paths so ``_stream_agent_events`` still
|
# tag is preserved on both paths so ``_stream_agent_events`` still
|
||||||
# suppresses the planner's intermediate events from the UI.
|
# suppresses the planner's intermediate events from the UI.
|
||||||
planner = self._build_kb_planner_runnable()
|
planner = self._build_kb_planner_runnable()
|
||||||
try:
|
try:
|
||||||
|
|
@ -684,7 +694,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
else AIMessage(content="")
|
else AIMessage(content="")
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
response = await self.llm.ainvoke(
|
response = await self.planner_llm.ainvoke(
|
||||||
[HumanMessage(content=prompt)],
|
[HumanMessage(content=prompt)],
|
||||||
config={"tags": ["surfsense:internal"]},
|
config={"tags": ["surfsense:internal"]},
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -110,6 +110,19 @@ def load_global_llm_configs():
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Warning: Failed to score global LLM configs: {e}")
|
print(f"Warning: Failed to score global LLM configs: {e}")
|
||||||
|
|
||||||
|
# Planner LLM is a singleton role. If an operator accidentally
|
||||||
|
# marks multiple configs ``is_planner: true``, only the first one
|
||||||
|
# is used at runtime — surface the others at startup so the
|
||||||
|
# mistake is caught before traffic, not silently buried.
|
||||||
|
planner_cfgs = [c for c in configs if c.get("is_planner") is True]
|
||||||
|
if len(planner_cfgs) > 1:
|
||||||
|
extra_ids = [c.get("id") for c in planner_cfgs[1:]]
|
||||||
|
print(
|
||||||
|
"Warning: Multiple global LLM configs marked is_planner=true "
|
||||||
|
f"(ids {[c.get('id') for c in planner_cfgs]}); using id "
|
||||||
|
f"{planner_cfgs[0].get('id')} and ignoring {extra_ids}"
|
||||||
|
)
|
||||||
|
|
||||||
return configs
|
return configs
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Warning: Failed to load global LLM configs: {e}")
|
print(f"Warning: Failed to load global LLM configs: {e}")
|
||||||
|
|
|
||||||
|
|
@ -258,6 +258,45 @@ global_llm_configs:
|
||||||
use_default_system_instructions: true
|
use_default_system_instructions: true
|
||||||
citations_enabled: true
|
citations_enabled: true
|
||||||
|
|
||||||
|
# Example: Planner LLM - small, fast model used for internal utility tasks
|
||||||
|
#
|
||||||
|
# The PLANNER role handles short, structured internal calls (KB query
|
||||||
|
# rewriting, date extraction, recency classification, etc.) that don't
|
||||||
|
# need frontier-tier capability. Pointing the planner at a cheap+fast
|
||||||
|
# model (gpt-4o-mini, Claude Haiku, Azure gpt-5.x-nano, Groq Llama, ...)
|
||||||
|
# typically saves 500ms-1.5s per turn vs. routing those same internal
|
||||||
|
# calls through the user's chat model.
|
||||||
|
#
|
||||||
|
# Activation:
|
||||||
|
# - Mark EXACTLY ONE global config with ``is_planner: true``.
|
||||||
|
# - If multiple are marked, the first one wins and a warning is printed at startup.
|
||||||
|
# - If none is marked, every internal call falls back to the user's
|
||||||
|
# chat LLM (same behavior as before this flag existed).
|
||||||
|
#
|
||||||
|
# This config is operator-only — it is NOT exposed in the user-facing
|
||||||
|
# model selector, never billed against premium quota, and the
|
||||||
|
# billing_tier / anonymous_enabled fields below are ignored.
|
||||||
|
- id: -9
|
||||||
|
name: "Global Planner (GPT-4o mini)"
|
||||||
|
description: "Internal-only planner LLM for query rewriting and classification"
|
||||||
|
is_planner: true
|
||||||
|
billing_tier: "free"
|
||||||
|
anonymous_enabled: false
|
||||||
|
seo_enabled: false
|
||||||
|
quota_reserve_tokens: 1000
|
||||||
|
provider: "OPENAI"
|
||||||
|
model_name: "gpt-4o-mini"
|
||||||
|
api_key: "sk-your-openai-api-key-here"
|
||||||
|
api_base: ""
|
||||||
|
rpm: 3500
|
||||||
|
tpm: 200000
|
||||||
|
litellm_params:
|
||||||
|
temperature: 0
|
||||||
|
max_tokens: 1000
|
||||||
|
system_instructions: ""
|
||||||
|
use_default_system_instructions: true
|
||||||
|
citations_enabled: false
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# OpenRouter Integration
|
# OpenRouter Integration
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
@ -493,6 +532,20 @@ global_vision_llm_configs:
|
||||||
# - Lower temperature (0.3) is recommended for accurate screenshot analysis
|
# - Lower temperature (0.3) is recommended for accurate screenshot analysis
|
||||||
# - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions
|
# - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions
|
||||||
#
|
#
|
||||||
|
# PLANNER LLM NOTES:
|
||||||
|
# - is_planner: true marks a config as the internal-only planner LLM (small,
|
||||||
|
# fast model used for KB query rewriting, date extraction, recency
|
||||||
|
# classification, etc.). Only one config may carry this flag — if
|
||||||
|
# multiple do, the first one wins and a warning is printed at startup.
|
||||||
|
# - When no config is marked is_planner, every internal utility call falls
|
||||||
|
# back to the user's chat LLM (the historical behavior).
|
||||||
|
# - Planner configs are NOT shown in the user-facing model selector and
|
||||||
|
# are NOT billed against the user's premium quota. Their billing_tier,
|
||||||
|
# anonymous_enabled, and seo_* fields are ignored.
|
||||||
|
# - Recommended models: gpt-4o-mini, claude-3-5-haiku, gemini-1.5-flash,
|
||||||
|
# azure gpt-5.x-nano, groq llama3-8b — anything <200ms p50 on a 1-2k
|
||||||
|
# prompt. Frontier models here defeat the purpose of the flag.
|
||||||
|
#
|
||||||
# TOKEN QUOTA & ANONYMOUS ACCESS NOTES:
|
# TOKEN QUOTA & ANONYMOUS ACCESS NOTES:
|
||||||
# - billing_tier: "free" or "premium". Controls whether registered users need premium token quota.
|
# - billing_tier: "free" or "premium". Controls whether registered users need premium token quota.
|
||||||
# - anonymous_enabled: true/false. Whether the model appears in the public no-login catalog.
|
# - anonymous_enabled: true/false. Whether the model appears in the public no-login catalog.
|
||||||
|
|
|
||||||
|
|
@ -659,3 +659,36 @@ async def get_user_long_context_llm(
|
||||||
return await get_document_summary_llm(
|
return await get_document_summary_llm(
|
||||||
session, search_space_id, disable_streaming=disable_streaming
|
session, search_space_id, disable_streaming=disable_streaming
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_planner_llm() -> ChatLiteLLM | None:
    """Build the planner LLM from the global config flagged ``is_planner``.

    Scans ``config.GLOBAL_LLM_CONFIGS`` in order and selects the first entry
    whose ``is_planner`` value is exactly ``True``; any later flagged entries
    are never consulted. The lookup itself is a plain synchronous in-memory
    scan, so this helper can be called from non-async factory/middleware
    code. No caching is done here: the factory is invoked on every call.

    The planner role covers short, structured internal tasks (KB search
    planning: query rewriting, date extraction, recency classification)
    that are meant to run on a small/fast model instead of the user's chat
    LLM.

    Returns:
        Whatever ``create_chat_litellm_from_config`` produces for the
        selected config, or ``None`` when no config carries the flag.
        Callers are expected to fall back to their chat LLM on ``None`` so
        deployments without a planner config keep working unchanged.
    """
    # Function-scope import, kept as in the original.
    # NOTE(review): presumably avoids a circular import between the service
    # and agent packages — confirm before hoisting to module level.
    from app.agents.new_chat.llm_config import create_chat_litellm_from_config

    for candidate in config.GLOBAL_LLM_CONFIGS:
        # Identity check on purpose: only a literal boolean ``True`` opts in,
        # so truthy strings/ints in the YAML do not activate the role.
        if candidate.get("is_planner") is True:
            return create_chat_litellm_from_config(candidate)

    return None
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue