diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py
index fcdb1c61e..27cee8b37 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py
@@ -6,6 +6,7 @@ from langchain_core.language_models import BaseChatModel
 
 from app.agents.new_chat.filesystem_selection import FilesystemMode
 from app.agents.new_chat.middleware import KnowledgePriorityMiddleware
+from app.services.llm_service import get_planner_llm
 
 
 def build_knowledge_priority_mw(
@@ -19,6 +20,7 @@ def build_knowledge_priority_mw(
 ) -> KnowledgePriorityMiddleware:
     return KnowledgePriorityMiddleware(
         llm=llm,
+        planner_llm=get_planner_llm(),
         search_space_id=search_space_id,
         filesystem_mode=filesystem_mode,
         available_connectors=available_connectors,
diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
index 605c31416..f8db333ba 100644
--- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py
+++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
@@ -102,6 +102,7 @@ from app.agents.new_chat.tools.registry import (
 )
 from app.db import ChatVisibility
 from app.services.connector_service import ConnectorService
+from app.services.llm_service import get_planner_llm
 from app.utils.perf import get_perf_logger
 
 _perf_log = get_perf_logger()
@@ -1077,6 +1078,7 @@ def _build_compiled_agent_blocking(
         else None,
         KnowledgePriorityMiddleware(
             llm=llm,
+            planner_llm=get_planner_llm(),
             search_space_id=search_space_id,
             filesystem_mode=filesystem_mode,
             available_connectors=available_connectors,
diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
index 98bbf3bd7..77b413940 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
@@ -579,6 +579,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         self,
         *,
         llm: BaseChatModel | None = None,
+        planner_llm: BaseChatModel | None = None,
         search_space_id: int,
         filesystem_mode: FilesystemMode = FilesystemMode.CLOUD,
         available_connectors: list[str] | None = None,
@@ -588,6 +589,15 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         inject_system_message: bool = True,  # For backwards compatibility
     ) -> None:
         self.llm = llm
+        # The planner LLM handles short, structured internal tasks (query
+        # rewriting, date extraction, recency classification). When an
+        # operator marks a global config ``is_planner: true`` we route
+        # those calls to a cheap/fast model (e.g. gpt-4o-mini, Haiku, Azure
+        # gpt-5.x-nano) instead of the user's chat LLM — those classification
+        # tasks don't need frontier-tier capability. Falls back to the chat
+        # LLM when no planner config is wired up so deployments without one
+        # keep working unchanged.
+        self.planner_llm = planner_llm or llm
         self.search_space_id = search_space_id
         self.filesystem_mode = filesystem_mode
         self.available_connectors = available_connectors
@@ -598,7 +608,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         # Build the kb-planner private Runnable ONCE here so we don't pay
         # the ``create_agent`` compile cost (50-200ms) on every turn.
         # Disabled by default behind ``enable_kb_planner_runnable``; when
-        # off the planner falls back to the legacy ``self.llm.ainvoke``
+        # off the planner falls back to the legacy ``self.planner_llm.ainvoke``
         # path.
         self._planner: Runnable | None = None
         self._planner_compile_failed = False
@@ -608,7 +618,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
 
         Returns ``None`` when the feature flag is disabled, when the LLM is
         unavailable, or when ``create_agent`` raises (we fall back to the
-        legacy ``self.llm.ainvoke`` path in that case). Compilation happens
+        legacy ``self.planner_llm.ainvoke`` path in that case). Compilation happens
         lazily on first call, then memoized via ``self._planner``.
 
         The compiled agent is constructed without tools — the planner's
@@ -618,7 +628,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         """
         if self._planner is not None or self._planner_compile_failed:
             return self._planner
-        if self.llm is None:
+        if self.planner_llm is None:
             return None
         flags = get_flags()
         if not flags.enable_kb_planner_runnable or flags.disable_new_agent_stack:
@@ -628,13 +638,13 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
 
         try:
             self._planner = create_agent(
-                self.llm,
+                self.planner_llm,
                 tools=[],
                 middleware=[RetryAfterMiddleware(max_retries=2)],
             )
         except Exception as exc:  # pragma: no cover - defensive
             logger.warning(
-                "kb-planner Runnable compile failed; falling back to llm.ainvoke: %s",
+                "kb-planner Runnable compile failed; falling back to planner_llm.ainvoke: %s",
                 exc,
             )
             self._planner_compile_failed = True
@@ -647,12 +657,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         messages: Sequence[BaseMessage],
         user_text: str,
     ) -> tuple[str, datetime | None, datetime | None, bool]:
-        if self.llm is None:
+        if self.planner_llm is None:
             return user_text, None, None, False
 
         recent_conversation = _render_recent_conversation(
             messages,
-            llm=self.llm,
+            llm=self.planner_llm,
             user_text=user_text,
         )
         prompt = _build_kb_planner_prompt(
@@ -663,8 +673,8 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         t0 = loop.time()
 
         # Prefer the compiled-once planner Runnable when enabled; otherwise
-        # fall back to ``self.llm.ainvoke``. The ``surfsense:internal`` tag
-        # is preserved on both paths so ``_stream_agent_events`` still
+        # fall back to ``self.planner_llm.ainvoke``. The ``surfsense:internal``
+        # tag is preserved on both paths so ``_stream_agent_events`` still
         # suppresses the planner's intermediate events from the UI.
         planner = self._build_kb_planner_runnable()
         try:
@@ -684,7 +694,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
                     else AIMessage(content="")
                 )
             else:
-                response = await self.llm.ainvoke(
+                response = await self.planner_llm.ainvoke(
                     [HumanMessage(content=prompt)],
                     config={"tags": ["surfsense:internal"]},
                 )
diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py
index 448818e88..5643c048b 100644
--- a/surfsense_backend/app/config/__init__.py
+++ b/surfsense_backend/app/config/__init__.py
@@ -110,6 +110,19 @@ def load_global_llm_configs():
         except Exception as e:
             print(f"Warning: Failed to score global LLM configs: {e}")
 
+        # Planner LLM is a singleton role. If an operator accidentally
+        # marks multiple configs ``is_planner: true``, only the first one
+        # is used at runtime — surface the others at startup so the
+        # mistake is caught before traffic, not silently buried.
+        planner_cfgs = [c for c in configs if c.get("is_planner") is True]
+        if len(planner_cfgs) > 1:
+            extra_ids = [c.get("id") for c in planner_cfgs[1:]]
+            print(
+                "Warning: Multiple global LLM configs marked is_planner=true "
+                f"(ids {[c.get('id') for c in planner_cfgs]}); using id "
+                f"{planner_cfgs[0].get('id')} and ignoring {extra_ids}"
+            )
+
         return configs
     except Exception as e:
         print(f"Warning: Failed to load global LLM configs: {e}")
diff --git a/surfsense_backend/app/config/global_llm_config.example.yaml b/surfsense_backend/app/config/global_llm_config.example.yaml
index d92640c8d..83d556754 100644
--- a/surfsense_backend/app/config/global_llm_config.example.yaml
+++ b/surfsense_backend/app/config/global_llm_config.example.yaml
@@ -258,6 +258,45 @@ global_llm_configs:
     use_default_system_instructions: true
     citations_enabled: true
 
+  # Example: Planner LLM - small, fast model used for internal utility tasks
+  #
+  # The PLANNER role handles short, structured internal calls (KB query
+  # rewriting, date extraction, recency classification, etc.) that don't
+  # need frontier-tier capability. Pointing the planner at a cheap+fast
+  # model (gpt-4o-mini, Claude Haiku, Azure gpt-5.x-nano, Groq Llama, ...)
+  # typically saves 500ms-1.5s per turn vs. routing those same internal
+  # calls through the user's chat model.
+  #
+  # Activation:
+  #   - Mark EXACTLY ONE global config with ``is_planner: true``.
+  #   - If multiple are marked, the first one wins and a warning is printed at startup.
+  #   - If none is marked, every internal call falls back to the user's
+  #     chat LLM (same behavior as before this flag existed).
+  #
+  # This config is operator-only — it is NOT exposed in the user-facing
+  # model selector, never billed against premium quota, and the
+  # billing_tier / anonymous_enabled fields below are ignored.
+  - id: -9
+    name: "Global Planner (GPT-4o mini)"
+    description: "Internal-only planner LLM for query rewriting and classification"
+    is_planner: true
+    billing_tier: "free"
+    anonymous_enabled: false
+    seo_enabled: false
+    quota_reserve_tokens: 1000
+    provider: "OPENAI"
+    model_name: "gpt-4o-mini"
+    api_key: "sk-your-openai-api-key-here"
+    api_base: ""
+    rpm: 3500
+    tpm: 200000
+    litellm_params:
+      temperature: 0
+      max_tokens: 1000
+    system_instructions: ""
+    use_default_system_instructions: true
+    citations_enabled: false
+
 # =============================================================================
 # OpenRouter Integration
 # =============================================================================
@@ -493,6 +532,20 @@ global_vision_llm_configs:
 # - Lower temperature (0.3) is recommended for accurate screenshot analysis
 # - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions
 #
+# PLANNER LLM NOTES:
+# - is_planner: true marks a config as the internal-only planner LLM (small,
+#   fast model used for KB query rewriting, date extraction, recency
+#   classification, etc.). Only one config may carry this flag — if
+#   multiple do, the first one wins and a warning is printed at startup.
+# - When no config is marked is_planner, every internal utility call falls
+#   back to the user's chat LLM (the historical behavior).
+# - Planner configs are NOT shown in the user-facing model selector and
+#   are NOT billed against the user's premium quota. Their billing_tier,
+#   anonymous_enabled, seo_* fields are ignored.
+# - Recommended models: gpt-4o-mini, claude-3-5-haiku, gemini-1.5-flash,
+#   azure gpt-5.x-nano, groq llama3-8b — anything <200ms p50 on a 1-2k
+#   prompt. Frontier models here defeat the purpose of the flag.
+#
 # TOKEN QUOTA & ANONYMOUS ACCESS NOTES:
 # - billing_tier: "free" or "premium". Controls whether registered users need premium token quota.
 # - anonymous_enabled: true/false. Whether the model appears in the public no-login catalog.
diff --git a/surfsense_backend/app/services/llm_service.py b/surfsense_backend/app/services/llm_service.py
index ade202c72..fa97fb33a 100644
--- a/surfsense_backend/app/services/llm_service.py
+++ b/surfsense_backend/app/services/llm_service.py
@@ -659,3 +659,36 @@ async def get_user_long_context_llm(
     return await get_document_summary_llm(
         session, search_space_id, disable_streaming=disable_streaming
     )
+
+
+def get_planner_llm() -> ChatLiteLLM | None:
+    """Build the planner LLM from the first global config flagged as planner.
+
+    The planner role handles short, structured internal tasks (KB search
+    planning: query rewriting, date extraction, recency classification).
+    Small/fast models (e.g. gpt-4o-mini, Claude Haiku, Azure gpt-5.x-nano)
+    serve these well; the user's chat LLM is needlessly slow and costly.
+
+    The scan over ``config.GLOBAL_LLM_CONFIGS`` (loaded from
+    ``global_llm_config.yaml``) needs no DB round-trip, so this is safe to
+    call synchronously from middleware/factory code. Only an entry whose
+    ``is_planner`` is exactly ``True`` matches; the first match wins. This
+    function caches nothing: the factory below runs on every call.
+
+    Returns ``None`` when no config is flagged. Callers MUST then fall back
+    to their chat LLM so deployments without a planner keep working.
+    """
+    # Imported at call time — presumably to avoid a circular import; confirm.
+    from app.agents.new_chat.llm_config import create_chat_litellm_from_config
+
+    planner_cfg = next(
+        (
+            cfg
+            for cfg in config.GLOBAL_LLM_CONFIGS
+            if cfg.get("is_planner") is True
+        ),
+        None,
+    )
+    if not planner_cfg:
+        return None
+    return create_chat_litellm_from_config(planner_cfg)