From a4422c0ee466a0d90e0051fc3e6e170ccb699865 Mon Sep 17 00:00:00 2001
From: thierryverse <thierryuniverse@gmail.com>
Date: Fri, 19 Dec 2025 20:40:10 +0200
Subject: [PATCH] organize deepagent codebase

---
 .../app/agents/new_chat/__init__.py           |  17 +-
 .../app/agents/new_chat/chat_deepagent.py     | 295 +-----------------
 .../app/agents/new_chat/context.py            |  28 ++
 .../app/agents/new_chat/llm_config.py         | 104 ++++++
 .../app/agents/new_chat/new_chat_test.py      |   7 +-
 .../app/agents/new_chat/system_prompt.py      | 143 +++++++++
 6 files changed, 291 insertions(+), 303 deletions(-)
 create mode 100644 surfsense_backend/app/agents/new_chat/context.py
 create mode 100644 surfsense_backend/app/agents/new_chat/llm_config.py
 create mode 100644 surfsense_backend/app/agents/new_chat/system_prompt.py

diff --git a/surfsense_backend/app/agents/new_chat/__init__.py b/surfsense_backend/app/agents/new_chat/__init__.py
index 9f4d32bd2..5cbb7ce2b 100644
--- a/surfsense_backend/app/agents/new_chat/__init__.py
+++ b/surfsense_backend/app/agents/new_chat/__init__.py
@@ -1,19 +1,18 @@
 """Chat agents module."""
 
-from .chat_deepagent import (
-    SURFSENSE_CITATION_INSTRUCTIONS,
-    SURFSENSE_SYSTEM_PROMPT,
-    SurfSenseContextSchema,
-    build_surfsense_system_prompt,
-    create_chat_litellm_from_config,
-    create_surfsense_deep_agent,
-    load_llm_config_from_yaml,
-)
+from .chat_deepagent import create_surfsense_deep_agent
+from .context import SurfSenseContextSchema
 from .knowledge_base import (
     create_search_knowledge_base_tool,
     format_documents_for_context,
     search_knowledge_base_async,
 )
+from .llm_config import create_chat_litellm_from_config, load_llm_config_from_yaml
+from .system_prompt import (
+    SURFSENSE_CITATION_INSTRUCTIONS,
+    SURFSENSE_SYSTEM_PROMPT,
+    build_surfsense_system_prompt,
+)
 
 __all__ = [
     "SURFSENSE_CITATION_INSTRUCTIONS",
diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
index b5a474657..eb2dac737 100644
--- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py
+++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
@@ -1,305 +1,22 @@
 """
-Test script for create_deep_agent with ChatLiteLLM from global_llm_config.yaml
+SurfSense deep agent implementation.
 
-This demonstrates:
-1. Loading LLM config from global_llm_config.yaml
-2. Creating a ChatLiteLLM instance
-3. Using context_schema to add custom state fields
-4. Creating a search_knowledge_base tool similar to fetch_relevant_documents
+This module provides the factory function for creating SurfSense deep agents
+with knowledge base search capability.
 """
 
-import sys
 from collections.abc import Sequence
-from datetime import UTC, datetime
-from pathlib import Path
-from typing import TypedDict
 
-import yaml
 from deepagents import create_deep_agent
 from langchain_core.tools import BaseTool
 from langchain_litellm import ChatLiteLLM
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.context import SurfSenseContextSchema
+from app.agents.new_chat.knowledge_base import create_search_knowledge_base_tool
+from app.agents.new_chat.system_prompt import build_surfsense_system_prompt
 from app.services.connector_service import ConnectorService
 
-from .knowledge_base import create_search_knowledge_base_tool
-
-# Add parent directory to path so 'app' module can be found when running directly
-_THIS_FILE = Path(__file__).resolve()
-_BACKEND_ROOT = _THIS_FILE.parent.parent.parent.parent  # surfsense_backend/
-if str(_BACKEND_ROOT) not in sys.path:
-    sys.path.insert(0, str(_BACKEND_ROOT))
-
-
-# =============================================================================
-# LLM Configuration Loading
-# =============================================================================
-
-
-def load_llm_config_from_yaml(llm_config_id: int = -1) -> dict | None:
-    """
-    Load a specific LLM config from global_llm_config.yaml.
-
-    Args:
-        llm_config_id: The id of the config to load (default: -1)
-
-    Returns:
-        LLM config dict or None if not found
-    """
-    # Get the config file path
-    base_dir = Path(__file__).resolve().parent.parent.parent.parent
-    config_file = base_dir / "app" / "config" / "global_llm_config.yaml"
-
-    # Fallback to example file if main config doesn't exist
-    if not config_file.exists():
-        config_file = base_dir / "app" / "config" / "global_llm_config.example.yaml"
-        if not config_file.exists():
-            print("Error: No global_llm_config.yaml or example file found")
-            return None
-
-    try:
-        with open(config_file, encoding="utf-8") as f:
-            data = yaml.safe_load(f)
-            configs = data.get("global_llm_configs", [])
-            for cfg in configs:
-                if isinstance(cfg, dict) and cfg.get("id") == llm_config_id:
-                    return cfg
-
-            print(f"Error: Global LLM config id {llm_config_id} not found")
-            return None
-    except Exception as e:
-        print(f"Error loading config: {e}")
-        return None
-
-
-def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
-    """
-    Create a ChatLiteLLM instance from a global LLM config.
-
-    Args:
-        llm_config: LLM configuration dictionary from YAML
-
-    Returns:
-        ChatLiteLLM instance or None on error
-    """
-    # Provider mapping (same as in llm_service.py)
-    provider_map = {
-        "OPENAI": "openai",
-        "ANTHROPIC": "anthropic",
-        "GROQ": "groq",
-        "COHERE": "cohere",
-        "GOOGLE": "gemini",
-        "OLLAMA": "ollama",
-        "MISTRAL": "mistral",
-        "AZURE_OPENAI": "azure",
-        "OPENROUTER": "openrouter",
-        "XAI": "xai",
-        "BEDROCK": "bedrock",
-        "VERTEX_AI": "vertex_ai",
-        "TOGETHER_AI": "together_ai",
-        "FIREWORKS_AI": "fireworks_ai",
-        "DEEPSEEK": "openai",
-        "ALIBABA_QWEN": "openai",
-        "MOONSHOT": "openai",
-        "ZHIPU": "openai",
-    }
-
-    # Build the model string
-    if llm_config.get("custom_provider"):
-        model_string = f"{llm_config['custom_provider']}/{llm_config['model_name']}"
-    else:
-        provider = llm_config.get("provider", "").upper()
-        provider_prefix = provider_map.get(provider, provider.lower())
-        model_string = f"{provider_prefix}/{llm_config['model_name']}"
-
-    # Create ChatLiteLLM instance
-    litellm_kwargs = {
-        "model": model_string,
-        "api_key": llm_config.get("api_key"),
-    }
-
-    # Add optional parameters
-    if llm_config.get("api_base"):
-        litellm_kwargs["api_base"] = llm_config["api_base"]
-
-    # Add any additional litellm parameters
-    if llm_config.get("litellm_params"):
-        litellm_kwargs.update(llm_config["litellm_params"])
-
-    return ChatLiteLLM(**litellm_kwargs)
-
-
-# =============================================================================
-# Custom Context Schema
-# =============================================================================
-
-
-class SurfSenseContextSchema(TypedDict):
-    """
-    Custom state schema for the SurfSense deep agent.
-
-    This extends the default agent state with custom fields.
-    The default state already includes:
-    - messages: Conversation history
-    - todos: Task list from TodoListMiddleware
-    - files: Virtual filesystem from FilesystemMiddleware
-
-    We're adding fields needed for knowledge base search:
-    - search_space_id: The user's search space ID
-    - db_session: Database session (injected at runtime)
-    - connector_service: Connector service instance (injected at runtime)
-    """
-
-    search_space_id: int
-    # These are runtime-injected and won't be serialized
-    # db_session and connector_service are passed when invoking the agent
-
-
-# =============================================================================
-# Citation Instructions
-# =============================================================================
-
-SURFSENSE_CITATION_INSTRUCTIONS = """
-<citation_instructions>
-CRITICAL CITATION REQUIREMENTS:
-
-1. For EVERY piece of information you include from the documents, add a citation in the format [citation:chunk_id] where chunk_id is the exact value from the `<chunk id='...'>` tag inside `<document_content>`.
-2. Make sure ALL factual statements from the documents have proper citations.
-3. If multiple chunks support the same point, include all relevant citations [citation:chunk_id1], [citation:chunk_id2].
-4. You MUST use the exact chunk_id values from the `<chunk id='...'>` attributes. Do not create your own citation numbers.
-5. Every citation MUST be in the format [citation:chunk_id] where chunk_id is the exact chunk id value.
-6. Never modify or change the chunk_id - always use the original values exactly as provided in the chunk tags.
-7. Do not return citations as clickable links.
-8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
-9. Citations must ONLY appear as [citation:chunk_id] or [citation:chunk_id1], [citation:chunk_id2] format - never with parentheses, hyperlinks, or other formatting.
-10. Never make up chunk IDs. Only use chunk_id values that are explicitly provided in the `<chunk id='...'>` tags.
-11. If you are unsure about a chunk_id, do not include a citation rather than guessing or making one up.
-
-<document_structure_example>
-The documents you receive are structured like this:
-
-<document>
-<document_metadata>
-  <document_id>42</document_id>
-  <document_type>GITHUB_CONNECTOR</document_type>
-  <title><![CDATA[Some repo / file / issue title]]></title>
-  <url><![CDATA[https://example.com]]></url>
-  <metadata_json><![CDATA[{{"any":"other metadata"}}]]></metadata_json>
-</document_metadata>
-
-<document_content>
-  <chunk id='123'><![CDATA[First chunk text...]]></chunk>
-  <chunk id='124'><![CDATA[Second chunk text...]]></chunk>
-</document_content>
-</document>
-
-IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124). Do NOT cite document_id.
-</document_structure_example>
-
-<citation_format>
-- Every fact from the documents must have a citation in the format [citation:chunk_id] where chunk_id is the EXACT id value from a `<chunk id='...'>` tag
-- Citations should appear at the end of the sentence containing the information they support
-- Multiple citations should be separated by commas: [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
-- No need to return references section. Just citations in answer.
-- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
-- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
-- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
-</citation_format>
-
-<citation_examples>
-CORRECT citation formats:
-- [citation:5]
-- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
-
-INCORRECT citation formats (DO NOT use):
-- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
-- Using parentheses around brackets: ([citation:5])
-- Using hyperlinked text: [link to source 5](https://example.com)
-- Using footnote style: ... library¹
-- Making up source IDs when source_id is unknown
-- Using old IEEE format: [1], [2], [3]
-- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
-</citation_examples>
-
-<citation_output_example>
-Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
-
-The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
-
-However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
-</citation_output_example>
-</citation_instructions>
-"""
-
-
-# =============================================================================
-# System Prompt
-# =============================================================================
-
-
-def build_surfsense_system_prompt(
-    today: datetime | None = None,
-    user_instructions: str | None = None,
-    enable_citations: bool = True,
-) -> str:
-    """
-    Build the SurfSense system prompt with optional user instructions and citation toggle.
-
-    Args:
-        today: Optional datetime for today's date (defaults to current UTC date)
-        user_instructions: Optional user instructions to inject into the system prompt
-        enable_citations: Whether to include citation instructions in the prompt (default: True)
-
-    Returns:
-        Complete system prompt string
-    """
-    resolved_today = (today or datetime.now(UTC)).astimezone(UTC).date().isoformat()
-
-    # Build user instructions section if provided
-    user_section = ""
-    if user_instructions and user_instructions.strip():
-        user_section = f"""
-<user_instructions>
-{user_instructions.strip()}
-</user_instructions>
-"""
-
-    # Include citation instructions only if enabled
-    citation_section = (
-        f"\n{SURFSENSE_CITATION_INSTRUCTIONS}" if enable_citations else ""
-    )
-
-    return f"""
-<system_instruction>
-You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
-
-Today's date (UTC): {resolved_today}
-
-</system_instruction>{user_section}
-<tools>
-You have access to the following tools:
-- search_knowledge_base: Search the user's personal knowledge base for relevant information.
-  - Args:
-    - query: The search query - be specific and include key terms
-    - top_k: Number of results to retrieve (default: 10)
-    - start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00")
-    - end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00")
-    - connectors_to_search: Optional list of connector enums to search. If omitted, searches all.
-  - Returns: Formatted string with relevant documents and their content
-</tools>
-<tool_call_examples>
-- User: "Fetch all my notes and what's in them?"
-  - Call: `search_knowledge_base(query="*", top_k=50, connectors_to_search=["NOTE"])`
-
-- User: "What did I discuss on Slack last week about the React migration?"
-  - Call: `search_knowledge_base(query="React migration", connectors_to_search=["SLACK_CONNECTOR"], start_date="YYYY-MM-DD", end_date="YYYY-MM-DD")`
-</tool_call_examples>{citation_section}
-"""
-
-
-SURFSENSE_SYSTEM_PROMPT = build_surfsense_system_prompt()
-
-
 # =============================================================================
 # Deep Agent Factory
 # =============================================================================
diff --git a/surfsense_backend/app/agents/new_chat/context.py b/surfsense_backend/app/agents/new_chat/context.py
new file mode 100644
index 000000000..da113adf4
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/context.py
@@ -0,0 +1,28 @@
+"""
+Context schema definitions for SurfSense agents.
+
+This module defines the custom state schema used by the SurfSense deep agent.
+"""
+
+from typing import TypedDict
+
+
+class SurfSenseContextSchema(TypedDict):
+    """
+    Custom state schema for the SurfSense deep agent.
+
+    This extends the default agent state with custom fields.
+    The default state already includes:
+    - messages: Conversation history
+    - todos: Task list from TodoListMiddleware
+    - files: Virtual filesystem from FilesystemMiddleware
+
+    We're adding fields needed for knowledge base search:
+    - search_space_id: The user's search space ID
+    - db_session: Database session (injected at runtime)
+    - connector_service: Connector service instance (injected at runtime)
+    """
+
+    search_space_id: int
+    # These are runtime-injected and won't be serialized
+    # db_session and connector_service are passed when invoking the agent
diff --git a/surfsense_backend/app/agents/new_chat/llm_config.py b/surfsense_backend/app/agents/new_chat/llm_config.py
new file mode 100644
index 000000000..a99386df4
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/llm_config.py
@@ -0,0 +1,104 @@
+"""
+LLM configuration utilities for SurfSense agents.
+
+This module provides functions for loading LLM configurations from YAML files
+and creating ChatLiteLLM instances from configuration dictionaries.
+"""
+
+from pathlib import Path
+
+import yaml
+from langchain_litellm import ChatLiteLLM
+
+
+def load_llm_config_from_yaml(llm_config_id: int = -1) -> dict | None:
+    """
+    Load a specific LLM config from global_llm_config.yaml.
+
+    Args:
+        llm_config_id: The id of the config to load (default: -1)
+
+    Returns:
+        LLM config dict or None if not found
+    """
+    # Get the config file path
+    base_dir = Path(__file__).resolve().parent.parent.parent.parent
+    config_file = base_dir / "app" / "config" / "global_llm_config.yaml"
+
+    # Fallback to example file if main config doesn't exist
+    if not config_file.exists():
+        config_file = base_dir / "app" / "config" / "global_llm_config.example.yaml"
+        if not config_file.exists():
+            print("Error: No global_llm_config.yaml or example file found")
+            return None
+
+    try:
+        with open(config_file, encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+            configs = data.get("global_llm_configs", [])
+            for cfg in configs:
+                if isinstance(cfg, dict) and cfg.get("id") == llm_config_id:
+                    return cfg
+
+            print(f"Error: Global LLM config id {llm_config_id} not found")
+            return None
+    except Exception as e:
+        print(f"Error loading config: {e}")
+        return None
+
+
+def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
+    """
+    Create a ChatLiteLLM instance from a global LLM config.
+
+    Args:
+        llm_config: LLM configuration dictionary from YAML
+
+    Returns:
+        ChatLiteLLM instance or None on error
+    """
+    # Provider mapping (same as in llm_service.py)
+    provider_map = {
+        "OPENAI": "openai",
+        "ANTHROPIC": "anthropic",
+        "GROQ": "groq",
+        "COHERE": "cohere",
+        "GOOGLE": "gemini",
+        "OLLAMA": "ollama",
+        "MISTRAL": "mistral",
+        "AZURE_OPENAI": "azure",
+        "OPENROUTER": "openrouter",
+        "XAI": "xai",
+        "BEDROCK": "bedrock",
+        "VERTEX_AI": "vertex_ai",
+        "TOGETHER_AI": "together_ai",
+        "FIREWORKS_AI": "fireworks_ai",
+        "DEEPSEEK": "openai",
+        "ALIBABA_QWEN": "openai",
+        "MOONSHOT": "openai",
+        "ZHIPU": "openai",
+    }
+
+    # Build the model string
+    if llm_config.get("custom_provider"):
+        model_string = f"{llm_config['custom_provider']}/{llm_config['model_name']}"
+    else:
+        provider = llm_config.get("provider", "").upper()
+        provider_prefix = provider_map.get(provider, provider.lower())
+        model_string = f"{provider_prefix}/{llm_config['model_name']}"
+
+    # Create ChatLiteLLM instance
+    litellm_kwargs = {
+        "model": model_string,
+        "api_key": llm_config.get("api_key"),
+    }
+
+    # Add optional parameters
+    if llm_config.get("api_base"):
+        litellm_kwargs["api_base"] = llm_config["api_base"]
+
+    # Add any additional litellm parameters
+    if llm_config.get("litellm_params"):
+        litellm_kwargs.update(llm_config["litellm_params"])
+
+    return ChatLiteLLM(**litellm_kwargs)
diff --git a/surfsense_backend/app/agents/new_chat/new_chat_test.py b/surfsense_backend/app/agents/new_chat/new_chat_test.py
index 9178007a0..6a4e9bd02 100644
--- a/surfsense_backend/app/agents/new_chat/new_chat_test.py
+++ b/surfsense_backend/app/agents/new_chat/new_chat_test.py
@@ -11,11 +11,8 @@ from langchain_core.messages import HumanMessage
 from app.db import async_session_maker
 from app.services.connector_service import ConnectorService
 
-from .chat_deepagent import (
-    create_chat_litellm_from_config,
-    create_surfsense_deep_agent,
-    load_llm_config_from_yaml,
-)
+from .chat_deepagent import create_surfsense_deep_agent
+from .llm_config import create_chat_litellm_from_config, load_llm_config_from_yaml
 
 # =============================================================================
 # Test Runner
diff --git a/surfsense_backend/app/agents/new_chat/system_prompt.py b/surfsense_backend/app/agents/new_chat/system_prompt.py
new file mode 100644
index 000000000..65a5b1203
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/system_prompt.py
@@ -0,0 +1,143 @@
+"""
+System prompt building for SurfSense agents.
+
+This module provides functions and constants for building the SurfSense system prompt
+with configurable user instructions and citation support.
+"""
+
+from datetime import UTC, datetime
+
+SURFSENSE_CITATION_INSTRUCTIONS = """
+<citation_instructions>
+CRITICAL CITATION REQUIREMENTS:
+
+1. For EVERY piece of information you include from the documents, add a citation in the format [citation:chunk_id] where chunk_id is the exact value from the `<chunk id='...'>` tag inside `<document_content>`.
+2. Make sure ALL factual statements from the documents have proper citations.
+3. If multiple chunks support the same point, include all relevant citations [citation:chunk_id1], [citation:chunk_id2].
+4. You MUST use the exact chunk_id values from the `<chunk id='...'>` attributes. Do not create your own citation numbers.
+5. Every citation MUST be in the format [citation:chunk_id] where chunk_id is the exact chunk id value.
+6. Never modify or change the chunk_id - always use the original values exactly as provided in the chunk tags.
+7. Do not return citations as clickable links.
+8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
+9. Citations must ONLY appear as [citation:chunk_id] or [citation:chunk_id1], [citation:chunk_id2] format - never with parentheses, hyperlinks, or other formatting.
+10. Never make up chunk IDs. Only use chunk_id values that are explicitly provided in the `<chunk id='...'>` tags.
+11. If you are unsure about a chunk_id, do not include a citation rather than guessing or making one up.
+
+<document_structure_example>
+The documents you receive are structured like this:
+
+<document>
+<document_metadata>
+  <document_id>42</document_id>
+  <document_type>GITHUB_CONNECTOR</document_type>
+  <title><![CDATA[Some repo / file / issue title]]></title>
+  <url><![CDATA[https://example.com]]></url>
+  <metadata_json><![CDATA[{{"any":"other metadata"}}]]></metadata_json>
+</document_metadata>
+
+<document_content>
+  <chunk id='123'><![CDATA[First chunk text...]]></chunk>
+  <chunk id='124'><![CDATA[Second chunk text...]]></chunk>
+</document_content>
+</document>
+
+IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124). Do NOT cite document_id.
+</document_structure_example>
+
+<citation_format>
+- Every fact from the documents must have a citation in the format [citation:chunk_id] where chunk_id is the EXACT id value from a `<chunk id='...'>` tag
+- Citations should appear at the end of the sentence containing the information they support
+- Multiple citations should be separated by commas: [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
+- No need to return references section. Just citations in answer.
+- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
+- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
+- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
+</citation_format>
+
+<citation_examples>
+CORRECT citation formats:
+- [citation:5]
+- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
+
+INCORRECT citation formats (DO NOT use):
+- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
+- Using parentheses around brackets: ([citation:5])
+- Using hyperlinked text: [link to source 5](https://example.com)
+- Using footnote style: ... library¹
+- Making up source IDs when source_id is unknown
+- Using old IEEE format: [1], [2], [3]
+- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
+</citation_examples>
+
+<citation_output_example>
+Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
+
+The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
+
+However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
+</citation_output_example>
+</citation_instructions>
+"""
+
+
+def build_surfsense_system_prompt(
+    today: datetime | None = None,
+    user_instructions: str | None = None,
+    enable_citations: bool = True,
+) -> str:
+    """
+    Build the SurfSense system prompt with optional user instructions and citation toggle.
+
+    Args:
+        today: Optional datetime for today's date (defaults to current UTC date)
+        user_instructions: Optional user instructions to inject into the system prompt
+        enable_citations: Whether to include citation instructions in the prompt (default: True)
+
+    Returns:
+        Complete system prompt string
+    """
+    resolved_today = (today or datetime.now(UTC)).astimezone(UTC).date().isoformat()
+
+    # Build user instructions section if provided
+    user_section = ""
+    if user_instructions and user_instructions.strip():
+        user_section = f"""
+<user_instructions>
+{user_instructions.strip()}
+</user_instructions>
+"""
+
+    # Include citation instructions only if enabled
+    citation_section = (
+        f"\n{SURFSENSE_CITATION_INSTRUCTIONS}" if enable_citations else ""
+    )
+
+    return f"""
+<system_instruction>
+You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
+
+Today's date (UTC): {resolved_today}
+
+</system_instruction>{user_section}
+<tools>
+You have access to the following tools:
+- search_knowledge_base: Search the user's personal knowledge base for relevant information.
+  - Args:
+    - query: The search query - be specific and include key terms
+    - top_k: Number of results to retrieve (default: 10)
+    - start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00")
+    - end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00")
+    - connectors_to_search: Optional list of connector enums to search. If omitted, searches all.
+  - Returns: Formatted string with relevant documents and their content
+</tools>
+<tool_call_examples>
+- User: "Fetch all my notes and what's in them?"
+  - Call: `search_knowledge_base(query="*", top_k=50, connectors_to_search=["NOTE"])`
+
+- User: "What did I discuss on Slack last week about the React migration?"
+  - Call: `search_knowledge_base(query="React migration", connectors_to_search=["SLACK_CONNECTOR"], start_date="YYYY-MM-DD", end_date="YYYY-MM-DD")`
+</tool_call_examples>{citation_section}
+"""
+
+
+SURFSENSE_SYSTEM_PROMPT = build_surfsense_system_prompt()