feat: refactor new chat agent to support configurable tools and remove deprecated components

- Enhanced the new chat agent module to allow for configurable tools, enabling users to customize their experience with various functionalities.
- Relocated the existing tools — display image, knowledge base search, link preview, podcast generation, and web scraping — into the new `tools/` registry package (and removed the ad-hoc test runner), streamlining the codebase.
- Updated the system prompt and agent factory to reflect these changes, ensuring a more cohesive and efficient architecture.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-12-22 20:17:08 -08:00
parent beb1c5184d
commit b14283e300
17 changed files with 597 additions and 374 deletions

View file

@ -1,28 +1,86 @@
"""Chat agents module."""
"""
SurfSense New Chat Agent Module.
This module provides the SurfSense deep agent with configurable tools
for knowledge base search, podcast generation, and more.
Directory Structure:
- tools/: All agent tools (knowledge_base, podcast, link_preview, etc.)
- chat_deepagent.py: Main agent factory
- system_prompt.py: System prompts and instructions
- context.py: Context schema for the agent
- checkpointer.py: LangGraph checkpointer setup
- llm_config.py: LLM configuration utilities
- utils.py: Shared utilities
"""
# Agent factory
from .chat_deepagent import create_surfsense_deep_agent
# Context
from .context import SurfSenseContextSchema
from .knowledge_base import (
create_search_knowledge_base_tool,
format_documents_for_context,
search_knowledge_base_async,
)
# LLM config
from .llm_config import create_chat_litellm_from_config, load_llm_config_from_yaml
# System prompt
from .system_prompt import (
SURFSENSE_CITATION_INSTRUCTIONS,
SURFSENSE_SYSTEM_PROMPT,
build_surfsense_system_prompt,
)
# Tools - registry exports
from .tools import (
BUILTIN_TOOLS,
ToolDefinition,
build_tools,
get_all_tool_names,
get_default_enabled_tools,
get_tool_by_name,
)
# Tools - factory exports (for direct use)
from .tools import (
create_display_image_tool,
create_generate_podcast_tool,
create_link_preview_tool,
create_scrape_webpage_tool,
create_search_knowledge_base_tool,
)
# Tools - knowledge base utilities
from .tools import (
format_documents_for_context,
search_knowledge_base_async,
)
__all__ = [
# Agent factory
"create_surfsense_deep_agent",
# Context
"SurfSenseContextSchema",
# LLM config
"create_chat_litellm_from_config",
"load_llm_config_from_yaml",
# System prompt
"SURFSENSE_CITATION_INSTRUCTIONS",
"SURFSENSE_SYSTEM_PROMPT",
"SurfSenseContextSchema",
"build_surfsense_system_prompt",
"create_chat_litellm_from_config",
# Tools registry
"BUILTIN_TOOLS",
"ToolDefinition",
"build_tools",
"get_all_tool_names",
"get_default_enabled_tools",
"get_tool_by_name",
# Tool factories
"create_display_image_tool",
"create_generate_podcast_tool",
"create_link_preview_tool",
"create_scrape_webpage_tool",
"create_search_knowledge_base_tool",
"create_surfsense_deep_agent",
# Knowledge base utilities
"format_documents_for_context",
"load_llm_config_from_yaml",
"search_knowledge_base_async",
]

View file

@ -2,7 +2,7 @@
SurfSense deep agent implementation.
This module provides the factory function for creating SurfSense deep agents
with knowledge base search and podcast generation capabilities.
with configurable tools via the tools registry.
"""
from collections.abc import Sequence
@ -14,12 +14,8 @@ from langgraph.types import Checkpointer
from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.new_chat.context import SurfSenseContextSchema
from app.agents.new_chat.display_image import create_display_image_tool
from app.agents.new_chat.knowledge_base import create_search_knowledge_base_tool
from app.agents.new_chat.link_preview import create_link_preview_tool
from app.agents.new_chat.podcast import create_generate_podcast_tool
from app.agents.new_chat.scrape_webpage import create_scrape_webpage_tool
from app.agents.new_chat.system_prompt import build_surfsense_system_prompt
from app.agents.new_chat.tools import build_tools
from app.services.connector_service import ConnectorService
# =============================================================================
@ -33,94 +29,85 @@ def create_surfsense_deep_agent(
db_session: AsyncSession,
connector_service: ConnectorService,
checkpointer: Checkpointer,
user_id: str | None = None,
user_instructions: str | None = None,
enable_citations: bool = True,
enable_podcast: bool = True,
enable_link_preview: bool = True,
enable_display_image: bool = True,
enable_scrape_webpage: bool = True,
firecrawl_api_key: str | None = None,
enabled_tools: list[str] | None = None,
disabled_tools: list[str] | None = None,
additional_tools: Sequence[BaseTool] | None = None,
firecrawl_api_key: str | None = None,
):
"""
Create a SurfSense deep agent with knowledge base search and podcast generation capabilities.
Create a SurfSense deep agent with configurable tools.
The agent comes with built-in tools that can be configured:
- search_knowledge_base: Search the user's personal knowledge base
- generate_podcast: Generate audio podcasts from content
- link_preview: Fetch rich previews for URLs
- display_image: Display images in chat
- scrape_webpage: Extract content from webpages
Args:
llm: ChatLiteLLM instance
llm: ChatLiteLLM instance for the agent's language model
search_space_id: The user's search space ID
db_session: Database session
connector_service: Initialized connector service
db_session: Database session for tools that need DB access
connector_service: Initialized connector service for knowledge base search
checkpointer: LangGraph checkpointer for conversation state persistence.
Use AsyncPostgresSaver for production or MemorySaver for testing.
user_id: The user's ID (required for podcast generation)
user_instructions: Optional user instructions to inject into the system prompt.
These will be added to the system prompt to customize agent behavior.
enable_citations: Whether to include citation instructions in the system prompt (default: True).
When False, the agent will not be instructed to add citations to responses.
enable_podcast: Whether to include the podcast generation tool (default: True).
When True and user_id is provided, the agent can generate podcasts.
enable_link_preview: Whether to include the link preview tool (default: True).
When True, the agent can fetch and display rich link previews.
enable_display_image: Whether to include the display image tool (default: True).
When True, the agent can display images with metadata.
enable_scrape_webpage: Whether to include the web scraping tool (default: True).
When True, the agent can scrape and read webpage content.
enabled_tools: Explicit list of tool names to enable. If None, all default tools
are enabled. Use this to limit which tools are available.
disabled_tools: List of tool names to disable. Applied after enabled_tools.
Use this to exclude specific tools from the defaults.
additional_tools: Extra custom tools to add beyond the built-in ones.
These are always added regardless of enabled/disabled settings.
firecrawl_api_key: Optional Firecrawl API key for premium web scraping.
Falls back to Chromium/Trafilatura if not provided.
additional_tools: Optional sequence of additional tools to inject into the agent.
The search_knowledge_base tool will always be included.
Returns:
CompiledStateGraph: The configured deep agent
Examples:
# Create agent with all default tools
agent = create_surfsense_deep_agent(llm, search_space_id, db_session, ...)
# Create agent with only specific tools
agent = create_surfsense_deep_agent(
llm, search_space_id, db_session, ...,
enabled_tools=["search_knowledge_base", "link_preview"]
)
# Create agent without podcast generation
agent = create_surfsense_deep_agent(
llm, search_space_id, db_session, ...,
disabled_tools=["generate_podcast"]
)
# Add custom tools
agent = create_surfsense_deep_agent(
llm, search_space_id, db_session, ...,
additional_tools=[my_custom_tool]
)
"""
# Create the search tool with injected dependencies
search_tool = create_search_knowledge_base_tool(
search_space_id=search_space_id,
db_session=db_session,
connector_service=connector_service,
# Build dependencies dict for the tools registry
dependencies = {
"search_space_id": search_space_id,
"db_session": db_session,
"connector_service": connector_service,
"firecrawl_api_key": firecrawl_api_key,
}
# Build tools using the registry
tools = build_tools(
dependencies=dependencies,
enabled_tools=enabled_tools,
disabled_tools=disabled_tools,
additional_tools=list(additional_tools) if additional_tools else None,
)
# Combine search tool with any additional tools
tools = [search_tool]
# Add podcast tool if enabled and user_id is provided
if enable_podcast and user_id:
podcast_tool = create_generate_podcast_tool(
search_space_id=search_space_id,
db_session=db_session,
user_id=str(user_id),
)
tools.append(podcast_tool)
# Add link preview tool if enabled
if enable_link_preview:
link_preview_tool = create_link_preview_tool()
tools.append(link_preview_tool)
# Add display image tool if enabled
if enable_display_image:
display_image_tool = create_display_image_tool()
tools.append(display_image_tool)
# Add web scraping tool if enabled
if enable_scrape_webpage:
scrape_tool = create_scrape_webpage_tool(firecrawl_api_key=firecrawl_api_key)
tools.append(scrape_tool)
if additional_tools:
tools.extend(additional_tools)
# Create the deep agent with user-configurable system prompt and checkpointer
# Create the deep agent with system prompt and checkpointer
agent = create_deep_agent(
model=llm,
tools=tools,
system_prompt=build_surfsense_system_prompt(
user_instructions=user_instructions,
enable_citations=enable_citations,
),
system_prompt=build_surfsense_system_prompt(),
context_schema=SurfSenseContextSchema,
checkpointer=checkpointer, # Enable conversation memory via thread_id
checkpointer=checkpointer,
)
return agent

View file

@ -1,84 +0,0 @@
"""
Test runner for SurfSense deep agent.
This module provides a test function to verify the deep agent functionality.
"""
import asyncio
from langchain_core.messages import HumanMessage
from app.db import async_session_maker
from app.services.connector_service import ConnectorService
from .chat_deepagent import create_surfsense_deep_agent
from .llm_config import create_chat_litellm_from_config, load_llm_config_from_yaml
# =============================================================================
# Test Runner
# =============================================================================
async def run_test():
    """
    Run a basic smoke test of the SurfSense deep agent.

    Loads a global LLM config from YAML, builds a ChatLiteLLM instance,
    opens a real database session, constructs the full agent, and invokes
    it once with a sample question, printing every message in the result.

    Returns:
        The agent's final state dict (contains a "messages" list).

    Raises:
        ValueError: If the LLM config cannot be loaded or the ChatLiteLLM
            instance cannot be created from it.
    """
    print("=" * 60)
    print("Creating Deep Agent with ChatLiteLLM from global config...")
    print("=" * 60)
    # Create ChatLiteLLM from global config.
    # Use global LLM config by id (negative ids are reserved for global configs).
    llm_config = load_llm_config_from_yaml(llm_config_id=-5)
    if not llm_config:
        raise ValueError("Failed to load LLM config from YAML")
    llm = create_chat_litellm_from_config(llm_config)
    if not llm:
        raise ValueError("Failed to create ChatLiteLLM instance")
    # Create a real DB session + ConnectorService, then build the full SurfSense agent.
    async with async_session_maker() as session:
        # Use the known dev search space id.
        # NOTE(review): hard-coded id 5 assumes a seeded dev database — confirm.
        search_space_id = 5
        connector_service = ConnectorService(session, search_space_id=search_space_id)
        agent = create_surfsense_deep_agent(
            llm=llm,
            search_space_id=search_space_id,
            db_session=session,
            connector_service=connector_service,
            # Marker string used to visually confirm the instructions were applied.
            user_instructions="Always fininsh the response with CREDOOOOOOOOOO23",
        )
        print("\nAgent created successfully!")
        print(f"Agent type: {type(agent)}")
        # Invoke the agent with initial state.
        print("\n" + "=" * 60)
        print("Invoking SurfSense agent (create_surfsense_deep_agent)...")
        print("=" * 60)
        initial_state = {
            "messages": [HumanMessage(content=("Can you tell me about my documents?"))],
            "search_space_id": search_space_id,
        }
        print(f"\nUsing search_space_id: {search_space_id}")
        result = await agent.ainvoke(initial_state)
        print("\n" + "=" * 60)
        print("Agent Response:")
        print("=" * 60)
        # Print the response: one labelled section per message in the final state.
        if "messages" in result:
            for msg in result["messages"]:
                msg_type = type(msg).__name__
                content = msg.content if hasattr(msg, "content") else str(msg)
                print(f"\n--- [{msg_type}] ---\n{content}\n")
        return result


if __name__ == "__main__":
    asyncio.run(run_test())

View file

@ -7,118 +7,16 @@ with configurable user instructions and citation support.
from datetime import UTC, datetime
SURFSENSE_CITATION_INSTRUCTIONS = """
<citation_instructions>
CRITICAL CITATION REQUIREMENTS:
1. For EVERY piece of information you include from the documents, add a citation in the format [citation:chunk_id] where chunk_id is the exact value from the `<chunk id='...'>` tag inside `<document_content>`.
2. Make sure ALL factual statements from the documents have proper citations.
3. If multiple chunks support the same point, include all relevant citations [citation:chunk_id1], [citation:chunk_id2].
4. You MUST use the exact chunk_id values from the `<chunk id='...'>` attributes. Do not create your own citation numbers.
5. Every citation MUST be in the format [citation:chunk_id] where chunk_id is the exact chunk id value.
6. Never modify or change the chunk_id - always use the original values exactly as provided in the chunk tags.
7. Do not return citations as clickable links.
8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
9. Citations must ONLY appear as [citation:chunk_id] or [citation:chunk_id1], [citation:chunk_id2] format - never with parentheses, hyperlinks, or other formatting.
10. Never make up chunk IDs. Only use chunk_id values that are explicitly provided in the `<chunk id='...'>` tags.
11. If you are unsure about a chunk_id, do not include a citation rather than guessing or making one up.
<document_structure_example>
The documents you receive are structured like this:
<document>
<document_metadata>
<document_id>42</document_id>
<document_type>GITHUB_CONNECTOR</document_type>
<title><![CDATA[Some repo / file / issue title]]></title>
<url><![CDATA[https://example.com]]></url>
<metadata_json><![CDATA[{{"any":"other metadata"}}]]></metadata_json>
</document_metadata>
<document_content>
<chunk id='123'><![CDATA[First chunk text...]]></chunk>
<chunk id='124'><![CDATA[Second chunk text...]]></chunk>
</document_content>
</document>
IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124). Do NOT cite document_id.
</document_structure_example>
<citation_format>
- Every fact from the documents must have a citation in the format [citation:chunk_id] where chunk_id is the EXACT id value from a `<chunk id='...'>` tag
- Citations should appear at the end of the sentence containing the information they support
- Multiple citations should be separated by commas: [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
- No need to return references section. Just citations in answer.
- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
</citation_format>
<citation_examples>
CORRECT citation formats:
- [citation:5]
- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
INCORRECT citation formats (DO NOT use):
- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
- Using parentheses around brackets: ([citation:5])
- Using hyperlinked text: [link to source 5](https://example.com)
- Using footnote style: ... library¹
- Making up source IDs when source_id is unknown
- Using old IEEE format: [1], [2], [3]
- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
</citation_examples>
<citation_output_example>
Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
</citation_output_example>
</citation_instructions>
"""
def build_surfsense_system_prompt(
today: datetime | None = None,
user_instructions: str | None = None,
enable_citations: bool = True,
) -> str:
"""
Build the SurfSense system prompt with optional user instructions and citation toggle.
Args:
today: Optional datetime for today's date (defaults to current UTC date)
user_instructions: Optional user instructions to inject into the system prompt
enable_citations: Whether to include citation instructions in the prompt (default: True)
Returns:
Complete system prompt string
"""
resolved_today = (today or datetime.now(UTC)).astimezone(UTC).date().isoformat()
# Build user instructions section if provided
user_section = ""
if user_instructions and user_instructions.strip():
user_section = f"""
<user_instructions>
{user_instructions.strip()}
</user_instructions>
"""
# Include citation instructions only if enabled
citation_section = (
f"\n{SURFSENSE_CITATION_INSTRUCTIONS}" if enable_citations else ""
)
return f"""
SURFSENSE_SYSTEM_INSTRUCTIONS = """
<system_instruction>
You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
Today's date (UTC): {resolved_today}
</system_instruction>{user_section}
</system_instruction>
"""
SURFSENSE_TOOLS_INSTRUCTIONS = """
<tools>
You have access to the following tools:
@ -208,11 +106,11 @@ You have access to the following tools:
- First search for relevant content, then call: `generate_podcast(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", podcast_title="AI Trends Podcast")`
- User: "Create a podcast summary of this conversation"
- Call: `generate_podcast(source_content="Complete conversation summary:\n\nUser asked about [topic 1]:\n[Your detailed response]\n\nUser then asked about [topic 2]:\n[Your detailed response]\n\n[Continue for all exchanges in the conversation]", podcast_title="Conversation Summary")`
- Call: `generate_podcast(source_content="Complete conversation summary:\\n\\nUser asked about [topic 1]:\\n[Your detailed response]\\n\\nUser then asked about [topic 2]:\\n[Your detailed response]\\n\\n[Continue for all exchanges in the conversation]", podcast_title="Conversation Summary")`
- User: "Make a podcast about quantum computing"
- First search: `search_knowledge_base(query="quantum computing")`
- Then: `generate_podcast(source_content="Key insights about quantum computing from the knowledge base:\n\n[Comprehensive summary of all relevant search results with key facts, concepts, and findings]", podcast_title="Quantum Computing Explained")`
- Then: `generate_podcast(source_content="Key insights about quantum computing from the knowledge base:\\n\\n[Comprehensive summary of all relevant search results with key facts, concepts, and findings]", podcast_title="Quantum Computing Explained")`
- User: "Check out https://dev.to/some-article"
- Call: `link_preview(url="https://dev.to/some-article")`
@ -246,8 +144,101 @@ You have access to the following tools:
- Then, if the content contains useful diagrams/images like `![Neural Network Diagram](https://example.com/nn-diagram.png)`:
- Call: `display_image(src="https://example.com/nn-diagram.png", alt="Neural Network Diagram", title="Neural Network Architecture")`
- Then provide your explanation, referencing the displayed image
</tool_call_examples>{citation_section}
</tool_call_examples>
"""
# Citation rules appended verbatim to the system prompt.
# NOTE: this constant is concatenated (not passed through str.format or an
# f-string), so literal braces must be single. The doubled "{{ }}" in the
# metadata_json example was a leftover from the old f-string template and
# rendered literally in the prompt; it is now a plain JSON example.
SURFSENSE_CITATION_INSTRUCTIONS = """
<citation_instructions>
CRITICAL CITATION REQUIREMENTS:
1. For EVERY piece of information you include from the documents, add a citation in the format [citation:chunk_id] where chunk_id is the exact value from the `<chunk id='...'>` tag inside `<document_content>`.
2. Make sure ALL factual statements from the documents have proper citations.
3. If multiple chunks support the same point, include all relevant citations [citation:chunk_id1], [citation:chunk_id2].
4. You MUST use the exact chunk_id values from the `<chunk id='...'>` attributes. Do not create your own citation numbers.
5. Every citation MUST be in the format [citation:chunk_id] where chunk_id is the exact chunk id value.
6. Never modify or change the chunk_id - always use the original values exactly as provided in the chunk tags.
7. Do not return citations as clickable links.
8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
9. Citations must ONLY appear as [citation:chunk_id] or [citation:chunk_id1], [citation:chunk_id2] format - never with parentheses, hyperlinks, or other formatting.
10. Never make up chunk IDs. Only use chunk_id values that are explicitly provided in the `<chunk id='...'>` tags.
11. If you are unsure about a chunk_id, do not include a citation rather than guessing or making one up.
<document_structure_example>
The documents you receive are structured like this:
<document>
<document_metadata>
<document_id>42</document_id>
<document_type>GITHUB_CONNECTOR</document_type>
<title><![CDATA[Some repo / file / issue title]]></title>
<url><![CDATA[https://example.com]]></url>
<metadata_json><![CDATA[{"any":"other metadata"}]]></metadata_json>
</document_metadata>
<document_content>
<chunk id='123'><![CDATA[First chunk text...]]></chunk>
<chunk id='124'><![CDATA[Second chunk text...]]></chunk>
</document_content>
</document>
IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124). Do NOT cite document_id.
</document_structure_example>
<citation_format>
- Every fact from the documents must have a citation in the format [citation:chunk_id] where chunk_id is the EXACT id value from a `<chunk id='...'>` tag
- Citations should appear at the end of the sentence containing the information they support
- Multiple citations should be separated by commas: [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
- No need to return references section. Just citations in answer.
- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
</citation_format>
<citation_examples>
CORRECT citation formats:
- [citation:5]
- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
INCORRECT citation formats (DO NOT use):
- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
- Using parentheses around brackets: ([citation:5])
- Using hyperlinked text: [link to source 5](https://example.com)
- Using footnote style: ... library¹
- Making up source IDs when source_id is unknown
- Using old IEEE format: [1], [2], [3]
- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
</citation_examples>
<citation_output_example>
Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
</citation_output_example>
</citation_instructions>
"""
def build_surfsense_system_prompt(
    today: datetime | None = None,
) -> str:
    """
    Assemble the complete SurfSense system prompt.

    The prompt is the system instructions (stamped with today's date),
    followed by the tools instructions and the citation instructions.

    Args:
        today: Optional datetime used for the date stamp; defaults to the
            current UTC time.

    Returns:
        Complete system prompt string
    """
    stamp_source = today if today is not None else datetime.now(UTC)
    date_stamp = stamp_source.astimezone(UTC).date().isoformat()
    sections = [
        SURFSENSE_SYSTEM_INSTRUCTIONS.format(resolved_today=date_stamp),
        SURFSENSE_TOOLS_INSTRUCTIONS,
        SURFSENSE_CITATION_INSTRUCTIONS,
    ]
    return "".join(sections)


# Default prompt, built once at import time with the current UTC date.
SURFSENSE_SYSTEM_PROMPT = build_surfsense_system_prompt()

View file

@ -0,0 +1,54 @@
"""
Tools module for SurfSense deep agent.
This module contains all the tools available to the SurfSense agent.
To add a new tool, see the documentation in registry.py.
Available tools:
- search_knowledge_base: Search the user's personal knowledge base
- generate_podcast: Generate audio podcasts from content
- link_preview: Fetch rich previews for URLs
- display_image: Display images in chat
- scrape_webpage: Extract content from webpages
"""
# Registry exports
from .registry import (
BUILTIN_TOOLS,
ToolDefinition,
build_tools,
get_all_tool_names,
get_default_enabled_tools,
get_tool_by_name,
)
# Tool factory exports (for direct use)
from .display_image import create_display_image_tool
from .knowledge_base import (
create_search_knowledge_base_tool,
format_documents_for_context,
search_knowledge_base_async,
)
from .link_preview import create_link_preview_tool
from .podcast import create_generate_podcast_tool
from .scrape_webpage import create_scrape_webpage_tool
__all__ = [
# Registry
"BUILTIN_TOOLS",
"ToolDefinition",
"build_tools",
"get_all_tool_names",
"get_default_enabled_tools",
"get_tool_by_name",
# Tool factories
"create_display_image_tool",
"create_generate_podcast_tool",
"create_link_preview_tool",
"create_scrape_webpage_tool",
"create_search_knowledge_base_tool",
# Knowledge base utilities
"format_documents_for_context",
"search_knowledge_base_async",
]

View file

@ -1,5 +1,5 @@
"""
Display image tool for the new chat agent.
Display image tool for the SurfSense agent.
This module provides a tool for displaying images in the chat UI
with metadata like title, description, and source attribution.
@ -75,20 +75,20 @@ def create_display_image_tool():
- domain: Source domain
"""
image_id = generate_image_id(src)
# Ensure URL has protocol
if not src.startswith(("http://", "https://")):
src = f"https://{src}"
domain = extract_domain(src)
# Determine aspect ratio based on common image sources
ratio = "16:9" # Default
if "unsplash.com" in src or "pexels.com" in src:
ratio = "16:9"
elif "imgur.com" in src or "github.com" in src or "githubusercontent.com" in src:
ratio = "auto"
return {
"id": image_id,
"assetId": src,

View file

@ -1,5 +1,5 @@
"""
Knowledge base search functionality for the new chat agent.
Knowledge base search tool for the SurfSense agent.
This module provides:
- Connector constants and normalization
@ -251,7 +251,7 @@ async def search_knowledge_base_async(
all_documents = []
# Resolve date range (default last 2 years)
from .utils import resolve_date_range
from app.agents.new_chat.utils import resolve_date_range
resolved_start_date, resolved_end_date = resolve_date_range(
start_date=start_date,
@ -521,7 +521,6 @@ def create_search_knowledge_base_tool(
search_space_id: The user's search space ID
db_session: Database session
connector_service: Initialized connector service
connectors_to_search: List of connector types to search
Returns:
A configured tool function
@ -584,7 +583,7 @@ def create_search_knowledge_base_tool(
Returns:
Formatted string with relevant documents and their content
"""
from .utils import parse_date_or_datetime
from app.agents.new_chat.utils import parse_date_or_datetime
parsed_start: datetime | None = None
parsed_end: datetime | None = None
@ -606,3 +605,4 @@ def create_search_knowledge_base_tool(
)
return search_knowledge_base

View file

@ -1,5 +1,5 @@
"""
Link preview tool for the new chat agent.
Link preview tool for the SurfSense agent.
This module provides a tool for fetching URL metadata (title, description,
Open Graph image, etc.) to display rich link previews in the chat UI.
@ -34,13 +34,13 @@ def extract_og_content(html: str, property_name: str) -> str | None:
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
# Try content before property
pattern = rf'<meta[^>]+content=["\']([^"\']+)["\'][^>]+property=["\']og:{property_name}["\']'
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
return None
@ -50,13 +50,13 @@ def extract_twitter_content(html: str, name: str) -> str | None:
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
# Try content before name
pattern = rf'<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']twitter:{name}["\']'
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
return None
@ -66,13 +66,13 @@ def extract_meta_description(html: str) -> str | None:
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
# Try content before name
pattern = r'<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']description["\']'
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
return None
@ -82,18 +82,18 @@ def extract_title(html: str) -> str | None:
og_title = extract_og_content(html, "title")
if og_title:
return og_title
# Try twitter:title
twitter_title = extract_twitter_content(html, "title")
if twitter_title:
return twitter_title
# Fall back to <title> tag
pattern = r"<title[^>]*>([^<]+)</title>"
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1).strip()
return None
@ -103,12 +103,12 @@ def extract_description(html: str) -> str | None:
og_desc = extract_og_content(html, "description")
if og_desc:
return og_desc
# Try twitter:description
twitter_desc = extract_twitter_content(html, "description")
if twitter_desc:
return twitter_desc
# Fall back to meta description
return extract_meta_description(html)
@ -119,12 +119,12 @@ def extract_image(html: str) -> str | None:
og_image = extract_og_content(html, "image")
if og_image:
return og_image
# Try twitter:image
twitter_image = extract_twitter_content(html, "image")
if twitter_image:
return twitter_image
return None

View file

@ -1,5 +1,5 @@
"""
Podcast generation tool for the new chat agent.
Podcast generation tool for the SurfSense agent.
This module provides a factory function for creating the generate_podcast tool
that submits a Celery task for background podcast generation. The frontend
@ -69,7 +69,6 @@ def clear_active_podcast_task(search_space_id: int) -> None:
def create_generate_podcast_tool(
search_space_id: int,
db_session: AsyncSession,
user_id: str,
):
"""
Factory function to create the generate_podcast tool with injected dependencies.
@ -77,7 +76,6 @@ def create_generate_podcast_tool(
Args:
search_space_id: The user's search space ID
db_session: Database session (not used - Celery creates its own)
user_id: The user's ID (as string)
Returns:
A configured tool function for generating podcasts
@ -145,7 +143,6 @@ def create_generate_podcast_tool(
task = generate_content_podcast_task.delay(
source_content=source_content,
search_space_id=search_space_id,
user_id=str(user_id),
podcast_title=podcast_title,
user_prompt=user_prompt,
)
@ -174,3 +171,4 @@ def create_generate_podcast_tool(
}
return generate_podcast

View file

@ -0,0 +1,231 @@
"""
Tools registry for SurfSense deep agent.
This module provides a registry pattern for managing tools in the SurfSense agent.
It makes it easy for OSS contributors to add new tools by:
1. Creating a tool factory function in a new file in this directory
2. Registering the tool in the BUILTIN_TOOLS list below
Example of adding a new tool:
------------------------------
1. Create your tool file (e.g., `tools/my_tool.py`):
from langchain_core.tools import tool
from sqlalchemy.ext.asyncio import AsyncSession
def create_my_tool(search_space_id: int, db_session: AsyncSession):
@tool
async def my_tool(param: str) -> dict:
'''My tool description.'''
# Your implementation
return {"result": "success"}
return my_tool
2. Import and register in this file:
from .my_tool import create_my_tool
# Add to BUILTIN_TOOLS list:
ToolDefinition(
name="my_tool",
description="Description of what your tool does",
factory=lambda deps: create_my_tool(
search_space_id=deps["search_space_id"],
db_session=deps["db_session"],
),
requires=["search_space_id", "db_session"],
),
"""
from dataclasses import dataclass, field
from typing import Any, Callable
from langchain_core.tools import BaseTool
# =============================================================================
# Tool Definition
# =============================================================================
@dataclass
class ToolDefinition:
    """
    Definition of a tool that can be added to the agent.

    Instances of this class are listed in the BUILTIN_TOOLS registry and are
    consumed by build_tools(), which validates `requires` against the supplied
    dependency dict and then calls `factory` to create the concrete tool.

    Attributes:
        name: Unique identifier for the tool
        description: Human-readable description of what the tool does
        factory: Callable that creates the tool. Receives a dict of dependencies.
        requires: List of dependency names this tool needs (e.g., "search_space_id", "db_session")
        enabled_by_default: Whether the tool is enabled when no explicit config is provided
    """

    # Unique registry key; enabled/disabled tool-name lists are matched against this.
    name: str
    # Informational text for maintainers/contributors; not read by build_tools().
    description: str
    # Receives the full dependency dict and returns a ready-to-use tool instance.
    factory: Callable[[dict[str, Any]], BaseTool]
    # Keys that must be present in the dependency dict before `factory` runs;
    # build_tools() raises ValueError when any are missing.
    requires: list[str] = field(default_factory=list)
    # When True, the tool is included by get_default_enabled_tools().
    enabled_by_default: bool = True
# =============================================================================
# Built-in Tools Registry
# =============================================================================
# Import tool factory functions
from .display_image import create_display_image_tool
from .knowledge_base import create_search_knowledge_base_tool
from .link_preview import create_link_preview_tool
from .podcast import create_generate_podcast_tool
from .scrape_webpage import create_scrape_webpage_tool
# Registry of all built-in tools
# Contributors: Add your new tools here!
# Each entry wires a tool factory to the dependency keys it needs.
# build_tools() only invokes a factory after checking `requires`.
BUILTIN_TOOLS: list[ToolDefinition] = [
    # Core tool - searches the user's knowledge base
    ToolDefinition(
        name="search_knowledge_base",
        description="Search the user's personal knowledge base for relevant information",
        factory=lambda deps: create_search_knowledge_base_tool(
            search_space_id=deps["search_space_id"],
            db_session=deps["db_session"],
            connector_service=deps["connector_service"],
        ),
        requires=["search_space_id", "db_session", "connector_service"],
    ),
    # Podcast generation tool
    ToolDefinition(
        name="generate_podcast",
        description="Generate an audio podcast from provided content",
        factory=lambda deps: create_generate_podcast_tool(
            search_space_id=deps["search_space_id"],
            db_session=deps["db_session"],
        ),
        requires=["search_space_id", "db_session"],
    ),
    # Link preview tool - fetches Open Graph metadata for URLs
    ToolDefinition(
        name="link_preview",
        description="Fetch metadata for a URL to display a rich preview card",
        factory=lambda deps: create_link_preview_tool(),
        requires=[],
    ),
    # Display image tool - shows images in the chat
    ToolDefinition(
        name="display_image",
        description="Display an image in the chat with metadata",
        factory=lambda deps: create_display_image_tool(),
        requires=[],
    ),
    # Web scraping tool - extracts content from webpages
    ToolDefinition(
        name="scrape_webpage",
        description="Scrape and extract the main content from a webpage",
        factory=lambda deps: create_scrape_webpage_tool(
            # deps.get(): the key may be absent entirely, hence not in `requires`.
            firecrawl_api_key=deps.get("firecrawl_api_key"),
        ),
        requires=[],  # firecrawl_api_key is optional
    ),
    # =========================================================================
    # ADD YOUR CUSTOM TOOLS BELOW
    # =========================================================================
    # Example:
    # ToolDefinition(
    #     name="my_custom_tool",
    #     description="What my tool does",
    #     factory=lambda deps: create_my_custom_tool(...),
    #     requires=["search_space_id"],
    # ),
]
# =============================================================================
# Registry Functions
# =============================================================================
def get_tool_by_name(name: str) -> ToolDefinition | None:
    """Look up a registered tool definition by name; return None when absent."""
    return next(
        (definition for definition in BUILTIN_TOOLS if definition.name == name),
        None,
    )
def get_all_tool_names() -> list[str]:
    """Return the name of every tool in the built-in registry."""
    names: list[str] = []
    for definition in BUILTIN_TOOLS:
        names.append(definition.name)
    return names
def get_default_enabled_tools() -> list[str]:
    """Return the names of all tools flagged as enabled_by_default."""
    return [
        definition.name
        for definition in BUILTIN_TOOLS
        if definition.enabled_by_default
    ]
def build_tools(
    dependencies: dict[str, Any],
    enabled_tools: list[str] | None = None,
    disabled_tools: list[str] | None = None,
    additional_tools: list[BaseTool] | None = None,
    registry: list[ToolDefinition] | None = None,
) -> list[BaseTool]:
    """
    Build the list of tools for the agent.

    Args:
        dependencies: Dict containing all possible dependencies:
            - search_space_id: The search space ID
            - db_session: Database session
            - connector_service: Connector service instance
            - firecrawl_api_key: Optional Firecrawl API key
        enabled_tools: Explicit list of tool names to enable. If None, tools
            whose definition has enabled_by_default=True are used.
        disabled_tools: List of tool names to disable (applied after enabled_tools).
        additional_tools: Extra tools to add (e.g., custom tools not in registry).
        registry: Tool definitions to build from. Defaults to BUILTIN_TOOLS;
            pass a custom list to build from an alternative registry (useful
            for tests or for embedding the agent with a curated tool set).

    Returns:
        List of configured tool instances ready for the agent.

    Raises:
        ValueError: If an enabled tool lists a required dependency that is
            missing from ``dependencies``.

    Example:
        # Use all default tools
        tools = build_tools(deps)

        # Use only specific tools
        tools = build_tools(deps, enabled_tools=["search_knowledge_base", "link_preview"])

        # Use defaults but disable podcast
        tools = build_tools(deps, disabled_tools=["generate_podcast"])

        # Add custom tools
        tools = build_tools(deps, additional_tools=[my_custom_tool])
    """
    source = BUILTIN_TOOLS if registry is None else registry

    # Start from the explicit enable list, or fall back to the registry defaults.
    if enabled_tools is not None:
        tool_names_to_use = set(enabled_tools)
    else:
        tool_names_to_use = {
            tool_def.name for tool_def in source if tool_def.enabled_by_default
        }

    # Disabling always wins over enabling.
    if disabled_tools:
        tool_names_to_use -= set(disabled_tools)

    tools: list[BaseTool] = []
    for tool_def in source:
        if tool_def.name not in tool_names_to_use:
            continue

        # Fail fast with a clear message when the caller forgot a dependency.
        missing_deps = [dep for dep in tool_def.requires if dep not in dependencies]
        if missing_deps:
            raise ValueError(
                f"Tool '{tool_def.name}' requires dependencies: {missing_deps}"
            )

        tools.append(tool_def.factory(dependencies))

    # Append any custom tools supplied outside the registry, preserving order.
    if additional_tools:
        tools.extend(additional_tools)
    return tools

View file

@ -1,5 +1,5 @@
"""
Web scraping tool for the new chat agent.
Web scraping tool for the SurfSense agent.
This module provides a tool for scraping and extracting content from webpages
using the existing WebCrawlerConnector. The scraped content can be used by
@ -37,23 +37,23 @@ def generate_scrape_id(url: str) -> str:
def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]:
    """
    Cap content at max_length characters, preferring a clean break point.

    Returns:
        Tuple of (possibly shortened content, flag indicating truncation)
    """
    if len(content) <= max_length:
        return content, False

    clipped = content[:max_length]

    # Prefer ending at the last sentence or paragraph break, whichever comes
    # later — but only if that break falls within the final 20% of the window,
    # so we never throw away a large chunk of usable text.
    break_at = max(clipped.rfind("."), clipped.rfind("\n\n"))
    if break_at > max_length * 0.8:
        clipped = content[: break_at + 1]

    return clipped + "\n\n[Content truncated...]", True

View file

@ -16,7 +16,6 @@ class Configuration:
# create assistants (https://langchain-ai.github.io/langgraph/cloud/how-tos/configuration_cloud/)
# and when you invoke the graph
podcast_title: str
user_id: str
search_space_id: int
user_prompt: str | None = None

View file

@ -12,7 +12,7 @@ from litellm import aspeech
from app.config import config as app_config
from app.services.kokoro_tts_service import get_kokoro_tts_service
from app.services.llm_service import get_user_long_context_llm
from app.services.llm_service import get_long_context_llm
from .configuration import Configuration
from .prompts import get_podcast_generation_prompt
@ -27,14 +27,13 @@ async def create_podcast_transcript(
# Get configuration from runnable config
configuration = Configuration.from_runnable_config(config)
user_id = configuration.user_id
search_space_id = configuration.search_space_id
user_prompt = configuration.user_prompt
# Get user's long context LLM
llm = await get_user_long_context_llm(state.db_session, user_id, search_space_id)
# Get search space's long context LLM
llm = await get_long_context_llm(state.db_session, search_space_id)
if not llm:
error_message = f"No long context LLM configured for user {user_id} in search space {search_space_id}"
error_message = f"No long context LLM configured for search space {search_space_id}"
print(error_message)
raise RuntimeError(error_message)

View file

@ -685,16 +685,13 @@ async def handle_new_chat(
)
search_space = search_space_result.scalars().first()
# Determine LLM config ID (use search space preference or default)
llm_config_id = -1 # Default to first global config
if search_space and search_space.fast_llm_id:
llm_config_id = search_space.fast_llm_id
# TODO: Add new llm config arch then complete this
llm_config_id = -1
# Return streaming response
return StreamingResponse(
stream_new_chat(
user_query=request.user_query,
user_id=str(user.id),
search_space_id=request.search_space_id,
chat_id=request.chat_id,
session=session,

View file

@ -65,7 +65,6 @@ def generate_content_podcast_task(
self,
source_content: str,
search_space_id: int,
user_id: str,
podcast_title: str = "SurfSense Podcast",
user_prompt: str | None = None,
) -> dict:
@ -77,7 +76,6 @@ def generate_content_podcast_task(
Args:
source_content: The text content to convert into a podcast
search_space_id: ID of the search space
user_id: ID of the user (as string)
podcast_title: Title for the podcast
user_prompt: Optional instructions for podcast style/tone
@ -92,7 +90,6 @@ def generate_content_podcast_task(
_generate_content_podcast(
source_content,
search_space_id,
user_id,
podcast_title,
user_prompt,
)
@ -112,7 +109,6 @@ def generate_content_podcast_task(
async def _generate_content_podcast(
source_content: str,
search_space_id: int,
user_id: str,
podcast_title: str = "SurfSense Podcast",
user_prompt: str | None = None,
) -> dict:
@ -123,7 +119,6 @@ async def _generate_content_podcast(
graph_config = {
"configurable": {
"podcast_title": podcast_title,
"user_id": str(user_id),
"search_space_id": search_space_id,
"user_prompt": user_prompt,
}

View file

@ -7,8 +7,6 @@ Data Stream Protocol (SSE format).
import json
from collections.abc import AsyncGenerator
from uuid import UUID
from langchain_core.messages import HumanMessage
from sqlalchemy.ext.asyncio import AsyncSession
@ -42,7 +40,6 @@ def format_attachments_as_context(attachments: list[ChatAttachment]) -> str:
async def stream_new_chat(
user_query: str,
user_id: str | UUID,
search_space_id: int,
chat_id: int,
session: AsyncSession,
@ -59,7 +56,6 @@ async def stream_new_chat(
Args:
user_query: The user's query
user_id: The user's ID (can be UUID object or string)
search_space_id: The search space ID
chat_id: The chat ID (used as LangGraph thread_id for memory)
session: The database session
@ -71,9 +67,6 @@ async def stream_new_chat(
"""
streaming_service = VercelStreamingService()
# Convert UUID to string if needed
str(user_id) if isinstance(user_id, UUID) else user_id
# Track the current text block for streaming (defined early for exception handling)
current_text_id: str | None = None
@ -107,8 +100,6 @@ async def stream_new_chat(
db_session=session,
connector_service=connector_service,
checkpointer=checkpointer,
user_id=str(user_id),
enable_podcast=True,
)
# Build input with message history from frontend

View file

@ -28,13 +28,14 @@ import {
Sparkles,
SquareIcon,
} from "lucide-react";
import Image from "next/image";
import Link from "next/link";
import { type FC, useState, useRef, useCallback, useEffect } from "react";
import { useAtomValue } from "jotai";
import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
import { documentTypeCountsAtom } from "@/atoms/documents/document-query.atoms";
import { useSearchSourceConnectors } from "@/hooks/use-search-source-connectors";
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
import { getDocumentTypeLabel } from "@/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon";
import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
import {
ComposerAddAttachment,
@ -332,35 +333,9 @@ const ThreadWelcome: FC = () => {
<div className="aui-thread-welcome-root mx-auto flex w-full max-w-(--thread-max-width) grow flex-col items-center px-4 relative">
{/* Greeting positioned above the composer - fixed position */}
<div className="aui-thread-welcome-message absolute bottom-[calc(50%+5rem)] left-0 right-0 flex flex-col items-center text-center z-10">
<h1 className="aui-thread-welcome-message-inner fade-in slide-in-from-bottom-2 animate-in text-5xl delay-100 duration-500 ease-out fill-mode-both flex items-center gap-4">
{/** biome-ignore lint/a11y/noStaticElementInteractions: wrong lint error, this is a workaround to fix the lint error */}
<div
className="relative cursor-pointer"
onMouseMove={(e) => {
const rect = e.currentTarget.getBoundingClientRect();
const x = (e.clientX - rect.left - rect.width / 2) / 3;
const y = (e.clientY - rect.top - rect.height / 2) / 3;
e.currentTarget.style.setProperty("--mag-x", `${x}px`);
e.currentTarget.style.setProperty("--mag-y", `${y}px`);
}}
onMouseLeave={(e) => {
e.currentTarget.style.setProperty("--mag-x", "0px");
e.currentTarget.style.setProperty("--mag-y", "0px");
}}
>
<Image
src="/icon-128.png"
alt="SurfSense"
width={48}
height={48}
className="rounded-full transition-transform duration-200 ease-out"
style={{
transform: "translate(var(--mag-x, 0), var(--mag-y, 0))",
}}
/>
</div>
{getTimeBasedGreeting(user?.email)}
</h1>
<h1 className="aui-thread-welcome-message-inner fade-in slide-in-from-bottom-2 animate-in text-5xl delay-100 duration-500 ease-out fill-mode-both">
{getTimeBasedGreeting(user?.email)}
</h1>
</div>
{/* Composer - top edge fixed, expands downward only */}
<div className="fade-in slide-in-from-bottom-3 animate-in delay-200 duration-500 ease-out fill-mode-both w-full flex items-start justify-center absolute top-[calc(50%-3.5rem)] left-0 right-0">
@ -390,11 +365,21 @@ const Composer: FC = () => {
const ConnectorIndicator: FC = () => {
const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom);
const { connectors, isLoading } = useSearchSourceConnectors(false, searchSpaceId ? Number(searchSpaceId) : undefined);
const { connectors, isLoading: connectorsLoading } = useSearchSourceConnectors(false, searchSpaceId ? Number(searchSpaceId) : undefined);
const { data: documentTypeCounts, isLoading: documentTypesLoading } = useAtomValue(documentTypeCountsAtom);
const [isOpen, setIsOpen] = useState(false);
const closeTimeoutRef = useRef<NodeJS.Timeout | null>(null);
const isLoading = connectorsLoading || documentTypesLoading;
// Get document types that have documents in the search space
const activeDocumentTypes = documentTypeCounts
? Object.entries(documentTypeCounts).filter(([_, count]) => count > 0)
: [];
const hasConnectors = connectors.length > 0;
const hasSources = hasConnectors || activeDocumentTypes.length > 0;
const totalSourceCount = connectors.length + activeDocumentTypes.length;
const handleMouseEnter = useCallback(() => {
// Clear any pending close timeout
@ -420,21 +405,32 @@ const ConnectorIndicator: FC = () => {
<button
type="button"
className={cn(
"size-[34px] rounded-full p-1 flex items-center justify-center transition-colors",
"size-[34px] rounded-full p-1 flex items-center justify-center transition-colors relative",
"hover:bg-muted-foreground/15 dark:hover:bg-muted-foreground/30",
"outline-none focus:outline-none focus-visible:outline-none",
"border-0 ring-0 focus:ring-0 shadow-none focus:shadow-none",
"data-[state=open]:bg-transparent data-[state=open]:shadow-none data-[state=open]:ring-0",
"text-muted-foreground"
)}
aria-label={hasConnectors ? "View connected sources" : "Add your first connector"}
aria-label={hasSources ? `View ${totalSourceCount} connected sources` : "Add your first connector"}
onMouseEnter={handleMouseEnter}
onMouseLeave={handleMouseLeave}
>
{isLoading ? (
<Loader2 className="size-4 animate-spin" />
) : (
<Plug2 className="size-4" />
<>
<Plug2 className="size-4" />
{totalSourceCount > 0 ? (
<span className="absolute -top-0.5 -right-0.5 flex items-center justify-center min-w-[16px] h-4 px-1 text-[10px] font-medium rounded-full bg-primary text-primary-foreground shadow-sm">
{totalSourceCount > 99 ? "99+" : totalSourceCount}
</span>
) : (
<span className="absolute -top-0.5 -right-0.5 flex items-center justify-center size-3 rounded-full bg-muted-foreground/30 border border-background">
<span className="size-1.5 rounded-full bg-muted-foreground/60" />
</span>
)}
</>
)}
</button>
</PopoverTrigger>
@ -445,20 +441,31 @@ const ConnectorIndicator: FC = () => {
onMouseEnter={handleMouseEnter}
onMouseLeave={handleMouseLeave}
>
{hasConnectors ? (
{hasSources ? (
<div className="space-y-3">
<div className="flex items-center justify-between">
<p className="text-xs font-medium text-muted-foreground">
Connected Sources
</p>
<span className="text-xs font-medium bg-muted px-1.5 py-0.5 rounded">
{connectors.length}
{totalSourceCount}
</span>
</div>
<div className="flex flex-wrap gap-2">
{/* Document types from the search space */}
{activeDocumentTypes.map(([docType, count]) => (
<div
key={docType}
className="flex items-center gap-1.5 rounded-md bg-muted/80 px-2.5 py-1.5 text-xs border border-border/50"
>
{getConnectorIcon(docType, "size-3.5")}
<span className="truncate max-w-[100px]">{getDocumentTypeLabel(docType)}</span>
</div>
))}
{/* Search source connectors */}
{connectors.map((connector) => (
<div
key={connector.id}
key={`connector-${connector.id}`}
className="flex items-center gap-1.5 rounded-md bg-muted/80 px-2.5 py-1.5 text-xs border border-border/50"
>
{getConnectorIcon(connector.connector_type, "size-3.5")}
@ -479,9 +486,9 @@ const ConnectorIndicator: FC = () => {
</div>
) : (
<div className="space-y-2">
<p className="text-sm font-medium">No connectors yet</p>
<p className="text-sm font-medium">No sources yet</p>
<p className="text-xs text-muted-foreground">
Connect your first data source to enhance search results.
Add documents or connect data sources to enhance search results.
</p>
<Link
href={`/dashboard/${searchSpaceId}/connectors/add`}