mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
feat: refactor new chat agent to support configurable tools and remove deprecated components
- Enhanced the new chat agent module to allow for configurable tools, enabling users to customize their experience with various functionalities.
- Removed outdated tools including display image, knowledge base search, link preview, podcast generation, and web scraping, streamlining the codebase.
- Updated the system prompt and agent factory to reflect these changes, ensuring a more cohesive and efficient architecture.
This commit is contained in:
parent
beb1c5184d
commit
b14283e300
17 changed files with 597 additions and 374 deletions
|
|
@ -1,28 +1,86 @@
|
|||
"""Chat agents module."""
|
||||
"""
|
||||
SurfSense New Chat Agent Module.
|
||||
|
||||
This module provides the SurfSense deep agent with configurable tools
|
||||
for knowledge base search, podcast generation, and more.
|
||||
|
||||
Directory Structure:
|
||||
- tools/: All agent tools (knowledge_base, podcast, link_preview, etc.)
|
||||
- chat_deepagent.py: Main agent factory
|
||||
- system_prompt.py: System prompts and instructions
|
||||
- context.py: Context schema for the agent
|
||||
- checkpointer.py: LangGraph checkpointer setup
|
||||
- llm_config.py: LLM configuration utilities
|
||||
- utils.py: Shared utilities
|
||||
"""
|
||||
|
||||
# Agent factory
|
||||
from .chat_deepagent import create_surfsense_deep_agent
|
||||
|
||||
# Context
|
||||
from .context import SurfSenseContextSchema
|
||||
from .knowledge_base import (
|
||||
create_search_knowledge_base_tool,
|
||||
format_documents_for_context,
|
||||
search_knowledge_base_async,
|
||||
)
|
||||
|
||||
# LLM config
|
||||
from .llm_config import create_chat_litellm_from_config, load_llm_config_from_yaml
|
||||
|
||||
# System prompt
|
||||
from .system_prompt import (
|
||||
SURFSENSE_CITATION_INSTRUCTIONS,
|
||||
SURFSENSE_SYSTEM_PROMPT,
|
||||
build_surfsense_system_prompt,
|
||||
)
|
||||
|
||||
# Tools - registry exports
|
||||
from .tools import (
|
||||
BUILTIN_TOOLS,
|
||||
ToolDefinition,
|
||||
build_tools,
|
||||
get_all_tool_names,
|
||||
get_default_enabled_tools,
|
||||
get_tool_by_name,
|
||||
)
|
||||
|
||||
# Tools - factory exports (for direct use)
|
||||
from .tools import (
|
||||
create_display_image_tool,
|
||||
create_generate_podcast_tool,
|
||||
create_link_preview_tool,
|
||||
create_scrape_webpage_tool,
|
||||
create_search_knowledge_base_tool,
|
||||
)
|
||||
|
||||
# Tools - knowledge base utilities
|
||||
from .tools import (
|
||||
format_documents_for_context,
|
||||
search_knowledge_base_async,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Agent factory
|
||||
"create_surfsense_deep_agent",
|
||||
# Context
|
||||
"SurfSenseContextSchema",
|
||||
# LLM config
|
||||
"create_chat_litellm_from_config",
|
||||
"load_llm_config_from_yaml",
|
||||
# System prompt
|
||||
"SURFSENSE_CITATION_INSTRUCTIONS",
|
||||
"SURFSENSE_SYSTEM_PROMPT",
|
||||
"SurfSenseContextSchema",
|
||||
"build_surfsense_system_prompt",
|
||||
"create_chat_litellm_from_config",
|
||||
# Tools registry
|
||||
"BUILTIN_TOOLS",
|
||||
"ToolDefinition",
|
||||
"build_tools",
|
||||
"get_all_tool_names",
|
||||
"get_default_enabled_tools",
|
||||
"get_tool_by_name",
|
||||
# Tool factories
|
||||
"create_display_image_tool",
|
||||
"create_generate_podcast_tool",
|
||||
"create_link_preview_tool",
|
||||
"create_scrape_webpage_tool",
|
||||
"create_search_knowledge_base_tool",
|
||||
"create_surfsense_deep_agent",
|
||||
# Knowledge base utilities
|
||||
"format_documents_for_context",
|
||||
"load_llm_config_from_yaml",
|
||||
"search_knowledge_base_async",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
SurfSense deep agent implementation.
|
||||
|
||||
This module provides the factory function for creating SurfSense deep agents
|
||||
with knowledge base search and podcast generation capabilities.
|
||||
with configurable tools via the tools registry.
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
|
@ -14,12 +14,8 @@ from langgraph.types import Checkpointer
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.agents.new_chat.context import SurfSenseContextSchema
|
||||
from app.agents.new_chat.display_image import create_display_image_tool
|
||||
from app.agents.new_chat.knowledge_base import create_search_knowledge_base_tool
|
||||
from app.agents.new_chat.link_preview import create_link_preview_tool
|
||||
from app.agents.new_chat.podcast import create_generate_podcast_tool
|
||||
from app.agents.new_chat.scrape_webpage import create_scrape_webpage_tool
|
||||
from app.agents.new_chat.system_prompt import build_surfsense_system_prompt
|
||||
from app.agents.new_chat.tools import build_tools
|
||||
from app.services.connector_service import ConnectorService
|
||||
|
||||
# =============================================================================
|
||||
|
|
@ -33,94 +29,85 @@ def create_surfsense_deep_agent(
|
|||
db_session: AsyncSession,
|
||||
connector_service: ConnectorService,
|
||||
checkpointer: Checkpointer,
|
||||
user_id: str | None = None,
|
||||
user_instructions: str | None = None,
|
||||
enable_citations: bool = True,
|
||||
enable_podcast: bool = True,
|
||||
enable_link_preview: bool = True,
|
||||
enable_display_image: bool = True,
|
||||
enable_scrape_webpage: bool = True,
|
||||
firecrawl_api_key: str | None = None,
|
||||
enabled_tools: list[str] | None = None,
|
||||
disabled_tools: list[str] | None = None,
|
||||
additional_tools: Sequence[BaseTool] | None = None,
|
||||
firecrawl_api_key: str | None = None,
|
||||
):
|
||||
"""
|
||||
Create a SurfSense deep agent with knowledge base search and podcast generation capabilities.
|
||||
Create a SurfSense deep agent with configurable tools.
|
||||
|
||||
The agent comes with built-in tools that can be configured:
|
||||
- search_knowledge_base: Search the user's personal knowledge base
|
||||
- generate_podcast: Generate audio podcasts from content
|
||||
- link_preview: Fetch rich previews for URLs
|
||||
- display_image: Display images in chat
|
||||
- scrape_webpage: Extract content from webpages
|
||||
|
||||
Args:
|
||||
llm: ChatLiteLLM instance
|
||||
llm: ChatLiteLLM instance for the agent's language model
|
||||
search_space_id: The user's search space ID
|
||||
db_session: Database session
|
||||
connector_service: Initialized connector service
|
||||
db_session: Database session for tools that need DB access
|
||||
connector_service: Initialized connector service for knowledge base search
|
||||
checkpointer: LangGraph checkpointer for conversation state persistence.
|
||||
Use AsyncPostgresSaver for production or MemorySaver for testing.
|
||||
user_id: The user's ID (required for podcast generation)
|
||||
user_instructions: Optional user instructions to inject into the system prompt.
|
||||
These will be added to the system prompt to customize agent behavior.
|
||||
enable_citations: Whether to include citation instructions in the system prompt (default: True).
|
||||
When False, the agent will not be instructed to add citations to responses.
|
||||
enable_podcast: Whether to include the podcast generation tool (default: True).
|
||||
When True and user_id is provided, the agent can generate podcasts.
|
||||
enable_link_preview: Whether to include the link preview tool (default: True).
|
||||
When True, the agent can fetch and display rich link previews.
|
||||
enable_display_image: Whether to include the display image tool (default: True).
|
||||
When True, the agent can display images with metadata.
|
||||
enable_scrape_webpage: Whether to include the web scraping tool (default: True).
|
||||
When True, the agent can scrape and read webpage content.
|
||||
enabled_tools: Explicit list of tool names to enable. If None, all default tools
|
||||
are enabled. Use this to limit which tools are available.
|
||||
disabled_tools: List of tool names to disable. Applied after enabled_tools.
|
||||
Use this to exclude specific tools from the defaults.
|
||||
additional_tools: Extra custom tools to add beyond the built-in ones.
|
||||
These are always added regardless of enabled/disabled settings.
|
||||
firecrawl_api_key: Optional Firecrawl API key for premium web scraping.
|
||||
Falls back to Chromium/Trafilatura if not provided.
|
||||
additional_tools: Optional sequence of additional tools to inject into the agent.
|
||||
The search_knowledge_base tool will always be included.
|
||||
|
||||
Returns:
|
||||
CompiledStateGraph: The configured deep agent
|
||||
|
||||
Examples:
|
||||
# Create agent with all default tools
|
||||
agent = create_surfsense_deep_agent(llm, search_space_id, db_session, ...)
|
||||
|
||||
# Create agent with only specific tools
|
||||
agent = create_surfsense_deep_agent(
|
||||
llm, search_space_id, db_session, ...,
|
||||
enabled_tools=["search_knowledge_base", "link_preview"]
|
||||
)
|
||||
|
||||
# Create agent without podcast generation
|
||||
agent = create_surfsense_deep_agent(
|
||||
llm, search_space_id, db_session, ...,
|
||||
disabled_tools=["generate_podcast"]
|
||||
)
|
||||
|
||||
# Add custom tools
|
||||
agent = create_surfsense_deep_agent(
|
||||
llm, search_space_id, db_session, ...,
|
||||
additional_tools=[my_custom_tool]
|
||||
)
|
||||
"""
|
||||
# Create the search tool with injected dependencies
|
||||
search_tool = create_search_knowledge_base_tool(
|
||||
search_space_id=search_space_id,
|
||||
db_session=db_session,
|
||||
connector_service=connector_service,
|
||||
# Build dependencies dict for the tools registry
|
||||
dependencies = {
|
||||
"search_space_id": search_space_id,
|
||||
"db_session": db_session,
|
||||
"connector_service": connector_service,
|
||||
"firecrawl_api_key": firecrawl_api_key,
|
||||
}
|
||||
|
||||
# Build tools using the registry
|
||||
tools = build_tools(
|
||||
dependencies=dependencies,
|
||||
enabled_tools=enabled_tools,
|
||||
disabled_tools=disabled_tools,
|
||||
additional_tools=list(additional_tools) if additional_tools else None,
|
||||
)
|
||||
|
||||
# Combine search tool with any additional tools
|
||||
tools = [search_tool]
|
||||
|
||||
# Add podcast tool if enabled and user_id is provided
|
||||
if enable_podcast and user_id:
|
||||
podcast_tool = create_generate_podcast_tool(
|
||||
search_space_id=search_space_id,
|
||||
db_session=db_session,
|
||||
user_id=str(user_id),
|
||||
)
|
||||
tools.append(podcast_tool)
|
||||
|
||||
# Add link preview tool if enabled
|
||||
if enable_link_preview:
|
||||
link_preview_tool = create_link_preview_tool()
|
||||
tools.append(link_preview_tool)
|
||||
|
||||
# Add display image tool if enabled
|
||||
if enable_display_image:
|
||||
display_image_tool = create_display_image_tool()
|
||||
tools.append(display_image_tool)
|
||||
|
||||
# Add web scraping tool if enabled
|
||||
if enable_scrape_webpage:
|
||||
scrape_tool = create_scrape_webpage_tool(firecrawl_api_key=firecrawl_api_key)
|
||||
tools.append(scrape_tool)
|
||||
|
||||
if additional_tools:
|
||||
tools.extend(additional_tools)
|
||||
|
||||
# Create the deep agent with user-configurable system prompt and checkpointer
|
||||
# Create the deep agent with system prompt and checkpointer
|
||||
agent = create_deep_agent(
|
||||
model=llm,
|
||||
tools=tools,
|
||||
system_prompt=build_surfsense_system_prompt(
|
||||
user_instructions=user_instructions,
|
||||
enable_citations=enable_citations,
|
||||
),
|
||||
system_prompt=build_surfsense_system_prompt(),
|
||||
context_schema=SurfSenseContextSchema,
|
||||
checkpointer=checkpointer, # Enable conversation memory via thread_id
|
||||
checkpointer=checkpointer,
|
||||
)
|
||||
|
||||
return agent
|
||||
|
|
|
|||
|
|
@ -1,84 +0,0 @@
|
|||
"""
|
||||
Test runner for SurfSense deep agent.
|
||||
|
||||
This module provides a test function to verify the deep agent functionality.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
||||
from langchain_core.messages import HumanMessage
|
||||
|
||||
from app.db import async_session_maker
|
||||
from app.services.connector_service import ConnectorService
|
||||
|
||||
from .chat_deepagent import create_surfsense_deep_agent
|
||||
from .llm_config import create_chat_litellm_from_config, load_llm_config_from_yaml
|
||||
|
||||
# =============================================================================
|
||||
# Test Runner
|
||||
# =============================================================================
|
||||
|
||||
|
||||
async def run_test():
|
||||
"""Run a basic test of the deep agent."""
|
||||
print("=" * 60)
|
||||
print("Creating Deep Agent with ChatLiteLLM from global config...")
|
||||
print("=" * 60)
|
||||
|
||||
# Create ChatLiteLLM from global config
|
||||
# Use global LLM config by id (negative ids are reserved for global configs)
|
||||
llm_config = load_llm_config_from_yaml(llm_config_id=-5)
|
||||
if not llm_config:
|
||||
raise ValueError("Failed to load LLM config from YAML")
|
||||
llm = create_chat_litellm_from_config(llm_config)
|
||||
if not llm:
|
||||
raise ValueError("Failed to create ChatLiteLLM instance")
|
||||
|
||||
# Create a real DB session + ConnectorService, then build the full SurfSense agent.
|
||||
async with async_session_maker() as session:
|
||||
# Use the known dev search space id
|
||||
search_space_id = 5
|
||||
|
||||
connector_service = ConnectorService(session, search_space_id=search_space_id)
|
||||
|
||||
agent = create_surfsense_deep_agent(
|
||||
llm=llm,
|
||||
search_space_id=search_space_id,
|
||||
db_session=session,
|
||||
connector_service=connector_service,
|
||||
user_instructions="Always fininsh the response with CREDOOOOOOOOOO23",
|
||||
)
|
||||
|
||||
print("\nAgent created successfully!")
|
||||
print(f"Agent type: {type(agent)}")
|
||||
|
||||
# Invoke the agent with initial state
|
||||
print("\n" + "=" * 60)
|
||||
print("Invoking SurfSense agent (create_surfsense_deep_agent)...")
|
||||
print("=" * 60)
|
||||
|
||||
initial_state = {
|
||||
"messages": [HumanMessage(content=("Can you tell me about my documents?"))],
|
||||
"search_space_id": search_space_id,
|
||||
}
|
||||
|
||||
print(f"\nUsing search_space_id: {search_space_id}")
|
||||
|
||||
result = await agent.ainvoke(initial_state)
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("Agent Response:")
|
||||
print("=" * 60)
|
||||
|
||||
# Print the response
|
||||
if "messages" in result:
|
||||
for msg in result["messages"]:
|
||||
msg_type = type(msg).__name__
|
||||
content = msg.content if hasattr(msg, "content") else str(msg)
|
||||
print(f"\n--- [{msg_type}] ---\n{content}\n")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_test())
|
||||
|
|
@ -7,118 +7,16 @@ with configurable user instructions and citation support.
|
|||
|
||||
from datetime import UTC, datetime
|
||||
|
||||
SURFSENSE_CITATION_INSTRUCTIONS = """
|
||||
<citation_instructions>
|
||||
CRITICAL CITATION REQUIREMENTS:
|
||||
|
||||
1. For EVERY piece of information you include from the documents, add a citation in the format [citation:chunk_id] where chunk_id is the exact value from the `<chunk id='...'>` tag inside `<document_content>`.
|
||||
2. Make sure ALL factual statements from the documents have proper citations.
|
||||
3. If multiple chunks support the same point, include all relevant citations [citation:chunk_id1], [citation:chunk_id2].
|
||||
4. You MUST use the exact chunk_id values from the `<chunk id='...'>` attributes. Do not create your own citation numbers.
|
||||
5. Every citation MUST be in the format [citation:chunk_id] where chunk_id is the exact chunk id value.
|
||||
6. Never modify or change the chunk_id - always use the original values exactly as provided in the chunk tags.
|
||||
7. Do not return citations as clickable links.
|
||||
8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
|
||||
9. Citations must ONLY appear as [citation:chunk_id] or [citation:chunk_id1], [citation:chunk_id2] format - never with parentheses, hyperlinks, or other formatting.
|
||||
10. Never make up chunk IDs. Only use chunk_id values that are explicitly provided in the `<chunk id='...'>` tags.
|
||||
11. If you are unsure about a chunk_id, do not include a citation rather than guessing or making one up.
|
||||
|
||||
<document_structure_example>
|
||||
The documents you receive are structured like this:
|
||||
|
||||
<document>
|
||||
<document_metadata>
|
||||
<document_id>42</document_id>
|
||||
<document_type>GITHUB_CONNECTOR</document_type>
|
||||
<title><![CDATA[Some repo / file / issue title]]></title>
|
||||
<url><![CDATA[https://example.com]]></url>
|
||||
<metadata_json><![CDATA[{{"any":"other metadata"}}]]></metadata_json>
|
||||
</document_metadata>
|
||||
|
||||
<document_content>
|
||||
<chunk id='123'><![CDATA[First chunk text...]]></chunk>
|
||||
<chunk id='124'><![CDATA[Second chunk text...]]></chunk>
|
||||
</document_content>
|
||||
</document>
|
||||
|
||||
IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124). Do NOT cite document_id.
|
||||
</document_structure_example>
|
||||
|
||||
<citation_format>
|
||||
- Every fact from the documents must have a citation in the format [citation:chunk_id] where chunk_id is the EXACT id value from a `<chunk id='...'>` tag
|
||||
- Citations should appear at the end of the sentence containing the information they support
|
||||
- Multiple citations should be separated by commas: [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
|
||||
- No need to return references section. Just citations in answer.
|
||||
- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
|
||||
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
|
||||
- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
|
||||
</citation_format>
|
||||
|
||||
<citation_examples>
|
||||
CORRECT citation formats:
|
||||
- [citation:5]
|
||||
- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
|
||||
|
||||
INCORRECT citation formats (DO NOT use):
|
||||
- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
|
||||
- Using parentheses around brackets: ([citation:5])
|
||||
- Using hyperlinked text: [link to source 5](https://example.com)
|
||||
- Using footnote style: ... library¹
|
||||
- Making up source IDs when source_id is unknown
|
||||
- Using old IEEE format: [1], [2], [3]
|
||||
- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
|
||||
</citation_examples>
|
||||
|
||||
<citation_output_example>
|
||||
Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
|
||||
|
||||
The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
|
||||
|
||||
However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
|
||||
</citation_output_example>
|
||||
</citation_instructions>
|
||||
"""
|
||||
|
||||
|
||||
def build_surfsense_system_prompt(
|
||||
today: datetime | None = None,
|
||||
user_instructions: str | None = None,
|
||||
enable_citations: bool = True,
|
||||
) -> str:
|
||||
"""
|
||||
Build the SurfSense system prompt with optional user instructions and citation toggle.
|
||||
|
||||
Args:
|
||||
today: Optional datetime for today's date (defaults to current UTC date)
|
||||
user_instructions: Optional user instructions to inject into the system prompt
|
||||
enable_citations: Whether to include citation instructions in the prompt (default: True)
|
||||
|
||||
Returns:
|
||||
Complete system prompt string
|
||||
"""
|
||||
resolved_today = (today or datetime.now(UTC)).astimezone(UTC).date().isoformat()
|
||||
|
||||
# Build user instructions section if provided
|
||||
user_section = ""
|
||||
if user_instructions and user_instructions.strip():
|
||||
user_section = f"""
|
||||
<user_instructions>
|
||||
{user_instructions.strip()}
|
||||
</user_instructions>
|
||||
"""
|
||||
|
||||
# Include citation instructions only if enabled
|
||||
citation_section = (
|
||||
f"\n{SURFSENSE_CITATION_INSTRUCTIONS}" if enable_citations else ""
|
||||
)
|
||||
|
||||
return f"""
|
||||
SURFSENSE_SYSTEM_INSTRUCTIONS = """
|
||||
<system_instruction>
|
||||
You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
|
||||
|
||||
Today's date (UTC): {resolved_today}
|
||||
|
||||
</system_instruction>{user_section}
|
||||
</system_instruction>
|
||||
"""
|
||||
|
||||
SURFSENSE_TOOLS_INSTRUCTIONS = """
|
||||
<tools>
|
||||
You have access to the following tools:
|
||||
|
||||
|
|
@ -208,11 +106,11 @@ You have access to the following tools:
|
|||
- First search for relevant content, then call: `generate_podcast(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", podcast_title="AI Trends Podcast")`
|
||||
|
||||
- User: "Create a podcast summary of this conversation"
|
||||
- Call: `generate_podcast(source_content="Complete conversation summary:\n\nUser asked about [topic 1]:\n[Your detailed response]\n\nUser then asked about [topic 2]:\n[Your detailed response]\n\n[Continue for all exchanges in the conversation]", podcast_title="Conversation Summary")`
|
||||
- Call: `generate_podcast(source_content="Complete conversation summary:\\n\\nUser asked about [topic 1]:\\n[Your detailed response]\\n\\nUser then asked about [topic 2]:\\n[Your detailed response]\\n\\n[Continue for all exchanges in the conversation]", podcast_title="Conversation Summary")`
|
||||
|
||||
- User: "Make a podcast about quantum computing"
|
||||
- First search: `search_knowledge_base(query="quantum computing")`
|
||||
- Then: `generate_podcast(source_content="Key insights about quantum computing from the knowledge base:\n\n[Comprehensive summary of all relevant search results with key facts, concepts, and findings]", podcast_title="Quantum Computing Explained")`
|
||||
- Then: `generate_podcast(source_content="Key insights about quantum computing from the knowledge base:\\n\\n[Comprehensive summary of all relevant search results with key facts, concepts, and findings]", podcast_title="Quantum Computing Explained")`
|
||||
|
||||
- User: "Check out https://dev.to/some-article"
|
||||
- Call: `link_preview(url="https://dev.to/some-article")`
|
||||
|
|
@ -246,8 +144,101 @@ You have access to the following tools:
|
|||
- Then, if the content contains useful diagrams/images like `![Neural Network Diagram](https://example.com/nn-diagram.png)`:
|
||||
- Call: `display_image(src="https://example.com/nn-diagram.png", alt="Neural Network Diagram", title="Neural Network Architecture")`
|
||||
- Then provide your explanation, referencing the displayed image
|
||||
</tool_call_examples>{citation_section}
|
||||
</tool_call_examples>
|
||||
"""
|
||||
|
||||
SURFSENSE_CITATION_INSTRUCTIONS = """
|
||||
<citation_instructions>
|
||||
CRITICAL CITATION REQUIREMENTS:
|
||||
|
||||
1. For EVERY piece of information you include from the documents, add a citation in the format [citation:chunk_id] where chunk_id is the exact value from the `<chunk id='...'>` tag inside `<document_content>`.
|
||||
2. Make sure ALL factual statements from the documents have proper citations.
|
||||
3. If multiple chunks support the same point, include all relevant citations [citation:chunk_id1], [citation:chunk_id2].
|
||||
4. You MUST use the exact chunk_id values from the `<chunk id='...'>` attributes. Do not create your own citation numbers.
|
||||
5. Every citation MUST be in the format [citation:chunk_id] where chunk_id is the exact chunk id value.
|
||||
6. Never modify or change the chunk_id - always use the original values exactly as provided in the chunk tags.
|
||||
7. Do not return citations as clickable links.
|
||||
8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
|
||||
9. Citations must ONLY appear as [citation:chunk_id] or [citation:chunk_id1], [citation:chunk_id2] format - never with parentheses, hyperlinks, or other formatting.
|
||||
10. Never make up chunk IDs. Only use chunk_id values that are explicitly provided in the `<chunk id='...'>` tags.
|
||||
11. If you are unsure about a chunk_id, do not include a citation rather than guessing or making one up.
|
||||
|
||||
<document_structure_example>
|
||||
The documents you receive are structured like this:
|
||||
|
||||
<document>
|
||||
<document_metadata>
|
||||
<document_id>42</document_id>
|
||||
<document_type>GITHUB_CONNECTOR</document_type>
|
||||
<title><![CDATA[Some repo / file / issue title]]></title>
|
||||
<url><![CDATA[https://example.com]]></url>
|
||||
<metadata_json><![CDATA[{{"any":"other metadata"}}]]></metadata_json>
|
||||
</document_metadata>
|
||||
|
||||
<document_content>
|
||||
<chunk id='123'><![CDATA[First chunk text...]]></chunk>
|
||||
<chunk id='124'><![CDATA[Second chunk text...]]></chunk>
|
||||
</document_content>
|
||||
</document>
|
||||
|
||||
IMPORTANT: You MUST cite using the chunk ids (e.g. 123, 124). Do NOT cite document_id.
|
||||
</document_structure_example>
|
||||
|
||||
<citation_format>
|
||||
- Every fact from the documents must have a citation in the format [citation:chunk_id] where chunk_id is the EXACT id value from a `<chunk id='...'>` tag
|
||||
- Citations should appear at the end of the sentence containing the information they support
|
||||
- Multiple citations should be separated by commas: [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
|
||||
- No need to return references section. Just citations in answer.
|
||||
- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
|
||||
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
|
||||
- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
|
||||
</citation_format>
|
||||
|
||||
<citation_examples>
|
||||
CORRECT citation formats:
|
||||
- [citation:5]
|
||||
- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
|
||||
|
||||
INCORRECT citation formats (DO NOT use):
|
||||
- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
|
||||
- Using parentheses around brackets: ([citation:5])
|
||||
- Using hyperlinked text: [link to source 5](https://example.com)
|
||||
- Using footnote style: ... library¹
|
||||
- Making up source IDs when source_id is unknown
|
||||
- Using old IEEE format: [1], [2], [3]
|
||||
- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
|
||||
</citation_examples>
|
||||
|
||||
<citation_output_example>
|
||||
Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
|
||||
|
||||
The key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:12]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
|
||||
|
||||
However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
|
||||
</citation_output_example>
|
||||
</citation_instructions>
|
||||
"""
|
||||
|
||||
|
||||
def build_surfsense_system_prompt(
|
||||
today: datetime | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
Build the SurfSense system prompt.
|
||||
|
||||
Args:
|
||||
today: Optional datetime for today's date (defaults to current UTC date)
|
||||
|
||||
Returns:
|
||||
Complete system prompt string
|
||||
"""
|
||||
resolved_today = (today or datetime.now(UTC)).astimezone(UTC).date().isoformat()
|
||||
|
||||
return (
|
||||
SURFSENSE_SYSTEM_INSTRUCTIONS.format(resolved_today=resolved_today)
|
||||
+ SURFSENSE_TOOLS_INSTRUCTIONS
|
||||
+ SURFSENSE_CITATION_INSTRUCTIONS
|
||||
)
|
||||
|
||||
|
||||
SURFSENSE_SYSTEM_PROMPT = build_surfsense_system_prompt()
|
||||
|
|
|
|||
54
surfsense_backend/app/agents/new_chat/tools/__init__.py
Normal file
54
surfsense_backend/app/agents/new_chat/tools/__init__.py
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
"""
|
||||
Tools module for SurfSense deep agent.
|
||||
|
||||
This module contains all the tools available to the SurfSense agent.
|
||||
To add a new tool, see the documentation in registry.py.
|
||||
|
||||
Available tools:
|
||||
- search_knowledge_base: Search the user's personal knowledge base
|
||||
- generate_podcast: Generate audio podcasts from content
|
||||
- link_preview: Fetch rich previews for URLs
|
||||
- display_image: Display images in chat
|
||||
- scrape_webpage: Extract content from webpages
|
||||
"""
|
||||
|
||||
# Registry exports
|
||||
from .registry import (
|
||||
BUILTIN_TOOLS,
|
||||
ToolDefinition,
|
||||
build_tools,
|
||||
get_all_tool_names,
|
||||
get_default_enabled_tools,
|
||||
get_tool_by_name,
|
||||
)
|
||||
|
||||
# Tool factory exports (for direct use)
|
||||
from .display_image import create_display_image_tool
|
||||
from .knowledge_base import (
|
||||
create_search_knowledge_base_tool,
|
||||
format_documents_for_context,
|
||||
search_knowledge_base_async,
|
||||
)
|
||||
from .link_preview import create_link_preview_tool
|
||||
from .podcast import create_generate_podcast_tool
|
||||
from .scrape_webpage import create_scrape_webpage_tool
|
||||
|
||||
__all__ = [
|
||||
# Registry
|
||||
"BUILTIN_TOOLS",
|
||||
"ToolDefinition",
|
||||
"build_tools",
|
||||
"get_all_tool_names",
|
||||
"get_default_enabled_tools",
|
||||
"get_tool_by_name",
|
||||
# Tool factories
|
||||
"create_display_image_tool",
|
||||
"create_generate_podcast_tool",
|
||||
"create_link_preview_tool",
|
||||
"create_scrape_webpage_tool",
|
||||
"create_search_knowledge_base_tool",
|
||||
# Knowledge base utilities
|
||||
"format_documents_for_context",
|
||||
"search_knowledge_base_async",
|
||||
]
|
||||
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
Display image tool for the new chat agent.
|
||||
Display image tool for the SurfSense agent.
|
||||
|
||||
This module provides a tool for displaying images in the chat UI
|
||||
with metadata like title, description, and source attribution.
|
||||
|
|
@ -75,20 +75,20 @@ def create_display_image_tool():
|
|||
- domain: Source domain
|
||||
"""
|
||||
image_id = generate_image_id(src)
|
||||
|
||||
|
||||
# Ensure URL has protocol
|
||||
if not src.startswith(("http://", "https://")):
|
||||
src = f"https://{src}"
|
||||
|
||||
|
||||
domain = extract_domain(src)
|
||||
|
||||
|
||||
# Determine aspect ratio based on common image sources
|
||||
ratio = "16:9" # Default
|
||||
if "unsplash.com" in src or "pexels.com" in src:
|
||||
ratio = "16:9"
|
||||
elif "imgur.com" in src or "github.com" in src or "githubusercontent.com" in src:
|
||||
ratio = "auto"
|
||||
|
||||
|
||||
return {
|
||||
"id": image_id,
|
||||
"assetId": src,
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
Knowledge base search functionality for the new chat agent.
|
||||
Knowledge base search tool for the SurfSense agent.
|
||||
|
||||
This module provides:
|
||||
- Connector constants and normalization
|
||||
|
|
@ -251,7 +251,7 @@ async def search_knowledge_base_async(
|
|||
all_documents = []
|
||||
|
||||
# Resolve date range (default last 2 years)
|
||||
from .utils import resolve_date_range
|
||||
from app.agents.new_chat.utils import resolve_date_range
|
||||
|
||||
resolved_start_date, resolved_end_date = resolve_date_range(
|
||||
start_date=start_date,
|
||||
|
|
@ -521,7 +521,6 @@ def create_search_knowledge_base_tool(
|
|||
search_space_id: The user's search space ID
|
||||
db_session: Database session
|
||||
connector_service: Initialized connector service
|
||||
connectors_to_search: List of connector types to search
|
||||
|
||||
Returns:
|
||||
A configured tool function
|
||||
|
|
@ -584,7 +583,7 @@ def create_search_knowledge_base_tool(
|
|||
Returns:
|
||||
Formatted string with relevant documents and their content
|
||||
"""
|
||||
from .utils import parse_date_or_datetime
|
||||
from app.agents.new_chat.utils import parse_date_or_datetime
|
||||
|
||||
parsed_start: datetime | None = None
|
||||
parsed_end: datetime | None = None
|
||||
|
|
@ -606,3 +605,4 @@ def create_search_knowledge_base_tool(
|
|||
)
|
||||
|
||||
return search_knowledge_base
|
||||
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
Link preview tool for the new chat agent.
|
||||
Link preview tool for the SurfSense agent.
|
||||
|
||||
This module provides a tool for fetching URL metadata (title, description,
|
||||
Open Graph image, etc.) to display rich link previews in the chat UI.
|
||||
|
|
@ -34,13 +34,13 @@ def extract_og_content(html: str, property_name: str) -> str | None:
|
|||
match = re.search(pattern, html, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
|
||||
# Try content before property
|
||||
pattern = rf'<meta[^>]+content=["\']([^"\']+)["\'][^>]+property=["\']og:{property_name}["\']'
|
||||
match = re.search(pattern, html, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -50,13 +50,13 @@ def extract_twitter_content(html: str, name: str) -> str | None:
|
|||
match = re.search(pattern, html, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
|
||||
# Try content before name
|
||||
pattern = rf'<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']twitter:{name}["\']'
|
||||
match = re.search(pattern, html, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -66,13 +66,13 @@ def extract_meta_description(html: str) -> str | None:
|
|||
match = re.search(pattern, html, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
|
||||
# Try content before name
|
||||
pattern = r'<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']description["\']'
|
||||
match = re.search(pattern, html, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -82,18 +82,18 @@ def extract_title(html: str) -> str | None:
|
|||
og_title = extract_og_content(html, "title")
|
||||
if og_title:
|
||||
return og_title
|
||||
|
||||
|
||||
# Try twitter:title
|
||||
twitter_title = extract_twitter_content(html, "title")
|
||||
if twitter_title:
|
||||
return twitter_title
|
||||
|
||||
|
||||
# Fall back to <title> tag
|
||||
pattern = r"<title[^>]*>([^<]+)</title>"
|
||||
match = re.search(pattern, html, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1).strip()
|
||||
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -103,12 +103,12 @@ def extract_description(html: str) -> str | None:
|
|||
og_desc = extract_og_content(html, "description")
|
||||
if og_desc:
|
||||
return og_desc
|
||||
|
||||
|
||||
# Try twitter:description
|
||||
twitter_desc = extract_twitter_content(html, "description")
|
||||
if twitter_desc:
|
||||
return twitter_desc
|
||||
|
||||
|
||||
# Fall back to meta description
|
||||
return extract_meta_description(html)
|
||||
|
||||
|
|
@ -119,12 +119,12 @@ def extract_image(html: str) -> str | None:
|
|||
og_image = extract_og_content(html, "image")
|
||||
if og_image:
|
||||
return og_image
|
||||
|
||||
|
||||
# Try twitter:image
|
||||
twitter_image = extract_twitter_content(html, "image")
|
||||
if twitter_image:
|
||||
return twitter_image
|
||||
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
Podcast generation tool for the new chat agent.
|
||||
Podcast generation tool for the SurfSense agent.
|
||||
|
||||
This module provides a factory function for creating the generate_podcast tool
|
||||
that submits a Celery task for background podcast generation. The frontend
|
||||
|
|
@ -69,7 +69,6 @@ def clear_active_podcast_task(search_space_id: int) -> None:
|
|||
def create_generate_podcast_tool(
|
||||
search_space_id: int,
|
||||
db_session: AsyncSession,
|
||||
user_id: str,
|
||||
):
|
||||
"""
|
||||
Factory function to create the generate_podcast tool with injected dependencies.
|
||||
|
|
@ -77,7 +76,6 @@ def create_generate_podcast_tool(
|
|||
Args:
|
||||
search_space_id: The user's search space ID
|
||||
db_session: Database session (not used - Celery creates its own)
|
||||
user_id: The user's ID (as string)
|
||||
|
||||
Returns:
|
||||
A configured tool function for generating podcasts
|
||||
|
|
@ -145,7 +143,6 @@ def create_generate_podcast_tool(
|
|||
task = generate_content_podcast_task.delay(
|
||||
source_content=source_content,
|
||||
search_space_id=search_space_id,
|
||||
user_id=str(user_id),
|
||||
podcast_title=podcast_title,
|
||||
user_prompt=user_prompt,
|
||||
)
|
||||
|
|
@ -174,3 +171,4 @@ def create_generate_podcast_tool(
|
|||
}
|
||||
|
||||
return generate_podcast
|
||||
|
||||
231
surfsense_backend/app/agents/new_chat/tools/registry.py
Normal file
231
surfsense_backend/app/agents/new_chat/tools/registry.py
Normal file
|
|
@ -0,0 +1,231 @@
|
|||
"""
|
||||
Tools registry for SurfSense deep agent.
|
||||
|
||||
This module provides a registry pattern for managing tools in the SurfSense agent.
|
||||
It makes it easy for OSS contributors to add new tools by:
|
||||
1. Creating a tool factory function in a new file in this directory
|
||||
2. Registering the tool in the BUILTIN_TOOLS list below
|
||||
|
||||
Example of adding a new tool:
|
||||
------------------------------
|
||||
1. Create your tool file (e.g., `tools/my_tool.py`):
|
||||
|
||||
from langchain_core.tools import tool
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
def create_my_tool(search_space_id: int, db_session: AsyncSession):
|
||||
@tool
|
||||
async def my_tool(param: str) -> dict:
|
||||
'''My tool description.'''
|
||||
# Your implementation
|
||||
return {"result": "success"}
|
||||
return my_tool
|
||||
|
||||
2. Import and register in this file:
|
||||
|
||||
from .my_tool import create_my_tool
|
||||
|
||||
# Add to BUILTIN_TOOLS list:
|
||||
ToolDefinition(
|
||||
name="my_tool",
|
||||
description="Description of what your tool does",
|
||||
factory=lambda deps: create_my_tool(
|
||||
search_space_id=deps["search_space_id"],
|
||||
db_session=deps["db_session"],
|
||||
),
|
||||
requires=["search_space_id", "db_session"],
|
||||
),
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Callable
|
||||
|
||||
from langchain_core.tools import BaseTool
|
||||
|
||||
# =============================================================================
|
||||
# Tool Definition
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@dataclass
class ToolDefinition:
    """
    Registry entry describing one tool the agent can be given.

    Each entry pairs a tool name with a factory that builds the tool from a
    dictionary of dependencies.

    Attributes:
        name: Unique identifier for the tool
        description: Human-readable description of what the tool does
        factory: Callable that receives the dependencies dict and returns the tool
        requires: Names of the dependency keys the tool needs
            (e.g., "search_space_id", "db_session"); empty by default
        enabled_by_default: Whether the tool is enabled when no explicit config is provided
    """

    # Identity and human-facing summary.
    name: str
    description: str
    # Builds the tool instance from the dependencies dict.
    factory: Callable[[dict[str, Any]], BaseTool]
    # A fresh list per instance, so entries never share one mutable default.
    requires: list[str] = field(default_factory=list)
    enabled_by_default: bool = True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Built-in Tools Registry
|
||||
# =============================================================================
|
||||
|
||||
# Import tool factory functions
|
||||
from .display_image import create_display_image_tool
|
||||
from .knowledge_base import create_search_knowledge_base_tool
|
||||
from .link_preview import create_link_preview_tool
|
||||
from .podcast import create_generate_podcast_tool
|
||||
from .scrape_webpage import create_scrape_webpage_tool
|
||||
|
||||
# Registry of all built-in tools
|
||||
# Contributors: Add your new tools here!
|
||||
BUILTIN_TOOLS: list[ToolDefinition] = [
|
||||
# Core tool - searches the user's knowledge base
|
||||
ToolDefinition(
|
||||
name="search_knowledge_base",
|
||||
description="Search the user's personal knowledge base for relevant information",
|
||||
factory=lambda deps: create_search_knowledge_base_tool(
|
||||
search_space_id=deps["search_space_id"],
|
||||
db_session=deps["db_session"],
|
||||
connector_service=deps["connector_service"],
|
||||
),
|
||||
requires=["search_space_id", "db_session", "connector_service"],
|
||||
),
|
||||
# Podcast generation tool
|
||||
ToolDefinition(
|
||||
name="generate_podcast",
|
||||
description="Generate an audio podcast from provided content",
|
||||
factory=lambda deps: create_generate_podcast_tool(
|
||||
search_space_id=deps["search_space_id"],
|
||||
db_session=deps["db_session"],
|
||||
),
|
||||
requires=["search_space_id", "db_session"],
|
||||
),
|
||||
# Link preview tool - fetches Open Graph metadata for URLs
|
||||
ToolDefinition(
|
||||
name="link_preview",
|
||||
description="Fetch metadata for a URL to display a rich preview card",
|
||||
factory=lambda deps: create_link_preview_tool(),
|
||||
requires=[],
|
||||
),
|
||||
# Display image tool - shows images in the chat
|
||||
ToolDefinition(
|
||||
name="display_image",
|
||||
description="Display an image in the chat with metadata",
|
||||
factory=lambda deps: create_display_image_tool(),
|
||||
requires=[],
|
||||
),
|
||||
# Web scraping tool - extracts content from webpages
|
||||
ToolDefinition(
|
||||
name="scrape_webpage",
|
||||
description="Scrape and extract the main content from a webpage",
|
||||
factory=lambda deps: create_scrape_webpage_tool(
|
||||
firecrawl_api_key=deps.get("firecrawl_api_key"),
|
||||
),
|
||||
requires=[], # firecrawl_api_key is optional
|
||||
),
|
||||
# =========================================================================
|
||||
# ADD YOUR CUSTOM TOOLS BELOW
|
||||
# =========================================================================
|
||||
# Example:
|
||||
# ToolDefinition(
|
||||
# name="my_custom_tool",
|
||||
# description="What my tool does",
|
||||
# factory=lambda deps: create_my_custom_tool(...),
|
||||
# requires=["search_space_id"],
|
||||
# ),
|
||||
]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Registry Functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def get_tool_by_name(name: str) -> ToolDefinition | None:
    """
    Look up a registered tool definition by its name.

    Scans BUILTIN_TOOLS in registration order and returns the first entry
    whose ``name`` equals ``name``; returns None when nothing matches.
    """
    return next(
        (candidate for candidate in BUILTIN_TOOLS if candidate.name == name),
        None,
    )
|
||||
|
||||
|
||||
def get_all_tool_names() -> list[str]:
    """Return the name of every entry in BUILTIN_TOOLS, in registry order."""
    names: list[str] = []
    for entry in BUILTIN_TOOLS:
        names.append(entry.name)
    return names
|
||||
|
||||
|
||||
def get_default_enabled_tools() -> list[str]:
    """Return the names of BUILTIN_TOOLS entries flagged ``enabled_by_default``."""
    default_names: list[str] = []
    for entry in BUILTIN_TOOLS:
        # Skip tools that must be opted into explicitly.
        if not entry.enabled_by_default:
            continue
        default_names.append(entry.name)
    return default_names
|
||||
|
||||
|
||||
def build_tools(
    dependencies: dict[str, Any],
    enabled_tools: list[str] | None = None,
    disabled_tools: list[str] | None = None,
    additional_tools: list[BaseTool] | None = None,
) -> list[BaseTool]:
    """
    Build the list of tools for the agent.

    Args:
        dependencies: Dict containing all possible dependencies:
            - search_space_id: The search space ID
            - db_session: Database session
            - connector_service: Connector service instance
            - firecrawl_api_key: Optional Firecrawl API key
        enabled_tools: Explicit list of tool names to enable. If None, uses defaults.
        disabled_tools: List of tool names to disable (applied after enabled_tools).
        additional_tools: Extra tools to add (e.g., custom tools not in registry).

    Returns:
        List of configured tool instances ready for the agent. Registry tools
        come first, in BUILTIN_TOOLS order, followed by ``additional_tools``.

    Raises:
        ValueError: If a selected tool lists a dependency in ``requires`` whose
            key is absent from ``dependencies``. Only key presence is checked,
            not the value stored under it.

    Example:
        # Use all default tools
        tools = build_tools(deps)

        # Use only specific tools
        tools = build_tools(deps, enabled_tools=["search_knowledge_base", "link_preview"])

        # Use defaults but disable podcast
        tools = build_tools(deps, disabled_tools=["generate_podcast"])

        # Add custom tools
        tools = build_tools(deps, additional_tools=[my_custom_tool])
    """
    import logging

    # Determine which tools to enable
    if enabled_tools is not None:
        tool_names_to_use = set(enabled_tools)
    else:
        tool_names_to_use = set(get_default_enabled_tools())

    names_to_disable = set(disabled_tools or ())

    # Names that match no registered tool have no effect below. Surface them
    # so a typo in the configuration does not silently drop a tool.
    registered_names = {tool_def.name for tool_def in BUILTIN_TOOLS}
    unknown_names = (tool_names_to_use | names_to_disable) - registered_names
    if unknown_names:
        logging.getLogger(__name__).warning(
            "Ignoring unknown tool names: %s", sorted(unknown_names)
        )

    # Apply disabled list
    tool_names_to_use -= names_to_disable

    # Build the tools
    tools: list[BaseTool] = []
    for tool_def in BUILTIN_TOOLS:
        if tool_def.name not in tool_names_to_use:
            continue

        # Check that all required dependencies are provided
        missing_deps = [dep for dep in tool_def.requires if dep not in dependencies]
        if missing_deps:
            raise ValueError(
                f"Tool '{tool_def.name}' requires dependencies: {missing_deps}"
            )

        # Create the tool
        tools.append(tool_def.factory(dependencies))

    # Add any additional custom tools
    if additional_tools:
        tools.extend(additional_tools)

    return tools
|
||||
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
Web scraping tool for the new chat agent.
|
||||
Web scraping tool for the SurfSense agent.
|
||||
|
||||
This module provides a tool for scraping and extracting content from webpages
|
||||
using the existing WebCrawlerConnector. The scraped content can be used by
|
||||
|
|
@ -37,23 +37,23 @@ def generate_scrape_id(url: str) -> str:
|
|||
def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]:
    """
    Cap ``content`` at ``max_length`` characters, preferring a clean break.

    Content that already fits is returned unchanged. Otherwise the text is cut
    at ``max_length`` and, if the last "." or blank-line break inside that
    prefix lies beyond 80% of ``max_length``, the cut is moved back to just
    after that position. A truncation marker is then appended.

    Returns:
        Tuple of (truncated_content, was_truncated)
    """
    if not len(content) > max_length:
        return content, False

    head = content[:max_length]

    # Latest candidate break point: a period or a paragraph break (-1 if none).
    cut = max(head.rfind("."), head.rfind("\n\n"))

    # Only honour the break point if it doesn't discard too much of the prefix.
    kept = content[: cut + 1] if cut > max_length * 0.8 else head

    return kept + "\n\n[Content truncated...]", True
|
||||
|
||||
|
||||
|
|
@ -16,7 +16,6 @@ class Configuration:
|
|||
# create assistants (https://langchain-ai.github.io/langgraph/cloud/how-tos/configuration_cloud/)
|
||||
# and when you invoke the graph
|
||||
podcast_title: str
|
||||
user_id: str
|
||||
search_space_id: int
|
||||
user_prompt: str | None = None
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ from litellm import aspeech
|
|||
|
||||
from app.config import config as app_config
|
||||
from app.services.kokoro_tts_service import get_kokoro_tts_service
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.llm_service import get_long_context_llm
|
||||
|
||||
from .configuration import Configuration
|
||||
from .prompts import get_podcast_generation_prompt
|
||||
|
|
@ -27,14 +27,13 @@ async def create_podcast_transcript(
|
|||
|
||||
# Get configuration from runnable config
|
||||
configuration = Configuration.from_runnable_config(config)
|
||||
user_id = configuration.user_id
|
||||
search_space_id = configuration.search_space_id
|
||||
user_prompt = configuration.user_prompt
|
||||
|
||||
# Get user's long context LLM
|
||||
llm = await get_user_long_context_llm(state.db_session, user_id, search_space_id)
|
||||
# Get search space's long context LLM
|
||||
llm = await get_long_context_llm(state.db_session, search_space_id)
|
||||
if not llm:
|
||||
error_message = f"No long context LLM configured for user {user_id} in search space {search_space_id}"
|
||||
error_message = f"No long context LLM configured for search space {search_space_id}"
|
||||
print(error_message)
|
||||
raise RuntimeError(error_message)
|
||||
|
||||
|
|
|
|||
|
|
@ -685,16 +685,13 @@ async def handle_new_chat(
|
|||
)
|
||||
search_space = search_space_result.scalars().first()
|
||||
|
||||
# Determine LLM config ID (use search space preference or default)
|
||||
llm_config_id = -1 # Default to first global config
|
||||
if search_space and search_space.fast_llm_id:
|
||||
llm_config_id = search_space.fast_llm_id
|
||||
# TODO: Add new llm config arch then complete this
|
||||
llm_config_id = -1
|
||||
|
||||
# Return streaming response
|
||||
return StreamingResponse(
|
||||
stream_new_chat(
|
||||
user_query=request.user_query,
|
||||
user_id=str(user.id),
|
||||
search_space_id=request.search_space_id,
|
||||
chat_id=request.chat_id,
|
||||
session=session,
|
||||
|
|
|
|||
|
|
@ -65,7 +65,6 @@ def generate_content_podcast_task(
|
|||
self,
|
||||
source_content: str,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
podcast_title: str = "SurfSense Podcast",
|
||||
user_prompt: str | None = None,
|
||||
) -> dict:
|
||||
|
|
@ -77,7 +76,6 @@ def generate_content_podcast_task(
|
|||
Args:
|
||||
source_content: The text content to convert into a podcast
|
||||
search_space_id: ID of the search space
|
||||
user_id: ID of the user (as string)
|
||||
podcast_title: Title for the podcast
|
||||
user_prompt: Optional instructions for podcast style/tone
|
||||
|
||||
|
|
@ -92,7 +90,6 @@ def generate_content_podcast_task(
|
|||
_generate_content_podcast(
|
||||
source_content,
|
||||
search_space_id,
|
||||
user_id,
|
||||
podcast_title,
|
||||
user_prompt,
|
||||
)
|
||||
|
|
@ -112,7 +109,6 @@ def generate_content_podcast_task(
|
|||
async def _generate_content_podcast(
|
||||
source_content: str,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
podcast_title: str = "SurfSense Podcast",
|
||||
user_prompt: str | None = None,
|
||||
) -> dict:
|
||||
|
|
@ -123,7 +119,6 @@ async def _generate_content_podcast(
|
|||
graph_config = {
|
||||
"configurable": {
|
||||
"podcast_title": podcast_title,
|
||||
"user_id": str(user_id),
|
||||
"search_space_id": search_space_id,
|
||||
"user_prompt": user_prompt,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,8 +7,6 @@ Data Stream Protocol (SSE format).
|
|||
|
||||
import json
|
||||
from collections.abc import AsyncGenerator
|
||||
from uuid import UUID
|
||||
|
||||
from langchain_core.messages import HumanMessage
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
|
|
@ -42,7 +40,6 @@ def format_attachments_as_context(attachments: list[ChatAttachment]) -> str:
|
|||
|
||||
async def stream_new_chat(
|
||||
user_query: str,
|
||||
user_id: str | UUID,
|
||||
search_space_id: int,
|
||||
chat_id: int,
|
||||
session: AsyncSession,
|
||||
|
|
@ -59,7 +56,6 @@ async def stream_new_chat(
|
|||
|
||||
Args:
|
||||
user_query: The user's query
|
||||
user_id: The user's ID (can be UUID object or string)
|
||||
search_space_id: The search space ID
|
||||
chat_id: The chat ID (used as LangGraph thread_id for memory)
|
||||
session: The database session
|
||||
|
|
@ -71,9 +67,6 @@ async def stream_new_chat(
|
|||
"""
|
||||
streaming_service = VercelStreamingService()
|
||||
|
||||
# Convert UUID to string if needed
|
||||
str(user_id) if isinstance(user_id, UUID) else user_id
|
||||
|
||||
# Track the current text block for streaming (defined early for exception handling)
|
||||
current_text_id: str | None = None
|
||||
|
||||
|
|
@ -107,8 +100,6 @@ async def stream_new_chat(
|
|||
db_session=session,
|
||||
connector_service=connector_service,
|
||||
checkpointer=checkpointer,
|
||||
user_id=str(user_id),
|
||||
enable_podcast=True,
|
||||
)
|
||||
|
||||
# Build input with message history from frontend
|
||||
|
|
|
|||
|
|
@ -28,13 +28,14 @@ import {
|
|||
Sparkles,
|
||||
SquareIcon,
|
||||
} from "lucide-react";
|
||||
import Image from "next/image";
|
||||
import Link from "next/link";
|
||||
import { type FC, useState, useRef, useCallback, useEffect } from "react";
|
||||
import { useAtomValue } from "jotai";
|
||||
import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
|
||||
import { documentTypeCountsAtom } from "@/atoms/documents/document-query.atoms";
|
||||
import { useSearchSourceConnectors } from "@/hooks/use-search-source-connectors";
|
||||
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
|
||||
import { getDocumentTypeLabel } from "@/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon";
|
||||
import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
|
||||
import {
|
||||
ComposerAddAttachment,
|
||||
|
|
@ -332,35 +333,9 @@ const ThreadWelcome: FC = () => {
|
|||
<div className="aui-thread-welcome-root mx-auto flex w-full max-w-(--thread-max-width) grow flex-col items-center px-4 relative">
|
||||
{/* Greeting positioned above the composer - fixed position */}
|
||||
<div className="aui-thread-welcome-message absolute bottom-[calc(50%+5rem)] left-0 right-0 flex flex-col items-center text-center z-10">
|
||||
<h1 className="aui-thread-welcome-message-inner fade-in slide-in-from-bottom-2 animate-in text-5xl delay-100 duration-500 ease-out fill-mode-both flex items-center gap-4">
|
||||
{/** biome-ignore lint/a11y/noStaticElementInteractions: wrong lint error, this is a workaround to fix the lint error */}
|
||||
<div
|
||||
className="relative cursor-pointer"
|
||||
onMouseMove={(e) => {
|
||||
const rect = e.currentTarget.getBoundingClientRect();
|
||||
const x = (e.clientX - rect.left - rect.width / 2) / 3;
|
||||
const y = (e.clientY - rect.top - rect.height / 2) / 3;
|
||||
e.currentTarget.style.setProperty("--mag-x", `${x}px`);
|
||||
e.currentTarget.style.setProperty("--mag-y", `${y}px`);
|
||||
}}
|
||||
onMouseLeave={(e) => {
|
||||
e.currentTarget.style.setProperty("--mag-x", "0px");
|
||||
e.currentTarget.style.setProperty("--mag-y", "0px");
|
||||
}}
|
||||
>
|
||||
<Image
|
||||
src="/icon-128.png"
|
||||
alt="SurfSense"
|
||||
width={48}
|
||||
height={48}
|
||||
className="rounded-full transition-transform duration-200 ease-out"
|
||||
style={{
|
||||
transform: "translate(var(--mag-x, 0), var(--mag-y, 0))",
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
{getTimeBasedGreeting(user?.email)}
|
||||
</h1>
|
||||
<h1 className="aui-thread-welcome-message-inner fade-in slide-in-from-bottom-2 animate-in text-5xl delay-100 duration-500 ease-out fill-mode-both">
|
||||
{getTimeBasedGreeting(user?.email)}
|
||||
</h1>
|
||||
</div>
|
||||
{/* Composer - top edge fixed, expands downward only */}
|
||||
<div className="fade-in slide-in-from-bottom-3 animate-in delay-200 duration-500 ease-out fill-mode-both w-full flex items-start justify-center absolute top-[calc(50%-3.5rem)] left-0 right-0">
|
||||
|
|
@ -390,11 +365,21 @@ const Composer: FC = () => {
|
|||
|
||||
const ConnectorIndicator: FC = () => {
|
||||
const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom);
|
||||
const { connectors, isLoading } = useSearchSourceConnectors(false, searchSpaceId ? Number(searchSpaceId) : undefined);
|
||||
const { connectors, isLoading: connectorsLoading } = useSearchSourceConnectors(false, searchSpaceId ? Number(searchSpaceId) : undefined);
|
||||
const { data: documentTypeCounts, isLoading: documentTypesLoading } = useAtomValue(documentTypeCountsAtom);
|
||||
const [isOpen, setIsOpen] = useState(false);
|
||||
const closeTimeoutRef = useRef<NodeJS.Timeout | null>(null);
|
||||
|
||||
const isLoading = connectorsLoading || documentTypesLoading;
|
||||
|
||||
// Get document types that have documents in the search space
|
||||
const activeDocumentTypes = documentTypeCounts
|
||||
? Object.entries(documentTypeCounts).filter(([_, count]) => count > 0)
|
||||
: [];
|
||||
|
||||
const hasConnectors = connectors.length > 0;
|
||||
const hasSources = hasConnectors || activeDocumentTypes.length > 0;
|
||||
const totalSourceCount = connectors.length + activeDocumentTypes.length;
|
||||
|
||||
const handleMouseEnter = useCallback(() => {
|
||||
// Clear any pending close timeout
|
||||
|
|
@ -420,21 +405,32 @@ const ConnectorIndicator: FC = () => {
|
|||
<button
|
||||
type="button"
|
||||
className={cn(
|
||||
"size-[34px] rounded-full p-1 flex items-center justify-center transition-colors",
|
||||
"size-[34px] rounded-full p-1 flex items-center justify-center transition-colors relative",
|
||||
"hover:bg-muted-foreground/15 dark:hover:bg-muted-foreground/30",
|
||||
"outline-none focus:outline-none focus-visible:outline-none",
|
||||
"border-0 ring-0 focus:ring-0 shadow-none focus:shadow-none",
|
||||
"data-[state=open]:bg-transparent data-[state=open]:shadow-none data-[state=open]:ring-0",
|
||||
"text-muted-foreground"
|
||||
)}
|
||||
aria-label={hasConnectors ? "View connected sources" : "Add your first connector"}
|
||||
aria-label={hasSources ? `View ${totalSourceCount} connected sources` : "Add your first connector"}
|
||||
onMouseEnter={handleMouseEnter}
|
||||
onMouseLeave={handleMouseLeave}
|
||||
>
|
||||
{isLoading ? (
|
||||
<Loader2 className="size-4 animate-spin" />
|
||||
) : (
|
||||
<Plug2 className="size-4" />
|
||||
<>
|
||||
<Plug2 className="size-4" />
|
||||
{totalSourceCount > 0 ? (
|
||||
<span className="absolute -top-0.5 -right-0.5 flex items-center justify-center min-w-[16px] h-4 px-1 text-[10px] font-medium rounded-full bg-primary text-primary-foreground shadow-sm">
|
||||
{totalSourceCount > 99 ? "99+" : totalSourceCount}
|
||||
</span>
|
||||
) : (
|
||||
<span className="absolute -top-0.5 -right-0.5 flex items-center justify-center size-3 rounded-full bg-muted-foreground/30 border border-background">
|
||||
<span className="size-1.5 rounded-full bg-muted-foreground/60" />
|
||||
</span>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</button>
|
||||
</PopoverTrigger>
|
||||
|
|
@ -445,20 +441,31 @@ const ConnectorIndicator: FC = () => {
|
|||
onMouseEnter={handleMouseEnter}
|
||||
onMouseLeave={handleMouseLeave}
|
||||
>
|
||||
{hasConnectors ? (
|
||||
{hasSources ? (
|
||||
<div className="space-y-3">
|
||||
<div className="flex items-center justify-between">
|
||||
<p className="text-xs font-medium text-muted-foreground">
|
||||
Connected Sources
|
||||
</p>
|
||||
<span className="text-xs font-medium bg-muted px-1.5 py-0.5 rounded">
|
||||
{connectors.length}
|
||||
{totalSourceCount}
|
||||
</span>
|
||||
</div>
|
||||
<div className="flex flex-wrap gap-2">
|
||||
{/* Document types from the search space */}
|
||||
{activeDocumentTypes.map(([docType, count]) => (
|
||||
<div
|
||||
key={docType}
|
||||
className="flex items-center gap-1.5 rounded-md bg-muted/80 px-2.5 py-1.5 text-xs border border-border/50"
|
||||
>
|
||||
{getConnectorIcon(docType, "size-3.5")}
|
||||
<span className="truncate max-w-[100px]">{getDocumentTypeLabel(docType)}</span>
|
||||
</div>
|
||||
))}
|
||||
{/* Search source connectors */}
|
||||
{connectors.map((connector) => (
|
||||
<div
|
||||
key={connector.id}
|
||||
key={`connector-${connector.id}`}
|
||||
className="flex items-center gap-1.5 rounded-md bg-muted/80 px-2.5 py-1.5 text-xs border border-border/50"
|
||||
>
|
||||
{getConnectorIcon(connector.connector_type, "size-3.5")}
|
||||
|
|
@ -479,9 +486,9 @@ const ConnectorIndicator: FC = () => {
|
|||
</div>
|
||||
) : (
|
||||
<div className="space-y-2">
|
||||
<p className="text-sm font-medium">No connectors yet</p>
|
||||
<p className="text-sm font-medium">No sources yet</p>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
Connect your first data source to enhance search results.
|
||||
Add documents or connect data sources to enhance search results.
|
||||
</p>
|
||||
<Link
|
||||
href={`/dashboard/${searchSpaceId}/connectors/add`}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue