mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-29 19:06:24 +02:00
merge: upstream/dev with migration renumbering
This commit is contained in:
commit
a7145b2c63
176 changed files with 8791 additions and 3608 deletions
|
|
@ -19,6 +19,7 @@ Available tools:
|
|||
# Tool factory exports (for direct use)
|
||||
from .display_image import create_display_image_tool
|
||||
from .knowledge_base import (
|
||||
CONNECTOR_DESCRIPTIONS,
|
||||
create_search_knowledge_base_tool,
|
||||
format_documents_for_context,
|
||||
search_knowledge_base_async,
|
||||
|
|
@ -40,6 +41,8 @@ from .user_memory import create_recall_memory_tool, create_save_memory_tool
|
|||
__all__ = [
|
||||
# Registry
|
||||
"BUILTIN_TOOLS",
|
||||
# Knowledge base utilities
|
||||
"CONNECTOR_DESCRIPTIONS",
|
||||
"ToolDefinition",
|
||||
"build_tools",
|
||||
# Tool factories
|
||||
|
|
@ -51,7 +54,6 @@ __all__ = [
|
|||
"create_scrape_webpage_tool",
|
||||
"create_search_knowledge_base_tool",
|
||||
"create_search_surfsense_docs_tool",
|
||||
# Knowledge base utilities
|
||||
"format_documents_for_context",
|
||||
"get_all_tool_names",
|
||||
"get_default_enabled_tools",
|
||||
|
|
|
|||
|
|
@ -12,7 +12,8 @@ import json
|
|||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from langchain_core.tools import tool
|
||||
from langchain_core.tools import StructuredTool
|
||||
from pydantic import BaseModel, Field
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.services.connector_service import ConnectorService
|
||||
|
|
@ -22,6 +23,7 @@ from app.services.connector_service import ConnectorService
|
|||
# =============================================================================
|
||||
|
||||
# Canonical connector values used internally by ConnectorService
|
||||
# Includes all document types and search source connectors
|
||||
_ALL_CONNECTORS: list[str] = [
|
||||
"EXTENSION",
|
||||
"FILE",
|
||||
|
|
@ -50,41 +52,117 @@ _ALL_CONNECTORS: list[str] = [
|
|||
"CRAWLED_URL",
|
||||
"CIRCLEBACK",
|
||||
"OBSIDIAN_CONNECTOR",
|
||||
# Composio connectors
|
||||
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
|
||||
"COMPOSIO_GMAIL_CONNECTOR",
|
||||
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
|
||||
]
|
||||
|
||||
# Human-readable descriptions for each connector type.
# Used for generating dynamic docstrings and informing the LLM which
# sources exist; keys must match the canonical names in _ALL_CONNECTORS.
# NOTE(review): WEBCRAWLER_CONNECTOR and CRAWLED_URL intentionally share a
# description — the former is the user-facing alias of the latter.
CONNECTOR_DESCRIPTIONS: dict[str, str] = {
    "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)",
    "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)",
    "NOTE": "SurfSense Notes (notes created inside SurfSense)",
    "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)",
    "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)",
    "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)",
    "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)",
    "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)",
    "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)",
    "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)",
    "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)",
    "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)",
    "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)",
    "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)",
    "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)",
    "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)",
    "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)",
    "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)",
    "TAVILY_API": "Tavily web search API results (real-time web search)",
    "SEARXNG_API": "SearxNG search API results (privacy-focused web search)",
    "LINKUP_API": "Linkup search API results (web search)",
    "BAIDU_SEARCH_API": "Baidu search API results (Chinese web search)",
    "LUMA_CONNECTOR": "Luma events and meetings",
    "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)",
    "CRAWLED_URL": "Webpages indexed by SurfSense (personally selected websites)",
    "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)",
    "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items",
    "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)",
    # Composio connectors
    "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "Google Drive files via Composio (personal cloud storage)",
    "COMPOSIO_GMAIL_CONNECTOR": "Gmail emails via Composio (personal emails)",
    "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events via Composio (personal calendar)",
}
|
||||
|
||||
def _normalize_connectors(
    connectors_to_search: list[str] | None,
    available_connectors: list[str] | None = None,
) -> list[str]:
    """
    Normalize connectors provided by the model.

    - Accepts user-facing enums like WEBCRAWLER_CONNECTOR and maps them to canonical
      ConnectorService types.
    - Drops unknown values.
    - If available_connectors is provided, only includes connectors from that list.
    - If connectors_to_search is None/empty, defaults to available_connectors or all.

    Args:
        connectors_to_search: List of connectors requested by the model
        available_connectors: List of connectors actually available in the search space

    Returns:
        List of normalized connector strings to search
    """
    # Determine the set of valid connectors to consider.
    valid_set = (
        set(available_connectors) if available_connectors else set(_ALL_CONNECTORS)
    )

    # Search all available connectors if none were specified by the model.
    if not connectors_to_search:
        return (
            list(available_connectors)
            if available_connectors
            else list(_ALL_CONNECTORS)
        )

    normalized: list[str] = []
    for raw in connectors_to_search:
        # Defensive: tolerate None entries and stray whitespace/casing from the LLM.
        c = (raw or "").strip().upper()
        if not c:
            continue
        # Map user-facing aliases to canonical names.
        if c == "WEBCRAWLER_CONNECTOR":
            c = "CRAWLED_URL"
        normalized.append(c)

    # De-dupe while preserving order + filter to valid connectors.
    seen: set[str] = set()
    out: list[str] = []
    for c in normalized:
        if c in seen:
            continue
        # Only include if it's a known connector AND available in this search space.
        if c not in _ALL_CONNECTORS:
            continue
        if c not in valid_set:
            continue
        seen.add(c)
        out.append(c)

    # Fallback to all available connectors if nothing the model asked for matched.
    return (
        out
        if out
        else (
            list(available_connectors)
            if available_connectors
            else list(_ALL_CONNECTORS)
        )
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
|
@ -233,6 +311,7 @@ async def search_knowledge_base_async(
|
|||
top_k: int = 10,
|
||||
start_date: datetime | None = None,
|
||||
end_date: datetime | None = None,
|
||||
available_connectors: list[str] | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
Search the user's knowledge base for relevant documents.
|
||||
|
|
@ -248,6 +327,8 @@ async def search_knowledge_base_async(
|
|||
top_k: Number of results per connector
|
||||
start_date: Optional start datetime (UTC) for filtering documents
|
||||
end_date: Optional end datetime (UTC) for filtering documents
|
||||
available_connectors: Optional list of connectors actually available in the search space.
|
||||
If provided, only these connectors will be searched.
|
||||
|
||||
Returns:
|
||||
Formatted string with search results
|
||||
|
|
@ -262,7 +343,7 @@ async def search_knowledge_base_async(
|
|||
end_date=end_date,
|
||||
)
|
||||
|
||||
connectors = _normalize_connectors(connectors_to_search)
|
||||
connectors = _normalize_connectors(connectors_to_search, available_connectors)
|
||||
|
||||
for connector in connectors:
|
||||
try:
|
||||
|
|
@ -316,6 +397,16 @@ async def search_knowledge_base_async(
|
|||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
elif connector == "TEAMS_CONNECTOR":
|
||||
_, chunks = await connector_service.search_teams(
|
||||
user_query=query,
|
||||
search_space_id=search_space_id,
|
||||
top_k=top_k,
|
||||
start_date=resolved_start_date,
|
||||
end_date=resolved_end_date,
|
||||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
elif connector == "NOTION_CONNECTOR":
|
||||
_, chunks = await connector_service.search_notion(
|
||||
user_query=query,
|
||||
|
|
@ -519,6 +610,39 @@ async def search_knowledge_base_async(
|
|||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
# =========================================================
|
||||
# Composio Connectors
|
||||
# =========================================================
|
||||
elif connector == "COMPOSIO_GOOGLE_DRIVE_CONNECTOR":
|
||||
_, chunks = await connector_service.search_composio_google_drive(
|
||||
user_query=query,
|
||||
search_space_id=search_space_id,
|
||||
top_k=top_k,
|
||||
start_date=resolved_start_date,
|
||||
end_date=resolved_end_date,
|
||||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
elif connector == "COMPOSIO_GMAIL_CONNECTOR":
|
||||
_, chunks = await connector_service.search_composio_gmail(
|
||||
user_query=query,
|
||||
search_space_id=search_space_id,
|
||||
top_k=top_k,
|
||||
start_date=resolved_start_date,
|
||||
end_date=resolved_end_date,
|
||||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
elif connector == "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR":
|
||||
_, chunks = await connector_service.search_composio_google_calendar(
|
||||
user_query=query,
|
||||
search_space_id=search_space_id,
|
||||
top_k=top_k,
|
||||
start_date=resolved_start_date,
|
||||
end_date=resolved_end_date,
|
||||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error searching connector {connector}: {e}")
|
||||
continue
|
||||
|
|
@ -543,11 +667,68 @@ async def search_knowledge_base_async(
|
|||
return format_documents_for_context(deduplicated)
|
||||
|
||||
|
||||
def _build_connector_docstring(available_connectors: list[str] | None) -> str:
    """
    Build the connector documentation section for the tool docstring.

    Args:
        available_connectors: List of available connector types, or None for all

    Returns:
        Formatted docstring section listing available connectors, one
        "- NAME: description" bullet per line
    """
    connectors = available_connectors if available_connectors else list(_ALL_CONNECTORS)

    lines = []
    for connector in connectors:
        # Unknown connectors fall back to their own name as the description.
        description = CONNECTOR_DESCRIPTIONS.get(connector, connector)
        # CRAWLED_URL is the internal canonical name; show its user-facing
        # alias WEBCRAWLER_CONNECTOR in the docs the LLM reads.
        label = "WEBCRAWLER_CONNECTOR" if connector == "CRAWLED_URL" else connector
        lines.append(f"- {label}: {description}")

    return "\n".join(lines)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tool Input Schema
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class SearchKnowledgeBaseInput(BaseModel):
    """Input schema for the search_knowledge_base tool.

    Field descriptions below are surfaced to the LLM by the tool-calling
    framework, so they double as model-facing documentation.
    """

    # Free-text search query forwarded to every selected connector.
    query: str = Field(
        description="The search query - be specific and include key terms"
    )
    # Per-connector result cap, not a global total.
    top_k: int = Field(
        default=10,
        description="Number of results to retrieve (default: 10)",
    )
    # Dates arrive as strings and are parsed downstream; presumably UTC when
    # no offset is given — TODO confirm against the date-parsing helper.
    start_date: str | None = Field(
        default=None,
        description="Optional ISO date/datetime (e.g. '2025-12-12' or '2025-12-12T00:00:00+00:00')",
    )
    end_date: str | None = Field(
        default=None,
        description="Optional ISO date/datetime (e.g. '2025-12-19' or '2025-12-19T23:59:59+00:00')",
    )
    # Connector enums (user-facing aliases accepted); normalized elsewhere.
    connectors_to_search: list[str] | None = Field(
        default=None,
        description="Optional list of connector enums to search. If omitted, searches all available.",
    )
|
||||
|
||||
|
||||
def create_search_knowledge_base_tool(
|
||||
search_space_id: int,
|
||||
db_session: AsyncSession,
|
||||
connector_service: ConnectorService,
|
||||
):
|
||||
available_connectors: list[str] | None = None,
|
||||
available_document_types: list[str] | None = None,
|
||||
) -> StructuredTool:
|
||||
"""
|
||||
Factory function to create the search_knowledge_base tool with injected dependencies.
|
||||
|
||||
|
|
@ -555,72 +736,57 @@ def create_search_knowledge_base_tool(
|
|||
search_space_id: The user's search space ID
|
||||
db_session: Database session
|
||||
connector_service: Initialized connector service
|
||||
available_connectors: Optional list of connector types available in the search space.
|
||||
Used to dynamically generate the tool docstring.
|
||||
available_document_types: Optional list of document types that have data in the search space.
|
||||
Used to inform the LLM about what data exists.
|
||||
|
||||
Returns:
|
||||
A configured tool function
|
||||
A configured StructuredTool instance
|
||||
"""
|
||||
# Build connector documentation dynamically
|
||||
connector_docs = _build_connector_docstring(available_connectors)
|
||||
|
||||
@tool
|
||||
async def search_knowledge_base(
|
||||
# Build context about available document types
|
||||
doc_types_info = ""
|
||||
if available_document_types:
|
||||
doc_types_info = f"""
|
||||
|
||||
## Document types with indexed content in this search space
|
||||
|
||||
The following document types have content available for search:
|
||||
{", ".join(available_document_types)}
|
||||
|
||||
Focus searches on these types for best results."""
|
||||
|
||||
# Build the dynamic description for the tool
|
||||
# This is what the LLM sees when deciding whether/how to use the tool
|
||||
dynamic_description = f"""Search the user's personal knowledge base for relevant information.
|
||||
|
||||
Use this tool to find documents, notes, files, web pages, and other content that may help answer the user's question.
|
||||
|
||||
IMPORTANT:
|
||||
- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
|
||||
- If `connectors_to_search` is omitted/empty, the system will search broadly.
|
||||
- Only connectors that are enabled/configured for this search space are available.{doc_types_info}
|
||||
|
||||
## Available connector enums for `connectors_to_search`
|
||||
|
||||
{connector_docs}
|
||||
|
||||
NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`."""
|
||||
|
||||
# Capture for closure
|
||||
_available_connectors = available_connectors
|
||||
|
||||
async def _search_knowledge_base_impl(
|
||||
query: str,
|
||||
top_k: int = 10,
|
||||
start_date: str | None = None,
|
||||
end_date: str | None = None,
|
||||
connectors_to_search: list[str] | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
Search the user's personal knowledge base for relevant information.
|
||||
|
||||
Use this tool to find documents, notes, files, web pages, and other content
|
||||
that may help answer the user's question.
|
||||
|
||||
IMPORTANT:
|
||||
- If the user requests a specific source type (e.g. "my notes", "Slack messages"),
|
||||
pass `connectors_to_search=[...]` using the enums below.
|
||||
- If `connectors_to_search` is omitted/empty, the system will search broadly.
|
||||
|
||||
## Available connector enums for `connectors_to_search`
|
||||
|
||||
- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
|
||||
- FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
|
||||
- NOTE: "SurfSense Notes" (notes created inside SurfSense)
|
||||
- SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
|
||||
- TEAMS_CONNECTOR: "Microsoft Teams messages and conversations" (personal Teams communications)
|
||||
- NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
|
||||
- YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos)
|
||||
- GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions)
|
||||
- ELASTICSEARCH_CONNECTOR: "Elasticsearch indexed documents and data" (personal Elasticsearch instances and custom data sources)
|
||||
- LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management)
|
||||
- JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking)
|
||||
- CONFLUENCE_CONNECTOR: "Confluence pages and comments" (personal project documentation)
|
||||
- CLICKUP_CONNECTOR: "ClickUp tasks and project data" (personal task management)
|
||||
- GOOGLE_CALENDAR_CONNECTOR: "Google Calendar events, meetings, and schedules" (personal calendar and time management)
|
||||
- GOOGLE_GMAIL_CONNECTOR: "Google Gmail emails and conversations" (personal emails and communications)
|
||||
- GOOGLE_DRIVE_FILE: "Google Drive files and documents" (personal cloud storage and file management)
|
||||
- DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications)
|
||||
- AIRTABLE_CONNECTOR: "Airtable records, tables, and database content" (personal data management and organization)
|
||||
- TAVILY_API: "Tavily search API results" (personalized search results)
|
||||
- SEARXNG_API: "SearxNG search API results" (personalized search results)
|
||||
- LINKUP_API: "Linkup search API results" (personalized search results)
|
||||
- BAIDU_SEARCH_API: "Baidu search API results" (personalized search results)
|
||||
- LUMA_CONNECTOR: "Luma events"
|
||||
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
|
||||
- BOOKSTACK_CONNECTOR: "BookStack pages" (personal documentation)
|
||||
- CIRCLEBACK: "Circleback meeting notes, transcripts, and action items" (personal meeting records)
|
||||
- OBSIDIAN_CONNECTOR: "Obsidian vault notes and markdown files" (personal notes and knowledge management)
|
||||
|
||||
NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`.
|
||||
|
||||
Args:
|
||||
query: The search query - be specific and include key terms
|
||||
top_k: Number of results to retrieve (default: 10)
|
||||
start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00")
|
||||
end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00")
|
||||
connectors_to_search: Optional list of connector enums to search. If omitted, searches all.
|
||||
|
||||
Returns:
|
||||
Formatted string with relevant documents and their content
|
||||
"""
|
||||
"""Implementation function for knowledge base search."""
|
||||
from app.agents.new_chat.utils import parse_date_or_datetime
|
||||
|
||||
parsed_start: datetime | None = None
|
||||
|
|
@ -640,6 +806,16 @@ def create_search_knowledge_base_tool(
|
|||
top_k=top_k,
|
||||
start_date=parsed_start,
|
||||
end_date=parsed_end,
|
||||
available_connectors=_available_connectors,
|
||||
)
|
||||
|
||||
return search_knowledge_base
|
||||
# Create StructuredTool with dynamic description
|
||||
# This properly sets the description that the LLM sees
|
||||
tool = StructuredTool(
|
||||
name="search_knowledge_base",
|
||||
description=dynamic_description,
|
||||
coroutine=_search_knowledge_base_impl,
|
||||
args_schema=SearchKnowledgeBaseInput,
|
||||
)
|
||||
|
||||
return tool
|
||||
|
|
|
|||
|
|
@ -85,6 +85,7 @@ class ToolDefinition:
|
|||
# Contributors: Add your new tools here!
|
||||
BUILTIN_TOOLS: list[ToolDefinition] = [
|
||||
# Core tool - searches the user's knowledge base
|
||||
# Now supports dynamic connector/document type discovery
|
||||
ToolDefinition(
|
||||
name="search_knowledge_base",
|
||||
description="Search the user's personal knowledge base for relevant information",
|
||||
|
|
@ -92,8 +93,12 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
|
|||
search_space_id=deps["search_space_id"],
|
||||
db_session=deps["db_session"],
|
||||
connector_service=deps["connector_service"],
|
||||
# Optional: dynamically discovered connectors/document types
|
||||
available_connectors=deps.get("available_connectors"),
|
||||
available_document_types=deps.get("available_document_types"),
|
||||
),
|
||||
requires=["search_space_id", "db_session", "connector_service"],
|
||||
# Note: available_connectors and available_document_types are optional
|
||||
),
|
||||
# Podcast generation tool
|
||||
ToolDefinition(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue