merge: upstream/dev with migration renumbering

This commit is contained in:
CREDO23 2026-01-27 11:22:26 +02:00
commit a7145b2c63
176 changed files with 8791 additions and 3608 deletions

View file

@ -19,6 +19,7 @@ Available tools:
# Tool factory exports (for direct use)
from .display_image import create_display_image_tool
from .knowledge_base import (
CONNECTOR_DESCRIPTIONS,
create_search_knowledge_base_tool,
format_documents_for_context,
search_knowledge_base_async,
@ -40,6 +41,8 @@ from .user_memory import create_recall_memory_tool, create_save_memory_tool
__all__ = [
# Registry
"BUILTIN_TOOLS",
# Knowledge base utilities
"CONNECTOR_DESCRIPTIONS",
"ToolDefinition",
"build_tools",
# Tool factories
@ -51,7 +54,6 @@ __all__ = [
"create_scrape_webpage_tool",
"create_search_knowledge_base_tool",
"create_search_surfsense_docs_tool",
# Knowledge base utilities
"format_documents_for_context",
"get_all_tool_names",
"get_default_enabled_tools",

View file

@ -12,7 +12,8 @@ import json
from datetime import datetime
from typing import Any
from langchain_core.tools import tool
from langchain_core.tools import StructuredTool
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
from app.services.connector_service import ConnectorService
@ -22,6 +23,7 @@ from app.services.connector_service import ConnectorService
# =============================================================================
# Canonical connector values used internally by ConnectorService
# Includes all document types and search source connectors
_ALL_CONNECTORS: list[str] = [
"EXTENSION",
"FILE",
@ -50,41 +52,117 @@ _ALL_CONNECTORS: list[str] = [
"CRAWLED_URL",
"CIRCLEBACK",
"OBSIDIAN_CONNECTOR",
# Composio connectors
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
"COMPOSIO_GMAIL_CONNECTOR",
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
]
# Human-readable descriptions for each connector type
# Used for generating dynamic docstrings and informing the LLM
# Keys match the canonical connector values in _ALL_CONNECTORS; values are the
# one-line descriptions surfaced to the model when listing searchable sources.
CONNECTOR_DESCRIPTIONS: dict[str, str] = {
    # Content ingested directly by SurfSense
    "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)",
    "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)",
    "NOTE": "SurfSense Notes (notes created inside SurfSense)",
    # Third-party workspace/communication connectors
    "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)",
    "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)",
    "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)",
    "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)",
    "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)",
    "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)",
    "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)",
    "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)",
    "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)",
    "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)",
    "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)",
    "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)",
    "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)",
    "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)",
    "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)",
    # Live web-search APIs
    "TAVILY_API": "Tavily web search API results (real-time web search)",
    "SEARXNG_API": "SearxNG search API results (privacy-focused web search)",
    "LINKUP_API": "Linkup search API results (web search)",
    "BAIDU_SEARCH_API": "Baidu search API results (Chinese web search)",
    "LUMA_CONNECTOR": "Luma events and meetings",
    # WEBCRAWLER_CONNECTOR is the user-facing alias; CRAWLED_URL is the
    # canonical document type it maps to (see _normalize_connectors).
    "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)",
    "CRAWLED_URL": "Webpages indexed by SurfSense (personally selected websites)",
    "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)",
    "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items",
    "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)",
    # Composio connectors
    "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "Google Drive files via Composio (personal cloud storage)",
    "COMPOSIO_GMAIL_CONNECTOR": "Gmail emails via Composio (personal emails)",
    "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events via Composio (personal calendar)",
}
def _normalize_connectors(
    connectors_to_search: list[str] | None,
    available_connectors: list[str] | None = None,
) -> list[str]:
    """
    Normalize the connector list provided by the model.

    - Accepts user-facing enums like WEBCRAWLER_CONNECTOR and maps them to
      canonical ConnectorService types.
    - Drops unknown values.
    - If available_connectors is provided, only includes connectors from that list.
    - If connectors_to_search is None/empty, defaults to available_connectors or all.

    Args:
        connectors_to_search: List of connectors requested by the model.
        available_connectors: List of connectors actually available in the
            search space, or None to allow every known connector.

    Returns:
        List of normalized connector strings to search.
    """
    # Determine the set of valid connectors to consider
    valid_set = (
        set(available_connectors) if available_connectors else set(_ALL_CONNECTORS)
    )

    if not connectors_to_search:
        # Search all available connectors if none specified
        return (
            list(available_connectors)
            if available_connectors
            else list(_ALL_CONNECTORS)
        )

    normalized: list[str] = []
    for raw in connectors_to_search:
        # Defensive: tolerate None entries and surrounding whitespace,
        # and compare case-insensitively.
        c = (raw or "").strip().upper()
        if not c:
            continue
        # Map user-facing aliases to canonical names
        if c == "WEBCRAWLER_CONNECTOR":
            c = "CRAWLED_URL"
        normalized.append(c)

    # De-dupe while preserving order + filter to valid connectors
    seen: set[str] = set()
    out: list[str] = []
    for c in normalized:
        if c in seen:
            continue
        # Only include if it's a known connector AND available in this space
        if c not in _ALL_CONNECTORS:
            continue
        if c not in valid_set:
            continue
        seen.add(c)
        out.append(c)

    # Fallback to all available if nothing the model asked for matched
    return (
        out
        if out
        else (
            list(available_connectors)
            if available_connectors
            else list(_ALL_CONNECTORS)
        )
    )
# =============================================================================
@ -233,6 +311,7 @@ async def search_knowledge_base_async(
top_k: int = 10,
start_date: datetime | None = None,
end_date: datetime | None = None,
available_connectors: list[str] | None = None,
) -> str:
"""
Search the user's knowledge base for relevant documents.
@ -248,6 +327,8 @@ async def search_knowledge_base_async(
top_k: Number of results per connector
start_date: Optional start datetime (UTC) for filtering documents
end_date: Optional end datetime (UTC) for filtering documents
available_connectors: Optional list of connectors actually available in the search space.
If provided, only these connectors will be searched.
Returns:
Formatted string with search results
@ -262,7 +343,7 @@ async def search_knowledge_base_async(
end_date=end_date,
)
connectors = _normalize_connectors(connectors_to_search)
connectors = _normalize_connectors(connectors_to_search, available_connectors)
for connector in connectors:
try:
@ -316,6 +397,16 @@ async def search_knowledge_base_async(
)
all_documents.extend(chunks)
elif connector == "TEAMS_CONNECTOR":
_, chunks = await connector_service.search_teams(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
elif connector == "NOTION_CONNECTOR":
_, chunks = await connector_service.search_notion(
user_query=query,
@ -519,6 +610,39 @@ async def search_knowledge_base_async(
)
all_documents.extend(chunks)
# =========================================================
# Composio Connectors
# =========================================================
elif connector == "COMPOSIO_GOOGLE_DRIVE_CONNECTOR":
_, chunks = await connector_service.search_composio_google_drive(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
elif connector == "COMPOSIO_GMAIL_CONNECTOR":
_, chunks = await connector_service.search_composio_gmail(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
elif connector == "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR":
_, chunks = await connector_service.search_composio_google_calendar(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
except Exception as e:
print(f"Error searching connector {connector}: {e}")
continue
@ -543,11 +667,68 @@ async def search_knowledge_base_async(
return format_documents_for_context(deduplicated)
def _build_connector_docstring(available_connectors: list[str] | None) -> str:
    """
    Build the connector documentation section for the tool docstring.

    Args:
        available_connectors: List of available connector types, or None for all

    Returns:
        Formatted docstring section listing available connectors, one
        "- NAME: description" bullet per connector.
    """
    source = available_connectors or list(_ALL_CONNECTORS)
    entries: list[str] = []
    for name in source:
        description = CONNECTOR_DESCRIPTIONS.get(name, name)
        # CRAWLED_URL is the canonical internal document type; surface the
        # user-facing WEBCRAWLER_CONNECTOR alias in the docs instead.
        label = "WEBCRAWLER_CONNECTOR" if name == "CRAWLED_URL" else name
        entries.append(f"- {label}: {description}")
    return "\n".join(entries)
# =============================================================================
# Tool Input Schema
# =============================================================================
class SearchKnowledgeBaseInput(BaseModel):
    """Input schema for the search_knowledge_base tool."""

    # Free-text search query forwarded to the connector searches.
    query: str = Field(
        description="The search query - be specific and include key terms"
    )
    # Result cap; per search_knowledge_base_async this is the number of
    # results requested from each searched connector.
    top_k: int = Field(
        default=10,
        description="Number of results to retrieve (default: 10)",
    )
    # Dates arrive from the LLM as ISO strings; they are parsed to datetime
    # downstream (parse_date_or_datetime) before querying.
    start_date: str | None = Field(
        default=None,
        description="Optional ISO date/datetime (e.g. '2025-12-12' or '2025-12-12T00:00:00+00:00')",
    )
    end_date: str | None = Field(
        default=None,
        description="Optional ISO date/datetime (e.g. '2025-12-19' or '2025-12-19T23:59:59+00:00')",
    )
    # Optional connector enum filter; None/empty means search broadly across
    # all connectors available in the search space.
    connectors_to_search: list[str] | None = Field(
        default=None,
        description="Optional list of connector enums to search. If omitted, searches all available.",
    )
def create_search_knowledge_base_tool(
search_space_id: int,
db_session: AsyncSession,
connector_service: ConnectorService,
):
available_connectors: list[str] | None = None,
available_document_types: list[str] | None = None,
) -> StructuredTool:
"""
Factory function to create the search_knowledge_base tool with injected dependencies.
@ -555,72 +736,57 @@ def create_search_knowledge_base_tool(
search_space_id: The user's search space ID
db_session: Database session
connector_service: Initialized connector service
available_connectors: Optional list of connector types available in the search space.
Used to dynamically generate the tool docstring.
available_document_types: Optional list of document types that have data in the search space.
Used to inform the LLM about what data exists.
Returns:
A configured tool function
A configured StructuredTool instance
"""
# Build connector documentation dynamically
connector_docs = _build_connector_docstring(available_connectors)
@tool
async def search_knowledge_base(
# Build context about available document types
doc_types_info = ""
if available_document_types:
doc_types_info = f"""
## Document types with indexed content in this search space
The following document types have content available for search:
{", ".join(available_document_types)}
Focus searches on these types for best results."""
# Build the dynamic description for the tool
# This is what the LLM sees when deciding whether/how to use the tool
dynamic_description = f"""Search the user's personal knowledge base for relevant information.
Use this tool to find documents, notes, files, web pages, and other content that may help answer the user's question.
IMPORTANT:
- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
- If `connectors_to_search` is omitted/empty, the system will search broadly.
- Only connectors that are enabled/configured for this search space are available.{doc_types_info}
## Available connector enums for `connectors_to_search`
{connector_docs}
NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`."""
# Capture for closure
_available_connectors = available_connectors
async def _search_knowledge_base_impl(
query: str,
top_k: int = 10,
start_date: str | None = None,
end_date: str | None = None,
connectors_to_search: list[str] | None = None,
) -> str:
"""
Search the user's personal knowledge base for relevant information.
Use this tool to find documents, notes, files, web pages, and other content
that may help answer the user's question.
IMPORTANT:
- If the user requests a specific source type (e.g. "my notes", "Slack messages"),
pass `connectors_to_search=[...]` using the enums below.
- If `connectors_to_search` is omitted/empty, the system will search broadly.
## Available connector enums for `connectors_to_search`
- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
- FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
- NOTE: "SurfSense Notes" (notes created inside SurfSense)
- SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
- TEAMS_CONNECTOR: "Microsoft Teams messages and conversations" (personal Teams communications)
- NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
- YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos)
- GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions)
- ELASTICSEARCH_CONNECTOR: "Elasticsearch indexed documents and data" (personal Elasticsearch instances and custom data sources)
- LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management)
- JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking)
- CONFLUENCE_CONNECTOR: "Confluence pages and comments" (personal project documentation)
- CLICKUP_CONNECTOR: "ClickUp tasks and project data" (personal task management)
- GOOGLE_CALENDAR_CONNECTOR: "Google Calendar events, meetings, and schedules" (personal calendar and time management)
- GOOGLE_GMAIL_CONNECTOR: "Google Gmail emails and conversations" (personal emails and communications)
- GOOGLE_DRIVE_FILE: "Google Drive files and documents" (personal cloud storage and file management)
- DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications)
- AIRTABLE_CONNECTOR: "Airtable records, tables, and database content" (personal data management and organization)
- TAVILY_API: "Tavily search API results" (personalized search results)
- SEARXNG_API: "SearxNG search API results" (personalized search results)
- LINKUP_API: "Linkup search API results" (personalized search results)
- BAIDU_SEARCH_API: "Baidu search API results" (personalized search results)
- LUMA_CONNECTOR: "Luma events"
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
- BOOKSTACK_CONNECTOR: "BookStack pages" (personal documentation)
- CIRCLEBACK: "Circleback meeting notes, transcripts, and action items" (personal meeting records)
- OBSIDIAN_CONNECTOR: "Obsidian vault notes and markdown files" (personal notes and knowledge management)
NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`.
Args:
query: The search query - be specific and include key terms
top_k: Number of results to retrieve (default: 10)
start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00")
end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00")
connectors_to_search: Optional list of connector enums to search. If omitted, searches all.
Returns:
Formatted string with relevant documents and their content
"""
"""Implementation function for knowledge base search."""
from app.agents.new_chat.utils import parse_date_or_datetime
parsed_start: datetime | None = None
@ -640,6 +806,16 @@ def create_search_knowledge_base_tool(
top_k=top_k,
start_date=parsed_start,
end_date=parsed_end,
available_connectors=_available_connectors,
)
return search_knowledge_base
# Create StructuredTool with dynamic description
# This properly sets the description that the LLM sees
tool = StructuredTool(
name="search_knowledge_base",
description=dynamic_description,
coroutine=_search_knowledge_base_impl,
args_schema=SearchKnowledgeBaseInput,
)
return tool

View file

@ -85,6 +85,7 @@ class ToolDefinition:
# Contributors: Add your new tools here!
BUILTIN_TOOLS: list[ToolDefinition] = [
# Core tool - searches the user's knowledge base
# Now supports dynamic connector/document type discovery
ToolDefinition(
name="search_knowledge_base",
description="Search the user's personal knowledge base for relevant information",
@ -92,8 +93,12 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
search_space_id=deps["search_space_id"],
db_session=deps["db_session"],
connector_service=deps["connector_service"],
# Optional: dynamically discovered connectors/document types
available_connectors=deps.get("available_connectors"),
available_document_types=deps.get("available_document_types"),
),
requires=["search_space_id", "db_session", "connector_service"],
# Note: available_connectors and available_document_types are optional
),
# Podcast generation tool
ToolDefinition(