merge: upstream/dev with migration renumbering

This commit is contained in:
CREDO23 2026-01-27 11:22:26 +02:00
commit a7145b2c63
176 changed files with 8791 additions and 3608 deletions

View file

@ -19,6 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libxext6 \
libxrender1 \
dos2unix \
git \
&& rm -rf /var/lib/apt/lists/*
# Update certificates and install SSL tools

View file

@ -0,0 +1,95 @@
"""Add Composio connector types to SearchSourceConnectorType and DocumentType enums
Revision ID: 79
Revises: 78
This migration adds the Composio connector enum values to both:
- searchsourceconnectortype (for connector type tracking)
- documenttype (for document type tracking)
Composio is a managed OAuth integration service that allows connecting
to various third-party services (Google Drive, Gmail, Calendar, etc.)
without requiring separate OAuth app verification.
This migration adds three specific connector types:
- COMPOSIO_GOOGLE_DRIVE_CONNECTOR
- COMPOSIO_GMAIL_CONNECTOR
- COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
"""
from collections.abc import Sequence
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "79"
down_revision: str | None = "78"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Define the ENUM type names and the new values
CONNECTOR_ENUM = "searchsourceconnectortype"
CONNECTOR_NEW_VALUES = [
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
"COMPOSIO_GMAIL_CONNECTOR",
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
]
DOCUMENT_ENUM = "documenttype"
DOCUMENT_NEW_VALUES = [
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
"COMPOSIO_GMAIL_CONNECTOR",
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
]
def upgrade() -> None:
"""Upgrade schema - add Composio connector types to connector and document enums safely."""
# Add each Composio connector type to searchsourceconnectortype only if not exists
for value in CONNECTOR_NEW_VALUES:
op.execute(
f"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_enum e
JOIN pg_type t ON e.enumtypid = t.oid
WHERE t.typname = '{CONNECTOR_ENUM}' AND e.enumlabel = '{value}'
) THEN
ALTER TYPE {CONNECTOR_ENUM} ADD VALUE '{value}';
END IF;
END$$;
"""
)
# Add each Composio connector type to documenttype only if not exists
for value in DOCUMENT_NEW_VALUES:
op.execute(
f"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_enum e
JOIN pg_type t ON e.enumtypid = t.oid
WHERE t.typname = '{DOCUMENT_ENUM}' AND e.enumlabel = '{value}'
) THEN
ALTER TYPE {DOCUMENT_ENUM} ADD VALUE '{value}';
END IF;
END$$;
"""
)
def downgrade() -> None:
"""Downgrade schema - remove Composio connector types from connector and document enums.
Note: PostgreSQL does not support removing enum values directly.
To properly downgrade, you would need to:
1. Delete any rows using the Composio connector type values
2. Create new enums without the Composio connector types
3. Alter the columns to use the new enums
4. Drop the old enums
This is left as a no-op since removing enum values is complex
and typically not needed in practice.
"""
pass

View file

@ -0,0 +1,97 @@
"""Add user incentive tasks table for earning free pages
Revision ID: 80
Revises: 79
Changes:
1. Create incentive_task_type enum with GITHUB_STAR value
2. Create user_incentive_tasks table to track completed tasks
"""
from collections.abc import Sequence
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "80"
down_revision: str | None = "79"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
"""Create incentive tasks infrastructure."""
# Check if enum already exists (handles partial migration recovery)
conn = op.get_bind()
result = conn.execute(
sa.text("SELECT 1 FROM pg_type WHERE typname = 'incentivetasktype'")
)
enum_exists = result.fetchone() is not None
# Create the enum type only if it doesn't exist
if not enum_exists:
incentive_task_type_enum = postgresql.ENUM(
"GITHUB_STAR",
name="incentivetasktype",
create_type=False,
)
incentive_task_type_enum.create(op.get_bind(), checkfirst=True)
# Check if table already exists (handles partial migration recovery)
result = conn.execute(
sa.text(
"SELECT 1 FROM information_schema.tables WHERE table_name = 'user_incentive_tasks'"
)
)
table_exists = result.fetchone() is not None
if not table_exists:
# Create the user_incentive_tasks table
op.create_table(
"user_incentive_tasks",
sa.Column("id", sa.Integer(), primary_key=True, index=True),
sa.Column(
"user_id",
sa.UUID(as_uuid=True),
sa.ForeignKey("user.id", ondelete="CASCADE"),
nullable=False,
index=True,
),
sa.Column(
"task_type",
postgresql.ENUM(
"GITHUB_STAR", name="incentivetasktype", create_type=False
),
nullable=False,
index=True,
),
sa.Column("pages_awarded", sa.Integer(), nullable=False),
sa.Column(
"completed_at",
sa.TIMESTAMP(timezone=True),
nullable=False,
server_default=sa.func.now(),
),
sa.Column(
"created_at",
sa.TIMESTAMP(timezone=True),
nullable=False,
server_default=sa.func.now(),
index=True,
),
sa.UniqueConstraint("user_id", "task_type", name="uq_user_incentive_task"),
)
def downgrade() -> None:
"""Remove incentive tasks infrastructure."""
# Drop the table
op.drop_table("user_incentive_tasks")
# Drop the enum type
postgresql.ENUM(name="incentivetasktype").drop(op.get_bind(), checkfirst=True)

View file

@ -1,7 +1,7 @@
"""Add public sharing columns to new_chat_threads
Revision ID: 79
Revises: 78
Revision ID: 81
Revises: 80
Create Date: 2026-01-23
Adds public_share_token and public_share_enabled columns to enable
@ -13,8 +13,8 @@ from collections.abc import Sequence
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "79"
down_revision: str | None = "78"
revision: str = "81"
down_revision: str | None = "80"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None

View file

@ -1,7 +1,7 @@
"""Add thread_id to podcasts
Revision ID: 80
Revises: 79
Revision ID: 82
Revises: 81
Create Date: 2026-01-23
"""
@ -10,8 +10,8 @@ from collections.abc import Sequence
from alembic import op
revision: str = "80"
down_revision: str | None = "79"
revision: str = "82"
down_revision: str | None = "81"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None

View file

@ -7,6 +7,7 @@ via NewLLMConfig.
"""
from collections.abc import Sequence
from typing import Any
from deepagents import create_deep_agent
from langchain_core.tools import BaseTool
@ -23,6 +24,90 @@ from app.agents.new_chat.system_prompt import (
from app.agents.new_chat.tools.registry import build_tools_async
from app.services.connector_service import ConnectorService
# =============================================================================
# Connector Type Mapping
# =============================================================================
# Maps SearchSourceConnectorType enum values to the searchable document/connector types
# used by the knowledge_base tool. Some connectors map to different document types.
_CONNECTOR_TYPE_TO_SEARCHABLE: dict[str, str] = {
# Direct mappings (connector type == searchable type)
"TAVILY_API": "TAVILY_API",
"SEARXNG_API": "SEARXNG_API",
"LINKUP_API": "LINKUP_API",
"BAIDU_SEARCH_API": "BAIDU_SEARCH_API",
"SLACK_CONNECTOR": "SLACK_CONNECTOR",
"TEAMS_CONNECTOR": "TEAMS_CONNECTOR",
"NOTION_CONNECTOR": "NOTION_CONNECTOR",
"GITHUB_CONNECTOR": "GITHUB_CONNECTOR",
"LINEAR_CONNECTOR": "LINEAR_CONNECTOR",
"DISCORD_CONNECTOR": "DISCORD_CONNECTOR",
"JIRA_CONNECTOR": "JIRA_CONNECTOR",
"CONFLUENCE_CONNECTOR": "CONFLUENCE_CONNECTOR",
"CLICKUP_CONNECTOR": "CLICKUP_CONNECTOR",
"GOOGLE_CALENDAR_CONNECTOR": "GOOGLE_CALENDAR_CONNECTOR",
"GOOGLE_GMAIL_CONNECTOR": "GOOGLE_GMAIL_CONNECTOR",
"GOOGLE_DRIVE_CONNECTOR": "GOOGLE_DRIVE_FILE", # Connector type differs from document type
"AIRTABLE_CONNECTOR": "AIRTABLE_CONNECTOR",
"LUMA_CONNECTOR": "LUMA_CONNECTOR",
"ELASTICSEARCH_CONNECTOR": "ELASTICSEARCH_CONNECTOR",
"WEBCRAWLER_CONNECTOR": "CRAWLED_URL", # Maps to document type
"BOOKSTACK_CONNECTOR": "BOOKSTACK_CONNECTOR",
"CIRCLEBACK_CONNECTOR": "CIRCLEBACK", # Connector type differs from document type
"OBSIDIAN_CONNECTOR": "OBSIDIAN_CONNECTOR",
# Composio connectors
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
"COMPOSIO_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR",
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
}
# Document types that don't come from SearchSourceConnector but should always be searchable
_ALWAYS_AVAILABLE_DOC_TYPES: list[str] = [
"EXTENSION", # Browser extension data
"FILE", # Uploaded files
"NOTE", # User notes
"YOUTUBE_VIDEO", # YouTube videos
]
def _map_connectors_to_searchable_types(
connector_types: list[Any],
) -> list[str]:
"""
Map SearchSourceConnectorType enums to searchable document/connector types.
This function:
1. Converts connector type enums to their searchable counterparts
2. Includes always-available document types (EXTENSION, FILE, NOTE, YOUTUBE_VIDEO)
3. Deduplicates while preserving order
Args:
connector_types: List of SearchSourceConnectorType enum values
Returns:
List of searchable connector/document type strings
"""
result_set: set[str] = set()
result_list: list[str] = []
# Add always-available document types first
for doc_type in _ALWAYS_AVAILABLE_DOC_TYPES:
if doc_type not in result_set:
result_set.add(doc_type)
result_list.append(doc_type)
# Map each connector type to its searchable equivalent
for ct in connector_types:
# Handle both enum and string types
ct_str = ct.value if hasattr(ct, "value") else str(ct)
searchable = _CONNECTOR_TYPE_TO_SEARCHABLE.get(ct_str)
if searchable and searchable not in result_set:
result_set.add(searchable)
result_list.append(searchable)
return result_list
# =============================================================================
# Deep Agent Factory
# =============================================================================
@ -117,6 +202,30 @@ async def create_surfsense_deep_agent(
additional_tools=[my_custom_tool]
)
"""
# Discover available connectors and document types for this search space
# This enables dynamic tool docstrings that inform the LLM about what's actually available
available_connectors: list[str] | None = None
available_document_types: list[str] | None = None
try:
# Get enabled search source connectors for this search space
connector_types = await connector_service.get_available_connectors(
search_space_id
)
if connector_types:
# Convert enum values to strings and also include mapped document types
available_connectors = _map_connectors_to_searchable_types(connector_types)
# Get document types that have at least one document indexed
available_document_types = await connector_service.get_available_document_types(
search_space_id
)
except Exception as e:
# Log but don't fail - fall back to all connectors if discovery fails
import logging
logging.warning(f"Failed to discover available connectors/document types: {e}")
# Build dependencies dict for the tools registry
dependencies = {
"search_space_id": search_space_id,
@ -125,6 +234,9 @@ async def create_surfsense_deep_agent(
"firecrawl_api_key": firecrawl_api_key,
"user_id": user_id, # Required for memory tools
"thread_id": thread_id, # For podcast tool
# Dynamic connector/document type discovery for knowledge base tool
"available_connectors": available_connectors,
"available_document_types": available_document_types,
}
# Build tools using the async registry (includes MCP tools)

View file

@ -19,6 +19,7 @@ Available tools:
# Tool factory exports (for direct use)
from .display_image import create_display_image_tool
from .knowledge_base import (
CONNECTOR_DESCRIPTIONS,
create_search_knowledge_base_tool,
format_documents_for_context,
search_knowledge_base_async,
@ -40,6 +41,8 @@ from .user_memory import create_recall_memory_tool, create_save_memory_tool
__all__ = [
# Registry
"BUILTIN_TOOLS",
# Knowledge base utilities
"CONNECTOR_DESCRIPTIONS",
"ToolDefinition",
"build_tools",
# Tool factories
@ -51,7 +54,6 @@ __all__ = [
"create_scrape_webpage_tool",
"create_search_knowledge_base_tool",
"create_search_surfsense_docs_tool",
# Knowledge base utilities
"format_documents_for_context",
"get_all_tool_names",
"get_default_enabled_tools",

View file

@ -12,7 +12,8 @@ import json
from datetime import datetime
from typing import Any
from langchain_core.tools import tool
from langchain_core.tools import StructuredTool
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
from app.services.connector_service import ConnectorService
@ -22,6 +23,7 @@ from app.services.connector_service import ConnectorService
# =============================================================================
# Canonical connector values used internally by ConnectorService
# Includes all document types and search source connectors
_ALL_CONNECTORS: list[str] = [
"EXTENSION",
"FILE",
@ -50,41 +52,117 @@ _ALL_CONNECTORS: list[str] = [
"CRAWLED_URL",
"CIRCLEBACK",
"OBSIDIAN_CONNECTOR",
# Composio connectors
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
"COMPOSIO_GMAIL_CONNECTOR",
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
]
# Human-readable descriptions for each connector type
# Used for generating dynamic docstrings and informing the LLM
CONNECTOR_DESCRIPTIONS: dict[str, str] = {
"EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)",
"FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)",
"NOTE": "SurfSense Notes (notes created inside SurfSense)",
"SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)",
"TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)",
"NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)",
"YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)",
"GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)",
"ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)",
"LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)",
"JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)",
"CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)",
"CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)",
"GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)",
"GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)",
"GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)",
"DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)",
"AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)",
"TAVILY_API": "Tavily web search API results (real-time web search)",
"SEARXNG_API": "SearxNG search API results (privacy-focused web search)",
"LINKUP_API": "Linkup search API results (web search)",
"BAIDU_SEARCH_API": "Baidu search API results (Chinese web search)",
"LUMA_CONNECTOR": "Luma events and meetings",
"WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)",
"CRAWLED_URL": "Webpages indexed by SurfSense (personally selected websites)",
"BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)",
"CIRCLEBACK": "Circleback meeting notes, transcripts, and action items",
"OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)",
# Composio connectors
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "Google Drive files via Composio (personal cloud storage)",
"COMPOSIO_GMAIL_CONNECTOR": "Gmail emails via Composio (personal emails)",
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events via Composio (personal calendar)",
}
def _normalize_connectors(connectors_to_search: list[str] | None) -> list[str]:
def _normalize_connectors(
connectors_to_search: list[str] | None,
available_connectors: list[str] | None = None,
) -> list[str]:
"""
Normalize connectors provided by the model.
- Accepts user-facing enums like WEBCRAWLER_CONNECTOR and maps them to canonical
ConnectorService types.
- Drops unknown values.
- If None/empty, defaults to searching across all known connectors.
- If available_connectors is provided, only includes connectors from that list.
- If connectors_to_search is None/empty, defaults to available_connectors or all.
Args:
connectors_to_search: List of connectors requested by the model
available_connectors: List of connectors actually available in the search space
Returns:
List of normalized connector strings to search
"""
# Determine the set of valid connectors to consider
valid_set = (
set(available_connectors) if available_connectors else set(_ALL_CONNECTORS)
)
if not connectors_to_search:
return list(_ALL_CONNECTORS)
# Search all available connectors if none specified
return (
list(available_connectors)
if available_connectors
else list(_ALL_CONNECTORS)
)
normalized: list[str] = []
for raw in connectors_to_search:
c = (raw or "").strip().upper()
if not c:
continue
# Map user-facing aliases to canonical names
if c == "WEBCRAWLER_CONNECTOR":
c = "CRAWLED_URL"
normalized.append(c)
# de-dupe while preserving order + filter unknown
# de-dupe while preserving order + filter to valid connectors
seen: set[str] = set()
out: list[str] = []
for c in normalized:
if c in seen:
continue
# Only include if it's a known connector AND available
if c not in _ALL_CONNECTORS:
continue
if c not in valid_set:
continue
seen.add(c)
out.append(c)
return out if out else list(_ALL_CONNECTORS)
# Fallback to all available if nothing matched
return (
out
if out
else (
list(available_connectors)
if available_connectors
else list(_ALL_CONNECTORS)
)
)
# =============================================================================
@ -233,6 +311,7 @@ async def search_knowledge_base_async(
top_k: int = 10,
start_date: datetime | None = None,
end_date: datetime | None = None,
available_connectors: list[str] | None = None,
) -> str:
"""
Search the user's knowledge base for relevant documents.
@ -248,6 +327,8 @@ async def search_knowledge_base_async(
top_k: Number of results per connector
start_date: Optional start datetime (UTC) for filtering documents
end_date: Optional end datetime (UTC) for filtering documents
available_connectors: Optional list of connectors actually available in the search space.
If provided, only these connectors will be searched.
Returns:
Formatted string with search results
@ -262,7 +343,7 @@ async def search_knowledge_base_async(
end_date=end_date,
)
connectors = _normalize_connectors(connectors_to_search)
connectors = _normalize_connectors(connectors_to_search, available_connectors)
for connector in connectors:
try:
@ -316,6 +397,16 @@ async def search_knowledge_base_async(
)
all_documents.extend(chunks)
elif connector == "TEAMS_CONNECTOR":
_, chunks = await connector_service.search_teams(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
elif connector == "NOTION_CONNECTOR":
_, chunks = await connector_service.search_notion(
user_query=query,
@ -519,6 +610,39 @@ async def search_knowledge_base_async(
)
all_documents.extend(chunks)
# =========================================================
# Composio Connectors
# =========================================================
elif connector == "COMPOSIO_GOOGLE_DRIVE_CONNECTOR":
_, chunks = await connector_service.search_composio_google_drive(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
elif connector == "COMPOSIO_GMAIL_CONNECTOR":
_, chunks = await connector_service.search_composio_gmail(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
elif connector == "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR":
_, chunks = await connector_service.search_composio_google_calendar(
user_query=query,
search_space_id=search_space_id,
top_k=top_k,
start_date=resolved_start_date,
end_date=resolved_end_date,
)
all_documents.extend(chunks)
except Exception as e:
print(f"Error searching connector {connector}: {e}")
continue
@ -543,11 +667,68 @@ async def search_knowledge_base_async(
return format_documents_for_context(deduplicated)
def _build_connector_docstring(available_connectors: list[str] | None) -> str:
"""
Build the connector documentation section for the tool docstring.
Args:
available_connectors: List of available connector types, or None for all
Returns:
Formatted docstring section listing available connectors
"""
connectors = available_connectors if available_connectors else list(_ALL_CONNECTORS)
lines = []
for connector in connectors:
# Skip internal names, prefer user-facing aliases
if connector == "CRAWLED_URL":
# Show as WEBCRAWLER_CONNECTOR for user-facing docs
description = CONNECTOR_DESCRIPTIONS.get(connector, connector)
lines.append(f"- WEBCRAWLER_CONNECTOR: {description}")
else:
description = CONNECTOR_DESCRIPTIONS.get(connector, connector)
lines.append(f"- {connector}: {description}")
return "\n".join(lines)
# =============================================================================
# Tool Input Schema
# =============================================================================
class SearchKnowledgeBaseInput(BaseModel):
"""Input schema for the search_knowledge_base tool."""
query: str = Field(
description="The search query - be specific and include key terms"
)
top_k: int = Field(
default=10,
description="Number of results to retrieve (default: 10)",
)
start_date: str | None = Field(
default=None,
description="Optional ISO date/datetime (e.g. '2025-12-12' or '2025-12-12T00:00:00+00:00')",
)
end_date: str | None = Field(
default=None,
description="Optional ISO date/datetime (e.g. '2025-12-19' or '2025-12-19T23:59:59+00:00')",
)
connectors_to_search: list[str] | None = Field(
default=None,
description="Optional list of connector enums to search. If omitted, searches all available.",
)
def create_search_knowledge_base_tool(
search_space_id: int,
db_session: AsyncSession,
connector_service: ConnectorService,
):
available_connectors: list[str] | None = None,
available_document_types: list[str] | None = None,
) -> StructuredTool:
"""
Factory function to create the search_knowledge_base tool with injected dependencies.
@ -555,72 +736,57 @@ def create_search_knowledge_base_tool(
search_space_id: The user's search space ID
db_session: Database session
connector_service: Initialized connector service
available_connectors: Optional list of connector types available in the search space.
Used to dynamically generate the tool docstring.
available_document_types: Optional list of document types that have data in the search space.
Used to inform the LLM about what data exists.
Returns:
A configured tool function
A configured StructuredTool instance
"""
# Build connector documentation dynamically
connector_docs = _build_connector_docstring(available_connectors)
@tool
async def search_knowledge_base(
# Build context about available document types
doc_types_info = ""
if available_document_types:
doc_types_info = f"""
## Document types with indexed content in this search space
The following document types have content available for search:
{", ".join(available_document_types)}
Focus searches on these types for best results."""
# Build the dynamic description for the tool
# This is what the LLM sees when deciding whether/how to use the tool
dynamic_description = f"""Search the user's personal knowledge base for relevant information.
Use this tool to find documents, notes, files, web pages, and other content that may help answer the user's question.
IMPORTANT:
- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
- If `connectors_to_search` is omitted/empty, the system will search broadly.
- Only connectors that are enabled/configured for this search space are available.{doc_types_info}
## Available connector enums for `connectors_to_search`
{connector_docs}
NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`."""
# Capture for closure
_available_connectors = available_connectors
async def _search_knowledge_base_impl(
query: str,
top_k: int = 10,
start_date: str | None = None,
end_date: str | None = None,
connectors_to_search: list[str] | None = None,
) -> str:
"""
Search the user's personal knowledge base for relevant information.
Use this tool to find documents, notes, files, web pages, and other content
that may help answer the user's question.
IMPORTANT:
- If the user requests a specific source type (e.g. "my notes", "Slack messages"),
pass `connectors_to_search=[...]` using the enums below.
- If `connectors_to_search` is omitted/empty, the system will search broadly.
## Available connector enums for `connectors_to_search`
- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
- FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
- NOTE: "SurfSense Notes" (notes created inside SurfSense)
- SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
- TEAMS_CONNECTOR: "Microsoft Teams messages and conversations" (personal Teams communications)
- NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
- YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos)
- GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions)
- ELASTICSEARCH_CONNECTOR: "Elasticsearch indexed documents and data" (personal Elasticsearch instances and custom data sources)
- LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management)
- JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking)
- CONFLUENCE_CONNECTOR: "Confluence pages and comments" (personal project documentation)
- CLICKUP_CONNECTOR: "ClickUp tasks and project data" (personal task management)
- GOOGLE_CALENDAR_CONNECTOR: "Google Calendar events, meetings, and schedules" (personal calendar and time management)
- GOOGLE_GMAIL_CONNECTOR: "Google Gmail emails and conversations" (personal emails and communications)
- GOOGLE_DRIVE_FILE: "Google Drive files and documents" (personal cloud storage and file management)
- DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications)
- AIRTABLE_CONNECTOR: "Airtable records, tables, and database content" (personal data management and organization)
- TAVILY_API: "Tavily search API results" (personalized search results)
- SEARXNG_API: "SearxNG search API results" (personalized search results)
- LINKUP_API: "Linkup search API results" (personalized search results)
- BAIDU_SEARCH_API: "Baidu search API results" (personalized search results)
- LUMA_CONNECTOR: "Luma events"
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
- BOOKSTACK_CONNECTOR: "BookStack pages" (personal documentation)
- CIRCLEBACK: "Circleback meeting notes, transcripts, and action items" (personal meeting records)
- OBSIDIAN_CONNECTOR: "Obsidian vault notes and markdown files" (personal notes and knowledge management)
NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`.
Args:
query: The search query - be specific and include key terms
top_k: Number of results to retrieve (default: 10)
start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00")
end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00")
connectors_to_search: Optional list of connector enums to search. If omitted, searches all.
Returns:
Formatted string with relevant documents and their content
"""
"""Implementation function for knowledge base search."""
from app.agents.new_chat.utils import parse_date_or_datetime
parsed_start: datetime | None = None
@ -640,6 +806,16 @@ def create_search_knowledge_base_tool(
top_k=top_k,
start_date=parsed_start,
end_date=parsed_end,
available_connectors=_available_connectors,
)
return search_knowledge_base
# Create StructuredTool with dynamic description
# This properly sets the description that the LLM sees
tool = StructuredTool(
name="search_knowledge_base",
description=dynamic_description,
coroutine=_search_knowledge_base_impl,
args_schema=SearchKnowledgeBaseInput,
)
return tool

View file

@ -85,6 +85,7 @@ class ToolDefinition:
# Contributors: Add your new tools here!
BUILTIN_TOOLS: list[ToolDefinition] = [
# Core tool - searches the user's knowledge base
# Now supports dynamic connector/document type discovery
ToolDefinition(
name="search_knowledge_base",
description="Search the user's personal knowledge base for relevant information",
@ -92,8 +93,12 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
search_space_id=deps["search_space_id"],
db_session=deps["db_session"],
connector_service=deps["connector_service"],
# Optional: dynamically discovered connectors/document types
available_connectors=deps.get("available_connectors"),
available_document_types=deps.get("available_document_types"),
),
requires=["search_space_id", "db_session", "connector_service"],
# Note: available_connectors and available_document_types are optional
),
# Podcast generation tool
ToolDefinition(

View file

@ -1,7 +1,7 @@
"""
Composio Connector Module.
Composio Connector Base Module.
Provides a unified interface for interacting with various services via Composio,
Provides a base class for interacting with various services via Composio,
primarily used during indexing operations.
"""
@ -19,10 +19,10 @@ logger = logging.getLogger(__name__)
class ComposioConnector:
"""
Generic Composio connector for data retrieval.
Base Composio connector for data retrieval.
Wraps the ComposioService to provide toolkit-specific data access
for indexing operations.
for indexing operations. Subclasses implement toolkit-specific methods.
"""
def __init__(
@ -89,302 +89,12 @@ class ComposioConnector:
toolkit_id = await self.get_toolkit_id()
return toolkit_id in INDEXABLE_TOOLKITS
# ===== Google Drive Methods =====
@property
def session(self) -> AsyncSession:
"""Get the database session."""
return self._session
async def list_drive_files(
self,
folder_id: str | None = None,
page_token: str | None = None,
page_size: int = 100,
) -> tuple[list[dict[str, Any]], str | None, str | None]:
"""
List files from Google Drive via Composio.
Args:
folder_id: Optional folder ID to list contents of.
page_token: Pagination token.
page_size: Number of files per page.
Returns:
Tuple of (files list, next_page_token, error message).
"""
connected_account_id = await self.get_connected_account_id()
if not connected_account_id:
return [], None, "No connected account ID found"
entity_id = await self.get_entity_id()
service = await self._get_service()
return await service.get_drive_files(
connected_account_id=connected_account_id,
entity_id=entity_id,
folder_id=folder_id,
page_token=page_token,
page_size=page_size,
)
async def get_drive_file_content(
self, file_id: str
) -> tuple[bytes | None, str | None]:
"""
Download file content from Google Drive via Composio.
Args:
file_id: Google Drive file ID.
Returns:
Tuple of (file content bytes, error message).
"""
connected_account_id = await self.get_connected_account_id()
if not connected_account_id:
return None, "No connected account ID found"
entity_id = await self.get_entity_id()
service = await self._get_service()
return await service.get_drive_file_content(
connected_account_id=connected_account_id,
entity_id=entity_id,
file_id=file_id,
)
# ===== Gmail Methods =====
async def list_gmail_messages(
self,
query: str = "",
max_results: int = 100,
) -> tuple[list[dict[str, Any]], str | None]:
"""
List Gmail messages via Composio.
Args:
query: Gmail search query.
max_results: Maximum number of messages.
Returns:
Tuple of (messages list, error message).
"""
connected_account_id = await self.get_connected_account_id()
if not connected_account_id:
return [], "No connected account ID found"
entity_id = await self.get_entity_id()
service = await self._get_service()
return await service.get_gmail_messages(
connected_account_id=connected_account_id,
entity_id=entity_id,
query=query,
max_results=max_results,
)
async def get_gmail_message_detail(
self, message_id: str
) -> tuple[dict[str, Any] | None, str | None]:
"""
Get full details of a Gmail message via Composio.
Args:
message_id: Gmail message ID.
Returns:
Tuple of (message details, error message).
"""
connected_account_id = await self.get_connected_account_id()
if not connected_account_id:
return None, "No connected account ID found"
entity_id = await self.get_entity_id()
service = await self._get_service()
return await service.get_gmail_message_detail(
connected_account_id=connected_account_id,
entity_id=entity_id,
message_id=message_id,
)
# ===== Google Calendar Methods =====
async def list_calendar_events(
self,
time_min: str | None = None,
time_max: str | None = None,
max_results: int = 250,
) -> tuple[list[dict[str, Any]], str | None]:
"""
List Google Calendar events via Composio.
Args:
time_min: Start time (RFC3339 format).
time_max: End time (RFC3339 format).
max_results: Maximum number of events.
Returns:
Tuple of (events list, error message).
"""
connected_account_id = await self.get_connected_account_id()
if not connected_account_id:
return [], "No connected account ID found"
entity_id = await self.get_entity_id()
service = await self._get_service()
return await service.get_calendar_events(
connected_account_id=connected_account_id,
entity_id=entity_id,
time_min=time_min,
time_max=time_max,
max_results=max_results,
)
# ===== Utility Methods =====
def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str:
"""
Format a Gmail message to markdown.
Args:
message: Message object from Composio's GMAIL_FETCH_EMAILS response.
Composio structure: messageId, messageText, messageTimestamp,
payload.headers, labelIds, attachmentList
Returns:
Formatted markdown string.
"""
try:
# Composio uses 'messageId' (camelCase)
message_id = message.get("messageId", "") or message.get("id", "")
label_ids = message.get("labelIds", [])
# Extract headers from payload
payload = message.get("payload", {})
headers = payload.get("headers", [])
# Parse headers into a dict
header_dict = {}
for header in headers:
name = header.get("name", "").lower()
value = header.get("value", "")
header_dict[name] = value
# Extract key information
subject = header_dict.get("subject", "No Subject")
from_email = header_dict.get("from", "Unknown Sender")
to_email = header_dict.get("to", "Unknown Recipient")
# Composio provides messageTimestamp directly
date_str = message.get("messageTimestamp", "") or header_dict.get(
"date", "Unknown Date"
)
# Build markdown content
markdown_content = f"# {subject}\n\n"
markdown_content += f"**From:** {from_email}\n"
markdown_content += f"**To:** {to_email}\n"
markdown_content += f"**Date:** {date_str}\n"
if label_ids:
markdown_content += f"**Labels:** {', '.join(label_ids)}\n"
markdown_content += "\n---\n\n"
# Composio provides full message text in 'messageText'
message_text = message.get("messageText", "")
if message_text:
markdown_content += f"## Content\n\n{message_text}\n\n"
else:
# Fallback to snippet if no messageText
snippet = message.get("snippet", "")
if snippet:
markdown_content += f"## Preview\n\n{snippet}\n\n"
# Add attachment info if present
attachments = message.get("attachmentList", [])
if attachments:
markdown_content += "## Attachments\n\n"
for att in attachments:
att_name = att.get("filename", att.get("name", "Unknown"))
markdown_content += f"- {att_name}\n"
markdown_content += "\n"
# Add message metadata
markdown_content += "## Message Details\n\n"
markdown_content += f"- **Message ID:** {message_id}\n"
return markdown_content
except Exception as e:
return f"Error formatting message to markdown: {e!s}"
def format_calendar_event_to_markdown(self, event: dict[str, Any]) -> str:
"""
Format a Google Calendar event to markdown.
Args:
event: Event object from Google Calendar API.
Returns:
Formatted markdown string.
"""
from datetime import datetime
try:
# Extract basic event information
summary = event.get("summary", "No Title")
description = event.get("description", "")
location = event.get("location", "")
# Extract start and end times
start = event.get("start", {})
end = event.get("end", {})
start_time = start.get("dateTime") or start.get("date", "")
end_time = end.get("dateTime") or end.get("date", "")
# Format times for display
def format_time(time_str: str) -> str:
if not time_str:
return "Unknown"
try:
if "T" in time_str:
dt = datetime.fromisoformat(time_str.replace("Z", "+00:00"))
return dt.strftime("%Y-%m-%d %H:%M")
return time_str
except Exception:
return time_str
start_formatted = format_time(start_time)
end_formatted = format_time(end_time)
# Extract attendees
attendees = event.get("attendees", [])
attendee_list = []
for attendee in attendees:
email = attendee.get("email", "")
display_name = attendee.get("displayName", email)
response_status = attendee.get("responseStatus", "")
attendee_list.append(f"- {display_name} ({response_status})")
# Build markdown content
markdown_content = f"# {summary}\n\n"
markdown_content += f"**Start:** {start_formatted}\n"
markdown_content += f"**End:** {end_formatted}\n"
if location:
markdown_content += f"**Location:** {location}\n"
markdown_content += "\n"
if description:
markdown_content += f"## Description\n\n{description}\n\n"
if attendee_list:
markdown_content += "## Attendees\n\n"
markdown_content += "\n".join(attendee_list)
markdown_content += "\n\n"
# Add event metadata
markdown_content += "## Event Details\n\n"
markdown_content += f"- **Event ID:** {event.get('id', 'Unknown')}\n"
markdown_content += f"- **Created:** {event.get('created', 'Unknown')}\n"
markdown_content += f"- **Updated:** {event.get('updated', 'Unknown')}\n"
return markdown_content
except Exception as e:
return f"Error formatting event to markdown: {e!s}"
@property
def connector_id(self) -> int:
"""Get the connector ID."""
return self._connector_id

View file

@ -0,0 +1,613 @@
"""
Composio Gmail Connector Module.
Provides Gmail specific methods for data retrieval and indexing via Composio.
"""
import logging
from datetime import UTC, datetime
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.config import config
from app.connectors.composio_connector import ComposioConnector
from app.db import Document, DocumentType
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import calculate_date_range
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
logger = logging.getLogger(__name__)
def get_current_timestamp() -> datetime:
"""Get the current timestamp with timezone for updated_at field."""
return datetime.now(UTC)
async def check_document_by_unique_identifier(
session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
"""Check if a document with the given unique identifier hash already exists."""
existing_doc_result = await session.execute(
select(Document)
.options(selectinload(Document.chunks))
.where(Document.unique_identifier_hash == unique_identifier_hash)
)
return existing_doc_result.scalars().first()
async def update_connector_last_indexed(
session: AsyncSession,
connector,
update_last_indexed: bool = True,
) -> None:
"""Update the last_indexed_at timestamp for a connector."""
if update_last_indexed:
connector.last_indexed_at = datetime.now(UTC)
logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")
class ComposioGmailConnector(ComposioConnector):
"""
Gmail specific Composio connector.
Provides methods for listing messages, getting message details, and formatting
Gmail messages from Gmail via Composio.
"""
async def list_gmail_messages(
self,
query: str = "",
max_results: int = 50,
page_token: str | None = None,
) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]:
"""
List Gmail messages via Composio with pagination support.
Args:
query: Gmail search query.
max_results: Maximum number of messages per page (default: 50).
page_token: Optional pagination token for next page.
Returns:
Tuple of (messages list, next_page_token, result_size_estimate, error message).
"""
connected_account_id = await self.get_connected_account_id()
if not connected_account_id:
return [], None, None, "No connected account ID found"
entity_id = await self.get_entity_id()
service = await self._get_service()
return await service.get_gmail_messages(
connected_account_id=connected_account_id,
entity_id=entity_id,
query=query,
max_results=max_results,
page_token=page_token,
)
async def get_gmail_message_detail(
self, message_id: str
) -> tuple[dict[str, Any] | None, str | None]:
"""
Get full details of a Gmail message via Composio.
Args:
message_id: Gmail message ID.
Returns:
Tuple of (message details, error message).
"""
connected_account_id = await self.get_connected_account_id()
if not connected_account_id:
return None, "No connected account ID found"
entity_id = await self.get_entity_id()
service = await self._get_service()
return await service.get_gmail_message_detail(
connected_account_id=connected_account_id,
entity_id=entity_id,
message_id=message_id,
)
def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str:
"""
Format a Gmail message to markdown.
Args:
message: Message object from Composio's GMAIL_FETCH_EMAILS response.
Composio structure: messageId, messageText, messageTimestamp,
payload.headers, labelIds, attachmentList
Returns:
Formatted markdown string.
"""
try:
# Composio uses 'messageId' (camelCase)
message_id = message.get("messageId", "") or message.get("id", "")
label_ids = message.get("labelIds", [])
# Extract headers from payload
payload = message.get("payload", {})
headers = payload.get("headers", [])
# Parse headers into a dict
header_dict = {}
for header in headers:
name = header.get("name", "").lower()
value = header.get("value", "")
header_dict[name] = value
# Extract key information
subject = header_dict.get("subject", "No Subject")
from_email = header_dict.get("from", "Unknown Sender")
to_email = header_dict.get("to", "Unknown Recipient")
# Composio provides messageTimestamp directly
date_str = message.get("messageTimestamp", "") or header_dict.get(
"date", "Unknown Date"
)
# Build markdown content
markdown_content = f"# {subject}\n\n"
markdown_content += f"**From:** {from_email}\n"
markdown_content += f"**To:** {to_email}\n"
markdown_content += f"**Date:** {date_str}\n"
if label_ids:
markdown_content += f"**Labels:** {', '.join(label_ids)}\n"
markdown_content += "\n---\n\n"
# Composio provides full message text in 'messageText'
message_text = message.get("messageText", "")
if message_text:
markdown_content += f"## Content\n\n{message_text}\n\n"
else:
# Fallback to snippet if no messageText
snippet = message.get("snippet", "")
if snippet:
markdown_content += f"## Preview\n\n{snippet}\n\n"
# Add attachment info if present
attachments = message.get("attachmentList", [])
if attachments:
markdown_content += "## Attachments\n\n"
for att in attachments:
att_name = att.get("filename", att.get("name", "Unknown"))
markdown_content += f"- {att_name}\n"
markdown_content += "\n"
# Add message metadata
markdown_content += "## Message Details\n\n"
markdown_content += f"- **Message ID:** {message_id}\n"
return markdown_content
except Exception as e:
return f"Error formatting message to markdown: {e!s}"
# ============ Indexer Functions ============
async def _process_gmail_message_batch(
session: AsyncSession,
messages: list[dict[str, Any]],
composio_connector: ComposioGmailConnector,
connector_id: int,
search_space_id: int,
user_id: str,
total_documents_indexed: int = 0,
) -> tuple[int, int]:
"""
Process a batch of Gmail messages and index them.
Args:
total_documents_indexed: Running total of documents indexed so far (for batch commits).
Returns:
Tuple of (documents_indexed, documents_skipped)
"""
documents_indexed = 0
documents_skipped = 0
for message in messages:
try:
# Composio uses 'messageId' (camelCase), not 'id'
message_id = message.get("messageId", "") or message.get("id", "")
if not message_id:
documents_skipped += 1
continue
# Composio's GMAIL_FETCH_EMAILS already returns full message content
# No need for a separate detail API call
# Extract message info from Composio response
# Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds
payload = message.get("payload", {})
headers = payload.get("headers", [])
subject = "No Subject"
sender = "Unknown Sender"
date_str = message.get("messageTimestamp", "Unknown Date")
for header in headers:
name = header.get("name", "").lower()
value = header.get("value", "")
if name == "subject":
subject = value
elif name == "from":
sender = value
elif name == "date":
date_str = value
# Format to markdown using the full message data
markdown_content = composio_connector.format_gmail_message_to_markdown(
message
)
# Check for empty content (defensive parsing per Composio best practices)
if not markdown_content.strip():
logger.warning(f"Skipping Gmail message with no content: {subject}")
documents_skipped += 1
continue
# Generate unique identifier
document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"])
unique_identifier_hash = generate_unique_identifier_hash(
document_type, f"gmail_{message_id}", search_space_id
)
content_hash = generate_content_hash(markdown_content, search_space_id)
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Get label IDs from Composio response
label_ids = message.get("labelIds", [])
# Extract thread_id if available (for consistency with non-Composio implementation)
thread_id = message.get("threadId", "") or message.get("thread_id", "")
if existing_document:
if existing_document.content_hash == content_hash:
documents_skipped += 1
continue
# Update existing
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"document_type": "Gmail Message (Composio)",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = (
f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
)
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
existing_document.title = f"Gmail: {subject}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date": date_str,
"labels": label_ids,
"connector_id": connector_id,
"source": "composio",
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
# Batch commit every 10 documents
current_total = total_documents_indexed + documents_indexed
if current_total % 10 == 0:
logger.info(
f"Committing batch: {current_total} Gmail messages processed so far"
)
await session.commit()
continue
# Create new document
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"document_type": "Gmail Message (Composio)",
}
summary_content, summary_embedding = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = (
f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
)
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
document = Document(
search_space_id=search_space_id,
title=f"Gmail: {subject}",
document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]),
document_metadata={
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date": date_str,
"labels": label_ids,
"connector_id": connector_id,
"toolkit_id": "gmail",
"source": "composio",
},
content=summary_content,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
updated_at=get_current_timestamp(),
)
session.add(document)
documents_indexed += 1
# Batch commit every 10 documents
current_total = total_documents_indexed + documents_indexed
if current_total % 10 == 0:
logger.info(
f"Committing batch: {current_total} Gmail messages processed so far"
)
await session.commit()
except Exception as e:
logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
documents_skipped += 1
# Rollback on error to avoid partial state (per Composio best practices)
try:
await session.rollback()
except Exception as rollback_error:
logger.error(
f"Error during rollback: {rollback_error!s}", exc_info=True
)
continue
return documents_indexed, documents_skipped
async def index_composio_gmail(
session: AsyncSession,
connector,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None,
end_date: str | None,
task_logger: TaskLoggingService,
log_entry,
update_last_indexed: bool = True,
max_items: int = 1000,
) -> tuple[int, str]:
"""Index Gmail messages via Composio with pagination and incremental processing."""
try:
composio_connector = ComposioGmailConnector(session, connector_id)
# Normalize date values - handle "undefined" strings from frontend
if start_date == "undefined" or start_date == "":
start_date = None
if end_date == "undefined" or end_date == "":
end_date = None
# Use provided dates directly if both are provided, otherwise calculate from last_indexed_at
# This ensures user-selected dates are respected (matching non-Composio Gmail connector behavior)
if start_date is not None and end_date is not None:
# User provided both dates - use them directly
start_date_str = start_date
end_date_str = end_date
else:
# Calculate date range with defaults (uses last_indexed_at or 365 days back)
# This ensures indexing works even when user doesn't specify dates
start_date_str, end_date_str = calculate_date_range(
connector, start_date, end_date, default_days_back=365
)
# Build query with date range
query_parts = []
if start_date_str:
query_parts.append(f"after:{start_date_str.replace('-', '/')}")
if end_date_str:
query_parts.append(f"before:{end_date_str.replace('-', '/')}")
query = " ".join(query_parts) if query_parts else ""
logger.info(
f"Gmail query for connector {connector_id}: '{query}' "
f"(start_date={start_date_str}, end_date={end_date_str})"
)
# Use smaller batch size to avoid 413 payload too large errors
batch_size = 50
page_token = None
total_documents_indexed = 0
total_documents_skipped = 0
total_messages_fetched = 0
result_size_estimate = None # Will be set from first API response
while total_messages_fetched < max_items:
# Calculate how many messages to fetch in this batch
remaining = max_items - total_messages_fetched
current_batch_size = min(batch_size, remaining)
# Use result_size_estimate if available, otherwise fall back to max_items
estimated_total = (
result_size_estimate if result_size_estimate is not None else max_items
)
# Cap estimated_total at max_items to avoid showing misleading progress
estimated_total = min(estimated_total, max_items)
await task_logger.log_task_progress(
log_entry,
f"Fetching Gmail messages batch via Composio for connector {connector_id} "
f"({total_messages_fetched}/{estimated_total} fetched, {total_documents_indexed} indexed)",
{
"stage": "fetching_messages",
"batch_size": current_batch_size,
"total_fetched": total_messages_fetched,
"total_indexed": total_documents_indexed,
"estimated_total": estimated_total,
},
)
# Fetch batch of messages
(
messages,
next_token,
result_size_estimate_batch,
error,
) = await composio_connector.list_gmail_messages(
query=query,
max_results=current_batch_size,
page_token=page_token,
)
if error:
await task_logger.log_task_failure(
log_entry, f"Failed to fetch Gmail messages: {error}", {}
)
return 0, f"Failed to fetch Gmail messages: {error}"
if not messages:
# No more messages available
break
# Update result_size_estimate from first response (Gmail provides this estimate)
if result_size_estimate is None and result_size_estimate_batch is not None:
result_size_estimate = result_size_estimate_batch
logger.info(
f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'"
)
total_messages_fetched += len(messages)
# Recalculate estimated_total after potentially updating result_size_estimate
estimated_total = (
result_size_estimate if result_size_estimate is not None else max_items
)
estimated_total = min(estimated_total, max_items)
logger.info(
f"Fetched batch of {len(messages)} Gmail messages "
f"(total: {total_messages_fetched}/{estimated_total})"
)
# Process batch incrementally
batch_indexed, batch_skipped = await _process_gmail_message_batch(
session=session,
messages=messages,
composio_connector=composio_connector,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
total_documents_indexed=total_documents_indexed,
)
total_documents_indexed += batch_indexed
total_documents_skipped += batch_skipped
logger.info(
f"Processed batch: {batch_indexed} indexed, {batch_skipped} skipped "
f"(total: {total_documents_indexed} indexed, {total_documents_skipped} skipped)"
)
# Batch commits happen in _process_gmail_message_batch every 10 documents
# This ensures progress is saved incrementally, preventing data loss on crashes
# Check if we should continue
if not next_token:
# No more pages available
break
if len(messages) < current_batch_size:
# Last page had fewer items than requested, we're done
break
# Continue with next page
page_token = next_token
if total_messages_fetched == 0:
success_msg = "No Gmail messages found in the specified date range"
await task_logger.log_task_success(
log_entry, success_msg, {"messages_count": 0}
)
# CRITICAL: Update timestamp even when no messages found so Electric SQL syncs and UI shows indexed status
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
return 0, None # Return None (not error) when no items found
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
# This ensures the UI shows "Last indexed" instead of "Never indexed"
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit to ensure all documents are persisted (safety net)
# This matches the pattern used in non-Composio Gmail indexer
logger.info(
f"Final commit: Total {total_documents_indexed} Gmail messages processed"
)
await session.commit()
logger.info(
"Successfully committed all Composio Gmail document changes to database"
)
await task_logger.log_task_success(
log_entry,
f"Successfully completed Gmail indexing via Composio for connector {connector_id}",
{
"documents_indexed": total_documents_indexed,
"documents_skipped": total_documents_skipped,
"messages_fetched": total_messages_fetched,
},
)
return total_documents_indexed, None
except Exception as e:
logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True)
return 0, f"Failed to index Gmail via Composio: {e!s}"

View file

@ -0,0 +1,502 @@
"""
Composio Google Calendar Connector Module.
Provides Google Calendar specific methods for data retrieval and indexing via Composio.
"""
import logging
from datetime import UTC, datetime
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.config import config
from app.connectors.composio_connector import ComposioConnector
from app.db import Document, DocumentType
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import (
calculate_date_range,
check_duplicate_document_by_hash,
)
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
logger = logging.getLogger(__name__)
def get_current_timestamp() -> datetime:
"""Get the current timestamp with timezone for updated_at field."""
return datetime.now(UTC)
async def check_document_by_unique_identifier(
session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
"""Check if a document with the given unique identifier hash already exists."""
existing_doc_result = await session.execute(
select(Document)
.options(selectinload(Document.chunks))
.where(Document.unique_identifier_hash == unique_identifier_hash)
)
return existing_doc_result.scalars().first()
async def update_connector_last_indexed(
session: AsyncSession,
connector,
update_last_indexed: bool = True,
) -> None:
"""Update the last_indexed_at timestamp for a connector."""
if update_last_indexed:
connector.last_indexed_at = datetime.now(UTC)
logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")
class ComposioGoogleCalendarConnector(ComposioConnector):
"""
Google Calendar specific Composio connector.
Provides methods for listing calendar events and formatting them from
Google Calendar via Composio.
"""
async def list_calendar_events(
self,
time_min: str | None = None,
time_max: str | None = None,
max_results: int = 250,
) -> tuple[list[dict[str, Any]], str | None]:
"""
List Google Calendar events via Composio.
Args:
time_min: Start time (RFC3339 format).
time_max: End time (RFC3339 format).
max_results: Maximum number of events.
Returns:
Tuple of (events list, error message).
"""
connected_account_id = await self.get_connected_account_id()
if not connected_account_id:
return [], "No connected account ID found"
entity_id = await self.get_entity_id()
service = await self._get_service()
return await service.get_calendar_events(
connected_account_id=connected_account_id,
entity_id=entity_id,
time_min=time_min,
time_max=time_max,
max_results=max_results,
)
def format_calendar_event_to_markdown(self, event: dict[str, Any]) -> str:
"""
Format a Google Calendar event to markdown.
Args:
event: Event object from Google Calendar API.
Returns:
Formatted markdown string.
"""
try:
# Extract basic event information
summary = event.get("summary", "No Title")
description = event.get("description", "")
location = event.get("location", "")
# Extract start and end times
start = event.get("start", {})
end = event.get("end", {})
start_time = start.get("dateTime") or start.get("date", "")
end_time = end.get("dateTime") or end.get("date", "")
# Format times for display
def format_time(time_str: str) -> str:
if not time_str:
return "Unknown"
try:
if "T" in time_str:
dt = datetime.fromisoformat(time_str.replace("Z", "+00:00"))
return dt.strftime("%Y-%m-%d %H:%M")
return time_str
except Exception:
return time_str
start_formatted = format_time(start_time)
end_formatted = format_time(end_time)
# Extract attendees
attendees = event.get("attendees", [])
attendee_list = []
for attendee in attendees:
email = attendee.get("email", "")
display_name = attendee.get("displayName", email)
response_status = attendee.get("responseStatus", "")
attendee_list.append(f"- {display_name} ({response_status})")
# Build markdown content
markdown_content = f"# {summary}\n\n"
markdown_content += f"**Start:** {start_formatted}\n"
markdown_content += f"**End:** {end_formatted}\n"
if location:
markdown_content += f"**Location:** {location}\n"
markdown_content += "\n"
if description:
markdown_content += f"## Description\n\n{description}\n\n"
if attendee_list:
markdown_content += "## Attendees\n\n"
markdown_content += "\n".join(attendee_list)
markdown_content += "\n\n"
# Add event metadata
markdown_content += "## Event Details\n\n"
markdown_content += f"- **Event ID:** {event.get('id', 'Unknown')}\n"
markdown_content += f"- **Created:** {event.get('created', 'Unknown')}\n"
markdown_content += f"- **Updated:** {event.get('updated', 'Unknown')}\n"
return markdown_content
except Exception as e:
return f"Error formatting event to markdown: {e!s}"
# ============ Indexer Functions ============
async def index_composio_google_calendar(
session: AsyncSession,
connector,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None,
end_date: str | None,
task_logger: TaskLoggingService,
log_entry,
update_last_indexed: bool = True,
max_items: int = 2500,
) -> tuple[int, str]:
"""Index Google Calendar events via Composio."""
try:
composio_connector = ComposioGoogleCalendarConnector(session, connector_id)
await task_logger.log_task_progress(
log_entry,
f"Fetching Google Calendar events via Composio for connector {connector_id}",
{"stage": "fetching_events"},
)
# Normalize date values - handle "undefined" strings from frontend
if start_date == "undefined" or start_date == "":
start_date = None
if end_date == "undefined" or end_date == "":
end_date = None
# Use provided dates directly if both are provided, otherwise calculate from last_indexed_at
# This ensures user-selected dates are respected (matching non-Composio Calendar connector behavior)
if start_date is not None and end_date is not None:
# User provided both dates - use them directly
start_date_str = start_date
end_date_str = end_date
else:
# Calculate date range with defaults (uses last_indexed_at or 365 days back)
# This ensures indexing works even when user doesn't specify dates
start_date_str, end_date_str = calculate_date_range(
connector, start_date, end_date, default_days_back=365
)
# Build time range for API call
time_min = f"{start_date_str}T00:00:00Z"
time_max = f"{end_date_str}T23:59:59Z"
logger.info(
f"Google Calendar query for connector {connector_id}: "
f"(start_date={start_date_str}, end_date={end_date_str})"
)
events, error = await composio_connector.list_calendar_events(
time_min=time_min,
time_max=time_max,
max_results=max_items,
)
if error:
await task_logger.log_task_failure(
log_entry, f"Failed to fetch Calendar events: {error}", {}
)
return 0, f"Failed to fetch Calendar events: {error}"
if not events:
success_msg = "No Google Calendar events found in the specified date range"
await task_logger.log_task_success(
log_entry, success_msg, {"events_count": 0}
)
# CRITICAL: Update timestamp even when no events found so Electric SQL syncs and UI shows indexed status
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
return (
0,
None,
) # Return None (not error) when no items found - this is success with 0 items
logger.info(f"Found {len(events)} Google Calendar events to index via Composio")
documents_indexed = 0
documents_skipped = 0
duplicate_content_count = (
0 # Track events skipped due to duplicate content_hash
)
for event in events:
try:
# Handle both standard Google API and potential Composio variations
event_id = event.get("id", "") or event.get("eventId", "")
summary = (
event.get("summary", "") or event.get("title", "") or "No Title"
)
if not event_id:
documents_skipped += 1
continue
# Format to markdown
markdown_content = composio_connector.format_calendar_event_to_markdown(
event
)
# Generate unique identifier
document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"])
unique_identifier_hash = generate_unique_identifier_hash(
document_type, f"calendar_{event_id}", search_space_id
)
content_hash = generate_content_hash(markdown_content, search_space_id)
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Extract event times
start = event.get("start", {})
end = event.get("end", {})
start_time = start.get("dateTime") or start.get("date", "")
end_time = end.get("dateTime") or end.get("date", "")
location = event.get("location", "")
if existing_document:
if existing_document.content_hash == content_hash:
documents_skipped += 1
continue
# Update existing
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"document_type": "Google Calendar Event (Composio)",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
if location:
summary_content += f"\nLocation: {location}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
existing_document.title = f"Calendar: {summary}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"end_time": end_time,
"location": location,
"connector_id": connector_id,
"source": "composio",
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
# Batch commit every 10 documents
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
)
await session.commit()
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from standard connector)
with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
if duplicate_by_content:
# A document with the same content already exists (likely from standard connector)
logger.info(
f"Event {summary} already indexed by another connector "
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
)
duplicate_content_count += 1
documents_skipped += 1
continue
# Create new document
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"document_type": "Google Calendar Event (Composio)",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = (
f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
)
if location:
summary_content += f"\nLocation: {location}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
document = Document(
search_space_id=search_space_id,
title=f"Calendar: {summary}",
document_type=DocumentType(
TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]
),
document_metadata={
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"end_time": end_time,
"location": location,
"connector_id": connector_id,
"toolkit_id": "googlecalendar",
"source": "composio",
},
content=summary_content,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
updated_at=get_current_timestamp(),
)
session.add(document)
documents_indexed += 1
# Batch commit every 10 documents
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
)
await session.commit()
except Exception as e:
logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
documents_skipped += 1
continue
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
# This ensures the UI shows "Last indexed" instead of "Never indexed"
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit to ensure all documents are persisted (safety net)
# This matches the pattern used in non-Composio Gmail indexer
logger.info(
f"Final commit: Total {documents_indexed} Google Calendar events processed"
)
try:
await session.commit()
logger.info(
"Successfully committed all Composio Google Calendar document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same event was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
# Build warning message if duplicates were found
warning_message = None
if duplicate_content_count > 0:
warning_message = f"{duplicate_content_count} skipped (duplicate)"
await task_logger.log_task_success(
log_entry,
f"Successfully completed Google Calendar indexing via Composio for connector {connector_id}",
{
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"duplicate_content_count": duplicate_content_count,
},
)
logger.info(
f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
f"({duplicate_content_count} due to duplicate content from other connectors)"
)
return documents_indexed, warning_message
except Exception as e:
logger.error(
f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True
)
return 0, f"Failed to index Google Calendar via Composio: {e!s}"

File diff suppressed because it is too large Load diff

View file

@ -142,6 +142,15 @@ class GoogleCalendarConnector:
flag_modified(connector, "config")
await self._session.commit()
except Exception as e:
error_str = str(e)
# Check if this is an invalid_grant error (token expired/revoked)
if (
"invalid_grant" in error_str.lower()
or "token has been expired or revoked" in error_str.lower()
):
raise Exception(
"Google Calendar authentication failed. Please re-authenticate."
) from e
raise Exception(
f"Failed to refresh Google OAuth credentials: {e!s}"
) from e
@ -165,6 +174,14 @@ class GoogleCalendarConnector:
self.service = build("calendar", "v3", credentials=credentials)
return self.service
except Exception as e:
error_str = str(e)
# If the error already contains a user-friendly re-authentication message, preserve it
if (
"re-authenticate" in error_str.lower()
or "expired or been revoked" in error_str.lower()
or "authentication failed" in error_str.lower()
):
raise Exception(error_str) from e
raise Exception(f"Failed to create Google Calendar service: {e!s}") from e
async def get_calendars(self) -> tuple[list[dict[str, Any]], str | None]:
@ -271,6 +288,14 @@ class GoogleCalendarConnector:
return events, None
except Exception as e:
error_str = str(e)
# If the error already contains a user-friendly re-authentication message, preserve it
if (
"re-authenticate" in error_str.lower()
or "expired or been revoked" in error_str.lower()
or "authentication failed" in error_str.lower()
):
return [], error_str
return [], f"Error fetching events: {e!s}"
def format_event_to_markdown(self, event: dict[str, Any]) -> str:

View file

@ -141,6 +141,15 @@ class GoogleGmailConnector:
flag_modified(connector, "config")
await self._session.commit()
except Exception as e:
error_str = str(e)
# Check if this is an invalid_grant error (token expired/revoked)
if (
"invalid_grant" in error_str.lower()
or "token has been expired or revoked" in error_str.lower()
):
raise Exception(
"Gmail authentication failed. Please re-authenticate."
) from e
raise Exception(
f"Failed to refresh Google OAuth credentials: {e!s}"
) from e
@ -164,6 +173,14 @@ class GoogleGmailConnector:
self.service = build("gmail", "v1", credentials=credentials)
return self.service
except Exception as e:
error_str = str(e)
# If the error already contains a user-friendly re-authentication message, preserve it
if (
"re-authenticate" in error_str.lower()
or "expired or been revoked" in error_str.lower()
or "authentication failed" in error_str.lower()
):
raise Exception(error_str) from e
raise Exception(f"Failed to create Gmail service: {e!s}") from e
async def get_user_profile(self) -> tuple[dict[str, Any], str | None]:
@ -225,6 +242,14 @@ class GoogleGmailConnector:
return messages, None
except Exception as e:
error_str = str(e)
# If the error already contains a user-friendly re-authentication message, preserve it
if (
"re-authenticate" in error_str.lower()
or "expired or been revoked" in error_str.lower()
or "authentication failed" in error_str.lower()
):
return [], error_str
return [], f"Error fetching messages list: {e!s}"
async def get_message_details(
@ -271,6 +296,13 @@ class GoogleGmailConnector:
try:
from datetime import datetime, timedelta
# Normalize date values - handle "undefined" strings from frontend
# This prevents "time data 'undefined' does not match format" errors
if start_date == "undefined" or start_date == "":
start_date = None
if end_date == "undefined" or end_date == "":
end_date = None
# Build date query
query_parts = []

View file

@ -55,7 +55,9 @@ class DocumentType(str, Enum):
CIRCLEBACK = "CIRCLEBACK"
OBSIDIAN_CONNECTOR = "OBSIDIAN_CONNECTOR"
NOTE = "NOTE"
COMPOSIO_CONNECTOR = "COMPOSIO_CONNECTOR" # Generic Composio integration
COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"
COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR"
COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"
class SearchSourceConnectorType(str, Enum):
@ -86,9 +88,9 @@ class SearchSourceConnectorType(str, Enum):
"OBSIDIAN_CONNECTOR" # Self-hosted only - Local Obsidian vault indexing
)
MCP_CONNECTOR = "MCP_CONNECTOR" # Model Context Protocol - User-defined API tools
COMPOSIO_CONNECTOR = (
"COMPOSIO_CONNECTOR" # Generic Composio integration (Google, Slack, etc.)
)
COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"
COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR"
COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"
class LiteLLMProvider(str, Enum):
@ -142,6 +144,43 @@ class LogStatus(str, Enum):
FAILED = "FAILED"
class IncentiveTaskType(str, Enum):
"""
Enum for incentive task types that users can complete to earn free pages.
Each task can only be completed once per user.
When adding new tasks:
1. Add a new enum value here
2. Add the task configuration to INCENTIVE_TASKS_CONFIG below
3. Create an Alembic migration to add the enum value to PostgreSQL
"""
GITHUB_STAR = "GITHUB_STAR"
# Future tasks can be added here:
# GITHUB_ISSUE = "GITHUB_ISSUE"
# SOCIAL_SHARE = "SOCIAL_SHARE"
# REFER_FRIEND = "REFER_FRIEND"
# Centralized configuration for incentive tasks
# This makes it easy to add new tasks without changing code in multiple places
INCENTIVE_TASKS_CONFIG = {
IncentiveTaskType.GITHUB_STAR: {
"title": "Star our GitHub repository",
"description": "Show your support by starring SurfSense on GitHub",
"pages_reward": 100,
"action_url": "https://github.com/MODSetter/SurfSense",
},
# Future tasks can be configured here:
# IncentiveTaskType.GITHUB_ISSUE: {
# "title": "Create an issue",
# "description": "Help improve SurfSense by reporting bugs or suggesting features",
# "pages_reward": 50,
# "action_url": "https://github.com/MODSetter/SurfSense/issues/new/choose",
# },
}
class Permission(str, Enum):
"""
Granular permissions for search space resources.
@ -936,6 +975,39 @@ class Notification(BaseModel, TimestampMixin):
search_space = relationship("SearchSpace", back_populates="notifications")
class UserIncentiveTask(BaseModel, TimestampMixin):
"""
Tracks completed incentive tasks for users.
Each user can only complete each task type once.
When a task is completed, the user's pages_limit is increased.
"""
__tablename__ = "user_incentive_tasks"
__table_args__ = (
UniqueConstraint(
"user_id",
"task_type",
name="uq_user_incentive_task",
),
)
user_id = Column(
UUID(as_uuid=True),
ForeignKey("user.id", ondelete="CASCADE"),
nullable=False,
index=True,
)
task_type = Column(SQLAlchemyEnum(IncentiveTaskType), nullable=False, index=True)
pages_awarded = Column(Integer, nullable=False)
completed_at = Column(
TIMESTAMP(timezone=True),
nullable=False,
default=lambda: datetime.now(UTC),
)
user = relationship("User", back_populates="incentive_tasks")
class SearchSpaceRole(BaseModel, TimestampMixin):
"""
Custom roles that can be defined per search space.
@ -1114,6 +1186,13 @@ if config.AUTH_TYPE == "GOOGLE":
cascade="all, delete-orphan",
)
# Incentive tasks completed by this user
incentive_tasks = relationship(
"UserIncentiveTask",
back_populates="user",
cascade="all, delete-orphan",
)
# Page usage tracking for ETL services
pages_limit = Column(
Integer,
@ -1165,6 +1244,13 @@ else:
cascade="all, delete-orphan",
)
# Incentive tasks completed by this user
incentive_tasks = relationship(
"UserIncentiveTask",
back_populates="user",
cascade="all, delete-orphan",
)
# Page usage tracking for ETL services
pages_limit = Column(
Integer,

View file

@ -20,6 +20,7 @@ from .google_drive_add_connector_route import (
from .google_gmail_add_connector_route import (
router as google_gmail_add_connector_router,
)
from .incentive_tasks_routes import router as incentive_tasks_router
from .jira_add_connector_route import router as jira_add_connector_router
from .linear_add_connector_route import router as linear_add_connector_router
from .logs_routes import router as logs_router
@ -69,3 +70,4 @@ router.include_router(surfsense_docs_router) # Surfsense documentation for cita
router.include_router(notifications_router) # Notifications with Electric SQL sync
router.include_router(composio_router) # Composio OAuth and toolkit management
router.include_router(public_chat_router) # Public chat sharing and cloning
router.include_router(incentive_tasks_router) # Incentive tasks for earning free pages

View file

@ -8,16 +8,18 @@ Endpoints:
- GET /composio/toolkits - List available Composio toolkits
- GET /auth/composio/connector/add - Initiate OAuth for a specific toolkit
- GET /auth/composio/connector/callback - Handle OAuth callback
- GET /connectors/{connector_id}/composio-drive/folders - List folders/files for Composio Google Drive
"""
import logging
from uuid import UUID
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi.responses import RedirectResponse
from pydantic import ValidationError
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.config import config
from app.db import (
@ -29,19 +31,31 @@ from app.db import (
from app.services.composio_service import (
COMPOSIO_TOOLKIT_NAMES,
INDEXABLE_TOOLKITS,
TOOLKIT_TO_CONNECTOR_TYPE,
ComposioService,
)
from app.users import current_active_user
from app.utils.connector_naming import (
check_duplicate_connector,
generate_unique_connector_name,
count_connectors_of_type,
get_base_name_for_type,
)
from app.utils.oauth_security import OAuthStateManager
# Note: We no longer use check_duplicate_connector for Composio connectors because
# Composio generates a new connected_account_id each time, even for the same Google account.
# Instead, we check for existing connectors by type/space/user and update them.
logger = logging.getLogger(__name__)
router = APIRouter()
# Map toolkit_id to frontend connector ID
TOOLKIT_TO_FRONTEND_CONNECTOR_ID = {
"googledrive": "composio-googledrive",
"gmail": "composio-gmail",
"googlecalendar": "composio-googlecalendar",
}
# Initialize security utilities
_state_manager = None
@ -166,11 +180,8 @@ async def initiate_composio_auth(
@router.get("/auth/composio/connector/callback")
async def composio_callback(
request: Request,
state: str | None = None,
composio_connected_account_id: str | None = Query(
None, alias="connectedAccountId"
), # Composio sends camelCase
connected_account_id: str | None = None, # Fallback snake_case
error: str | None = None,
session: AsyncSession = Depends(get_async_session),
):
@ -236,16 +247,17 @@ async def composio_callback(
)
# Initialize Composio service
ComposioService()
service = ComposioService()
# Use camelCase param if provided (Composio's format), fallback to snake_case
final_connected_account_id = (
composio_connected_account_id or connected_account_id
)
# Extract connected_account_id from query params (accepts both camelCase and snake_case)
query_params = request.query_params
final_connected_account_id = query_params.get(
"connectedAccountId"
) or query_params.get("connected_account_id")
# DEBUG: Log all query parameters received
# DEBUG: Log query parameter received
logger.info(
f"DEBUG: Callback received - connectedAccountId: {composio_connected_account_id}, connected_account_id: {connected_account_id}, using: {final_connected_account_id}"
f"DEBUG: Callback received - connectedAccountId: {query_params.get('connectedAccountId')}, connected_account_id: {query_params.get('connected_account_id')}, using: {final_connected_account_id}"
)
# If we still don't have a connected_account_id, warn but continue
@ -268,38 +280,89 @@ async def composio_callback(
"is_indexable": toolkit_id in INDEXABLE_TOOLKITS,
}
# Check for duplicate connector
# For Composio, we use toolkit_id + connected_account_id as unique identifier
identifier = final_connected_account_id or f"{toolkit_id}_{user_id}"
# Get the specific connector type for this toolkit
connector_type_str = TOOLKIT_TO_CONNECTOR_TYPE.get(toolkit_id)
if not connector_type_str:
raise HTTPException(
status_code=400,
detail=f"Unknown toolkit: {toolkit_id}. Available: {list(TOOLKIT_TO_CONNECTOR_TYPE.keys())}",
)
connector_type = SearchSourceConnectorType(connector_type_str)
is_duplicate = await check_duplicate_connector(
session,
SearchSourceConnectorType.COMPOSIO_CONNECTOR,
space_id,
user_id,
identifier,
# Check for existing connector of the same type for this user/space
# When reconnecting, Composio gives a new connected_account_id, so we need to
# check by connector_type, user_id, and search_space_id instead of connected_account_id
existing_connector_result = await session.execute(
select(SearchSourceConnector).where(
SearchSourceConnector.connector_type == connector_type,
SearchSourceConnector.search_space_id == space_id,
SearchSourceConnector.user_id == user_id,
)
)
if is_duplicate:
logger.warning(
f"Duplicate Composio connector detected for user {user_id} with toolkit {toolkit_id}"
existing_connector = existing_connector_result.scalars().first()
if existing_connector:
# Delete the old Composio connected account before updating
old_connected_account_id = existing_connector.config.get(
"composio_connected_account_id"
)
if (
old_connected_account_id
and old_connected_account_id != final_connected_account_id
):
try:
deleted = await service.delete_connected_account(
old_connected_account_id
)
if deleted:
logger.info(
f"Deleted old Composio connected account {old_connected_account_id} "
f"before updating connector {existing_connector.id}"
)
else:
logger.warning(
f"Failed to delete old Composio connected account {old_connected_account_id}"
)
except Exception as delete_error:
# Log but don't fail - the old account may already be deleted
logger.warning(
f"Error deleting old Composio connected account {old_connected_account_id}: {delete_error!s}"
)
# Update existing connector with new connected_account_id
logger.info(
f"Updating existing Composio connector {existing_connector.id} with new connected_account_id {final_connected_account_id}"
)
existing_connector.config = connector_config
await session.commit()
await session.refresh(existing_connector)
# Get the frontend connector ID based on toolkit_id
frontend_connector_id = TOOLKIT_TO_FRONTEND_CONNECTOR_ID.get(
toolkit_id, "composio-connector"
)
return RedirectResponse(
url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&error=duplicate_account&connector=composio-connector"
url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector={frontend_connector_id}&connectorId={existing_connector.id}"
)
try:
# Generate a unique, user-friendly connector name
connector_name = await generate_unique_connector_name(
session,
SearchSourceConnectorType.COMPOSIO_CONNECTOR,
space_id,
user_id,
f"{toolkit_name} (Composio)",
# Count existing connectors of this type to determine the number
count = await count_connectors_of_type(
session, connector_type, space_id, user_id
)
# Generate base name (e.g., "Gmail", "Google Drive")
base_name = get_base_name_for_type(connector_type)
# Format: "Gmail (Composio) 1", "Gmail (Composio) 2", etc.
if count == 0:
connector_name = f"{base_name} (Composio) 1"
else:
connector_name = f"{base_name} (Composio) {count + 1}"
db_connector = SearchSourceConnector(
name=connector_name,
connector_type=SearchSourceConnectorType.COMPOSIO_CONNECTOR,
connector_type=connector_type,
config=connector_config,
search_space_id=space_id,
user_id=user_id,
@ -314,8 +377,12 @@ async def composio_callback(
f"Successfully created Composio connector {db_connector.id} for user {user_id}, toolkit {toolkit_id}"
)
# Get the frontend connector ID based on toolkit_id
frontend_connector_id = TOOLKIT_TO_FRONTEND_CONNECTOR_ID.get(
toolkit_id, "composio-connector"
)
return RedirectResponse(
url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector=composio-connector&connectorId={db_connector.id}"
url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector={frontend_connector_id}&connectorId={db_connector.id}"
)
except IntegrityError as e:
@ -339,3 +406,136 @@ async def composio_callback(
raise HTTPException(
status_code=500, detail=f"Failed to complete Composio OAuth: {e!s}"
) from e
@router.get("/connectors/{connector_id}/composio-drive/folders")
async def list_composio_drive_folders(
connector_id: int,
parent_id: str | None = None,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
List folders AND files in user's Google Drive via Composio with hierarchical support.
This is called at index time from the manage connector page to display
the complete file system (folders and files). Only folders are selectable.
Args:
connector_id: ID of the Composio Google Drive connector
parent_id: Optional parent folder ID to list contents (None for root)
Returns:
JSON with list of items: {
"items": [
{"id": str, "name": str, "mimeType": str, "isFolder": bool, ...},
...
]
}
"""
if not ComposioService.is_enabled():
raise HTTPException(
status_code=503,
detail="Composio integration is not enabled.",
)
try:
# Get connector and verify ownership
result = await session.execute(
select(SearchSourceConnector).filter(
SearchSourceConnector.id == connector_id,
SearchSourceConnector.user_id == user.id,
SearchSourceConnector.connector_type
== SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
)
)
connector = result.scalars().first()
if not connector:
raise HTTPException(
status_code=404,
detail="Composio Google Drive connector not found or access denied",
)
# Get Composio connected account ID from config
composio_connected_account_id = connector.config.get(
"composio_connected_account_id"
)
if not composio_connected_account_id:
raise HTTPException(
status_code=400,
detail="Composio connected account not found. Please reconnect the connector.",
)
# Initialize Composio service and fetch files
service = ComposioService()
entity_id = f"surfsense_{user.id}"
# Fetch files/folders from Composio Google Drive
files, _next_token, error = await service.get_drive_files(
connected_account_id=composio_connected_account_id,
entity_id=entity_id,
folder_id=parent_id,
page_size=100,
)
if error:
logger.error(f"Failed to list Composio Drive files: {error}")
raise HTTPException(
status_code=500, detail=f"Failed to list folder contents: {error}"
)
# Transform files to match the expected format with isFolder field
items = []
for file_info in files:
file_id = file_info.get("id", "") or file_info.get("fileId", "")
file_name = (
file_info.get("name", "") or file_info.get("fileName", "") or "Untitled"
)
mime_type = file_info.get("mimeType", "") or file_info.get("mime_type", "")
if not file_id:
continue
is_folder = mime_type == "application/vnd.google-apps.folder"
items.append(
{
"id": file_id,
"name": file_name,
"mimeType": mime_type,
"isFolder": is_folder,
"parents": file_info.get("parents", []),
"size": file_info.get("size"),
"iconLink": file_info.get("iconLink"),
}
)
# Sort: folders first, then files, both alphabetically
folders = sorted(
[item for item in items if item["isFolder"]],
key=lambda x: x["name"].lower(),
)
files_list = sorted(
[item for item in items if not item["isFolder"]],
key=lambda x: x["name"].lower(),
)
items = folders + files_list
folder_count = len(folders)
file_count = len(files_list)
logger.info(
f"Listed {len(items)} total items ({folder_count} folders, {file_count} files) for Composio connector {connector_id}"
+ (f" in folder {parent_id}" if parent_id else " in ROOT")
)
return {"items": items}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error listing Composio Drive contents: {e!s}", exc_info=True)
raise HTTPException(
status_code=500, detail=f"Failed to list Drive contents: {e!s}"
) from e

View file

@ -402,7 +402,7 @@ async def list_google_drive_folders(
file_count = len(items) - folder_count
logger.info(
f"Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}"
f"Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}"
+ (f" in folder {parent_id}" if parent_id else " in ROOT")
)

View file

@ -0,0 +1,131 @@
"""
Incentive Tasks API routes.
Allows users to complete tasks (like starring GitHub repo) to earn free pages.
Each task can only be completed once per user.
"""
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import (
INCENTIVE_TASKS_CONFIG,
IncentiveTaskType,
User,
UserIncentiveTask,
get_async_session,
)
from app.schemas.incentive_tasks import (
CompleteTaskResponse,
IncentiveTaskInfo,
IncentiveTasksResponse,
TaskAlreadyCompletedResponse,
)
from app.users import current_active_user
router = APIRouter(prefix="/incentive-tasks", tags=["incentive-tasks"])
@router.get("", response_model=IncentiveTasksResponse)
async def get_incentive_tasks(
user: User = Depends(current_active_user),
session: AsyncSession = Depends(get_async_session),
) -> IncentiveTasksResponse:
"""
Get all available incentive tasks with the user's completion status.
"""
# Get all completed tasks for this user
result = await session.execute(
select(UserIncentiveTask).where(UserIncentiveTask.user_id == user.id)
)
completed_tasks = {task.task_type: task for task in result.scalars().all()}
# Build task list with completion status
tasks = []
total_pages_earned = 0
for task_type, config in INCENTIVE_TASKS_CONFIG.items():
completed_task = completed_tasks.get(task_type)
is_completed = completed_task is not None
if is_completed:
total_pages_earned += completed_task.pages_awarded
tasks.append(
IncentiveTaskInfo(
task_type=task_type,
title=config["title"],
description=config["description"],
pages_reward=config["pages_reward"],
action_url=config["action_url"],
completed=is_completed,
completed_at=completed_task.completed_at if completed_task else None,
)
)
return IncentiveTasksResponse(
tasks=tasks,
total_pages_earned=total_pages_earned,
)
@router.post(
"/{task_type}/complete",
response_model=CompleteTaskResponse | TaskAlreadyCompletedResponse,
)
async def complete_task(
task_type: IncentiveTaskType,
user: User = Depends(current_active_user),
session: AsyncSession = Depends(get_async_session),
) -> CompleteTaskResponse | TaskAlreadyCompletedResponse:
"""
Mark an incentive task as completed and award pages to the user.
Each task can only be completed once. If the task was already completed,
returns the existing completion information without awarding additional pages.
"""
# Validate task type exists in config
task_config = INCENTIVE_TASKS_CONFIG.get(task_type)
if not task_config:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Unknown task type: {task_type}",
)
# Check if task was already completed
existing_task = await session.execute(
select(UserIncentiveTask).where(
UserIncentiveTask.user_id == user.id,
UserIncentiveTask.task_type == task_type,
)
)
existing = existing_task.scalar_one_or_none()
if existing:
return TaskAlreadyCompletedResponse(
success=False,
message="Task already completed",
completed_at=existing.completed_at,
)
# Create the task completion record
pages_reward = task_config["pages_reward"]
new_task = UserIncentiveTask(
user_id=user.id,
task_type=task_type,
pages_awarded=pages_reward,
)
session.add(new_task)
# Update user's pages_limit
user.pages_limit += pages_reward
await session.commit()
await session.refresh(user)
return CompleteTaskResponse(
success=True,
message=f"Task completed! You earned {pages_reward} pages.",
pages_awarded=pages_reward,
new_pages_limit=user.pages_limit,
)

View file

@ -59,6 +59,58 @@ router = APIRouter()
# ============ Permissions Endpoints ============
# Human-readable descriptions for each permission
PERMISSION_DESCRIPTIONS = {
# Documents
"documents:create": "Add new documents, files, and content to the search space",
"documents:read": "View and search documents in the search space",
"documents:update": "Edit existing documents and their metadata",
"documents:delete": "Remove documents from the search space",
# Chats
"chats:create": "Start new AI chat conversations",
"chats:read": "View chat history and conversations",
"chats:update": "Edit chat titles and settings",
"chats:delete": "Delete chat conversations",
# Comments
"comments:create": "Add comments and annotations to documents",
"comments:read": "View comments on documents",
"comments:delete": "Remove comments from documents",
# LLM Configs
"llm_configs:create": "Add new AI model configurations",
"llm_configs:read": "View AI model settings and configurations",
"llm_configs:update": "Modify AI model configurations",
"llm_configs:delete": "Remove AI model configurations",
# Podcasts
"podcasts:create": "Generate new AI podcasts from content",
"podcasts:read": "Listen to and view generated podcasts",
"podcasts:update": "Edit podcast settings and metadata",
"podcasts:delete": "Remove generated podcasts",
# Connectors
"connectors:create": "Set up new data source integrations",
"connectors:read": "View configured data sources and their status",
"connectors:update": "Modify data source configurations",
"connectors:delete": "Remove data source integrations",
# Logs
"logs:read": "View activity logs and audit trail",
"logs:delete": "Clear activity logs",
# Members
"members:invite": "Send invitations to new team members",
"members:view": "View the list of team members",
"members:remove": "Remove members from the search space",
"members:manage_roles": "Assign and change member roles",
# Roles
"roles:create": "Create new custom roles",
"roles:read": "View available roles and their permissions",
"roles:update": "Modify role permissions",
"roles:delete": "Remove custom roles",
# Settings
"settings:view": "View search space settings",
"settings:update": "Modify search space settings",
"settings:delete": "Delete the entire search space",
# Full access
"*": "Full access to all features and settings",
}
@router.get("/permissions", response_model=PermissionsListResponse)
async def list_all_permissions(
@ -71,12 +123,14 @@ async def list_all_permissions(
for perm in Permission:
# Extract category from permission value (e.g., "documents:read" -> "documents")
category = perm.value.split(":")[0] if ":" in perm.value else "general"
description = PERMISSION_DESCRIPTIONS.get(perm.value, f"Permission for {perm.value}")
permissions.append(
PermissionInfo(
value=perm.value,
name=perm.name,
category=category,
description=description,
)
)

View file

@ -22,6 +22,8 @@ import logging
from datetime import UTC, datetime, timedelta
from typing import Any
import pytz
from dateutil.parser import isoparse
from fastapi import APIRouter, Body, Depends, HTTPException, Query
from pydantic import BaseModel, Field, ValidationError
from sqlalchemy.exc import IntegrityError
@ -47,6 +49,7 @@ from app.schemas import (
SearchSourceConnectorRead,
SearchSourceConnectorUpdate,
)
from app.services.composio_service import ComposioService
from app.services.notification_service import NotificationService
from app.tasks.connector_indexers import (
index_airtable_records,
@ -529,6 +532,38 @@ async def delete_search_source_connector(
f"Failed to delete periodic schedule for connector {connector_id}"
)
# For Composio connectors, also delete the connected account in Composio
composio_connector_types = [
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
]
if db_connector.connector_type in composio_connector_types:
composio_connected_account_id = db_connector.config.get(
"composio_connected_account_id"
)
if composio_connected_account_id and ComposioService.is_enabled():
try:
service = ComposioService()
deleted = await service.delete_connected_account(
composio_connected_account_id
)
if deleted:
logger.info(
f"Successfully deleted Composio connected account {composio_connected_account_id} "
f"for connector {connector_id}"
)
else:
logger.warning(
f"Failed to delete Composio connected account {composio_connected_account_id} "
f"for connector {connector_id}"
)
except Exception as composio_error:
# Log but don't fail the deletion - Composio account may already be deleted
logger.warning(
f"Error deleting Composio connected account {composio_connected_account_id}: {composio_error!s}"
)
await session.delete(db_connector)
await session.commit()
return {"message": "Search source connector deleted successfully"}
@ -611,32 +646,59 @@ async def index_connector_content(
# Handle different connector types
response_message = ""
today_str = datetime.now().strftime("%Y-%m-%d")
# Use UTC for consistency with last_indexed_at storage
today_str = datetime.now(UTC).strftime("%Y-%m-%d")
# Determine the actual date range to use
if start_date is None:
# Use last_indexed_at or default to 365 days ago
if connector.last_indexed_at:
today = datetime.now().date()
if connector.last_indexed_at.date() == today:
# If last indexed today, go back 1 day to ensure we don't miss anything
indexing_from = (today - timedelta(days=1)).strftime("%Y-%m-%d")
else:
indexing_from = connector.last_indexed_at.strftime("%Y-%m-%d")
else:
indexing_from = (datetime.now() - timedelta(days=365)).strftime(
"%Y-%m-%d"
# Convert last_indexed_at to timezone-naive for comparison (like calculate_date_range does)
last_indexed_naive = (
connector.last_indexed_at.replace(tzinfo=None)
if connector.last_indexed_at.tzinfo
else connector.last_indexed_at
)
# Use UTC for "today" to match how last_indexed_at is stored
today_utc = datetime.now(UTC).replace(tzinfo=None).date()
last_indexed_date = last_indexed_naive.date()
if last_indexed_date == today_utc:
# If last indexed today, go back 1 day to ensure we don't miss anything
indexing_from = (today_utc - timedelta(days=1)).strftime("%Y-%m-%d")
else:
indexing_from = last_indexed_naive.strftime("%Y-%m-%d")
else:
indexing_from = (
datetime.now(UTC).replace(tzinfo=None) - timedelta(days=365)
).strftime("%Y-%m-%d")
else:
indexing_from = start_date
# For calendar connectors, default to today but allow future dates if explicitly provided
if connector.connector_type in [
SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
SearchSourceConnectorType.LUMA_CONNECTOR,
]:
# Default to today if no end_date provided (users can manually select future dates)
indexing_to = today_str if end_date is None else end_date
# If start_date and end_date are the same, adjust end_date to be one day later
# to ensure valid date range (start_date must be strictly before end_date)
if indexing_from == indexing_to:
dt = isoparse(indexing_to)
if dt.tzinfo is None:
dt = dt.replace(tzinfo=pytz.UTC)
else:
dt = dt.astimezone(pytz.UTC)
# Add one day to end_date to make it strictly after start_date
dt_end = dt + timedelta(days=1)
indexing_to = dt_end.strftime("%Y-%m-%d")
logger.info(
f"Adjusted end_date from {end_date} to {indexing_to} "
f"to ensure valid date range (start_date must be strictly before end_date)"
)
else:
# For non-calendar connectors, cap at today
indexing_to = end_date if end_date else today_str
@ -887,11 +949,66 @@ async def index_connector_content(
)
response_message = "Obsidian vault indexing started in the background."
elif connector.connector_type == SearchSourceConnectorType.COMPOSIO_CONNECTOR:
elif (
connector.connector_type
== SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
):
from app.tasks.celery_tasks.connector_tasks import (
index_composio_connector_task,
)
# For Composio Google Drive, if drive_items is provided, update connector config
# This allows the UI to pass folder/file selection like the regular Google Drive connector
if drive_items and drive_items.has_items():
# Update connector config with the selected folders/files
config = connector.config or {}
config["selected_folders"] = [
{"id": f.id, "name": f.name} for f in drive_items.folders
]
config["selected_files"] = [
{"id": f.id, "name": f.name} for f in drive_items.files
]
if drive_items.indexing_options:
config["indexing_options"] = {
"max_files_per_folder": drive_items.indexing_options.max_files_per_folder,
"incremental_sync": drive_items.indexing_options.incremental_sync,
"include_subfolders": drive_items.indexing_options.include_subfolders,
}
connector.config = config
from sqlalchemy.orm.attributes import flag_modified
flag_modified(connector, "config")
await session.commit()
await session.refresh(connector)
logger.info(
f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id}, "
f"folders: {len(drive_items.folders)}, files: {len(drive_items.files)}"
)
else:
logger.info(
f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id} "
f"using existing config (from {indexing_from} to {indexing_to})"
)
index_composio_connector_task.delay(
connector_id, search_space_id, str(user.id), indexing_from, indexing_to
)
response_message = (
"Composio Google Drive indexing started in the background."
)
elif connector.connector_type in [
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
]:
from app.tasks.celery_tasks.connector_tasks import (
index_composio_connector_task,
)
# For Composio Gmail and Calendar, use the same date calculation logic as normal connectors
# This ensures consistent behavior and uses last_indexed_at to reduce API calls
# (includes special case: if indexed today, go back 1 day to avoid missing data)
logger.info(
f"Triggering Composio connector indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
)
@ -943,7 +1060,9 @@ async def _update_connector_timestamp_by_id(session: AsyncSession, connector_id:
connector = result.scalars().first()
if connector:
connector.last_indexed_at = datetime.now()
connector.last_indexed_at = datetime.now(
UTC
) # Use UTC for timezone consistency
await session.commit()
logger.info(f"Updated last_indexed_at for connector {connector_id}")
except Exception as e:
@ -1083,18 +1202,24 @@ async def _run_indexing_with_notifications(
)
await update_timestamp_func(session, connector_id)
await session.commit() # Commit timestamp update
logger.info(
f"Indexing completed successfully: {documents_processed} documents processed"
)
# Update notification on success
# Update notification on success (or partial success with errors)
if notification:
# Refresh notification to ensure it's not stale after timestamp update commit
await session.refresh(notification)
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
indexed_count=documents_processed,
error_message=None,
error_message=error_or_warning, # Show errors even if some documents were indexed
)
await (
session.commit()
) # Commit to ensure Electric SQL syncs the notification update
elif documents_processed > 0:
# Update notification to storing stage
if notification:
@ -1110,24 +1235,73 @@ async def _run_indexing_with_notifications(
f"Indexing completed successfully: {documents_processed} documents processed"
)
if notification:
# Refresh notification to ensure it's not stale after indexing function commits
await session.refresh(notification)
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
indexed_count=documents_processed,
error_message=None,
error_message=error_or_warning, # Show errors even if some documents were indexed
)
await (
session.commit()
) # Commit to ensure Electric SQL syncs the notification update
else:
# No new documents processed - check if this is an error or just no changes
if error_or_warning:
# Actual failure
logger.error(f"Indexing failed: {error_or_warning}")
if notification:
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
indexed_count=0,
error_message=error_or_warning,
)
# Check if this is a duplicate warning or empty result (success cases) or an actual error
# Handle both normal and Composio calendar connectors
error_or_warning_lower = (
str(error_or_warning).lower() if error_or_warning else ""
)
is_duplicate_warning = "skipped (duplicate)" in error_or_warning_lower
# "No X found" messages are success cases - sync worked, just found nothing in date range
is_empty_result = (
"no " in error_or_warning_lower
and "found" in error_or_warning_lower
)
if is_duplicate_warning or is_empty_result:
# These are success cases - sync worked, just found nothing new
logger.info(f"Indexing completed successfully: {error_or_warning}")
# Still update timestamp so ElectricSQL syncs and clears "Syncing" UI
if update_timestamp_func:
await update_timestamp_func(session, connector_id)
await session.commit() # Commit timestamp update
if notification:
# Refresh notification to ensure it's not stale after timestamp update commit
await session.refresh(notification)
# For empty results, use a cleaner message
notification_message = (
"No new items found in date range"
if is_empty_result
else error_or_warning
)
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
indexed_count=0,
error_message=notification_message, # Pass as warning, not error
is_warning=True, # Flag to indicate this is a warning, not an error
)
await (
session.commit()
) # Commit to ensure Electric SQL syncs the notification update
else:
# Actual failure
logger.error(f"Indexing failed: {error_or_warning}")
if notification:
# Refresh notification to ensure it's not stale after indexing function commits
await session.refresh(notification)
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
indexed_count=0,
error_message=error_or_warning,
)
await (
session.commit()
) # Commit to ensure Electric SQL syncs the notification update
else:
# Success - just no new documents to index (all skipped/unchanged)
logger.info(
@ -1136,13 +1310,19 @@ async def _run_indexing_with_notifications(
# Still update timestamp so ElectricSQL syncs and clears "Syncing" UI
if update_timestamp_func:
await update_timestamp_func(session, connector_id)
await session.commit() # Commit timestamp update
if notification:
# Refresh notification to ensure it's not stale after timestamp update commit
await session.refresh(notification)
await NotificationService.connector_indexing.notify_indexing_completed(
session=session,
notification=notification,
indexed_count=0,
error_message=None, # No error - sync succeeded
)
await (
session.commit()
) # Commit to ensure Electric SQL syncs the notification update
except Exception as e:
logger.error(f"Error in indexing task: {e!s}", exc_info=True)
@ -2157,6 +2337,59 @@ async def run_obsidian_indexing(
)
async def run_composio_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Create a new session and run the Composio indexing task.
This prevents session leaks by creating a dedicated session for the background task.
"""
async with async_session_maker() as session:
await run_composio_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
async def run_composio_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None,
end_date: str | None,
):
"""
Run Composio connector indexing with real-time notifications.
This wraps the Composio indexer with the notification system so that
Electric SQL can sync indexing progress to the frontend in real-time.
Args:
session: Database session
connector_id: ID of the Composio connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for indexing
end_date: End date for indexing
"""
from app.tasks.composio_indexer import index_composio_connector
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
indexing_function=index_composio_connector,
update_timestamp_func=_update_connector_timestamp_by_id,
)
# =============================================================================
# MCP Connector Routes
# =============================================================================

View file

@ -129,6 +129,7 @@ async def read_search_spaces(
result = await session.execute(
select(SearchSpace)
.filter(SearchSpace.user_id == user.id)
.order_by(SearchSpace.id.asc())
.offset(skip)
.limit(limit)
)
@ -138,6 +139,7 @@ async def read_search_spaces(
select(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
.order_by(SearchSpace.id.asc())
.offset(skip)
.limit(limit)
)

View file

@ -0,0 +1,61 @@
"""
Schemas for incentive tasks API.
"""
from datetime import datetime
from pydantic import BaseModel
from app.db import INCENTIVE_TASKS_CONFIG, IncentiveTaskType
class IncentiveTaskInfo(BaseModel):
"""Information about an available incentive task."""
task_type: IncentiveTaskType
title: str
description: str
pages_reward: int
action_url: str
completed: bool
completed_at: datetime | None = None
class IncentiveTasksResponse(BaseModel):
"""Response containing all available incentive tasks with completion status."""
tasks: list[IncentiveTaskInfo]
total_pages_earned: int
class CompleteTaskRequest(BaseModel):
"""Request to mark a task as completed."""
task_type: IncentiveTaskType
class CompleteTaskResponse(BaseModel):
"""Response after completing a task."""
success: bool
message: str
pages_awarded: int
new_pages_limit: int
class TaskAlreadyCompletedResponse(BaseModel):
"""Response when task was already completed."""
success: bool
message: str
completed_at: datetime
def get_task_info(task_type: IncentiveTaskType) -> dict | None:
"""Get task configuration by type."""
return INCENTIVE_TASKS_CONFIG.get(task_type)
def get_all_task_types() -> list[IncentiveTaskType]:
"""Get all configured task types."""
return list(INCENTIVE_TASKS_CONFIG.keys())

View file

@ -167,6 +167,7 @@ class PermissionInfo(BaseModel):
value: str
name: str
category: str
description: str
class PermissionsListResponse(BaseModel):

View file

@ -39,21 +39,73 @@ COMPOSIO_TOOLKIT_NAMES = {
# Toolkits that support indexing (Phase 1: Google services only)
INDEXABLE_TOOLKITS = {"googledrive", "gmail", "googlecalendar"}
# Mapping of toolkit IDs to connector types
TOOLKIT_TO_CONNECTOR_TYPE = {
"googledrive": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
"gmail": "COMPOSIO_GMAIL_CONNECTOR",
"googlecalendar": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
}
# Mapping of toolkit IDs to document types
TOOLKIT_TO_DOCUMENT_TYPE = {
"googledrive": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
"gmail": "COMPOSIO_GMAIL_CONNECTOR",
"googlecalendar": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
}
# Mapping of toolkit IDs to their indexer functions
# Format: toolkit_id -> (module_path, function_name, supports_date_filter)
# supports_date_filter: True if the indexer accepts start_date/end_date params
TOOLKIT_TO_INDEXER = {
"googledrive": (
"app.connectors.composio_google_drive_connector",
"index_composio_google_drive",
False, # Google Drive doesn't use date filtering
),
"gmail": (
"app.connectors.composio_gmail_connector",
"index_composio_gmail",
True, # Gmail uses date filtering
),
"googlecalendar": (
"app.connectors.composio_google_calendar_connector",
"index_composio_google_calendar",
True, # Calendar uses date filtering
),
}
class ComposioService:
"""Service for interacting with Composio API."""
def __init__(self, api_key: str | None = None):
# Default download directory for files from Composio
DEFAULT_DOWNLOAD_DIR = "/tmp/composio_downloads"
def __init__(
self, api_key: str | None = None, file_download_dir: str | None = None
):
"""
Initialize the Composio service.
Args:
api_key: Composio API key. If not provided, uses config.COMPOSIO_API_KEY.
file_download_dir: Directory for downloaded files. Defaults to /tmp/composio_downloads.
"""
import os
self.api_key = api_key or config.COMPOSIO_API_KEY
if not self.api_key:
raise ValueError("COMPOSIO_API_KEY is required but not configured")
self.client = Composio(api_key=self.api_key)
# Set up download directory
self.file_download_dir = file_download_dir or self.DEFAULT_DOWNLOAD_DIR
os.makedirs(self.file_download_dir, exist_ok=True)
# Initialize Composio client with download directory
# Per docs: file_download_dir configures where files are downloaded
self.client = Composio(
api_key=self.api_key, file_download_dir=self.file_download_dir
)
@staticmethod
def is_enabled() -> bool:
@ -252,7 +304,6 @@ class ComposioService:
}
)
logger.info(f"DEBUG: Found {len(result)} TOTAL connections in Composio")
return result
except Exception as e:
logger.error(f"Failed to list all connections: {e!s}")
@ -269,7 +320,6 @@ class ComposioService:
List of connected account details.
"""
try:
logger.info(f"DEBUG: Calling connected_accounts.list(user_id='{user_id}')")
accounts_response = self.client.connected_accounts.list(user_id=user_id)
# Handle paginated response (may have .items attribute) or direct list
@ -312,6 +362,30 @@ class ComposioService:
logger.error(f"Failed to list connections for user {user_id}: {e!s}")
return []
async def delete_connected_account(self, connected_account_id: str) -> bool:
"""
Delete a connected account from Composio.
This permanently removes the connected account and revokes access tokens.
Args:
connected_account_id: The Composio connected account ID to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
self.client.connected_accounts.delete(connected_account_id)
logger.info(
f"Successfully deleted Composio connected account: {connected_account_id}"
)
return True
except Exception as e:
logger.error(
f"Failed to delete Composio connected account {connected_account_id}: {e!s}"
)
return False
async def execute_tool(
self,
connected_account_id: str,
@ -338,7 +412,6 @@ class ComposioService:
# - connected_account_id: for authentication
# - user_id: user identifier (SDK uses user_id, not entity_id)
# - dangerously_skip_version_check: skip version check for manual execution
logger.info(f"DEBUG: Executing tool {tool_name} with params: {params}")
result = self.client.tools.execute(
slug=tool_name,
connected_account_id=connected_account_id,
@ -346,8 +419,6 @@ class ComposioService:
arguments=params or {},
dangerously_skip_version_check=True,
)
logger.info(f"DEBUG: Tool {tool_name} raw result type: {type(result)}")
logger.info(f"DEBUG: Tool {tool_name} raw result: {result}")
return {"success": True, "data": result}
except Exception as e:
logger.error(f"Failed to execute tool {tool_name}: {e!s}")
@ -382,7 +453,15 @@ class ComposioService:
"page_size": min(page_size, 100),
}
if folder_id:
params["folder_id"] = folder_id
# List contents of a specific folder (exclude shortcuts - we don't have access to them)
params["q"] = (
f"'{folder_id}' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'"
)
else:
# List root-level items only (My Drive root), exclude shortcuts
params["q"] = (
"'root' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'"
)
if page_token:
params["page_token"] = page_token
@ -397,9 +476,6 @@ class ComposioService:
return [], None, result.get("error", "Unknown error")
data = result.get("data", {})
logger.info(
f"DEBUG: Drive data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}"
)
# Handle nested response structure from Composio
files = []
@ -415,7 +491,6 @@ class ComposioService:
elif isinstance(data, list):
files = data
logger.info(f"DEBUG: Extracted {len(files)} drive files")
return files, next_token, None
except Exception as e:
@ -428,6 +503,10 @@ class ComposioService:
"""
Download file content from Google Drive via Composio.
Per Composio docs: When tools return files, they are automatically downloaded
to a local directory, and the local file path is provided in the response.
Response includes: file_path, file_name, size fields.
Args:
connected_account_id: Composio connected account ID.
entity_id: The entity/user ID that owns the connected account.
@ -436,27 +515,264 @@ class ComposioService:
Returns:
Tuple of (file content bytes, error message).
"""
from pathlib import Path
try:
result = await self.execute_tool(
connected_account_id=connected_account_id,
tool_name="GOOGLEDRIVE_DOWNLOAD_FILE",
params={"file_id": file_id}, # snake_case
params={"file_id": file_id},
entity_id=entity_id,
)
if not result.get("success"):
return None, result.get("error", "Unknown error")
content = result.get("data")
if isinstance(content, str):
content = content.encode("utf-8")
data = result.get("data")
if not data:
return None, "No data returned from Composio"
return content, None
# Per Composio docs, response includes file_path where file was downloaded
# Response structure: {data: {...}, error: ..., successful: ...}
# The actual file info is nested inside data["data"]
file_path = None
if isinstance(data, dict):
# Handle nested response structure: data contains {data, error, successful}
# The actual file info is in data["data"]
inner_data = data
if "data" in data and isinstance(data["data"], dict):
inner_data = data["data"]
logger.debug(
f"Found nested data structure. Inner keys: {list(inner_data.keys())}"
)
elif "successful" in data and "data" in data:
# Standard Composio response wrapper
inner_data = data["data"] if data["data"] else data
# Try documented fields: file_path, downloaded_file_content, path, uri
file_path = (
inner_data.get("file_path")
or inner_data.get("downloaded_file_content")
or inner_data.get("path")
or inner_data.get("uri")
)
# Handle nested dict case where downloaded_file_content contains the path
if isinstance(file_path, dict):
file_path = (
file_path.get("file_path")
or file_path.get("downloaded_file_content")
or file_path.get("path")
or file_path.get("uri")
)
# If still no path, check if inner_data itself has the nested structure
if not file_path and isinstance(inner_data, dict):
for key in ["downloaded_file_content", "file_path", "path", "uri"]:
if key in inner_data:
val = inner_data[key]
if isinstance(val, str):
file_path = val
break
elif isinstance(val, dict):
# One more level of nesting
file_path = (
val.get("file_path")
or val.get("downloaded_file_content")
or val.get("path")
or val.get("uri")
)
if file_path:
break
logger.debug(
f"Composio response keys: {list(data.keys())}, inner keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}, extracted path: {file_path}"
)
elif isinstance(data, str):
# Direct string response (could be path or content)
file_path = data
elif isinstance(data, bytes):
# Direct bytes response
return data, None
# Read file from the path
if file_path and isinstance(file_path, str):
path_obj = Path(file_path)
# Check if it's a valid file path (absolute or in .composio directory)
if path_obj.is_absolute() or ".composio" in str(path_obj):
try:
if path_obj.exists():
content = path_obj.read_bytes()
logger.info(
f"Successfully read {len(content)} bytes from Composio file: {file_path}"
)
return content, None
else:
logger.warning(
f"File path from Composio does not exist: {file_path}"
)
return None, f"File not found at path: {file_path}"
except Exception as e:
logger.error(
f"Failed to read file from Composio path {file_path}: {e!s}"
)
return None, f"Failed to read file: {e!s}"
else:
# Not a file path - might be base64 encoded content
try:
import base64
content = base64.b64decode(file_path)
return content, None
except Exception:
# Not base64, return as UTF-8 bytes
return file_path.encode("utf-8"), None
# If we got here, couldn't extract file path
if isinstance(data, dict):
# Log full structure for debugging
inner_data = data.get("data", {})
logger.warning(
f"Could not extract file path from Composio response. "
f"Top keys: {list(data.keys())}, "
f"Inner data keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else type(inner_data).__name__}, "
f"Full inner data: {inner_data}"
)
return (
None,
f"No file path in Composio response. Keys: {list(data.keys())}, inner: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}",
)
return None, f"Unexpected data type from Composio: {type(data).__name__}"
except Exception as e:
logger.error(f"Failed to get Drive file content: {e!s}")
return None, str(e)
async def get_drive_start_page_token(
self, connected_account_id: str, entity_id: str
) -> tuple[str | None, str | None]:
"""
Get the starting page token for Google Drive change tracking.
This token represents the current state and is used for future delta syncs.
Per Composio docs: Use GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN to get initial token.
Args:
connected_account_id: Composio connected account ID.
entity_id: The entity/user ID that owns the connected account.
Returns:
Tuple of (start_page_token, error message).
"""
try:
result = await self.execute_tool(
connected_account_id=connected_account_id,
tool_name="GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN",
params={},
entity_id=entity_id,
)
if not result.get("success"):
return None, result.get("error", "Unknown error")
data = result.get("data", {})
# Handle nested response: {data: {startPageToken: ...}, successful: ...}
if isinstance(data, dict):
inner_data = data.get("data", data)
token = (
inner_data.get("startPageToken")
or inner_data.get("start_page_token")
or data.get("startPageToken")
or data.get("start_page_token")
)
if token:
logger.info(f"Got Drive start page token: {token}")
return token, None
logger.warning(f"Could not extract start page token from response: {data}")
return None, "No start page token in response"
except Exception as e:
logger.error(f"Failed to get Drive start page token: {e!s}")
return None, str(e)
async def list_drive_changes(
self,
connected_account_id: str,
entity_id: str,
page_token: str | None = None,
page_size: int = 100,
include_removed: bool = True,
) -> tuple[list[dict[str, Any]], str | None, str | None]:
"""
List changes in Google Drive since the given page token.
Per Composio docs: GOOGLEDRIVE_LIST_CHANGES tracks modifications to files/folders.
If pageToken is not provided, it auto-fetches the current start page token.
Response includes nextPageToken for pagination and newStartPageToken for future syncs.
Args:
connected_account_id: Composio connected account ID.
entity_id: The entity/user ID that owns the connected account.
page_token: Page token from previous sync (optional - will auto-fetch if not provided).
page_size: Number of changes per page.
include_removed: Whether to include removed items in the response.
Returns:
Tuple of (changes list, new_start_page_token, error message).
"""
try:
params = {
"pageSize": min(page_size, 100),
"includeRemoved": include_removed,
}
if page_token:
params["pageToken"] = page_token
result = await self.execute_tool(
connected_account_id=connected_account_id,
tool_name="GOOGLEDRIVE_LIST_CHANGES",
params=params,
entity_id=entity_id,
)
if not result.get("success"):
return [], None, result.get("error", "Unknown error")
data = result.get("data", {})
# Handle nested response structure
changes = []
new_start_token = None
if isinstance(data, dict):
inner_data = data.get("data", data)
changes = inner_data.get("changes", []) or data.get("changes", [])
# Get the token for next sync
# newStartPageToken is returned when all changes have been fetched
# nextPageToken is for pagination within the current fetch
new_start_token = (
inner_data.get("newStartPageToken")
or inner_data.get("new_start_page_token")
or inner_data.get("nextPageToken")
or inner_data.get("next_page_token")
or data.get("newStartPageToken")
or data.get("nextPageToken")
)
logger.info(
f"Got {len(changes)} Drive changes, new token: {new_start_token[:20] if new_start_token else 'None'}..."
)
return changes, new_start_token, None
except Exception as e:
logger.error(f"Failed to list Drive changes: {e!s}")
return [], None, str(e)
# ===== Gmail specific methods =====
async def get_gmail_messages(
@ -464,25 +780,30 @@ class ComposioService:
connected_account_id: str,
entity_id: str,
query: str = "",
max_results: int = 100,
) -> tuple[list[dict[str, Any]], str | None]:
max_results: int = 50,
page_token: str | None = None,
) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]:
"""
List Gmail messages via Composio.
List Gmail messages via Composio with pagination support.
Args:
connected_account_id: Composio connected account ID.
entity_id: The entity/user ID that owns the connected account.
query: Gmail search query.
max_results: Maximum number of messages to return.
max_results: Maximum number of messages to return per page (default: 50 to avoid payload size issues).
page_token: Optional pagination token for next page.
Returns:
Tuple of (messages list, error message).
Tuple of (messages list, next_page_token, result_size_estimate, error message).
"""
try:
# Composio uses snake_case for parameters, max is 500
params = {"max_results": min(max_results, 500)}
# Use smaller batch size to avoid 413 payload too large errors
# Composio uses snake_case for parameters
params = {"max_results": min(max_results, 50)} # Reduced from 500 to 50
if query:
params["query"] = query # Composio uses 'query' not 'q'
if page_token:
params["page_token"] = page_token
result = await self.execute_tool(
connected_account_id=connected_account_id,
@ -492,31 +813,42 @@ class ComposioService:
)
if not result.get("success"):
return [], result.get("error", "Unknown error")
return [], None, result.get("error", "Unknown error")
data = result.get("data", {})
logger.info(
f"DEBUG: Gmail data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}"
)
logger.info(f"DEBUG: Gmail full data: {data}")
# Try different possible response structures
messages = []
next_token = None
result_size_estimate = None
if isinstance(data, dict):
messages = (
data.get("messages", [])
or data.get("data", {}).get("messages", [])
or data.get("emails", [])
)
# Check for pagination token in various possible locations
next_token = (
data.get("nextPageToken")
or data.get("next_page_token")
or data.get("data", {}).get("nextPageToken")
or data.get("data", {}).get("next_page_token")
)
# Extract resultSizeEstimate if available (Gmail API provides this)
result_size_estimate = (
data.get("resultSizeEstimate")
or data.get("result_size_estimate")
or data.get("data", {}).get("resultSizeEstimate")
or data.get("data", {}).get("result_size_estimate")
)
elif isinstance(data, list):
messages = data
logger.info(f"DEBUG: Extracted {len(messages)} messages")
return messages, None
return messages, next_token, result_size_estimate, None
except Exception as e:
logger.error(f"Failed to list Gmail messages: {e!s}")
return [], str(e)
return [], None, str(e)
async def get_gmail_message_detail(
self, connected_account_id: str, entity_id: str, message_id: str
@ -595,10 +927,6 @@ class ComposioService:
return [], result.get("error", "Unknown error")
data = result.get("data", {})
logger.info(
f"DEBUG: Calendar data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}"
)
logger.info(f"DEBUG: Calendar full data: {data}")
# Try different possible response structures
events = []
@ -611,7 +939,6 @@ class ComposioService:
elif isinstance(data, list):
events = data
logger.info(f"DEBUG: Extracted {len(events)} calendar events")
return events, None
except Exception as e:

View file

@ -2871,3 +2871,350 @@ class ConnectorService:
}
return result_object, obsidian_docs
# =========================================================================
# Composio Connector Search Methods
# =========================================================================
async def search_composio_google_drive(
self,
user_query: str,
search_space_id: int,
top_k: int = 20,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Composio Google Drive files and return both the source information
and langchain documents.
Uses combined chunk-level and document-level hybrid search with RRF fusion.
Args:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
"""
composio_drive_docs = await self._combined_rrf_search(
query_text=user_query,
search_space_id=search_space_id,
document_type="COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
top_k=top_k,
start_date=start_date,
end_date=end_date,
)
# Early return if no results
if not composio_drive_docs:
return {
"id": 54,
"name": "Google Drive (Composio)",
"type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
"sources": [],
}, []
def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
return (
doc_info.get("title")
or metadata.get("title")
or metadata.get("file_name")
or "Untitled Document"
)
def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
return metadata.get("url") or metadata.get("web_view_link") or ""
def _description_fn(
chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
) -> str:
description = self._chunk_preview(chunk.get("content", ""), limit=200)
info_parts = []
mime_type = metadata.get("mime_type")
modified_time = metadata.get("modified_time")
if mime_type:
info_parts.append(f"Type: {mime_type}")
if modified_time:
info_parts.append(f"Modified: {modified_time}")
if info_parts:
description = (description + " | " + " | ".join(info_parts)).strip(" |")
return description
def _extra_fields_fn(
_chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
) -> dict[str, Any]:
return {
"mime_type": metadata.get("mime_type", ""),
"file_id": metadata.get("file_id", ""),
"modified_time": metadata.get("modified_time", ""),
}
sources_list = self._build_chunk_sources_from_documents(
composio_drive_docs,
title_fn=_title_fn,
url_fn=_url_fn,
description_fn=_description_fn,
extra_fields_fn=_extra_fields_fn,
)
# Create result object
result_object = {
"id": 54,
"name": "Google Drive (Composio)",
"type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
"sources": sources_list,
}
return result_object, composio_drive_docs
async def search_composio_gmail(
self,
user_query: str,
search_space_id: int,
top_k: int = 20,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Composio Gmail messages and return both the source information
and langchain documents.
Uses combined chunk-level and document-level hybrid search with RRF fusion.
Args:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
"""
composio_gmail_docs = await self._combined_rrf_search(
query_text=user_query,
search_space_id=search_space_id,
document_type="COMPOSIO_GMAIL_CONNECTOR",
top_k=top_k,
start_date=start_date,
end_date=end_date,
)
# Early return if no results
if not composio_gmail_docs:
return {
"id": 55,
"name": "Gmail (Composio)",
"type": "COMPOSIO_GMAIL_CONNECTOR",
"sources": [],
}, []
def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
return (
doc_info.get("title")
or metadata.get("subject")
or metadata.get("title")
or "Untitled Email"
)
def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
return metadata.get("url") or ""
def _description_fn(
chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
) -> str:
description = self._chunk_preview(chunk.get("content", ""), limit=200)
info_parts = []
sender = metadata.get("from") or metadata.get("sender")
date = metadata.get("date") or metadata.get("received_at")
if sender:
info_parts.append(f"From: {sender}")
if date:
info_parts.append(f"Date: {date}")
if info_parts:
description = (description + " | " + " | ".join(info_parts)).strip(" |")
return description
def _extra_fields_fn(
_chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
) -> dict[str, Any]:
return {
"message_id": metadata.get("message_id", ""),
"thread_id": metadata.get("thread_id", ""),
"from": metadata.get("from", ""),
"to": metadata.get("to", ""),
"date": metadata.get("date", ""),
}
sources_list = self._build_chunk_sources_from_documents(
composio_gmail_docs,
title_fn=_title_fn,
url_fn=_url_fn,
description_fn=_description_fn,
extra_fields_fn=_extra_fields_fn,
)
# Create result object
result_object = {
"id": 55,
"name": "Gmail (Composio)",
"type": "COMPOSIO_GMAIL_CONNECTOR",
"sources": sources_list,
}
return result_object, composio_gmail_docs
async def search_composio_google_calendar(
self,
user_query: str,
search_space_id: int,
top_k: int = 20,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Composio Google Calendar events and return both the source information
and langchain documents.
Uses combined chunk-level and document-level hybrid search with RRF fusion.
Args:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
"""
composio_calendar_docs = await self._combined_rrf_search(
query_text=user_query,
search_space_id=search_space_id,
document_type="COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
top_k=top_k,
start_date=start_date,
end_date=end_date,
)
# Early return if no results
if not composio_calendar_docs:
return {
"id": 56,
"name": "Google Calendar (Composio)",
"type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
"sources": [],
}, []
def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
return (
doc_info.get("title")
or metadata.get("summary")
or metadata.get("title")
or "Untitled Event"
)
def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
return metadata.get("url") or metadata.get("html_link") or ""
def _description_fn(
chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
) -> str:
description = self._chunk_preview(chunk.get("content", ""), limit=200)
info_parts = []
start_time = metadata.get("start_time") or metadata.get("start")
end_time = metadata.get("end_time") or metadata.get("end")
if start_time:
info_parts.append(f"Start: {start_time}")
if end_time:
info_parts.append(f"End: {end_time}")
if info_parts:
description = (description + " | " + " | ".join(info_parts)).strip(" |")
return description
def _extra_fields_fn(
_chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
) -> dict[str, Any]:
return {
"event_id": metadata.get("event_id", ""),
"calendar_id": metadata.get("calendar_id", ""),
"start_time": metadata.get("start_time", ""),
"end_time": metadata.get("end_time", ""),
"location": metadata.get("location", ""),
}
sources_list = self._build_chunk_sources_from_documents(
composio_calendar_docs,
title_fn=_title_fn,
url_fn=_url_fn,
description_fn=_description_fn,
extra_fields_fn=_extra_fields_fn,
)
# Create result object
result_object = {
"id": 56,
"name": "Google Calendar (Composio)",
"type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
"sources": sources_list,
}
return result_object, composio_calendar_docs
# =========================================================================
# Utility Methods for Connector Discovery
# =========================================================================
async def get_available_connectors(
self,
search_space_id: int,
) -> list[SearchSourceConnectorType]:
"""
Get all available (enabled) connector types for a search space.
Args:
search_space_id: The search space ID
Returns:
List of SearchSourceConnectorType enums for enabled connectors
"""
query = (
select(SearchSourceConnector.connector_type)
.filter(
SearchSourceConnector.search_space_id == search_space_id,
)
.distinct()
)
result = await self.session.execute(query)
connector_types = result.scalars().all()
return list(connector_types)
async def get_available_document_types(
self,
search_space_id: int,
) -> list[str]:
"""
Get all document types that have at least one document in the search space.
Args:
search_space_id: The search space ID
Returns:
List of document type strings that have documents indexed
"""
from sqlalchemy import distinct
from app.db import Document
query = select(distinct(Document.document_type)).filter(
Document.search_space_id == search_space_id,
)
result = await self.session.execute(query)
doc_types = result.scalars().all()
return [str(dt) for dt in doc_types]

View file

@ -335,6 +335,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
notification: Notification,
indexed_count: int,
error_message: str | None = None,
is_warning: bool = False,
) -> Notification:
"""
Update notification when connector indexing completes.
@ -343,7 +344,8 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
session: Database session
notification: Notification to update
indexed_count: Total number of items indexed
error_message: Error message if indexing failed (optional)
error_message: Error message if indexing failed, or warning message (optional)
is_warning: If True, treat error_message as a warning (success case) rather than an error
Returns:
Updated notification
@ -352,10 +354,26 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
"connector_name", "Connector"
)
# If there's an error message but items were indexed, treat it as a warning (partial success)
# If is_warning is True, treat it as success even with 0 items (e.g., duplicates found)
# Otherwise, treat it as a failure
if error_message:
title = f"Failed: {connector_name}"
message = f"Sync failed: {error_message}"
status = "failed"
if indexed_count > 0:
# Partial success with warnings (e.g., duplicate content from other connectors)
title = f"Ready: {connector_name}"
item_text = "item" if indexed_count == 1 else "items"
message = f"Now searchable! {indexed_count} {item_text} synced. Note: {error_message}"
status = "completed"
elif is_warning:
# Warning case (e.g., duplicates found) - treat as success
title = f"Ready: {connector_name}"
message = f"Sync completed. {error_message}"
status = "completed"
else:
# Complete failure
title = f"Failed: {connector_name}"
message = f"Sync failed: {error_message}"
status = "failed"
else:
title = f"Ready: {connector_name}"
if indexed_count == 0:
@ -367,7 +385,9 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
metadata_updates = {
"indexed_count": indexed_count,
"sync_stage": "completed" if not error_message else "failed",
"sync_stage": "completed"
if (not error_message or is_warning or indexed_count > 0)
else "failed",
"error_message": error_message,
}

View file

@ -810,8 +810,8 @@ def index_composio_connector_task(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
start_date: str | None,
end_date: str | None,
):
"""Celery task to index Composio connector content (Google Drive, Gmail, Calendar via Composio)."""
import asyncio
@ -833,14 +833,16 @@ async def _index_composio_connector(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
start_date: str | None,
end_date: str | None,
):
"""Index Composio connector content with new session."""
# Import from tasks folder (not connector_indexers) to avoid circular import
from app.tasks.composio_indexer import index_composio_connector
"""Index Composio connector content with new session and real-time notifications."""
# Import from routes to use the notification-wrapped version
from app.routes.search_source_connectors_routes import (
run_composio_indexing,
)
async with get_celery_session_maker()() as session:
await index_composio_connector(
await run_composio_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)

View file

@ -66,6 +66,7 @@ async def _check_and_trigger_schedules():
from app.tasks.celery_tasks.connector_tasks import (
index_airtable_records_task,
index_clickup_tasks_task,
index_composio_connector_task,
index_confluence_pages_task,
index_crawled_urls_task,
index_discord_messages_task,
@ -98,6 +99,10 @@ async def _check_and_trigger_schedules():
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task,
# Composio connector types
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: index_composio_connector_task,
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: index_composio_connector_task,
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: index_composio_connector_task,
}
# Trigger indexing for each due connector

View file

@ -54,21 +54,68 @@ def format_attachments_as_context(attachments: list[ChatAttachment]) -> str:
def format_mentioned_documents_as_context(documents: list[Document]) -> str:
"""Format mentioned documents as context for the agent."""
"""
Format mentioned documents as context for the agent.
Uses the same XML structure as knowledge_base.format_documents_for_context
to ensure citations work properly with chunk IDs.
"""
if not documents:
return ""
context_parts = ["<mentioned_documents>"]
context_parts.append(
"The user has explicitly mentioned the following documents from their knowledge base. "
"These documents are directly relevant to the query and should be prioritized as primary sources."
"These documents are directly relevant to the query and should be prioritized as primary sources. "
"Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])."
)
for i, doc in enumerate(documents, 1):
context_parts.append(
f"<document index='{i}' id='{doc.id}' title='{doc.title}' type='{doc.document_type.value}'>"
context_parts.append("")
for doc in documents:
# Build metadata JSON
metadata = doc.document_metadata or {}
metadata_json = json.dumps(metadata, ensure_ascii=False)
# Get URL from metadata
url = (
metadata.get("url")
or metadata.get("source")
or metadata.get("page_url")
or ""
)
context_parts.append(f"<![CDATA[{doc.content}]]>")
context_parts.append("<document>")
context_parts.append("<document_metadata>")
context_parts.append(f" <document_id>{doc.id}</document_id>")
context_parts.append(
f" <document_type>{doc.document_type.value}</document_type>"
)
context_parts.append(f" <title><![CDATA[{doc.title}]]></title>")
context_parts.append(f" <url><![CDATA[{url}]]></url>")
context_parts.append(
f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
)
context_parts.append("</document_metadata>")
context_parts.append("")
context_parts.append("<document_content>")
# Use chunks if available (preferred for proper citations)
if hasattr(doc, "chunks") and doc.chunks:
for chunk in doc.chunks:
context_parts.append(
f" <chunk id='{chunk.id}'><![CDATA[{chunk.content}]]></chunk>"
)
else:
# Fallback to document content if chunks not loaded
# Use document ID as chunk ID prefix for consistency
context_parts.append(
f" <chunk id='{doc.id}'><![CDATA[{doc.content}]]></chunk>"
)
context_parts.append("</document_content>")
context_parts.append("</document>")
context_parts.append("")
context_parts.append("</mentioned_documents>")
return "\n".join(context_parts)
@ -81,8 +128,6 @@ def format_mentioned_surfsense_docs_as_context(
if not documents:
return ""
import json
context_parts = ["<mentioned_surfsense_docs>"]
context_parts.append(
"The user has explicitly mentioned the following SurfSense documentation pages. "
@ -263,11 +308,15 @@ async def stream_new_chat(
# Build input with message history from frontend
langchain_messages = []
# Fetch mentioned documents if any
# Fetch mentioned documents if any (with chunks for proper citations)
mentioned_documents: list[Document] = []
if mentioned_document_ids:
from sqlalchemy.orm import selectinload as doc_selectinload
result = await session.execute(
select(Document).filter(
select(Document)
.options(doc_selectinload(Document.chunks))
.filter(
Document.id.in_(mentioned_document_ids),
Document.search_space_id == search_space_id,
)

View file

@ -2,83 +2,76 @@
Composio connector indexer.
Routes indexing requests to toolkit-specific handlers (Google Drive, Gmail, Calendar).
Uses a registry pattern for clean, extensible connector routing.
Note: This module is intentionally placed in app/tasks/ (not in connector_indexers/)
to avoid circular import issues with the connector_indexers package.
"""
import logging
from datetime import UTC, datetime
from importlib import import_module
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.config import config
from app.connectors.composio_connector import ComposioConnector
from app.db import (
Document,
DocumentType,
SearchSourceConnector,
SearchSourceConnectorType,
)
from app.services.composio_service import INDEXABLE_TOOLKITS
from app.services.llm_service import get_user_long_context_llm
from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_INDEXER
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
# Set up logging
logger = logging.getLogger(__name__)
# ============ Utility functions (copied from connector_indexers.base to avoid circular imports) ============
# Valid Composio connector types
COMPOSIO_CONNECTOR_TYPES = {
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
}
def get_current_timestamp() -> datetime:
"""Get the current timestamp with timezone for updated_at field."""
return datetime.now(UTC)
async def check_document_by_unique_identifier(
session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
"""Check if a document with the given unique identifier hash already exists."""
existing_doc_result = await session.execute(
select(Document)
.options(selectinload(Document.chunks))
.where(Document.unique_identifier_hash == unique_identifier_hash)
)
return existing_doc_result.scalars().first()
# ============ Utility functions ============
async def get_connector_by_id(
session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType
session: AsyncSession,
connector_id: int,
connector_type: SearchSourceConnectorType | None,
) -> SearchSourceConnector | None:
"""Get a connector by ID and type from the database."""
result = await session.execute(
select(SearchSourceConnector).filter(
SearchSourceConnector.id == connector_id,
SearchSourceConnector.connector_type == connector_type,
)
"""Get a connector by ID and optionally by type from the database."""
query = select(SearchSourceConnector).filter(
SearchSourceConnector.id == connector_id
)
if connector_type is not None:
query = query.filter(SearchSourceConnector.connector_type == connector_type)
result = await session.execute(query)
return result.scalars().first()
async def update_connector_last_indexed(
session: AsyncSession,
connector: SearchSourceConnector,
update_last_indexed: bool = True,
) -> None:
"""Update the last_indexed_at timestamp for a connector."""
if update_last_indexed:
connector.last_indexed_at = datetime.now()
logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")
def get_indexer_function(toolkit_id: str):
"""
Dynamically import and return the indexer function for a toolkit.
Args:
toolkit_id: The toolkit ID (e.g., "googledrive", "gmail")
Returns:
Tuple of (indexer_function, supports_date_filter)
Raises:
ValueError: If toolkit not found in registry
"""
if toolkit_id not in TOOLKIT_TO_INDEXER:
raise ValueError(f"No indexer registered for toolkit: {toolkit_id}")
module_path, function_name, supports_date_filter = TOOLKIT_TO_INDEXER[toolkit_id]
module = import_module(module_path)
indexer_func = getattr(module, function_name)
return indexer_func, supports_date_filter
# ============ Main indexer function ============
@ -98,6 +91,7 @@ async def index_composio_connector(
Index content from a Composio connector.
Routes to toolkit-specific indexing based on the connector's toolkit_id.
Uses a registry pattern for clean, extensible connector routing.
Args:
session: Database session
@ -129,10 +123,16 @@ async def index_composio_connector(
)
try:
# Get connector by id
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.COMPOSIO_CONNECTOR
)
# Get connector by id - accept any Composio connector type
connector = await get_connector_by_id(session, connector_id, None)
# Validate it's a Composio connector
if connector and connector.connector_type not in COMPOSIO_CONNECTOR_TYPES:
error_msg = f"Connector {connector_id} is not a Composio connector"
await task_logger.log_task_failure(
log_entry, error_msg, {"error_type": "InvalidConnectorType"}
)
return 0, error_msg
if not connector:
error_msg = f"Composio connector with ID {connector_id} not found"
@ -160,53 +160,35 @@ async def index_composio_connector(
)
return 0, error_msg
# Route to toolkit-specific indexer
if toolkit_id == "googledrive":
return await _index_composio_google_drive(
session=session,
connector=connector,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
task_logger=task_logger,
log_entry=log_entry,
update_last_indexed=update_last_indexed,
max_items=max_items,
)
elif toolkit_id == "gmail":
return await _index_composio_gmail(
session=session,
connector=connector,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
task_logger=task_logger,
log_entry=log_entry,
update_last_indexed=update_last_indexed,
max_items=max_items,
)
elif toolkit_id == "googlecalendar":
return await _index_composio_google_calendar(
session=session,
connector=connector,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
task_logger=task_logger,
log_entry=log_entry,
update_last_indexed=update_last_indexed,
max_items=max_items,
)
else:
error_msg = f"No indexer implemented for toolkit: {toolkit_id}"
# Get indexer function from registry
try:
indexer_func, supports_date_filter = get_indexer_function(toolkit_id)
except ValueError as e:
await task_logger.log_task_failure(
log_entry, error_msg, {"error_type": "NoIndexerImplemented"}
log_entry, str(e), {"error_type": "NoIndexerImplemented"}
)
return 0, error_msg
return 0, str(e)
# Build kwargs for the indexer function
kwargs = {
"session": session,
"connector": connector,
"connector_id": connector_id,
"search_space_id": search_space_id,
"user_id": user_id,
"task_logger": task_logger,
"log_entry": log_entry,
"update_last_indexed": update_last_indexed,
"max_items": max_items,
}
# Add date params for toolkits that support them
if supports_date_filter:
kwargs["start_date"] = start_date
kwargs["end_date"] = end_date
# Call the toolkit-specific indexer
return await indexer_func(**kwargs)
except SQLAlchemyError as db_error:
await session.rollback()
@ -228,714 +210,3 @@ async def index_composio_connector(
)
logger.error(f"Failed to index Composio connector: {e!s}", exc_info=True)
return 0, f"Failed to index Composio connector: {e!s}"
async def _index_composio_google_drive(
session: AsyncSession,
connector,
connector_id: int,
search_space_id: int,
user_id: str,
task_logger: TaskLoggingService,
log_entry,
update_last_indexed: bool = True,
max_items: int = 1000,
) -> tuple[int, str]:
"""Index Google Drive files via Composio."""
try:
composio_connector = ComposioConnector(session, connector_id)
await task_logger.log_task_progress(
log_entry,
f"Fetching Google Drive files via Composio for connector {connector_id}",
{"stage": "fetching_files"},
)
# Fetch files
all_files = []
page_token = None
while len(all_files) < max_items:
files, next_token, error = await composio_connector.list_drive_files(
page_token=page_token,
page_size=min(100, max_items - len(all_files)),
)
if error:
await task_logger.log_task_failure(
log_entry, f"Failed to fetch Drive files: {error}", {}
)
return 0, f"Failed to fetch Drive files: {error}"
all_files.extend(files)
if not next_token:
break
page_token = next_token
if not all_files:
success_msg = "No Google Drive files found"
await task_logger.log_task_success(
log_entry, success_msg, {"files_count": 0}
)
return 0, success_msg
logger.info(f"Found {len(all_files)} Google Drive files to index via Composio")
documents_indexed = 0
documents_skipped = 0
for file_info in all_files:
try:
# Handle both standard Google API and potential Composio variations
file_id = file_info.get("id", "") or file_info.get("fileId", "")
file_name = (
file_info.get("name", "")
or file_info.get("fileName", "")
or "Untitled"
)
mime_type = file_info.get("mimeType", "") or file_info.get(
"mime_type", ""
)
if not file_id:
documents_skipped += 1
continue
# Skip folders
if mime_type == "application/vnd.google-apps.folder":
continue
# Generate unique identifier hash
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.COMPOSIO_CONNECTOR, f"drive_{file_id}", search_space_id
)
# Check if document exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Get file content
(
content,
content_error,
) = await composio_connector.get_drive_file_content(file_id)
if content_error or not content:
logger.warning(
f"Could not get content for file {file_name}: {content_error}"
)
# Use metadata as content fallback
markdown_content = f"# {file_name}\n\n"
markdown_content += f"**File ID:** {file_id}\n"
markdown_content += f"**Type:** {mime_type}\n"
else:
try:
markdown_content = content.decode("utf-8")
except UnicodeDecodeError:
markdown_content = f"# {file_name}\n\n[Binary file content]\n"
content_hash = generate_content_hash(markdown_content, search_space_id)
if existing_document:
if existing_document.content_hash == content_hash:
documents_skipped += 1
continue
# Update existing document
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"file_id": file_id,
"file_name": file_name,
"mime_type": mime_type,
"document_type": "Google Drive File (Composio)",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = (
f"Google Drive File: {file_name}\n\nType: {mime_type}"
)
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
existing_document.title = f"Drive: {file_name}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
"file_id": file_id,
"file_name": file_name,
"mime_type": mime_type,
"connector_id": connector_id,
"source": "composio",
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
continue
# Create new document
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"file_id": file_id,
"file_name": file_name,
"mime_type": mime_type,
"document_type": "Google Drive File (Composio)",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = (
f"Google Drive File: {file_name}\n\nType: {mime_type}"
)
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
document = Document(
search_space_id=search_space_id,
title=f"Drive: {file_name}",
document_type=DocumentType.COMPOSIO_CONNECTOR,
document_metadata={
"file_id": file_id,
"file_name": file_name,
"mime_type": mime_type,
"connector_id": connector_id,
"toolkit_id": "googledrive",
"source": "composio",
},
content=summary_content,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
updated_at=get_current_timestamp(),
)
session.add(document)
documents_indexed += 1
if documents_indexed % 10 == 0:
await session.commit()
except Exception as e:
logger.error(f"Error processing Drive file: {e!s}", exc_info=True)
documents_skipped += 1
continue
if documents_indexed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
await task_logger.log_task_success(
log_entry,
f"Successfully completed Google Drive indexing via Composio for connector {connector_id}",
{
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
},
)
return documents_indexed, None
except Exception as e:
logger.error(f"Failed to index Google Drive via Composio: {e!s}", exc_info=True)
return 0, f"Failed to index Google Drive via Composio: {e!s}"
async def _index_composio_gmail(
session: AsyncSession,
connector,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None,
end_date: str | None,
task_logger: TaskLoggingService,
log_entry,
update_last_indexed: bool = True,
max_items: int = 1000,
) -> tuple[int, str]:
"""Index Gmail messages via Composio."""
try:
composio_connector = ComposioConnector(session, connector_id)
await task_logger.log_task_progress(
log_entry,
f"Fetching Gmail messages via Composio for connector {connector_id}",
{"stage": "fetching_messages"},
)
# Build query with date range
query_parts = []
if start_date:
query_parts.append(f"after:{start_date.replace('-', '/')}")
if end_date:
query_parts.append(f"before:{end_date.replace('-', '/')}")
query = " ".join(query_parts)
messages, error = await composio_connector.list_gmail_messages(
query=query,
max_results=max_items,
)
if error:
await task_logger.log_task_failure(
log_entry, f"Failed to fetch Gmail messages: {error}", {}
)
return 0, f"Failed to fetch Gmail messages: {error}"
if not messages:
success_msg = "No Gmail messages found in the specified date range"
await task_logger.log_task_success(
log_entry, success_msg, {"messages_count": 0}
)
return 0, success_msg
logger.info(f"Found {len(messages)} Gmail messages to index via Composio")
documents_indexed = 0
documents_skipped = 0
for message in messages:
try:
# Composio uses 'messageId' (camelCase), not 'id'
message_id = message.get("messageId", "") or message.get("id", "")
if not message_id:
documents_skipped += 1
continue
# Composio's GMAIL_FETCH_EMAILS already returns full message content
# No need for a separate detail API call
# Extract message info from Composio response
# Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds
payload = message.get("payload", {})
headers = payload.get("headers", [])
subject = "No Subject"
sender = "Unknown Sender"
date_str = message.get("messageTimestamp", "Unknown Date")
for header in headers:
name = header.get("name", "").lower()
value = header.get("value", "")
if name == "subject":
subject = value
elif name == "from":
sender = value
elif name == "date":
date_str = value
# Format to markdown using the full message data
markdown_content = composio_connector.format_gmail_message_to_markdown(
message
)
# Generate unique identifier
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.COMPOSIO_CONNECTOR,
f"gmail_{message_id}",
search_space_id,
)
content_hash = generate_content_hash(markdown_content, search_space_id)
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Get label IDs from Composio response
label_ids = message.get("labelIds", [])
if existing_document:
if existing_document.content_hash == content_hash:
documents_skipped += 1
continue
# Update existing
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"message_id": message_id,
"subject": subject,
"sender": sender,
"document_type": "Gmail Message (Composio)",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = (
f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
)
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
existing_document.title = f"Gmail: {subject}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
"message_id": message_id,
"subject": subject,
"sender": sender,
"date": date_str,
"labels": label_ids,
"connector_id": connector_id,
"source": "composio",
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
continue
# Create new document
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"message_id": message_id,
"subject": subject,
"sender": sender,
"document_type": "Gmail Message (Composio)",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = (
f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
)
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
document = Document(
search_space_id=search_space_id,
title=f"Gmail: {subject}",
document_type=DocumentType.COMPOSIO_CONNECTOR,
document_metadata={
"message_id": message_id,
"subject": subject,
"sender": sender,
"date": date_str,
"labels": label_ids,
"connector_id": connector_id,
"toolkit_id": "gmail",
"source": "composio",
},
content=summary_content,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
updated_at=get_current_timestamp(),
)
session.add(document)
documents_indexed += 1
if documents_indexed % 10 == 0:
await session.commit()
except Exception as e:
logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
documents_skipped += 1
continue
if documents_indexed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
await task_logger.log_task_success(
log_entry,
f"Successfully completed Gmail indexing via Composio for connector {connector_id}",
{
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
},
)
return documents_indexed, None
except Exception as e:
logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True)
return 0, f"Failed to index Gmail via Composio: {e!s}"
async def _index_composio_google_calendar(
session: AsyncSession,
connector,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None,
end_date: str | None,
task_logger: TaskLoggingService,
log_entry,
update_last_indexed: bool = True,
max_items: int = 2500,
) -> tuple[int, str]:
"""Index Google Calendar events via Composio."""
from datetime import datetime, timedelta
try:
composio_connector = ComposioConnector(session, connector_id)
await task_logger.log_task_progress(
log_entry,
f"Fetching Google Calendar events via Composio for connector {connector_id}",
{"stage": "fetching_events"},
)
# Build time range
if start_date:
time_min = f"{start_date}T00:00:00Z"
else:
# Default to 365 days ago
default_start = datetime.now() - timedelta(days=365)
time_min = default_start.strftime("%Y-%m-%dT00:00:00Z")
if end_date:
time_max = f"{end_date}T23:59:59Z"
else:
time_max = datetime.now().strftime("%Y-%m-%dT23:59:59Z")
events, error = await composio_connector.list_calendar_events(
time_min=time_min,
time_max=time_max,
max_results=max_items,
)
if error:
await task_logger.log_task_failure(
log_entry, f"Failed to fetch Calendar events: {error}", {}
)
return 0, f"Failed to fetch Calendar events: {error}"
if not events:
success_msg = "No Google Calendar events found in the specified date range"
await task_logger.log_task_success(
log_entry, success_msg, {"events_count": 0}
)
return 0, success_msg
logger.info(f"Found {len(events)} Google Calendar events to index via Composio")
documents_indexed = 0
documents_skipped = 0
for event in events:
try:
# Handle both standard Google API and potential Composio variations
event_id = event.get("id", "") or event.get("eventId", "")
summary = (
event.get("summary", "") or event.get("title", "") or "No Title"
)
if not event_id:
documents_skipped += 1
continue
# Format to markdown
markdown_content = composio_connector.format_calendar_event_to_markdown(
event
)
# Generate unique identifier
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.COMPOSIO_CONNECTOR,
f"calendar_{event_id}",
search_space_id,
)
content_hash = generate_content_hash(markdown_content, search_space_id)
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Extract event times
start = event.get("start", {})
end = event.get("end", {})
start_time = start.get("dateTime") or start.get("date", "")
end_time = end.get("dateTime") or end.get("date", "")
location = event.get("location", "")
if existing_document:
if existing_document.content_hash == content_hash:
documents_skipped += 1
continue
# Update existing
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"document_type": "Google Calendar Event (Composio)",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
if location:
summary_content += f"\nLocation: {location}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
existing_document.title = f"Calendar: {summary}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"end_time": end_time,
"location": location,
"connector_id": connector_id,
"source": "composio",
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
continue
# Create new document
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"document_type": "Google Calendar Event (Composio)",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = (
f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
)
if location:
summary_content += f"\nLocation: {location}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
document = Document(
search_space_id=search_space_id,
title=f"Calendar: {summary}",
document_type=DocumentType.COMPOSIO_CONNECTOR,
document_metadata={
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"end_time": end_time,
"location": location,
"connector_id": connector_id,
"toolkit_id": "googlecalendar",
"source": "composio",
},
content=summary_content,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
updated_at=get_current_timestamp(),
)
session.add(document)
documents_indexed += 1
if documents_indexed % 10 == 0:
await session.commit()
except Exception as e:
logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
documents_skipped += 1
continue
if documents_indexed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
await task_logger.log_task_success(
log_entry,
f"Successfully completed Google Calendar indexing via Composio for connector {connector_id}",
{
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
},
)
return documents_indexed, None
except Exception as e:
logger.error(
f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True
)
return 0, f"Failed to index Google Calendar via Composio: {e!s}"

View file

@ -112,6 +112,13 @@ def calculate_date_range(
Returns:
Tuple of (start_date_str, end_date_str)
"""
# Normalize "undefined" strings to None (from frontend)
# This prevents parsing errors and ensures consistent behavior across all indexers
if start_date == "undefined" or start_date == "":
start_date = None
if end_date == "undefined" or end_date == "":
end_date = None
if start_date is not None and end_date is not None:
return start_date, end_date

View file

@ -136,10 +136,9 @@ async def index_bookstack_pages(
)
if error:
logger.error(f"Failed to get BookStack pages: {error}")
# Don't treat "No pages found" as an error that should stop indexing
if "No pages found" in error:
logger.info(f"No BookStack pages found: {error}")
logger.info(
"No pages found is not a critical error, continuing with update"
)
@ -159,6 +158,7 @@ async def index_bookstack_pages(
)
return 0, None
else:
logger.error(f"Failed to get BookStack pages: {error}")
await task_logger.log_task_failure(
log_entry,
f"Failed to get BookStack pages: {error}",

View file

@ -120,10 +120,9 @@ async def index_confluence_pages(
)
if error:
logger.error(f"Failed to get Confluence pages: {error}")
# Don't treat "No pages found" as an error that should stop indexing
if "No pages found" in error:
logger.info(f"No Confluence pages found: {error}")
logger.info(
"No pages found is not a critical error, continuing with update"
)
@ -147,6 +146,7 @@ async def index_confluence_pages(
await confluence_client.close()
return 0, None
else:
logger.error(f"Failed to get Confluence pages: {error}")
await task_logger.log_task_failure(
log_entry,
f"Failed to get Confluence pages: {error}",

View file

@ -4,6 +4,8 @@ Google Calendar connector indexer.
from datetime import datetime, timedelta
import pytz
from dateutil.parser import isoparse
from google.oauth2.credentials import Credentials
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@ -21,6 +23,7 @@ from app.utils.document_converters import (
from .base import (
check_document_by_unique_identifier,
check_duplicate_document_by_hash,
get_connector_by_id,
get_current_timestamp,
logger,
@ -206,6 +209,23 @@ async def index_google_calendar_events(
start_date_str = start_date
end_date_str = end_date
# If start_date and end_date are the same, adjust end_date to be one day later
# to ensure valid date range (start_date must be strictly before end_date)
if start_date_str == end_date_str:
# Parse the date and add one day to ensure valid range
dt = isoparse(end_date_str)
if dt.tzinfo is None:
dt = dt.replace(tzinfo=pytz.UTC)
else:
dt = dt.astimezone(pytz.UTC)
# Add one day to end_date to make it strictly after start_date
dt_end = dt + timedelta(days=1)
end_date_str = dt_end.strftime("%Y-%m-%d")
logger.info(
f"Adjusted end_date from {end_date} to {end_date_str} "
f"to ensure valid date range (start_date must be strictly before end_date)"
)
await task_logger.log_task_progress(
log_entry,
f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",
@ -223,10 +243,9 @@ async def index_google_calendar_events(
)
if error:
logger.error(f"Failed to get Google Calendar events: {error}")
# Don't treat "No events found" as an error that should stop indexing
if "No events found" in error:
logger.info(f"No Google Calendar events found: {error}")
logger.info(
"No events found is not a critical error, continuing with update"
)
@ -246,13 +265,25 @@ async def index_google_calendar_events(
)
return 0, None
else:
logger.error(f"Failed to get Google Calendar events: {error}")
# Check if this is an authentication error that requires re-authentication
error_message = error
error_type = "APIError"
if (
"re-authenticate" in error.lower()
or "expired or been revoked" in error.lower()
or "authentication failed" in error.lower()
):
error_message = "Google Calendar authentication failed. Please re-authenticate."
error_type = "AuthenticationError"
await task_logger.log_task_failure(
log_entry,
f"Failed to get Google Calendar events: {error}",
"API Error",
{"error_type": "APIError"},
error_message,
error,
{"error_type": error_type},
)
return 0, f"Failed to get Google Calendar events: {error}"
return 0, error_message
logger.info(f"Retrieved {len(events)} events from Google Calendar API")
@ -263,6 +294,9 @@ async def index_google_calendar_events(
documents_indexed = 0
documents_skipped = 0
skipped_events = []
duplicate_content_count = (
0 # Track events skipped due to duplicate content_hash
)
for event in events:
try:
@ -383,6 +417,27 @@ async def index_google_calendar_events(
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
if duplicate_by_content:
# A document with the same content already exists (likely from Composio connector)
logger.info(
f"Event {event_summary} already indexed by another connector "
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
)
duplicate_content_count += 1
documents_skipped += 1
skipped_events.append(
f"{event_summary} (already indexed by another connector)"
)
continue
# Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
@ -475,7 +530,28 @@ async def index_google_calendar_events(
logger.info(
f"Final commit: Total {documents_indexed} Google Calendar events processed"
)
await session.commit()
try:
await session.commit()
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same event was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
# Build warning message if duplicates were found
warning_message = None
if duplicate_content_count > 0:
warning_message = f"{duplicate_content_count} skipped (duplicate)"
await task_logger.log_task_success(
log_entry,
@ -484,14 +560,16 @@ async def index_google_calendar_events(
"events_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"duplicate_content_count": duplicate_content_count,
"skipped_events_count": len(skipped_events),
},
)
logger.info(
f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped"
f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
f"({duplicate_content_count} due to duplicate content from other connectors)"
)
return total_processed, None
return total_processed, warning_message
except SQLAlchemyError as db_error:
await session.rollback()

View file

@ -578,7 +578,7 @@ async def _check_rename_only_update(
- (True, message): Only filename changed, document was updated
- (False, None): Content changed or new file, needs full processing
"""
from sqlalchemy import select
from sqlalchemy import String, cast, select
from sqlalchemy.orm.attributes import flag_modified
from app.db import Document
@ -603,7 +603,8 @@ async def _check_rename_only_update(
select(Document).where(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
Document.document_metadata["google_drive_file_id"].astext == file_id,
cast(Document.document_metadata["google_drive_file_id"], String)
== file_id,
)
)
existing_document = result.scalar_one_or_none()
@ -755,7 +756,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
Handles both new (file_id-based) and legacy (filename-based) hash schemes.
"""
from sqlalchemy import select
from sqlalchemy import String, cast, select
from app.db import Document
@ -774,7 +775,8 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
select(Document).where(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
Document.document_metadata["google_drive_file_id"].astext == file_id,
cast(Document.document_metadata["google_drive_file_id"], String)
== file_id,
)
)
existing_document = result.scalar_one_or_none()

View file

@ -170,10 +170,21 @@ async def index_google_gmail_messages(
)
if error:
# Check if this is an authentication error that requires re-authentication
error_message = error
error_type = "APIError"
if (
"re-authenticate" in error.lower()
or "expired or been revoked" in error.lower()
or "authentication failed" in error.lower()
):
error_message = "Gmail authentication failed. Please re-authenticate."
error_type = "AuthenticationError"
await task_logger.log_task_failure(
log_entry, f"Failed to fetch messages: {error}", {}
log_entry, error_message, error, {"error_type": error_type}
)
return 0, f"Failed to fetch Gmail messages: {error}"
return 0, error_message
if not messages:
success_msg = "No Google gmail messages found in the specified date range"

View file

@ -126,10 +126,9 @@ async def index_jira_issues(
)
if error:
logger.error(f"Failed to get Jira issues: {error}")
# Don't treat "No issues found" as an error that should stop indexing
if "No issues found" in error:
logger.info(f"No Jira issues found: {error}")
logger.info(
"No issues found is not a critical error, continuing with update"
)
@ -149,6 +148,7 @@ async def index_jira_issues(
)
return 0, None
else:
logger.error(f"Failed to get Jira issues: {error}")
await task_logger.log_task_failure(
log_entry,
f"Failed to get Jira issues: {error}",

View file

@ -145,10 +145,9 @@ async def index_linear_issues(
)
if error:
logger.error(f"Failed to get Linear issues: {error}")
# Don't treat "No issues found" as an error that should stop indexing
if "No issues found" in error:
logger.info(f"No Linear issues found: {error}")
logger.info(
"No issues found is not a critical error, continuing with update"
)
@ -162,6 +161,7 @@ async def index_linear_issues(
)
return 0, None
else:
logger.error(f"Failed to get Linear issues: {error}")
return 0, f"Failed to get Linear issues: {error}"
logger.info(f"Retrieved {len(issues)} issues from Linear API")

View file

@ -116,6 +116,13 @@ async def index_luma_events(
luma_client = LumaConnector(api_key=api_key)
# Handle 'undefined' string from frontend (treat as None)
# This prevents "time data 'undefined' does not match format" errors
if start_date == "undefined" or start_date == "":
start_date = None
if end_date == "undefined" or end_date == "":
end_date = None
# Calculate date range
# For calendar connectors, allow future dates to index upcoming events
if start_date is None or end_date is None:
@ -172,10 +179,9 @@ async def index_luma_events(
)
if error:
logger.error(f"Failed to get Luma events: {error}")
# Don't treat "No events found" as an error that should stop indexing
if "No events found" in error or "no events" in error.lower():
logger.info(f"No Luma events found: {error}")
logger.info(
"No events found is not a critical error, continuing with update"
)
@ -195,6 +201,7 @@ async def index_luma_events(
)
return 0, None
else:
logger.error(f"Failed to get Luma events: {error}")
await task_logger.log_task_failure(
log_entry,
f"Failed to get Luma events: {error}",

View file

@ -28,6 +28,9 @@ BASE_NAME_FOR_TYPE = {
SearchSourceConnectorType.CONFLUENCE_CONNECTOR: "Confluence",
SearchSourceConnectorType.AIRTABLE_CONNECTOR: "Airtable",
SearchSourceConnectorType.MCP_CONNECTOR: "Model Context Protocol (MCP)",
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: "Gmail",
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Google Drive",
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Google Calendar",
}

View file

@ -1,6 +1,6 @@
[project]
name = "surf-new-backend"
version = "0.0.11"
version = "0.0.12"
description = "SurfSense Backend"
requires-python = ">=3.12"
dependencies = [

View file

@ -6545,7 +6545,7 @@ wheels = [
[[package]]
name = "surf-new-backend"
version = "0.0.11"
version = "0.0.12"
source = { editable = "." }
dependencies = [
{ name = "alembic" },