mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-22 21:28:12 +02:00
merge: upstream/dev with migration renumbering
This commit is contained in:
commit
a7145b2c63
176 changed files with 8791 additions and 3608 deletions
|
|
@ -19,6 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||
libxext6 \
|
||||
libxrender1 \
|
||||
dos2unix \
|
||||
git \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Update certificates and install SSL tools
|
||||
|
|
|
|||
|
|
@ -0,0 +1,95 @@
|
|||
"""Add Composio connector types to SearchSourceConnectorType and DocumentType enums
|
||||
|
||||
Revision ID: 79
|
||||
Revises: 78
|
||||
|
||||
This migration adds the Composio connector enum values to both:
|
||||
- searchsourceconnectortype (for connector type tracking)
|
||||
- documenttype (for document type tracking)
|
||||
|
||||
Composio is a managed OAuth integration service that allows connecting
|
||||
to various third-party services (Google Drive, Gmail, Calendar, etc.)
|
||||
without requiring separate OAuth app verification.
|
||||
|
||||
This migration adds three specific connector types:
|
||||
- COMPOSIO_GOOGLE_DRIVE_CONNECTOR
|
||||
- COMPOSIO_GMAIL_CONNECTOR
|
||||
- COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "79"
|
||||
down_revision: str | None = "78"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
# Define the ENUM type names and the new values
|
||||
CONNECTOR_ENUM = "searchsourceconnectortype"
|
||||
CONNECTOR_NEW_VALUES = [
|
||||
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
|
||||
"COMPOSIO_GMAIL_CONNECTOR",
|
||||
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
|
||||
]
|
||||
DOCUMENT_ENUM = "documenttype"
|
||||
DOCUMENT_NEW_VALUES = [
|
||||
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
|
||||
"COMPOSIO_GMAIL_CONNECTOR",
|
||||
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
|
||||
]
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the Composio labels to the connector and document enum types.

    For every (enum type, label) pair one PL/pgSQL ``DO`` block is executed.
    The block looks the label up in ``pg_enum`` and runs
    ``ALTER TYPE ... ADD VALUE`` only when the label is not there yet, so
    labels that already exist are left alone.
    """
    # Connector enum labels are handled first, then document enum labels.
    targets = (
        (CONNECTOR_ENUM, CONNECTOR_NEW_VALUES),
        (DOCUMENT_ENUM, DOCUMENT_NEW_VALUES),
    )

    for enum_name, labels in targets:
        for label in labels:
            op.execute(
                f"""
            DO $$
            BEGIN
                IF NOT EXISTS (
                    SELECT 1 FROM pg_enum e
                    JOIN pg_type t ON e.enumtypid = t.oid
                    WHERE t.typname = '{enum_name}' AND e.enumlabel = '{label}'
                ) THEN
                    ALTER TYPE {enum_name} ADD VALUE '{label}';
                END IF;
            END$$;
            """
            )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Leave the schema untouched (intentional no-op).

    PostgreSQL has no statement that removes a single value from an enum
    type. Reverting this migration by hand would require:

    1. Deleting every row that uses one of the Composio connector type values
    2. Creating replacement enums that omit the Composio connector types
    3. Altering the affected columns to use the replacement enums
    4. Dropping the original enums

    That procedure is complex and typically not needed in practice, so the
    added enum values simply stay in place.
    """
    return None
|
||||
|
|
@ -0,0 +1,97 @@
|
|||
"""Add user incentive tasks table for earning free pages
|
||||
|
||||
Revision ID: 80
|
||||
Revises: 79
|
||||
|
||||
Changes:
|
||||
1. Create incentive_task_type enum with GITHUB_STAR value
|
||||
2. Create user_incentive_tasks table to track completed tasks
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "80"
|
||||
down_revision: str | None = "79"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the ``incentivetasktype`` enum and the ``user_incentive_tasks`` table.

    Each object is looked up in the database catalog first and created only
    when it is absent, so re-running after a partially applied migration does
    not fail on an object that already exists.
    """
    bind = op.get_bind()

    def _row_found(sql: str) -> bool:
        """Return True when *sql* yields at least one row on the migration connection."""
        return bind.execute(sa.text(sql)).fetchone() is not None

    # Step 1: the enum type. It is created explicitly here because every ENUM
    # in this migration is declared with create_type=False.
    if not _row_found("SELECT 1 FROM pg_type WHERE typname = 'incentivetasktype'"):
        postgresql.ENUM(
            "GITHUB_STAR",
            name="incentivetasktype",
            create_type=False,
        ).create(bind, checkfirst=True)

    # Step 2: the table. Nothing more to do when it is already there.
    if _row_found(
        "SELECT 1 FROM information_schema.tables WHERE table_name = 'user_incentive_tasks'"
    ):
        return

    op.create_table(
        "user_incentive_tasks",
        sa.Column("id", sa.Integer(), primary_key=True, index=True),
        sa.Column(
            "user_id",
            sa.UUID(as_uuid=True),
            sa.ForeignKey("user.id", ondelete="CASCADE"),
            nullable=False,
            index=True,
        ),
        sa.Column(
            "task_type",
            postgresql.ENUM("GITHUB_STAR", name="incentivetasktype", create_type=False),
            nullable=False,
            index=True,
        ),
        sa.Column("pages_awarded", sa.Integer(), nullable=False),
        sa.Column(
            "completed_at",
            sa.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.func.now(),
        ),
        sa.Column(
            "created_at",
            sa.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.func.now(),
            index=True,
        ),
        # At most one row per (user, task type).
        sa.UniqueConstraint("user_id", "task_type", name="uq_user_incentive_task"),
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the ``user_incentive_tasks`` table, then the ``incentivetasktype`` enum."""
    # The table goes first: its task_type column is declared with the enum.
    op.drop_table("user_incentive_tasks")

    # checkfirst=True makes the enum drop a no-op when the type is already gone.
    task_type_enum = postgresql.ENUM(name="incentivetasktype")
    task_type_enum.drop(op.get_bind(), checkfirst=True)
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
"""Add public sharing columns to new_chat_threads
|
||||
|
||||
Revision ID: 79
|
||||
Revises: 78
|
||||
Revision ID: 81
|
||||
Revises: 80
|
||||
Create Date: 2026-01-23
|
||||
|
||||
Adds public_share_token and public_share_enabled columns to enable
|
||||
|
|
@ -13,8 +13,8 @@ from collections.abc import Sequence
|
|||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "79"
|
||||
down_revision: str | None = "78"
|
||||
revision: str = "81"
|
||||
down_revision: str | None = "80"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
"""Add thread_id to podcasts
|
||||
|
||||
Revision ID: 80
|
||||
Revises: 79
|
||||
Revision ID: 82
|
||||
Revises: 81
|
||||
Create Date: 2026-01-23
|
||||
|
||||
"""
|
||||
|
|
@ -10,8 +10,8 @@ from collections.abc import Sequence
|
|||
|
||||
from alembic import op
|
||||
|
||||
revision: str = "80"
|
||||
down_revision: str | None = "79"
|
||||
revision: str = "82"
|
||||
down_revision: str | None = "81"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
|
@ -7,6 +7,7 @@ via NewLLMConfig.
|
|||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
from typing import Any
|
||||
|
||||
from deepagents import create_deep_agent
|
||||
from langchain_core.tools import BaseTool
|
||||
|
|
@ -23,6 +24,90 @@ from app.agents.new_chat.system_prompt import (
|
|||
from app.agents.new_chat.tools.registry import build_tools_async
|
||||
from app.services.connector_service import ConnectorService
|
||||
|
||||
# =============================================================================
|
||||
# Connector Type Mapping
|
||||
# =============================================================================
|
||||
|
||||
# Lookup from a SearchSourceConnectorType value (as a string) to the
# connector/document type string that the knowledge_base tool searches on.
# Most entries map to themselves; the exceptions are marked inline.
_CONNECTOR_TYPE_TO_SEARCHABLE: dict[str, str] = {
    # Search APIs
    "TAVILY_API": "TAVILY_API",
    "SEARXNG_API": "SEARXNG_API",
    "LINKUP_API": "LINKUP_API",
    "BAIDU_SEARCH_API": "BAIDU_SEARCH_API",
    # Connectors whose searchable type equals the connector type
    "SLACK_CONNECTOR": "SLACK_CONNECTOR",
    "TEAMS_CONNECTOR": "TEAMS_CONNECTOR",
    "NOTION_CONNECTOR": "NOTION_CONNECTOR",
    "GITHUB_CONNECTOR": "GITHUB_CONNECTOR",
    "LINEAR_CONNECTOR": "LINEAR_CONNECTOR",
    "DISCORD_CONNECTOR": "DISCORD_CONNECTOR",
    "JIRA_CONNECTOR": "JIRA_CONNECTOR",
    "CONFLUENCE_CONNECTOR": "CONFLUENCE_CONNECTOR",
    "CLICKUP_CONNECTOR": "CLICKUP_CONNECTOR",
    "GOOGLE_CALENDAR_CONNECTOR": "GOOGLE_CALENDAR_CONNECTOR",
    "GOOGLE_GMAIL_CONNECTOR": "GOOGLE_GMAIL_CONNECTOR",
    "GOOGLE_DRIVE_CONNECTOR": "GOOGLE_DRIVE_FILE",  # searched under its document type
    "AIRTABLE_CONNECTOR": "AIRTABLE_CONNECTOR",
    "LUMA_CONNECTOR": "LUMA_CONNECTOR",
    "ELASTICSEARCH_CONNECTOR": "ELASTICSEARCH_CONNECTOR",
    "WEBCRAWLER_CONNECTOR": "CRAWLED_URL",  # searched under its document type
    "BOOKSTACK_CONNECTOR": "BOOKSTACK_CONNECTOR",
    "CIRCLEBACK_CONNECTOR": "CIRCLEBACK",  # searched under its document type
    "OBSIDIAN_CONNECTOR": "OBSIDIAN_CONNECTOR",
    # Composio-managed connectors
    "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
    "COMPOSIO_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR",
    "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
}

# Document types with no SearchSourceConnector behind them; they are treated
# as searchable regardless of which connectors are configured.
_ALWAYS_AVAILABLE_DOC_TYPES: list[str] = [
    "EXTENSION",  # browser extension data
    "FILE",  # uploaded files
    "NOTE",  # user notes
    "YOUTUBE_VIDEO",  # YouTube videos
]
|
||||
|
||||
|
||||
def _map_connectors_to_searchable_types(
    connector_types: list[Any],
) -> list[str]:
    """
    Translate connector types into the type strings used for searching.

    The result starts with the always-available document types
    (EXTENSION, FILE, NOTE, YOUTUBE_VIDEO), followed by the searchable
    counterpart of each given connector type in input order. Connector types
    with no entry in the mapping table are dropped, and every string appears
    only once, at the position where it was first seen.

    Args:
        connector_types: SearchSourceConnectorType enum members; plain values
            are accepted too and are converted with ``str()``.

    Returns:
        Ordered, duplicate-free list of searchable connector/document types.
    """
    # Dict keys give ordered de-duplication in a single structure.
    ordered: dict[str, None] = dict.fromkeys(_ALWAYS_AVAILABLE_DOC_TYPES)

    for connector_type in connector_types:
        # Enum members carry the string in .value; anything else is stringified.
        if hasattr(connector_type, "value"):
            key = connector_type.value
        else:
            key = str(connector_type)

        target = _CONNECTOR_TYPE_TO_SEARCHABLE.get(key)
        if target:
            ordered.setdefault(target, None)

    return list(ordered)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Deep Agent Factory
|
||||
# =============================================================================
|
||||
|
|
@ -117,6 +202,30 @@ async def create_surfsense_deep_agent(
|
|||
additional_tools=[my_custom_tool]
|
||||
)
|
||||
"""
|
||||
# Discover available connectors and document types for this search space
|
||||
# This enables dynamic tool docstrings that inform the LLM about what's actually available
|
||||
available_connectors: list[str] | None = None
|
||||
available_document_types: list[str] | None = None
|
||||
|
||||
try:
|
||||
# Get enabled search source connectors for this search space
|
||||
connector_types = await connector_service.get_available_connectors(
|
||||
search_space_id
|
||||
)
|
||||
if connector_types:
|
||||
# Convert enum values to strings and also include mapped document types
|
||||
available_connectors = _map_connectors_to_searchable_types(connector_types)
|
||||
|
||||
# Get document types that have at least one document indexed
|
||||
available_document_types = await connector_service.get_available_document_types(
|
||||
search_space_id
|
||||
)
|
||||
except Exception as e:
|
||||
# Log but don't fail - fall back to all connectors if discovery fails
|
||||
import logging
|
||||
|
||||
logging.warning(f"Failed to discover available connectors/document types: {e}")
|
||||
|
||||
# Build dependencies dict for the tools registry
|
||||
dependencies = {
|
||||
"search_space_id": search_space_id,
|
||||
|
|
@ -125,6 +234,9 @@ async def create_surfsense_deep_agent(
|
|||
"firecrawl_api_key": firecrawl_api_key,
|
||||
"user_id": user_id, # Required for memory tools
|
||||
"thread_id": thread_id, # For podcast tool
|
||||
# Dynamic connector/document type discovery for knowledge base tool
|
||||
"available_connectors": available_connectors,
|
||||
"available_document_types": available_document_types,
|
||||
}
|
||||
|
||||
# Build tools using the async registry (includes MCP tools)
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ Available tools:
|
|||
# Tool factory exports (for direct use)
|
||||
from .display_image import create_display_image_tool
|
||||
from .knowledge_base import (
|
||||
CONNECTOR_DESCRIPTIONS,
|
||||
create_search_knowledge_base_tool,
|
||||
format_documents_for_context,
|
||||
search_knowledge_base_async,
|
||||
|
|
@ -40,6 +41,8 @@ from .user_memory import create_recall_memory_tool, create_save_memory_tool
|
|||
__all__ = [
|
||||
# Registry
|
||||
"BUILTIN_TOOLS",
|
||||
# Knowledge base utilities
|
||||
"CONNECTOR_DESCRIPTIONS",
|
||||
"ToolDefinition",
|
||||
"build_tools",
|
||||
# Tool factories
|
||||
|
|
@ -51,7 +54,6 @@ __all__ = [
|
|||
"create_scrape_webpage_tool",
|
||||
"create_search_knowledge_base_tool",
|
||||
"create_search_surfsense_docs_tool",
|
||||
# Knowledge base utilities
|
||||
"format_documents_for_context",
|
||||
"get_all_tool_names",
|
||||
"get_default_enabled_tools",
|
||||
|
|
|
|||
|
|
@ -12,7 +12,8 @@ import json
|
|||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from langchain_core.tools import tool
|
||||
from langchain_core.tools import StructuredTool
|
||||
from pydantic import BaseModel, Field
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.services.connector_service import ConnectorService
|
||||
|
|
@ -22,6 +23,7 @@ from app.services.connector_service import ConnectorService
|
|||
# =============================================================================
|
||||
|
||||
# Canonical connector values used internally by ConnectorService
|
||||
# Includes all document types and search source connectors
|
||||
_ALL_CONNECTORS: list[str] = [
|
||||
"EXTENSION",
|
||||
"FILE",
|
||||
|
|
@ -50,41 +52,117 @@ _ALL_CONNECTORS: list[str] = [
|
|||
"CRAWLED_URL",
|
||||
"CIRCLEBACK",
|
||||
"OBSIDIAN_CONNECTOR",
|
||||
# Composio connectors
|
||||
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
|
||||
"COMPOSIO_GMAIL_CONNECTOR",
|
||||
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
|
||||
]
|
||||
|
||||
# One human-readable sentence per connector/document type. These strings are
# interpolated into the dynamically generated tool description, so they are
# written for the LLM that picks `connectors_to_search`.
CONNECTOR_DESCRIPTIONS: dict[str, str] = {
    "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)",
    "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)",
    "NOTE": "SurfSense Notes (notes created inside SurfSense)",
    "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)",
    "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)",
    "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)",
    "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)",
    "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)",
    "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)",
    "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)",
    "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)",
    "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)",
    "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)",
    "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)",
    "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)",
    "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)",
    "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)",
    "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)",
    "TAVILY_API": "Tavily web search API results (real-time web search)",
    "SEARXNG_API": "SearxNG search API results (privacy-focused web search)",
    "LINKUP_API": "Linkup search API results (web search)",
    "BAIDU_SEARCH_API": "Baidu search API results (Chinese web search)",
    "LUMA_CONNECTOR": "Luma events and meetings",
    # The alias and its canonical document type share one description.
    "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)",
    "CRAWLED_URL": "Webpages indexed by SurfSense (personally selected websites)",
    "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)",
    "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items",
    "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)",
    # Composio-managed connectors
    "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "Google Drive files via Composio (personal cloud storage)",
    "COMPOSIO_GMAIL_CONNECTOR": "Gmail emails via Composio (personal emails)",
    "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events via Composio (personal calendar)",
}
|
||||
|
||||
def _normalize_connectors(connectors_to_search: list[str] | None) -> list[str]:
|
||||
|
||||
def _normalize_connectors(
|
||||
connectors_to_search: list[str] | None,
|
||||
available_connectors: list[str] | None = None,
|
||||
) -> list[str]:
|
||||
"""
|
||||
Normalize connectors provided by the model.
|
||||
|
||||
- Accepts user-facing enums like WEBCRAWLER_CONNECTOR and maps them to canonical
|
||||
ConnectorService types.
|
||||
- Drops unknown values.
|
||||
- If None/empty, defaults to searching across all known connectors.
|
||||
- If available_connectors is provided, only includes connectors from that list.
|
||||
- If connectors_to_search is None/empty, defaults to available_connectors or all.
|
||||
|
||||
Args:
|
||||
connectors_to_search: List of connectors requested by the model
|
||||
available_connectors: List of connectors actually available in the search space
|
||||
|
||||
Returns:
|
||||
List of normalized connector strings to search
|
||||
"""
|
||||
# Determine the set of valid connectors to consider
|
||||
valid_set = (
|
||||
set(available_connectors) if available_connectors else set(_ALL_CONNECTORS)
|
||||
)
|
||||
|
||||
if not connectors_to_search:
|
||||
return list(_ALL_CONNECTORS)
|
||||
# Search all available connectors if none specified
|
||||
return (
|
||||
list(available_connectors)
|
||||
if available_connectors
|
||||
else list(_ALL_CONNECTORS)
|
||||
)
|
||||
|
||||
normalized: list[str] = []
|
||||
for raw in connectors_to_search:
|
||||
c = (raw or "").strip().upper()
|
||||
if not c:
|
||||
continue
|
||||
# Map user-facing aliases to canonical names
|
||||
if c == "WEBCRAWLER_CONNECTOR":
|
||||
c = "CRAWLED_URL"
|
||||
normalized.append(c)
|
||||
|
||||
# de-dupe while preserving order + filter unknown
|
||||
# de-dupe while preserving order + filter to valid connectors
|
||||
seen: set[str] = set()
|
||||
out: list[str] = []
|
||||
for c in normalized:
|
||||
if c in seen:
|
||||
continue
|
||||
# Only include if it's a known connector AND available
|
||||
if c not in _ALL_CONNECTORS:
|
||||
continue
|
||||
if c not in valid_set:
|
||||
continue
|
||||
seen.add(c)
|
||||
out.append(c)
|
||||
return out if out else list(_ALL_CONNECTORS)
|
||||
|
||||
# Fallback to all available if nothing matched
|
||||
return (
|
||||
out
|
||||
if out
|
||||
else (
|
||||
list(available_connectors)
|
||||
if available_connectors
|
||||
else list(_ALL_CONNECTORS)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
|
@ -233,6 +311,7 @@ async def search_knowledge_base_async(
|
|||
top_k: int = 10,
|
||||
start_date: datetime | None = None,
|
||||
end_date: datetime | None = None,
|
||||
available_connectors: list[str] | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
Search the user's knowledge base for relevant documents.
|
||||
|
|
@ -248,6 +327,8 @@ async def search_knowledge_base_async(
|
|||
top_k: Number of results per connector
|
||||
start_date: Optional start datetime (UTC) for filtering documents
|
||||
end_date: Optional end datetime (UTC) for filtering documents
|
||||
available_connectors: Optional list of connectors actually available in the search space.
|
||||
If provided, only these connectors will be searched.
|
||||
|
||||
Returns:
|
||||
Formatted string with search results
|
||||
|
|
@ -262,7 +343,7 @@ async def search_knowledge_base_async(
|
|||
end_date=end_date,
|
||||
)
|
||||
|
||||
connectors = _normalize_connectors(connectors_to_search)
|
||||
connectors = _normalize_connectors(connectors_to_search, available_connectors)
|
||||
|
||||
for connector in connectors:
|
||||
try:
|
||||
|
|
@ -316,6 +397,16 @@ async def search_knowledge_base_async(
|
|||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
elif connector == "TEAMS_CONNECTOR":
|
||||
_, chunks = await connector_service.search_teams(
|
||||
user_query=query,
|
||||
search_space_id=search_space_id,
|
||||
top_k=top_k,
|
||||
start_date=resolved_start_date,
|
||||
end_date=resolved_end_date,
|
||||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
elif connector == "NOTION_CONNECTOR":
|
||||
_, chunks = await connector_service.search_notion(
|
||||
user_query=query,
|
||||
|
|
@ -519,6 +610,39 @@ async def search_knowledge_base_async(
|
|||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
# =========================================================
|
||||
# Composio Connectors
|
||||
# =========================================================
|
||||
elif connector == "COMPOSIO_GOOGLE_DRIVE_CONNECTOR":
|
||||
_, chunks = await connector_service.search_composio_google_drive(
|
||||
user_query=query,
|
||||
search_space_id=search_space_id,
|
||||
top_k=top_k,
|
||||
start_date=resolved_start_date,
|
||||
end_date=resolved_end_date,
|
||||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
elif connector == "COMPOSIO_GMAIL_CONNECTOR":
|
||||
_, chunks = await connector_service.search_composio_gmail(
|
||||
user_query=query,
|
||||
search_space_id=search_space_id,
|
||||
top_k=top_k,
|
||||
start_date=resolved_start_date,
|
||||
end_date=resolved_end_date,
|
||||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
elif connector == "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR":
|
||||
_, chunks = await connector_service.search_composio_google_calendar(
|
||||
user_query=query,
|
||||
search_space_id=search_space_id,
|
||||
top_k=top_k,
|
||||
start_date=resolved_start_date,
|
||||
end_date=resolved_end_date,
|
||||
)
|
||||
all_documents.extend(chunks)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error searching connector {connector}: {e}")
|
||||
continue
|
||||
|
|
@ -543,11 +667,68 @@ async def search_knowledge_base_async(
|
|||
return format_documents_for_context(deduplicated)
|
||||
|
||||
|
||||
def _build_connector_docstring(available_connectors: list[str] | None) -> str:
    """
    Render the connector list that is embedded in the tool description.

    One ``- NAME: description`` line is produced per connector. The canonical
    type CRAWLED_URL is listed under its user-facing alias
    WEBCRAWLER_CONNECTOR. A connector without an entry in
    ``CONNECTOR_DESCRIPTIONS`` uses its own name as the description.

    Args:
        available_connectors: Connector types to list; ``None`` or an empty
            list means every connector in ``_ALL_CONNECTORS``.

    Returns:
        The lines joined with newlines.
    """
    names = available_connectors or list(_ALL_CONNECTORS)

    def _render(name: str) -> str:
        """Format one connector as a bullet line, applying the web crawler alias."""
        label = "WEBCRAWLER_CONNECTOR" if name == "CRAWLED_URL" else name
        return f"- {label}: {CONNECTOR_DESCRIPTIONS.get(name, name)}"

    return "\n".join(_render(name) for name in names)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tool Input Schema
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class SearchKnowledgeBaseInput(BaseModel):
    """Argument schema of the ``search_knowledge_base`` tool.

    Only ``query`` is required. Dates are accepted as ISO strings here, not
    as ``datetime`` objects.
    """

    # Free-text search query.
    query: str = Field(description="The search query - be specific and include key terms")

    # Result count.
    top_k: int = Field(
        default=10,
        description="Number of results to retrieve (default: 10)",
    )

    # Optional date window, both bounds given as ISO strings.
    start_date: str | None = Field(
        default=None,
        description="Optional ISO date/datetime (e.g. '2025-12-12' or '2025-12-12T00:00:00+00:00')",
    )
    end_date: str | None = Field(
        default=None,
        description="Optional ISO date/datetime (e.g. '2025-12-19' or '2025-12-19T23:59:59+00:00')",
    )

    # Optional restriction to specific connector enums.
    connectors_to_search: list[str] | None = Field(
        default=None,
        description="Optional list of connector enums to search. If omitted, searches all available.",
    )
|
||||
|
||||
|
||||
def create_search_knowledge_base_tool(
|
||||
search_space_id: int,
|
||||
db_session: AsyncSession,
|
||||
connector_service: ConnectorService,
|
||||
):
|
||||
available_connectors: list[str] | None = None,
|
||||
available_document_types: list[str] | None = None,
|
||||
) -> StructuredTool:
|
||||
"""
|
||||
Factory function to create the search_knowledge_base tool with injected dependencies.
|
||||
|
||||
|
|
@ -555,72 +736,57 @@ def create_search_knowledge_base_tool(
|
|||
search_space_id: The user's search space ID
|
||||
db_session: Database session
|
||||
connector_service: Initialized connector service
|
||||
available_connectors: Optional list of connector types available in the search space.
|
||||
Used to dynamically generate the tool docstring.
|
||||
available_document_types: Optional list of document types that have data in the search space.
|
||||
Used to inform the LLM about what data exists.
|
||||
|
||||
Returns:
|
||||
A configured tool function
|
||||
A configured StructuredTool instance
|
||||
"""
|
||||
# Build connector documentation dynamically
|
||||
connector_docs = _build_connector_docstring(available_connectors)
|
||||
|
||||
@tool
|
||||
async def search_knowledge_base(
|
||||
# Build context about available document types
|
||||
doc_types_info = ""
|
||||
if available_document_types:
|
||||
doc_types_info = f"""
|
||||
|
||||
## Document types with indexed content in this search space
|
||||
|
||||
The following document types have content available for search:
|
||||
{", ".join(available_document_types)}
|
||||
|
||||
Focus searches on these types for best results."""
|
||||
|
||||
# Build the dynamic description for the tool
|
||||
# This is what the LLM sees when deciding whether/how to use the tool
|
||||
dynamic_description = f"""Search the user's personal knowledge base for relevant information.
|
||||
|
||||
Use this tool to find documents, notes, files, web pages, and other content that may help answer the user's question.
|
||||
|
||||
IMPORTANT:
|
||||
- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
|
||||
- If `connectors_to_search` is omitted/empty, the system will search broadly.
|
||||
- Only connectors that are enabled/configured for this search space are available.{doc_types_info}
|
||||
|
||||
## Available connector enums for `connectors_to_search`
|
||||
|
||||
{connector_docs}
|
||||
|
||||
NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`."""
|
||||
|
||||
# Capture for closure
|
||||
_available_connectors = available_connectors
|
||||
|
||||
async def _search_knowledge_base_impl(
|
||||
query: str,
|
||||
top_k: int = 10,
|
||||
start_date: str | None = None,
|
||||
end_date: str | None = None,
|
||||
connectors_to_search: list[str] | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
Search the user's personal knowledge base for relevant information.
|
||||
|
||||
Use this tool to find documents, notes, files, web pages, and other content
|
||||
that may help answer the user's question.
|
||||
|
||||
IMPORTANT:
|
||||
- If the user requests a specific source type (e.g. "my notes", "Slack messages"),
|
||||
pass `connectors_to_search=[...]` using the enums below.
|
||||
- If `connectors_to_search` is omitted/empty, the system will search broadly.
|
||||
|
||||
## Available connector enums for `connectors_to_search`
|
||||
|
||||
- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
|
||||
- FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
|
||||
- NOTE: "SurfSense Notes" (notes created inside SurfSense)
|
||||
- SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
|
||||
- TEAMS_CONNECTOR: "Microsoft Teams messages and conversations" (personal Teams communications)
|
||||
- NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
|
||||
- YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos)
|
||||
- GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions)
|
||||
- ELASTICSEARCH_CONNECTOR: "Elasticsearch indexed documents and data" (personal Elasticsearch instances and custom data sources)
|
||||
- LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management)
|
||||
- JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking)
|
||||
- CONFLUENCE_CONNECTOR: "Confluence pages and comments" (personal project documentation)
|
||||
- CLICKUP_CONNECTOR: "ClickUp tasks and project data" (personal task management)
|
||||
- GOOGLE_CALENDAR_CONNECTOR: "Google Calendar events, meetings, and schedules" (personal calendar and time management)
|
||||
- GOOGLE_GMAIL_CONNECTOR: "Google Gmail emails and conversations" (personal emails and communications)
|
||||
- GOOGLE_DRIVE_FILE: "Google Drive files and documents" (personal cloud storage and file management)
|
||||
- DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications)
|
||||
- AIRTABLE_CONNECTOR: "Airtable records, tables, and database content" (personal data management and organization)
|
||||
- TAVILY_API: "Tavily search API results" (personalized search results)
|
||||
- SEARXNG_API: "SearxNG search API results" (personalized search results)
|
||||
- LINKUP_API: "Linkup search API results" (personalized search results)
|
||||
- BAIDU_SEARCH_API: "Baidu search API results" (personalized search results)
|
||||
- LUMA_CONNECTOR: "Luma events"
|
||||
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
|
||||
- BOOKSTACK_CONNECTOR: "BookStack pages" (personal documentation)
|
||||
- CIRCLEBACK: "Circleback meeting notes, transcripts, and action items" (personal meeting records)
|
||||
- OBSIDIAN_CONNECTOR: "Obsidian vault notes and markdown files" (personal notes and knowledge management)
|
||||
|
||||
NOTE: `WEBCRAWLER_CONNECTOR` is mapped internally to the canonical document type `CRAWLED_URL`.
|
||||
|
||||
Args:
|
||||
query: The search query - be specific and include key terms
|
||||
top_k: Number of results to retrieve (default: 10)
|
||||
start_date: Optional ISO date/datetime (e.g. "2025-12-12" or "2025-12-12T00:00:00+00:00")
|
||||
end_date: Optional ISO date/datetime (e.g. "2025-12-19" or "2025-12-19T23:59:59+00:00")
|
||||
connectors_to_search: Optional list of connector enums to search. If omitted, searches all.
|
||||
|
||||
Returns:
|
||||
Formatted string with relevant documents and their content
|
||||
"""
|
||||
"""Implementation function for knowledge base search."""
|
||||
from app.agents.new_chat.utils import parse_date_or_datetime
|
||||
|
||||
parsed_start: datetime | None = None
|
||||
|
|
@ -640,6 +806,16 @@ def create_search_knowledge_base_tool(
|
|||
top_k=top_k,
|
||||
start_date=parsed_start,
|
||||
end_date=parsed_end,
|
||||
available_connectors=_available_connectors,
|
||||
)
|
||||
|
||||
return search_knowledge_base
|
||||
# Create StructuredTool with dynamic description
|
||||
# This properly sets the description that the LLM sees
|
||||
tool = StructuredTool(
|
||||
name="search_knowledge_base",
|
||||
description=dynamic_description,
|
||||
coroutine=_search_knowledge_base_impl,
|
||||
args_schema=SearchKnowledgeBaseInput,
|
||||
)
|
||||
|
||||
return tool
|
||||
|
|
|
|||
|
|
@ -85,6 +85,7 @@ class ToolDefinition:
|
|||
# Contributors: Add your new tools here!
|
||||
BUILTIN_TOOLS: list[ToolDefinition] = [
|
||||
# Core tool - searches the user's knowledge base
|
||||
# Now supports dynamic connector/document type discovery
|
||||
ToolDefinition(
|
||||
name="search_knowledge_base",
|
||||
description="Search the user's personal knowledge base for relevant information",
|
||||
|
|
@ -92,8 +93,12 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
|
|||
search_space_id=deps["search_space_id"],
|
||||
db_session=deps["db_session"],
|
||||
connector_service=deps["connector_service"],
|
||||
# Optional: dynamically discovered connectors/document types
|
||||
available_connectors=deps.get("available_connectors"),
|
||||
available_document_types=deps.get("available_document_types"),
|
||||
),
|
||||
requires=["search_space_id", "db_session", "connector_service"],
|
||||
# Note: available_connectors and available_document_types are optional
|
||||
),
|
||||
# Podcast generation tool
|
||||
ToolDefinition(
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
"""
|
||||
Composio Connector Module.
|
||||
Composio Connector Base Module.
|
||||
|
||||
Provides a unified interface for interacting with various services via Composio,
|
||||
Provides a base class for interacting with various services via Composio,
|
||||
primarily used during indexing operations.
|
||||
"""
|
||||
|
||||
|
|
@ -19,10 +19,10 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
class ComposioConnector:
|
||||
"""
|
||||
Generic Composio connector for data retrieval.
|
||||
Base Composio connector for data retrieval.
|
||||
|
||||
Wraps the ComposioService to provide toolkit-specific data access
|
||||
for indexing operations.
|
||||
for indexing operations. Subclasses implement toolkit-specific methods.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
|
@ -89,302 +89,12 @@ class ComposioConnector:
|
|||
toolkit_id = await self.get_toolkit_id()
|
||||
return toolkit_id in INDEXABLE_TOOLKITS
|
||||
|
||||
# ===== Google Drive Methods =====
|
||||
@property
def session(self) -> AsyncSession:
    """Expose the database session stored on this connector as ``_session``."""
    db_session = self._session
    return db_session
|
||||
|
||||
async def list_drive_files(
    self,
    folder_id: str | None = None,
    page_token: str | None = None,
    page_size: int = 100,
) -> tuple[list[dict[str, Any]], str | None, str | None]:
    """
    Fetch one page of Google Drive files through the Composio service.

    Args:
        folder_id: Folder whose contents should be listed (optional).
        page_token: Token of the page to fetch (optional).
        page_size: Requested number of files per page.

    Returns:
        ``(files, next_page_token, error message)``. Without a connected
        account ID the result is ``([], None, <message>)`` and the service
        is never contacted.
    """
    account_id = await self.get_connected_account_id()
    if account_id:
        entity = await self.get_entity_id()
        composio = await self._get_service()
        return await composio.get_drive_files(
            connected_account_id=account_id,
            entity_id=entity,
            folder_id=folder_id,
            page_token=page_token,
            page_size=page_size,
        )
    return [], None, "No connected account ID found"
|
||||
|
||||
async def get_drive_file_content(
    self, file_id: str
) -> tuple[bytes | None, str | None]:
    """
    Download the content of one Google Drive file through Composio.

    Args:
        file_id: Google Drive file ID.

    Returns:
        ``(file content bytes, error message)``. Without a connected account
        ID the result is ``(None, <message>)`` and no download is attempted.
    """
    account_id = await self.get_connected_account_id()
    if account_id:
        entity = await self.get_entity_id()
        composio = await self._get_service()
        return await composio.get_drive_file_content(
            connected_account_id=account_id,
            entity_id=entity,
            file_id=file_id,
        )
    return None, "No connected account ID found"
|
||||
|
||||
# ===== Gmail Methods =====
|
||||
|
||||
async def list_gmail_messages(
    self,
    query: str = "",
    max_results: int = 100,
) -> tuple[list[dict[str, Any]], str | None]:
    """
    Fetch Gmail messages through the Composio service.

    Args:
        query: Gmail search query.
        max_results: Maximum number of messages to request.

    Returns:
        ``(messages, error message)``. Without a connected account ID the
        result is ``([], <message>)`` and the service is never contacted.
    """
    account_id = await self.get_connected_account_id()
    if account_id:
        entity = await self.get_entity_id()
        composio = await self._get_service()
        return await composio.get_gmail_messages(
            connected_account_id=account_id,
            entity_id=entity,
            query=query,
            max_results=max_results,
        )
    return [], "No connected account ID found"
|
||||
|
||||
async def get_gmail_message_detail(
    self, message_id: str
) -> tuple[dict[str, Any] | None, str | None]:
    """
    Retrieve the full details of one Gmail message through Composio.

    Args:
        message_id: Gmail message ID.

    Returns:
        ``(message details, error message)``. Without a connected account ID
        the result is ``(None, <message>)`` and no request is made.
    """
    account_id = await self.get_connected_account_id()
    if account_id:
        entity = await self.get_entity_id()
        composio = await self._get_service()
        return await composio.get_gmail_message_detail(
            connected_account_id=account_id,
            entity_id=entity,
            message_id=message_id,
        )
    return None, "No connected account ID found"
|
||||
|
||||
# ===== Google Calendar Methods =====
|
||||
|
||||
async def list_calendar_events(
    self,
    time_min: str | None = None,
    time_max: str | None = None,
    max_results: int = 250,
) -> tuple[list[dict[str, Any]], str | None]:
    """
    Fetch Google Calendar events through the Composio service.

    Args:
        time_min: Start of the time window (RFC3339 format).
        time_max: End of the time window (RFC3339 format).
        max_results: Maximum number of events to request.

    Returns:
        ``(events, error message)``. Without a connected account ID the
        result is ``([], <message>)`` and the service is never contacted.
    """
    account_id = await self.get_connected_account_id()
    if account_id:
        entity = await self.get_entity_id()
        composio = await self._get_service()
        return await composio.get_calendar_events(
            connected_account_id=account_id,
            entity_id=entity,
            time_min=time_min,
            time_max=time_max,
            max_results=max_results,
        )
    return [], "No connected account ID found"
|
||||
|
||||
# ===== Utility Methods =====
|
||||
|
||||
def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str:
    """
    Render a Composio Gmail message as a markdown document.

    Args:
        message: Message object from Composio's GMAIL_FETCH_EMAILS response.
            Fields read here: ``messageId`` (or ``id``), ``labelIds``,
            ``payload.headers``, ``messageTimestamp``, ``messageText``
            (falling back to ``snippet``) and ``attachmentList``. Each of
            them may be missing or ``None``.

    Returns:
        The markdown text. If formatting still fails unexpectedly, a string
        starting with ``"Error formatting message to markdown:"`` is
        returned instead of raising.
    """
    try:
        # Composio uses 'messageId' (camelCase); 'id' is accepted as a fallback.
        message_id = message.get("messageId", "") or message.get("id", "")

        # ``or`` fallbacks: a key that is present but null must behave like a
        # missing key instead of raising and producing the error string below.
        label_ids = message.get("labelIds") or []
        payload = message.get("payload") or {}
        headers = payload.get("headers") or []

        # Header names are matched case-insensitively; the last duplicate wins.
        header_dict = {
            (header.get("name") or "").lower(): header.get("value", "")
            for header in headers
        }

        subject = header_dict.get("subject") or "No Subject"
        from_email = header_dict.get("from") or "Unknown Sender"
        to_email = header_dict.get("to") or "Unknown Recipient"
        # Composio's messageTimestamp takes precedence over the Date header.
        date_str = (
            message.get("messageTimestamp")
            or header_dict.get("date")
            or "Unknown Date"
        )

        parts = [
            f"# {subject}\n\n",
            f"**From:** {from_email}\n",
            f"**To:** {to_email}\n",
            f"**Date:** {date_str}\n",
        ]

        if label_ids:
            # str() so a non-string label cannot break the join.
            labels = ", ".join(str(label) for label in label_ids)
            parts.append(f"**Labels:** {labels}\n")

        parts.append("\n---\n\n")

        # Full text lives in 'messageText'; 'snippet' is only a preview fallback.
        message_text = message.get("messageText") or ""
        if message_text:
            parts.append(f"## Content\n\n{message_text}\n\n")
        else:
            snippet = message.get("snippet") or ""
            if snippet:
                parts.append(f"## Preview\n\n{snippet}\n\n")

        attachments = message.get("attachmentList") or []
        if attachments:
            parts.append("## Attachments\n\n")
            for att in attachments:
                att_name = att.get("filename", att.get("name", "Unknown"))
                parts.append(f"- {att_name}\n")
            parts.append("\n")

        parts.append("## Message Details\n\n")
        parts.append(f"- **Message ID:** {message_id}\n")

        return "".join(parts)

    except Exception as e:
        # Existing contract: report the failure as a string rather than raising.
        return f"Error formatting message to markdown: {e!s}"
|
||||
|
||||
def format_calendar_event_to_markdown(self, event: dict[str, Any]) -> str:
    """
    Render a Google Calendar event as a markdown document.

    Args:
        event: Event object from Google Calendar API. Fields read here:
            ``summary``, ``description``, ``location``, ``start``/``end``
            (``dateTime`` or ``date``), ``attendees``, ``id``, ``created``
            and ``updated``. Each of them may be missing or ``None``.

    Returns:
        The markdown text. If formatting still fails unexpectedly, a string
        starting with ``"Error formatting event to markdown:"`` is returned
        instead of raising.
    """
    from datetime import datetime

    def format_time(time_str: str) -> str:
        """Turn an ISO date-time into ``YYYY-MM-DD HH:MM``; pass anything else through."""
        if not time_str:
            return "Unknown"
        try:
            if "T" in time_str:
                # fromisoformat on older Pythons rejects a trailing 'Z'.
                dt = datetime.fromisoformat(time_str.replace("Z", "+00:00"))
                return dt.strftime("%Y-%m-%d %H:%M")
        except (AttributeError, TypeError, ValueError):
            # Unparseable or non-string value: show it unchanged.
            pass
        return time_str

    try:
        summary = event.get("summary") or "No Title"
        description = event.get("description") or ""
        location = event.get("location") or ""

        # ``or {}``: a null start/end must not raise and turn the whole
        # event into the error string below.
        start = event.get("start") or {}
        end = event.get("end") or {}

        # Timed events carry 'dateTime'; all-day events carry 'date'.
        start_formatted = format_time(start.get("dateTime") or start.get("date", ""))
        end_formatted = format_time(end.get("dateTime") or end.get("date", ""))

        attendee_list = []
        for attendee in event.get("attendees") or []:
            email = attendee.get("email", "")
            display_name = attendee.get("displayName", email)
            response_status = attendee.get("responseStatus", "")
            attendee_list.append(f"- {display_name} ({response_status})")

        parts = [
            f"# {summary}\n\n",
            f"**Start:** {start_formatted}\n",
            f"**End:** {end_formatted}\n",
        ]

        if location:
            parts.append(f"**Location:** {location}\n")

        parts.append("\n")

        if description:
            parts.append(f"## Description\n\n{description}\n\n")

        if attendee_list:
            parts.append("## Attendees\n\n")
            parts.append("\n".join(attendee_list))
            parts.append("\n\n")

        parts.append("## Event Details\n\n")
        parts.append(f"- **Event ID:** {event.get('id', 'Unknown')}\n")
        parts.append(f"- **Created:** {event.get('created', 'Unknown')}\n")
        parts.append(f"- **Updated:** {event.get('updated', 'Unknown')}\n")

        return "".join(parts)

    except Exception as e:
        # Existing contract: report the failure as a string rather than raising.
        return f"Error formatting event to markdown: {e!s}"
|
||||
@property
def connector_id(self) -> int:
    """Expose the connector ID stored on this connector as ``_connector_id``."""
    identifier = self._connector_id
    return identifier
|
||||
|
|
|
|||
613
surfsense_backend/app/connectors/composio_gmail_connector.py
Normal file
613
surfsense_backend/app/connectors/composio_gmail_connector.py
Normal file
|
|
@ -0,0 +1,613 @@
|
|||
"""
|
||||
Composio Gmail Connector Module.
|
||||
|
||||
Provides Gmail specific methods for data retrieval and indexing via Composio.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.future import select
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.composio_connector import ComposioConnector
|
||||
from app.db import Document, DocumentType
|
||||
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.tasks.connector_indexers.base import calculate_date_range
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_current_timestamp() -> datetime:
|
||||
"""Get the current timestamp with timezone for updated_at field."""
|
||||
return datetime.now(UTC)
|
||||
|
||||
|
||||
async def check_document_by_unique_identifier(
    session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
    """
    Look up a document by its unique identifier hash.

    The query loads ``Document.chunks`` via ``selectinload`` and yields the
    first matching row, or ``None`` when nothing matches.
    """
    lookup = (
        select(Document)
        .options(selectinload(Document.chunks))
        .where(Document.unique_identifier_hash == unique_identifier_hash)
    )
    rows = await session.execute(lookup)
    return rows.scalars().first()
|
||||
|
||||
|
||||
async def update_connector_last_indexed(
|
||||
session: AsyncSession,
|
||||
connector,
|
||||
update_last_indexed: bool = True,
|
||||
) -> None:
|
||||
"""Update the last_indexed_at timestamp for a connector."""
|
||||
if update_last_indexed:
|
||||
connector.last_indexed_at = datetime.now(UTC)
|
||||
logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")
|
||||
|
||||
|
||||
class ComposioGmailConnector(ComposioConnector):
    """
    Gmail-specific Composio connector.

    Adds message listing, message detail retrieval and markdown formatting
    for Gmail on top of the ``ComposioConnector`` base class.
    """

    async def list_gmail_messages(
        self,
        query: str = "",
        max_results: int = 50,
        page_token: str | None = None,
    ) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]:
        """
        List Gmail messages via Composio with pagination support.

        Args:
            query: Gmail search query.
            max_results: Maximum number of messages per page (default: 50).
            page_token: Optional pagination token for next page.

        Returns:
            Tuple of (messages list, next_page_token, result_size_estimate,
            error message). Without a connected account ID the result is
            ``([], None, None, <message>)`` and the service is not contacted.
        """
        connected_account_id = await self.get_connected_account_id()
        if not connected_account_id:
            return [], None, None, "No connected account ID found"

        entity_id = await self.get_entity_id()
        service = await self._get_service()
        return await service.get_gmail_messages(
            connected_account_id=connected_account_id,
            entity_id=entity_id,
            query=query,
            max_results=max_results,
            page_token=page_token,
        )

    async def get_gmail_message_detail(
        self, message_id: str
    ) -> tuple[dict[str, Any] | None, str | None]:
        """
        Get full details of a Gmail message via Composio.

        Args:
            message_id: Gmail message ID.

        Returns:
            Tuple of (message details, error message). Without a connected
            account ID the result is ``(None, <message>)``.
        """
        connected_account_id = await self.get_connected_account_id()
        if not connected_account_id:
            return None, "No connected account ID found"

        entity_id = await self.get_entity_id()
        service = await self._get_service()
        return await service.get_gmail_message_detail(
            connected_account_id=connected_account_id,
            entity_id=entity_id,
            message_id=message_id,
        )

    def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str:
        """
        Format a Gmail message to markdown.

        Args:
            message: Message object from Composio's GMAIL_FETCH_EMAILS response.
                Composio structure: messageId, messageText, messageTimestamp,
                payload.headers, labelIds, attachmentList. Each field may be
                missing or ``None``.

        Returns:
            Formatted markdown string. If formatting still fails
            unexpectedly, a string starting with
            ``"Error formatting message to markdown:"`` is returned instead
            of raising.
        """
        try:
            # Composio uses 'messageId' (camelCase); 'id' is accepted as a fallback.
            message_id = message.get("messageId", "") or message.get("id", "")

            # ``or`` fallbacks: a key that is present but null must behave like
            # a missing key instead of raising and yielding the error string.
            label_ids = message.get("labelIds") or []
            payload = message.get("payload") or {}
            headers = payload.get("headers") or []

            # Header names are matched case-insensitively; the last duplicate wins.
            header_dict = {
                (header.get("name") or "").lower(): header.get("value", "")
                for header in headers
            }

            subject = header_dict.get("subject") or "No Subject"
            from_email = header_dict.get("from") or "Unknown Sender"
            to_email = header_dict.get("to") or "Unknown Recipient"
            # Composio's messageTimestamp takes precedence over the Date header.
            date_str = (
                message.get("messageTimestamp")
                or header_dict.get("date")
                or "Unknown Date"
            )

            parts = [
                f"# {subject}\n\n",
                f"**From:** {from_email}\n",
                f"**To:** {to_email}\n",
                f"**Date:** {date_str}\n",
            ]

            if label_ids:
                # str() so a non-string label cannot break the join.
                labels = ", ".join(str(label) for label in label_ids)
                parts.append(f"**Labels:** {labels}\n")

            parts.append("\n---\n\n")

            # Full text lives in 'messageText'; 'snippet' is a preview fallback.
            message_text = message.get("messageText") or ""
            if message_text:
                parts.append(f"## Content\n\n{message_text}\n\n")
            else:
                snippet = message.get("snippet") or ""
                if snippet:
                    parts.append(f"## Preview\n\n{snippet}\n\n")

            attachments = message.get("attachmentList") or []
            if attachments:
                parts.append("## Attachments\n\n")
                for att in attachments:
                    att_name = att.get("filename", att.get("name", "Unknown"))
                    parts.append(f"- {att_name}\n")
                parts.append("\n")

            parts.append("## Message Details\n\n")
            parts.append(f"- **Message ID:** {message_id}\n")

            return "".join(parts)

        except Exception as e:
            # Existing contract: report the failure as a string rather than raising.
            return f"Error formatting message to markdown: {e!s}"
|
||||
|
||||
|
||||
# ============ Indexer Functions ============
|
||||
|
||||
|
||||
# Prefix of the string format_gmail_message_to_markdown returns when it fails
# instead of raising; such output must never be indexed as message content.
_GMAIL_FORMAT_ERROR_PREFIX = "Error formatting message to markdown:"


async def _summarize_gmail_message(
    session: AsyncSession,
    user_id: str,
    search_space_id: int,
    markdown_content: str,
    *,
    message_id: str,
    thread_id: str,
    subject: str,
    sender: str,
    date_str: str,
):
    """Return ``(summary text, summary embedding)`` for one Gmail message."""
    user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
    if user_llm:
        summary_metadata = {
            "message_id": message_id,
            "thread_id": thread_id,
            "subject": subject,
            "sender": sender,
            "document_type": "Gmail Message (Composio)",
        }
        return await generate_document_summary(
            markdown_content, user_llm, summary_metadata
        )

    # No LLM configured for the user: embed a minimal header-based summary.
    summary_content = f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
    return summary_content, config.embedding_model_instance.embed(summary_content)


async def _process_gmail_message_batch(
    session: AsyncSession,
    messages: list[dict[str, Any]],
    composio_connector: ComposioGmailConnector,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    total_documents_indexed: int = 0,
) -> tuple[int, int]:
    """
    Process a batch of Gmail messages and index them.

    Each message is formatted to markdown, summarized, chunked and stored as
    a new ``Document`` or written onto the existing document with the same
    unique identifier hash. Messages without an ID, messages whose markdown
    is empty or could not be formatted, and messages whose content hash is
    unchanged are counted as skipped.

    Args:
        session: Database session used for lookups, inserts and commits.
        messages: Message dicts as returned by Composio's GMAIL_FETCH_EMAILS.
        composio_connector: Connector used to format messages to markdown.
        connector_id: ID stored in each document's metadata.
        search_space_id: Search space the documents belong to.
        user_id: User whose long-context LLM (if any) writes the summaries.
        total_documents_indexed: Running total of documents indexed so far (for batch commits).

    Returns:
        Tuple of (documents_indexed, documents_skipped)
    """
    documents_indexed = 0
    documents_skipped = 0

    for message in messages:
        try:
            # Composio uses 'messageId' (camelCase), not 'id'
            message_id = message.get("messageId", "") or message.get("id", "")
            if not message_id:
                documents_skipped += 1
                continue

            # Composio's GMAIL_FETCH_EMAILS already returns full message content,
            # so no separate detail API call is made here.
            # ``or`` fallbacks keep a null payload from raising (and from
            # triggering the rollback in the except branch below).
            payload = message.get("payload") or {}
            headers = payload.get("headers") or []

            subject = "No Subject"
            sender = "Unknown Sender"
            date_str = message.get("messageTimestamp", "Unknown Date")

            for header in headers:
                name = (header.get("name") or "").lower()
                value = header.get("value", "")
                if name == "subject":
                    subject = value
                elif name == "from":
                    sender = value
                elif name == "date":
                    date_str = value

            markdown_content = composio_connector.format_gmail_message_to_markdown(
                message
            )

            if not markdown_content.strip():
                logger.warning(f"Skipping Gmail message with no content: {subject}")
                documents_skipped += 1
                continue

            # The formatter reports failures as a string; indexing that string
            # would store an error message as the email's content.
            if markdown_content.startswith(_GMAIL_FORMAT_ERROR_PREFIX):
                logger.warning(
                    f"Skipping Gmail message that could not be formatted: {subject}"
                )
                documents_skipped += 1
                continue

            document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"])
            unique_identifier_hash = generate_unique_identifier_hash(
                document_type, f"gmail_{message_id}", search_space_id
            )
            content_hash = generate_content_hash(markdown_content, search_space_id)

            existing_document = await check_document_by_unique_identifier(
                session, unique_identifier_hash
            )

            # Unchanged content: nothing to re-summarize or re-embed.
            if existing_document and existing_document.content_hash == content_hash:
                documents_skipped += 1
                continue

            label_ids = message.get("labelIds", [])
            # thread_id is kept for consistency with the non-Composio implementation
            thread_id = message.get("threadId", "") or message.get("thread_id", "")

            summary_content, summary_embedding = await _summarize_gmail_message(
                session,
                user_id,
                search_space_id,
                markdown_content,
                message_id=message_id,
                thread_id=thread_id,
                subject=subject,
                sender=sender,
                date_str=date_str,
            )
            chunks = await create_document_chunks(markdown_content)

            # One metadata shape for both paths, so updated documents keep
            # "toolkit_id" just like newly created ones.
            stored_metadata = {
                "message_id": message_id,
                "thread_id": thread_id,
                "subject": subject,
                "sender": sender,
                "date": date_str,
                "labels": label_ids,
                "connector_id": connector_id,
                "toolkit_id": "gmail",
                "source": "composio",
            }

            if existing_document:
                existing_document.title = f"Gmail: {subject}"
                existing_document.content = summary_content
                existing_document.content_hash = content_hash
                existing_document.embedding = summary_embedding
                existing_document.document_metadata = stored_metadata
                existing_document.chunks = chunks
                existing_document.updated_at = get_current_timestamp()
            else:
                document = Document(
                    search_space_id=search_space_id,
                    title=f"Gmail: {subject}",
                    document_type=document_type,
                    document_metadata=stored_metadata,
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
                )
                session.add(document)

            documents_indexed += 1

            # Batch commit every 10 documents
            current_total = total_documents_indexed + documents_indexed
            if current_total % 10 == 0:
                logger.info(
                    f"Committing batch: {current_total} Gmail messages processed so far"
                )
                await session.commit()

        except Exception as e:
            logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
            documents_skipped += 1
            # Rollback on error to avoid partial state.
            # NOTE(review): this presumably also discards documents added since
            # the last batch commit while they stay counted as indexed — confirm
            # whether a savepoint per message is wanted.
            try:
                await session.rollback()
            except Exception as rollback_error:
                logger.error(
                    f"Error during rollback: {rollback_error!s}", exc_info=True
                )
            continue

    return documents_indexed, documents_skipped
|
||||
|
||||
|
||||
async def index_composio_gmail(
    session: AsyncSession,
    connector,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None,
    end_date: str | None,
    task_logger: TaskLoggingService,
    log_entry,
    update_last_indexed: bool = True,
    max_items: int = 1000,
) -> tuple[int, str | None]:
    """
    Index Gmail messages via Composio with pagination and incremental processing.

    Args:
        session: Database session used for indexing and commits.
        connector: Connector record; used for date-range defaults and for
            the ``last_indexed_at`` update.
        connector_id: ID of the Composio connector to index.
        search_space_id: Search space the documents belong to.
        user_id: User on whose behalf documents are summarized.
        start_date: Start of the range (``YYYY-MM-DD``), or ``None`` / ``""``
            / ``"undefined"`` to let ``calculate_date_range`` decide.
        end_date: End of the range, same conventions as ``start_date``.
        task_logger: Receives progress, success and fetch-failure entries.
        log_entry: Log entry handed to ``task_logger``.
        update_last_indexed: Whether to stamp ``connector.last_indexed_at``.
        max_items: Upper bound on the number of messages fetched.

    Returns:
        ``(documents_indexed, None)`` on success. If fetching a page fails,
        ``(documents indexed before the failure, error message)``. If an
        unexpected exception occurs, ``(0, error message)``.
    """
    try:
        composio_connector = ComposioGmailConnector(session, connector_id)

        # Normalize date values - handle "undefined" strings from frontend
        if start_date in ("undefined", ""):
            start_date = None
        if end_date in ("undefined", ""):
            end_date = None

        if start_date is not None and end_date is not None:
            # User provided both dates - use them directly
            start_date_str, end_date_str = start_date, end_date
        else:
            # Fall back to calculated defaults so indexing works without
            # user-specified dates.
            start_date_str, end_date_str = calculate_date_range(
                connector, start_date, end_date, default_days_back=365
            )

        # Gmail search operators take dates as YYYY/MM/DD.
        query_parts = []
        if start_date_str:
            query_parts.append(f"after:{start_date_str.replace('-', '/')}")
        if end_date_str:
            query_parts.append(f"before:{end_date_str.replace('-', '/')}")
        query = " ".join(query_parts)

        logger.info(
            f"Gmail query for connector {connector_id}: '{query}' "
            f"(start_date={start_date_str}, end_date={end_date_str})"
        )

        # Use smaller batch size to avoid 413 payload too large errors
        batch_size = 50
        page_token = None
        total_documents_indexed = 0
        total_documents_skipped = 0
        total_messages_fetched = 0
        result_size_estimate = None  # Will be set from first API response

        def estimated_total_messages() -> int:
            """Estimate used for progress output, capped at ``max_items``."""
            if result_size_estimate is None:
                return max_items
            return min(result_size_estimate, max_items)

        while total_messages_fetched < max_items:
            remaining = max_items - total_messages_fetched
            current_batch_size = min(batch_size, remaining)
            estimated_total = estimated_total_messages()

            await task_logger.log_task_progress(
                log_entry,
                f"Fetching Gmail messages batch via Composio for connector {connector_id} "
                f"({total_messages_fetched}/{estimated_total} fetched, {total_documents_indexed} indexed)",
                {
                    "stage": "fetching_messages",
                    "batch_size": current_batch_size,
                    "total_fetched": total_messages_fetched,
                    "total_indexed": total_documents_indexed,
                    "estimated_total": estimated_total,
                },
            )

            (
                messages,
                next_token,
                result_size_estimate_batch,
                error,
            ) = await composio_connector.list_gmail_messages(
                query=query,
                max_results=current_batch_size,
                page_token=page_token,
            )

            if error:
                failure_msg = f"Failed to fetch Gmail messages: {error}"
                await task_logger.log_task_failure(log_entry, failure_msg, {})
                if total_documents_indexed:
                    # Earlier pages were processed successfully: persist the
                    # documents added since the last batch commit and report
                    # the real count instead of 0.
                    await session.commit()
                return total_documents_indexed, failure_msg

            if not messages:
                # No more messages available
                break

            # Update result_size_estimate from first response (Gmail provides this estimate)
            if result_size_estimate is None and result_size_estimate_batch is not None:
                result_size_estimate = result_size_estimate_batch
                logger.info(
                    f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'"
                )

            total_messages_fetched += len(messages)
            # Recompute: the estimate may just have been set from this response.
            estimated_total = estimated_total_messages()

            logger.info(
                f"Fetched batch of {len(messages)} Gmail messages "
                f"(total: {total_messages_fetched}/{estimated_total})"
            )

            # Process batch incrementally; commits happen inside every 10
            # documents so progress survives a crash.
            batch_indexed, batch_skipped = await _process_gmail_message_batch(
                session=session,
                messages=messages,
                composio_connector=composio_connector,
                connector_id=connector_id,
                search_space_id=search_space_id,
                user_id=user_id,
                total_documents_indexed=total_documents_indexed,
            )

            total_documents_indexed += batch_indexed
            total_documents_skipped += batch_skipped

            logger.info(
                f"Processed batch: {batch_indexed} indexed, {batch_skipped} skipped "
                f"(total: {total_documents_indexed} indexed, {total_documents_skipped} skipped)"
            )

            # The page token is the only end-of-results signal used: a page
            # may be shorter than requested while further pages still exist.
            if not next_token:
                break

            page_token = next_token

        if total_messages_fetched == 0:
            success_msg = "No Gmail messages found in the specified date range"
            await task_logger.log_task_success(
                log_entry, success_msg, {"messages_count": 0}
            )
            # CRITICAL: Update timestamp even when no messages found so Electric SQL syncs and UI shows indexed status
            await update_connector_last_indexed(session, connector, update_last_indexed)
            await session.commit()
            return 0, None  # Return None (not error) when no items found

        # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
        # This ensures the UI shows "Last indexed" instead of "Never indexed"
        await update_connector_last_indexed(session, connector, update_last_indexed)

        # Final commit to ensure all documents are persisted (safety net)
        logger.info(
            f"Final commit: Total {total_documents_indexed} Gmail messages processed"
        )
        await session.commit()
        logger.info(
            "Successfully committed all Composio Gmail document changes to database"
        )

        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed Gmail indexing via Composio for connector {connector_id}",
            {
                "documents_indexed": total_documents_indexed,
                "documents_skipped": total_documents_skipped,
                "messages_fetched": total_messages_fetched,
            },
        )

        return total_documents_indexed, None

    except Exception as e:
        logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True)
        return 0, f"Failed to index Gmail via Composio: {e!s}"
|
||||
|
|
@ -0,0 +1,502 @@
|
|||
"""
|
||||
Composio Google Calendar Connector Module.
|
||||
|
||||
Provides Google Calendar specific methods for data retrieval and indexing via Composio.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.future import select
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.composio_connector import ComposioConnector
|
||||
from app.db import Document, DocumentType
|
||||
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.tasks.connector_indexers.base import (
|
||||
calculate_date_range,
|
||||
check_duplicate_document_by_hash,
|
||||
)
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_current_timestamp() -> datetime:
|
||||
"""Get the current timestamp with timezone for updated_at field."""
|
||||
return datetime.now(UTC)
|
||||
|
||||
|
||||
async def check_document_by_unique_identifier(
    session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
    """Look up a document by its unique identifier hash.

    The document's chunks are eager-loaded via ``selectinload`` as part of
    the same lookup.

    Args:
        session: Async database session used to run the query.
        unique_identifier_hash: Hash value to match against
            ``Document.unique_identifier_hash``.

    Returns:
        The first matching document, or None when there is no match.
    """
    lookup_stmt = (
        select(Document)
        .options(selectinload(Document.chunks))
        .where(Document.unique_identifier_hash == unique_identifier_hash)
    )
    lookup_result = await session.execute(lookup_stmt)
    matches = lookup_result.scalars()
    return matches.first()
|
||||
|
||||
|
||||
async def update_connector_last_indexed(
    session: AsyncSession,
    connector,
    update_last_indexed: bool = True,
) -> None:
    """Stamp a connector's ``last_indexed_at`` with the current UTC time.

    The change is only applied to the in-memory object; this function does
    not commit, so the caller is responsible for persisting it.

    Args:
        session: Async database session (not used by this function).
        connector: Connector object whose ``last_indexed_at`` is updated.
        update_last_indexed: When False the function does nothing.
    """
    # Guard clause: nothing to do when the caller opted out.
    if not update_last_indexed:
        return

    connector.last_indexed_at = datetime.now(UTC)
    logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")
|
||||
|
||||
|
||||
class ComposioGoogleCalendarConnector(ComposioConnector):
    """
    Google Calendar specific Composio connector.

    Provides methods for listing calendar events and formatting them from
    Google Calendar via Composio.
    """

    async def list_calendar_events(
        self,
        time_min: str | None = None,
        time_max: str | None = None,
        max_results: int = 250,
    ) -> tuple[list[dict[str, Any]], str | None]:
        """
        List Google Calendar events via Composio.

        Args:
            time_min: Start time (RFC3339 format).
            time_max: End time (RFC3339 format).
            max_results: Maximum number of events.

        Returns:
            Tuple of (events list, error message). When no connected account
            ID is available, an empty list and an error message are returned
            without calling the service.
        """
        connected_account_id = await self.get_connected_account_id()
        if not connected_account_id:
            return [], "No connected account ID found"

        entity_id = await self.get_entity_id()
        service = await self._get_service()
        return await service.get_calendar_events(
            connected_account_id=connected_account_id,
            entity_id=entity_id,
            time_min=time_min,
            time_max=time_max,
            max_results=max_results,
        )

    def format_calendar_event_to_markdown(self, event: dict[str, Any]) -> str:
        """
        Format a Google Calendar event to markdown.

        Fields that are missing or explicitly null (summary, start, end,
        attendees, ...) are treated as absent instead of aborting the whole
        formatting, so a sparse event still yields usable markdown.

        Args:
            event: Event object from Google Calendar API.

        Returns:
            Formatted markdown string. If formatting fails unexpectedly, the
            failure is logged and a string starting with
            "Error formatting event to markdown:" is returned instead of
            raising.
        """

        def format_time(time_str: str) -> str:
            """Render an event time for display; unparsable values pass through."""
            if not time_str:
                return "Unknown"
            try:
                # Timed events carry a full timestamp; all-day events only a date.
                if "T" in time_str:
                    dt = datetime.fromisoformat(time_str.replace("Z", "+00:00"))
                    return dt.strftime("%Y-%m-%d %H:%M")
                return time_str
            except (TypeError, ValueError):
                return time_str

        try:
            # Same title fallback chain the indexer uses, so the markdown
            # heading and the document title agree.
            summary = event.get("summary") or event.get("title") or "No Title"
            description = event.get("description") or ""
            location = event.get("location") or ""

            # `or {}` also covers keys that are present with a null value,
            # which a plain .get(key, {}) default does not.
            start = event.get("start") or {}
            end = event.get("end") or {}

            start_time = start.get("dateTime") or start.get("date", "")
            end_time = end.get("dateTime") or end.get("date", "")

            start_formatted = format_time(start_time)
            end_formatted = format_time(end_time)

            # Extract attendees
            attendee_list = []
            for attendee in event.get("attendees") or []:
                email = attendee.get("email", "")
                display_name = attendee.get("displayName", email)
                response_status = attendee.get("responseStatus", "")
                attendee_list.append(f"- {display_name} ({response_status})")

            # Build markdown content as fragments and join once at the end.
            parts = [
                f"# {summary}\n\n",
                f"**Start:** {start_formatted}\n",
                f"**End:** {end_formatted}\n",
            ]

            if location:
                parts.append(f"**Location:** {location}\n")

            parts.append("\n")

            if description:
                parts.append(f"## Description\n\n{description}\n\n")

            if attendee_list:
                parts.append("## Attendees\n\n")
                parts.append("\n".join(attendee_list))
                parts.append("\n\n")

            # Add event metadata
            parts.append("## Event Details\n\n")
            parts.append(f"- **Event ID:** {event.get('id', 'Unknown')}\n")
            parts.append(f"- **Created:** {event.get('created', 'Unknown')}\n")
            parts.append(f"- **Updated:** {event.get('updated', 'Unknown')}\n")

            return "".join(parts)

        except Exception as e:
            # Best-effort contract: never raise to the caller, but do not
            # hide the failure either.
            logger.warning(
                f"Failed to format calendar event to markdown: {e!s}", exc_info=True
            )
            return f"Error formatting event to markdown: {e!s}"
|
||||
|
||||
|
||||
# ============ Indexer Functions ============
|
||||
|
||||
|
||||
async def index_composio_google_calendar(
    session: AsyncSession,
    connector,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None,
    end_date: str | None,
    task_logger: TaskLoggingService,
    log_entry,
    update_last_indexed: bool = True,
    max_items: int = 2500,
) -> tuple[int, str | None]:
    """Index Google Calendar events via Composio.

    Fetches events for the requested date range, converts each one to
    markdown, and creates or updates the corresponding ``Document``. Events
    whose content is unchanged, or whose content hash already exists on
    another document, are skipped.

    Args:
        session: Async database session used for lookups and commits.
        connector: Connector object; its ``last_indexed_at`` is updated.
        connector_id: ID of the connector being indexed.
        search_space_id: Search space the documents belong to.
        user_id: User whose long-context LLM (if any) generates summaries.
        start_date: Start date, or None / "" / "undefined" to use defaults.
        end_date: End date, or None / "" / "undefined" to use defaults.
        task_logger: Service used to record progress, success and failure.
        log_entry: Log entry handle passed through to ``task_logger``.
        update_last_indexed: Whether to stamp ``connector.last_indexed_at``.
        max_items: Maximum number of events requested from the API.

    Returns:
        Tuple of (documents_indexed, message). ``documents_indexed`` counts
        both newly created and updated documents. ``message`` is None on a
        clean success, a short warning when events were skipped as duplicate
        content, or an error description (with a count of 0) on failure.
    """
    try:
        composio_connector = ComposioGoogleCalendarConnector(session, connector_id)

        await task_logger.log_task_progress(
            log_entry,
            f"Fetching Google Calendar events via Composio for connector {connector_id}",
            {"stage": "fetching_events"},
        )

        # Normalize date values - handle "undefined" strings from frontend
        if start_date == "undefined" or start_date == "":
            start_date = None
        if end_date == "undefined" or end_date == "":
            end_date = None

        # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at
        # This ensures user-selected dates are respected (matching non-Composio Calendar connector behavior)
        if start_date is not None and end_date is not None:
            # User provided both dates - use them directly
            start_date_str = start_date
            end_date_str = end_date
        else:
            # Calculate date range with defaults (uses last_indexed_at or 365 days back)
            # This ensures indexing works even when user doesn't specify dates
            start_date_str, end_date_str = calculate_date_range(
                connector, start_date, end_date, default_days_back=365
            )

        # Build time range for API call: whole days, expressed as UTC timestamps
        time_min = f"{start_date_str}T00:00:00Z"
        time_max = f"{end_date_str}T23:59:59Z"

        logger.info(
            f"Google Calendar query for connector {connector_id}: "
            f"(start_date={start_date_str}, end_date={end_date_str})"
        )

        events, error = await composio_connector.list_calendar_events(
            time_min=time_min,
            time_max=time_max,
            max_results=max_items,
        )

        if error:
            await task_logger.log_task_failure(
                log_entry, f"Failed to fetch Calendar events: {error}", {}
            )
            return 0, f"Failed to fetch Calendar events: {error}"

        if not events:
            success_msg = "No Google Calendar events found in the specified date range"
            await task_logger.log_task_success(
                log_entry, success_msg, {"events_count": 0}
            )
            # CRITICAL: Update timestamp even when no events found so Electric SQL syncs and UI shows indexed status
            await update_connector_last_indexed(session, connector, update_last_indexed)
            await session.commit()
            return (
                0,
                None,
            )  # Return None (not error) when no items found - this is success with 0 items

        logger.info(f"Found {len(events)} Google Calendar events to index via Composio")

        documents_indexed = 0
        documents_skipped = 0
        duplicate_content_count = (
            0  # Track events skipped due to duplicate content_hash
        )

        for event in events:
            # Each event is processed in its own try block so one bad event
            # is counted as skipped instead of aborting the whole run.
            try:
                # Handle both standard Google API and potential Composio variations
                event_id = event.get("id", "") or event.get("eventId", "")
                summary = (
                    event.get("summary", "") or event.get("title", "") or "No Title"
                )

                # Without an ID no stable unique identifier can be built.
                if not event_id:
                    documents_skipped += 1
                    continue

                # Format to markdown
                markdown_content = composio_connector.format_calendar_event_to_markdown(
                    event
                )

                # Generate unique identifier
                document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"])
                unique_identifier_hash = generate_unique_identifier_hash(
                    document_type, f"calendar_{event_id}", search_space_id
                )

                content_hash = generate_content_hash(markdown_content, search_space_id)

                existing_document = await check_document_by_unique_identifier(
                    session, unique_identifier_hash
                )

                # Extract event times
                start = event.get("start", {})
                end = event.get("end", {})
                start_time = start.get("dateTime") or start.get("date", "")
                end_time = end.get("dateTime") or end.get("date", "")
                location = event.get("location", "")

                if existing_document:
                    # Same identifier and same content: nothing to do.
                    if existing_document.content_hash == content_hash:
                        documents_skipped += 1
                        continue

                    # Update existing
                    user_llm = await get_user_long_context_llm(
                        session, user_id, search_space_id
                    )

                    if user_llm:
                        document_metadata = {
                            "event_id": event_id,
                            "summary": summary,
                            "start_time": start_time,
                            "document_type": "Google Calendar Event (Composio)",
                        }
                        (
                            summary_content,
                            summary_embedding,
                        ) = await generate_document_summary(
                            markdown_content, user_llm, document_metadata
                        )
                    else:
                        # No LLM available: fall back to a plain-text summary
                        # and embed it directly.
                        summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
                        if location:
                            summary_content += f"\nLocation: {location}"
                        summary_embedding = config.embedding_model_instance.embed(
                            summary_content
                        )

                    chunks = await create_document_chunks(markdown_content)

                    existing_document.title = f"Calendar: {summary}"
                    existing_document.content = summary_content
                    existing_document.content_hash = content_hash
                    existing_document.embedding = summary_embedding
                    existing_document.document_metadata = {
                        "event_id": event_id,
                        "summary": summary,
                        "start_time": start_time,
                        "end_time": end_time,
                        "location": location,
                        "connector_id": connector_id,
                        "source": "composio",
                    }
                    existing_document.chunks = chunks
                    existing_document.updated_at = get_current_timestamp()

                    documents_indexed += 1

                    # Batch commit every 10 documents
                    if documents_indexed % 10 == 0:
                        logger.info(
                            f"Committing batch: {documents_indexed} Google Calendar events processed so far"
                        )
                        await session.commit()
                    continue

                # Document doesn't exist by unique_identifier_hash
                # Check if a document with the same content_hash exists (from standard connector)
                with session.no_autoflush:
                    duplicate_by_content = await check_duplicate_document_by_hash(
                        session, content_hash
                    )

                if duplicate_by_content:
                    # A document with the same content already exists (likely from standard connector)
                    logger.info(
                        f"Event {summary} already indexed by another connector "
                        f"(existing document ID: {duplicate_by_content.id}, "
                        f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
                    )
                    duplicate_content_count += 1
                    documents_skipped += 1
                    continue

                # Create new document
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
                )

                if user_llm:
                    document_metadata = {
                        "event_id": event_id,
                        "summary": summary,
                        "start_time": start_time,
                        "document_type": "Google Calendar Event (Composio)",
                    }
                    (
                        summary_content,
                        summary_embedding,
                    ) = await generate_document_summary(
                        markdown_content, user_llm, document_metadata
                    )
                else:
                    # No LLM available: fall back to a plain-text summary
                    # and embed it directly.
                    summary_content = (
                        f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
                    )
                    if location:
                        summary_content += f"\nLocation: {location}"
                    summary_embedding = config.embedding_model_instance.embed(
                        summary_content
                    )

                chunks = await create_document_chunks(markdown_content)

                document = Document(
                    search_space_id=search_space_id,
                    title=f"Calendar: {summary}",
                    document_type=DocumentType(
                        TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]
                    ),
                    document_metadata={
                        "event_id": event_id,
                        "summary": summary,
                        "start_time": start_time,
                        "end_time": end_time,
                        "location": location,
                        "connector_id": connector_id,
                        "toolkit_id": "googlecalendar",
                        "source": "composio",
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                    updated_at=get_current_timestamp(),
                )
                session.add(document)
                documents_indexed += 1

                # Batch commit every 10 documents
                if documents_indexed % 10 == 0:
                    logger.info(
                        f"Committing batch: {documents_indexed} Google Calendar events processed so far"
                    )
                    await session.commit()

            except Exception as e:
                logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
                documents_skipped += 1
                continue

        # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
        # This ensures the UI shows "Last indexed" instead of "Never indexed"
        await update_connector_last_indexed(session, connector, update_last_indexed)

        # Final commit to ensure all documents are persisted (safety net)
        # This matches the pattern used in non-Composio Gmail indexer
        logger.info(
            f"Final commit: Total {documents_indexed} Google Calendar events processed"
        )
        try:
            await session.commit()
            logger.info(
                "Successfully committed all Composio Google Calendar document changes to database"
            )
        except Exception as e:
            # Handle any remaining integrity errors gracefully (race conditions, etc.)
            # The error is identified by message text; anything else is re-raised
            # and ends up in the outer handler below.
            if (
                "duplicate key value violates unique constraint" in str(e).lower()
                or "uniqueviolationerror" in str(e).lower()
            ):
                logger.warning(
                    f"Duplicate content_hash detected during final commit. "
                    f"This may occur if the same event was indexed by multiple connectors. "
                    f"Rolling back and continuing. Error: {e!s}"
                )
                await session.rollback()
                # Don't fail the entire task - some documents may have been successfully indexed
            else:
                raise

        # Build warning message if duplicates were found
        warning_message = None
        if duplicate_content_count > 0:
            warning_message = f"{duplicate_content_count} skipped (duplicate)"

        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed Google Calendar indexing via Composio for connector {connector_id}",
            {
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
                "duplicate_content_count": duplicate_content_count,
            },
        )

        logger.info(
            f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
            f"({duplicate_content_count} due to duplicate content from other connectors)"
        )
        return documents_indexed, warning_message

    except Exception as e:
        # Top-level boundary: report the failure to the caller as
        # (0, error message) instead of raising.
        logger.error(
            f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True
        )
        return 0, f"Failed to index Google Calendar via Composio: {e!s}"
|
||||
1167
surfsense_backend/app/connectors/composio_google_drive_connector.py
Normal file
1167
surfsense_backend/app/connectors/composio_google_drive_connector.py
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -142,6 +142,15 @@ class GoogleCalendarConnector:
|
|||
flag_modified(connector, "config")
|
||||
await self._session.commit()
|
||||
except Exception as e:
|
||||
error_str = str(e)
|
||||
# Check if this is an invalid_grant error (token expired/revoked)
|
||||
if (
|
||||
"invalid_grant" in error_str.lower()
|
||||
or "token has been expired or revoked" in error_str.lower()
|
||||
):
|
||||
raise Exception(
|
||||
"Google Calendar authentication failed. Please re-authenticate."
|
||||
) from e
|
||||
raise Exception(
|
||||
f"Failed to refresh Google OAuth credentials: {e!s}"
|
||||
) from e
|
||||
|
|
@ -165,6 +174,14 @@ class GoogleCalendarConnector:
|
|||
self.service = build("calendar", "v3", credentials=credentials)
|
||||
return self.service
|
||||
except Exception as e:
|
||||
error_str = str(e)
|
||||
# If the error already contains a user-friendly re-authentication message, preserve it
|
||||
if (
|
||||
"re-authenticate" in error_str.lower()
|
||||
or "expired or been revoked" in error_str.lower()
|
||||
or "authentication failed" in error_str.lower()
|
||||
):
|
||||
raise Exception(error_str) from e
|
||||
raise Exception(f"Failed to create Google Calendar service: {e!s}") from e
|
||||
|
||||
async def get_calendars(self) -> tuple[list[dict[str, Any]], str | None]:
|
||||
|
|
@ -271,6 +288,14 @@ class GoogleCalendarConnector:
|
|||
return events, None
|
||||
|
||||
except Exception as e:
|
||||
error_str = str(e)
|
||||
# If the error already contains a user-friendly re-authentication message, preserve it
|
||||
if (
|
||||
"re-authenticate" in error_str.lower()
|
||||
or "expired or been revoked" in error_str.lower()
|
||||
or "authentication failed" in error_str.lower()
|
||||
):
|
||||
return [], error_str
|
||||
return [], f"Error fetching events: {e!s}"
|
||||
|
||||
def format_event_to_markdown(self, event: dict[str, Any]) -> str:
|
||||
|
|
|
|||
|
|
@ -141,6 +141,15 @@ class GoogleGmailConnector:
|
|||
flag_modified(connector, "config")
|
||||
await self._session.commit()
|
||||
except Exception as e:
|
||||
error_str = str(e)
|
||||
# Check if this is an invalid_grant error (token expired/revoked)
|
||||
if (
|
||||
"invalid_grant" in error_str.lower()
|
||||
or "token has been expired or revoked" in error_str.lower()
|
||||
):
|
||||
raise Exception(
|
||||
"Gmail authentication failed. Please re-authenticate."
|
||||
) from e
|
||||
raise Exception(
|
||||
f"Failed to refresh Google OAuth credentials: {e!s}"
|
||||
) from e
|
||||
|
|
@ -164,6 +173,14 @@ class GoogleGmailConnector:
|
|||
self.service = build("gmail", "v1", credentials=credentials)
|
||||
return self.service
|
||||
except Exception as e:
|
||||
error_str = str(e)
|
||||
# If the error already contains a user-friendly re-authentication message, preserve it
|
||||
if (
|
||||
"re-authenticate" in error_str.lower()
|
||||
or "expired or been revoked" in error_str.lower()
|
||||
or "authentication failed" in error_str.lower()
|
||||
):
|
||||
raise Exception(error_str) from e
|
||||
raise Exception(f"Failed to create Gmail service: {e!s}") from e
|
||||
|
||||
async def get_user_profile(self) -> tuple[dict[str, Any], str | None]:
|
||||
|
|
@ -225,6 +242,14 @@ class GoogleGmailConnector:
|
|||
return messages, None
|
||||
|
||||
except Exception as e:
|
||||
error_str = str(e)
|
||||
# If the error already contains a user-friendly re-authentication message, preserve it
|
||||
if (
|
||||
"re-authenticate" in error_str.lower()
|
||||
or "expired or been revoked" in error_str.lower()
|
||||
or "authentication failed" in error_str.lower()
|
||||
):
|
||||
return [], error_str
|
||||
return [], f"Error fetching messages list: {e!s}"
|
||||
|
||||
async def get_message_details(
|
||||
|
|
@ -271,6 +296,13 @@ class GoogleGmailConnector:
|
|||
try:
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# Normalize date values - handle "undefined" strings from frontend
|
||||
# This prevents "time data 'undefined' does not match format" errors
|
||||
if start_date == "undefined" or start_date == "":
|
||||
start_date = None
|
||||
if end_date == "undefined" or end_date == "":
|
||||
end_date = None
|
||||
|
||||
# Build date query
|
||||
query_parts = []
|
||||
|
||||
|
|
|
|||
|
|
@ -55,7 +55,9 @@ class DocumentType(str, Enum):
|
|||
CIRCLEBACK = "CIRCLEBACK"
|
||||
OBSIDIAN_CONNECTOR = "OBSIDIAN_CONNECTOR"
|
||||
NOTE = "NOTE"
|
||||
COMPOSIO_CONNECTOR = "COMPOSIO_CONNECTOR" # Generic Composio integration
|
||||
COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"
|
||||
COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR"
|
||||
COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"
|
||||
|
||||
|
||||
class SearchSourceConnectorType(str, Enum):
|
||||
|
|
@ -86,9 +88,9 @@ class SearchSourceConnectorType(str, Enum):
|
|||
"OBSIDIAN_CONNECTOR" # Self-hosted only - Local Obsidian vault indexing
|
||||
)
|
||||
MCP_CONNECTOR = "MCP_CONNECTOR" # Model Context Protocol - User-defined API tools
|
||||
COMPOSIO_CONNECTOR = (
|
||||
"COMPOSIO_CONNECTOR" # Generic Composio integration (Google, Slack, etc.)
|
||||
)
|
||||
COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"
|
||||
COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR"
|
||||
COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"
|
||||
|
||||
|
||||
class LiteLLMProvider(str, Enum):
|
||||
|
|
@ -142,6 +144,43 @@ class LogStatus(str, Enum):
|
|||
FAILED = "FAILED"
|
||||
|
||||
|
||||
class IncentiveTaskType(str, Enum):
    """
    Enum for incentive task types that users can complete to earn free pages.
    Each task can only be completed once per user.

    Members are also ``str`` instances, so each member compares equal to
    its string value.

    When adding new tasks:
    1. Add a new enum value here
    2. Add the task configuration to INCENTIVE_TASKS_CONFIG below
    3. Create an Alembic migration to add the enum value to PostgreSQL
    """

    GITHUB_STAR = "GITHUB_STAR"
    # Future tasks can be added here:
    # GITHUB_ISSUE = "GITHUB_ISSUE"
    # SOCIAL_SHARE = "SOCIAL_SHARE"
    # REFER_FRIEND = "REFER_FRIEND"
|
||||
|
||||
|
||||
# Centralized configuration for incentive tasks
# This makes it easy to add new tasks without changing code in multiple places
# Keyed by IncentiveTaskType; each entry provides:
#   - "title": short task name
#   - "description": longer explanatory text
#   - "pages_reward": number of pages granted for the task
#   - "action_url": link associated with the task
INCENTIVE_TASKS_CONFIG = {
    IncentiveTaskType.GITHUB_STAR: {
        "title": "Star our GitHub repository",
        "description": "Show your support by starring SurfSense on GitHub",
        "pages_reward": 100,
        "action_url": "https://github.com/MODSetter/SurfSense",
    },
    # Future tasks can be configured here:
    # IncentiveTaskType.GITHUB_ISSUE: {
    #     "title": "Create an issue",
    #     "description": "Help improve SurfSense by reporting bugs or suggesting features",
    #     "pages_reward": 50,
    #     "action_url": "https://github.com/MODSetter/SurfSense/issues/new/choose",
    # },
}
|
||||
|
||||
|
||||
class Permission(str, Enum):
|
||||
"""
|
||||
Granular permissions for search space resources.
|
||||
|
|
@ -936,6 +975,39 @@ class Notification(BaseModel, TimestampMixin):
|
|||
search_space = relationship("SearchSpace", back_populates="notifications")
|
||||
|
||||
|
||||
class UserIncentiveTask(BaseModel, TimestampMixin):
    """
    Tracks completed incentive tasks for users.
    Each user can only complete each task type once.
    When a task is completed, the user's pages_limit is increased.

    The once-per-user rule is enforced at the database level by the
    ``uq_user_incentive_task`` unique constraint on (user_id, task_type).
    """

    __tablename__ = "user_incentive_tasks"
    __table_args__ = (
        # One row per (user, task type): a task cannot be recorded twice.
        UniqueConstraint(
            "user_id",
            "task_type",
            name="uq_user_incentive_task",
        ),
    )

    # Owning user; rows are removed when the user is deleted (ON DELETE CASCADE).
    user_id = Column(
        UUID(as_uuid=True),
        ForeignKey("user.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Which incentive task this row records.
    task_type = Column(SQLAlchemyEnum(IncentiveTaskType), nullable=False, index=True)
    # Pages granted for this task - presumably the task's reward at completion time.
    pages_awarded = Column(Integer, nullable=False)
    # Timezone-aware completion time; defaults to the current UTC time on insert.
    completed_at = Column(
        TIMESTAMP(timezone=True),
        nullable=False,
        default=lambda: datetime.now(UTC),
    )

    # Reverse side of User.incentive_tasks.
    user = relationship("User", back_populates="incentive_tasks")
|
||||
|
||||
|
||||
class SearchSpaceRole(BaseModel, TimestampMixin):
|
||||
"""
|
||||
Custom roles that can be defined per search space.
|
||||
|
|
@ -1114,6 +1186,13 @@ if config.AUTH_TYPE == "GOOGLE":
|
|||
cascade="all, delete-orphan",
|
||||
)
|
||||
|
||||
# Incentive tasks completed by this user
|
||||
incentive_tasks = relationship(
|
||||
"UserIncentiveTask",
|
||||
back_populates="user",
|
||||
cascade="all, delete-orphan",
|
||||
)
|
||||
|
||||
# Page usage tracking for ETL services
|
||||
pages_limit = Column(
|
||||
Integer,
|
||||
|
|
@ -1165,6 +1244,13 @@ else:
|
|||
cascade="all, delete-orphan",
|
||||
)
|
||||
|
||||
# Incentive tasks completed by this user
|
||||
incentive_tasks = relationship(
|
||||
"UserIncentiveTask",
|
||||
back_populates="user",
|
||||
cascade="all, delete-orphan",
|
||||
)
|
||||
|
||||
# Page usage tracking for ETL services
|
||||
pages_limit = Column(
|
||||
Integer,
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ from .google_drive_add_connector_route import (
|
|||
from .google_gmail_add_connector_route import (
|
||||
router as google_gmail_add_connector_router,
|
||||
)
|
||||
from .incentive_tasks_routes import router as incentive_tasks_router
|
||||
from .jira_add_connector_route import router as jira_add_connector_router
|
||||
from .linear_add_connector_route import router as linear_add_connector_router
|
||||
from .logs_routes import router as logs_router
|
||||
|
|
@ -69,3 +70,4 @@ router.include_router(surfsense_docs_router) # Surfsense documentation for cita
|
|||
router.include_router(notifications_router) # Notifications with Electric SQL sync
|
||||
router.include_router(composio_router) # Composio OAuth and toolkit management
|
||||
router.include_router(public_chat_router) # Public chat sharing and cloning
|
||||
router.include_router(incentive_tasks_router) # Incentive tasks for earning free pages
|
||||
|
|
|
|||
|
|
@ -8,16 +8,18 @@ Endpoints:
|
|||
- GET /composio/toolkits - List available Composio toolkits
|
||||
- GET /auth/composio/connector/add - Initiate OAuth for a specific toolkit
|
||||
- GET /auth/composio/connector/callback - Handle OAuth callback
|
||||
- GET /connectors/{connector_id}/composio-drive/folders - List folders/files for Composio Google Drive
|
||||
"""
|
||||
|
||||
import logging
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, Request
|
||||
from fastapi.responses import RedirectResponse
|
||||
from pydantic import ValidationError
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.future import select
|
||||
|
||||
from app.config import config
|
||||
from app.db import (
|
||||
|
|
@ -29,19 +31,31 @@ from app.db import (
|
|||
from app.services.composio_service import (
|
||||
COMPOSIO_TOOLKIT_NAMES,
|
||||
INDEXABLE_TOOLKITS,
|
||||
TOOLKIT_TO_CONNECTOR_TYPE,
|
||||
ComposioService,
|
||||
)
|
||||
from app.users import current_active_user
|
||||
from app.utils.connector_naming import (
|
||||
check_duplicate_connector,
|
||||
generate_unique_connector_name,
|
||||
count_connectors_of_type,
|
||||
get_base_name_for_type,
|
||||
)
|
||||
from app.utils.oauth_security import OAuthStateManager
|
||||
|
||||
# Note: We no longer use check_duplicate_connector for Composio connectors because
|
||||
# Composio generates a new connected_account_id each time, even for the same Google account.
|
||||
# Instead, we check for existing connectors by type/space/user and update them.
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Map toolkit_id to frontend connector ID
# Keys are Composio toolkit IDs; values are the matching "composio-" prefixed
# identifiers. NOTE(review): presumably these must match the connector IDs the
# frontend uses - confirm against the frontend code.
TOOLKIT_TO_FRONTEND_CONNECTOR_ID = {
    "googledrive": "composio-googledrive",
    "gmail": "composio-gmail",
    "googlecalendar": "composio-googlecalendar",
}
|
||||
|
||||
# Initialize security utilities
|
||||
_state_manager = None
|
||||
|
||||
|
|
@ -166,11 +180,8 @@ async def initiate_composio_auth(
|
|||
|
||||
@router.get("/auth/composio/connector/callback")
|
||||
async def composio_callback(
|
||||
request: Request,
|
||||
state: str | None = None,
|
||||
composio_connected_account_id: str | None = Query(
|
||||
None, alias="connectedAccountId"
|
||||
), # Composio sends camelCase
|
||||
connected_account_id: str | None = None, # Fallback snake_case
|
||||
error: str | None = None,
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
):
|
||||
|
|
@ -236,16 +247,17 @@ async def composio_callback(
|
|||
)
|
||||
|
||||
# Initialize Composio service
|
||||
ComposioService()
|
||||
service = ComposioService()
|
||||
|
||||
# Use camelCase param if provided (Composio's format), fallback to snake_case
|
||||
final_connected_account_id = (
|
||||
composio_connected_account_id or connected_account_id
|
||||
)
|
||||
# Extract connected_account_id from query params (accepts both camelCase and snake_case)
|
||||
query_params = request.query_params
|
||||
final_connected_account_id = query_params.get(
|
||||
"connectedAccountId"
|
||||
) or query_params.get("connected_account_id")
|
||||
|
||||
# DEBUG: Log all query parameters received
|
||||
# DEBUG: Log query parameter received
|
||||
logger.info(
|
||||
f"DEBUG: Callback received - connectedAccountId: {composio_connected_account_id}, connected_account_id: {connected_account_id}, using: {final_connected_account_id}"
|
||||
f"DEBUG: Callback received - connectedAccountId: {query_params.get('connectedAccountId')}, connected_account_id: {query_params.get('connected_account_id')}, using: {final_connected_account_id}"
|
||||
)
|
||||
|
||||
# If we still don't have a connected_account_id, warn but continue
|
||||
|
|
@ -268,38 +280,89 @@ async def composio_callback(
|
|||
"is_indexable": toolkit_id in INDEXABLE_TOOLKITS,
|
||||
}
|
||||
|
||||
# Check for duplicate connector
|
||||
# For Composio, we use toolkit_id + connected_account_id as unique identifier
|
||||
identifier = final_connected_account_id or f"{toolkit_id}_{user_id}"
|
||||
# Get the specific connector type for this toolkit
|
||||
connector_type_str = TOOLKIT_TO_CONNECTOR_TYPE.get(toolkit_id)
|
||||
if not connector_type_str:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unknown toolkit: {toolkit_id}. Available: {list(TOOLKIT_TO_CONNECTOR_TYPE.keys())}",
|
||||
)
|
||||
connector_type = SearchSourceConnectorType(connector_type_str)
|
||||
|
||||
is_duplicate = await check_duplicate_connector(
|
||||
session,
|
||||
SearchSourceConnectorType.COMPOSIO_CONNECTOR,
|
||||
space_id,
|
||||
user_id,
|
||||
identifier,
|
||||
# Check for existing connector of the same type for this user/space
|
||||
# When reconnecting, Composio gives a new connected_account_id, so we need to
|
||||
# check by connector_type, user_id, and search_space_id instead of connected_account_id
|
||||
existing_connector_result = await session.execute(
|
||||
select(SearchSourceConnector).where(
|
||||
SearchSourceConnector.connector_type == connector_type,
|
||||
SearchSourceConnector.search_space_id == space_id,
|
||||
SearchSourceConnector.user_id == user_id,
|
||||
)
|
||||
)
|
||||
if is_duplicate:
|
||||
logger.warning(
|
||||
f"Duplicate Composio connector detected for user {user_id} with toolkit {toolkit_id}"
|
||||
existing_connector = existing_connector_result.scalars().first()
|
||||
|
||||
if existing_connector:
|
||||
# Delete the old Composio connected account before updating
|
||||
old_connected_account_id = existing_connector.config.get(
|
||||
"composio_connected_account_id"
|
||||
)
|
||||
if (
|
||||
old_connected_account_id
|
||||
and old_connected_account_id != final_connected_account_id
|
||||
):
|
||||
try:
|
||||
deleted = await service.delete_connected_account(
|
||||
old_connected_account_id
|
||||
)
|
||||
if deleted:
|
||||
logger.info(
|
||||
f"Deleted old Composio connected account {old_connected_account_id} "
|
||||
f"before updating connector {existing_connector.id}"
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
f"Failed to delete old Composio connected account {old_connected_account_id}"
|
||||
)
|
||||
except Exception as delete_error:
|
||||
# Log but don't fail - the old account may already be deleted
|
||||
logger.warning(
|
||||
f"Error deleting old Composio connected account {old_connected_account_id}: {delete_error!s}"
|
||||
)
|
||||
|
||||
# Update existing connector with new connected_account_id
|
||||
logger.info(
|
||||
f"Updating existing Composio connector {existing_connector.id} with new connected_account_id {final_connected_account_id}"
|
||||
)
|
||||
existing_connector.config = connector_config
|
||||
await session.commit()
|
||||
await session.refresh(existing_connector)
|
||||
|
||||
# Get the frontend connector ID based on toolkit_id
|
||||
frontend_connector_id = TOOLKIT_TO_FRONTEND_CONNECTOR_ID.get(
|
||||
toolkit_id, "composio-connector"
|
||||
)
|
||||
return RedirectResponse(
|
||||
url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&error=duplicate_account&connector=composio-connector"
|
||||
url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector={frontend_connector_id}&connectorId={existing_connector.id}"
|
||||
)
|
||||
|
||||
try:
|
||||
# Generate a unique, user-friendly connector name
|
||||
connector_name = await generate_unique_connector_name(
|
||||
session,
|
||||
SearchSourceConnectorType.COMPOSIO_CONNECTOR,
|
||||
space_id,
|
||||
user_id,
|
||||
f"{toolkit_name} (Composio)",
|
||||
# Count existing connectors of this type to determine the number
|
||||
count = await count_connectors_of_type(
|
||||
session, connector_type, space_id, user_id
|
||||
)
|
||||
|
||||
# Generate base name (e.g., "Gmail", "Google Drive")
|
||||
base_name = get_base_name_for_type(connector_type)
|
||||
|
||||
# Format: "Gmail (Composio) 1", "Gmail (Composio) 2", etc.
|
||||
if count == 0:
|
||||
connector_name = f"{base_name} (Composio) 1"
|
||||
else:
|
||||
connector_name = f"{base_name} (Composio) {count + 1}"
|
||||
|
||||
db_connector = SearchSourceConnector(
|
||||
name=connector_name,
|
||||
connector_type=SearchSourceConnectorType.COMPOSIO_CONNECTOR,
|
||||
connector_type=connector_type,
|
||||
config=connector_config,
|
||||
search_space_id=space_id,
|
||||
user_id=user_id,
|
||||
|
|
@ -314,8 +377,12 @@ async def composio_callback(
|
|||
f"Successfully created Composio connector {db_connector.id} for user {user_id}, toolkit {toolkit_id}"
|
||||
)
|
||||
|
||||
# Get the frontend connector ID based on toolkit_id
|
||||
frontend_connector_id = TOOLKIT_TO_FRONTEND_CONNECTOR_ID.get(
|
||||
toolkit_id, "composio-connector"
|
||||
)
|
||||
return RedirectResponse(
|
||||
url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector=composio-connector&connectorId={db_connector.id}"
|
||||
url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/new-chat?modal=connectors&tab=all&success=true&connector={frontend_connector_id}&connectorId={db_connector.id}"
|
||||
)
|
||||
|
||||
except IntegrityError as e:
|
||||
|
|
@ -339,3 +406,136 @@ async def composio_callback(
|
|||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to complete Composio OAuth: {e!s}"
|
||||
) from e
|
||||
|
||||
|
||||
@router.get("/connectors/{connector_id}/composio-drive/folders")
async def list_composio_drive_folders(
    connector_id: int,
    parent_id: str | None = None,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Browse one level of the user's Google Drive through a Composio connector.

    Both folders and files directly under ``parent_id`` are returned so the
    manage connector page can render the full tree; the caller decides which
    entries are selectable (only folders are).

    Args:
        connector_id: ID of the Composio Google Drive connector.
        parent_id: Folder whose contents are listed; None lists the Drive root.

    Returns:
        ``{"items": [...]}`` where every item has the keys ``id``, ``name``,
        ``mimeType``, ``isFolder``, ``parents``, ``size`` and ``iconLink``.
        Folders come first, then files; each group is ordered by
        case-insensitive name. A single page of up to 100 entries is
        requested and the next-page token is discarded.

    Raises:
        HTTPException: 503 when Composio is disabled; 404 when no Composio
            Google Drive connector with this ID belongs to the user; 400 when
            the connector config has no ``composio_connected_account_id``;
            500 when Composio reports an error or anything else fails.
    """
    if not ComposioService.is_enabled():
        raise HTTPException(
            status_code=503,
            detail="Composio integration is not enabled.",
        )

    try:
        # Ownership and connector type are enforced by the query itself, so a
        # foreign or non-Drive connector is indistinguishable from a missing one.
        connector_query = select(SearchSourceConnector).filter(
            SearchSourceConnector.id == connector_id,
            SearchSourceConnector.user_id == user.id,
            SearchSourceConnector.connector_type
            == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
        )
        connector = (await session.execute(connector_query)).scalars().first()
        if not connector:
            raise HTTPException(
                status_code=404,
                detail="Composio Google Drive connector not found or access denied",
            )

        account_id = connector.config.get("composio_connected_account_id")
        if not account_id:
            raise HTTPException(
                status_code=400,
                detail="Composio connected account not found. Please reconnect the connector.",
            )

        composio = ComposioService()
        drive_entries, _next_token, error = await composio.get_drive_files(
            connected_account_id=account_id,
            entity_id=f"surfsense_{user.id}",
            folder_id=parent_id,
            page_size=100,
        )
        if error:
            logger.error(f"Failed to list Composio Drive files: {error}")
            raise HTTPException(
                status_code=500, detail=f"Failed to list folder contents: {error}"
            )

        # Normalise each entry; both camelCase and alternate key spellings
        # are accepted, and entries without any usable ID are dropped.
        items = []
        for entry in drive_entries:
            entry_id = entry.get("id", "") or entry.get("fileId", "")
            if not entry_id:
                continue

            mime_type = entry.get("mimeType", "") or entry.get("mime_type", "")
            items.append(
                {
                    "id": entry_id,
                    "name": entry.get("name", "")
                    or entry.get("fileName", "")
                    or "Untitled",
                    "mimeType": mime_type,
                    "isFolder": mime_type == "application/vnd.google-apps.folder",
                    "parents": entry.get("parents", []),
                    "size": entry.get("size"),
                    "iconLink": entry.get("iconLink"),
                }
            )

        # One stable sort: False < True puts folders ahead of files, and the
        # lowered name orders each group alphabetically.
        items.sort(key=lambda item: (not item["isFolder"], item["name"].lower()))

        folder_count = sum(1 for item in items if item["isFolder"])
        file_count = len(items) - folder_count
        location = f" in folder {parent_id}" if parent_id else " in ROOT"

        logger.info(
            f"Listed {len(items)} total items ({folder_count} folders, {file_count} files) for Composio connector {connector_id}"
            + location
        )

        return {"items": items}

    except HTTPException:
        # Already carries the intended status code; do not wrap it in a 500.
        raise
    except Exception as e:
        logger.error(f"Error listing Composio Drive contents: {e!s}", exc_info=True)
        raise HTTPException(
            status_code=500, detail=f"Failed to list Drive contents: {e!s}"
        ) from e
|
||||
|
|
|
|||
|
|
@ -402,7 +402,7 @@ async def list_google_drive_folders(
|
|||
file_count = len(items) - folder_count
|
||||
|
||||
logger.info(
|
||||
f"✅ Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}"
|
||||
f"Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}"
|
||||
+ (f" in folder {parent_id}" if parent_id else " in ROOT")
|
||||
)
|
||||
|
||||
|
|
|
|||
131
surfsense_backend/app/routes/incentive_tasks_routes.py
Normal file
131
surfsense_backend/app/routes/incentive_tasks_routes.py
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
"""
|
||||
Incentive Tasks API routes.
|
||||
Allows users to complete tasks (like starring GitHub repo) to earn free pages.
|
||||
Each task can only be completed once per user.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import (
|
||||
INCENTIVE_TASKS_CONFIG,
|
||||
IncentiveTaskType,
|
||||
User,
|
||||
UserIncentiveTask,
|
||||
get_async_session,
|
||||
)
|
||||
from app.schemas.incentive_tasks import (
|
||||
CompleteTaskResponse,
|
||||
IncentiveTaskInfo,
|
||||
IncentiveTasksResponse,
|
||||
TaskAlreadyCompletedResponse,
|
||||
)
|
||||
from app.users import current_active_user
|
||||
|
||||
router = APIRouter(prefix="/incentive-tasks", tags=["incentive-tasks"])
|
||||
|
||||
|
||||
@router.get("", response_model=IncentiveTasksResponse)
async def get_incentive_tasks(
    user: User = Depends(current_active_user),
    session: AsyncSession = Depends(get_async_session),
) -> IncentiveTasksResponse:
    """
    List every configured incentive task with this user's completion state.

    Only task types present in ``INCENTIVE_TASKS_CONFIG`` are listed, and
    only their completions count towards ``total_pages_earned``.

    Args:
        user: The authenticated user whose completions are looked up.
        session: Database session used for the lookup.

    Returns:
        The task list (one entry per configured task) and the sum of
        ``pages_awarded`` over the user's completed configured tasks.
    """
    completion_rows = await session.execute(
        select(UserIncentiveTask).where(UserIncentiveTask.user_id == user.id)
    )
    # Index the user's completion records by task type for O(1) lookup below.
    completed_by_type = {
        record.task_type: record for record in completion_rows.scalars().all()
    }

    task_infos = []
    pages_earned = 0

    for task_type, task_config in INCENTIVE_TASKS_CONFIG.items():
        completion = completed_by_type.get(task_type)

        if completion is None:
            finished_at = None
        else:
            finished_at = completion.completed_at
            pages_earned += completion.pages_awarded

        task_infos.append(
            IncentiveTaskInfo(
                task_type=task_type,
                title=task_config["title"],
                description=task_config["description"],
                pages_reward=task_config["pages_reward"],
                action_url=task_config["action_url"],
                completed=completion is not None,
                completed_at=finished_at,
            )
        )

    return IncentiveTasksResponse(
        tasks=task_infos,
        total_pages_earned=pages_earned,
    )
|
||||
|
||||
|
||||
@router.post(
    "/{task_type}/complete",
    response_model=CompleteTaskResponse | TaskAlreadyCompletedResponse,
)
async def complete_task(
    task_type: IncentiveTaskType,
    user: User = Depends(current_active_user),
    session: AsyncSession = Depends(get_async_session),
) -> CompleteTaskResponse | TaskAlreadyCompletedResponse:
    """
    Record completion of an incentive task and credit its page reward.

    A task is rewarded at most once per user: when a completion record
    already exists, it is reported back and nothing is changed.

    Args:
        task_type: The incentive task being completed.
        user: The authenticated user completing the task.
        session: Database session used to read and persist the completion.

    Returns:
        ``CompleteTaskResponse`` with the awarded pages and the new page
        limit on first completion, otherwise ``TaskAlreadyCompletedResponse``
        carrying the original completion time.

    Raises:
        HTTPException: 400 when ``task_type`` has no entry in
            ``INCENTIVE_TASKS_CONFIG``.
    """
    task_config = INCENTIVE_TASKS_CONFIG.get(task_type)
    if not task_config:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Unknown task type: {task_type}",
        )

    # Look for a prior completion by this user for this task type.
    prior_completion_query = select(UserIncentiveTask).where(
        UserIncentiveTask.user_id == user.id,
        UserIncentiveTask.task_type == task_type,
    )
    prior_completion = (
        await session.execute(prior_completion_query)
    ).scalar_one_or_none()

    if prior_completion:
        return TaskAlreadyCompletedResponse(
            success=False,
            message="Task already completed",
            completed_at=prior_completion.completed_at,
        )

    reward = task_config["pages_reward"]

    # The completion record and the raised page limit are committed together.
    session.add(
        UserIncentiveTask(
            user_id=user.id,
            task_type=task_type,
            pages_awarded=reward,
        )
    )
    user.pages_limit += reward

    await session.commit()
    # Reload the user so the response reports the persisted limit.
    await session.refresh(user)

    return CompleteTaskResponse(
        success=True,
        message=f"Task completed! You earned {reward} pages.",
        pages_awarded=reward,
        new_pages_limit=user.pages_limit,
    )
|
||||
|
|
@ -59,6 +59,58 @@ router = APIRouter()
|
|||
|
||||
# ============ Permissions Endpoints ============
|
||||
|
||||
# Human-readable descriptions keyed by permission value ("<category>:<action>",
# plus "*" for full access). list_all_permissions looks values up with .get()
# and falls back to "Permission for <value>" for any value missing here.
PERMISSION_DESCRIPTIONS: dict[str, str] = {
    # Documents
    "documents:create": "Add new documents, files, and content to the search space",
    "documents:read": "View and search documents in the search space",
    "documents:update": "Edit existing documents and their metadata",
    "documents:delete": "Remove documents from the search space",
    # Chats
    "chats:create": "Start new AI chat conversations",
    "chats:read": "View chat history and conversations",
    "chats:update": "Edit chat titles and settings",
    "chats:delete": "Delete chat conversations",
    # Comments
    "comments:create": "Add comments and annotations to documents",
    "comments:read": "View comments on documents",
    "comments:delete": "Remove comments from documents",
    # LLM Configs
    "llm_configs:create": "Add new AI model configurations",
    "llm_configs:read": "View AI model settings and configurations",
    "llm_configs:update": "Modify AI model configurations",
    "llm_configs:delete": "Remove AI model configurations",
    # Podcasts
    "podcasts:create": "Generate new AI podcasts from content",
    "podcasts:read": "Listen to and view generated podcasts",
    "podcasts:update": "Edit podcast settings and metadata",
    "podcasts:delete": "Remove generated podcasts",
    # Connectors
    "connectors:create": "Set up new data source integrations",
    "connectors:read": "View configured data sources and their status",
    "connectors:update": "Modify data source configurations",
    "connectors:delete": "Remove data source integrations",
    # Logs
    "logs:read": "View activity logs and audit trail",
    "logs:delete": "Clear activity logs",
    # Members
    "members:invite": "Send invitations to new team members",
    "members:view": "View the list of team members",
    "members:remove": "Remove members from the search space",
    "members:manage_roles": "Assign and change member roles",
    # Roles
    "roles:create": "Create new custom roles",
    "roles:read": "View available roles and their permissions",
    "roles:update": "Modify role permissions",
    "roles:delete": "Remove custom roles",
    # Settings
    "settings:view": "View search space settings",
    "settings:update": "Modify search space settings",
    "settings:delete": "Delete the entire search space",
    # Full access
    "*": "Full access to all features and settings",
}
|
||||
|
||||
|
||||
@router.get("/permissions", response_model=PermissionsListResponse)
|
||||
async def list_all_permissions(
|
||||
|
|
@ -71,12 +123,14 @@ async def list_all_permissions(
|
|||
for perm in Permission:
|
||||
# Extract category from permission value (e.g., "documents:read" -> "documents")
|
||||
category = perm.value.split(":")[0] if ":" in perm.value else "general"
|
||||
description = PERMISSION_DESCRIPTIONS.get(perm.value, f"Permission for {perm.value}")
|
||||
|
||||
permissions.append(
|
||||
PermissionInfo(
|
||||
value=perm.value,
|
||||
name=perm.name,
|
||||
category=category,
|
||||
description=description,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -22,6 +22,8 @@ import logging
|
|||
from datetime import UTC, datetime, timedelta
|
||||
from typing import Any
|
||||
|
||||
import pytz
|
||||
from dateutil.parser import isoparse
|
||||
from fastapi import APIRouter, Body, Depends, HTTPException, Query
|
||||
from pydantic import BaseModel, Field, ValidationError
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
|
|
@ -47,6 +49,7 @@ from app.schemas import (
|
|||
SearchSourceConnectorRead,
|
||||
SearchSourceConnectorUpdate,
|
||||
)
|
||||
from app.services.composio_service import ComposioService
|
||||
from app.services.notification_service import NotificationService
|
||||
from app.tasks.connector_indexers import (
|
||||
index_airtable_records,
|
||||
|
|
@ -529,6 +532,38 @@ async def delete_search_source_connector(
|
|||
f"Failed to delete periodic schedule for connector {connector_id}"
|
||||
)
|
||||
|
||||
# For Composio connectors, also delete the connected account in Composio
|
||||
composio_connector_types = [
|
||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
|
||||
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
|
||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
|
||||
]
|
||||
if db_connector.connector_type in composio_connector_types:
|
||||
composio_connected_account_id = db_connector.config.get(
|
||||
"composio_connected_account_id"
|
||||
)
|
||||
if composio_connected_account_id and ComposioService.is_enabled():
|
||||
try:
|
||||
service = ComposioService()
|
||||
deleted = await service.delete_connected_account(
|
||||
composio_connected_account_id
|
||||
)
|
||||
if deleted:
|
||||
logger.info(
|
||||
f"Successfully deleted Composio connected account {composio_connected_account_id} "
|
||||
f"for connector {connector_id}"
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
f"Failed to delete Composio connected account {composio_connected_account_id} "
|
||||
f"for connector {connector_id}"
|
||||
)
|
||||
except Exception as composio_error:
|
||||
# Log but don't fail the deletion - Composio account may already be deleted
|
||||
logger.warning(
|
||||
f"Error deleting Composio connected account {composio_connected_account_id}: {composio_error!s}"
|
||||
)
|
||||
|
||||
await session.delete(db_connector)
|
||||
await session.commit()
|
||||
return {"message": "Search source connector deleted successfully"}
|
||||
|
|
@ -611,32 +646,59 @@ async def index_connector_content(
|
|||
|
||||
# Handle different connector types
|
||||
response_message = ""
|
||||
today_str = datetime.now().strftime("%Y-%m-%d")
|
||||
# Use UTC for consistency with last_indexed_at storage
|
||||
today_str = datetime.now(UTC).strftime("%Y-%m-%d")
|
||||
|
||||
# Determine the actual date range to use
|
||||
if start_date is None:
|
||||
# Use last_indexed_at or default to 365 days ago
|
||||
if connector.last_indexed_at:
|
||||
today = datetime.now().date()
|
||||
if connector.last_indexed_at.date() == today:
|
||||
# If last indexed today, go back 1 day to ensure we don't miss anything
|
||||
indexing_from = (today - timedelta(days=1)).strftime("%Y-%m-%d")
|
||||
else:
|
||||
indexing_from = connector.last_indexed_at.strftime("%Y-%m-%d")
|
||||
else:
|
||||
indexing_from = (datetime.now() - timedelta(days=365)).strftime(
|
||||
"%Y-%m-%d"
|
||||
# Convert last_indexed_at to timezone-naive for comparison (like calculate_date_range does)
|
||||
last_indexed_naive = (
|
||||
connector.last_indexed_at.replace(tzinfo=None)
|
||||
if connector.last_indexed_at.tzinfo
|
||||
else connector.last_indexed_at
|
||||
)
|
||||
# Use UTC for "today" to match how last_indexed_at is stored
|
||||
today_utc = datetime.now(UTC).replace(tzinfo=None).date()
|
||||
last_indexed_date = last_indexed_naive.date()
|
||||
|
||||
if last_indexed_date == today_utc:
|
||||
# If last indexed today, go back 1 day to ensure we don't miss anything
|
||||
indexing_from = (today_utc - timedelta(days=1)).strftime("%Y-%m-%d")
|
||||
else:
|
||||
indexing_from = last_indexed_naive.strftime("%Y-%m-%d")
|
||||
else:
|
||||
indexing_from = (
|
||||
datetime.now(UTC).replace(tzinfo=None) - timedelta(days=365)
|
||||
).strftime("%Y-%m-%d")
|
||||
else:
|
||||
indexing_from = start_date
|
||||
|
||||
# For calendar connectors, default to today but allow future dates if explicitly provided
|
||||
if connector.connector_type in [
|
||||
SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
|
||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
|
||||
SearchSourceConnectorType.LUMA_CONNECTOR,
|
||||
]:
|
||||
# Default to today if no end_date provided (users can manually select future dates)
|
||||
indexing_to = today_str if end_date is None else end_date
|
||||
|
||||
# If start_date and end_date are the same, adjust end_date to be one day later
|
||||
# to ensure valid date range (start_date must be strictly before end_date)
|
||||
if indexing_from == indexing_to:
|
||||
dt = isoparse(indexing_to)
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=pytz.UTC)
|
||||
else:
|
||||
dt = dt.astimezone(pytz.UTC)
|
||||
# Add one day to end_date to make it strictly after start_date
|
||||
dt_end = dt + timedelta(days=1)
|
||||
indexing_to = dt_end.strftime("%Y-%m-%d")
|
||||
logger.info(
|
||||
f"Adjusted end_date from {end_date} to {indexing_to} "
|
||||
f"to ensure valid date range (start_date must be strictly before end_date)"
|
||||
)
|
||||
else:
|
||||
# For non-calendar connectors, cap at today
|
||||
indexing_to = end_date if end_date else today_str
|
||||
|
|
@ -887,11 +949,66 @@ async def index_connector_content(
|
|||
)
|
||||
response_message = "Obsidian vault indexing started in the background."
|
||||
|
||||
elif connector.connector_type == SearchSourceConnectorType.COMPOSIO_CONNECTOR:
|
||||
elif (
|
||||
connector.connector_type
|
||||
== SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
|
||||
):
|
||||
from app.tasks.celery_tasks.connector_tasks import (
|
||||
index_composio_connector_task,
|
||||
)
|
||||
|
||||
# For Composio Google Drive, if drive_items is provided, update connector config
|
||||
# This allows the UI to pass folder/file selection like the regular Google Drive connector
|
||||
if drive_items and drive_items.has_items():
|
||||
# Update connector config with the selected folders/files
|
||||
config = connector.config or {}
|
||||
config["selected_folders"] = [
|
||||
{"id": f.id, "name": f.name} for f in drive_items.folders
|
||||
]
|
||||
config["selected_files"] = [
|
||||
{"id": f.id, "name": f.name} for f in drive_items.files
|
||||
]
|
||||
if drive_items.indexing_options:
|
||||
config["indexing_options"] = {
|
||||
"max_files_per_folder": drive_items.indexing_options.max_files_per_folder,
|
||||
"incremental_sync": drive_items.indexing_options.incremental_sync,
|
||||
"include_subfolders": drive_items.indexing_options.include_subfolders,
|
||||
}
|
||||
connector.config = config
|
||||
from sqlalchemy.orm.attributes import flag_modified
|
||||
|
||||
flag_modified(connector, "config")
|
||||
await session.commit()
|
||||
await session.refresh(connector)
|
||||
|
||||
logger.info(
|
||||
f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id}, "
|
||||
f"folders: {len(drive_items.folders)}, files: {len(drive_items.files)}"
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
f"Triggering Composio Google Drive indexing for connector {connector_id} into search space {search_space_id} "
|
||||
f"using existing config (from {indexing_from} to {indexing_to})"
|
||||
)
|
||||
|
||||
index_composio_connector_task.delay(
|
||||
connector_id, search_space_id, str(user.id), indexing_from, indexing_to
|
||||
)
|
||||
response_message = (
|
||||
"Composio Google Drive indexing started in the background."
|
||||
)
|
||||
|
||||
elif connector.connector_type in [
|
||||
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
|
||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
|
||||
]:
|
||||
from app.tasks.celery_tasks.connector_tasks import (
|
||||
index_composio_connector_task,
|
||||
)
|
||||
|
||||
# For Composio Gmail and Calendar, use the same date calculation logic as normal connectors
|
||||
# This ensures consistent behavior and uses last_indexed_at to reduce API calls
|
||||
# (includes special case: if indexed today, go back 1 day to avoid missing data)
|
||||
logger.info(
|
||||
f"Triggering Composio connector indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
|
||||
)
|
||||
|
|
@ -943,7 +1060,9 @@ async def _update_connector_timestamp_by_id(session: AsyncSession, connector_id:
|
|||
connector = result.scalars().first()
|
||||
|
||||
if connector:
|
||||
connector.last_indexed_at = datetime.now()
|
||||
connector.last_indexed_at = datetime.now(
|
||||
UTC
|
||||
) # Use UTC for timezone consistency
|
||||
await session.commit()
|
||||
logger.info(f"Updated last_indexed_at for connector {connector_id}")
|
||||
except Exception as e:
|
||||
|
|
@ -1083,18 +1202,24 @@ async def _run_indexing_with_notifications(
|
|||
)
|
||||
|
||||
await update_timestamp_func(session, connector_id)
|
||||
await session.commit() # Commit timestamp update
|
||||
logger.info(
|
||||
f"Indexing completed successfully: {documents_processed} documents processed"
|
||||
)
|
||||
|
||||
# Update notification on success
|
||||
# Update notification on success (or partial success with errors)
|
||||
if notification:
|
||||
# Refresh notification to ensure it's not stale after timestamp update commit
|
||||
await session.refresh(notification)
|
||||
await NotificationService.connector_indexing.notify_indexing_completed(
|
||||
session=session,
|
||||
notification=notification,
|
||||
indexed_count=documents_processed,
|
||||
error_message=None,
|
||||
error_message=error_or_warning, # Show errors even if some documents were indexed
|
||||
)
|
||||
await (
|
||||
session.commit()
|
||||
) # Commit to ensure Electric SQL syncs the notification update
|
||||
elif documents_processed > 0:
|
||||
# Update notification to storing stage
|
||||
if notification:
|
||||
|
|
@ -1110,24 +1235,73 @@ async def _run_indexing_with_notifications(
|
|||
f"Indexing completed successfully: {documents_processed} documents processed"
|
||||
)
|
||||
if notification:
|
||||
# Refresh notification to ensure it's not stale after indexing function commits
|
||||
await session.refresh(notification)
|
||||
await NotificationService.connector_indexing.notify_indexing_completed(
|
||||
session=session,
|
||||
notification=notification,
|
||||
indexed_count=documents_processed,
|
||||
error_message=None,
|
||||
error_message=error_or_warning, # Show errors even if some documents were indexed
|
||||
)
|
||||
await (
|
||||
session.commit()
|
||||
) # Commit to ensure Electric SQL syncs the notification update
|
||||
else:
|
||||
# No new documents processed - check if this is an error or just no changes
|
||||
if error_or_warning:
|
||||
# Actual failure
|
||||
logger.error(f"Indexing failed: {error_or_warning}")
|
||||
if notification:
|
||||
await NotificationService.connector_indexing.notify_indexing_completed(
|
||||
session=session,
|
||||
notification=notification,
|
||||
indexed_count=0,
|
||||
error_message=error_or_warning,
|
||||
)
|
||||
# Check if this is a duplicate warning or empty result (success cases) or an actual error
|
||||
# Handle both normal and Composio calendar connectors
|
||||
error_or_warning_lower = (
|
||||
str(error_or_warning).lower() if error_or_warning else ""
|
||||
)
|
||||
is_duplicate_warning = "skipped (duplicate)" in error_or_warning_lower
|
||||
# "No X found" messages are success cases - sync worked, just found nothing in date range
|
||||
is_empty_result = (
|
||||
"no " in error_or_warning_lower
|
||||
and "found" in error_or_warning_lower
|
||||
)
|
||||
|
||||
if is_duplicate_warning or is_empty_result:
|
||||
# These are success cases - sync worked, just found nothing new
|
||||
logger.info(f"Indexing completed successfully: {error_or_warning}")
|
||||
# Still update timestamp so ElectricSQL syncs and clears "Syncing" UI
|
||||
if update_timestamp_func:
|
||||
await update_timestamp_func(session, connector_id)
|
||||
await session.commit() # Commit timestamp update
|
||||
if notification:
|
||||
# Refresh notification to ensure it's not stale after timestamp update commit
|
||||
await session.refresh(notification)
|
||||
# For empty results, use a cleaner message
|
||||
notification_message = (
|
||||
"No new items found in date range"
|
||||
if is_empty_result
|
||||
else error_or_warning
|
||||
)
|
||||
await NotificationService.connector_indexing.notify_indexing_completed(
|
||||
session=session,
|
||||
notification=notification,
|
||||
indexed_count=0,
|
||||
error_message=notification_message, # Pass as warning, not error
|
||||
is_warning=True, # Flag to indicate this is a warning, not an error
|
||||
)
|
||||
await (
|
||||
session.commit()
|
||||
) # Commit to ensure Electric SQL syncs the notification update
|
||||
else:
|
||||
# Actual failure
|
||||
logger.error(f"Indexing failed: {error_or_warning}")
|
||||
if notification:
|
||||
# Refresh notification to ensure it's not stale after indexing function commits
|
||||
await session.refresh(notification)
|
||||
await NotificationService.connector_indexing.notify_indexing_completed(
|
||||
session=session,
|
||||
notification=notification,
|
||||
indexed_count=0,
|
||||
error_message=error_or_warning,
|
||||
)
|
||||
await (
|
||||
session.commit()
|
||||
) # Commit to ensure Electric SQL syncs the notification update
|
||||
else:
|
||||
# Success - just no new documents to index (all skipped/unchanged)
|
||||
logger.info(
|
||||
|
|
@ -1136,13 +1310,19 @@ async def _run_indexing_with_notifications(
|
|||
# Still update timestamp so ElectricSQL syncs and clears "Syncing" UI
|
||||
if update_timestamp_func:
|
||||
await update_timestamp_func(session, connector_id)
|
||||
await session.commit() # Commit timestamp update
|
||||
if notification:
|
||||
# Refresh notification to ensure it's not stale after timestamp update commit
|
||||
await session.refresh(notification)
|
||||
await NotificationService.connector_indexing.notify_indexing_completed(
|
||||
session=session,
|
||||
notification=notification,
|
||||
indexed_count=0,
|
||||
error_message=None, # No error - sync succeeded
|
||||
)
|
||||
await (
|
||||
session.commit()
|
||||
) # Commit to ensure Electric SQL syncs the notification update
|
||||
except Exception as e:
|
||||
logger.error(f"Error in indexing task: {e!s}", exc_info=True)
|
||||
|
||||
|
|
@ -2157,6 +2337,59 @@ async def run_obsidian_indexing(
|
|||
)
|
||||
|
||||
|
||||
async def run_composio_indexing_with_new_session(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None,
    end_date: str | None,
) -> None:
    """
    Run the Composio indexing task inside a dedicated database session.

    A session is opened from ``async_session_maker`` for the duration of the
    call and released when the ``async with`` block exits, so the background
    task never holds on to a session owned by someone else.

    Args:
        connector_id: ID of the Composio connector to index.
        search_space_id: ID of the search space the connector belongs to.
        user_id: ID of the user that triggered the indexing.
        start_date: Start date for indexing, or None. Forwarded unchanged to
            ``run_composio_indexing``, which accepts ``str | None``.
        end_date: End date for indexing, or None. Forwarded unchanged.
    """
    async with async_session_maker() as session:
        # Keyword arguments keep this call correct even if the parameter
        # order of run_composio_indexing changes later.
        await run_composio_indexing(
            session=session,
            connector_id=connector_id,
            search_space_id=search_space_id,
            user_id=user_id,
            start_date=start_date,
            end_date=end_date,
        )
|
||||
|
||||
|
||||
async def run_composio_indexing(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None,
    end_date: str | None,
):
    """
    Index a Composio connector through the shared notification-aware runner.

    Delegates to ``_run_indexing_with_notifications``, supplying
    ``index_composio_connector`` as the indexing function and
    ``_update_connector_timestamp_by_id`` as the timestamp updater.

    Args:
        session: Database session used for the whole run.
        connector_id: ID of the Composio connector.
        search_space_id: ID of the search space.
        user_id: ID of the user.
        start_date: Start date for indexing (may be None).
        end_date: End date for indexing (may be None).
    """
    # Kept at function scope: the indexer module is only imported when this
    # task actually runs.
    from app.tasks.composio_indexer import (
        index_composio_connector as composio_indexer,
    )

    runner_kwargs = {
        "session": session,
        "connector_id": connector_id,
        "search_space_id": search_space_id,
        "user_id": user_id,
        "start_date": start_date,
        "end_date": end_date,
        "indexing_function": composio_indexer,
        "update_timestamp_func": _update_connector_timestamp_by_id,
    }
    await _run_indexing_with_notifications(**runner_kwargs)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MCP Connector Routes
|
||||
# =============================================================================
|
||||
|
|
|
|||
|
|
@ -129,6 +129,7 @@ async def read_search_spaces(
|
|||
result = await session.execute(
|
||||
select(SearchSpace)
|
||||
.filter(SearchSpace.user_id == user.id)
|
||||
.order_by(SearchSpace.id.asc())
|
||||
.offset(skip)
|
||||
.limit(limit)
|
||||
)
|
||||
|
|
@ -138,6 +139,7 @@ async def read_search_spaces(
|
|||
select(SearchSpace)
|
||||
.join(SearchSpaceMembership)
|
||||
.filter(SearchSpaceMembership.user_id == user.id)
|
||||
.order_by(SearchSpace.id.asc())
|
||||
.offset(skip)
|
||||
.limit(limit)
|
||||
)
|
||||
|
|
|
|||
61
surfsense_backend/app/schemas/incentive_tasks.py
Normal file
61
surfsense_backend/app/schemas/incentive_tasks.py
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
"""
|
||||
Schemas for incentive tasks API.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.db import INCENTIVE_TASKS_CONFIG, IncentiveTaskType
|
||||
|
||||
|
||||
class IncentiveTaskInfo(BaseModel):
    """Information about an available incentive task."""

    # Enum member identifying which incentive task this entry describes.
    task_type: IncentiveTaskType
    # Text fields for the task; presumably shown to the user - confirm against the route.
    title: str
    description: str
    # NOTE(review): looks like the number of pages granted for completing the task - confirm.
    pages_reward: int
    # NOTE(review): presumably the link the user follows to perform the task - confirm.
    action_url: str
    # Completion flag for this task.
    completed: bool
    # Optional timestamp; defaults to None when not supplied.
    completed_at: datetime | None = None
|
||||
|
||||
|
||||
class IncentiveTasksResponse(BaseModel):
    """Response containing all available incentive tasks with completion status."""

    # One IncentiveTaskInfo entry per task.
    tasks: list[IncentiveTaskInfo]
    # NOTE(review): presumably the sum of pages_reward over completed tasks - confirm in the route.
    total_pages_earned: int
|
||||
|
||||
|
||||
class CompleteTaskRequest(BaseModel):
    """Request to mark a task as completed."""

    # The incentive task to mark as completed; must be an IncentiveTaskType member.
    task_type: IncentiveTaskType
|
||||
|
||||
|
||||
class CompleteTaskResponse(BaseModel):
    """Response after completing a task."""

    # Outcome flag plus an accompanying message.
    success: bool
    message: str
    # NOTE(review): presumably the pages granted by this completion - confirm in the route.
    pages_awarded: int
    # NOTE(review): presumably the user's page limit after the award - confirm in the route.
    new_pages_limit: int
|
||||
|
||||
|
||||
class TaskAlreadyCompletedResponse(BaseModel):
    """Response when task was already completed."""

    # Outcome flag plus an accompanying message.
    success: bool
    message: str
    # Required timestamp (no default, unlike IncentiveTaskInfo.completed_at).
    completed_at: datetime
|
||||
|
||||
|
||||
def get_task_info(task_type: IncentiveTaskType) -> dict | None:
    """Look up the configuration entry for ``task_type``.

    Returns whatever ``INCENTIVE_TASKS_CONFIG`` holds for the given task
    type, or ``None`` when the type has no entry.
    """
    config_entry = INCENTIVE_TASKS_CONFIG.get(task_type)
    return config_entry
|
||||
|
||||
|
||||
def get_all_task_types() -> list[IncentiveTaskType]:
    """Return every task type that has an entry in ``INCENTIVE_TASKS_CONFIG``."""
    # Iterating a mapping yields its keys, so this is a fresh list of the keys.
    configured_types = list(INCENTIVE_TASKS_CONFIG)
    return configured_types
|
||||
|
|
@ -167,6 +167,7 @@ class PermissionInfo(BaseModel):
|
|||
value: str
|
||||
name: str
|
||||
category: str
|
||||
description: str
|
||||
|
||||
|
||||
class PermissionsListResponse(BaseModel):
|
||||
|
|
|
|||
|
|
@ -39,21 +39,73 @@ COMPOSIO_TOOLKIT_NAMES = {
|
|||
# Toolkits that support indexing (Phase 1: Google services only)
# The same three toolkit IDs are the keys of every TOOLKIT_TO_* mapping below.
INDEXABLE_TOOLKITS = {"googledrive", "gmail", "googlecalendar"}

# Mapping of toolkit IDs to connector types
TOOLKIT_TO_CONNECTOR_TYPE = {
    "googledrive": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
    "gmail": "COMPOSIO_GMAIL_CONNECTOR",
    "googlecalendar": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
}

# Mapping of toolkit IDs to document types
# The values are currently identical to those in TOOLKIT_TO_CONNECTOR_TYPE.
TOOLKIT_TO_DOCUMENT_TYPE = {
    "googledrive": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
    "gmail": "COMPOSIO_GMAIL_CONNECTOR",
    "googlecalendar": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
}

# Mapping of toolkit IDs to their indexer functions
# Format: toolkit_id -> (module_path, function_name, supports_date_filter)
# supports_date_filter: True if the indexer accepts start_date/end_date params
# NOTE(review): module and function are stored as strings, presumably so the
# indexer can be imported lazily by the dispatcher - confirm against the caller.
TOOLKIT_TO_INDEXER = {
    "googledrive": (
        "app.connectors.composio_google_drive_connector",
        "index_composio_google_drive",
        False,  # Google Drive doesn't use date filtering
    ),
    "gmail": (
        "app.connectors.composio_gmail_connector",
        "index_composio_gmail",
        True,  # Gmail uses date filtering
    ),
    "googlecalendar": (
        "app.connectors.composio_google_calendar_connector",
        "index_composio_google_calendar",
        True,  # Calendar uses date filtering
    ),
}
|
||||
|
||||
|
||||
class ComposioService:
|
||||
"""Service for interacting with Composio API."""
|
||||
|
||||
def __init__(self, api_key: str | None = None):
|
||||
# Default download directory for files from Composio
|
||||
DEFAULT_DOWNLOAD_DIR = "/tmp/composio_downloads"
|
||||
|
||||
def __init__(
    self, api_key: str | None = None, file_download_dir: str | None = None
):
    """
    Initialize the Composio service.

    Args:
        api_key: Composio API key. If not provided, uses config.COMPOSIO_API_KEY.
        file_download_dir: Directory for downloaded files. Defaults to
            ``self.DEFAULT_DOWNLOAD_DIR``.

    Raises:
        ValueError: If no API key is passed and none is configured.
    """
    import os

    self.api_key = api_key or config.COMPOSIO_API_KEY
    if not self.api_key:
        raise ValueError("COMPOSIO_API_KEY is required but not configured")

    # Resolve and create the download directory before building the client,
    # so the directory handed to Composio already exists.
    self.file_download_dir = file_download_dir or self.DEFAULT_DOWNLOAD_DIR
    os.makedirs(self.file_download_dir, exist_ok=True)

    # Build the client exactly once, with the download directory configured.
    # An earlier `Composio(api_key=...)` instance was created here only to be
    # overwritten immediately; that redundant construction has been removed.
    self.client = Composio(
        api_key=self.api_key, file_download_dir=self.file_download_dir
    )
|
||||
|
||||
@staticmethod
|
||||
def is_enabled() -> bool:
|
||||
|
|
@ -252,7 +304,6 @@ class ComposioService:
|
|||
}
|
||||
)
|
||||
|
||||
logger.info(f"DEBUG: Found {len(result)} TOTAL connections in Composio")
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list all connections: {e!s}")
|
||||
|
|
@ -269,7 +320,6 @@ class ComposioService:
|
|||
List of connected account details.
|
||||
"""
|
||||
try:
|
||||
logger.info(f"DEBUG: Calling connected_accounts.list(user_id='{user_id}')")
|
||||
accounts_response = self.client.connected_accounts.list(user_id=user_id)
|
||||
|
||||
# Handle paginated response (may have .items attribute) or direct list
|
||||
|
|
@ -312,6 +362,30 @@ class ComposioService:
|
|||
logger.error(f"Failed to list connections for user {user_id}: {e!s}")
|
||||
return []
|
||||
|
||||
async def delete_connected_account(self, connected_account_id: str) -> bool:
    """
    Remove a connected account from Composio.

    Calls ``connected_accounts.delete`` on the Composio client. Any exception
    raised along the way is logged and reported as a ``False`` result rather
    than propagated.

    Args:
        connected_account_id: The Composio connected account ID to delete.

    Returns:
        True if deletion was successful, False otherwise.
    """
    deleted = False
    try:
        self.client.connected_accounts.delete(connected_account_id)
        logger.info(
            f"Successfully deleted Composio connected account: {connected_account_id}"
        )
        deleted = True
    except Exception as exc:
        logger.error(
            f"Failed to delete Composio connected account {connected_account_id}: {exc!s}"
        )
    return deleted
|
||||
|
||||
async def execute_tool(
|
||||
self,
|
||||
connected_account_id: str,
|
||||
|
|
@ -338,7 +412,6 @@ class ComposioService:
|
|||
# - connected_account_id: for authentication
|
||||
# - user_id: user identifier (SDK uses user_id, not entity_id)
|
||||
# - dangerously_skip_version_check: skip version check for manual execution
|
||||
logger.info(f"DEBUG: Executing tool {tool_name} with params: {params}")
|
||||
result = self.client.tools.execute(
|
||||
slug=tool_name,
|
||||
connected_account_id=connected_account_id,
|
||||
|
|
@ -346,8 +419,6 @@ class ComposioService:
|
|||
arguments=params or {},
|
||||
dangerously_skip_version_check=True,
|
||||
)
|
||||
logger.info(f"DEBUG: Tool {tool_name} raw result type: {type(result)}")
|
||||
logger.info(f"DEBUG: Tool {tool_name} raw result: {result}")
|
||||
return {"success": True, "data": result}
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to execute tool {tool_name}: {e!s}")
|
||||
|
|
@ -382,7 +453,15 @@ class ComposioService:
|
|||
"page_size": min(page_size, 100),
|
||||
}
|
||||
if folder_id:
|
||||
params["folder_id"] = folder_id
|
||||
# List contents of a specific folder (exclude shortcuts - we don't have access to them)
|
||||
params["q"] = (
|
||||
f"'{folder_id}' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'"
|
||||
)
|
||||
else:
|
||||
# List root-level items only (My Drive root), exclude shortcuts
|
||||
params["q"] = (
|
||||
"'root' in parents and trashed = false and mimeType != 'application/vnd.google-apps.shortcut'"
|
||||
)
|
||||
if page_token:
|
||||
params["page_token"] = page_token
|
||||
|
||||
|
|
@ -397,9 +476,6 @@ class ComposioService:
|
|||
return [], None, result.get("error", "Unknown error")
|
||||
|
||||
data = result.get("data", {})
|
||||
logger.info(
|
||||
f"DEBUG: Drive data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}"
|
||||
)
|
||||
|
||||
# Handle nested response structure from Composio
|
||||
files = []
|
||||
|
|
@ -415,7 +491,6 @@ class ComposioService:
|
|||
elif isinstance(data, list):
|
||||
files = data
|
||||
|
||||
logger.info(f"DEBUG: Extracted {len(files)} drive files")
|
||||
return files, next_token, None
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -428,6 +503,10 @@ class ComposioService:
|
|||
"""
|
||||
Download file content from Google Drive via Composio.
|
||||
|
||||
Per Composio docs: When tools return files, they are automatically downloaded
|
||||
to a local directory, and the local file path is provided in the response.
|
||||
Response includes: file_path, file_name, size fields.
|
||||
|
||||
Args:
|
||||
connected_account_id: Composio connected account ID.
|
||||
entity_id: The entity/user ID that owns the connected account.
|
||||
|
|
@ -436,27 +515,264 @@ class ComposioService:
|
|||
Returns:
|
||||
Tuple of (file content bytes, error message).
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
result = await self.execute_tool(
|
||||
connected_account_id=connected_account_id,
|
||||
tool_name="GOOGLEDRIVE_DOWNLOAD_FILE",
|
||||
params={"file_id": file_id}, # snake_case
|
||||
params={"file_id": file_id},
|
||||
entity_id=entity_id,
|
||||
)
|
||||
|
||||
if not result.get("success"):
|
||||
return None, result.get("error", "Unknown error")
|
||||
|
||||
content = result.get("data")
|
||||
if isinstance(content, str):
|
||||
content = content.encode("utf-8")
|
||||
data = result.get("data")
|
||||
if not data:
|
||||
return None, "No data returned from Composio"
|
||||
|
||||
return content, None
|
||||
# Per Composio docs, response includes file_path where file was downloaded
|
||||
# Response structure: {data: {...}, error: ..., successful: ...}
|
||||
# The actual file info is nested inside data["data"]
|
||||
file_path = None
|
||||
|
||||
if isinstance(data, dict):
|
||||
# Handle nested response structure: data contains {data, error, successful}
|
||||
# The actual file info is in data["data"]
|
||||
inner_data = data
|
||||
if "data" in data and isinstance(data["data"], dict):
|
||||
inner_data = data["data"]
|
||||
logger.debug(
|
||||
f"Found nested data structure. Inner keys: {list(inner_data.keys())}"
|
||||
)
|
||||
elif "successful" in data and "data" in data:
|
||||
# Standard Composio response wrapper
|
||||
inner_data = data["data"] if data["data"] else data
|
||||
|
||||
# Try documented fields: file_path, downloaded_file_content, path, uri
|
||||
file_path = (
|
||||
inner_data.get("file_path")
|
||||
or inner_data.get("downloaded_file_content")
|
||||
or inner_data.get("path")
|
||||
or inner_data.get("uri")
|
||||
)
|
||||
|
||||
# Handle nested dict case where downloaded_file_content contains the path
|
||||
if isinstance(file_path, dict):
|
||||
file_path = (
|
||||
file_path.get("file_path")
|
||||
or file_path.get("downloaded_file_content")
|
||||
or file_path.get("path")
|
||||
or file_path.get("uri")
|
||||
)
|
||||
|
||||
# If still no path, check if inner_data itself has the nested structure
|
||||
if not file_path and isinstance(inner_data, dict):
|
||||
for key in ["downloaded_file_content", "file_path", "path", "uri"]:
|
||||
if key in inner_data:
|
||||
val = inner_data[key]
|
||||
if isinstance(val, str):
|
||||
file_path = val
|
||||
break
|
||||
elif isinstance(val, dict):
|
||||
# One more level of nesting
|
||||
file_path = (
|
||||
val.get("file_path")
|
||||
or val.get("downloaded_file_content")
|
||||
or val.get("path")
|
||||
or val.get("uri")
|
||||
)
|
||||
if file_path:
|
||||
break
|
||||
|
||||
logger.debug(
|
||||
f"Composio response keys: {list(data.keys())}, inner keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}, extracted path: {file_path}"
|
||||
)
|
||||
elif isinstance(data, str):
|
||||
# Direct string response (could be path or content)
|
||||
file_path = data
|
||||
elif isinstance(data, bytes):
|
||||
# Direct bytes response
|
||||
return data, None
|
||||
|
||||
# Read file from the path
|
||||
if file_path and isinstance(file_path, str):
|
||||
path_obj = Path(file_path)
|
||||
|
||||
# Check if it's a valid file path (absolute or in .composio directory)
|
||||
if path_obj.is_absolute() or ".composio" in str(path_obj):
|
||||
try:
|
||||
if path_obj.exists():
|
||||
content = path_obj.read_bytes()
|
||||
logger.info(
|
||||
f"Successfully read {len(content)} bytes from Composio file: {file_path}"
|
||||
)
|
||||
return content, None
|
||||
else:
|
||||
logger.warning(
|
||||
f"File path from Composio does not exist: {file_path}"
|
||||
)
|
||||
return None, f"File not found at path: {file_path}"
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to read file from Composio path {file_path}: {e!s}"
|
||||
)
|
||||
return None, f"Failed to read file: {e!s}"
|
||||
else:
|
||||
# Not a file path - might be base64 encoded content
|
||||
try:
|
||||
import base64
|
||||
|
||||
content = base64.b64decode(file_path)
|
||||
return content, None
|
||||
except Exception:
|
||||
# Not base64, return as UTF-8 bytes
|
||||
return file_path.encode("utf-8"), None
|
||||
|
||||
# If we got here, couldn't extract file path
|
||||
if isinstance(data, dict):
|
||||
# Log full structure for debugging
|
||||
inner_data = data.get("data", {})
|
||||
logger.warning(
|
||||
f"Could not extract file path from Composio response. "
|
||||
f"Top keys: {list(data.keys())}, "
|
||||
f"Inner data keys: {list(inner_data.keys()) if isinstance(inner_data, dict) else type(inner_data).__name__}, "
|
||||
f"Full inner data: {inner_data}"
|
||||
)
|
||||
return (
|
||||
None,
|
||||
f"No file path in Composio response. Keys: {list(data.keys())}, inner: {list(inner_data.keys()) if isinstance(inner_data, dict) else 'N/A'}",
|
||||
)
|
||||
|
||||
return None, f"Unexpected data type from Composio: {type(data).__name__}"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get Drive file content: {e!s}")
|
||||
return None, str(e)
|
||||
|
||||
async def get_drive_start_page_token(
|
||||
self, connected_account_id: str, entity_id: str
|
||||
) -> tuple[str | None, str | None]:
|
||||
"""
|
||||
Get the starting page token for Google Drive change tracking.
|
||||
|
||||
This token represents the current state and is used for future delta syncs.
|
||||
Per Composio docs: Use GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN to get initial token.
|
||||
|
||||
Args:
|
||||
connected_account_id: Composio connected account ID.
|
||||
entity_id: The entity/user ID that owns the connected account.
|
||||
|
||||
Returns:
|
||||
Tuple of (start_page_token, error message).
|
||||
"""
|
||||
try:
|
||||
result = await self.execute_tool(
|
||||
connected_account_id=connected_account_id,
|
||||
tool_name="GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN",
|
||||
params={},
|
||||
entity_id=entity_id,
|
||||
)
|
||||
|
||||
if not result.get("success"):
|
||||
return None, result.get("error", "Unknown error")
|
||||
|
||||
data = result.get("data", {})
|
||||
# Handle nested response: {data: {startPageToken: ...}, successful: ...}
|
||||
if isinstance(data, dict):
|
||||
inner_data = data.get("data", data)
|
||||
token = (
|
||||
inner_data.get("startPageToken")
|
||||
or inner_data.get("start_page_token")
|
||||
or data.get("startPageToken")
|
||||
or data.get("start_page_token")
|
||||
)
|
||||
if token:
|
||||
logger.info(f"Got Drive start page token: {token}")
|
||||
return token, None
|
||||
|
||||
logger.warning(f"Could not extract start page token from response: {data}")
|
||||
return None, "No start page token in response"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get Drive start page token: {e!s}")
|
||||
return None, str(e)
|
||||
|
||||
async def list_drive_changes(
|
||||
self,
|
||||
connected_account_id: str,
|
||||
entity_id: str,
|
||||
page_token: str | None = None,
|
||||
page_size: int = 100,
|
||||
include_removed: bool = True,
|
||||
) -> tuple[list[dict[str, Any]], str | None, str | None]:
|
||||
"""
|
||||
List changes in Google Drive since the given page token.
|
||||
|
||||
Per Composio docs: GOOGLEDRIVE_LIST_CHANGES tracks modifications to files/folders.
|
||||
If pageToken is not provided, it auto-fetches the current start page token.
|
||||
Response includes nextPageToken for pagination and newStartPageToken for future syncs.
|
||||
|
||||
Args:
|
||||
connected_account_id: Composio connected account ID.
|
||||
entity_id: The entity/user ID that owns the connected account.
|
||||
page_token: Page token from previous sync (optional - will auto-fetch if not provided).
|
||||
page_size: Number of changes per page.
|
||||
include_removed: Whether to include removed items in the response.
|
||||
|
||||
Returns:
|
||||
Tuple of (changes list, new_start_page_token, error message).
|
||||
"""
|
||||
try:
|
||||
params = {
|
||||
"pageSize": min(page_size, 100),
|
||||
"includeRemoved": include_removed,
|
||||
}
|
||||
if page_token:
|
||||
params["pageToken"] = page_token
|
||||
|
||||
result = await self.execute_tool(
|
||||
connected_account_id=connected_account_id,
|
||||
tool_name="GOOGLEDRIVE_LIST_CHANGES",
|
||||
params=params,
|
||||
entity_id=entity_id,
|
||||
)
|
||||
|
||||
if not result.get("success"):
|
||||
return [], None, result.get("error", "Unknown error")
|
||||
|
||||
data = result.get("data", {})
|
||||
|
||||
# Handle nested response structure
|
||||
changes = []
|
||||
new_start_token = None
|
||||
|
||||
if isinstance(data, dict):
|
||||
inner_data = data.get("data", data)
|
||||
changes = inner_data.get("changes", []) or data.get("changes", [])
|
||||
|
||||
# Get the token for next sync
|
||||
# newStartPageToken is returned when all changes have been fetched
|
||||
# nextPageToken is for pagination within the current fetch
|
||||
new_start_token = (
|
||||
inner_data.get("newStartPageToken")
|
||||
or inner_data.get("new_start_page_token")
|
||||
or inner_data.get("nextPageToken")
|
||||
or inner_data.get("next_page_token")
|
||||
or data.get("newStartPageToken")
|
||||
or data.get("nextPageToken")
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Got {len(changes)} Drive changes, new token: {new_start_token[:20] if new_start_token else 'None'}..."
|
||||
)
|
||||
return changes, new_start_token, None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list Drive changes: {e!s}")
|
||||
return [], None, str(e)
|
||||
|
||||
# ===== Gmail specific methods =====
|
||||
|
||||
async def get_gmail_messages(
|
||||
|
|
@ -464,25 +780,30 @@ class ComposioService:
|
|||
connected_account_id: str,
|
||||
entity_id: str,
|
||||
query: str = "",
|
||||
max_results: int = 100,
|
||||
) -> tuple[list[dict[str, Any]], str | None]:
|
||||
max_results: int = 50,
|
||||
page_token: str | None = None,
|
||||
) -> tuple[list[dict[str, Any]], str | None, int | None, str | None]:
|
||||
"""
|
||||
List Gmail messages via Composio.
|
||||
List Gmail messages via Composio with pagination support.
|
||||
|
||||
Args:
|
||||
connected_account_id: Composio connected account ID.
|
||||
entity_id: The entity/user ID that owns the connected account.
|
||||
query: Gmail search query.
|
||||
max_results: Maximum number of messages to return.
|
||||
max_results: Maximum number of messages to return per page (default: 50 to avoid payload size issues).
|
||||
page_token: Optional pagination token for next page.
|
||||
|
||||
Returns:
|
||||
Tuple of (messages list, error message).
|
||||
Tuple of (messages list, next_page_token, result_size_estimate, error message).
|
||||
"""
|
||||
try:
|
||||
# Composio uses snake_case for parameters, max is 500
|
||||
params = {"max_results": min(max_results, 500)}
|
||||
# Use smaller batch size to avoid 413 payload too large errors
|
||||
# Composio uses snake_case for parameters
|
||||
params = {"max_results": min(max_results, 50)} # Reduced from 500 to 50
|
||||
if query:
|
||||
params["query"] = query # Composio uses 'query' not 'q'
|
||||
if page_token:
|
||||
params["page_token"] = page_token
|
||||
|
||||
result = await self.execute_tool(
|
||||
connected_account_id=connected_account_id,
|
||||
|
|
@ -492,31 +813,42 @@ class ComposioService:
|
|||
)
|
||||
|
||||
if not result.get("success"):
|
||||
return [], result.get("error", "Unknown error")
|
||||
return [], None, result.get("error", "Unknown error")
|
||||
|
||||
data = result.get("data", {})
|
||||
logger.info(
|
||||
f"DEBUG: Gmail data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}"
|
||||
)
|
||||
logger.info(f"DEBUG: Gmail full data: {data}")
|
||||
|
||||
# Try different possible response structures
|
||||
messages = []
|
||||
next_token = None
|
||||
result_size_estimate = None
|
||||
if isinstance(data, dict):
|
||||
messages = (
|
||||
data.get("messages", [])
|
||||
or data.get("data", {}).get("messages", [])
|
||||
or data.get("emails", [])
|
||||
)
|
||||
# Check for pagination token in various possible locations
|
||||
next_token = (
|
||||
data.get("nextPageToken")
|
||||
or data.get("next_page_token")
|
||||
or data.get("data", {}).get("nextPageToken")
|
||||
or data.get("data", {}).get("next_page_token")
|
||||
)
|
||||
# Extract resultSizeEstimate if available (Gmail API provides this)
|
||||
result_size_estimate = (
|
||||
data.get("resultSizeEstimate")
|
||||
or data.get("result_size_estimate")
|
||||
or data.get("data", {}).get("resultSizeEstimate")
|
||||
or data.get("data", {}).get("result_size_estimate")
|
||||
)
|
||||
elif isinstance(data, list):
|
||||
messages = data
|
||||
|
||||
logger.info(f"DEBUG: Extracted {len(messages)} messages")
|
||||
return messages, None
|
||||
return messages, next_token, result_size_estimate, None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list Gmail messages: {e!s}")
|
||||
return [], str(e)
|
||||
return [], None, str(e)
|
||||
|
||||
async def get_gmail_message_detail(
|
||||
self, connected_account_id: str, entity_id: str, message_id: str
|
||||
|
|
@ -595,10 +927,6 @@ class ComposioService:
|
|||
return [], result.get("error", "Unknown error")
|
||||
|
||||
data = result.get("data", {})
|
||||
logger.info(
|
||||
f"DEBUG: Calendar data type: {type(data)}, keys: {data.keys() if isinstance(data, dict) else 'N/A'}"
|
||||
)
|
||||
logger.info(f"DEBUG: Calendar full data: {data}")
|
||||
|
||||
# Try different possible response structures
|
||||
events = []
|
||||
|
|
@ -611,7 +939,6 @@ class ComposioService:
|
|||
elif isinstance(data, list):
|
||||
events = data
|
||||
|
||||
logger.info(f"DEBUG: Extracted {len(events)} calendar events")
|
||||
return events, None
|
||||
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@ -2871,3 +2871,350 @@ class ConnectorService:
|
|||
}
|
||||
|
||||
return result_object, obsidian_docs
|
||||
|
||||
# =========================================================================
|
||||
# Composio Connector Search Methods
|
||||
# =========================================================================
|
||||
|
||||
async def search_composio_google_drive(
|
||||
self,
|
||||
user_query: str,
|
||||
search_space_id: int,
|
||||
top_k: int = 20,
|
||||
start_date: datetime | None = None,
|
||||
end_date: datetime | None = None,
|
||||
) -> tuple:
|
||||
"""
|
||||
Search for Composio Google Drive files and return both the source information
|
||||
and langchain documents.
|
||||
|
||||
Uses combined chunk-level and document-level hybrid search with RRF fusion.
|
||||
|
||||
Args:
|
||||
user_query: The user's query
|
||||
search_space_id: The search space ID to search in
|
||||
top_k: Maximum number of results to return
|
||||
start_date: Optional start date for filtering documents by updated_at
|
||||
end_date: Optional end date for filtering documents by updated_at
|
||||
|
||||
Returns:
|
||||
tuple: (sources_info, langchain_documents)
|
||||
"""
|
||||
composio_drive_docs = await self._combined_rrf_search(
|
||||
query_text=user_query,
|
||||
search_space_id=search_space_id,
|
||||
document_type="COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
|
||||
top_k=top_k,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
)
|
||||
|
||||
# Early return if no results
|
||||
if not composio_drive_docs:
|
||||
return {
|
||||
"id": 54,
|
||||
"name": "Google Drive (Composio)",
|
||||
"type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
|
||||
"sources": [],
|
||||
}, []
|
||||
|
||||
def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
|
||||
return (
|
||||
doc_info.get("title")
|
||||
or metadata.get("title")
|
||||
or metadata.get("file_name")
|
||||
or "Untitled Document"
|
||||
)
|
||||
|
||||
def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
|
||||
return metadata.get("url") or metadata.get("web_view_link") or ""
|
||||
|
||||
def _description_fn(
|
||||
chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
|
||||
) -> str:
|
||||
description = self._chunk_preview(chunk.get("content", ""), limit=200)
|
||||
info_parts = []
|
||||
mime_type = metadata.get("mime_type")
|
||||
modified_time = metadata.get("modified_time")
|
||||
if mime_type:
|
||||
info_parts.append(f"Type: {mime_type}")
|
||||
if modified_time:
|
||||
info_parts.append(f"Modified: {modified_time}")
|
||||
if info_parts:
|
||||
description = (description + " | " + " | ".join(info_parts)).strip(" |")
|
||||
return description
|
||||
|
||||
def _extra_fields_fn(
|
||||
_chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
return {
|
||||
"mime_type": metadata.get("mime_type", ""),
|
||||
"file_id": metadata.get("file_id", ""),
|
||||
"modified_time": metadata.get("modified_time", ""),
|
||||
}
|
||||
|
||||
sources_list = self._build_chunk_sources_from_documents(
|
||||
composio_drive_docs,
|
||||
title_fn=_title_fn,
|
||||
url_fn=_url_fn,
|
||||
description_fn=_description_fn,
|
||||
extra_fields_fn=_extra_fields_fn,
|
||||
)
|
||||
|
||||
# Create result object
|
||||
result_object = {
|
||||
"id": 54,
|
||||
"name": "Google Drive (Composio)",
|
||||
"type": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
|
||||
"sources": sources_list,
|
||||
}
|
||||
|
||||
return result_object, composio_drive_docs
|
||||
|
||||
async def search_composio_gmail(
|
||||
self,
|
||||
user_query: str,
|
||||
search_space_id: int,
|
||||
top_k: int = 20,
|
||||
start_date: datetime | None = None,
|
||||
end_date: datetime | None = None,
|
||||
) -> tuple:
|
||||
"""
|
||||
Search for Composio Gmail messages and return both the source information
|
||||
and langchain documents.
|
||||
|
||||
Uses combined chunk-level and document-level hybrid search with RRF fusion.
|
||||
|
||||
Args:
|
||||
user_query: The user's query
|
||||
search_space_id: The search space ID to search in
|
||||
top_k: Maximum number of results to return
|
||||
start_date: Optional start date for filtering documents by updated_at
|
||||
end_date: Optional end date for filtering documents by updated_at
|
||||
|
||||
Returns:
|
||||
tuple: (sources_info, langchain_documents)
|
||||
"""
|
||||
composio_gmail_docs = await self._combined_rrf_search(
|
||||
query_text=user_query,
|
||||
search_space_id=search_space_id,
|
||||
document_type="COMPOSIO_GMAIL_CONNECTOR",
|
||||
top_k=top_k,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
)
|
||||
|
||||
# Early return if no results
|
||||
if not composio_gmail_docs:
|
||||
return {
|
||||
"id": 55,
|
||||
"name": "Gmail (Composio)",
|
||||
"type": "COMPOSIO_GMAIL_CONNECTOR",
|
||||
"sources": [],
|
||||
}, []
|
||||
|
||||
def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
|
||||
return (
|
||||
doc_info.get("title")
|
||||
or metadata.get("subject")
|
||||
or metadata.get("title")
|
||||
or "Untitled Email"
|
||||
)
|
||||
|
||||
def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
|
||||
return metadata.get("url") or ""
|
||||
|
||||
def _description_fn(
|
||||
chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
|
||||
) -> str:
|
||||
description = self._chunk_preview(chunk.get("content", ""), limit=200)
|
||||
info_parts = []
|
||||
sender = metadata.get("from") or metadata.get("sender")
|
||||
date = metadata.get("date") or metadata.get("received_at")
|
||||
if sender:
|
||||
info_parts.append(f"From: {sender}")
|
||||
if date:
|
||||
info_parts.append(f"Date: {date}")
|
||||
if info_parts:
|
||||
description = (description + " | " + " | ".join(info_parts)).strip(" |")
|
||||
return description
|
||||
|
||||
def _extra_fields_fn(
|
||||
_chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
return {
|
||||
"message_id": metadata.get("message_id", ""),
|
||||
"thread_id": metadata.get("thread_id", ""),
|
||||
"from": metadata.get("from", ""),
|
||||
"to": metadata.get("to", ""),
|
||||
"date": metadata.get("date", ""),
|
||||
}
|
||||
|
||||
sources_list = self._build_chunk_sources_from_documents(
|
||||
composio_gmail_docs,
|
||||
title_fn=_title_fn,
|
||||
url_fn=_url_fn,
|
||||
description_fn=_description_fn,
|
||||
extra_fields_fn=_extra_fields_fn,
|
||||
)
|
||||
|
||||
# Create result object
|
||||
result_object = {
|
||||
"id": 55,
|
||||
"name": "Gmail (Composio)",
|
||||
"type": "COMPOSIO_GMAIL_CONNECTOR",
|
||||
"sources": sources_list,
|
||||
}
|
||||
|
||||
return result_object, composio_gmail_docs
|
||||
|
||||
async def search_composio_google_calendar(
|
||||
self,
|
||||
user_query: str,
|
||||
search_space_id: int,
|
||||
top_k: int = 20,
|
||||
start_date: datetime | None = None,
|
||||
end_date: datetime | None = None,
|
||||
) -> tuple:
|
||||
"""
|
||||
Search for Composio Google Calendar events and return both the source information
|
||||
and langchain documents.
|
||||
|
||||
Uses combined chunk-level and document-level hybrid search with RRF fusion.
|
||||
|
||||
Args:
|
||||
user_query: The user's query
|
||||
search_space_id: The search space ID to search in
|
||||
top_k: Maximum number of results to return
|
||||
start_date: Optional start date for filtering documents by updated_at
|
||||
end_date: Optional end date for filtering documents by updated_at
|
||||
|
||||
Returns:
|
||||
tuple: (sources_info, langchain_documents)
|
||||
"""
|
||||
composio_calendar_docs = await self._combined_rrf_search(
|
||||
query_text=user_query,
|
||||
search_space_id=search_space_id,
|
||||
document_type="COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
|
||||
top_k=top_k,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
)
|
||||
|
||||
# Early return if no results
|
||||
if not composio_calendar_docs:
|
||||
return {
|
||||
"id": 56,
|
||||
"name": "Google Calendar (Composio)",
|
||||
"type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
|
||||
"sources": [],
|
||||
}, []
|
||||
|
||||
def _title_fn(doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
|
||||
return (
|
||||
doc_info.get("title")
|
||||
or metadata.get("summary")
|
||||
or metadata.get("title")
|
||||
or "Untitled Event"
|
||||
)
|
||||
|
||||
def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
|
||||
return metadata.get("url") or metadata.get("html_link") or ""
|
||||
|
||||
def _description_fn(
|
||||
chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
|
||||
) -> str:
|
||||
description = self._chunk_preview(chunk.get("content", ""), limit=200)
|
||||
info_parts = []
|
||||
start_time = metadata.get("start_time") or metadata.get("start")
|
||||
end_time = metadata.get("end_time") or metadata.get("end")
|
||||
if start_time:
|
||||
info_parts.append(f"Start: {start_time}")
|
||||
if end_time:
|
||||
info_parts.append(f"End: {end_time}")
|
||||
if info_parts:
|
||||
description = (description + " | " + " | ".join(info_parts)).strip(" |")
|
||||
return description
|
||||
|
||||
def _extra_fields_fn(
|
||||
_chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
return {
|
||||
"event_id": metadata.get("event_id", ""),
|
||||
"calendar_id": metadata.get("calendar_id", ""),
|
||||
"start_time": metadata.get("start_time", ""),
|
||||
"end_time": metadata.get("end_time", ""),
|
||||
"location": metadata.get("location", ""),
|
||||
}
|
||||
|
||||
sources_list = self._build_chunk_sources_from_documents(
|
||||
composio_calendar_docs,
|
||||
title_fn=_title_fn,
|
||||
url_fn=_url_fn,
|
||||
description_fn=_description_fn,
|
||||
extra_fields_fn=_extra_fields_fn,
|
||||
)
|
||||
|
||||
# Create result object
|
||||
result_object = {
|
||||
"id": 56,
|
||||
"name": "Google Calendar (Composio)",
|
||||
"type": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
|
||||
"sources": sources_list,
|
||||
}
|
||||
|
||||
return result_object, composio_calendar_docs
|
||||
|
||||
# =========================================================================
|
||||
# Utility Methods for Connector Discovery
|
||||
# =========================================================================
|
||||
|
||||
async def get_available_connectors(
    self,
    search_space_id: int,
) -> list[SearchSourceConnectorType]:
    """
    List the distinct connector types that have a connector row in a search space.

    Args:
        search_space_id: The search space ID

    Returns:
        List of SearchSourceConnectorType values, without duplicates
    """
    # NOTE(review): the only filter applied is the search space; no separate
    # "enabled" flag is checked here.
    in_search_space = SearchSourceConnector.search_space_id == search_space_id
    statement = (
        select(SearchSourceConnector.connector_type).filter(in_search_space).distinct()
    )

    rows = await self.session.execute(statement)
    return list(rows.scalars().all())
|
||||
|
||||
async def get_available_document_types(
    self,
    search_space_id: int,
) -> list[str]:
    """
    List the document types present in a search space.

    Args:
        search_space_id: The search space ID

    Returns:
        One string per distinct document type that has at least one document
        in the search space
    """
    from sqlalchemy import distinct

    from app.db import Document

    in_search_space = Document.search_space_id == search_space_id
    statement = select(distinct(Document.document_type)).filter(in_search_space)

    rows = await self.session.execute(statement)
    # NOTE(review): str() is applied to whatever the column yields; if that is an
    # Enum member the string may be "DocumentType.X" rather than "X" - confirm.
    return list(map(str, rows.scalars().all()))
|
||||
|
|
|
|||
|
|
@ -335,6 +335,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
|
|||
notification: Notification,
|
||||
indexed_count: int,
|
||||
error_message: str | None = None,
|
||||
is_warning: bool = False,
|
||||
) -> Notification:
|
||||
"""
|
||||
Update notification when connector indexing completes.
|
||||
|
|
@ -343,7 +344,8 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
|
|||
session: Database session
|
||||
notification: Notification to update
|
||||
indexed_count: Total number of items indexed
|
||||
error_message: Error message if indexing failed (optional)
|
||||
error_message: Error message if indexing failed, or warning message (optional)
|
||||
is_warning: If True, treat error_message as a warning (success case) rather than an error
|
||||
|
||||
Returns:
|
||||
Updated notification
|
||||
|
|
@ -352,10 +354,26 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
|
|||
"connector_name", "Connector"
|
||||
)
|
||||
|
||||
# If there's an error message but items were indexed, treat it as a warning (partial success)
|
||||
# If is_warning is True, treat it as success even with 0 items (e.g., duplicates found)
|
||||
# Otherwise, treat it as a failure
|
||||
if error_message:
|
||||
title = f"Failed: {connector_name}"
|
||||
message = f"Sync failed: {error_message}"
|
||||
status = "failed"
|
||||
if indexed_count > 0:
|
||||
# Partial success with warnings (e.g., duplicate content from other connectors)
|
||||
title = f"Ready: {connector_name}"
|
||||
item_text = "item" if indexed_count == 1 else "items"
|
||||
message = f"Now searchable! {indexed_count} {item_text} synced. Note: {error_message}"
|
||||
status = "completed"
|
||||
elif is_warning:
|
||||
# Warning case (e.g., duplicates found) - treat as success
|
||||
title = f"Ready: {connector_name}"
|
||||
message = f"Sync completed. {error_message}"
|
||||
status = "completed"
|
||||
else:
|
||||
# Complete failure
|
||||
title = f"Failed: {connector_name}"
|
||||
message = f"Sync failed: {error_message}"
|
||||
status = "failed"
|
||||
else:
|
||||
title = f"Ready: {connector_name}"
|
||||
if indexed_count == 0:
|
||||
|
|
@ -367,7 +385,9 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
|
|||
|
||||
metadata_updates = {
|
||||
"indexed_count": indexed_count,
|
||||
"sync_stage": "completed" if not error_message else "failed",
|
||||
"sync_stage": "completed"
|
||||
if (not error_message or is_warning or indexed_count > 0)
|
||||
else "failed",
|
||||
"error_message": error_message,
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -810,8 +810,8 @@ def index_composio_connector_task(
|
|||
connector_id: int,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
start_date: str,
|
||||
end_date: str,
|
||||
start_date: str | None,
|
||||
end_date: str | None,
|
||||
):
|
||||
"""Celery task to index Composio connector content (Google Drive, Gmail, Calendar via Composio)."""
|
||||
import asyncio
|
||||
|
|
@ -833,14 +833,16 @@ async def _index_composio_connector(
|
|||
connector_id: int,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
start_date: str,
|
||||
end_date: str,
|
||||
start_date: str | None,
|
||||
end_date: str | None,
|
||||
):
|
||||
"""Index Composio connector content with new session."""
|
||||
# Import from tasks folder (not connector_indexers) to avoid circular import
|
||||
from app.tasks.composio_indexer import index_composio_connector
|
||||
"""Index Composio connector content with new session and real-time notifications."""
|
||||
# Import from routes to use the notification-wrapped version
|
||||
from app.routes.search_source_connectors_routes import (
|
||||
run_composio_indexing,
|
||||
)
|
||||
|
||||
async with get_celery_session_maker()() as session:
|
||||
await index_composio_connector(
|
||||
await run_composio_indexing(
|
||||
session, connector_id, search_space_id, user_id, start_date, end_date
|
||||
)
|
||||
|
|
|
|||
|
|
@ -66,6 +66,7 @@ async def _check_and_trigger_schedules():
|
|||
from app.tasks.celery_tasks.connector_tasks import (
|
||||
index_airtable_records_task,
|
||||
index_clickup_tasks_task,
|
||||
index_composio_connector_task,
|
||||
index_confluence_pages_task,
|
||||
index_crawled_urls_task,
|
||||
index_discord_messages_task,
|
||||
|
|
@ -98,6 +99,10 @@ async def _check_and_trigger_schedules():
|
|||
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
|
||||
SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task,
|
||||
# Composio connector types
|
||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: index_composio_connector_task,
|
||||
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: index_composio_connector_task,
|
||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: index_composio_connector_task,
|
||||
}
|
||||
|
||||
# Trigger indexing for each due connector
|
||||
|
|
|
|||
|
|
@ -54,21 +54,68 @@ def format_attachments_as_context(attachments: list[ChatAttachment]) -> str:
|
|||
|
||||
|
||||
def format_mentioned_documents_as_context(documents: list[Document]) -> str:
    """
    Format mentioned documents as context for the agent.

    Uses the same XML structure as knowledge_base.format_documents_for_context
    to ensure citations work properly with chunk IDs.

    Each document becomes a <document> element holding a <document_metadata>
    section (id, type, title, url, metadata JSON) and a <document_content>
    section with one <chunk id='...'> per chunk. Free-text values are wrapped
    in CDATA; any "]]>" inside them is split across two CDATA sections so the
    text cannot close the section early and inject markup.

    Args:
        documents: Documents the user explicitly mentioned. Their chunks are
            used when loaded and non-empty.

    Returns:
        The formatted context block, or "" when no documents are given.
    """
    import json  # function-scope import keeps this block self-contained

    if not documents:
        return ""

    def _cdata(value: object) -> str:
        # "]]>" is the CDATA terminator; splitting it keeps the text intact
        # while preventing document content from ending the section.
        text = str(value).replace("]]>", "]]]]><![CDATA[>")
        return f"<![CDATA[{text}]]>"

    context_parts = [
        "<mentioned_documents>",
        "The user has explicitly mentioned the following documents from their knowledge base. "
        "These documents are directly relevant to the query and should be prioritized as primary sources. "
        "Use [citation:CHUNK_ID] format for citations (e.g., [citation:123]).",
        "",
    ]

    for doc in documents:
        # Build metadata JSON
        metadata = doc.document_metadata or {}
        metadata_json = json.dumps(metadata, ensure_ascii=False)

        # Get URL from metadata
        url = (
            metadata.get("url")
            or metadata.get("source")
            or metadata.get("page_url")
            or ""
        )

        context_parts.extend(
            [
                "<document>",
                "<document_metadata>",
                f" <document_id>{doc.id}</document_id>",
                f" <document_type>{doc.document_type.value}</document_type>",
                f" <title>{_cdata(doc.title)}</title>",
                f" <url>{_cdata(url)}</url>",
                f" <metadata_json>{_cdata(metadata_json)}</metadata_json>",
                "</document_metadata>",
                "",
                "<document_content>",
            ]
        )

        # Use chunks if available (preferred for proper citations)
        chunks = getattr(doc, "chunks", None)
        if chunks:
            context_parts.extend(
                f" <chunk id='{chunk.id}'>{_cdata(chunk.content)}</chunk>"
                for chunk in chunks
            )
        else:
            # Fallback to document content if chunks not loaded.
            # NOTE(review): the document ID is used as the chunk ID here, which
            # could coincide with a real chunk ID - confirm citation handling.
            context_parts.append(
                f" <chunk id='{doc.id}'>{_cdata(doc.content)}</chunk>"
            )

        context_parts.extend(["</document_content>", "</document>", ""])

    context_parts.append("</mentioned_documents>")

    return "\n".join(context_parts)
|
||||
|
|
@ -81,8 +128,6 @@ def format_mentioned_surfsense_docs_as_context(
|
|||
if not documents:
|
||||
return ""
|
||||
|
||||
import json
|
||||
|
||||
context_parts = ["<mentioned_surfsense_docs>"]
|
||||
context_parts.append(
|
||||
"The user has explicitly mentioned the following SurfSense documentation pages. "
|
||||
|
|
@ -263,11 +308,15 @@ async def stream_new_chat(
|
|||
# Build input with message history from frontend
|
||||
langchain_messages = []
|
||||
|
||||
# Fetch mentioned documents if any
|
||||
# Fetch mentioned documents if any (with chunks for proper citations)
|
||||
mentioned_documents: list[Document] = []
|
||||
if mentioned_document_ids:
|
||||
from sqlalchemy.orm import selectinload as doc_selectinload
|
||||
|
||||
result = await session.execute(
|
||||
select(Document).filter(
|
||||
select(Document)
|
||||
.options(doc_selectinload(Document.chunks))
|
||||
.filter(
|
||||
Document.id.in_(mentioned_document_ids),
|
||||
Document.search_space_id == search_space_id,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -2,83 +2,76 @@
|
|||
Composio connector indexer.
|
||||
|
||||
Routes indexing requests to toolkit-specific handlers (Google Drive, Gmail, Calendar).
|
||||
Uses a registry pattern for clean, extensible connector routing.
|
||||
|
||||
Note: This module is intentionally placed in app/tasks/ (not in connector_indexers/)
|
||||
to avoid circular import issues with the connector_indexers package.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import UTC, datetime
|
||||
from importlib import import_module
|
||||
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.future import select
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.composio_connector import ComposioConnector
|
||||
from app.db import (
|
||||
Document,
|
||||
DocumentType,
|
||||
SearchSourceConnector,
|
||||
SearchSourceConnectorType,
|
||||
)
|
||||
from app.services.composio_service import INDEXABLE_TOOLKITS
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_INDEXER
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
|
||||
# Set up logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ============ Utility functions (copied from connector_indexers.base to avoid circular imports) ============
|
||||
# Valid Composio connector types
|
||||
COMPOSIO_CONNECTOR_TYPES = {
|
||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
|
||||
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
|
||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
|
||||
}
|
||||
|
||||
|
||||
def get_current_timestamp() -> datetime:
|
||||
"""Get the current timestamp with timezone for updated_at field."""
|
||||
return datetime.now(UTC)
|
||||
|
||||
|
||||
async def check_document_by_unique_identifier(
    session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
    """Look up a document by its unique identifier hash.

    The document's chunks are loaded in the same call via selectinload.

    Returns:
        The first matching Document, or None when no row matches.
    """
    statement = (
        select(Document)
        .options(selectinload(Document.chunks))
        .where(Document.unique_identifier_hash == unique_identifier_hash)
    )
    rows = await session.execute(statement)
    return rows.scalars().first()
|
||||
# ============ Utility functions ============
|
||||
|
||||
|
||||
async def get_connector_by_id(
    session: AsyncSession,
    connector_id: int,
    connector_type: SearchSourceConnectorType | None = None,
) -> SearchSourceConnector | None:
    """Get a connector by ID and optionally by type from the database.

    Args:
        session: Database session used to run the query
        connector_id: Primary key of the connector to fetch
        connector_type: When given, the connector must also have this type;
            when None (the default), any connector type is accepted

    Returns:
        The matching SearchSourceConnector, or None if there is no match
    """
    query = select(SearchSourceConnector).filter(
        SearchSourceConnector.id == connector_id
    )
    # Only narrow by type when the caller asked for a specific one
    if connector_type is not None:
        query = query.filter(SearchSourceConnector.connector_type == connector_type)
    result = await session.execute(query)
    return result.scalars().first()
|
||||
|
||||
|
||||
async def update_connector_last_indexed(
    session: AsyncSession,
    connector: SearchSourceConnector,
    update_last_indexed: bool = True,
) -> None:
    """Stamp ``connector.last_indexed_at`` with the current time.

    Does nothing when ``update_last_indexed`` is False. The session is not
    used here; the attribute is only set on the connector object.
    """
    if not update_last_indexed:
        return

    # NOTE(review): datetime.now() is naive local time, while updated_at values
    # elsewhere in this file are UTC-aware - confirm what the column expects.
    connector.last_indexed_at = datetime.now()
    logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")
|
||||
def get_indexer_function(toolkit_id: str):
    """
    Resolve the indexer registered for a toolkit by importing it on demand.

    Args:
        toolkit_id: The toolkit ID (e.g., "googledrive", "gmail")

    Returns:
        Tuple of (indexer_function, supports_date_filter)

    Raises:
        ValueError: If toolkit not found in registry
    """
    if toolkit_id in TOOLKIT_TO_INDEXER:
        # Registry entries are (module path, function name, date-filter flag)
        module_path, function_name, supports_date_filter = TOOLKIT_TO_INDEXER[toolkit_id]
        indexer = getattr(import_module(module_path), function_name)
        return indexer, supports_date_filter

    raise ValueError(f"No indexer registered for toolkit: {toolkit_id}")
|
||||
|
||||
|
||||
# ============ Main indexer function ============
|
||||
|
|
@ -98,6 +91,7 @@ async def index_composio_connector(
|
|||
Index content from a Composio connector.
|
||||
|
||||
Routes to toolkit-specific indexing based on the connector's toolkit_id.
|
||||
Uses a registry pattern for clean, extensible connector routing.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
|
|
@ -129,10 +123,16 @@ async def index_composio_connector(
|
|||
)
|
||||
|
||||
try:
|
||||
# Get connector by id
|
||||
connector = await get_connector_by_id(
|
||||
session, connector_id, SearchSourceConnectorType.COMPOSIO_CONNECTOR
|
||||
)
|
||||
# Get connector by id - accept any Composio connector type
|
||||
connector = await get_connector_by_id(session, connector_id, None)
|
||||
|
||||
# Validate it's a Composio connector
|
||||
if connector and connector.connector_type not in COMPOSIO_CONNECTOR_TYPES:
|
||||
error_msg = f"Connector {connector_id} is not a Composio connector"
|
||||
await task_logger.log_task_failure(
|
||||
log_entry, error_msg, {"error_type": "InvalidConnectorType"}
|
||||
)
|
||||
return 0, error_msg
|
||||
|
||||
if not connector:
|
||||
error_msg = f"Composio connector with ID {connector_id} not found"
|
||||
|
|
@ -160,53 +160,35 @@ async def index_composio_connector(
|
|||
)
|
||||
return 0, error_msg
|
||||
|
||||
# Route to toolkit-specific indexer
|
||||
if toolkit_id == "googledrive":
|
||||
return await _index_composio_google_drive(
|
||||
session=session,
|
||||
connector=connector,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
update_last_indexed=update_last_indexed,
|
||||
max_items=max_items,
|
||||
)
|
||||
elif toolkit_id == "gmail":
|
||||
return await _index_composio_gmail(
|
||||
session=session,
|
||||
connector=connector,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
update_last_indexed=update_last_indexed,
|
||||
max_items=max_items,
|
||||
)
|
||||
elif toolkit_id == "googlecalendar":
|
||||
return await _index_composio_google_calendar(
|
||||
session=session,
|
||||
connector=connector,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
update_last_indexed=update_last_indexed,
|
||||
max_items=max_items,
|
||||
)
|
||||
else:
|
||||
error_msg = f"No indexer implemented for toolkit: {toolkit_id}"
|
||||
# Get indexer function from registry
|
||||
try:
|
||||
indexer_func, supports_date_filter = get_indexer_function(toolkit_id)
|
||||
except ValueError as e:
|
||||
await task_logger.log_task_failure(
|
||||
log_entry, error_msg, {"error_type": "NoIndexerImplemented"}
|
||||
log_entry, str(e), {"error_type": "NoIndexerImplemented"}
|
||||
)
|
||||
return 0, error_msg
|
||||
return 0, str(e)
|
||||
|
||||
# Build kwargs for the indexer function
|
||||
kwargs = {
|
||||
"session": session,
|
||||
"connector": connector,
|
||||
"connector_id": connector_id,
|
||||
"search_space_id": search_space_id,
|
||||
"user_id": user_id,
|
||||
"task_logger": task_logger,
|
||||
"log_entry": log_entry,
|
||||
"update_last_indexed": update_last_indexed,
|
||||
"max_items": max_items,
|
||||
}
|
||||
|
||||
# Add date params for toolkits that support them
|
||||
if supports_date_filter:
|
||||
kwargs["start_date"] = start_date
|
||||
kwargs["end_date"] = end_date
|
||||
|
||||
# Call the toolkit-specific indexer
|
||||
return await indexer_func(**kwargs)
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
@ -228,714 +210,3 @@ async def index_composio_connector(
|
|||
)
|
||||
logger.error(f"Failed to index Composio connector: {e!s}", exc_info=True)
|
||||
return 0, f"Failed to index Composio connector: {e!s}"
|
||||
|
||||
|
||||
async def _index_composio_google_drive(
    session: AsyncSession,
    connector,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    task_logger: TaskLoggingService,
    log_entry,
    update_last_indexed: bool = True,
    max_items: int = 1000,
) -> tuple[int, str | None]:
    """Index Google Drive files via Composio.

    Lists Drive files page by page (up to ``max_items``), then for each
    non-folder file fetches its content, builds a summary, embedding and
    chunks, and creates or updates the matching Document. Files whose content
    hash is unchanged are skipped. A failure on one file is logged and counted
    as skipped; it does not abort the run.

    Args:
        session: Database session used for lookups, inserts and commits
        connector: Connector row whose last_indexed_at is updated on success
        connector_id: ID of the connector, passed to ComposioConnector
        search_space_id: Search space the documents belong to
        user_id: User whose long-context LLM is used for summaries
        task_logger: Task logging service for progress/success/failure entries
        log_entry: Log entry the task logger updates
        update_last_indexed: Whether to stamp the connector after indexing
        max_items: Maximum number of Drive files to list

    Returns:
        (documents_indexed, message). The message is None after a normal run
        that processed files, an informational string when no files were
        found, and an error string when listing failed or an unexpected
        exception occurred (in which case the count is 0).
    """
    try:
        composio_connector = ComposioConnector(session, connector_id)

        await task_logger.log_task_progress(
            log_entry,
            f"Fetching Google Drive files via Composio for connector {connector_id}",
            {"stage": "fetching_files"},
        )

        # Fetch files, one page at a time, until max_items or the last page
        all_files = []
        page_token = None

        while len(all_files) < max_items:
            files, next_token, error = await composio_connector.list_drive_files(
                page_token=page_token,
                page_size=min(100, max_items - len(all_files)),
            )

            if error:
                await task_logger.log_task_failure(
                    log_entry, f"Failed to fetch Drive files: {error}", {}
                )
                return 0, f"Failed to fetch Drive files: {error}"

            all_files.extend(files)

            if not next_token:
                break
            page_token = next_token

        if not all_files:
            success_msg = "No Google Drive files found"
            await task_logger.log_task_success(
                log_entry, success_msg, {"files_count": 0}
            )
            return 0, success_msg

        logger.info(f"Found {len(all_files)} Google Drive files to index via Composio")

        documents_indexed = 0
        documents_skipped = 0

        for file_info in all_files:
            try:
                # Handle both standard Google API and potential Composio variations
                file_id = file_info.get("id", "") or file_info.get("fileId", "")
                file_name = (
                    file_info.get("name", "")
                    or file_info.get("fileName", "")
                    or "Untitled"
                )
                mime_type = file_info.get("mimeType", "") or file_info.get(
                    "mime_type", ""
                )

                if not file_id:
                    documents_skipped += 1
                    continue

                # Skip folders (not counted as skipped documents)
                if mime_type == "application/vnd.google-apps.folder":
                    continue

                # Generate unique identifier hash
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.COMPOSIO_CONNECTOR, f"drive_{file_id}", search_space_id
                )

                # Check if document exists
                existing_document = await check_document_by_unique_identifier(
                    session, unique_identifier_hash
                )

                # Get file content
                (
                    content,
                    content_error,
                ) = await composio_connector.get_drive_file_content(file_id)

                if content_error or not content:
                    logger.warning(
                        f"Could not get content for file {file_name}: {content_error}"
                    )
                    # Use metadata as content fallback
                    markdown_content = f"# {file_name}\n\n"
                    markdown_content += f"**File ID:** {file_id}\n"
                    markdown_content += f"**Type:** {mime_type}\n"
                else:
                    try:
                        markdown_content = content.decode("utf-8")
                    except UnicodeDecodeError:
                        markdown_content = f"# {file_name}\n\n[Binary file content]\n"

                content_hash = generate_content_hash(markdown_content, search_space_id)

                # Unchanged content: nothing to re-index
                if existing_document and existing_document.content_hash == content_hash:
                    documents_skipped += 1
                    continue

                # Summary + embedding are built the same way for new and
                # updated documents.
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
                )

                if user_llm:
                    summary_metadata = {
                        "file_id": file_id,
                        "file_name": file_name,
                        "mime_type": mime_type,
                        "document_type": "Google Drive File (Composio)",
                    }
                    (
                        summary_content,
                        summary_embedding,
                    ) = await generate_document_summary(
                        markdown_content, user_llm, summary_metadata
                    )
                else:
                    # No LLM available: fall back to a minimal summary
                    summary_content = (
                        f"Google Drive File: {file_name}\n\nType: {mime_type}"
                    )
                    summary_embedding = config.embedding_model_instance.embed(
                        summary_content
                    )

                chunks = await create_document_chunks(markdown_content)

                # One metadata shape for both paths, so an updated document
                # keeps "toolkit_id" just like a newly created one.
                document_metadata = {
                    "file_id": file_id,
                    "file_name": file_name,
                    "mime_type": mime_type,
                    "connector_id": connector_id,
                    "toolkit_id": "googledrive",
                    "source": "composio",
                }

                if existing_document:
                    # Update existing document
                    existing_document.title = f"Drive: {file_name}"
                    existing_document.content = summary_content
                    existing_document.content_hash = content_hash
                    existing_document.embedding = summary_embedding
                    existing_document.document_metadata = document_metadata
                    existing_document.chunks = chunks
                    existing_document.updated_at = get_current_timestamp()
                else:
                    # Create new document
                    document = Document(
                        search_space_id=search_space_id,
                        title=f"Drive: {file_name}",
                        document_type=DocumentType.COMPOSIO_CONNECTOR,
                        document_metadata=document_metadata,
                        content=summary_content,
                        content_hash=content_hash,
                        unique_identifier_hash=unique_identifier_hash,
                        embedding=summary_embedding,
                        chunks=chunks,
                        updated_at=get_current_timestamp(),
                    )
                    session.add(document)

                documents_indexed += 1

                # Commit in batches of 10, for updates as well as inserts
                if documents_indexed % 10 == 0:
                    await session.commit()

            except Exception as e:
                # Best effort: one bad file must not abort the whole run
                logger.error(f"Error processing Drive file: {e!s}", exc_info=True)
                documents_skipped += 1
                continue

        if documents_indexed > 0:
            await update_connector_last_indexed(session, connector, update_last_indexed)

        await session.commit()

        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed Google Drive indexing via Composio for connector {connector_id}",
            {
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
            },
        )

        return documents_indexed, None

    except Exception as e:
        # NOTE(review): this path neither rolls the session back nor records a
        # task failure on log_entry - confirm the caller handles both.
        logger.error(f"Failed to index Google Drive via Composio: {e!s}", exc_info=True)
        return 0, f"Failed to index Google Drive via Composio: {e!s}"
|
||||
|
||||
|
||||
async def _index_composio_gmail(
    session: AsyncSession,
    connector,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None,
    end_date: str | None,
    task_logger: TaskLoggingService,
    log_entry,
    update_last_indexed: bool = True,
    max_items: int = 1000,
) -> tuple[int, str | None]:
    """Index Gmail messages fetched through Composio into the search space.

    Each message is rendered to markdown, hashed, and either skipped (content
    unchanged), updated in place (content changed), or stored as a new
    ``Document``.

    Args:
        session: Database session used for lookups, inserts and commits.
        connector: Connector row passed to ``update_connector_last_indexed``.
        connector_id: ID of the connector; also stored in document metadata.
        search_space_id: Search space that owns the indexed documents.
        user_id: User whose long-context LLM (if any) produces the summaries.
        start_date: Optional lower bound; ``-`` is replaced by ``/`` and the
            value is sent as an ``after:`` query term.
        end_date: Optional upper bound, sent as a ``before:`` query term.
        task_logger: Service used to record progress, success and failure.
        log_entry: Log entry the task logger appends to.
        update_last_indexed: Forwarded to ``update_connector_last_indexed``.
        max_items: Maximum number of messages requested from Composio.

    Returns:
        ``(documents_indexed, None)`` after a completed run, or ``(0, message)``
        when fetching fails, no messages are found, or an unexpected exception
        aborts the run.
    """
    try:
        composio_connector = ComposioConnector(session, connector_id)

        await task_logger.log_task_progress(
            log_entry,
            f"Fetching Gmail messages via Composio for connector {connector_id}",
            {"stage": "fetching_messages"},
        )

        # Build query with date range
        query_parts = []
        if start_date:
            query_parts.append(f"after:{start_date.replace('-', '/')}")
        if end_date:
            query_parts.append(f"before:{end_date.replace('-', '/')}")
        query = " ".join(query_parts)

        messages, error = await composio_connector.list_gmail_messages(
            query=query,
            max_results=max_items,
        )

        if error:
            failure_msg = f"Failed to fetch Gmail messages: {error}"
            # Pass the raw error as details and the metadata dict last, matching
            # the four-argument form the native Google indexers use.
            await task_logger.log_task_failure(
                log_entry, failure_msg, str(error), {"error_type": "APIError"}
            )
            return 0, failure_msg

        if not messages:
            success_msg = "No Gmail messages found in the specified date range"
            await task_logger.log_task_success(
                log_entry, success_msg, {"messages_count": 0}
            )
            return 0, success_msg

        logger.info(f"Found {len(messages)} Gmail messages to index via Composio")

        documents_indexed = 0
        documents_skipped = 0

        for message in messages:
            try:
                # Composio uses 'messageId' (camelCase), not 'id'
                message_id = message.get("messageId", "") or message.get("id", "")
                if not message_id:
                    documents_skipped += 1
                    continue

                # Composio's GMAIL_FETCH_EMAILS already returns full message
                # content, so no separate detail API call is needed.
                # Composio structure: messageId, messageText, messageTimestamp,
                # payload.headers, labelIds
                # `or` guards against keys that are present but set to None.
                payload = message.get("payload") or {}
                headers = payload.get("headers") or []

                subject = "No Subject"
                sender = "Unknown Sender"
                date_str = message.get("messageTimestamp", "Unknown Date")

                for header in headers:
                    name = header.get("name", "").lower()
                    value = header.get("value", "")
                    if name == "subject":
                        subject = value
                    elif name == "from":
                        sender = value
                    elif name == "date":
                        date_str = value

                # Format to markdown using the full message data
                markdown_content = composio_connector.format_gmail_message_to_markdown(
                    message
                )

                # Generate unique identifier
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.COMPOSIO_CONNECTOR,
                    f"gmail_{message_id}",
                    search_space_id,
                )

                content_hash = generate_content_hash(markdown_content, search_space_id)

                existing_document = await check_document_by_unique_identifier(
                    session, unique_identifier_hash
                )

                # Get label IDs from Composio response
                label_ids = message.get("labelIds", [])

                # Unchanged content: nothing to re-summarize or re-embed.
                if existing_document and existing_document.content_hash == content_hash:
                    documents_skipped += 1
                    continue

                # Summary, embedding and chunks are built the same way for an
                # update and for a brand-new document.
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
                )

                if user_llm:
                    summary_metadata = {
                        "message_id": message_id,
                        "subject": subject,
                        "sender": sender,
                        "document_type": "Gmail Message (Composio)",
                    }
                    (
                        summary_content,
                        summary_embedding,
                    ) = await generate_document_summary(
                        markdown_content, user_llm, summary_metadata
                    )
                else:
                    # No LLM configured: fall back to a minimal header summary.
                    summary_content = (
                        f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
                    )
                    summary_embedding = config.embedding_model_instance.embed(
                        summary_content
                    )

                chunks = await create_document_chunks(markdown_content)

                title = f"Gmail: {subject}"
                # One metadata shape for both paths so an update no longer
                # drops "toolkit_id".
                stored_metadata = {
                    "message_id": message_id,
                    "subject": subject,
                    "sender": sender,
                    "date": date_str,
                    "labels": label_ids,
                    "connector_id": connector_id,
                    "toolkit_id": "gmail",
                    "source": "composio",
                }

                if existing_document:
                    # Update existing
                    existing_document.title = title
                    existing_document.content = summary_content
                    existing_document.content_hash = content_hash
                    existing_document.embedding = summary_embedding
                    existing_document.document_metadata = stored_metadata
                    existing_document.chunks = chunks
                    existing_document.updated_at = get_current_timestamp()
                else:
                    # Create new document
                    document = Document(
                        search_space_id=search_space_id,
                        title=title,
                        document_type=DocumentType.COMPOSIO_CONNECTOR,
                        document_metadata=stored_metadata,
                        content=summary_content,
                        content_hash=content_hash,
                        unique_identifier_hash=unique_identifier_hash,
                        embedding=summary_embedding,
                        chunks=chunks,
                        updated_at=get_current_timestamp(),
                    )
                    session.add(document)

                documents_indexed += 1

                # Commit in batches of 10 (creates and updates alike) to bound
                # the amount of uncommitted work.
                if documents_indexed % 10 == 0:
                    await session.commit()

            except Exception as e:
                # Best-effort per message: log, count as skipped, keep going.
                # NOTE(review): the session is not rolled back here; confirm a
                # failed flush cannot poison the following iterations.
                logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
                documents_skipped += 1
                continue

        if documents_indexed > 0:
            await update_connector_last_indexed(session, connector, update_last_indexed)

        await session.commit()

        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed Gmail indexing via Composio for connector {connector_id}",
            {
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
            },
        )

        return documents_indexed, None

    except Exception as e:
        logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True)
        return 0, f"Failed to index Gmail via Composio: {e!s}"
|
||||
|
||||
|
||||
async def _index_composio_google_calendar(
    session: AsyncSession,
    connector,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None,
    end_date: str | None,
    task_logger: TaskLoggingService,
    log_entry,
    update_last_indexed: bool = True,
    max_items: int = 2500,
) -> tuple[int, str | None]:
    """Index Google Calendar events fetched through Composio.

    Each event is rendered to markdown, hashed, and either skipped (content
    unchanged), updated in place (content changed), or stored as a new
    ``Document``.

    Args:
        session: Database session used for lookups, inserts and commits.
        connector: Connector row passed to ``update_connector_last_indexed``.
        connector_id: ID of the connector; also stored in document metadata.
        search_space_id: Search space that owns the indexed documents.
        user_id: User whose long-context LLM (if any) produces the summaries.
        start_date: Optional ``YYYY-MM-DD`` lower bound; defaults to 365 days
            before the current UTC date.
        end_date: Optional ``YYYY-MM-DD`` upper bound; defaults to the current
            UTC date.
        task_logger: Service used to record progress, success and failure.
        log_entry: Log entry the task logger appends to.
        update_last_indexed: Forwarded to ``update_connector_last_indexed``.
        max_items: Maximum number of events requested from Composio.

    Returns:
        ``(documents_indexed, None)`` after a completed run, or ``(0, message)``
        when fetching fails, no events are found, or an unexpected exception
        aborts the run.
    """
    from datetime import UTC, datetime, timedelta

    try:
        composio_connector = ComposioConnector(session, connector_id)

        await task_logger.log_task_progress(
            log_entry,
            f"Fetching Google Calendar events via Composio for connector {connector_id}",
            {"stage": "fetching_events"},
        )

        # Build time range. The bounds carry a "Z" (UTC) suffix, so the
        # defaults must be derived from UTC "now", not naive local time.
        now_utc = datetime.now(UTC)

        if start_date:
            time_min = f"{start_date}T00:00:00Z"
        else:
            # Default to 365 days ago
            default_start = now_utc - timedelta(days=365)
            time_min = default_start.strftime("%Y-%m-%dT00:00:00Z")

        if end_date:
            time_max = f"{end_date}T23:59:59Z"
        else:
            time_max = now_utc.strftime("%Y-%m-%dT23:59:59Z")

        events, error = await composio_connector.list_calendar_events(
            time_min=time_min,
            time_max=time_max,
            max_results=max_items,
        )

        if error:
            failure_msg = f"Failed to fetch Calendar events: {error}"
            # Pass the raw error as details and the metadata dict last, matching
            # the four-argument form the native Google indexers use.
            await task_logger.log_task_failure(
                log_entry, failure_msg, str(error), {"error_type": "APIError"}
            )
            return 0, failure_msg

        if not events:
            success_msg = "No Google Calendar events found in the specified date range"
            await task_logger.log_task_success(
                log_entry, success_msg, {"events_count": 0}
            )
            return 0, success_msg

        logger.info(f"Found {len(events)} Google Calendar events to index via Composio")

        documents_indexed = 0
        documents_skipped = 0

        for event in events:
            try:
                # Handle both standard Google API and potential Composio variations
                event_id = event.get("id", "") or event.get("eventId", "")
                summary = (
                    event.get("summary", "") or event.get("title", "") or "No Title"
                )

                if not event_id:
                    documents_skipped += 1
                    continue

                # Format to markdown
                markdown_content = composio_connector.format_calendar_event_to_markdown(
                    event
                )

                # Generate unique identifier
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.COMPOSIO_CONNECTOR,
                    f"calendar_{event_id}",
                    search_space_id,
                )

                content_hash = generate_content_hash(markdown_content, search_space_id)

                existing_document = await check_document_by_unique_identifier(
                    session, unique_identifier_hash
                )

                # Extract event times; `or {}` guards against keys that are
                # present but set to None.
                start = event.get("start") or {}
                end = event.get("end") or {}
                start_time = start.get("dateTime") or start.get("date", "")
                end_time = end.get("dateTime") or end.get("date", "")
                location = event.get("location", "")

                # Unchanged content: nothing to re-summarize or re-embed.
                if existing_document and existing_document.content_hash == content_hash:
                    documents_skipped += 1
                    continue

                # Summary, embedding and chunks are built the same way for an
                # update and for a brand-new document.
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
                )

                if user_llm:
                    summary_metadata = {
                        "event_id": event_id,
                        "summary": summary,
                        "start_time": start_time,
                        "document_type": "Google Calendar Event (Composio)",
                    }
                    (
                        summary_content,
                        summary_embedding,
                    ) = await generate_document_summary(
                        markdown_content, user_llm, summary_metadata
                    )
                else:
                    # No LLM configured: fall back to a minimal header summary.
                    summary_content = (
                        f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
                    )
                    if location:
                        summary_content += f"\nLocation: {location}"
                    summary_embedding = config.embedding_model_instance.embed(
                        summary_content
                    )

                chunks = await create_document_chunks(markdown_content)

                title = f"Calendar: {summary}"
                # One metadata shape for both paths so an update no longer
                # drops "toolkit_id".
                stored_metadata = {
                    "event_id": event_id,
                    "summary": summary,
                    "start_time": start_time,
                    "end_time": end_time,
                    "location": location,
                    "connector_id": connector_id,
                    "toolkit_id": "googlecalendar",
                    "source": "composio",
                }

                if existing_document:
                    # Update existing
                    existing_document.title = title
                    existing_document.content = summary_content
                    existing_document.content_hash = content_hash
                    existing_document.embedding = summary_embedding
                    existing_document.document_metadata = stored_metadata
                    existing_document.chunks = chunks
                    existing_document.updated_at = get_current_timestamp()
                else:
                    # Create new document
                    document = Document(
                        search_space_id=search_space_id,
                        title=title,
                        document_type=DocumentType.COMPOSIO_CONNECTOR,
                        document_metadata=stored_metadata,
                        content=summary_content,
                        content_hash=content_hash,
                        unique_identifier_hash=unique_identifier_hash,
                        embedding=summary_embedding,
                        chunks=chunks,
                        updated_at=get_current_timestamp(),
                    )
                    session.add(document)

                documents_indexed += 1

                # Commit in batches of 10 (creates and updates alike) to bound
                # the amount of uncommitted work.
                if documents_indexed % 10 == 0:
                    await session.commit()

            except Exception as e:
                # Best-effort per event: log, count as skipped, keep going.
                # NOTE(review): the session is not rolled back here; confirm a
                # failed flush cannot poison the following iterations.
                logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
                documents_skipped += 1
                continue

        if documents_indexed > 0:
            await update_connector_last_indexed(session, connector, update_last_indexed)

        await session.commit()

        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed Google Calendar indexing via Composio for connector {connector_id}",
            {
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
            },
        )

        return documents_indexed, None

    except Exception as e:
        logger.error(
            f"Failed to index Google Calendar via Composio: {e!s}", exc_info=True
        )
        return 0, f"Failed to index Google Calendar via Composio: {e!s}"
|
||||
|
|
|
|||
|
|
@ -112,6 +112,13 @@ def calculate_date_range(
|
|||
Returns:
|
||||
Tuple of (start_date_str, end_date_str)
|
||||
"""
|
||||
# Normalize "undefined" and empty strings to None (from frontend)
|
||||
# This prevents parsing errors and ensures consistent behavior across all indexers
|
||||
if start_date == "undefined" or start_date == "":
|
||||
start_date = None
|
||||
if end_date == "undefined" or end_date == "":
|
||||
end_date = None
|
||||
|
||||
if start_date is not None and end_date is not None:
|
||||
return start_date, end_date
|
||||
|
||||
|
|
|
|||
|
|
@ -136,10 +136,9 @@ async def index_bookstack_pages(
|
|||
)
|
||||
|
||||
if error:
|
||||
logger.error(f"Failed to get BookStack pages: {error}")
|
||||
|
||||
# Don't treat "No pages found" as an error that should stop indexing
|
||||
if "No pages found" in error:
|
||||
logger.info(f"No BookStack pages found: {error}")
|
||||
logger.info(
|
||||
"No pages found is not a critical error, continuing with update"
|
||||
)
|
||||
|
|
@ -159,6 +158,7 @@ async def index_bookstack_pages(
|
|||
)
|
||||
return 0, None
|
||||
else:
|
||||
logger.error(f"Failed to get BookStack pages: {error}")
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Failed to get BookStack pages: {error}",
|
||||
|
|
|
|||
|
|
@ -120,10 +120,9 @@ async def index_confluence_pages(
|
|||
)
|
||||
|
||||
if error:
|
||||
logger.error(f"Failed to get Confluence pages: {error}")
|
||||
|
||||
# Don't treat "No pages found" as an error that should stop indexing
|
||||
if "No pages found" in error:
|
||||
logger.info(f"No Confluence pages found: {error}")
|
||||
logger.info(
|
||||
"No pages found is not a critical error, continuing with update"
|
||||
)
|
||||
|
|
@ -147,6 +146,7 @@ async def index_confluence_pages(
|
|||
await confluence_client.close()
|
||||
return 0, None
|
||||
else:
|
||||
logger.error(f"Failed to get Confluence pages: {error}")
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Failed to get Confluence pages: {error}",
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ Google Calendar connector indexer.
|
|||
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import pytz
|
||||
from dateutil.parser import isoparse
|
||||
from google.oauth2.credentials import Credentials
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
|
@ -21,6 +23,7 @@ from app.utils.document_converters import (
|
|||
|
||||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -206,6 +209,23 @@ async def index_google_calendar_events(
|
|||
start_date_str = start_date
|
||||
end_date_str = end_date
|
||||
|
||||
# If start_date and end_date are the same, adjust end_date to be one day later
|
||||
# to ensure valid date range (start_date must be strictly before end_date)
|
||||
if start_date_str == end_date_str:
|
||||
# Parse the date and add one day to ensure valid range
|
||||
dt = isoparse(end_date_str)
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=pytz.UTC)
|
||||
else:
|
||||
dt = dt.astimezone(pytz.UTC)
|
||||
# Add one day to end_date to make it strictly after start_date
|
||||
dt_end = dt + timedelta(days=1)
|
||||
end_date_str = dt_end.strftime("%Y-%m-%d")
|
||||
logger.info(
|
||||
f"Adjusted end_date from {end_date} to {end_date_str} "
|
||||
f"to ensure valid date range (start_date must be strictly before end_date)"
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",
|
||||
|
|
@ -223,10 +243,9 @@ async def index_google_calendar_events(
|
|||
)
|
||||
|
||||
if error:
|
||||
logger.error(f"Failed to get Google Calendar events: {error}")
|
||||
|
||||
# Don't treat "No events found" as an error that should stop indexing
|
||||
if "No events found" in error:
|
||||
logger.info(f"No Google Calendar events found: {error}")
|
||||
logger.info(
|
||||
"No events found is not a critical error, continuing with update"
|
||||
)
|
||||
|
|
@ -246,13 +265,25 @@ async def index_google_calendar_events(
|
|||
)
|
||||
return 0, None
|
||||
else:
|
||||
logger.error(f"Failed to get Google Calendar events: {error}")
|
||||
# Check if this is an authentication error that requires re-authentication
|
||||
error_message = error
|
||||
error_type = "APIError"
|
||||
if (
|
||||
"re-authenticate" in error.lower()
|
||||
or "expired or been revoked" in error.lower()
|
||||
or "authentication failed" in error.lower()
|
||||
):
|
||||
error_message = "Google Calendar authentication failed. Please re-authenticate."
|
||||
error_type = "AuthenticationError"
|
||||
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Failed to get Google Calendar events: {error}",
|
||||
"API Error",
|
||||
{"error_type": "APIError"},
|
||||
error_message,
|
||||
error,
|
||||
{"error_type": error_type},
|
||||
)
|
||||
return 0, f"Failed to get Google Calendar events: {error}"
|
||||
return 0, error_message
|
||||
|
||||
logger.info(f"Retrieved {len(events)} events from Google Calendar API")
|
||||
|
||||
|
|
@ -263,6 +294,9 @@ async def index_google_calendar_events(
|
|||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
skipped_events = []
|
||||
duplicate_content_count = (
|
||||
0 # Track events skipped due to duplicate content_hash
|
||||
)
|
||||
|
||||
for event in events:
|
||||
try:
|
||||
|
|
@ -383,6 +417,27 @@ async def index_google_calendar_events(
|
|||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
# A document with the same content already exists (likely from Composio connector)
|
||||
logger.info(
|
||||
f"Event {event_summary} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
skipped_events.append(
|
||||
f"{event_summary} (already indexed by another connector)"
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
@ -475,7 +530,28 @@ async def index_google_calendar_events(
|
|||
logger.info(
|
||||
f"Final commit: Total {documents_indexed} Google Calendar events processed"
|
||||
)
|
||||
await session.commit()
|
||||
try:
|
||||
await session.commit()
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same event was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if duplicates were found
|
||||
warning_message = None
|
||||
if duplicate_content_count > 0:
|
||||
warning_message = f"{duplicate_content_count} skipped (duplicate)"
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
|
|
@ -484,14 +560,16 @@ async def index_google_calendar_events(
|
|||
"events_processed": total_processed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
"skipped_events_count": len(skipped_events),
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped"
|
||||
f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
|
||||
f"({duplicate_content_count} due to duplicate content from other connectors)"
|
||||
)
|
||||
return total_processed, None
|
||||
return total_processed, warning_message
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
|
|||
|
|
@ -578,7 +578,7 @@ async def _check_rename_only_update(
|
|||
- (True, message): Only filename changed, document was updated
|
||||
- (False, None): Content changed or new file, needs full processing
|
||||
"""
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import String, cast, select
|
||||
from sqlalchemy.orm.attributes import flag_modified
|
||||
|
||||
from app.db import Document
|
||||
|
|
@ -603,7 +603,8 @@ async def _check_rename_only_update(
|
|||
select(Document).where(
|
||||
Document.search_space_id == search_space_id,
|
||||
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
|
||||
Document.document_metadata["google_drive_file_id"].astext == file_id,
|
||||
cast(Document.document_metadata["google_drive_file_id"], String)
|
||||
== file_id,
|
||||
)
|
||||
)
|
||||
existing_document = result.scalar_one_or_none()
|
||||
|
|
@ -755,7 +756,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
|
|||
|
||||
Handles both new (file_id-based) and legacy (filename-based) hash schemes.
|
||||
"""
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import String, cast, select
|
||||
|
||||
from app.db import Document
|
||||
|
||||
|
|
@ -774,7 +775,8 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
|
|||
select(Document).where(
|
||||
Document.search_space_id == search_space_id,
|
||||
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
|
||||
Document.document_metadata["google_drive_file_id"].astext == file_id,
|
||||
cast(Document.document_metadata["google_drive_file_id"], String)
|
||||
== file_id,
|
||||
)
|
||||
)
|
||||
existing_document = result.scalar_one_or_none()
|
||||
|
|
|
|||
|
|
@ -170,10 +170,21 @@ async def index_google_gmail_messages(
|
|||
)
|
||||
|
||||
if error:
|
||||
# Check if this is an authentication error that requires re-authentication
|
||||
error_message = error
|
||||
error_type = "APIError"
|
||||
if (
|
||||
"re-authenticate" in error.lower()
|
||||
or "expired or been revoked" in error.lower()
|
||||
or "authentication failed" in error.lower()
|
||||
):
|
||||
error_message = "Gmail authentication failed. Please re-authenticate."
|
||||
error_type = "AuthenticationError"
|
||||
|
||||
await task_logger.log_task_failure(
|
||||
log_entry, f"Failed to fetch messages: {error}", {}
|
||||
log_entry, error_message, error, {"error_type": error_type}
|
||||
)
|
||||
return 0, f"Failed to fetch Gmail messages: {error}"
|
||||
return 0, error_message
|
||||
|
||||
if not messages:
|
||||
success_msg = "No Google gmail messages found in the specified date range"
|
||||
|
|
|
|||
|
|
@ -126,10 +126,9 @@ async def index_jira_issues(
|
|||
)
|
||||
|
||||
if error:
|
||||
logger.error(f"Failed to get Jira issues: {error}")
|
||||
|
||||
# Don't treat "No issues found" as an error that should stop indexing
|
||||
if "No issues found" in error:
|
||||
logger.info(f"No Jira issues found: {error}")
|
||||
logger.info(
|
||||
"No issues found is not a critical error, continuing with update"
|
||||
)
|
||||
|
|
@ -149,6 +148,7 @@ async def index_jira_issues(
|
|||
)
|
||||
return 0, None
|
||||
else:
|
||||
logger.error(f"Failed to get Jira issues: {error}")
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Failed to get Jira issues: {error}",
|
||||
|
|
|
|||
|
|
@ -145,10 +145,9 @@ async def index_linear_issues(
|
|||
)
|
||||
|
||||
if error:
|
||||
logger.error(f"Failed to get Linear issues: {error}")
|
||||
|
||||
# Don't treat "No issues found" as an error that should stop indexing
|
||||
if "No issues found" in error:
|
||||
logger.info(f"No Linear issues found: {error}")
|
||||
logger.info(
|
||||
"No issues found is not a critical error, continuing with update"
|
||||
)
|
||||
|
|
@ -162,6 +161,7 @@ async def index_linear_issues(
|
|||
)
|
||||
return 0, None
|
||||
else:
|
||||
logger.error(f"Failed to get Linear issues: {error}")
|
||||
return 0, f"Failed to get Linear issues: {error}"
|
||||
|
||||
logger.info(f"Retrieved {len(issues)} issues from Linear API")
|
||||
|
|
|
|||
|
|
@ -116,6 +116,13 @@ async def index_luma_events(
|
|||
|
||||
luma_client = LumaConnector(api_key=api_key)
|
||||
|
||||
# Handle 'undefined' string from frontend (treat as None)
|
||||
# This prevents "time data 'undefined' does not match format" errors
|
||||
if start_date == "undefined" or start_date == "":
|
||||
start_date = None
|
||||
if end_date == "undefined" or end_date == "":
|
||||
end_date = None
|
||||
|
||||
# Calculate date range
|
||||
# For calendar connectors, allow future dates to index upcoming events
|
||||
if start_date is None or end_date is None:
|
||||
|
|
@ -172,10 +179,9 @@ async def index_luma_events(
|
|||
)
|
||||
|
||||
if error:
|
||||
logger.error(f"Failed to get Luma events: {error}")
|
||||
|
||||
# Don't treat "No events found" as an error that should stop indexing
|
||||
if "No events found" in error or "no events" in error.lower():
|
||||
logger.info(f"No Luma events found: {error}")
|
||||
logger.info(
|
||||
"No events found is not a critical error, continuing with update"
|
||||
)
|
||||
|
|
@ -195,6 +201,7 @@ async def index_luma_events(
|
|||
)
|
||||
return 0, None
|
||||
else:
|
||||
logger.error(f"Failed to get Luma events: {error}")
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Failed to get Luma events: {error}",
|
||||
|
|
|
|||
|
|
@ -28,6 +28,9 @@ BASE_NAME_FOR_TYPE = {
|
|||
SearchSourceConnectorType.CONFLUENCE_CONNECTOR: "Confluence",
|
||||
SearchSourceConnectorType.AIRTABLE_CONNECTOR: "Airtable",
|
||||
SearchSourceConnectorType.MCP_CONNECTOR: "Model Context Protocol (MCP)",
|
||||
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: "Gmail",
|
||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Google Drive",
|
||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Google Calendar",
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
[project]
|
||||
name = "surf-new-backend"
|
||||
version = "0.0.11"
|
||||
version = "0.0.12"
|
||||
description = "SurfSense Backend"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
|
|
|
|||
2
surfsense_backend/uv.lock
generated
2
surfsense_backend/uv.lock
generated
|
|
@ -6545,7 +6545,7 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "surf-new-backend"
|
||||
version = "0.0.11"
|
||||
version = "0.0.12"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "alembic" },
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue