mirror of https://github.com/MODSetter/SurfSense.git
synced 2026-05-01 03:46:25 +02:00

feat: added circleback connector

This commit is contained in:
parent 23870042f3
commit c19d300c9d

27 changed files with 1153 additions and 97 deletions
@@ -47,6 +47,7 @@ _ALL_CONNECTORS: list[str] = [
    "NOTE",
    "BOOKSTACK_CONNECTOR",
    "CRAWLED_URL",
    "CIRCLEBACK",
]
@@ -8,13 +8,12 @@ from .folder_manager import get_files_in_folder, list_folder_contents

__all__ = [
    "GoogleDriveClient",
    "get_valid_credentials",
    "validate_credentials",
    "download_and_process_file",
    "get_files_in_folder",
    "list_folder_contents",
    "get_start_page_token",
    "fetch_all_changes",
    "categorize_change",
    "download_and_process_file",
    "fetch_all_changes",
    "get_files_in_folder",
    "get_start_page_token",
    "get_valid_credentials",
    "list_folder_contents",
    "validate_credentials",
]
@@ -202,4 +202,3 @@ async def fetch_all_changes(
    except Exception as e:
        logger.error(f"Error fetching all changes: {e!s}", exc_info=True)
        return all_changes, current_token, f"Error fetching all changes: {e!s}"
@@ -2,7 +2,6 @@

from typing import Any

from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from sqlalchemy.ext.asyncio import AsyncSession
@@ -107,16 +106,18 @@ class GoogleDriveClient:
        """
        try:
            service = await self.get_service()
            file = service.files().get(fileId=file_id, fields=fields, supportsAllDrives=True).execute()
            file = (
                service.files()
                .get(fileId=file_id, fields=fields, supportsAllDrives=True)
                .execute()
            )
            return file, None
        except HttpError as e:
            return None, f"HTTP error getting file metadata: {e.resp.status}"
        except Exception as e:
            return None, f"Error getting file metadata: {e!s}"

    async def download_file(
        self, file_id: str
    ) -> tuple[bytes | None, str | None]:
    async def download_file(self, file_id: str) -> tuple[bytes | None, str | None]:
        """
        Download binary file content.
@@ -164,9 +165,7 @@ class GoogleDriveClient:
        try:
            service = await self.get_service()
            content = (
                service.files()
                .export(fileId=file_id, mimeType=mime_type)
                .execute()
                service.files().export(fileId=file_id, mimeType=mime_type).execute()
            )

            # Content is already bytes from the API
@@ -180,4 +179,3 @@ class GoogleDriveClient:
            return None, f"HTTP error exporting file: {e.resp.status}"
        except Exception as e:
            return None, f"Error exporting file: {e!s}"
@@ -78,10 +78,10 @@ async def download_and_process_file(
        tmp_file.write(content_bytes)
        temp_file_path = tmp_file.name

    from app.db import DocumentType
    from app.tasks.document_processors.file_processors import (
        process_file_in_background,
    )
    from app.db import DocumentType

    connector_info = {
        "type": DocumentType.GOOGLE_DRIVE_FILE,
@@ -92,7 +92,7 @@ async def download_and_process_file(
            "source_connector": "google_drive",
        },
    }

    # Add additional Drive metadata if available
    if "modifiedTime" in file:
        connector_info["metadata"]["modified_time"] = file["modifiedTime"]
@@ -102,10 +102,12 @@ async def download_and_process_file(
        connector_info["metadata"]["file_size"] = file["size"]
    if "webViewLink" in file:
        connector_info["metadata"]["web_view_link"] = file["webViewLink"]

    if is_google_workspace_file(mime_type):
        connector_info["metadata"]["exported_as"] = "pdf"
        connector_info["metadata"]["original_workspace_type"] = mime_type.split(".")[-1]
        connector_info["metadata"]["original_workspace_type"] = mime_type.split(
            "."
        )[-1]

    logger.info(f"Processing {file_name} with Surfsense's file processor")
    await process_file_in_background(
@@ -132,5 +134,3 @@ async def download_and_process_file(
            os.unlink(temp_file_path)
        except Exception as e:
            logger.debug(f"Could not delete temp file {temp_file_path}: {e}")
@@ -9,7 +9,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm.attributes import flag_modified

from app.db import SearchSourceConnector, SearchSourceConnectorType
from app.db import SearchSourceConnector


async def get_valid_credentials(
@@ -31,9 +31,7 @@ async def get_valid_credentials(
        Exception: If token refresh fails
    """
    result = await session.execute(
        select(SearchSourceConnector).filter(
            SearchSourceConnector.id == connector_id
        )
        select(SearchSourceConnector).filter(SearchSourceConnector.id == connector_id)
    )
    connector = result.scalars().first()
@@ -95,4 +93,3 @@ def validate_credentials(credentials: Credentials) -> bool:
            credentials.refresh_token,
        ]
    )
@@ -26,5 +26,3 @@ def should_skip_file(mime_type: str) -> bool:
def get_export_mime_type(mime_type: str) -> str | None:
    """Get export MIME type for Google Workspace files."""
    return EXPORT_FORMATS.get(mime_type)
@@ -24,7 +24,10 @@ async def list_folders(
    """
    try:
        # Build query to get only folders
        query_parts = ["mimeType = 'application/vnd.google-apps.folder'", "trashed = false"]
        query_parts = [
            "mimeType = 'application/vnd.google-apps.folder'",
            "trashed = false",
        ]

        if parent_id:
            query_parts.append(f"'{parent_id}' in parents")
@@ -68,8 +71,7 @@ async def get_folder_hierarchy(
        # Traverse up to root
        while current_id:
            file, error = await client.get_file_metadata(
                current_id,
                fields="id, name, parents, mimeType"
                current_id, fields="id, name, parents, mimeType"
            )

            if error:
@@ -189,7 +191,7 @@ async def list_folder_contents(
        # Fetch all items with pagination (max 1000 per page)
        all_items = []
        page_token = None

        while True:
            items, next_token, error = await client.list_files(
                query=query,
@@ -202,10 +204,10 @@ async def list_folder_contents(
                return [], error

            all_items.extend(items)

            if not next_token:
                break

            page_token = next_token

        for item in all_items:
@@ -226,5 +228,3 @@ async def list_folder_contents(
    except Exception as e:
        logger.error(f"Error listing folder contents: {e!s}", exc_info=True)
        return [], f"Error listing folder contents: {e!s}"
@@ -51,6 +51,7 @@ class DocumentType(str, Enum):
    LUMA_CONNECTOR = "LUMA_CONNECTOR"
    ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
    BOOKSTACK_CONNECTOR = "BOOKSTACK_CONNECTOR"
    CIRCLEBACK = "CIRCLEBACK"
    NOTE = "NOTE"
@@ -76,6 +77,7 @@ class SearchSourceConnectorType(str, Enum):
    ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
    WEBCRAWLER_CONNECTOR = "WEBCRAWLER_CONNECTOR"
    BOOKSTACK_CONNECTOR = "BOOKSTACK_CONNECTOR"
    CIRCLEBACK_CONNECTOR = "CIRCLEBACK_CONNECTOR"


class LiteLLMProvider(str, Enum):
@@ -3,17 +3,18 @@ from fastapi import APIRouter
from .airtable_add_connector_route import (
    router as airtable_add_connector_router,
)
from .circleback_webhook_route import router as circleback_webhook_router
from .documents_routes import router as documents_router
from .editor_routes import router as editor_router
from .google_calendar_add_connector_route import (
    router as google_calendar_add_connector_router,
)
from .google_gmail_add_connector_route import (
    router as google_gmail_add_connector_router,
)
from .google_drive_add_connector_route import (
    router as google_drive_add_connector_router,
)
from .google_gmail_add_connector_route import (
    router as google_gmail_add_connector_router,
)
from .logs_routes import router as logs_router
from .luma_add_connector_route import router as luma_add_connector_router
from .new_chat_routes import router as new_chat_router
@@ -41,3 +42,4 @@ router.include_router(airtable_add_connector_router)
router.include_router(luma_add_connector_router)
router.include_router(new_llm_config_router)  # LLM configs with prompt configuration
router.include_router(logs_router)
router.include_router(circleback_webhook_router)  # Circleback meeting webhooks
surfsense_backend/app/routes/circleback_webhook_route.py (new file, 317 lines)
@@ -0,0 +1,317 @@
"""
Circleback Webhook Route

This module provides a webhook endpoint for receiving meeting data from Circleback.
It processes the incoming webhook payload and saves it as a document in the specified search space.
"""

import logging
from datetime import datetime
from typing import Any

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)

router = APIRouter()


# Pydantic models for Circleback webhook payload
class CirclebackAttendee(BaseModel):
    """Attendee model for Circleback meeting."""

    name: str | None = None
    email: str | None = None


class CirclebackActionItemAssignee(BaseModel):
    """Assignee model for action items."""

    name: str | None = None
    email: str | None = None


class CirclebackActionItem(BaseModel):
    """Action item model for Circleback meeting."""

    id: int
    title: str
    description: str = ""
    assignee: CirclebackActionItemAssignee | None = None
    status: str = "PENDING"


class CirclebackTranscriptSegment(BaseModel):
    """Transcript segment model for Circleback meeting."""

    speaker: str
    text: str
    timestamp: float


class CirclebackInsightItem(BaseModel):
    """Individual insight item."""

    insight: str | dict[str, Any]
    speaker: str | None = None
    timestamp: float | None = None


class CirclebackWebhookPayload(BaseModel):
    """
    Circleback webhook payload model.

    This model represents the data sent by Circleback when a meeting is processed.
    """

    model_config = {"populate_by_name": True}

    id: int = Field(..., description="Circleback meeting ID")
    name: str = Field(..., description="Meeting name")
    created_at: str = Field(
        ..., alias="createdAt", description="Meeting creation date in ISO format"
    )
    duration: float = Field(..., description="Meeting duration in seconds")
    url: str | None = Field(None, description="URL of the virtual meeting")
    recording_url: str | None = Field(
        None,
        alias="recordingUrl",
        description="URL of the meeting recording (valid for 24 hours)",
    )
    tags: list[str] = Field(default_factory=list, description="Meeting tags")
    ical_uid: str | None = Field(
        None, alias="icalUid", description="Unique identifier of the calendar event"
    )
    attendees: list[CirclebackAttendee] = Field(
        default_factory=list, description="Meeting attendees"
    )
    notes: str = Field("", description="Meeting notes in Markdown format")
    action_items: list[CirclebackActionItem] = Field(
        default_factory=list,
        alias="actionItems",
        description="Action items from the meeting",
    )
    transcript: list[CirclebackTranscriptSegment] = Field(
        default_factory=list, description="Meeting transcript segments"
    )
    insights: dict[str, list[CirclebackInsightItem]] = Field(
        default_factory=dict, description="Custom insights from the meeting"
    )


def format_circleback_meeting_to_markdown(payload: CirclebackWebhookPayload) -> str:
    """
    Convert Circleback webhook payload to a well-formatted Markdown document.

    Args:
        payload: The Circleback webhook payload

    Returns:
        Markdown string representation of the meeting
    """
    lines = []

    # Title
    lines.append(f"# {payload.name}")
    lines.append("")

    # Meeting metadata
    lines.append("## Meeting Details")
    lines.append("")

    # Parse and format date
    try:
        created_dt = datetime.fromisoformat(payload.created_at.replace("Z", "+00:00"))
        formatted_date = created_dt.strftime("%Y-%m-%d %H:%M:%S UTC")
    except (ValueError, AttributeError):
        formatted_date = payload.created_at

    lines.append(f"- **Date:** {formatted_date}")
    lines.append(f"- **Duration:** {int(payload.duration // 60)} minutes")

    if payload.url:
        lines.append(f"- **Meeting URL:** {payload.url}")

    if payload.tags:
        lines.append(f"- **Tags:** {', '.join(payload.tags)}")

    lines.append(
        f"- **Circleback Link:** [View on Circleback](https://app.circleback.ai/meetings/{payload.id})"
    )
    lines.append("")

    # Attendees
    if payload.attendees:
        lines.append("## Attendees")
        lines.append("")
        for attendee in payload.attendees:
            name = attendee.name or "Unknown"
            if attendee.email:
                lines.append(f"- **{name}** ({attendee.email})")
            else:
                lines.append(f"- **{name}**")
        lines.append("")

    # Notes (if provided)
    if payload.notes:
        lines.append("## Meeting Notes")
        lines.append("")
        lines.append(payload.notes)
        lines.append("")

    # Action Items
    if payload.action_items:
        lines.append("## Action Items")
        lines.append("")
        for item in payload.action_items:
            status_emoji = "✅" if item.status == "DONE" else "⬜"
            assignee_text = ""
            if item.assignee and item.assignee.name:
                assignee_text = f" (Assigned to: {item.assignee.name})"

            lines.append(f"{status_emoji} **{item.title}**{assignee_text}")
            if item.description:
                lines.append(f"  {item.description}")
        lines.append("")

    # Insights
    if payload.insights:
        lines.append("## Insights")
        lines.append("")
        for insight_name, insight_items in payload.insights.items():
            lines.append(f"### {insight_name}")
            lines.append("")
            for insight_item in insight_items:
                if isinstance(insight_item.insight, dict):
                    for key, value in insight_item.insight.items():
                        lines.append(f"- **{key}:** {value}")
                else:
                    speaker_info = (
                        f" _{insight_item.speaker}_" if insight_item.speaker else ""
                    )
                    lines.append(f"- {insight_item.insight}{speaker_info}")
            lines.append("")

    # Transcript
    if payload.transcript:
        lines.append("## Transcript")
        lines.append("")
        for segment in payload.transcript:
            # Format timestamp as MM:SS
            minutes = int(segment.timestamp // 60)
            seconds = int(segment.timestamp % 60)
            timestamp_str = f"[{minutes:02d}:{seconds:02d}]"
            lines.append(f"**{segment.speaker}** {timestamp_str}: {segment.text}")
        lines.append("")

    return "\n".join(lines)


@router.post("/webhooks/circleback/{search_space_id}")
async def receive_circleback_webhook(
    search_space_id: int,
    payload: CirclebackWebhookPayload,
):
    """
    Receive and process a Circleback webhook.

    This endpoint receives meeting data from Circleback and saves it as a document
    in the specified search space. The meeting data is converted to Markdown format
    and processed asynchronously.

    Args:
        search_space_id: The ID of the search space to save the document to
        payload: The Circleback webhook payload containing meeting data

    Returns:
        Success message with document details

    Note:
        This endpoint does not require authentication as it's designed to receive
        webhooks from Circleback. Signature verification can be added later for security.
    """
    try:
        logger.info(
            f"Received Circleback webhook for meeting {payload.id} in search space {search_space_id}"
        )

        # Convert to markdown
        markdown_content = format_circleback_meeting_to_markdown(payload)

        # Trigger async document processing
        from app.tasks.celery_tasks.document_tasks import (
            process_circleback_meeting_task,
        )

        # Prepare meeting metadata for the task
        meeting_metadata = {
            "circleback_meeting_id": payload.id,
            "meeting_name": payload.name,
            "meeting_date": payload.created_at,
            "duration_seconds": payload.duration,
            "meeting_url": payload.url,
            "tags": payload.tags,
            "attendees_count": len(payload.attendees),
            "action_items_count": len(payload.action_items),
            "has_transcript": len(payload.transcript) > 0,
        }

        # Queue the processing task
        process_circleback_meeting_task.delay(
            meeting_id=payload.id,
            meeting_name=payload.name,
            markdown_content=markdown_content,
            metadata=meeting_metadata,
            search_space_id=search_space_id,
        )

        logger.info(
            f"Queued Circleback meeting {payload.id} for processing in search space {search_space_id}"
        )

        return {
            "status": "accepted",
            "message": f"Meeting '{payload.name}' queued for processing",
            "meeting_id": payload.id,
            "search_space_id": search_space_id,
        }

    except Exception as e:
        logger.error(f"Error processing Circleback webhook: {e!s}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to process Circleback webhook: {e!s}",
        ) from e


@router.get("/webhooks/circleback/{search_space_id}/info")
async def get_circleback_webhook_info(
    search_space_id: int,
):
    """
    Get information about the Circleback webhook endpoint.

    This endpoint provides information about how to configure the Circleback
    webhook integration.

    Args:
        search_space_id: The ID of the search space

    Returns:
        Webhook configuration information
    """
    from app.config import config

    # Construct the webhook URL
    base_url = getattr(config, "API_BASE_URL", "http://localhost:8000")
    webhook_url = f"{base_url}/api/v1/webhooks/circleback/{search_space_id}"

    return {
        "webhook_url": webhook_url,
        "search_space_id": search_space_id,
        "method": "POST",
        "content_type": "application/json",
        "description": "Use this URL in your Circleback automation to send meeting data to SurfSense",
        "note": "Configure this URL in Circleback Settings → Automations → Create automation → Send webhook request",
    }
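For reference, a minimal sketch of exercising the new webhook from Python. The base URL, search space ID, and payload values are illustrative assumptions; the camelCase field aliases follow the Pydantic model above, the /api/v1 prefix is inferred from the webhook_url built by the info endpoint, and httpx stands in for any HTTP client.

# Sketch: POST a Circleback-style payload to the webhook (values are made up).
import httpx

sample_payload = {
    "id": 12345,
    "name": "Weekly Sync",
    "createdAt": "2025-01-15T10:00:00Z",  # required; alias of created_at
    "duration": 1800.0,  # seconds; rendered as "30 minutes"
    "attendees": [{"name": "Ada", "email": "ada@example.com"}],
    "actionItems": [],  # optional lists default to empty
    "transcript": [],
}

response = httpx.post(
    "http://localhost:8000/api/v1/webhooks/circleback/1",
    json=sample_payload,
)
print(response.json())  # expected shape: {"status": "accepted", ...}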
@@ -28,10 +28,8 @@ from app.config import config
from app.connectors.google_drive import (
    GoogleDriveClient,
    get_start_page_token,
    get_valid_credentials,
    list_folder_contents,
)
from app.connectors.google_drive.folder_manager import list_folders
from app.db import (
    SearchSourceConnector,
    SearchSourceConnectorType,
@@ -111,7 +109,9 @@ async def connect_drive(space_id: int, user: User = Depends(current_active_user)
            state=state_encoded,
        )

        logger.info(f"Initiating Google Drive OAuth for user {user.id}, space {space_id}")
        logger.info(
            f"Initiating Google Drive OAuth for user {user.id}, space {space_id}"
        )
        return {"auth_url": auth_url}

    except Exception as e:
@@ -146,7 +146,9 @@ async def drive_callback(
        user_id = UUID(data["user_id"])
        space_id = data["space_id"]

        logger.info(f"Processing Google Drive callback for user {user_id}, space {space_id}")
        logger.info(
            f"Processing Google Drive callback for user {user_id}, space {space_id}"
        )

        # Exchange authorization code for tokens
        flow = get_google_flow()
@@ -200,7 +202,9 @@ async def drive_callback(

                flag_modified(db_connector, "config")
                await session.commit()
                logger.info(f"Set initial start page token for connector {db_connector.id}")
                logger.info(
                    f"Set initial start page token for connector {db_connector.id}"
                )
            except Exception as e:
                logger.warning(f"Failed to get initial start page token: {e!s}")
@@ -246,7 +250,7 @@ async def list_google_drive_folders(
):
    """
    List folders AND files in user's Google Drive with hierarchical support.

    This is called at index time from the manage connector page to display
    the complete file system (folders and files). Only folders are selectable.
@@ -299,7 +303,7 @@ async def list_google_drive_folders(
            f"✅ Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}"
            + (f" in folder {parent_id}" if parent_id else " in ROOT")
        )

        # Log first few items for debugging
        if items:
            logger.info(f"First 3 items: {[item.get('name') for item in items[:3]]}")
@@ -45,7 +45,6 @@ from app.tasks.connector_indexers import (
    index_github_repos,
    index_google_calendar_events,
    index_google_gmail_messages,
    index_google_drive_files,
    index_jira_issues,
    index_linear_issues,
    index_luma_events,
@@ -1572,7 +1571,9 @@ async def run_google_drive_indexing(
    errors = []

    # Index each folder
    for folder_id, folder_name in zip(folder_id_list, folder_name_list):
    for folder_id, folder_name in zip(
        folder_id_list, folder_name_list, strict=False
    ):
        try:
            indexed_count, error_message = await index_google_drive_files(
                session,
@@ -1589,7 +1590,7 @@ async def run_google_drive_indexing(
            else:
                total_indexed += indexed_count
        except Exception as e:
            errors.append(f"{folder_name}: {str(e)}")
            errors.append(f"{folder_name}: {e!s}")
            logger.error(
                f"Error indexing folder {folder_name} ({folder_id}): {e}",
                exc_info=True,
@@ -268,3 +268,105 @@ async def _process_file_upload(
        )
        logger.error(error_message)
        raise


@celery_app.task(name="process_circleback_meeting", bind=True)
def process_circleback_meeting_task(
    self,
    meeting_id: int,
    meeting_name: str,
    markdown_content: str,
    metadata: dict,
    search_space_id: int,
):
    """
    Celery task to process Circleback meeting webhook data.

    Args:
        meeting_id: Circleback meeting ID
        meeting_name: Name of the meeting
        markdown_content: Meeting content formatted as markdown
        metadata: Meeting metadata dictionary
        search_space_id: ID of the search space
    """
    import asyncio

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        loop.run_until_complete(
            _process_circleback_meeting(
                meeting_id,
                meeting_name,
                markdown_content,
                metadata,
                search_space_id,
            )
        )
    finally:
        loop.close()


async def _process_circleback_meeting(
    meeting_id: int,
    meeting_name: str,
    markdown_content: str,
    metadata: dict,
    search_space_id: int,
):
    """Process Circleback meeting with new session."""
    from app.tasks.document_processors.circleback_processor import (
        add_circleback_meeting_document,
    )

    async with get_celery_session_maker()() as session:
        task_logger = TaskLoggingService(session, search_space_id)

        log_entry = await task_logger.log_task_start(
            task_name="process_circleback_meeting",
            source="circleback_webhook",
            message=f"Starting Circleback meeting processing: {meeting_name}",
            metadata={
                "document_type": "CIRCLEBACK",
                "meeting_id": meeting_id,
                "meeting_name": meeting_name,
                **metadata,
            },
        )

        try:
            result = await add_circleback_meeting_document(
                session=session,
                meeting_id=meeting_id,
                meeting_name=meeting_name,
                markdown_content=markdown_content,
                metadata=metadata,
                search_space_id=search_space_id,
            )

            if result:
                await task_logger.log_task_success(
                    log_entry,
                    f"Successfully processed Circleback meeting: {meeting_name}",
                    {
                        "document_id": result.id,
                        "meeting_id": meeting_id,
                        "content_hash": result.content_hash,
                    },
                )
            else:
                await task_logger.log_task_success(
                    log_entry,
                    f"Circleback meeting document already exists (duplicate): {meeting_name}",
                    {"duplicate_detected": True, "meeting_id": meeting_id},
                )
        except Exception as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to process Circleback meeting: {meeting_name}",
                str(e),
                {"error_type": type(e).__name__, "meeting_id": meeting_id},
            )
            logger.error(f"Error processing Circleback meeting: {e!s}")
            raise
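Because Celery task bodies run synchronously, the task above bridges into the async processor by creating a dedicated event loop. A hedged sketch of driving it eagerly in-process (e.g. from a test) using Celery's standard apply(); the argument values here are hypothetical:

# Sketch: execute the task body locally instead of routing it to a worker.
result = process_circleback_meeting_task.apply(
    kwargs={
        "meeting_id": 12345,
        "meeting_name": "Weekly Sync",
        "markdown_content": "# Weekly Sync\n...",
        "metadata": {"circleback_meeting_id": 12345},
        "search_space_id": 1,
    }
)
result.get()  # re-raises if the task body raised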
@@ -34,8 +34,8 @@ from .discord_indexer import index_discord_messages
from .elasticsearch_indexer import index_elasticsearch_documents
from .github_indexer import index_github_repos
from .google_calendar_indexer import index_google_calendar_events
from .google_gmail_indexer import index_google_gmail_messages
from .google_drive_indexer import index_google_drive_files
from .google_gmail_indexer import index_google_gmail_messages
from .jira_indexer import index_jira_issues

# Issue tracking and project management
@@ -1,7 +1,6 @@
"""Google Drive indexer using Surfsense file processors."""

import logging
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@@ -99,11 +98,15 @@ async def index_google_drive_files(
        target_folder_id = folder_id
        target_folder_name = folder_name or "Selected Folder"

    logger.info(f"Indexing Google Drive folder: {target_folder_name} ({target_folder_id})")
    logger.info(
        f"Indexing Google Drive folder: {target_folder_name} ({target_folder_id})"
    )

    folder_tokens = connector.config.get("folder_tokens", {})
    start_page_token = folder_tokens.get(target_folder_id)
    can_use_delta_sync = use_delta_sync and start_page_token and connector.last_indexed_at
    can_use_delta_sync = (
        use_delta_sync and start_page_token and connector.last_indexed_at
    )

    if can_use_delta_sync:
        logger.info(f"Using delta sync for connector {connector_id}")
@@ -151,9 +154,7 @@ async def index_google_drive_files(
        await update_connector_last_indexed(session, connector, update_last_indexed)

        await session.commit()
        logger.info(
            f"Successfully committed Google Drive indexing changes to database"
        )
        logger.info("Successfully committed Google Drive indexing changes to database")

        await task_logger.log_task_success(
            log_entry,
@@ -252,7 +253,9 @@ async def _index_full_scan(

            if documents_indexed % 10 == 0 and documents_indexed > 0:
                await session.commit()
                logger.info(f"Committed batch: {documents_indexed} files indexed so far")
                logger.info(
                    f"Committed batch: {documents_indexed} files indexed so far"
                )

            page_token = next_token
            if not page_token:
@@ -391,9 +394,7 @@ async def _process_single_file(
        return 0, 1


async def _remove_document(
    session: AsyncSession, file_id: str, search_space_id: int
):
async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
    """Remove a document that was deleted in Drive."""
    unique_identifier_hash = generate_unique_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
@@ -406,5 +407,3 @@ async def _remove_document(
    if existing_document:
        await session.delete(existing_document)
        logger.info(f"Removed deleted file document: {file_id}")
@@ -0,0 +1,183 @@
"""
Circleback meeting document processor.

This module processes meeting data received from Circleback webhooks
and stores it as searchable documents in the database.
"""

import logging
from typing import Any

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Document, DocumentType
from app.services.llm_service import get_document_summary_llm
from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
)

from .base import (
    check_document_by_unique_identifier,
    get_current_timestamp,
)

logger = logging.getLogger(__name__)


async def add_circleback_meeting_document(
    session: AsyncSession,
    meeting_id: int,
    meeting_name: str,
    markdown_content: str,
    metadata: dict[str, Any],
    search_space_id: int,
) -> Document | None:
    """
    Process and store a Circleback meeting document.

    Args:
        session: Database session
        meeting_id: Circleback meeting ID
        meeting_name: Name of the meeting
        markdown_content: Meeting content formatted as markdown
        metadata: Meeting metadata dictionary
        search_space_id: ID of the search space

    Returns:
        Document object if successful, None if failed or duplicate
    """
    try:
        # Generate unique identifier hash using Circleback meeting ID
        unique_identifier = f"circleback_{meeting_id}"
        unique_identifier_hash = generate_unique_identifier_hash(
            DocumentType.CIRCLEBACK, unique_identifier, search_space_id
        )

        # Generate content hash
        content_hash = generate_content_hash(markdown_content, search_space_id)

        # Check if document with this unique identifier already exists
        existing_document = await check_document_by_unique_identifier(
            session, unique_identifier_hash
        )

        if existing_document:
            # Document exists - check if content has changed
            if existing_document.content_hash == content_hash:
                logger.info(f"Circleback meeting {meeting_id} unchanged. Skipping.")
                return existing_document
            else:
                # Content has changed - update the existing document
                logger.info(
                    f"Content changed for Circleback meeting {meeting_id}. Updating document."
                )

        # Get LLM for generating summary
        llm = await get_document_summary_llm(session, search_space_id)
        if not llm:
            logger.warning(
                f"No LLM configured for search space {search_space_id}. Using content as summary."
            )
            # Use first 1000 chars as summary if no LLM available
            summary_content = (
                markdown_content[:1000] + "..."
                if len(markdown_content) > 1000
                else markdown_content
            )
            summary_embedding = None
        else:
            # Generate summary with metadata
            document_metadata = {
                "meeting_name": meeting_name,
                "meeting_id": meeting_id,
                "document_type": "Circleback Meeting",
                **{
                    k: v
                    for k, v in metadata.items()
                    if isinstance(v, str | int | float | bool)
                },
            }
            summary_content, summary_embedding = await generate_document_summary(
                markdown_content, llm, document_metadata
            )

        # Process chunks
        chunks = await create_document_chunks(markdown_content)

        # Convert to BlockNote JSON for editing capability
        from app.utils.blocknote_converter import convert_markdown_to_blocknote

        blocknote_json = await convert_markdown_to_blocknote(markdown_content)
        if not blocknote_json:
            logger.warning(
                f"Failed to convert Circleback meeting {meeting_id} to BlockNote JSON, document will not be editable"
            )

        # Prepare document metadata
        document_metadata = {
            "CIRCLEBACK_MEETING_ID": meeting_id,
            "MEETING_NAME": meeting_name,
            "SOURCE": "CIRCLEBACK_WEBHOOK",
            **metadata,
        }

        # Update or create document
        if existing_document:
            # Update existing document
            existing_document.title = meeting_name
            existing_document.content = summary_content
            existing_document.content_hash = content_hash
            if summary_embedding is not None:
                existing_document.embedding = summary_embedding
            existing_document.document_metadata = document_metadata
            existing_document.chunks = chunks
            existing_document.blocknote_document = blocknote_json
            existing_document.content_needs_reindexing = False
            existing_document.updated_at = get_current_timestamp()

            await session.commit()
            await session.refresh(existing_document)
            document = existing_document
            logger.info(
                f"Updated Circleback meeting document {meeting_id} in search space {search_space_id}"
            )
        else:
            # Create new document
            document = Document(
                search_space_id=search_space_id,
                title=meeting_name,
                document_type=DocumentType.CIRCLEBACK,
                document_metadata=document_metadata,
                content=summary_content,
                embedding=summary_embedding,
                chunks=chunks,
                content_hash=content_hash,
                unique_identifier_hash=unique_identifier_hash,
                blocknote_document=blocknote_json,
                content_needs_reindexing=False,
                updated_at=get_current_timestamp(),
            )

            session.add(document)
            await session.commit()
            await session.refresh(document)
            logger.info(
                f"Created new Circleback meeting document {meeting_id} in search space {search_space_id}"
            )

        return document

    except SQLAlchemyError as db_error:
        await session.rollback()
        logger.error(
            f"Database error processing Circleback meeting {meeting_id}: {db_error}"
        )
        raise db_error
    except Exception as e:
        await session.rollback()
        logger.error(f"Failed to process Circleback meeting {meeting_id}: {e!s}")
        raise RuntimeError(f"Failed to process Circleback meeting: {e!s}") from e
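The processor's idempotency rests on two hashes: a stable identity hash built from the Circleback meeting ID picks out the document, and a content hash decides between skip and update. A reduced sketch of that decision, with hash construction that is purely illustrative (the real generate_unique_identifier_hash and generate_content_hash helpers live in app.utils.document_converters and may differ):

# Illustrative only: the skip/update/create decision in isolation.
import hashlib

def identity_hash(meeting_id: int, search_space_id: int) -> str:
    # Mirrors the f"circleback_{meeting_id}" identifier used above (sketch).
    return hashlib.sha256(
        f"CIRCLEBACK:circleback_{meeting_id}:{search_space_id}".encode()
    ).hexdigest()

def decide(existing_content_hash: str | None, new_content_hash: str) -> str:
    if existing_content_hash is None:
        return "create"  # no document for this meeting yet
    if existing_content_hash == new_content_hash:
        return "skip"  # same meeting, unchanged content
    return "update"  # same meeting, edited content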
@@ -473,7 +473,8 @@ async def process_file_in_background(
    session: AsyncSession,
    task_logger: TaskLoggingService,
    log_entry: Log,
    connector: dict | None = None,  # Optional: {"type": "GOOGLE_DRIVE_FILE", "metadata": {...}}
    connector: dict
    | None = None,  # Optional: {"type": "GOOGLE_DRIVE_FILE", "metadata": {...}}
):
    try:
        # Check if the file is a markdown or text file
@@ -926,7 +927,9 @@ async def process_file_in_background(
        )

        if connector:
            await _update_document_from_connector(last_created_doc, connector, session)
            await _update_document_from_connector(
                last_created_doc, connector, session
            )

        await task_logger.log_task_success(
            log_entry,
@@ -1053,7 +1056,9 @@ async def process_file_in_background(
        )

        if connector:
            await _update_document_from_connector(doc_result, connector, session)
            await _update_document_from_connector(
                doc_result, connector, session
            )

        await task_logger.log_task_success(
            log_entry,