mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-28 21:49:40 +02:00
Merge remote-tracking branch 'upstream/dev' into dev
This commit is contained in:
commit
60330622bf
54 changed files with 1542 additions and 34 deletions
|
|
@ -3,6 +3,12 @@ DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
|
|||
#Celery Config
|
||||
CELERY_BROKER_URL=redis://localhost:6379/0
|
||||
CELERY_RESULT_BACKEND=redis://localhost:6379/0
|
||||
# Optional: isolate queues when sharing Redis with other apps
|
||||
CELERY_TASK_DEFAULT_QUEUE=surfsense
|
||||
|
||||
# Redis for app-level features (heartbeats, podcast markers)
|
||||
# Defaults to CELERY_BROKER_URL when not set
|
||||
REDIS_APP_URL=redis://localhost:6379/0
|
||||
|
||||
#Electric(for migrations only)
|
||||
ELECTRIC_DB_USER=electric
|
||||
|
|
|
|||
185
surfsense_backend/alembic/versions/86_add_document_created_by.py
Normal file
185
surfsense_backend/alembic/versions/86_add_document_created_by.py
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
"""Add created_by_id column to documents table for document ownership tracking
|
||||
|
||||
Revision ID: 86
|
||||
Revises: 85
|
||||
Create Date: 2026-02-02
|
||||
|
||||
Changes:
|
||||
1. Add created_by_id column (UUID, nullable, foreign key to user.id)
|
||||
2. Create index on created_by_id for performance
|
||||
3. Backfill existing documents with search space owner's user_id (with progress indicator)
|
||||
"""
|
||||
|
||||
import sys
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "86"
|
||||
down_revision: str | None = "85"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
# Batch size for backfill operation
|
||||
BATCH_SIZE = 5000
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""Add created_by_id column to documents and backfill with search space owner."""
|
||||
|
||||
# 1. Add created_by_id column (nullable for backward compatibility)
|
||||
print("Step 1/4: Adding created_by_id column...")
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT 1 FROM information_schema.columns
|
||||
WHERE table_name = 'documents' AND column_name = 'created_by_id'
|
||||
) THEN
|
||||
ALTER TABLE documents
|
||||
ADD COLUMN created_by_id UUID;
|
||||
END IF;
|
||||
END$$;
|
||||
"""
|
||||
)
|
||||
print(" Done: created_by_id column added.")
|
||||
|
||||
# 2. Create index on created_by_id for efficient queries
|
||||
print("Step 2/4: Creating index on created_by_id...")
|
||||
op.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS ix_documents_created_by_id
|
||||
ON documents (created_by_id);
|
||||
"""
|
||||
)
|
||||
print(" Done: Index created.")
|
||||
|
||||
# 3. Add foreign key constraint with ON DELETE SET NULL
|
||||
# First check if constraint already exists
|
||||
print("Step 3/4: Adding foreign key constraint...")
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT 1 FROM information_schema.table_constraints
|
||||
WHERE constraint_name = 'fk_documents_created_by_id'
|
||||
AND table_name = 'documents'
|
||||
) THEN
|
||||
ALTER TABLE documents
|
||||
ADD CONSTRAINT fk_documents_created_by_id
|
||||
FOREIGN KEY (created_by_id) REFERENCES "user"(id)
|
||||
ON DELETE SET NULL;
|
||||
END IF;
|
||||
END$$;
|
||||
"""
|
||||
)
|
||||
print(" Done: Foreign key constraint added.")
|
||||
|
||||
# 4. Backfill existing documents with search space owner's user_id
|
||||
# Process in batches with progress indicator
|
||||
print("Step 4/4: Backfilling created_by_id for existing documents...")
|
||||
|
||||
connection = op.get_bind()
|
||||
|
||||
# Get total count of documents that need backfilling
|
||||
result = connection.execute(
|
||||
sa.text("""
|
||||
SELECT COUNT(*) FROM documents WHERE created_by_id IS NULL
|
||||
""")
|
||||
)
|
||||
total_count = result.scalar()
|
||||
|
||||
if total_count == 0:
|
||||
print(" No documents need backfilling. Skipping.")
|
||||
return
|
||||
|
||||
print(f" Total documents to backfill: {total_count:,}")
|
||||
|
||||
processed = 0
|
||||
batch_num = 0
|
||||
|
||||
while processed < total_count:
|
||||
batch_num += 1
|
||||
|
||||
# Update a batch of documents using a subquery to limit the update
|
||||
# We use ctid (tuple identifier) for efficient batching in PostgreSQL
|
||||
result = connection.execute(
|
||||
sa.text("""
|
||||
UPDATE documents
|
||||
SET created_by_id = searchspaces.user_id
|
||||
FROM searchspaces
|
||||
WHERE documents.search_space_id = searchspaces.id
|
||||
AND documents.created_by_id IS NULL
|
||||
AND documents.id IN (
|
||||
SELECT d.id FROM documents d
|
||||
WHERE d.created_by_id IS NULL
|
||||
LIMIT :batch_size
|
||||
)
|
||||
"""),
|
||||
{"batch_size": BATCH_SIZE}
|
||||
)
|
||||
|
||||
rows_updated = result.rowcount
|
||||
if rows_updated == 0:
|
||||
# No more rows to update
|
||||
break
|
||||
|
||||
processed += rows_updated
|
||||
progress_pct = min(100.0, (processed / total_count) * 100)
|
||||
|
||||
# Print progress with carriage return for in-place update
|
||||
sys.stdout.write(f"\r Progress: {processed:,}/{total_count:,} documents ({progress_pct:.1f}%) - Batch {batch_num}")
|
||||
sys.stdout.flush()
|
||||
|
||||
# Final newline after progress
|
||||
print()
|
||||
print(f" Done: Backfilled {processed:,} documents.")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
"""Remove created_by_id column from documents."""
|
||||
|
||||
# Drop foreign key constraint
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (
|
||||
SELECT 1 FROM information_schema.table_constraints
|
||||
WHERE constraint_name = 'fk_documents_created_by_id'
|
||||
AND table_name = 'documents'
|
||||
) THEN
|
||||
ALTER TABLE documents
|
||||
DROP CONSTRAINT fk_documents_created_by_id;
|
||||
END IF;
|
||||
END$$;
|
||||
"""
|
||||
)
|
||||
|
||||
# Drop index
|
||||
op.execute(
|
||||
"""
|
||||
DROP INDEX IF EXISTS ix_documents_created_by_id;
|
||||
"""
|
||||
)
|
||||
|
||||
# Drop column
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (
|
||||
SELECT 1 FROM information_schema.columns
|
||||
WHERE table_name = 'documents' AND column_name = 'created_by_id'
|
||||
) THEN
|
||||
ALTER TABLE documents
|
||||
DROP COLUMN created_by_id;
|
||||
END IF;
|
||||
END$$;
|
||||
"""
|
||||
)
|
||||
|
|
@ -0,0 +1,170 @@
|
|||
"""Add connector_id column to documents table for linking documents to their source connector
|
||||
|
||||
Revision ID: 87
|
||||
Revises: 86
|
||||
Create Date: 2026-02-02
|
||||
|
||||
Changes:
|
||||
1. Add connector_id column (Integer, nullable, foreign key to search_source_connectors.id)
|
||||
2. Create index on connector_id for efficient bulk deletion queries
|
||||
3. SET NULL on delete - allows controlled cleanup in application code
|
||||
4. Backfill existing documents based on document_type and search_space_id matching
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "87"
|
||||
down_revision: str | None = "86"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""Add connector_id column to documents and backfill from existing connectors."""
|
||||
|
||||
# 1. Add connector_id column (nullable - for manually uploaded docs without connector)
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT 1 FROM information_schema.columns
|
||||
WHERE table_name = 'documents' AND column_name = 'connector_id'
|
||||
) THEN
|
||||
ALTER TABLE documents
|
||||
ADD COLUMN connector_id INTEGER;
|
||||
END IF;
|
||||
END$$;
|
||||
"""
|
||||
)
|
||||
|
||||
# 2. Create index on connector_id for efficient cleanup queries
|
||||
op.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS ix_documents_connector_id
|
||||
ON documents (connector_id);
|
||||
"""
|
||||
)
|
||||
|
||||
# 3. Add foreign key constraint with ON DELETE SET NULL
|
||||
# SET NULL allows us to delete documents in controlled batches before deleting connector
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT 1 FROM information_schema.table_constraints
|
||||
WHERE constraint_name = 'fk_documents_connector_id'
|
||||
AND table_name = 'documents'
|
||||
) THEN
|
||||
ALTER TABLE documents
|
||||
ADD CONSTRAINT fk_documents_connector_id
|
||||
FOREIGN KEY (connector_id) REFERENCES search_source_connectors(id)
|
||||
ON DELETE SET NULL;
|
||||
END IF;
|
||||
END$$;
|
||||
"""
|
||||
)
|
||||
|
||||
# 4. Backfill existing documents with connector_id based on document_type matching
|
||||
# This maps document types to their corresponding connector types
|
||||
# Only backfills for documents in search spaces that have exactly one connector of that type
|
||||
|
||||
# Map of document_type -> connector_type for backfilling
|
||||
document_connector_mappings = [
|
||||
("NOTION_CONNECTOR", "NOTION_CONNECTOR"),
|
||||
("SLACK_CONNECTOR", "SLACK_CONNECTOR"),
|
||||
("TEAMS_CONNECTOR", "TEAMS_CONNECTOR"),
|
||||
("GITHUB_CONNECTOR", "GITHUB_CONNECTOR"),
|
||||
("LINEAR_CONNECTOR", "LINEAR_CONNECTOR"),
|
||||
("DISCORD_CONNECTOR", "DISCORD_CONNECTOR"),
|
||||
("JIRA_CONNECTOR", "JIRA_CONNECTOR"),
|
||||
("CONFLUENCE_CONNECTOR", "CONFLUENCE_CONNECTOR"),
|
||||
("CLICKUP_CONNECTOR", "CLICKUP_CONNECTOR"),
|
||||
("GOOGLE_CALENDAR_CONNECTOR", "GOOGLE_CALENDAR_CONNECTOR"),
|
||||
("GOOGLE_GMAIL_CONNECTOR", "GOOGLE_GMAIL_CONNECTOR"),
|
||||
("GOOGLE_DRIVE_FILE", "GOOGLE_DRIVE_CONNECTOR"),
|
||||
("AIRTABLE_CONNECTOR", "AIRTABLE_CONNECTOR"),
|
||||
("LUMA_CONNECTOR", "LUMA_CONNECTOR"),
|
||||
("ELASTICSEARCH_CONNECTOR", "ELASTICSEARCH_CONNECTOR"),
|
||||
("BOOKSTACK_CONNECTOR", "BOOKSTACK_CONNECTOR"),
|
||||
("CIRCLEBACK", "CIRCLEBACK_CONNECTOR"),
|
||||
("OBSIDIAN_CONNECTOR", "OBSIDIAN_CONNECTOR"),
|
||||
("COMPOSIO_GOOGLE_DRIVE_CONNECTOR", "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"),
|
||||
("COMPOSIO_GMAIL_CONNECTOR", "COMPOSIO_GMAIL_CONNECTOR"),
|
||||
("COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"),
|
||||
("CRAWLED_URL", "WEBCRAWLER_CONNECTOR"),
|
||||
]
|
||||
|
||||
for doc_type, connector_type in document_connector_mappings:
|
||||
# Backfill connector_id for documents where:
|
||||
# 1. Document has this document_type
|
||||
# 2. Document doesn't already have a connector_id
|
||||
# 3. There's exactly one connector of this type in the same search space
|
||||
# This safely handles most cases while avoiding ambiguity
|
||||
op.execute(
|
||||
f"""
|
||||
UPDATE documents d
|
||||
SET connector_id = (
|
||||
SELECT ssc.id
|
||||
FROM search_source_connectors ssc
|
||||
WHERE ssc.search_space_id = d.search_space_id
|
||||
AND ssc.connector_type = '{connector_type}'
|
||||
LIMIT 1
|
||||
)
|
||||
WHERE d.document_type = '{doc_type}'
|
||||
AND d.connector_id IS NULL
|
||||
AND EXISTS (
|
||||
SELECT 1 FROM search_source_connectors ssc
|
||||
WHERE ssc.search_space_id = d.search_space_id
|
||||
AND ssc.connector_type = '{connector_type}'
|
||||
);
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
"""Remove connector_id column from documents."""
|
||||
|
||||
# Drop foreign key constraint
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (
|
||||
SELECT 1 FROM information_schema.table_constraints
|
||||
WHERE constraint_name = 'fk_documents_connector_id'
|
||||
AND table_name = 'documents'
|
||||
) THEN
|
||||
ALTER TABLE documents
|
||||
DROP CONSTRAINT fk_documents_connector_id;
|
||||
END IF;
|
||||
END$$;
|
||||
"""
|
||||
)
|
||||
|
||||
# Drop index
|
||||
op.execute(
|
||||
"""
|
||||
DROP INDEX IF EXISTS ix_documents_connector_id;
|
||||
"""
|
||||
)
|
||||
|
||||
# Drop column
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (
|
||||
SELECT 1 FROM information_schema.columns
|
||||
WHERE table_name = 'documents' AND column_name = 'connector_id'
|
||||
) THEN
|
||||
ALTER TABLE documents
|
||||
DROP COLUMN connector_id;
|
||||
END IF;
|
||||
END$$;
|
||||
"""
|
||||
)
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
"""Make podcast_transcript nullable
|
||||
|
||||
Revision ID: 88
|
||||
Revises: 87
|
||||
Create Date: 2026-02-02
|
||||
|
||||
The podcast workflow now creates a podcast record with PENDING status first,
|
||||
then fills in the transcript after generation completes. This requires
|
||||
podcast_transcript to be nullable.
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
revision: str = "88"
|
||||
down_revision: str | None = "87"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Make podcast_transcript nullable and remove the server default
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE podcasts
|
||||
ALTER COLUMN podcast_transcript DROP NOT NULL;
|
||||
"""
|
||||
)
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE podcasts
|
||||
ALTER COLUMN podcast_transcript DROP DEFAULT;
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Set empty JSON for any NULL values before adding NOT NULL constraint
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE podcasts
|
||||
SET podcast_transcript = '{}'::jsonb
|
||||
WHERE podcast_transcript IS NULL;
|
||||
"""
|
||||
)
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE podcasts
|
||||
ALTER COLUMN podcast_transcript SET DEFAULT '{}';
|
||||
"""
|
||||
)
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE podcasts
|
||||
ALTER COLUMN podcast_transcript SET NOT NULL;
|
||||
"""
|
||||
)
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
"""Make podcast file_location nullable
|
||||
|
||||
Revision ID: 89
|
||||
Revises: 88
|
||||
Create Date: 2026-02-03
|
||||
|
||||
The podcast workflow creates a podcast record with PENDING status first,
|
||||
then fills in the file_location after audio generation completes. This requires
|
||||
file_location to be nullable.
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
revision: str = "89"
|
||||
down_revision: str | None = "88"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Make file_location nullable
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE podcasts
|
||||
ALTER COLUMN file_location DROP NOT NULL;
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Set empty string for any NULL values before adding NOT NULL constraint
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE podcasts
|
||||
SET file_location = ''
|
||||
WHERE file_location IS NULL;
|
||||
"""
|
||||
)
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE podcasts
|
||||
ALTER COLUMN file_location SET NOT NULL;
|
||||
"""
|
||||
)
|
||||
|
|
@ -21,8 +21,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
from app.db import Podcast, PodcastStatus
|
||||
|
||||
# Redis connection for tracking active podcast tasks
|
||||
# Uses the same Redis instance as Celery
|
||||
REDIS_URL = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
|
||||
# Defaults to the Celery broker when REDIS_APP_URL is not set
|
||||
REDIS_URL = os.getenv(
|
||||
"REDIS_APP_URL",
|
||||
os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0"),
|
||||
)
|
||||
_redis_client: redis.Redis | None = None
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ def init_worker(**kwargs):
|
|||
# Get Celery configuration from environment
|
||||
CELERY_BROKER_URL = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
|
||||
CELERY_RESULT_BACKEND = os.getenv("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
|
||||
CELERY_TASK_DEFAULT_QUEUE = os.getenv("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
|
||||
|
||||
# Get schedule checker interval from environment
|
||||
# Format: "<number><unit>" where unit is 'm' (minutes) or 'h' (hours)
|
||||
|
|
@ -80,6 +81,7 @@ celery_app = Celery(
|
|||
"app.tasks.celery_tasks.blocknote_migration_tasks",
|
||||
"app.tasks.celery_tasks.document_reindex_tasks",
|
||||
"app.tasks.celery_tasks.stale_notification_cleanup_task",
|
||||
"app.tasks.celery_tasks.connector_deletion_task",
|
||||
],
|
||||
)
|
||||
|
||||
|
|
@ -91,6 +93,9 @@ celery_app.conf.update(
|
|||
result_serializer="json",
|
||||
timezone="UTC",
|
||||
enable_utc=True,
|
||||
task_default_queue=CELERY_TASK_DEFAULT_QUEUE,
|
||||
task_default_exchange=CELERY_TASK_DEFAULT_QUEUE,
|
||||
task_default_routing_key=CELERY_TASK_DEFAULT_QUEUE,
|
||||
# Task execution settings
|
||||
task_track_started=True,
|
||||
task_time_limit=28800, # 8 hour hard limit
|
||||
|
|
|
|||
|
|
@ -122,8 +122,52 @@ global_llm_configs:
|
|||
use_default_system_instructions: false
|
||||
citations_enabled: true
|
||||
|
||||
# Example: Groq - Fast inference
|
||||
# Example: Azure OpenAI GPT-4o
|
||||
# IMPORTANT: For Azure deployments, always include 'base_model' in litellm_params
|
||||
# to enable accurate token counting, cost tracking, and max token limits
|
||||
- id: -5
|
||||
name: "Global Azure GPT-4o"
|
||||
description: "Azure OpenAI GPT-4o deployment"
|
||||
provider: "AZURE"
|
||||
# model_name format for Azure: azure/<your-deployment-name>
|
||||
model_name: "azure/gpt-4o-deployment"
|
||||
api_key: "your-azure-api-key-here"
|
||||
api_base: "https://your-resource.openai.azure.com"
|
||||
api_version: "2024-02-15-preview" # Azure API version
|
||||
rpm: 1000
|
||||
tpm: 150000
|
||||
litellm_params:
|
||||
temperature: 0.7
|
||||
max_tokens: 4000
|
||||
# REQUIRED for Azure: Specify the underlying OpenAI model
|
||||
# This fixes "Could not identify azure model" warnings
|
||||
# Common base_model values: gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini, gpt-3.5-turbo
|
||||
base_model: "gpt-4o"
|
||||
system_instructions: ""
|
||||
use_default_system_instructions: true
|
||||
citations_enabled: true
|
||||
|
||||
# Example: Azure OpenAI GPT-4 Turbo
|
||||
- id: -6
|
||||
name: "Global Azure GPT-4 Turbo"
|
||||
description: "Azure OpenAI GPT-4 Turbo deployment"
|
||||
provider: "AZURE"
|
||||
model_name: "azure/gpt-4-turbo-deployment"
|
||||
api_key: "your-azure-api-key-here"
|
||||
api_base: "https://your-resource.openai.azure.com"
|
||||
api_version: "2024-02-15-preview"
|
||||
rpm: 500
|
||||
tpm: 100000
|
||||
litellm_params:
|
||||
temperature: 0.7
|
||||
max_tokens: 4000
|
||||
base_model: "gpt-4-turbo" # Maps to gpt-4-turbo-preview
|
||||
system_instructions: ""
|
||||
use_default_system_instructions: true
|
||||
citations_enabled: true
|
||||
|
||||
# Example: Groq - Fast inference
|
||||
- id: -7
|
||||
name: "Global Groq Llama 3"
|
||||
description: "Ultra-fast Llama 3 70B via Groq"
|
||||
provider: "GROQ"
|
||||
|
|
@ -150,3 +194,11 @@ global_llm_configs:
|
|||
# - All standard LiteLLM providers are supported
|
||||
# - rpm/tpm: Optional rate limits for load balancing (requests/tokens per minute)
|
||||
# These help the router distribute load evenly and avoid rate limit errors
|
||||
#
|
||||
# AZURE-SPECIFIC NOTES:
|
||||
# - Always add 'base_model' in litellm_params for Azure deployments
|
||||
# - This fixes "Could not identify azure model 'X'" warnings
|
||||
# - base_model should match the underlying OpenAI model (e.g., gpt-4o, gpt-4-turbo, gpt-3.5-turbo)
|
||||
# - model_name format: "azure/<your-deployment-name>"
|
||||
# - api_version: Use a recent Azure API version (e.g., "2024-02-15-preview")
|
||||
# - See: https://docs.litellm.ai/docs/proxy/cost_tracking#spend-tracking-for-azure-openai-models
|
||||
|
|
|
|||
|
|
@ -394,6 +394,8 @@ async def _process_gmail_message_batch(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
|
|
|
|||
|
|
@ -442,6 +442,8 @@ async def index_composio_google_calendar(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
|
|
|
|||
|
|
@ -1248,7 +1248,6 @@ async def _process_single_drive_file(
|
|||
"file_name": file_name,
|
||||
"FILE_NAME": file_name, # For compatibility
|
||||
"mime_type": mime_type,
|
||||
"connector_id": connector_id,
|
||||
"toolkit_id": "googledrive",
|
||||
"source": "composio",
|
||||
},
|
||||
|
|
@ -1258,6 +1257,8 @@ async def _process_single_drive_file(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ async def download_and_process_file(
|
|||
session: AsyncSession,
|
||||
task_logger: TaskLoggingService,
|
||||
log_entry: Log,
|
||||
connector_id: int | None = None,
|
||||
) -> tuple[Any, str | None, dict[str, Any] | None]:
|
||||
"""
|
||||
Download Google Drive file and process using Surfsense file processors.
|
||||
|
|
@ -37,6 +38,7 @@ async def download_and_process_file(
|
|||
session: Database session
|
||||
task_logger: Task logging service
|
||||
log_entry: Log entry for tracking
|
||||
connector_id: ID of the connector (for de-indexing support)
|
||||
|
||||
Returns:
|
||||
Tuple of (Document object if successful, error message if failed, file metadata dict)
|
||||
|
|
@ -92,6 +94,9 @@ async def download_and_process_file(
|
|||
"source_connector": "google_drive",
|
||||
},
|
||||
}
|
||||
# Include connector_id for de-indexing support
|
||||
if connector_id is not None:
|
||||
connector_info["connector_id"] = connector_id
|
||||
|
||||
# Add additional Drive metadata if available
|
||||
if "modifiedTime" in file:
|
||||
|
|
|
|||
|
|
@ -127,7 +127,12 @@ async def get_valid_credentials(
|
|||
)
|
||||
creds_dict["_token_encrypted"] = True
|
||||
|
||||
connector.config = creds_dict
|
||||
# IMPORTANT: Merge new credentials with existing config to preserve
|
||||
# user settings like selected_folders, selected_files, indexing_options,
|
||||
# folder_tokens, etc. that would otherwise be wiped on token refresh.
|
||||
existing_config = connector.config.copy() if connector.config else {}
|
||||
existing_config.update(creds_dict)
|
||||
connector.config = existing_config
|
||||
flag_modified(connector, "config")
|
||||
await session.commit()
|
||||
|
||||
|
|
|
|||
|
|
@ -761,7 +761,27 @@ class Document(BaseModel, TimestampMixin):
|
|||
search_space_id = Column(
|
||||
Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
|
||||
)
|
||||
|
||||
# Track who created/uploaded this document
|
||||
created_by_id = Column(
|
||||
UUID(as_uuid=True),
|
||||
ForeignKey("user.id", ondelete="SET NULL"),
|
||||
nullable=True, # Nullable for backward compatibility with existing records
|
||||
index=True,
|
||||
)
|
||||
|
||||
# Track which connector created this document (for cleanup on connector deletion)
|
||||
connector_id = Column(
|
||||
Integer,
|
||||
ForeignKey("search_source_connectors.id", ondelete="SET NULL"),
|
||||
nullable=True, # Nullable for manually uploaded docs without connector
|
||||
index=True,
|
||||
)
|
||||
|
||||
# Relationships
|
||||
search_space = relationship("SearchSpace", back_populates="documents")
|
||||
created_by = relationship("User", back_populates="documents")
|
||||
connector = relationship("SearchSourceConnector", back_populates="documents")
|
||||
chunks = relationship(
|
||||
"Chunk", back_populates="document", cascade="all, delete-orphan"
|
||||
)
|
||||
|
|
@ -990,6 +1010,9 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
|
|||
UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False
|
||||
)
|
||||
|
||||
# Documents created by this connector (for cleanup on connector deletion)
|
||||
documents = relationship("Document", back_populates="connector")
|
||||
|
||||
|
||||
class NewLLMConfig(BaseModel, TimestampMixin):
|
||||
"""
|
||||
|
|
@ -1296,6 +1319,13 @@ if config.AUTH_TYPE == "GOOGLE":
|
|||
passive_deletes=True,
|
||||
)
|
||||
|
||||
# Documents created/uploaded by this user
|
||||
documents = relationship(
|
||||
"Document",
|
||||
back_populates="created_by",
|
||||
passive_deletes=True,
|
||||
)
|
||||
|
||||
# User memories for personalized AI responses
|
||||
memories = relationship(
|
||||
"UserMemory",
|
||||
|
|
@ -1354,6 +1384,13 @@ else:
|
|||
passive_deletes=True,
|
||||
)
|
||||
|
||||
# Documents created/uploaded by this user
|
||||
documents = relationship(
|
||||
"Document",
|
||||
back_populates="created_by",
|
||||
passive_deletes=True,
|
||||
)
|
||||
|
||||
# User memories for personalized AI responses
|
||||
memories = relationship(
|
||||
"UserMemory",
|
||||
|
|
|
|||
|
|
@ -9,8 +9,12 @@ import logging
|
|||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import SearchSourceConnector, SearchSourceConnectorType, get_async_session
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -212,6 +216,7 @@ def format_circleback_meeting_to_markdown(payload: CirclebackWebhookPayload) ->
|
|||
async def receive_circleback_webhook(
|
||||
search_space_id: int,
|
||||
payload: CirclebackWebhookPayload,
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
):
|
||||
"""
|
||||
Receive and process a Circleback webhook.
|
||||
|
|
@ -223,6 +228,7 @@ async def receive_circleback_webhook(
|
|||
Args:
|
||||
search_space_id: The ID of the search space to save the document to
|
||||
payload: The Circleback webhook payload containing meeting data
|
||||
session: Database session for looking up the connector
|
||||
|
||||
Returns:
|
||||
Success message with document details
|
||||
|
|
@ -236,6 +242,26 @@ async def receive_circleback_webhook(
|
|||
f"Received Circleback webhook for meeting {payload.id} in search space {search_space_id}"
|
||||
)
|
||||
|
||||
# Look up the Circleback connector for this search space
|
||||
connector_result = await session.execute(
|
||||
select(SearchSourceConnector.id).where(
|
||||
SearchSourceConnector.search_space_id == search_space_id,
|
||||
SearchSourceConnector.connector_type
|
||||
== SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
|
||||
)
|
||||
)
|
||||
connector_id = connector_result.scalar_one_or_none()
|
||||
|
||||
if connector_id:
|
||||
logger.info(
|
||||
f"Found Circleback connector {connector_id} for search space {search_space_id}"
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
f"No Circleback connector found for search space {search_space_id}. "
|
||||
"Document will be created without connector_id."
|
||||
)
|
||||
|
||||
# Convert to markdown
|
||||
markdown_content = format_circleback_meeting_to_markdown(payload)
|
||||
|
||||
|
|
@ -264,6 +290,7 @@ async def receive_circleback_webhook(
|
|||
markdown_content=markdown_content,
|
||||
metadata=meeting_metadata,
|
||||
search_space_id=search_space_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ from pydantic import ValidationError
|
|||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.future import select
|
||||
from sqlalchemy.orm.attributes import flag_modified
|
||||
|
||||
from app.config import config
|
||||
from app.db import (
|
||||
|
|
@ -330,10 +331,19 @@ async def composio_callback(
|
|||
)
|
||||
|
||||
# Update existing connector with new connected_account_id
|
||||
# IMPORTANT: Merge new credentials with existing config to preserve
|
||||
# user settings like selected_folders, selected_files, indexing_options,
|
||||
# drive_page_token, etc. that would otherwise be wiped on reconnection.
|
||||
logger.info(
|
||||
f"Updating existing Composio connector {existing_connector.id} with new connected_account_id {final_connected_account_id}"
|
||||
)
|
||||
existing_connector.config = connector_config
|
||||
existing_config = (
|
||||
existing_connector.config.copy() if existing_connector.config else {}
|
||||
)
|
||||
existing_config.update(connector_config)
|
||||
existing_connector.config = existing_config
|
||||
|
||||
flag_modified(existing_connector, "config")
|
||||
await session.commit()
|
||||
await session.refresh(existing_connector)
|
||||
|
||||
|
|
|
|||
|
|
@ -76,6 +76,7 @@ async def create_note(
|
|||
document_metadata={"NOTE": True},
|
||||
embedding=None, # Will be generated on first reindex
|
||||
updated_at=datetime.now(UTC),
|
||||
created_by_id=user.id, # Track who created this note
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
@ -93,6 +94,7 @@ async def create_note(
|
|||
search_space_id=document.search_space_id,
|
||||
created_at=document.created_at,
|
||||
updated_at=document.updated_at,
|
||||
created_by_id=document.created_by_id,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -91,7 +91,10 @@ def get_heartbeat_redis_client() -> redis.Redis:
|
|||
"""Get or create Redis client for heartbeat tracking."""
|
||||
global _heartbeat_redis_client
|
||||
if _heartbeat_redis_client is None:
|
||||
redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
|
||||
redis_url = os.getenv(
|
||||
"REDIS_APP_URL",
|
||||
os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0"),
|
||||
)
|
||||
_heartbeat_redis_client = redis.from_url(redis_url, decode_responses=True)
|
||||
return _heartbeat_redis_client
|
||||
|
||||
|
|
@ -524,9 +527,17 @@ async def delete_search_source_connector(
|
|||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""
|
||||
Delete a search source connector.
|
||||
Delete a search source connector and all its associated documents.
|
||||
|
||||
The deletion runs in background via Celery task. User is notified
|
||||
via the notification system when complete (no polling required).
|
||||
|
||||
Requires CONNECTORS_DELETE permission.
|
||||
"""
|
||||
from app.tasks.celery_tasks.connector_deletion_task import (
|
||||
delete_connector_with_documents_task,
|
||||
)
|
||||
|
||||
try:
|
||||
# Get the connector first
|
||||
result = await session.execute(
|
||||
|
|
@ -548,7 +559,12 @@ async def delete_search_source_connector(
|
|||
"You don't have permission to delete this connector",
|
||||
)
|
||||
|
||||
# Delete any periodic schedule associated with this connector
|
||||
# Store connector info before we queue the deletion task
|
||||
connector_name = db_connector.name
|
||||
connector_type = db_connector.connector_type.value
|
||||
search_space_id = db_connector.search_space_id
|
||||
|
||||
# Delete any periodic schedule associated with this connector (lightweight, sync)
|
||||
if db_connector.periodic_indexing_enabled:
|
||||
success = delete_periodic_schedule(connector_id)
|
||||
if not success:
|
||||
|
|
@ -556,7 +572,7 @@ async def delete_search_source_connector(
|
|||
f"Failed to delete periodic schedule for connector {connector_id}"
|
||||
)
|
||||
|
||||
# For Composio connectors, also delete the connected account in Composio
|
||||
# For Composio connectors, delete the connected account in Composio (lightweight API call, sync)
|
||||
composio_connector_types = [
|
||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
|
||||
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
|
||||
|
|
@ -588,16 +604,33 @@ async def delete_search_source_connector(
|
|||
f"Error deleting Composio connected account {composio_connected_account_id}: {composio_error!s}"
|
||||
)
|
||||
|
||||
await session.delete(db_connector)
|
||||
await session.commit()
|
||||
return {"message": "Search source connector deleted successfully"}
|
||||
# Queue background task to delete documents and connector
|
||||
# This handles potentially large document counts without blocking the API
|
||||
delete_connector_with_documents_task.delay(
|
||||
connector_id=connector_id,
|
||||
user_id=str(user.id),
|
||||
search_space_id=search_space_id,
|
||||
connector_name=connector_name,
|
||||
connector_type=connector_type,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Queued deletion task for connector {connector_id} ({connector_name})"
|
||||
)
|
||||
|
||||
return {
|
||||
"message": "Connector deletion started. You will be notified when complete.",
|
||||
"status": "queued",
|
||||
"connector_id": connector_id,
|
||||
"connector_name": connector_name,
|
||||
}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Failed to delete search source connector: {e!s}",
|
||||
detail=f"Failed to start connector deletion: {e!s}",
|
||||
) from e
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
from datetime import datetime
|
||||
from typing import TypeVar
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
|
|
@ -51,6 +52,7 @@ class DocumentRead(BaseModel):
|
|||
created_at: datetime
|
||||
updated_at: datetime | None
|
||||
search_space_id: int
|
||||
created_by_id: UUID | None = None # User who created/uploaded this document
|
||||
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,269 @@
|
|||
"""Celery task for background connector deletion.
|
||||
|
||||
This task handles the deletion of all documents associated with a connector
|
||||
in the background, then deletes the connector itself. User is notified via
|
||||
the notification system when complete (no polling required).
|
||||
|
||||
Features:
|
||||
- Batch deletion to handle large document counts
|
||||
- Automatic retry on failure
|
||||
- Progress tracking via notifications
|
||||
- Handles both success and failure notifications
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from uuid import UUID
|
||||
|
||||
from sqlalchemy import delete, func, select
|
||||
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
|
||||
from sqlalchemy.pool import NullPool
|
||||
|
||||
from app.celery_app import celery_app
|
||||
from app.config import config
|
||||
from app.db import Document, Notification, SearchSourceConnector
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Batch size for document deletion
|
||||
DELETION_BATCH_SIZE = 500
|
||||
|
||||
|
||||
def _get_celery_session_maker():
|
||||
"""Create async session maker for Celery tasks."""
|
||||
engine = create_async_engine(
|
||||
config.DATABASE_URL,
|
||||
poolclass=NullPool,
|
||||
echo=False,
|
||||
)
|
||||
return async_sessionmaker(engine, expire_on_commit=False), engine
|
||||
|
||||
|
||||
@celery_app.task(
|
||||
bind=True,
|
||||
name="delete_connector_with_documents",
|
||||
max_retries=3,
|
||||
default_retry_delay=60,
|
||||
autoretry_for=(Exception,),
|
||||
retry_backoff=True,
|
||||
)
|
||||
def delete_connector_with_documents_task(
|
||||
self,
|
||||
connector_id: int,
|
||||
user_id: str,
|
||||
search_space_id: int,
|
||||
connector_name: str,
|
||||
connector_type: str,
|
||||
):
|
||||
"""
|
||||
Background task to delete a connector and all its associated documents.
|
||||
|
||||
Creates a notification when complete (success or failure).
|
||||
No polling required - user sees notification in UI.
|
||||
|
||||
Args:
|
||||
connector_id: ID of the connector to delete
|
||||
user_id: ID of the user who initiated the deletion
|
||||
search_space_id: ID of the search space
|
||||
connector_name: Name of the connector (for notification message)
|
||||
connector_type: Type of the connector (for logging)
|
||||
"""
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
||||
try:
|
||||
return loop.run_until_complete(
|
||||
_delete_connector_async(
|
||||
connector_id=connector_id,
|
||||
user_id=user_id,
|
||||
search_space_id=search_space_id,
|
||||
connector_name=connector_name,
|
||||
connector_type=connector_type,
|
||||
)
|
||||
)
|
||||
finally:
|
||||
loop.close()
|
||||
|
||||
|
||||
async def _delete_connector_async(
|
||||
connector_id: int,
|
||||
user_id: str,
|
||||
search_space_id: int,
|
||||
connector_name: str,
|
||||
connector_type: str,
|
||||
) -> dict:
|
||||
"""
|
||||
Async implementation of connector deletion.
|
||||
|
||||
Steps:
|
||||
1. Count total documents to delete
|
||||
2. Delete documents in batches (chunks cascade automatically)
|
||||
3. Delete the connector record
|
||||
4. Create success notification
|
||||
|
||||
On failure, creates failure notification and re-raises exception.
|
||||
"""
|
||||
session_maker, engine = _get_celery_session_maker()
|
||||
total_deleted = 0
|
||||
|
||||
try:
|
||||
async with session_maker() as session:
|
||||
# Step 1: Count total documents for this connector
|
||||
count_result = await session.execute(
|
||||
select(func.count(Document.id)).where(
|
||||
Document.connector_id == connector_id
|
||||
)
|
||||
)
|
||||
total_docs = count_result.scalar() or 0
|
||||
|
||||
logger.info(
|
||||
f"Starting deletion of connector {connector_id} ({connector_name}). "
|
||||
f"Documents to delete: {total_docs}"
|
||||
)
|
||||
|
||||
# Step 2: Delete documents in batches
|
||||
while True:
|
||||
# Get batch of document IDs
|
||||
result = await session.execute(
|
||||
select(Document.id)
|
||||
.where(Document.connector_id == connector_id)
|
||||
.limit(DELETION_BATCH_SIZE)
|
||||
)
|
||||
doc_ids = [row[0] for row in result.fetchall()]
|
||||
|
||||
if not doc_ids:
|
||||
break
|
||||
|
||||
# Delete this batch (chunks are deleted via CASCADE)
|
||||
await session.execute(delete(Document).where(Document.id.in_(doc_ids)))
|
||||
await session.commit()
|
||||
|
||||
total_deleted += len(doc_ids)
|
||||
logger.info(
|
||||
f"Deleted batch of {len(doc_ids)} documents. "
|
||||
f"Progress: {total_deleted}/{total_docs}"
|
||||
)
|
||||
|
||||
# Step 3: Delete the connector record
|
||||
result = await session.execute(
|
||||
select(SearchSourceConnector).where(
|
||||
SearchSourceConnector.id == connector_id
|
||||
)
|
||||
)
|
||||
connector = result.scalar_one_or_none()
|
||||
|
||||
if connector:
|
||||
await session.delete(connector)
|
||||
logger.info(f"Deleted connector record: {connector_id}")
|
||||
else:
|
||||
logger.warning(
|
||||
f"Connector {connector_id} not found - may have been already deleted"
|
||||
)
|
||||
|
||||
# Step 4: Create success notification
|
||||
doc_text = "document" if total_deleted == 1 else "documents"
|
||||
notification = Notification(
|
||||
user_id=UUID(user_id),
|
||||
search_space_id=search_space_id,
|
||||
type="connector_deletion",
|
||||
title=f"{connector_name} Removed",
|
||||
message=f"Connector and {total_deleted} {doc_text} have been removed from your knowledge base.",
|
||||
notification_metadata={
|
||||
"connector_id": connector_id,
|
||||
"connector_name": connector_name,
|
||||
"connector_type": connector_type,
|
||||
"documents_deleted": total_deleted,
|
||||
"status": "completed",
|
||||
},
|
||||
)
|
||||
session.add(notification)
|
||||
await session.commit()
|
||||
|
||||
logger.info(
|
||||
f"Connector {connector_id} ({connector_name}) deleted successfully. "
|
||||
f"Total documents deleted: {total_deleted}"
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"connector_id": connector_id,
|
||||
"connector_name": connector_name,
|
||||
"documents_deleted": total_deleted,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to delete connector {connector_id} ({connector_name}): {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
# Create failure notification
|
||||
try:
|
||||
async with session_maker() as session:
|
||||
notification = Notification(
|
||||
user_id=UUID(user_id),
|
||||
search_space_id=search_space_id,
|
||||
type="connector_deletion",
|
||||
title=f"Failed to Remove {connector_name}",
|
||||
message="Something went wrong while removing this connector. Please try again.",
|
||||
notification_metadata={
|
||||
"connector_id": connector_id,
|
||||
"connector_name": connector_name,
|
||||
"connector_type": connector_type,
|
||||
"documents_deleted": total_deleted,
|
||||
"status": "failed",
|
||||
"error": str(e),
|
||||
},
|
||||
)
|
||||
session.add(notification)
|
||||
await session.commit()
|
||||
except Exception as notify_error:
|
||||
logger.error(
|
||||
f"Failed to create failure notification: {notify_error!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
# Re-raise to trigger Celery retry
|
||||
raise
|
||||
|
||||
finally:
|
||||
await engine.dispose()
|
||||
|
||||
|
||||
async def delete_documents_by_connector_id(
|
||||
session,
|
||||
connector_id: int,
|
||||
batch_size: int = DELETION_BATCH_SIZE,
|
||||
) -> int:
|
||||
"""
|
||||
Delete all documents associated with a connector in batches.
|
||||
|
||||
This is a utility function that can be used independently of the Celery task
|
||||
for synchronous deletion scenarios (e.g., small document counts).
|
||||
|
||||
Args:
|
||||
session: AsyncSession instance
|
||||
connector_id: ID of the connector
|
||||
batch_size: Number of documents to delete per batch
|
||||
|
||||
Returns:
|
||||
Total number of documents deleted
|
||||
"""
|
||||
total_deleted = 0
|
||||
|
||||
while True:
|
||||
result = await session.execute(
|
||||
select(Document.id)
|
||||
.where(Document.connector_id == connector_id)
|
||||
.limit(batch_size)
|
||||
)
|
||||
doc_ids = [row[0] for row in result.fetchall()]
|
||||
|
||||
if not doc_ids:
|
||||
break
|
||||
|
||||
await session.execute(delete(Document).where(Document.id.in_(doc_ids)))
|
||||
await session.commit()
|
||||
total_deleted += len(doc_ids)
|
||||
|
||||
return total_deleted
|
||||
|
|
@ -323,6 +323,28 @@ def process_file_upload_task(
|
|||
user_id: ID of the user
|
||||
"""
|
||||
import asyncio
|
||||
import os
|
||||
import traceback
|
||||
|
||||
logger.info(
|
||||
f"[process_file_upload] Task started - file: {filename}, "
|
||||
f"search_space_id: {search_space_id}, user_id: {user_id}"
|
||||
)
|
||||
logger.info(f"[process_file_upload] File path: {file_path}")
|
||||
|
||||
# Check if file exists and is accessible
|
||||
if not os.path.exists(file_path):
|
||||
logger.error(
|
||||
f"[process_file_upload] File does not exist: {file_path}. "
|
||||
"The temp file may have been cleaned up before the task ran."
|
||||
)
|
||||
return
|
||||
|
||||
try:
|
||||
file_size = os.path.getsize(file_path)
|
||||
logger.info(f"[process_file_upload] File size: {file_size} bytes")
|
||||
except Exception as e:
|
||||
logger.warning(f"[process_file_upload] Could not get file size: {e}")
|
||||
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
|
@ -331,6 +353,15 @@ def process_file_upload_task(
|
|||
loop.run_until_complete(
|
||||
_process_file_upload(file_path, filename, search_space_id, user_id)
|
||||
)
|
||||
logger.info(
|
||||
f"[process_file_upload] Task completed successfully for: {filename}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"[process_file_upload] Task failed for {filename}: {e}\n"
|
||||
f"Traceback:\n{traceback.format_exc()}"
|
||||
)
|
||||
raise
|
||||
finally:
|
||||
loop.close()
|
||||
|
||||
|
|
@ -343,16 +374,22 @@ async def _process_file_upload(
|
|||
|
||||
from app.tasks.document_processors.file_processors import process_file_in_background
|
||||
|
||||
logger.info(f"[_process_file_upload] Starting async processing for: {filename}")
|
||||
|
||||
async with get_celery_session_maker()() as session:
|
||||
logger.info(f"[_process_file_upload] Database session created for: {filename}")
|
||||
task_logger = TaskLoggingService(session, search_space_id)
|
||||
|
||||
# Get file size for notification metadata
|
||||
try:
|
||||
file_size = os.path.getsize(file_path)
|
||||
except Exception:
|
||||
logger.info(f"[_process_file_upload] File size: {file_size} bytes")
|
||||
except Exception as e:
|
||||
logger.warning(f"[_process_file_upload] Could not get file size: {e}")
|
||||
file_size = None
|
||||
|
||||
# Create notification for document processing
|
||||
logger.info(f"[_process_file_upload] Creating notification for: {filename}")
|
||||
notification = (
|
||||
await NotificationService.document_processing.notify_processing_started(
|
||||
session=session,
|
||||
|
|
@ -363,6 +400,9 @@ async def _process_file_upload(
|
|||
file_size=file_size,
|
||||
)
|
||||
)
|
||||
logger.info(
|
||||
f"[_process_file_upload] Notification created with ID: {notification.id if notification else 'None'}"
|
||||
)
|
||||
|
||||
log_entry = await task_logger.log_task_start(
|
||||
task_name="process_file_upload",
|
||||
|
|
@ -505,6 +545,7 @@ def process_circleback_meeting_task(
|
|||
markdown_content: str,
|
||||
metadata: dict,
|
||||
search_space_id: int,
|
||||
connector_id: int | None = None,
|
||||
):
|
||||
"""
|
||||
Celery task to process Circleback meeting webhook data.
|
||||
|
|
@ -515,6 +556,7 @@ def process_circleback_meeting_task(
|
|||
markdown_content: Meeting content formatted as markdown
|
||||
metadata: Meeting metadata dictionary
|
||||
search_space_id: ID of the search space
|
||||
connector_id: ID of the Circleback connector (for deletion support)
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
|
|
@ -529,6 +571,7 @@ def process_circleback_meeting_task(
|
|||
markdown_content,
|
||||
metadata,
|
||||
search_space_id,
|
||||
connector_id,
|
||||
)
|
||||
)
|
||||
finally:
|
||||
|
|
@ -541,6 +584,7 @@ async def _process_circleback_meeting(
|
|||
markdown_content: str,
|
||||
metadata: dict,
|
||||
search_space_id: int,
|
||||
connector_id: int | None = None,
|
||||
):
|
||||
"""Process Circleback meeting with new session."""
|
||||
from app.tasks.document_processors.circleback_processor import (
|
||||
|
|
@ -597,6 +641,7 @@ async def _process_circleback_meeting(
|
|||
markdown_content=markdown_content,
|
||||
metadata=metadata,
|
||||
search_space_id=search_space_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
if result:
|
||||
|
|
|
|||
|
|
@ -51,7 +51,10 @@ def _clear_generating_podcast(search_space_id: int) -> None:
|
|||
import redis
|
||||
|
||||
try:
|
||||
redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
|
||||
redis_url = os.getenv(
|
||||
"REDIS_APP_URL",
|
||||
os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0"),
|
||||
)
|
||||
client = redis.from_url(redis_url, decode_responses=True)
|
||||
key = f"podcast:generating:{search_space_id}"
|
||||
client.delete(key)
|
||||
|
|
|
|||
|
|
@ -36,7 +36,10 @@ def get_redis_client() -> redis.Redis:
|
|||
"""Get or create Redis client for heartbeat checking."""
|
||||
global _redis_client
|
||||
if _redis_client is None:
|
||||
redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
|
||||
redis_url = os.getenv(
|
||||
"REDIS_APP_URL",
|
||||
os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0"),
|
||||
)
|
||||
_redis_client = redis.from_url(redis_url, decode_responses=True)
|
||||
return _redis_client
|
||||
|
||||
|
|
|
|||
|
|
@ -417,6 +417,8 @@ async def index_airtable_records(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -396,6 +396,8 @@ async def index_bookstack_pages(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -395,6 +395,8 @@ async def index_clickup_tasks(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -402,6 +402,8 @@ async def index_confluence_pages(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -527,6 +527,8 @@ async def index_discord_messages(
|
|||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -292,6 +292,8 @@ async def index_elasticsearch_documents(
|
|||
document_metadata=metadata,
|
||||
search_space_id=search_space_id,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
# Create chunks and attach to document (persist via relationship)
|
||||
|
|
|
|||
|
|
@ -220,6 +220,7 @@ async def index_github_repos(
|
|||
user_id=user_id,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
documents_processed += docs_created
|
||||
|
|
@ -292,6 +293,7 @@ async def _process_repository_digest(
|
|||
user_id: str,
|
||||
task_logger: TaskLoggingService,
|
||||
log_entry,
|
||||
connector_id: int,
|
||||
) -> int:
|
||||
"""
|
||||
Process a repository digest and create documents.
|
||||
|
|
@ -426,6 +428,8 @@ async def _process_repository_digest(
|
|||
search_space_id=search_space_id,
|
||||
chunks=chunks_data,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -499,6 +499,8 @@ async def index_google_calendar_events(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -767,6 +767,7 @@ async def _process_single_file(
|
|||
session=session,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
if error:
|
||||
|
|
|
|||
|
|
@ -413,7 +413,6 @@ async def index_google_gmail_messages(
|
|||
"subject": subject,
|
||||
"sender": sender,
|
||||
"date": date_str,
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
|
|
@ -421,6 +420,8 @@ async def index_google_gmail_messages(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
|
|
|
|||
|
|
@ -380,6 +380,8 @@ async def index_jira_issues(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -413,6 +413,8 @@ async def index_linear_issues(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -476,6 +476,8 @@ async def index_luma_events(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -398,6 +398,7 @@ async def index_notion_pages(
|
|||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
existing_document.connector_id = connector_id
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully updated Notion page: {page_title}")
|
||||
|
|
@ -470,6 +471,8 @@ async def index_notion_pages(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -500,6 +500,8 @@ async def index_obsidian_vault(
|
|||
embedding=embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(new_document)
|
||||
|
|
|
|||
|
|
@ -389,6 +389,8 @@ async def index_slack_messages(
|
|||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -430,6 +430,8 @@ async def index_teams_messages(
|
|||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -371,6 +371,8 @@ async def index_crawled_urls(
|
|||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -8,10 +8,17 @@ and stores it as searchable documents in the database.
|
|||
import logging
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import Document, DocumentType
|
||||
from app.db import (
|
||||
Document,
|
||||
DocumentType,
|
||||
SearchSourceConnector,
|
||||
SearchSourceConnectorType,
|
||||
SearchSpace,
|
||||
)
|
||||
from app.services.llm_service import get_document_summary_llm
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
|
|
@ -35,6 +42,7 @@ async def add_circleback_meeting_document(
|
|||
markdown_content: str,
|
||||
metadata: dict[str, Any],
|
||||
search_space_id: int,
|
||||
connector_id: int | None = None,
|
||||
) -> Document | None:
|
||||
"""
|
||||
Process and store a Circleback meeting document.
|
||||
|
|
@ -46,6 +54,7 @@ async def add_circleback_meeting_document(
|
|||
markdown_content: Meeting content formatted as markdown
|
||||
metadata: Meeting metadata dictionary
|
||||
search_space_id: ID of the search space
|
||||
connector_id: ID of the Circleback connector (for deletion support)
|
||||
|
||||
Returns:
|
||||
Document object if successful, None if failed or duplicate
|
||||
|
|
@ -125,6 +134,30 @@ async def add_circleback_meeting_document(
|
|||
**metadata,
|
||||
}
|
||||
|
||||
# Fetch the user who set up the Circleback connector (preferred)
|
||||
# or fall back to search space owner if no connector found
|
||||
created_by_user_id = None
|
||||
|
||||
# Try to find the Circleback connector for this search space
|
||||
connector_result = await session.execute(
|
||||
select(SearchSourceConnector.user_id).where(
|
||||
SearchSourceConnector.search_space_id == search_space_id,
|
||||
SearchSourceConnector.connector_type
|
||||
== SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
|
||||
)
|
||||
)
|
||||
connector_user = connector_result.scalar_one_or_none()
|
||||
|
||||
if connector_user:
|
||||
# Use the user who set up the Circleback connector
|
||||
created_by_user_id = connector_user
|
||||
else:
|
||||
# Fallback: use search space owner if no connector found
|
||||
search_space_result = await session.execute(
|
||||
select(SearchSpace.user_id).where(SearchSpace.id == search_space_id)
|
||||
)
|
||||
created_by_user_id = search_space_result.scalar_one_or_none()
|
||||
|
||||
# Update or create document
|
||||
if existing_document:
|
||||
# Update existing document
|
||||
|
|
@ -138,6 +171,9 @@ async def add_circleback_meeting_document(
|
|||
existing_document.blocknote_document = blocknote_json
|
||||
existing_document.content_needs_reindexing = False
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
# Ensure connector_id is set (backfill for documents created before this field)
|
||||
if connector_id is not None:
|
||||
existing_document.connector_id = connector_id
|
||||
|
||||
await session.commit()
|
||||
await session.refresh(existing_document)
|
||||
|
|
@ -160,6 +196,8 @@ async def add_circleback_meeting_document(
|
|||
blocknote_document=blocknote_json,
|
||||
content_needs_reindexing=False,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=created_by_user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -185,6 +185,7 @@ async def add_extension_received_document(
|
|||
unique_identifier_hash=unique_identifier_hash,
|
||||
blocknote_document=blocknote_json,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -526,6 +526,8 @@ async def add_received_file_document_using_unstructured(
|
|||
blocknote_document=blocknote_json,
|
||||
content_needs_reindexing=False,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector.get("connector_id") if connector else None,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
@ -665,6 +667,8 @@ async def add_received_file_document_using_llamacloud(
|
|||
blocknote_document=blocknote_json,
|
||||
content_needs_reindexing=False,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector.get("connector_id") if connector else None,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
@ -829,6 +833,8 @@ async def add_received_file_document_using_docling(
|
|||
blocknote_document=blocknote_json,
|
||||
content_needs_reindexing=False,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector.get("connector_id") if connector else None,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
@ -849,7 +855,7 @@ async def add_received_file_document_using_docling(
|
|||
async def _update_document_from_connector(
|
||||
document: Document | None, connector: dict | None, session: AsyncSession
|
||||
) -> None:
|
||||
"""Helper to update document type and metadata from connector info."""
|
||||
"""Helper to update document type, metadata, and connector_id from connector info."""
|
||||
if document and connector:
|
||||
if "type" in connector:
|
||||
document.document_type = connector["type"]
|
||||
|
|
@ -861,6 +867,9 @@ async def _update_document_from_connector(
|
|||
# Expand existing metadata with connector metadata
|
||||
merged = {**document.document_metadata, **connector["metadata"]}
|
||||
document.document_metadata = merged
|
||||
# Set connector_id if provided for de-indexing support
|
||||
if "connector_id" in connector:
|
||||
document.connector_id = connector["connector_id"]
|
||||
await session.commit()
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -295,6 +295,8 @@ async def add_received_markdown_file_document(
|
|||
unique_identifier_hash=primary_hash,
|
||||
blocknote_document=blocknote_json,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector.get("connector_id") if connector else None,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -357,6 +357,7 @@ async def add_youtube_video_document(
|
|||
unique_identifier_hash=unique_identifier_hash,
|
||||
blocknote_document=blocknote_json,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ backend_pid=$!
|
|||
sleep 5
|
||||
|
||||
echo "Starting Celery Worker..."
|
||||
celery -A app.celery_app worker --loglevel=info &
|
||||
celery -A app.celery_app worker --loglevel=info --autoscale=128,4 &
|
||||
celery_worker_pid=$!
|
||||
|
||||
# Wait a bit for worker to initialize
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue