Merge pull request #765 from AnishSarkar22/fix/documents

feat: Add document ownership & deletion of documents
Rohan Verma 2026-02-02 14:50:18 -08:00 committed by GitHub
commit d0673cecf6
41 changed files with 832 additions and 16 deletions

View file

@@ -0,0 +1,269 @@
"""Celery task for background connector deletion.

This task handles the deletion of all documents associated with a connector
in the background, then deletes the connector itself. User is notified via
the notification system when complete (no polling required).

Features:
- Batch deletion to handle large document counts
- Automatic retry on failure
- Progress tracking via notifications
- Handles both success and failure notifications
"""

import asyncio
import logging
from uuid import UUID

from sqlalchemy import delete, func, select
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.pool import NullPool

from app.celery_app import celery_app
from app.config import config
from app.db import Document, Notification, SearchSourceConnector

logger = logging.getLogger(__name__)

# Batch size for document deletion
DELETION_BATCH_SIZE = 500


def _get_celery_session_maker():
    """Create async session maker for Celery tasks."""
    engine = create_async_engine(
        config.DATABASE_URL,
        poolclass=NullPool,
        echo=False,
    )
    return async_sessionmaker(engine, expire_on_commit=False), engine


@celery_app.task(
    bind=True,
    name="delete_connector_with_documents",
    max_retries=3,
    default_retry_delay=60,
    autoretry_for=(Exception,),
    retry_backoff=True,
)
def delete_connector_with_documents_task(
    self,
    connector_id: int,
    user_id: str,
    search_space_id: int,
    connector_name: str,
    connector_type: str,
):
    """
    Background task to delete a connector and all its associated documents.

    Creates a notification when complete (success or failure).
    No polling required - user sees notification in UI.

    Args:
        connector_id: ID of the connector to delete
        user_id: ID of the user who initiated the deletion
        search_space_id: ID of the search space
        connector_name: Name of the connector (for notification message)
        connector_type: Type of the connector (for logging)
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(
            _delete_connector_async(
                connector_id=connector_id,
                user_id=user_id,
                search_space_id=search_space_id,
                connector_name=connector_name,
                connector_type=connector_type,
            )
        )
    finally:
        loop.close()


async def _delete_connector_async(
    connector_id: int,
    user_id: str,
    search_space_id: int,
    connector_name: str,
    connector_type: str,
) -> dict:
    """
    Async implementation of connector deletion.

    Steps:
    1. Count total documents to delete
    2. Delete documents in batches (chunks cascade automatically)
    3. Delete the connector record
    4. Create success notification

    On failure, creates failure notification and re-raises exception.
    """
    session_maker, engine = _get_celery_session_maker()
    total_deleted = 0
    try:
        async with session_maker() as session:
            # Step 1: Count total documents for this connector
            count_result = await session.execute(
                select(func.count(Document.id)).where(
                    Document.connector_id == connector_id
                )
            )
            total_docs = count_result.scalar() or 0
            logger.info(
                f"Starting deletion of connector {connector_id} ({connector_name}). "
                f"Documents to delete: {total_docs}"
            )

            # Step 2: Delete documents in batches
            while True:
                # Get batch of document IDs
                result = await session.execute(
                    select(Document.id)
                    .where(Document.connector_id == connector_id)
                    .limit(DELETION_BATCH_SIZE)
                )
                doc_ids = [row[0] for row in result.fetchall()]
                if not doc_ids:
                    break

                # Delete this batch (chunks are deleted via CASCADE)
                await session.execute(delete(Document).where(Document.id.in_(doc_ids)))
                await session.commit()
                total_deleted += len(doc_ids)
                logger.info(
                    f"Deleted batch of {len(doc_ids)} documents. "
                    f"Progress: {total_deleted}/{total_docs}"
                )

            # Step 3: Delete the connector record
            result = await session.execute(
                select(SearchSourceConnector).where(
                    SearchSourceConnector.id == connector_id
                )
            )
            connector = result.scalar_one_or_none()
            if connector:
                await session.delete(connector)
                logger.info(f"Deleted connector record: {connector_id}")
            else:
                logger.warning(
                    f"Connector {connector_id} not found - may have been already deleted"
                )

            # Step 4: Create success notification
            doc_text = "document" if total_deleted == 1 else "documents"
            notification = Notification(
                user_id=UUID(user_id),
                search_space_id=search_space_id,
                type="connector_deletion",
                title=f"{connector_name} Removed",
                message=f"Connector and {total_deleted} {doc_text} have been removed from your knowledge base.",
                notification_metadata={
                    "connector_id": connector_id,
                    "connector_name": connector_name,
                    "connector_type": connector_type,
                    "documents_deleted": total_deleted,
                    "status": "completed",
                },
            )
            session.add(notification)
            await session.commit()

            logger.info(
                f"Connector {connector_id} ({connector_name}) deleted successfully. "
                f"Total documents deleted: {total_deleted}"
            )
            return {
                "status": "success",
                "connector_id": connector_id,
                "connector_name": connector_name,
                "documents_deleted": total_deleted,
            }
    except Exception as e:
        logger.error(
            f"Failed to delete connector {connector_id} ({connector_name}): {e!s}",
            exc_info=True,
        )
        # Create failure notification
        try:
            async with session_maker() as session:
                notification = Notification(
                    user_id=UUID(user_id),
                    search_space_id=search_space_id,
                    type="connector_deletion",
                    title=f"Failed to Remove {connector_name}",
                    message="Something went wrong while removing this connector. Please try again.",
                    notification_metadata={
                        "connector_id": connector_id,
                        "connector_name": connector_name,
                        "connector_type": connector_type,
                        "documents_deleted": total_deleted,
                        "status": "failed",
                        "error": str(e),
                    },
                )
                session.add(notification)
                await session.commit()
        except Exception as notify_error:
            logger.error(
                f"Failed to create failure notification: {notify_error!s}",
                exc_info=True,
            )
        # Re-raise to trigger Celery retry
        raise
    finally:
        await engine.dispose()


async def delete_documents_by_connector_id(
    session,
    connector_id: int,
    batch_size: int = DELETION_BATCH_SIZE,
) -> int:
    """
    Delete all documents associated with a connector in batches.

    This is a utility function that can be used independently of the Celery task
    for synchronous deletion scenarios (e.g., small document counts).

    Args:
        session: AsyncSession instance
        connector_id: ID of the connector
        batch_size: Number of documents to delete per batch

    Returns:
        Total number of documents deleted
    """
    total_deleted = 0
    while True:
        result = await session.execute(
            select(Document.id)
            .where(Document.connector_id == connector_id)
            .limit(batch_size)
        )
        doc_ids = [row[0] for row in result.fetchall()]
        if not doc_ids:
            break
        await session.execute(delete(Document).where(Document.id.in_(doc_ids)))
        await session.commit()
        total_deleted += len(doc_ids)
    return total_deleted
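
For orientation, a minimal sketch of how a caller might enqueue this task from an API handler; the import path and endpoint shape here are assumptions, not taken from this diff:

# Hypothetical caller sketch - module path and route shape are assumed.
from app.tasks.connector_deletion import delete_connector_with_documents_task

async def delete_connector_endpoint(connector, user) -> dict:
    # Enqueue the background deletion and return immediately; the user is
    # informed through the notification system once the task completes.
    delete_connector_with_documents_task.delay(
        connector_id=connector.id,
        user_id=str(user.id),
        search_space_id=connector.search_space_id,
        connector_name=connector.name,
        connector_type=str(connector.connector_type),
    )
    return {"status": "deletion_scheduled"}

For small document counts, the delete_documents_by_connector_id helper above can instead be awaited directly inside an existing session, e.g. deleted = await delete_documents_by_connector_id(session, connector_id).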

View file

@@ -545,6 +545,7 @@ def process_circleback_meeting_task(
    markdown_content: str,
    metadata: dict,
    search_space_id: int,
    connector_id: int | None = None,
):
    """
    Celery task to process Circleback meeting webhook data.
@@ -555,6 +556,7 @@ def process_circleback_meeting_task(
        markdown_content: Meeting content formatted as markdown
        metadata: Meeting metadata dictionary
        search_space_id: ID of the search space
        connector_id: ID of the Circleback connector (for deletion support)
    """
    import asyncio
@@ -569,6 +571,7 @@ def process_circleback_meeting_task(
                markdown_content,
                metadata,
                search_space_id,
                connector_id,
            )
        )
    finally:
@@ -581,6 +584,7 @@ async def _process_circleback_meeting(
    markdown_content: str,
    metadata: dict,
    search_space_id: int,
    connector_id: int | None = None,
):
    """Process Circleback meeting with new session."""
    from app.tasks.document_processors.circleback_processor import (
@@ -637,6 +641,7 @@ async def _process_circleback_meeting(
        markdown_content=markdown_content,
        metadata=metadata,
        search_space_id=search_space_id,
        connector_id=connector_id,
    )

    if result:
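
For context, a hedged sketch of a plausible call site for the new parameter; how the webhook handler actually resolves the connector is an assumption, not taken from this diff:

# Hypothetical webhook-side sketch: resolve the Circleback connector for the
# search space and thread its id into the task so its documents can later be
# deleted together with the connector.
from sqlalchemy import select

from app.db import SearchSourceConnector, SearchSourceConnectorType

result = await session.execute(
    select(SearchSourceConnector.id).where(
        SearchSourceConnector.search_space_id == search_space_id,
        SearchSourceConnector.connector_type
        == SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
    )
)
connector_id = result.scalar_one_or_none()
process_circleback_meeting_task.delay(
    markdown_content, metadata, search_space_id, connector_id
)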

View file

@@ -417,6 +417,8 @@ async def index_airtable_records(
    embedding=summary_embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)
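
The same two-line change recurs across the connector indexers below: every newly created Document row is now stamped with the user who owns it and the connector that produced it. Condensed, the recurring pattern is (field names from the diff; the surrounding call is abbreviated):

document = Document(
    # ...existing content, embedding, and chunk fields...
    updated_at=get_current_timestamp(),
    created_by_id=user_id,      # document ownership
    connector_id=connector_id,  # ties the document to its connector for deletion
)
session.add(document)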

View file

@@ -396,6 +396,8 @@ async def index_bookstack_pages(
    embedding=summary_embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -395,6 +395,8 @@ async def index_clickup_tasks(
    embedding=summary_embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -402,6 +402,8 @@ async def index_confluence_pages(
    embedding=summary_embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -527,6 +527,8 @@ async def index_discord_messages(
    content_hash=content_hash,
    unique_identifier_hash=unique_identifier_hash,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -292,6 +292,8 @@ async def index_elasticsearch_documents(
    document_metadata=metadata,
    search_space_id=search_space_id,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)

# Create chunks and attach to document (persist via relationship)

View file

@@ -220,6 +220,7 @@ async def index_github_repos(
    user_id=user_id,
    task_logger=task_logger,
    log_entry=log_entry,
    connector_id=connector_id,
)
documents_processed += docs_created
@@ -292,6 +293,7 @@ async def _process_repository_digest(
    user_id: str,
    task_logger: TaskLoggingService,
    log_entry,
    connector_id: int,
) -> int:
    """
    Process a repository digest and create documents.
@@ -426,6 +428,8 @@ async def _process_repository_digest(
    search_space_id=search_space_id,
    chunks=chunks_data,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -499,6 +499,8 @@ async def index_google_calendar_events(
    embedding=summary_embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -767,6 +767,7 @@ async def _process_single_file(
    session=session,
    task_logger=task_logger,
    log_entry=log_entry,
    connector_id=connector_id,
)

if error:

View file

@@ -413,7 +413,6 @@ async def index_google_gmail_messages(
        "subject": subject,
        "sender": sender,
        "date": date_str,
        "connector_id": connector_id,
    },
    content=summary_content,
    content_hash=content_hash,
@@ -421,6 +420,8 @@ async def index_google_gmail_messages(
    embedding=summary_embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)
documents_indexed += 1

View file

@@ -380,6 +380,8 @@ async def index_jira_issues(
    embedding=summary_embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -413,6 +413,8 @@ async def index_linear_issues(
    embedding=summary_embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -476,6 +476,8 @@ async def index_luma_events(
    embedding=summary_embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -398,6 +398,7 @@ async def index_notion_pages(
    }
    existing_document.chunks = chunks
    existing_document.updated_at = get_current_timestamp()
    existing_document.connector_id = connector_id
    documents_indexed += 1
    logger.info(f"Successfully updated Notion page: {page_title}")
@@ -470,6 +471,8 @@ async def index_notion_pages(
    embedding=summary_embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -500,6 +500,8 @@ async def index_obsidian_vault(
    embedding=embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(new_document)

View file

@@ -389,6 +389,8 @@ async def index_slack_messages(
    content_hash=content_hash,
    unique_identifier_hash=unique_identifier_hash,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -430,6 +430,8 @@ async def index_teams_messages(
    content_hash=content_hash,
    unique_identifier_hash=unique_identifier_hash,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -371,6 +371,8 @@ async def index_crawled_urls(
    embedding=summary_embedding,
    chunks=chunks,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector_id,
)
session.add(document)

View file

@@ -8,10 +8,17 @@ and stores it as searchable documents in the database.
import logging
from typing import Any

from sqlalchemy import select
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Document, DocumentType
from app.db import (
    Document,
    DocumentType,
    SearchSourceConnector,
    SearchSourceConnectorType,
    SearchSpace,
)
from app.services.llm_service import get_document_summary_llm
from app.utils.document_converters import (
    create_document_chunks,
@@ -35,6 +42,7 @@ async def add_circleback_meeting_document(
    markdown_content: str,
    metadata: dict[str, Any],
    search_space_id: int,
    connector_id: int | None = None,
) -> Document | None:
    """
    Process and store a Circleback meeting document.
@@ -46,6 +54,7 @@ async def add_circleback_meeting_document(
        markdown_content: Meeting content formatted as markdown
        metadata: Meeting metadata dictionary
        search_space_id: ID of the search space
        connector_id: ID of the Circleback connector (for deletion support)

    Returns:
        Document object if successful, None if failed or duplicate
@@ -125,6 +134,30 @@ async def add_circleback_meeting_document(
        **metadata,
    }

    # Fetch the user who set up the Circleback connector (preferred)
    # or fall back to search space owner if no connector found
    created_by_user_id = None

    # Try to find the Circleback connector for this search space
    connector_result = await session.execute(
        select(SearchSourceConnector.user_id).where(
            SearchSourceConnector.search_space_id == search_space_id,
            SearchSourceConnector.connector_type
            == SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
        )
    )
    connector_user = connector_result.scalar_one_or_none()

    if connector_user:
        # Use the user who set up the Circleback connector
        created_by_user_id = connector_user
    else:
        # Fallback: use search space owner if no connector found
        search_space_result = await session.execute(
            select(SearchSpace.user_id).where(SearchSpace.id == search_space_id)
        )
        created_by_user_id = search_space_result.scalar_one_or_none()

    # Update or create document
    if existing_document:
        # Update existing document
@@ -138,6 +171,9 @@ async def add_circleback_meeting_document(
        existing_document.blocknote_document = blocknote_json
        existing_document.content_needs_reindexing = False
        existing_document.updated_at = get_current_timestamp()
        # Ensure connector_id is set (backfill for documents created before this field)
        if connector_id is not None:
            existing_document.connector_id = connector_id

        await session.commit()
        await session.refresh(existing_document)
@@ -160,6 +196,8 @@ async def add_circleback_meeting_document(
            blocknote_document=blocknote_json,
            content_needs_reindexing=False,
            updated_at=get_current_timestamp(),
            created_by_id=created_by_user_id,
            connector_id=connector_id,
        )
        session.add(document)
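
A hedged sketch of invoking this processor directly: most keyword names follow the signature shown in this diff, while the session parameter and the payload values are assumptions for illustration:

document = await add_circleback_meeting_document(
    session=session,                    # assumed parameter, not visible in this hunk
    markdown_content=markdown_content,
    metadata={"title": "Weekly sync"},  # illustrative metadata payload
    search_space_id=search_space_id,
    connector_id=connector_id,          # optional; enables deletion support
)
if document is None:
    # Failed or duplicate, per the docstring above
    ...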

View file

@@ -185,6 +185,7 @@ async def add_extension_received_document(
    unique_identifier_hash=unique_identifier_hash,
    blocknote_document=blocknote_json,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
)
session.add(document)

View file

@@ -526,6 +526,8 @@ async def add_received_file_document_using_unstructured(
    blocknote_document=blocknote_json,
    content_needs_reindexing=False,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector.get("connector_id") if connector else None,
)
session.add(document)
@@ -665,6 +667,8 @@ async def add_received_file_document_using_llamacloud(
    blocknote_document=blocknote_json,
    content_needs_reindexing=False,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector.get("connector_id") if connector else None,
)
session.add(document)
@@ -829,6 +833,8 @@ async def add_received_file_document_using_docling(
    blocknote_document=blocknote_json,
    content_needs_reindexing=False,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector.get("connector_id") if connector else None,
)
session.add(document)
@@ -849,7 +855,7 @@ async def add_received_file_document_using_docling(
async def _update_document_from_connector(
    document: Document | None, connector: dict | None, session: AsyncSession
) -> None:
    """Helper to update document type and metadata from connector info."""
    """Helper to update document type, metadata, and connector_id from connector info."""
    if document and connector:
        if "type" in connector:
            document.document_type = connector["type"]
@@ -861,6 +867,9 @@ async def _update_document_from_connector(
            # Expand existing metadata with connector metadata
            merged = {**document.document_metadata, **connector["metadata"]}
            document.document_metadata = merged
        # Set connector_id if provided for de-indexing support
        if "connector_id" in connector:
            document.connector_id = connector["connector_id"]

    await session.commit()
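
The connector argument to this helper is a plain dict in which every key is optional; a sketch of its expected shape (keys match the checks above, values are illustrative only):

connector = {
    "type": document_type,             # a DocumentType value; overrides document.document_type
    "metadata": {"source": "upload"},  # merged into document.document_metadata
    "connector_id": 42,                # stored on the document for de-indexing support
}
await _update_document_from_connector(document, connector, session)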

View file

@@ -295,6 +295,8 @@ async def add_received_markdown_file_document(
    unique_identifier_hash=primary_hash,
    blocknote_document=blocknote_json,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
    connector_id=connector.get("connector_id") if connector else None,
)
session.add(document)

View file

@@ -357,6 +357,7 @@ async def add_youtube_video_document(
    unique_identifier_hash=unique_identifier_hash,
    blocknote_document=blocknote_json,
    updated_at=get_current_timestamp(),
    created_by_id=user_id,
)
session.add(document)