refactor: unify all 3 Google Composio and non-Composio connector types and pipelines, keeping the same credential adapters

This commit is contained in:
Anish Sarkar 2026-03-19 05:08:21 +05:30
parent 6c37b563c0
commit 83152e8e7e
24 changed files with 633 additions and 3596 deletions

View file

@ -55,7 +55,6 @@ async def _check_and_trigger_schedules():
from app.tasks.celery_tasks.connector_tasks import (
index_airtable_records_task,
index_clickup_tasks_task,
index_composio_connector_task,
index_confluence_pages_task,
index_crawled_urls_task,
index_discord_messages_task,
@ -88,10 +87,10 @@ async def _check_and_trigger_schedules():
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task,
# Composio connector types
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: index_composio_connector_task,
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: index_composio_connector_task,
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: index_composio_connector_task,
# Composio connector types (unified with native Google tasks)
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task,
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: index_google_gmail_messages_task,
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: index_google_calendar_events_task,
}
# Trigger indexing for each due connector
@ -129,11 +128,11 @@ async def _check_and_trigger_schedules():
f"({connector.connector_type.value})"
)
# Special handling for Google Drive - uses config for folder/file selection
if (
connector.connector_type
== SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR
):
# Special handling for Google Drive (native and Composio) - uses config for folder/file selection
if connector.connector_type in [
SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
]:
connector_config = connector.config or {}
selected_folders = connector_config.get("selected_folders", [])
selected_files = connector_config.get("selected_files", [])

View file

@ -16,6 +16,15 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.connectors.google_calendar_connector import GoogleCalendarConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.utils.google_credentials import (
COMPOSIO_GOOGLE_CONNECTOR_TYPES,
build_composio_credentials,
)
ACCEPTED_CALENDAR_CONNECTOR_TYPES = {
SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
}
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -87,10 +96,12 @@ async def index_google_calendar_events(
)
try:
# Get the connector from the database
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR
)
# Accept both native and Composio Calendar connectors
connector = None
for ct in ACCEPTED_CALENDAR_CONNECTOR_TYPES:
connector = await get_connector_by_id(session, connector_id, ct)
if connector:
break
if not connector:
await task_logger.log_task_failure(
@ -101,69 +112,80 @@ async def index_google_calendar_events(
)
return 0, f"Connector with ID {connector_id} not found"
# Get the Google Calendar credentials from the connector config
config_data = connector.config
# Decrypt sensitive credentials if encrypted (for backward compatibility)
from app.config import config
from app.utils.oauth_security import TokenEncryption
token_encrypted = config_data.get("_token_encrypted", False)
if token_encrypted and config.SECRET_KEY:
try:
token_encryption = TokenEncryption(config.SECRET_KEY)
# Decrypt sensitive fields
if config_data.get("token"):
config_data["token"] = token_encryption.decrypt_token(
config_data["token"]
)
if config_data.get("refresh_token"):
config_data["refresh_token"] = token_encryption.decrypt_token(
config_data["refresh_token"]
)
if config_data.get("client_secret"):
config_data["client_secret"] = token_encryption.decrypt_token(
config_data["client_secret"]
)
logger.info(
f"Decrypted Google Calendar credentials for connector {connector_id}"
)
except Exception as e:
# Build credentials based on connector type
if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
connected_account_id = connector.config.get(
"composio_connected_account_id"
)
if not connected_account_id:
await task_logger.log_task_failure(
log_entry,
f"Failed to decrypt Google Calendar credentials for connector {connector_id}: {e!s}",
"Credential decryption failed",
{"error_type": "CredentialDecryptionError"},
f"Composio connected_account_id not found for connector {connector_id}",
"Missing Composio account",
{"error_type": "MissingComposioAccount"},
)
return 0, f"Failed to decrypt Google Calendar credentials: {e!s}"
return 0, "Composio connected_account_id not found"
credentials = build_composio_credentials(connected_account_id)
else:
config_data = connector.config
exp = config_data.get("expiry", "").replace("Z", "")
credentials = Credentials(
token=config_data.get("token"),
refresh_token=config_data.get("refresh_token"),
token_uri=config_data.get("token_uri"),
client_id=config_data.get("client_id"),
client_secret=config_data.get("client_secret"),
scopes=config_data.get("scopes"),
expiry=datetime.fromisoformat(exp) if exp else None,
)
from app.config import config
from app.utils.oauth_security import TokenEncryption
if (
not credentials.client_id
or not credentials.client_secret
or not credentials.refresh_token
):
await task_logger.log_task_failure(
log_entry,
f"Google Calendar credentials not found in connector config for connector {connector_id}",
"Missing Google Calendar credentials",
{"error_type": "MissingCredentials"},
token_encrypted = config_data.get("_token_encrypted", False)
if token_encrypted and config.SECRET_KEY:
try:
token_encryption = TokenEncryption(config.SECRET_KEY)
if config_data.get("token"):
config_data["token"] = token_encryption.decrypt_token(
config_data["token"]
)
if config_data.get("refresh_token"):
config_data["refresh_token"] = token_encryption.decrypt_token(
config_data["refresh_token"]
)
if config_data.get("client_secret"):
config_data["client_secret"] = token_encryption.decrypt_token(
config_data["client_secret"]
)
logger.info(
f"Decrypted Google Calendar credentials for connector {connector_id}"
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to decrypt Google Calendar credentials for connector {connector_id}: {e!s}",
"Credential decryption failed",
{"error_type": "CredentialDecryptionError"},
)
return 0, f"Failed to decrypt Google Calendar credentials: {e!s}"
exp = config_data.get("expiry", "")
if exp:
exp = exp.replace("Z", "")
credentials = Credentials(
token=config_data.get("token"),
refresh_token=config_data.get("refresh_token"),
token_uri=config_data.get("token_uri"),
client_id=config_data.get("client_id"),
client_secret=config_data.get("client_secret"),
scopes=config_data.get("scopes", []),
expiry=datetime.fromisoformat(exp) if exp else None,
)
return 0, "Google Calendar credentials not found in connector config"
# Initialize Google Calendar client
if (
not credentials.client_id
or not credentials.client_secret
or not credentials.refresh_token
):
await task_logger.log_task_failure(
log_entry,
f"Google Calendar credentials not found in connector config for connector {connector_id}",
"Missing Google Calendar credentials",
{"error_type": "MissingCredentials"},
)
return 0, "Google Calendar credentials not found in connector config"
await task_logger.log_task_progress(
log_entry,
f"Initializing Google Calendar client for connector {connector_id}",

View file

@ -31,6 +31,15 @@ from app.tasks.connector_indexers.base import (
update_connector_last_indexed,
)
from app.utils.document_converters import generate_unique_identifier_hash
from app.utils.google_credentials import (
COMPOSIO_GOOGLE_CONNECTOR_TYPES,
build_composio_credentials,
)
ACCEPTED_DRIVE_CONNECTOR_TYPES = {
SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
}
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
@ -89,14 +98,17 @@ async def index_google_drive_files(
)
try:
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR
)
# Accept both native and Composio Drive connectors
connector = None
for ct in ACCEPTED_DRIVE_CONNECTOR_TYPES:
connector = await get_connector_by_id(session, connector_id, ct)
if connector:
break
if not connector:
error_msg = f"Google Drive connector with ID {connector_id} not found"
await task_logger.log_task_failure(
log_entry, error_msg, {"error_type": "ConnectorNotFound"}
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
)
return 0, error_msg
@ -106,27 +118,43 @@ async def index_google_drive_files(
{"stage": "client_initialization"},
)
# Check if credentials are encrypted (only when explicitly marked)
token_encrypted = connector.config.get("_token_encrypted", False)
if token_encrypted:
# Credentials are explicitly marked as encrypted, will be decrypted during client initialization
if not config.SECRET_KEY:
await task_logger.log_task_failure(
log_entry,
f"SECRET_KEY not configured but credentials are marked as encrypted for connector {connector_id}",
"Missing SECRET_KEY for token decryption",
{"error_type": "MissingSecretKey"},
)
return (
0,
"SECRET_KEY not configured but credentials are marked as encrypted",
)
logger.info(
f"Google Drive credentials are encrypted for connector {connector_id}, will decrypt during client initialization"
# Build credentials based on connector type
pre_built_credentials = None
if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
connected_account_id = connector.config.get(
"composio_connected_account_id"
)
# If _token_encrypted is False or not set, treat credentials as plaintext
if not connected_account_id:
error_msg = f"Composio connected_account_id not found for connector {connector_id}"
await task_logger.log_task_failure(
log_entry, error_msg, "Missing Composio account",
{"error_type": "MissingComposioAccount"},
)
return 0, error_msg
pre_built_credentials = build_composio_credentials(connected_account_id)
else:
token_encrypted = connector.config.get("_token_encrypted", False)
if token_encrypted:
if not config.SECRET_KEY:
await task_logger.log_task_failure(
log_entry,
f"SECRET_KEY not configured but credentials are marked as encrypted for connector {connector_id}",
"Missing SECRET_KEY for token decryption",
{"error_type": "MissingSecretKey"},
)
return (
0,
"SECRET_KEY not configured but credentials are marked as encrypted",
)
logger.info(
f"Google Drive credentials are encrypted for connector {connector_id}, will decrypt during client initialization"
)
drive_client = GoogleDriveClient(session, connector_id)
connector_enable_summary = getattr(connector, "enable_summary", True)
drive_client = GoogleDriveClient(
session, connector_id, credentials=pre_built_credentials
)
if not folder_id:
error_msg = "folder_id is required for Google Drive indexing"
@ -164,6 +192,7 @@ async def index_google_drive_files(
max_files=max_files,
include_subfolders=include_subfolders,
on_heartbeat_callback=on_heartbeat_callback,
enable_summary=connector_enable_summary,
)
else:
logger.info(f"Using full scan for connector {connector_id}")
@ -181,6 +210,7 @@ async def index_google_drive_files(
max_files=max_files,
include_subfolders=include_subfolders,
on_heartbeat_callback=on_heartbeat_callback,
enable_summary=connector_enable_summary,
)
documents_indexed, documents_skipped = result
@ -278,14 +308,17 @@ async def index_google_drive_single_file(
)
try:
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR
)
# Accept both native and Composio Drive connectors
connector = None
for ct in ACCEPTED_DRIVE_CONNECTOR_TYPES:
connector = await get_connector_by_id(session, connector_id, ct)
if connector:
break
if not connector:
error_msg = f"Google Drive connector with ID {connector_id} not found"
await task_logger.log_task_failure(
log_entry, error_msg, {"error_type": "ConnectorNotFound"}
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
)
return 0, error_msg
@ -295,27 +328,42 @@ async def index_google_drive_single_file(
{"stage": "client_initialization"},
)
# Check if credentials are encrypted (only when explicitly marked)
token_encrypted = connector.config.get("_token_encrypted", False)
if token_encrypted:
# Credentials are explicitly marked as encrypted, will be decrypted during client initialization
if not config.SECRET_KEY:
await task_logger.log_task_failure(
log_entry,
f"SECRET_KEY not configured but credentials are marked as encrypted for connector {connector_id}",
"Missing SECRET_KEY for token decryption",
{"error_type": "MissingSecretKey"},
)
return (
0,
"SECRET_KEY not configured but credentials are marked as encrypted",
)
logger.info(
f"Google Drive credentials are encrypted for connector {connector_id}, will decrypt during client initialization"
pre_built_credentials = None
if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
connected_account_id = connector.config.get(
"composio_connected_account_id"
)
# If _token_encrypted is False or not set, treat credentials as plaintext
if not connected_account_id:
error_msg = f"Composio connected_account_id not found for connector {connector_id}"
await task_logger.log_task_failure(
log_entry, error_msg, "Missing Composio account",
{"error_type": "MissingComposioAccount"},
)
return 0, error_msg
pre_built_credentials = build_composio_credentials(connected_account_id)
else:
token_encrypted = connector.config.get("_token_encrypted", False)
if token_encrypted:
if not config.SECRET_KEY:
await task_logger.log_task_failure(
log_entry,
f"SECRET_KEY not configured but credentials are marked as encrypted for connector {connector_id}",
"Missing SECRET_KEY for token decryption",
{"error_type": "MissingSecretKey"},
)
return (
0,
"SECRET_KEY not configured but credentials are marked as encrypted",
)
logger.info(
f"Google Drive credentials are encrypted for connector {connector_id}, will decrypt during client initialization"
)
drive_client = GoogleDriveClient(session, connector_id)
connector_enable_summary = getattr(connector, "enable_summary", True)
drive_client = GoogleDriveClient(
session, connector_id, credentials=pre_built_credentials
)
# Fetch the file metadata
file, error = await get_file_by_id(drive_client, file_id)
@ -362,6 +410,7 @@ async def index_google_drive_single_file(
task_logger=task_logger,
log_entry=log_entry,
pending_document=pending_doc,
enable_summary=connector_enable_summary,
)
await session.commit()
@ -433,6 +482,7 @@ async def _index_full_scan(
max_files: int,
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
) -> tuple[int, int]:
"""Perform full scan indexing of a folder.
@ -562,6 +612,7 @@ async def _index_full_scan(
task_logger=task_logger,
log_entry=log_entry,
pending_document=pending_doc,
enable_summary=enable_summary,
)
documents_indexed += indexed
@ -592,6 +643,7 @@ async def _index_with_delta_sync(
max_files: int,
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
) -> tuple[int, int]:
"""Perform delta sync indexing using change tracking.
@ -703,6 +755,7 @@ async def _index_with_delta_sync(
task_logger=task_logger,
log_entry=log_entry,
pending_document=pending_doc,
enable_summary=enable_summary,
)
documents_indexed += indexed
@ -957,6 +1010,7 @@ async def _process_single_file(
task_logger: TaskLoggingService,
log_entry: any,
pending_document: Document | None = None,
enable_summary: bool = True,
) -> tuple[int, int, int]:
"""
Process a single file by downloading and using Surfsense's file processor.
@ -1020,6 +1074,7 @@ async def _process_single_file(
task_logger=task_logger,
log_entry=log_entry,
connector_id=connector_id,
enable_summary=enable_summary,
)
if error:

View file

@ -21,6 +21,15 @@ from app.db import (
DocumentType,
SearchSourceConnectorType,
)
from app.utils.google_credentials import (
COMPOSIO_GOOGLE_CONNECTOR_TYPES,
build_composio_credentials,
)
ACCEPTED_GMAIL_CONNECTOR_TYPES = {
SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
}
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -94,90 +103,100 @@ async def index_google_gmail_messages(
)
try:
# Get connector by id
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR
)
# Accept both native and Composio Gmail connectors
connector = None
for ct in ACCEPTED_GMAIL_CONNECTOR_TYPES:
connector = await get_connector_by_id(session, connector_id, ct)
if connector:
break
if not connector:
error_msg = f"Gmail connector with ID {connector_id} not found"
await task_logger.log_task_failure(
log_entry, error_msg, {"error_type": "ConnectorNotFound"}
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
)
return 0, error_msg
# Get the Google Gmail credentials from the connector config
config_data = connector.config
# Decrypt sensitive credentials if encrypted (for backward compatibility)
from app.config import config
from app.utils.oauth_security import TokenEncryption
token_encrypted = config_data.get("_token_encrypted", False)
if token_encrypted and config.SECRET_KEY:
try:
token_encryption = TokenEncryption(config.SECRET_KEY)
# Decrypt sensitive fields
if config_data.get("token"):
config_data["token"] = token_encryption.decrypt_token(
config_data["token"]
)
if config_data.get("refresh_token"):
config_data["refresh_token"] = token_encryption.decrypt_token(
config_data["refresh_token"]
)
if config_data.get("client_secret"):
config_data["client_secret"] = token_encryption.decrypt_token(
config_data["client_secret"]
)
logger.info(
f"Decrypted Google Gmail credentials for connector {connector_id}"
)
except Exception as e:
# Build credentials based on connector type
if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
connected_account_id = connector.config.get(
"composio_connected_account_id"
)
if not connected_account_id:
await task_logger.log_task_failure(
log_entry,
f"Failed to decrypt Google Gmail credentials for connector {connector_id}: {e!s}",
"Credential decryption failed",
{"error_type": "CredentialDecryptionError"},
f"Composio connected_account_id not found for connector {connector_id}",
"Missing Composio account",
{"error_type": "MissingComposioAccount"},
)
return 0, f"Failed to decrypt Google Gmail credentials: {e!s}"
return 0, "Composio connected_account_id not found"
credentials = build_composio_credentials(connected_account_id)
else:
config_data = connector.config
exp = config_data.get("expiry", "")
if exp:
exp = exp.replace("Z", "")
credentials = Credentials(
token=config_data.get("token"),
refresh_token=config_data.get("refresh_token"),
token_uri=config_data.get("token_uri"),
client_id=config_data.get("client_id"),
client_secret=config_data.get("client_secret"),
scopes=config_data.get("scopes", []),
expiry=datetime.fromisoformat(exp) if exp else None,
)
from app.config import config
from app.utils.oauth_security import TokenEncryption
if (
not credentials.client_id
or not credentials.client_secret
or not credentials.refresh_token
):
await task_logger.log_task_failure(
log_entry,
f"Google gmail credentials not found in connector config for connector {connector_id}",
"Missing Google gmail credentials",
{"error_type": "MissingCredentials"},
token_encrypted = config_data.get("_token_encrypted", False)
if token_encrypted and config.SECRET_KEY:
try:
token_encryption = TokenEncryption(config.SECRET_KEY)
if config_data.get("token"):
config_data["token"] = token_encryption.decrypt_token(
config_data["token"]
)
if config_data.get("refresh_token"):
config_data["refresh_token"] = token_encryption.decrypt_token(
config_data["refresh_token"]
)
if config_data.get("client_secret"):
config_data["client_secret"] = token_encryption.decrypt_token(
config_data["client_secret"]
)
logger.info(
f"Decrypted Google Gmail credentials for connector {connector_id}"
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to decrypt Google Gmail credentials for connector {connector_id}: {e!s}",
"Credential decryption failed",
{"error_type": "CredentialDecryptionError"},
)
return 0, f"Failed to decrypt Google Gmail credentials: {e!s}"
exp = config_data.get("expiry", "")
if exp:
exp = exp.replace("Z", "")
credentials = Credentials(
token=config_data.get("token"),
refresh_token=config_data.get("refresh_token"),
token_uri=config_data.get("token_uri"),
client_id=config_data.get("client_id"),
client_secret=config_data.get("client_secret"),
scopes=config_data.get("scopes", []),
expiry=datetime.fromisoformat(exp) if exp else None,
)
return 0, "Google gmail credentials not found in connector config"
# Initialize Google gmail client
if (
not credentials.client_id
or not credentials.client_secret
or not credentials.refresh_token
):
await task_logger.log_task_failure(
log_entry,
f"Google gmail credentials not found in connector config for connector {connector_id}",
"Missing Google gmail credentials",
{"error_type": "MissingCredentials"},
)
return 0, "Google gmail credentials not found in connector config"
await task_logger.log_task_progress(
log_entry,
f"Initializing Google gmail client for connector {connector_id}",
{"stage": "client_initialization"},
)
# Initialize Google gmail connector
gmail_connector = GoogleGmailConnector(
credentials, session, user_id, connector_id
)

View file

@ -411,6 +411,7 @@ async def add_received_file_document_using_unstructured(
search_space_id: int,
user_id: str,
connector: dict | None = None,
enable_summary: bool = True,
) -> Document | None:
"""
Process and store a file document using Unstructured service.
@ -471,9 +472,13 @@ async def add_received_file_document_using_unstructured(
"etl_service": "UNSTRUCTURED",
"document_type": "File Document",
}
summary_content, summary_embedding = await generate_document_summary(
file_in_markdown, user_llm, document_metadata
)
if enable_summary:
summary_content, summary_embedding = await generate_document_summary(
file_in_markdown, user_llm, document_metadata
)
else:
summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
summary_embedding = embed_text(summary_content)
# Process chunks
chunks = await create_document_chunks(file_in_markdown)
@ -493,14 +498,13 @@ async def add_received_file_document_using_unstructured(
existing_document.source_markdown = file_in_markdown
existing_document.content_needs_reindexing = False
existing_document.updated_at = get_current_timestamp()
existing_document.status = DocumentStatus.ready() # Mark as ready
existing_document.status = DocumentStatus.ready()
await session.commit()
await session.refresh(existing_document)
document = existing_document
else:
# Create new document
# Determine document type based on connector
doc_type = DocumentType.FILE
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
doc_type = DocumentType.GOOGLE_DRIVE_FILE
@ -523,7 +527,7 @@ async def add_received_file_document_using_unstructured(
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector.get("connector_id") if connector else None,
status=DocumentStatus.ready(), # Mark as ready
status=DocumentStatus.ready(),
)
session.add(document)
@ -546,6 +550,7 @@ async def add_received_file_document_using_llamacloud(
search_space_id: int,
user_id: str,
connector: dict | None = None,
enable_summary: bool = True,
) -> Document | None:
"""
Process and store document content parsed by LlamaCloud.
@ -605,16 +610,19 @@ async def add_received_file_document_using_llamacloud(
"etl_service": "LLAMACLOUD",
"document_type": "File Document",
}
summary_content, summary_embedding = await generate_document_summary(
file_in_markdown, user_llm, document_metadata
)
if enable_summary:
summary_content, summary_embedding = await generate_document_summary(
file_in_markdown, user_llm, document_metadata
)
else:
summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
summary_embedding = embed_text(summary_content)
# Process chunks
chunks = await create_document_chunks(file_in_markdown)
# Update or create document
if existing_document:
# Update existing document
existing_document.title = file_name
existing_document.content = summary_content
existing_document.content_hash = content_hash
@ -627,14 +635,12 @@ async def add_received_file_document_using_llamacloud(
existing_document.source_markdown = file_in_markdown
existing_document.content_needs_reindexing = False
existing_document.updated_at = get_current_timestamp()
existing_document.status = DocumentStatus.ready() # Mark as ready
existing_document.status = DocumentStatus.ready()
await session.commit()
await session.refresh(existing_document)
document = existing_document
else:
# Create new document
# Determine document type based on connector
doc_type = DocumentType.FILE
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
doc_type = DocumentType.GOOGLE_DRIVE_FILE
@ -657,7 +663,7 @@ async def add_received_file_document_using_llamacloud(
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector.get("connector_id") if connector else None,
status=DocumentStatus.ready(), # Mark as ready
status=DocumentStatus.ready(),
)
session.add(document)
@ -682,6 +688,7 @@ async def add_received_file_document_using_docling(
search_space_id: int,
user_id: str,
connector: dict | None = None,
enable_summary: bool = True,
) -> Document | None:
"""
Process and store document content parsed by Docling.
@ -734,33 +741,32 @@ async def add_received_file_document_using_docling(
f"No long context LLM configured for user {user_id} in search_space {search_space_id}"
)
# Generate summary using chunked processing for large documents
from app.services.docling_service import create_docling_service
if enable_summary:
from app.services.docling_service import create_docling_service
docling_service = create_docling_service()
docling_service = create_docling_service()
summary_content = await docling_service.process_large_document_summary(
content=file_in_markdown, llm=user_llm, document_title=file_name
)
summary_content = await docling_service.process_large_document_summary(
content=file_in_markdown, llm=user_llm, document_title=file_name
)
# Enhance summary with metadata
document_metadata = {
"file_name": file_name,
"etl_service": "DOCLING",
"document_type": "File Document",
}
metadata_parts = []
metadata_parts.append("# DOCUMENT METADATA")
document_metadata = {
"file_name": file_name,
"etl_service": "DOCLING",
"document_type": "File Document",
}
metadata_parts = ["# DOCUMENT METADATA"]
for key, value in document_metadata.items():
if value:
formatted_key = key.replace("_", " ").title()
metadata_parts.append(f"**{formatted_key}:** {value}")
for key, value in document_metadata.items():
if value: # Only include non-empty values
formatted_key = key.replace("_", " ").title()
metadata_parts.append(f"**{formatted_key}:** {value}")
metadata_section = "\n".join(metadata_parts)
enhanced_summary_content = (
f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
)
metadata_section = "\n".join(metadata_parts)
enhanced_summary_content = (
f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
)
else:
enhanced_summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
summary_embedding = embed_text(enhanced_summary_content)
@ -1219,9 +1225,10 @@ async def process_file_in_background(
print("Error deleting temp file", e)
pass
# Pass the documents to the existing background task
enable_summary = connector.get("enable_summary", True) if connector else True
result = await add_received_file_document_using_unstructured(
session, filename, docs, search_space_id, user_id, connector
session, filename, docs, search_space_id, user_id, connector,
enable_summary=enable_summary,
)
if connector:
@ -1362,7 +1369,7 @@ async def process_file_in_background(
# Extract text content from the markdown documents
markdown_content = doc.text
# Process the documents using our LlamaCloud background task
enable_summary = connector.get("enable_summary", True) if connector else True
doc_result = await add_received_file_document_using_llamacloud(
session,
filename,
@ -1370,6 +1377,7 @@ async def process_file_in_background(
search_space_id=search_space_id,
user_id=user_id,
connector=connector,
enable_summary=enable_summary,
)
# Track if this document was successfully created
@ -1516,7 +1524,7 @@ async def process_file_in_background(
session, notification, stage="chunking"
)
# Process the document using our Docling background task
enable_summary = connector.get("enable_summary", True) if connector else True
doc_result = await add_received_file_document_using_docling(
session,
filename,
@ -1524,6 +1532,7 @@ async def process_file_in_background(
search_space_id=search_space_id,
user_id=user_id,
connector=connector,
enable_summary=enable_summary,
)
if doc_result: