mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-30 03:16:25 +02:00
Merge remote-tracking branch 'upstream/dev' into fix/notion-connector
This commit is contained in:
commit
1658724fb2
32 changed files with 633 additions and 78 deletions
|
|
@ -55,7 +55,9 @@ def _clear_generating_podcast(search_space_id: int) -> None:
|
|||
client = redis.from_url(redis_url, decode_responses=True)
|
||||
key = f"podcast:generating:{search_space_id}"
|
||||
client.delete(key)
|
||||
logger.info(f"Cleared generating podcast key for search_space_id={search_space_id}")
|
||||
logger.info(
|
||||
f"Cleared generating podcast key for search_space_id={search_space_id}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not clear generating podcast key: {e}")
|
||||
|
||||
|
|
@ -119,9 +121,7 @@ async def _generate_content_podcast(
|
|||
) -> dict:
|
||||
"""Generate content-based podcast and update existing record."""
|
||||
async with get_celery_session_maker()() as session:
|
||||
result = await session.execute(
|
||||
select(Podcast).filter(Podcast.id == podcast_id)
|
||||
)
|
||||
result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
|
||||
podcast = result.scalars().first()
|
||||
|
||||
if not podcast:
|
||||
|
|
|
|||
|
|
@ -156,6 +156,41 @@ async def _check_and_trigger_schedules():
|
|||
)
|
||||
await session.commit()
|
||||
continue
|
||||
|
||||
# Special handling for Webcrawler - skip if no URLs configured
|
||||
elif (
|
||||
connector.connector_type
|
||||
== SearchSourceConnectorType.WEBCRAWLER_CONNECTOR
|
||||
):
|
||||
from app.utils.webcrawler_utils import parse_webcrawler_urls
|
||||
|
||||
connector_config = connector.config or {}
|
||||
urls = parse_webcrawler_urls(
|
||||
connector_config.get("INITIAL_URLS")
|
||||
)
|
||||
|
||||
if urls:
|
||||
task.delay(
|
||||
connector.id,
|
||||
connector.search_space_id,
|
||||
str(connector.user_id),
|
||||
None, # start_date
|
||||
None, # end_date
|
||||
)
|
||||
else:
|
||||
# No URLs configured - skip indexing but still update next_scheduled_at
|
||||
logger.info(
|
||||
f"Webcrawler connector {connector.id} has no URLs configured, "
|
||||
"skipping periodic indexing (will check again at next scheduled time)"
|
||||
)
|
||||
from datetime import timedelta
|
||||
|
||||
connector.next_scheduled_at = now + timedelta(
|
||||
minutes=connector.indexing_frequency_minutes
|
||||
)
|
||||
await session.commit()
|
||||
continue
|
||||
|
||||
else:
|
||||
task.delay(
|
||||
connector.id,
|
||||
|
|
|
|||
|
|
@ -86,7 +86,7 @@ async def index_composio_connector(
|
|||
end_date: str | None = None,
|
||||
update_last_indexed: bool = True,
|
||||
max_items: int = 1000,
|
||||
) -> tuple[int, str]:
|
||||
) -> tuple[int, int, str | None]:
|
||||
"""
|
||||
Index content from a Composio connector.
|
||||
|
||||
|
|
@ -104,7 +104,7 @@ async def index_composio_connector(
|
|||
max_items: Maximum number of items to fetch
|
||||
|
||||
Returns:
|
||||
Tuple of (number_of_indexed_items, error_message or None)
|
||||
Tuple of (number_of_indexed_items, number_of_skipped_items, error_message or None)
|
||||
"""
|
||||
task_logger = TaskLoggingService(session, search_space_id)
|
||||
|
||||
|
|
@ -132,14 +132,14 @@ async def index_composio_connector(
|
|||
await task_logger.log_task_failure(
|
||||
log_entry, error_msg, {"error_type": "InvalidConnectorType"}
|
||||
)
|
||||
return 0, error_msg
|
||||
return 0, 0, error_msg
|
||||
|
||||
if not connector:
|
||||
error_msg = f"Composio connector with ID {connector_id} not found"
|
||||
await task_logger.log_task_failure(
|
||||
log_entry, error_msg, {"error_type": "ConnectorNotFound"}
|
||||
)
|
||||
return 0, error_msg
|
||||
return 0, 0, error_msg
|
||||
|
||||
# Get toolkit ID from config
|
||||
toolkit_id = connector.config.get("toolkit_id")
|
||||
|
|
@ -150,7 +150,7 @@ async def index_composio_connector(
|
|||
await task_logger.log_task_failure(
|
||||
log_entry, error_msg, {"error_type": "MissingToolkitId"}
|
||||
)
|
||||
return 0, error_msg
|
||||
return 0, 0, error_msg
|
||||
|
||||
# Check if toolkit is indexable
|
||||
if toolkit_id not in INDEXABLE_TOOLKITS:
|
||||
|
|
@ -158,7 +158,7 @@ async def index_composio_connector(
|
|||
await task_logger.log_task_failure(
|
||||
log_entry, error_msg, {"error_type": "ToolkitNotIndexable"}
|
||||
)
|
||||
return 0, error_msg
|
||||
return 0, 0, error_msg
|
||||
|
||||
# Get indexer function from registry
|
||||
try:
|
||||
|
|
@ -167,7 +167,7 @@ async def index_composio_connector(
|
|||
await task_logger.log_task_failure(
|
||||
log_entry, str(e), {"error_type": "NoIndexerImplemented"}
|
||||
)
|
||||
return 0, str(e)
|
||||
return 0, 0, str(e)
|
||||
|
||||
# Build kwargs for the indexer function
|
||||
kwargs = {
|
||||
|
|
@ -199,7 +199,7 @@ async def index_composio_connector(
|
|||
{"error_type": "SQLAlchemyError"},
|
||||
)
|
||||
logger.error(f"Database error: {db_error!s}", exc_info=True)
|
||||
return 0, f"Database error: {db_error!s}"
|
||||
return 0, 0, f"Database error: {db_error!s}"
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
await task_logger.log_task_failure(
|
||||
|
|
@ -209,4 +209,4 @@ async def index_composio_connector(
|
|||
{"error_type": type(e).__name__},
|
||||
)
|
||||
logger.error(f"Failed to index Composio connector: {e!s}", exc_info=True)
|
||||
return 0, f"Failed to index Composio connector: {e!s}"
|
||||
return 0, 0, f"Failed to index Composio connector: {e!s}"
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ from app.utils.document_converters import (
|
|||
from .base import (
|
||||
calculate_date_range,
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -317,6 +318,24 @@ async def index_airtable_records(
|
|||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = (
|
||||
await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Airtable record {record_id} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate document summary
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ from app.utils.document_converters import (
|
|||
from .base import (
|
||||
calculate_date_range,
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -308,6 +309,22 @@ async def index_bookstack_pages(
|
|||
logger.info(f"Successfully updated BookStack page {page_name}")
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"BookStack page {page_name} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ from app.utils.document_converters import (
|
|||
|
||||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -302,6 +303,22 @@ async def index_clickup_tasks(
|
|||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"ClickUp task {task_name} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ from app.utils.document_converters import (
|
|||
from .base import (
|
||||
calculate_date_range,
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -306,6 +307,22 @@ async def index_confluence_pages(
|
|||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Confluence page {page_title} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ from app.utils.document_converters import (
|
|||
from .base import (
|
||||
build_document_metadata_markdown,
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -454,6 +455,24 @@ async def index_discord_messages(
|
|||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = (
|
||||
await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Discord message {msg_id} in {guild_name}#{channel_name} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ from app.utils.document_converters import (
|
|||
|
||||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -319,6 +320,21 @@ async def _process_repository_digest(
|
|||
# Delete existing document to replace with new one
|
||||
await session.delete(existing_document)
|
||||
await session.flush()
|
||||
else:
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Repository {repo_full_name} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
return 0
|
||||
|
||||
# Generate summary using LLM (ONE call per repository!)
|
||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||
|
|
|
|||
|
|
@ -24,7 +24,9 @@ from app.utils.document_converters import (
|
|||
)
|
||||
|
||||
from .base import (
|
||||
calculate_date_range,
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -163,10 +165,22 @@ async def index_google_gmail_messages(
|
|||
credentials, session, user_id, connector_id
|
||||
)
|
||||
|
||||
# Calculate date range using last_indexed_at if dates not provided
|
||||
# This ensures Gmail uses the same date logic as other connectors
|
||||
# (uses last_indexed_at → now, or 365 days back for first-time indexing)
|
||||
calculated_start_date, calculated_end_date = calculate_date_range(
|
||||
connector, start_date, end_date, default_days_back=365
|
||||
)
|
||||
|
||||
# Fetch recent Google gmail messages
|
||||
logger.info(f"Fetching recent emails for connector {connector_id}")
|
||||
logger.info(
|
||||
f"Fetching emails for connector {connector_id} "
|
||||
f"from {calculated_start_date} to {calculated_end_date}"
|
||||
)
|
||||
messages, error = await gmail_connector.get_recent_messages(
|
||||
max_results=max_messages, start_date=start_date, end_date=end_date
|
||||
max_results=max_messages,
|
||||
start_date=calculated_start_date,
|
||||
end_date=calculated_end_date,
|
||||
)
|
||||
|
||||
if error:
|
||||
|
|
@ -316,6 +330,22 @@ async def index_google_gmail_messages(
|
|||
logger.info(f"Successfully updated Gmail message {subject}")
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Gmail message {subject} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ from app.utils.document_converters import (
|
|||
from .base import (
|
||||
calculate_date_range,
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -284,6 +285,22 @@ async def index_jira_issues(
|
|||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Jira issue {issue_identifier} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ from app.utils.document_converters import (
|
|||
from .base import (
|
||||
calculate_date_range,
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -315,6 +316,22 @@ async def index_linear_issues(
|
|||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Linear issue {issue_identifier} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ from app.utils.document_converters import (
|
|||
|
||||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -363,6 +364,22 @@ async def index_luma_events(
|
|||
logger.info(f"Successfully updated Luma event {event_name}")
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Luma event {event_name} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ from .base import (
|
|||
build_document_metadata_string,
|
||||
calculate_date_range,
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -388,6 +389,22 @@ async def index_notion_pages(
|
|||
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Notion page {page_title} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Get user's long context LLM
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ from app.utils.document_converters import (
|
|||
from .base import (
|
||||
build_document_metadata_string,
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -426,6 +427,22 @@ async def index_obsidian_vault(
|
|||
indexed_count += 1
|
||||
|
||||
else:
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Obsidian note {title} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
# Create new document
|
||||
logger.info(f"Indexing new note: {title}")
|
||||
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ from .base import (
|
|||
build_document_metadata_markdown,
|
||||
calculate_date_range,
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -325,6 +326,22 @@ async def index_slack_messages(
|
|||
logger.info(f"Successfully updated Slack message {msg_ts}")
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Slack message {msg_ts} in channel {channel_name} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(combined_document_string)
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ from .base import (
|
|||
build_document_metadata_markdown,
|
||||
calculate_date_range,
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -354,6 +355,27 @@ async def index_teams_messages(
|
|||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = (
|
||||
await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
"Teams message %s in channel %s already indexed by another connector "
|
||||
"(existing document ID: %s, type: %s). Skipping.",
|
||||
message_id,
|
||||
channel_name,
|
||||
duplicate_by_content.id,
|
||||
duplicate_by_content.document_type,
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(
|
||||
|
|
|
|||
|
|
@ -18,9 +18,11 @@ from app.utils.document_converters import (
|
|||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
from app.utils.webcrawler_utils import parse_webcrawler_urls
|
||||
|
||||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
|
|
@ -96,13 +98,7 @@ async def index_crawled_urls(
|
|||
api_key = connector.config.get("FIRECRAWL_API_KEY")
|
||||
|
||||
# Get URLs from connector config
|
||||
initial_urls = connector.config.get("INITIAL_URLS", "")
|
||||
if isinstance(initial_urls, str):
|
||||
urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
|
||||
elif isinstance(initial_urls, list):
|
||||
urls = [url.strip() for url in initial_urls if url.strip()]
|
||||
else:
|
||||
urls = []
|
||||
urls = parse_webcrawler_urls(connector.config.get("INITIAL_URLS"))
|
||||
|
||||
logger.info(
|
||||
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
|
||||
|
|
@ -281,6 +277,22 @@ async def index_crawled_urls(
|
|||
logger.info(f"Successfully updated URL {url}")
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"URL {url} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
|
|||
|
|
@ -55,7 +55,9 @@ LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
|
|||
)
|
||||
|
||||
# Timeout calculation constants
|
||||
UPLOAD_BYTES_PER_SECOND_SLOW = 100 * 1024 # 100 KB/s (conservative for slow connections)
|
||||
UPLOAD_BYTES_PER_SECOND_SLOW = (
|
||||
100 * 1024
|
||||
) # 100 KB/s (conservative for slow connections)
|
||||
MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file
|
||||
MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files
|
||||
BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing
|
||||
|
|
@ -219,19 +221,19 @@ async def find_existing_document_with_migration(
|
|||
def calculate_upload_timeout(file_size_bytes: int) -> float:
|
||||
"""
|
||||
Calculate appropriate upload timeout based on file size.
|
||||
|
||||
|
||||
Assumes a conservative slow connection speed to handle worst-case scenarios.
|
||||
|
||||
|
||||
Args:
|
||||
file_size_bytes: Size of the file in bytes
|
||||
|
||||
|
||||
Returns:
|
||||
Timeout in seconds
|
||||
"""
|
||||
# Calculate time needed at slow connection speed
|
||||
# Add 50% buffer for network variability and SSL overhead
|
||||
estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
|
||||
|
||||
|
||||
# Clamp to reasonable bounds
|
||||
return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT))
|
||||
|
||||
|
|
@ -239,21 +241,21 @@ def calculate_upload_timeout(file_size_bytes: int) -> float:
|
|||
def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
|
||||
"""
|
||||
Calculate job processing timeout based on page count and file size.
|
||||
|
||||
|
||||
Args:
|
||||
estimated_pages: Estimated number of pages
|
||||
file_size_bytes: Size of the file in bytes
|
||||
|
||||
|
||||
Returns:
|
||||
Timeout in seconds
|
||||
"""
|
||||
# Base timeout + time per page
|
||||
page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
|
||||
|
||||
|
||||
# Also consider file size (large images take longer to process)
|
||||
# ~1 minute per 10MB of file size
|
||||
size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
|
||||
|
||||
|
||||
# Use the larger of the two estimates
|
||||
return max(page_based_timeout, size_based_timeout)
|
||||
|
||||
|
|
@ -284,18 +286,18 @@ async def parse_with_llamacloud_retry(
|
|||
"""
|
||||
import os
|
||||
import random
|
||||
|
||||
|
||||
from llama_cloud_services import LlamaParse
|
||||
from llama_cloud_services.parse.utils import ResultType
|
||||
|
||||
# Get file size for timeout calculations
|
||||
file_size_bytes = os.path.getsize(file_path)
|
||||
file_size_mb = file_size_bytes / (1024 * 1024)
|
||||
|
||||
|
||||
# Calculate dynamic timeouts based on file size and page count
|
||||
upload_timeout = calculate_upload_timeout(file_size_bytes)
|
||||
job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
|
||||
|
||||
|
||||
# HTTP client timeouts - scaled based on file size
|
||||
# Write timeout is critical for large file uploads
|
||||
custom_timeout = httpx.Timeout(
|
||||
|
|
@ -304,7 +306,7 @@ async def parse_with_llamacloud_retry(
|
|||
write=upload_timeout, # Dynamic based on file size (upload time)
|
||||
pool=120.0, # 2 minutes to acquire connection from pool
|
||||
)
|
||||
|
||||
|
||||
logging.info(
|
||||
f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
|
||||
f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
|
||||
|
|
@ -335,14 +337,14 @@ async def parse_with_llamacloud_retry(
|
|||
|
||||
# Parse the file asynchronously
|
||||
result = await parser.aparse(file_path)
|
||||
|
||||
|
||||
# Success - log if we had previous failures
|
||||
if attempt > 1:
|
||||
logging.info(
|
||||
f"LlamaCloud upload succeeded on attempt {attempt} after "
|
||||
f"{len(attempt_errors)} failures"
|
||||
)
|
||||
|
||||
|
||||
return result
|
||||
|
||||
except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
|
||||
|
|
@ -355,8 +357,7 @@ async def parse_with_llamacloud_retry(
|
|||
# Calculate exponential backoff with jitter
|
||||
# Base delay doubles each attempt, capped at max delay
|
||||
base_delay = min(
|
||||
LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
|
||||
LLAMACLOUD_MAX_DELAY
|
||||
LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), LLAMACLOUD_MAX_DELAY
|
||||
)
|
||||
# Add random jitter (±25%) to prevent thundering herd
|
||||
jitter = base_delay * 0.25 * (2 * random.random() - 1)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue