Merge remote-tracking branch 'upstream/main' into feature/blocknote-editor

Anish Sarkar 2025-11-30 04:10:49 +05:30
commit b98c312fb1
81 changed files with 8976 additions and 2387 deletions

View file

@@ -600,3 +600,46 @@ async def _index_elasticsearch_documents(
    await run_elasticsearch_indexing(
        session, connector_id, search_space_id, user_id, start_date, end_date
    )


@celery_app.task(name="index_crawled_urls", bind=True)
def index_crawled_urls_task(
    self,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Celery task to index crawled web page URLs."""
    import asyncio

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(
            _index_crawled_urls(
                connector_id, search_space_id, user_id, start_date, end_date
            )
        )
    finally:
        loop.close()


async def _index_crawled_urls(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Index crawled web page URLs with a new session."""
    from app.routes.search_source_connectors_routes import (
        run_web_page_indexing,
    )

    async with get_celery_session_maker()() as session:
        await run_web_page_indexing(
            session, connector_id, search_space_id, user_id, start_date, end_date
        )
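
For context, a task registered this way would presumably be enqueued through Celery's standard delay/apply_async API; a minimal usage sketch, in which every argument value is an illustrative placeholder rather than something taken from this diff:

# Hypothetical enqueue call; the ids, user string, and ISO date strings
# below are placeholders.
index_crawled_urls_task.delay(
    connector_id=42,
    search_space_id=1,
    user_id="user-123",
    start_date="2025-11-01",
    end_date="2025-11-30",
)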

View file

@@ -9,7 +9,6 @@ from app.celery_app import celery_app
from app.config import config
from app.services.task_logging_service import TaskLoggingService
from app.tasks.document_processors import (
    add_crawled_url_document,
    add_extension_received_document,
    add_youtube_video_document,
)
@@ -120,71 +119,6 @@ async def _process_extension_document(
    raise


@celery_app.task(name="process_crawled_url", bind=True)
def process_crawled_url_task(self, url: str, search_space_id: int, user_id: str):
    """
    Celery task to process a crawled URL.

    Args:
        url: URL to crawl and process
        search_space_id: ID of the search space
        user_id: ID of the user
    """
    import asyncio

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(_process_crawled_url(url, search_space_id, user_id))
    finally:
        loop.close()


async def _process_crawled_url(url: str, search_space_id: int, user_id: str):
    """Process a crawled URL with a new session."""
    async with get_celery_session_maker()() as session:
        task_logger = TaskLoggingService(session, search_space_id)

        log_entry = await task_logger.log_task_start(
            task_name="process_crawled_url",
            source="document_processor",
            message=f"Starting URL crawling and processing for: {url}",
            metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
        )

        try:
            result = await add_crawled_url_document(
                session, url, search_space_id, user_id
            )

            if result:
                await task_logger.log_task_success(
                    log_entry,
                    f"Successfully crawled and processed URL: {url}",
                    {
                        "document_id": result.id,
                        "title": result.title,
                        "content_hash": result.content_hash,
                    },
                )
            else:
                await task_logger.log_task_success(
                    log_entry,
                    f"URL document already exists (duplicate): {url}",
                    {"duplicate_detected": True},
                )
        except Exception as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to crawl URL: {url}",
                str(e),
                {"error_type": type(e).__name__},
            )
            logger.error(f"Error processing crawled URL: {e!s}")
            raise


@celery_app.task(name="process_youtube_video", bind=True)
def process_youtube_video_task(self, url: str, search_space_id: int, user_id: str):
    """

View file

@@ -67,6 +67,7 @@ async def _check_and_trigger_schedules():
        index_airtable_records_task,
        index_clickup_tasks_task,
        index_confluence_pages_task,
        index_crawled_urls_task,
        index_discord_messages_task,
        index_elasticsearch_documents_task,
        index_github_repos_task,
@@ -94,6 +95,7 @@ async def _check_and_trigger_schedules():
        SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
        SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
        SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
        SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
    }

    # Trigger indexing for each due connector
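
The mapping above keeps the scheduler generic over connector types; a hedged sketch of the dispatch step that the trailing comment introduces, where names such as connector_task_map, due_connectors, and the attributes on connector are assumptions about surrounding code this hunk does not show:

# Hypothetical dispatch loop over connectors whose schedules are due.
for connector in due_connectors:
    task = connector_task_map.get(connector.connector_type)
    if task is None:
        continue  # no indexing task registered for this connector type
    task.delay(
        connector.id,
        connector.search_space_id,
        str(connector.user_id),
        start_date,
        end_date,
    )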