Webcrawler connector draft

This commit is contained in:
samkul-swe 2025-11-21 20:45:59 -08:00
parent 419f94e8ee
commit 896e410e2a
26 changed files with 1225 additions and 9 deletions

View file

@ -600,3 +600,46 @@ async def _index_elasticsearch_documents(
await run_elasticsearch_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
@celery_app.task(name="index_webcrawler_urls", bind=True)
def index_webcrawler_urls_task(
    self,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Run webcrawler URL indexing synchronously from a Celery worker.

    Creates a dedicated asyncio event loop, installs it as the current loop,
    drives ``_index_webcrawler_urls`` to completion on it, and always closes
    the loop afterwards, even if indexing raises.

    Args:
        self: The bound Celery task instance (``bind=True``); not used here.
        connector_id: Forwarded unchanged to ``_index_webcrawler_urls``.
        search_space_id: Forwarded unchanged to ``_index_webcrawler_urls``.
        user_id: Forwarded unchanged to ``_index_webcrawler_urls``.
        start_date: Forwarded unchanged to ``_index_webcrawler_urls``.
        end_date: Forwarded unchanged to ``_index_webcrawler_urls``.
    """
    import asyncio

    # A fresh loop per invocation: the task body is sync, the indexing is async.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    try:
        indexing_job = _index_webcrawler_urls(
            connector_id, search_space_id, user_id, start_date, end_date
        )
        event_loop.run_until_complete(indexing_job)
    finally:
        # Release the loop's resources regardless of success or failure.
        event_loop.close()
async def _index_webcrawler_urls(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Open a new session and delegate to ``run_webcrawler_indexing``.

    The session is obtained from ``get_celery_session_maker()`` and is used as
    an async context manager, so it is exited when indexing finishes or raises.

    Args:
        connector_id: Passed through to ``run_webcrawler_indexing``.
        search_space_id: Passed through to ``run_webcrawler_indexing``.
        user_id: Passed through to ``run_webcrawler_indexing``.
        start_date: Passed through to ``run_webcrawler_indexing``.
        end_date: Passed through to ``run_webcrawler_indexing``.
    """
    # NOTE(review): imported at call time rather than module level — presumably
    # to avoid a circular import with the routes module; confirm.
    from app.routes.search_source_connectors_routes import run_webcrawler_indexing

    session_factory = get_celery_session_maker()
    async with session_factory() as db_session:
        await run_webcrawler_indexing(
            db_session,
            connector_id,
            search_space_id,
            user_id,
            start_date,
            end_date,
        )

View file

@ -77,6 +77,7 @@ async def _check_and_trigger_schedules():
index_luma_events_task,
index_notion_pages_task,
index_slack_messages_task,
index_webcrawler_urls_task
)
# Map connector types to their tasks
@ -94,6 +95,7 @@ async def _check_and_trigger_schedules():
SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_webcrawler_urls_task,
}
# Trigger indexing for each due connector