Webcrawler connector draft

This commit is contained in:
samkul-swe 2025-11-21 20:45:59 -08:00
parent 419f94e8ee
commit 896e410e2a
26 changed files with 1225 additions and 9 deletions

View file

@ -49,6 +49,7 @@ from app.tasks.connector_indexers import (
index_luma_events,
index_notion_pages,
index_slack_messages,
index_webcrawler_urls,
)
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
@ -1523,3 +1524,63 @@ async def run_elasticsearch_indexing(
f"Critical error in run_elasticsearch_indexing for connector {connector_id}: {e}",
exc_info=True,
)
# Helper functions for webcrawler indexing
async def run_webcrawler_indexing_with_new_session(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """
    Run the Webcrawler indexing task inside its own database session.

    A session is opened from ``async_session_maker`` for the duration of the
    task and released when the ``async with`` block exits, so the background
    job does not hold on to a session created elsewhere.

    Args:
        connector_id: ID of the webcrawler connector
        search_space_id: ID of the search space
        user_id: ID of the user
        start_date: Start date for indexing
        end_date: End date for indexing
    """
    async with async_session_maker() as db_session:
        await run_webcrawler_indexing(
            session=db_session,
            connector_id=connector_id,
            search_space_id=search_space_id,
            user_id=user_id,
            start_date=start_date,
            end_date=end_date,
        )
async def run_webcrawler_indexing(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """
    Background task to run Webcrawler indexing.

    Delegates to ``index_webcrawler_urls`` and, when at least one document
    was processed, records the run via ``update_connector_last_indexed``.
    Any exception is caught and logged (with traceback) rather than
    propagated, so this coroutine never raises to its caller.

    Args:
        session: Database session
        connector_id: ID of the webcrawler connector
        search_space_id: ID of the search space
        user_id: ID of the user
        start_date: Start date for indexing
        end_date: End date for indexing
    """
    try:
        documents_processed, error_or_warning = await index_webcrawler_urls(
            session=session,
            connector_id=connector_id,
            search_space_id=search_space_id,
            user_id=user_id,
            start_date=start_date,
            end_date=end_date,
            # The timestamp is updated below, only after a successful run.
            update_last_indexed=False,
        )

        # Only update last_indexed_at if indexing was successful
        # (either new docs or updated docs).
        if documents_processed > 0:
            await update_connector_last_indexed(session, connector_id)
            logger.info(
                f"Webcrawler indexing completed successfully: {documents_processed} documents processed"
            )
        else:
            logger.error(
                f"Webcrawler indexing failed or no documents processed: {error_or_warning}"
            )
    except Exception as e:
        # The exception is swallowed here, so the log record is the only
        # diagnostic left; include the traceback to make failures debuggable.
        logger.error(
            f"Error in background Webcrawler indexing task: {e!s}",
            exc_info=True,
        )