Mirror of https://github.com/MODSetter/SurfSense.git, synced 2026-05-02 04:12:47 +02:00
Webcrawler connector draft
parent 419f94e8ee
commit 896e410e2a
26 changed files with 1225 additions and 9 deletions (only one file's hunks are reproduced below)
```diff
@@ -49,6 +49,7 @@ from app.tasks.connector_indexers import (
     index_luma_events,
     index_notion_pages,
     index_slack_messages,
+    index_webcrawler_urls,
 )
 from app.users import current_active_user
 from app.utils.check_ownership import check_ownership
```
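For orientation: the second hunk below calls `index_webcrawler_urls` and unpacks a `(documents_processed, error_or_warning)` pair. A plausible stub of the indexer's interface, inferred purely from that call site — the return annotation and the default for `update_last_indexed` are assumptions, not the actual code in `app.tasks.connector_indexers`:

```python
from sqlalchemy.ext.asyncio import AsyncSession


# Hypothetical stub inferred from the call site in the next hunk;
# the real implementation lives in app.tasks.connector_indexers.
async def index_webcrawler_urls(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
    update_last_indexed: bool = True,  # assumed default; the diff passes False explicitly
) -> tuple[int, str | None]:
    """Crawl and index the connector's URLs, returning
    (documents_processed, error_or_warning)."""
    ...
```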
```diff
@@ -1523,3 +1524,63 @@ async def run_elasticsearch_indexing(
             f"Critical error in run_elasticsearch_indexing for connector {connector_id}: {e}",
             exc_info=True,
         )
+
+# Add new helper functions for webcrawler indexing
+async def run_webcrawler_indexing_with_new_session(
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str,
+    end_date: str,
+):
+    """
+    Create a new session and run the Webcrawler indexing task.
+    This prevents session leaks by creating a dedicated session for the background task.
+    """
+    async with async_session_maker() as session:
+        await run_webcrawler_indexing(
+            session, connector_id, search_space_id, user_id, start_date, end_date
+        )
+
+
+async def run_webcrawler_indexing(
+    session: AsyncSession,
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str,
+    end_date: str,
+):
+    """
+    Background task to run Webcrawler indexing.
+    Args:
+        session: Database session
+        connector_id: ID of the webcrawler connector
+        search_space_id: ID of the search space
+        user_id: ID of the user
+        start_date: Start date for indexing
+        end_date: End date for indexing
+    """
+    try:
+        documents_processed, error_or_warning = await index_webcrawler_urls(
+            session=session,
+            connector_id=connector_id,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            start_date=start_date,
+            end_date=end_date,
+            update_last_indexed=False,  # Don't update timestamp in the indexing function
+        )
+
+        # Only update last_indexed_at if indexing was successful (either new docs or updated docs)
+        if documents_processed > 0:
+            await update_connector_last_indexed(session, connector_id)
+            logger.info(
+                f"Webcrawler indexing completed successfully: {documents_processed} documents processed"
+            )
+        else:
+            logger.error(
+                f"Webcrawler indexing failed or no documents processed: {error_or_warning}"
+            )
+    except Exception as e:
+        logger.error(f"Error in background Webcrawler indexing task: {e!s}")
```
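The `_with_new_session` wrapper exists because a request-scoped database session is closed once the HTTP response goes out, so a task that runs afterwards must open its own. A minimal sketch of how a route handler might schedule the helper, assuming FastAPI's `BackgroundTasks`; the route path, parameters, and hard-coded dates are illustrative, not taken from this commit:

```python
from fastapi import APIRouter, BackgroundTasks, Depends

router = APIRouter()


# Illustrative endpoint; the commit's actual route may differ.
@router.post("/search-source-connectors/{connector_id}/index")
async def trigger_webcrawler_indexing(
    connector_id: int,
    search_space_id: int,
    background_tasks: BackgroundTasks,
    user=Depends(current_active_user),
):
    # Schedule the session-creating wrapper rather than
    # run_webcrawler_indexing itself: the task runs after the response
    # is sent, when the request's own DB session would already be closed.
    background_tasks.add_task(
        run_webcrawler_indexing_with_new_session,
        connector_id=connector_id,
        search_space_id=search_space_id,
        user_id=str(user.id),
        start_date="2026-04-01",  # placeholders; the real route likely takes these as inputs
        end_date="2026-05-02",
    )
    return {"message": "Webcrawler indexing started in the background"}
```

Note the design choice mirrored from the other connectors in this file: the indexer itself is told not to bump the timestamp (`update_last_indexed=False`), and `last_indexed_at` is only advanced by the caller once at least one document was processed, so a failed crawl does not mask unindexed content.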