Merge remote-tracking branch 'upstream/main' into feature/blocknote-editor

Anish Sarkar 2025-11-30 04:10:49 +05:30
commit b98c312fb1
81 changed files with 8976 additions and 2387 deletions

View file

@@ -600,3 +600,46 @@ async def _index_elasticsearch_documents(
    await run_elasticsearch_indexing(
        session, connector_id, search_space_id, user_id, start_date, end_date
    )


@celery_app.task(name="index_crawled_urls", bind=True)
def index_crawled_urls_task(
    self,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Celery task to index crawled web page URLs."""
    import asyncio

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(
            _index_crawled_urls(
                connector_id, search_space_id, user_id, start_date, end_date
            )
        )
    finally:
        loop.close()


async def _index_crawled_urls(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Index crawled web page URLs with a new session."""
    from app.routes.search_source_connectors_routes import (
        run_web_page_indexing,
    )

    async with get_celery_session_maker()() as session:
        await run_web_page_indexing(
            session, connector_id, search_space_id, user_id, start_date, end_date
        )
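
For context, a task registered this way would presumably be enqueued through Celery's standard delay/apply_async API; a minimal usage sketch, in which every argument value is an illustrative placeholder rather than something taken from this diff:

# Hypothetical enqueue call; the ids, user string, and ISO date strings
# below are placeholders.
index_crawled_urls_task.delay(
    connector_id=42,
    search_space_id=1,
    user_id="user-123",
    start_date="2025-11-01",
    end_date="2025-11-30",
)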

View file

@@ -9,7 +9,6 @@ from app.celery_app import celery_app
from app.config import config
from app.services.task_logging_service import TaskLoggingService
from app.tasks.document_processors import (
    add_crawled_url_document,
    add_extension_received_document,
    add_youtube_video_document,
)
@@ -120,71 +119,6 @@ async def _process_extension_document(
    raise


@celery_app.task(name="process_crawled_url", bind=True)
def process_crawled_url_task(self, url: str, search_space_id: int, user_id: str):
    """
    Celery task to process a crawled URL.

    Args:
        url: URL to crawl and process
        search_space_id: ID of the search space
        user_id: ID of the user
    """
    import asyncio

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(_process_crawled_url(url, search_space_id, user_id))
    finally:
        loop.close()


async def _process_crawled_url(url: str, search_space_id: int, user_id: str):
    """Process a crawled URL with a new session."""
    async with get_celery_session_maker()() as session:
        task_logger = TaskLoggingService(session, search_space_id)

        log_entry = await task_logger.log_task_start(
            task_name="process_crawled_url",
            source="document_processor",
            message=f"Starting URL crawling and processing for: {url}",
            metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
        )

        try:
            result = await add_crawled_url_document(
                session, url, search_space_id, user_id
            )

            if result:
                await task_logger.log_task_success(
                    log_entry,
                    f"Successfully crawled and processed URL: {url}",
                    {
                        "document_id": result.id,
                        "title": result.title,
                        "content_hash": result.content_hash,
                    },
                )
            else:
                await task_logger.log_task_success(
                    log_entry,
                    f"URL document already exists (duplicate): {url}",
                    {"duplicate_detected": True},
                )
        except Exception as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to crawl URL: {url}",
                str(e),
                {"error_type": type(e).__name__},
            )
            logger.error(f"Error processing crawled URL: {e!s}")
            raise


@celery_app.task(name="process_youtube_video", bind=True)
def process_youtube_video_task(self, url: str, search_space_id: int, user_id: str):
    """

View file

@@ -67,6 +67,7 @@ async def _check_and_trigger_schedules():
        index_airtable_records_task,
        index_clickup_tasks_task,
        index_confluence_pages_task,
        index_crawled_urls_task,
        index_discord_messages_task,
        index_elasticsearch_documents_task,
        index_github_repos_task,
@@ -94,6 +95,7 @@ async def _check_and_trigger_schedules():
        SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
        SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
        SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
        SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
    }

    # Trigger indexing for each due connector
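
The mapping above keeps the scheduler generic over connector types; a hedged sketch of the dispatch step that the trailing comment introduces, where names such as connector_task_map, due_connectors, and the attributes on connector are assumptions about surrounding code this hunk does not show:

# Hypothetical dispatch loop over connectors whose schedules are due.
for connector in due_connectors:
    task = connector_task_map.get(connector.connector_type)
    if task is None:
        continue  # no indexing task registered for this connector type
    task.delay(
        connector.id,
        connector.search_space_id,
        str(connector.user_id),
        start_date,
        end_date,
    )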