Removed the CRAWLED_URL document processors

2026-04-25 00:36:31 +02:00 · 2025-11-21 23:25:23 -08:00 · 2025-11-21 23:25:23 -08:00 · 8333697598
commit 8333697598
parent 896e410e2a
4 changed files with 4 additions and 404 deletions
--- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
@ -9,7 +9,6 @@ from app.celery_app import celery_app
 from app.config import config
 from app.services.task_logging_service import TaskLoggingService
 from app.tasks.document_processors import (
-    add_crawled_url_document,
    add_extension_received_document,
    add_youtube_video_document,
 )
@ -120,71 +119,6 @@ async def _process_extension_document(
            raise


-@celery_app.task(name="process_crawled_url", bind=True)
-def process_crawled_url_task(self, url: str, search_space_id: int, user_id: str):
-    """
-    Celery task to process crawled URL.
-
-    Args:
-        url: URL to crawl and process
-        search_space_id: ID of the search space
-        user_id: ID of the user
-    """
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(_process_crawled_url(url, search_space_id, user_id))
-    finally:
-        loop.close()
-
-
-async def _process_crawled_url(url: str, search_space_id: int, user_id: str):
-    """Process crawled URL with new session."""
-    async with get_celery_session_maker()() as session:
-        task_logger = TaskLoggingService(session, search_space_id)
-
-        log_entry = await task_logger.log_task_start(
-            task_name="process_crawled_url",
-            source="document_processor",
-            message=f"Starting URL crawling and processing for: {url}",
-            metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
-        )
-
-        try:
-            result = await add_crawled_url_document(
-                session, url, search_space_id, user_id
-            )
-
-            if result:
-                await task_logger.log_task_success(
-                    log_entry,
-                    f"Successfully crawled and processed URL: {url}",
-                    {
-                        "document_id": result.id,
-                        "title": result.title,
-                        "content_hash": result.content_hash,
-                    },
-                )
-            else:
-                await task_logger.log_task_success(
-                    log_entry,
-                    f"URL document already exists (duplicate): {url}",
-                    {"duplicate_detected": True},
-                )
-        except Exception as e:
-            await task_logger.log_task_failure(
-                log_entry,
-                f"Failed to crawl URL: {url}",
-                str(e),
-                {"error_type": type(e).__name__},
-            )
-            logger.error(f"Error processing crawled URL: {e!s}")
-            raise
-
-
@celery_app.task(name="process_youtube_video", bind=True)
 def process_youtube_video_task(self, url: str, search_space_id: int, user_id: str):
    """