feat: add residential proxy configuration for web crawling and YouTube transcript fetching

2026-07-22 23:31:12 +02:00 · 2026-02-05 20:44:13 -08:00 · 2026-02-05 20:44:13 -08:00 · 1511c26ef5
commit 1511c26ef5
parent eaa0060def
12 changed files with 251 additions and 16 deletions
--- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
@@ -15,12 +15,12 @@ logger = logging.getLogger(__name__)
 def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
    """
    Handle greenlet_spawn errors with detailed logging for debugging.
-    
+
    The 'greenlet_spawn has not been called' error occurs when:
    1. SQLAlchemy lazy-loads a relationship outside of an async context
    2. A sync operation is called from an async context (or vice versa)
    3. Session objects are accessed after the session is closed
-    
+
    This helper logs detailed context to help identify the root cause.
    """
    error_str = str(e)
--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@@ -52,8 +52,8 @@ def parse_date_flexible(date_str: str) -> datetime:
    # Try ISO format as fallback
    try:
        return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
-    except ValueError:
-        raise ValueError(f"Unable to parse date: {date_str}")
+    except ValueError as err:
+        raise ValueError(f"Unable to parse date: {date_str}") from err


 async def check_duplicate_document_by_hash(
--- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
@@ -217,7 +217,7 @@ async def index_notion_pages(
                )
                await task_logger.log_task_failure(
                    log_entry,
-                    f"Failed to get Notion pages: Notion API limitation",
+                    "Failed to get Notion pages: Notion API limitation",
                    f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.",
                    {"error_type": "UnsupportedBlockType", "is_known_limitation": True},
                )
--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@@ -138,7 +138,7 @@ async def index_crawled_urls(
                f"No URLs provided for indexing. Connector ID: {connector_id}, "
                f"Connector name: {connector.name}, "
                f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, "
-                f"INITIAL_URLS raw value: {repr(raw_initial_urls)}"
+                f"INITIAL_URLS raw value: {raw_initial_urls!r}"
            )
            await task_logger.log_task_failure(
                log_entry,
--- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py
@@ -6,6 +6,8 @@ import logging
 from urllib.parse import parse_qs, urlparse

 import aiohttp
+from fake_useragent import UserAgent
+from requests import Session
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession
 from youtube_transcript_api import YouTubeTranscriptApi
@@ -19,6 +21,7 @@ from app.utils.document_converters import (
    generate_document_summary,
    generate_unique_identifier_hash,
 )
+from app.utils.proxy_config import get_requests_proxies

 from .base import (
    check_document_by_unique_identifier,
@@ -114,9 +117,16 @@ async def add_youtube_video_document(
        }
        oembed_url = "https://www.youtube.com/oembed"

+        # Look up the residential proxy mapping (falsy when no proxy is configured)
+        residential_proxies = get_requests_proxies()
+
        async with (
            aiohttp.ClientSession() as http_session,
-            http_session.get(oembed_url, params=params) as response,
+            http_session.get(
+                oembed_url,
+                params=params,
+                proxy=residential_proxies["http"] if residential_proxies else None,
+            ) as response,
        ):
            video_data = await response.json()

@@ -138,7 +148,12 @@ async def add_youtube_video_document(
        )

        try:
-            ytt_api = YouTubeTranscriptApi()
+            ua = UserAgent()
+            http_client = Session()
+            http_client.headers.update({"User-Agent": ua.random})
+            if residential_proxies:
+                http_client.proxies.update(residential_proxies)
+            ytt_api = YouTubeTranscriptApi(http_client=http_client)
            captions = ytt_api.fetch(video_id)
            # Include complete caption information with timestamps
            transcript_segments = []