Merge pull request #789 from MODSetter/dev

feat: rotating proxy support
2026-07-22 23:31:12 +02:00 · 2026-02-05 20:47:54 -08:00 · 2026-02-05 20:47:54 -08:00 · ac35f9d674
commit ac35f9d674
parent eaa0060def 9dab665c7c
14 changed files with 261 additions and 26 deletions
--- a/surfsense_backend/.env.example
+++ b/surfsense_backend/.env.example
@ -143,6 +143,15 @@ STT_SERVICE=local/base
 PAGES_LIMIT=500
 # Residential Proxy Configuration (anonymous-proxies.net)
 # Used for web crawling, link previews, and YouTube transcript fetching to avoid IP bans.
 # Leave commented out to disable proxying.
 # RESIDENTIAL_PROXY_USERNAME=your_proxy_username
 # RESIDENTIAL_PROXY_PASSWORD=your_proxy_password
 # RESIDENTIAL_PROXY_HOSTNAME=rotating.dnsproxifier.com:31230
 # RESIDENTIAL_PROXY_LOCATION=
 # RESIDENTIAL_PROXY_TYPE=1
 FIRECRAWL_API_KEY=fcr-01J0000000000000000000000
 # File Parser Service
--- a/surfsense_backend/alembic/versions/93_add_image_generations_table.py
+++ b/surfsense_backend/alembic/versions/93_add_image_generations_table.py
@ -13,8 +13,7 @@ Changes:
 from collections.abc import Sequence
 import sqlalchemy as sa
-from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM
+from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM, JSONB, UUID
 from sqlalchemy.dialects.postgresql import JSONB, UUID
 from alembic import op
--- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py
+++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
@ -17,6 +17,8 @@ from fake_useragent import UserAgent
 from langchain_core.tools import tool
 from playwright.async_api import async_playwright
 from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
 logger = logging.getLogger(__name__)
@ -186,9 +188,15 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
        ua = UserAgent()
        user_agent = ua.random
        # Use residential proxy if configured
        playwright_proxy = get_playwright_proxy()
        # Use Playwright to fetch the page
        async with async_playwright() as p:
-            browser = await p.chromium.launch(headless=True)
+            launch_kwargs: dict = {"headless": True}
            if playwright_proxy:
                launch_kwargs["proxy"] = playwright_proxy
            browser = await p.chromium.launch(**launch_kwargs)
            context = await browser.new_context(user_agent=user_agent)
            page = await context.new_page()
@ -283,12 +291,16 @@ def create_link_preview_tool():
            ua = UserAgent()
            user_agent = ua.random
            # Use residential proxy if configured
            proxy_url = get_residential_proxy_url()
            # Use a browser-like User-Agent to fetch Open Graph metadata.
            # We're only fetching publicly available metadata (title, description, thumbnail)
            # that websites intentionally expose via OG tags for link preview purposes.
            async with httpx.AsyncClient(
                timeout=10.0,
                follow_redirects=True,
                proxy=proxy_url,
                headers={
                    "User-Agent": user_agent,
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
--- a/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py
+++ b/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py
@ -2,17 +2,26 @@
 Web scraping tool for the SurfSense agent.
 This module provides a tool for scraping and extracting content from webpages
-using the existing WebCrawlerConnector. The scraped content can be used by
+using the existing WebCrawlerConnector. For YouTube URLs, it fetches the
-the agent to answer questions about web pages.
+transcript directly via the YouTubeTranscriptApi instead of crawling the page.
 """
 import hashlib
 import logging
 from typing import Any
 from urllib.parse import urlparse
 import aiohttp
 from fake_useragent import UserAgent
 from langchain_core.tools import tool
 from requests import Session
 from youtube_transcript_api import YouTubeTranscriptApi
 from app.connectors.webcrawler_connector import WebCrawlerConnector
 from app.tasks.document_processors.youtube_processor import get_youtube_video_id
 from app.utils.proxy_config import get_requests_proxies
 logger = logging.getLogger(__name__)
 def extract_domain(url: str) -> str:
@ -57,6 +66,89 @@ def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]:
    return truncated + "\n\n[Content truncated...]", True
 async def _scrape_youtube_video(
    url: str, video_id: str, max_length: int
 ) -> dict[str, Any]:
    """
    Fetch YouTube video metadata and transcript via the YouTubeTranscriptApi.

    Metadata (title, author) comes from YouTube's oEmbed endpoint; the
    transcript comes from ``YouTubeTranscriptApi``. Both requests go through
    the residential proxy when ``get_requests_proxies()`` returns one.
    Failures are best-effort: missing metadata falls back to placeholder
    values and a missing transcript is reported inside the content text.

    Args:
        url: The URL as supplied by the caller; echoed back as ``assetId``/``href``.
        video_id: The YouTube video ID extracted from ``url``.
        max_length: Maximum content length passed to ``truncate_content``.

    Returns:
        A result dict in the same shape as the regular scrape_webpage output.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    scrape_id = generate_scrape_id(url)
    domain = "youtube.com"

    residential_proxies = get_requests_proxies()

    # --- Video metadata via oEmbed ---
    params = {
        "format": "json",
        "url": f"https://www.youtube.com/watch?v={video_id}",
    }
    oembed_url = "https://www.youtube.com/oembed"

    try:
        async with (
            aiohttp.ClientSession() as http_session,
            http_session.get(
                oembed_url,
                params=params,
                # aiohttp takes a single proxy URL string, not a scheme mapping.
                proxy=residential_proxies["http"] if residential_proxies else None,
            ) as response,
        ):
            video_data = await response.json()
    except Exception as e:
        # Metadata is optional; record why it is missing and continue.
        logger.warning(
            f"[scrape_webpage] Could not fetch oEmbed metadata for video {video_id}: {e}"
        )
        video_data = {}

    # Guard against a non-object JSON payload so the .get() calls below are safe.
    if not isinstance(video_data, dict):
        video_data = {}

    title = video_data.get("title", "YouTube Video")
    author = video_data.get("author_name", "Unknown")

    # --- Transcript via YouTubeTranscriptApi ---
    def _fetch_transcript() -> str:
        """Fetch and format the transcript using a short-lived requests Session."""
        # The context manager closes the Session's connections when done.
        with Session() as http_client:
            http_client.headers.update({"User-Agent": UserAgent().random})
            if residential_proxies:
                http_client.proxies.update(residential_proxies)
            captions = YouTubeTranscriptApi(http_client=http_client).fetch(video_id)
            return "\n".join(
                f"[{line.start:.2f}s-{line.start + line.duration:.2f}s] {line.text}"
                for line in captions
            )

    try:
        # The transcript client is synchronous (requests-based); run it in a
        # worker thread so it does not block the event loop.
        transcript_text = await asyncio.to_thread(_fetch_transcript)
    except Exception as e:
        logger.warning(f"[scrape_webpage] No transcript for video {video_id}: {e}")
        transcript_text = f"No captions available for this video. Error: {e!s}"

    # Build combined content
    content = f"# {title}\n\n**Author:** {author}\n**Video ID:** {video_id}\n\n## Transcript\n\n{transcript_text}"

    # Truncate if needed
    content, was_truncated = truncate_content(content, max_length)
    word_count = len(content.split())

    description = f"YouTube video by {author}"

    return {
        "id": scrape_id,
        "assetId": url,
        "kind": "article",
        "href": url,
        "title": title,
        "description": description,
        "content": content,
        "domain": domain,
        "word_count": word_count,
        "was_truncated": was_truncated,
        "crawler_type": "youtube_transcript",
        "author": author,
    }
 def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
    """
    Factory function to create the scrape_webpage tool.
@ -79,7 +171,8 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
        Use this tool when the user wants you to read, summarize, or answer
        questions about a specific webpage's content. This tool actually
-        fetches and reads the full page content.
+        fetches and reads the full page content. For YouTube video URLs it
        fetches the transcript directly instead of crawling the page.
        Common triggers:
        - "Read this article and summarize it"
@ -114,6 +207,11 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
            url = f"https://{url}"
        try:
            # Check if this is a YouTube URL and use transcript API instead
            video_id = get_youtube_video_id(url)
            if video_id:
                return await _scrape_youtube_video(url, video_id, max_length)
            # Create webcrawler connector
            connector = WebCrawlerConnector(firecrawl_api_key=firecrawl_api_key)
@ -184,7 +282,7 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
        except Exception as e:
            error_message = str(e)
-            print(f"[scrape_webpage] Error scraping {url}: {error_message}")
+            logger.error(f"[scrape_webpage] Error scraping {url}: {error_message}")
            return {
                "id": scrape_id,
                "assetId": url,
--- a/surfsense_backend/app/config/init.py
+++ b/surfsense_backend/app/config/init.py
@ -360,6 +360,14 @@ class Config:
        # LlamaCloud API Key
        LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
    # Residential Proxy Configuration (anonymous-proxies.net)
    # Used for web crawling, link previews, and YouTube transcript fetching to avoid IP bans.
    RESIDENTIAL_PROXY_USERNAME = os.getenv("RESIDENTIAL_PROXY_USERNAME")
    RESIDENTIAL_PROXY_PASSWORD = os.getenv("RESIDENTIAL_PROXY_PASSWORD")
    RESIDENTIAL_PROXY_HOSTNAME = os.getenv("RESIDENTIAL_PROXY_HOSTNAME")
    RESIDENTIAL_PROXY_LOCATION = os.getenv("RESIDENTIAL_PROXY_LOCATION", "")
    RESIDENTIAL_PROXY_TYPE = int(os.getenv("RESIDENTIAL_PROXY_TYPE", "1"))
    # Litellm TTS Configuration
    TTS_SERVICE = os.getenv("TTS_SERVICE")
    TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")
--- a/surfsense_backend/app/connectors/webcrawler_connector.py
+++ b/surfsense_backend/app/connectors/webcrawler_connector.py
@ -14,6 +14,8 @@ from fake_useragent import UserAgent
 from firecrawl import AsyncFirecrawlApp
 from playwright.async_api import async_playwright
 from app.utils.proxy_config import get_playwright_proxy
 logger = logging.getLogger(__name__)
@ -165,9 +167,15 @@ class WebCrawlerConnector:
        ua = UserAgent()
        user_agent = ua.random
        # Use residential proxy if configured
        playwright_proxy = get_playwright_proxy()
        # Use Playwright to fetch the page
        async with async_playwright() as p:
-            browser = await p.chromium.launch(headless=True)
+            launch_kwargs: dict = {"headless": True}
            if playwright_proxy:
                launch_kwargs["proxy"] = playwright_proxy
            browser = await p.chromium.launch(**launch_kwargs)
            context = await browser.new_context(user_agent=user_agent)
            page = await context.new_page()
--- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
@ -15,12 +15,12 @@ logger = logging.getLogger(__name__)
 def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
    """
    Handle greenlet_spawn errors with detailed logging for debugging.
-    
+
    The 'greenlet_spawn has not been called' error occurs when:
    1. SQLAlchemy lazy-loads a relationship outside of an async context
    2. A sync operation is called from an async context (or vice versa)
    3. Session objects are accessed after the session is closed
-    
+
    This helper logs detailed context to help identify the root cause.
    """
    error_str = str(e)
--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@ -52,8 +52,8 @@ def parse_date_flexible(date_str: str) -> datetime:
    # Try ISO format as fallback
    try:
        return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
-    except ValueError:
+    except ValueError as err:
-        raise ValueError(f"Unable to parse date: {date_str}")
+        raise ValueError(f"Unable to parse date: {date_str}") from err
 async def check_duplicate_document_by_hash(
--- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
@ -217,7 +217,7 @@ async def index_notion_pages(
                )
                await task_logger.log_task_failure(
                    log_entry,
-                    f"Failed to get Notion pages: Notion API limitation",
+                    "Failed to get Notion pages: Notion API limitation",
                    f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.",
                    {"error_type": "UnsupportedBlockType", "is_known_limitation": True},
                )
--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@ -138,7 +138,7 @@ async def index_crawled_urls(
                f"No URLs provided for indexing. Connector ID: {connector_id}, "
                f"Connector name: {connector.name}, "
                f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, "
-                f"INITIAL_URLS raw value: {repr(raw_initial_urls)}"
+                f"INITIAL_URLS raw value: {raw_initial_urls!r}"
            )
            await task_logger.log_task_failure(
                log_entry,
--- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py
@ -6,6 +6,8 @@ import logging
 from urllib.parse import parse_qs, urlparse
 import aiohttp
 from fake_useragent import UserAgent
 from requests import Session
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession
 from youtube_transcript_api import YouTubeTranscriptApi
@ -19,6 +21,7 @@ from app.utils.document_converters import (
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from app.utils.proxy_config import get_requests_proxies
 from .base import (
    check_document_by_unique_identifier,
@ -114,9 +117,16 @@ async def add_youtube_video_document(
        }
        oembed_url = "https://www.youtube.com/oembed"
        # Look up residential proxy settings (None when no proxy is configured)
        residential_proxies = get_requests_proxies()
        async with (
            aiohttp.ClientSession() as http_session,
-            http_session.get(oembed_url, params=params) as response,
+            http_session.get(
                oembed_url,
                params=params,
                proxy=residential_proxies["http"] if residential_proxies else None,
            ) as response,
        ):
            video_data = await response.json()
@ -138,7 +148,12 @@ async def add_youtube_video_document(
        )
        try:
-            ytt_api = YouTubeTranscriptApi()
+            ua = UserAgent()
            http_client = Session()
            http_client.headers.update({"User-Agent": ua.random})
            if residential_proxies:
                http_client.proxies.update(residential_proxies)
            ytt_api = YouTubeTranscriptApi(http_client=http_client)
            captions = ytt_api.fetch(video_id)
            # Include complete caption information with timestamps
            transcript_segments = []
--- a/surfsense_backend/app/utils/proxy_config.py
+++ b/surfsense_backend/app/utils/proxy_config.py
@ -0,0 +1,86 @@
 """
 Residential proxy configuration utility.
 Reads proxy credentials from the application Config and provides helper
 functions that return proxy configs in the format expected by different
 HTTP libraries (requests, httpx, aiohttp, Playwright).
 """
 import base64
 import json
 import logging
 from app.config import Config
 logger = logging.getLogger(__name__)
 def _build_password_b64() -> str | None:
    """
    Encode the proxy password, location and type as a base64 JSON blob.

    The blob is the JSON object ``{"p": password, "l": location, "t": type}``
    serialized as UTF-8 and base64-encoded; per the original notes this is the
    password format required by anonymous-proxies.net.

    Returns ``None`` when ``Config.RESIDENTIAL_PROXY_PASSWORD`` is unset or empty.
    """
    raw_password = Config.RESIDENTIAL_PROXY_PASSWORD
    if raw_password:
        # Key order (p, l, t) is kept as-is since it determines the encoded bytes.
        payload = json.dumps(
            {
                "p": raw_password,
                "l": Config.RESIDENTIAL_PROXY_LOCATION,
                "t": Config.RESIDENTIAL_PROXY_TYPE,
            }
        )
        encoded = base64.b64encode(payload.encode("utf-8"))
        return encoded.decode("utf-8")
    return None
 def get_residential_proxy_url() -> str | None:
    """
    Return the fully-formed residential proxy URL, or ``None`` when not
    configured.

    The URL format is::

        http://<username>:<base64_password>@<hostname>/

    The username and the base64 password are percent-encoded before being
    placed in the URL. Standard base64 output may contain ``/``, ``+`` and
    ``=``, and an unescaped ``/`` would terminate the authority component and
    corrupt the URL. HTTP clients decode the userinfo again before building
    the proxy authentication header.

    Returns:
        The proxy URL, or ``None`` if the username, hostname or password is
        missing from the configuration.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    username = Config.RESIDENTIAL_PROXY_USERNAME
    hostname = Config.RESIDENTIAL_PROXY_HOSTNAME
    password_b64 = _build_password_b64()
    if not all([username, hostname, password_b64]):
        return None

    # safe="" so that "/" is escaped too (quote() leaves it untouched by default).
    user_part = quote(username, safe="")
    password_part = quote(password_b64, safe="")
    return f"http://{user_part}:{password_part}@{hostname}/"
 def get_requests_proxies() -> dict[str, str] | None:
    """
    Build a scheme-to-proxy mapping from the residential proxy URL.

    The result maps both ``"http"`` and ``"https"`` to the same proxy URL and
    can be passed to ``requests.Session.proxies``. For ``aiohttp``, whose
    ``proxy=`` kwarg takes a single URL, use one entry (e.g. ``["http"]``).

    Returns ``None`` when no proxy is configured.
    """
    url = get_residential_proxy_url()
    return None if url is None else {"http": url, "https": url}
 def get_playwright_proxy() -> dict[str, str] | None:
    """
    Build the proxy settings dict passed to Playwright's browser launch.

    Shape of the result::

        {"server": "http://host:port", "username": "…", "password": "…"}

    The password entry is the base64 blob from ``_build_password_b64``.
    Returns ``None`` if the username, hostname or password is not configured.
    """
    user = Config.RESIDENTIAL_PROXY_USERNAME
    host = Config.RESIDENTIAL_PROXY_HOSTNAME
    encoded_password = _build_password_b64()

    if user and host and encoded_password:
        return {
            "server": f"http://{host}",
            "username": user,
            "password": encoded_password,
        }
    return None
--- a/surfsense_web/components/assistant-ui/attachment.tsx
+++ b/surfsense_web/components/assistant-ui/attachment.tsx
@ -351,14 +351,14 @@ export const ComposerAddAttachment: FC = () => {
 						<PlusIcon className="aui-attachment-add-icon size-5 stroke-[1.5px]" />
 					</TooltipIconButton>
 				</DropdownMenuTrigger>
-				<DropdownMenuContent align="start" className="w-48 bg-background border-border">
+				<DropdownMenuContent align="start" className="w-72 bg-background border-border">
 					<DropdownMenuItem onSelect={handleChatAttachment} className="cursor-pointer">
 						<Paperclip className="size-4" />
-						<span>Add attachment</span>
+						<span>Add attachment to this chat</span>
 					</DropdownMenuItem>
 					<DropdownMenuItem onClick={handleFileUpload} className="cursor-pointer">
 						<Upload className="size-4" />
-						<span>Upload Documents</span>
+						<span>Upload documents to Search Space</span>
 					</DropdownMenuItem>
 				</DropdownMenuContent>
 			</DropdownMenu>
--- a/surfsense_web/components/pricing/pricing-section.tsx
+++ b/surfsense_web/components/pricing/pricing-section.tsx
@ -12,11 +12,11 @@ const demoPlans = [
 		features: [
 			"Open source on GitHub",
 			"Upload and chat with 300+ pages of content",
-			"Connects with 8 popular sources, like Drive and Notion.",
+			"Connects with 8 popular sources, like Drive and Notion",
 			"Includes limited access to ChatGPT, Claude, and DeepSeek models",
-			"Supports 100+ more LLMs, including Gemini, Llama and many more.",
+			"Supports 100+ more LLMs, including Gemini, Llama and many more",
-			"50+ File extensions supported.",
+			"50+ File extensions supported",
-			"Generate podcasts in seconds.",
+			"Generate podcasts in seconds",
 			"Cross-Browser Extension for dynamic webpages including authenticated content",
 			"Community support on Discord",
 		],
@ -33,8 +33,8 @@ const demoPlans = [
 		billingText: "billed annually",
 		features: [
 			"Everything in Free",
-			"Upload and chat with 5,000+ pages of content",
+			"Upload and chat with 5,000+ pages of content per user",
-			"Connects with 15+ external sources, like Slack and Airtable.",
+			"Connects with 15+ external sources, like Slack and Airtable",
 			"Includes extended access to ChatGPT, Claude, and DeepSeek models",
 			"Collaboration and commenting features",
 			"Shared BYOK (Bring Your Own Key)",
@ -42,7 +42,7 @@ const demoPlans = [
 			"Planned: Centralized billing",
 			"Priority support",
 		],
-		description: "The AIknowledge base for individuals and teams",
+		description: "The AI knowledge base for individuals and teams",
 		buttonText: "Upgrade",
 		href: "/contact",
 		isPopular: true,