diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 628329917..c4facc84d 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -143,6 +143,15 @@ STT_SERVICE=local/base PAGES_LIMIT=500 +# Residential Proxy Configuration (anonymous-proxies.net) +# Used for web crawling, link previews, and YouTube transcript fetching to avoid IP bans. +# Leave commented out to disable proxying. +# RESIDENTIAL_PROXY_USERNAME=your_proxy_username +# RESIDENTIAL_PROXY_PASSWORD=your_proxy_password +# RESIDENTIAL_PROXY_HOSTNAME=rotating.dnsproxifier.com:31230 +# RESIDENTIAL_PROXY_LOCATION= +# RESIDENTIAL_PROXY_TYPE=1 + FIRECRAWL_API_KEY=fcr-01J0000000000000000000000 # File Parser Service diff --git a/surfsense_backend/alembic/versions/93_add_image_generations_table.py b/surfsense_backend/alembic/versions/93_add_image_generations_table.py index eba9d7c86..d2cee5af4 100644 --- a/surfsense_backend/alembic/versions/93_add_image_generations_table.py +++ b/surfsense_backend/alembic/versions/93_add_image_generations_table.py @@ -13,8 +13,7 @@ Changes: from collections.abc import Sequence import sqlalchemy as sa -from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM -from sqlalchemy.dialects.postgresql import JSONB, UUID +from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM, JSONB, UUID from alembic import op diff --git a/surfsense_backend/app/agents/new_chat/tools/link_preview.py b/surfsense_backend/app/agents/new_chat/tools/link_preview.py index 3e2070a14..bf7b4af38 100644 --- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py +++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py @@ -17,6 +17,8 @@ from fake_useragent import UserAgent from langchain_core.tools import tool from playwright.async_api import async_playwright +from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url + logger = logging.getLogger(__name__) @@ -186,9 +188,15 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None: ua = UserAgent() user_agent = ua.random + # Use residential proxy if configured + playwright_proxy = get_playwright_proxy() + # Use Playwright to fetch the page async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) + launch_kwargs: dict = {"headless": True} + if playwright_proxy: + launch_kwargs["proxy"] = playwright_proxy + browser = await p.chromium.launch(**launch_kwargs) context = await browser.new_context(user_agent=user_agent) page = await context.new_page() @@ -283,12 +291,16 @@ def create_link_preview_tool(): ua = UserAgent() user_agent = ua.random + # Use residential proxy if configured + proxy_url = get_residential_proxy_url() + # Use a browser-like User-Agent to fetch Open Graph metadata. # We're only fetching publicly available metadata (title, description, thumbnail) # that websites intentionally expose via OG tags for link preview purposes. async with httpx.AsyncClient( timeout=10.0, follow_redirects=True, + proxy=proxy_url, headers={ "User-Agent": user_agent, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", diff --git a/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py b/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py index 24f15edba..e3c58c857 100644 --- a/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py +++ b/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py @@ -2,17 +2,26 @@ Web scraping tool for the SurfSense agent. 
This module provides a tool for scraping and extracting content from webpages -using the existing WebCrawlerConnector. The scraped content can be used by -the agent to answer questions about web pages. +using the existing WebCrawlerConnector. For YouTube URLs, it fetches the +transcript directly via the YouTubeTranscriptApi instead of crawling the page. """ import hashlib +import logging from typing import Any from urllib.parse import urlparse +import aiohttp +from fake_useragent import UserAgent from langchain_core.tools import tool +from requests import Session +from youtube_transcript_api import YouTubeTranscriptApi from app.connectors.webcrawler_connector import WebCrawlerConnector +from app.tasks.document_processors.youtube_processor import get_youtube_video_id +from app.utils.proxy_config import get_requests_proxies + +logger = logging.getLogger(__name__) def extract_domain(url: str) -> str: @@ -57,6 +66,89 @@ def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]: return truncated + "\n\n[Content truncated...]", True +async def _scrape_youtube_video( + url: str, video_id: str, max_length: int +) -> dict[str, Any]: + """ + Fetch YouTube video metadata and transcript via the YouTubeTranscriptApi. + + Returns a result dict in the same shape as the regular scrape_webpage output. + """ + scrape_id = generate_scrape_id(url) + domain = "youtube.com" + + # --- Video metadata via oEmbed --- + residential_proxies = get_requests_proxies() + + params = { + "format": "json", + "url": f"https://www.youtube.com/watch?v={video_id}", + } + oembed_url = "https://www.youtube.com/oembed" + + try: + async with ( + aiohttp.ClientSession() as http_session, + http_session.get( + oembed_url, + params=params, + proxy=residential_proxies["http"] if residential_proxies else None, + ) as response, + ): + video_data = await response.json() + except Exception: + video_data = {} + + title = video_data.get("title", "YouTube Video") + author = video_data.get("author_name", "Unknown") + + # --- Transcript via YouTubeTranscriptApi --- + try: + ua = UserAgent() + http_client = Session() + http_client.headers.update({"User-Agent": ua.random}) + if residential_proxies: + http_client.proxies.update(residential_proxies) + ytt_api = YouTubeTranscriptApi(http_client=http_client) + captions = ytt_api.fetch(video_id) + + transcript_segments = [] + for line in captions: + start_time = line.start + duration = line.duration + text = line.text + timestamp = f"[{start_time:.2f}s-{start_time + duration:.2f}s]" + transcript_segments.append(f"{timestamp} {text}") + transcript_text = "\n".join(transcript_segments) + except Exception as e: + logger.warning(f"[scrape_webpage] No transcript for video {video_id}: {e}") + transcript_text = f"No captions available for this video. 
Error: {e!s}" + + # Build combined content + content = f"# {title}\n\n**Author:** {author}\n**Video ID:** {video_id}\n\n## Transcript\n\n{transcript_text}" + + # Truncate if needed + content, was_truncated = truncate_content(content, max_length) + word_count = len(content.split()) + + description = f"YouTube video by {author}" + + return { + "id": scrape_id, + "assetId": url, + "kind": "article", + "href": url, + "title": title, + "description": description, + "content": content, + "domain": domain, + "word_count": word_count, + "was_truncated": was_truncated, + "crawler_type": "youtube_transcript", + "author": author, + } + + def create_scrape_webpage_tool(firecrawl_api_key: str | None = None): """ Factory function to create the scrape_webpage tool. @@ -79,7 +171,8 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None): Use this tool when the user wants you to read, summarize, or answer questions about a specific webpage's content. This tool actually - fetches and reads the full page content. + fetches and reads the full page content. For YouTube video URLs it + fetches the transcript directly instead of crawling the page. Common triggers: - "Read this article and summarize it" @@ -114,6 +207,11 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None): url = f"https://{url}" try: + # Check if this is a YouTube URL and use transcript API instead + video_id = get_youtube_video_id(url) + if video_id: + return await _scrape_youtube_video(url, video_id, max_length) + # Create webcrawler connector connector = WebCrawlerConnector(firecrawl_api_key=firecrawl_api_key) @@ -184,7 +282,7 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None): except Exception as e: error_message = str(e) - print(f"[scrape_webpage] Error scraping {url}: {error_message}") + logger.error(f"[scrape_webpage] Error scraping {url}: {error_message}") return { "id": scrape_id, "assetId": url, diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index bb299e583..e102c414d 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -360,6 +360,14 @@ class Config: # LlamaCloud API Key LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") + # Residential Proxy Configuration (anonymous-proxies.net) + # Used for web crawling and YouTube transcript fetching to avoid IP bans. 
+ RESIDENTIAL_PROXY_USERNAME = os.getenv("RESIDENTIAL_PROXY_USERNAME") + RESIDENTIAL_PROXY_PASSWORD = os.getenv("RESIDENTIAL_PROXY_PASSWORD") + RESIDENTIAL_PROXY_HOSTNAME = os.getenv("RESIDENTIAL_PROXY_HOSTNAME") + RESIDENTIAL_PROXY_LOCATION = os.getenv("RESIDENTIAL_PROXY_LOCATION", "") + RESIDENTIAL_PROXY_TYPE = int(os.getenv("RESIDENTIAL_PROXY_TYPE", "1")) + # Litellm TTS Configuration TTS_SERVICE = os.getenv("TTS_SERVICE") TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE") diff --git a/surfsense_backend/app/connectors/webcrawler_connector.py b/surfsense_backend/app/connectors/webcrawler_connector.py index 5d6ea98c8..a5fb33e1a 100644 --- a/surfsense_backend/app/connectors/webcrawler_connector.py +++ b/surfsense_backend/app/connectors/webcrawler_connector.py @@ -14,6 +14,8 @@ from fake_useragent import UserAgent from firecrawl import AsyncFirecrawlApp from playwright.async_api import async_playwright +from app.utils.proxy_config import get_playwright_proxy + logger = logging.getLogger(__name__) @@ -165,9 +167,15 @@ class WebCrawlerConnector: ua = UserAgent() user_agent = ua.random + # Use residential proxy if configured + playwright_proxy = get_playwright_proxy() + # Use Playwright to fetch the page async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) + launch_kwargs: dict = {"headless": True} + if playwright_proxy: + launch_kwargs["proxy"] = playwright_proxy + browser = await p.chromium.launch(**launch_kwargs) context = await browser.new_context(user_agent=user_agent) page = await context.new_page() diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index 760651589..a35528a93 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -15,12 +15,12 @@ logger = logging.getLogger(__name__) def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None: """ Handle greenlet_spawn errors with detailed logging for debugging. - + The 'greenlet_spawn has not been called' error occurs when: 1. SQLAlchemy lazy-loads a relationship outside of an async context 2. A sync operation is called from an async context (or vice versa) 3. Session objects are accessed after the session is closed - + This helper logs detailed context to help identify the root cause. 
""" error_str = str(e) diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py index b801b67d6..5dc438b9b 100644 --- a/surfsense_backend/app/tasks/connector_indexers/base.py +++ b/surfsense_backend/app/tasks/connector_indexers/base.py @@ -52,8 +52,8 @@ def parse_date_flexible(date_str: str) -> datetime: # Try ISO format as fallback try: return datetime.fromisoformat(date_str.replace("Z", "+00:00")) - except ValueError: - raise ValueError(f"Unable to parse date: {date_str}") + except ValueError as err: + raise ValueError(f"Unable to parse date: {date_str}") from err async def check_duplicate_document_by_hash( diff --git a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py index ed300898c..ba494bb9f 100644 --- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py @@ -217,7 +217,7 @@ async def index_notion_pages( ) await task_logger.log_task_failure( log_entry, - f"Failed to get Notion pages: Notion API limitation", + "Failed to get Notion pages: Notion API limitation", f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.", {"error_type": "UnsupportedBlockType", "is_known_limitation": True}, ) diff --git a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py index a2f0898ba..82ef8870d 100644 --- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py @@ -138,7 +138,7 @@ async def index_crawled_urls( f"No URLs provided for indexing. 
Connector ID: {connector_id}, " f"Connector name: {connector.name}, " f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, " - f"INITIAL_URLS raw value: {repr(raw_initial_urls)}" + f"INITIAL_URLS raw value: {raw_initial_urls!r}" ) await task_logger.log_task_failure( log_entry, diff --git a/surfsense_backend/app/tasks/document_processors/youtube_processor.py b/surfsense_backend/app/tasks/document_processors/youtube_processor.py index 7251fb22f..e5599e78b 100644 --- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py +++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py @@ -6,6 +6,8 @@ import logging from urllib.parse import parse_qs, urlparse import aiohttp +from fake_useragent import UserAgent +from requests import Session from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from youtube_transcript_api import YouTubeTranscriptApi @@ -19,6 +21,7 @@ from app.utils.document_converters import ( generate_document_summary, generate_unique_identifier_hash, ) +from app.utils.proxy_config import get_requests_proxies from .base import ( check_document_by_unique_identifier, @@ -114,9 +117,16 @@ async def add_youtube_video_document( } oembed_url = "https://www.youtube.com/oembed" + # Build residential proxy URL (if configured) + residential_proxies = get_requests_proxies() + async with ( aiohttp.ClientSession() as http_session, - http_session.get(oembed_url, params=params) as response, + http_session.get( + oembed_url, + params=params, + proxy=residential_proxies["http"] if residential_proxies else None, + ) as response, ): video_data = await response.json() @@ -138,7 +148,12 @@ async def add_youtube_video_document( ) try: - ytt_api = YouTubeTranscriptApi() + ua = UserAgent() + http_client = Session() + http_client.headers.update({"User-Agent": ua.random}) + if residential_proxies: + http_client.proxies.update(residential_proxies) + ytt_api = YouTubeTranscriptApi(http_client=http_client) captions = ytt_api.fetch(video_id) # Include complete caption information with timestamps transcript_segments = [] diff --git a/surfsense_backend/app/utils/proxy_config.py b/surfsense_backend/app/utils/proxy_config.py new file mode 100644 index 000000000..de377e366 --- /dev/null +++ b/surfsense_backend/app/utils/proxy_config.py @@ -0,0 +1,86 @@ +""" +Residential proxy configuration utility. + +Reads proxy credentials from the application Config and provides helper +functions that return proxy configs in the format expected by different +HTTP libraries (requests, httpx, aiohttp, Playwright). +""" + +import base64 +import json +import logging + +from app.config import Config + +logger = logging.getLogger(__name__) + + +def _build_password_b64() -> str | None: + """ + Build the base64-encoded password dict required by anonymous-proxies.net. + + Returns ``None`` when the required config values are not set. + """ + password = Config.RESIDENTIAL_PROXY_PASSWORD + if not password: + return None + + password_dict = { + "p": password, + "l": Config.RESIDENTIAL_PROXY_LOCATION, + "t": Config.RESIDENTIAL_PROXY_TYPE, + } + return base64.b64encode(json.dumps(password_dict).encode("utf-8")).decode("utf-8") + + +def get_residential_proxy_url() -> str | None: + """ + Return the fully-formed residential proxy URL, or ``None`` when not + configured. 
+
+    The URL format is::
+
+        http://<username>:<password_b64>@<hostname>/
+    """
+    username = Config.RESIDENTIAL_PROXY_USERNAME
+    hostname = Config.RESIDENTIAL_PROXY_HOSTNAME
+    password_b64 = _build_password_b64()
+
+    if not all([username, hostname, password_b64]):
+        return None
+
+    return f"http://{username}:{password_b64}@{hostname}/"
+
+
+def get_requests_proxies() -> dict[str, str] | None:
+    """
+    Return a ``{"http": …, "https": …}`` dict suitable for
+    ``requests.Session.proxies`` (callers pass the ``"http"`` entry as the
+    ``aiohttp`` ``proxy=`` argument), or ``None`` when not configured.
+    """
+    proxy_url = get_residential_proxy_url()
+    if proxy_url is None:
+        return None
+    return {"http": proxy_url, "https": proxy_url}
+
+
+def get_playwright_proxy() -> dict[str, str] | None:
+    """
+    Return a Playwright-compatible proxy dict::
+
+        {"server": "http://host:port", "username": "…", "password": "…"}
+
+    or ``None`` when not configured.
+    """
+    username = Config.RESIDENTIAL_PROXY_USERNAME
+    hostname = Config.RESIDENTIAL_PROXY_HOSTNAME
+    password_b64 = _build_password_b64()
+
+    if not all([username, hostname, password_b64]):
+        return None
+
+    return {
+        "server": f"http://{hostname}",
+        "username": username,
+        "password": password_b64,
+    }
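
For reference, a minimal sketch of the proxy URL these helpers build. The credential values are hypothetical placeholders (only the hostname mirrors the commented example in ``.env.example``), and the snippet re-derives the ``_build_password_b64`` / ``get_residential_proxy_url`` logic standalone rather than importing the application ``Config``::

    import base64
    import json

    # Hypothetical stand-ins for the RESIDENTIAL_PROXY_* settings.
    username = "your_proxy_username"
    password = "your_proxy_password"
    hostname = "rotating.dnsproxifier.com:31230"
    location = ""   # RESIDENTIAL_PROXY_LOCATION
    proxy_type = 1  # RESIDENTIAL_PROXY_TYPE

    # anonymous-proxies.net expects the password slot of the URL to carry a
    # base64-encoded JSON payload holding the real password, location, and type.
    password_b64 = base64.b64encode(
        json.dumps({"p": password, "l": location, "t": proxy_type}).encode("utf-8")
    ).decode("utf-8")

    proxy_url = f"http://{username}:{password_b64}@{hostname}/"

    # requests / aiohttp form (get_requests_proxies):
    requests_proxies = {"http": proxy_url, "https": proxy_url}

    # Playwright form (get_playwright_proxy):
    playwright_proxy = {
        "server": f"http://{hostname}",
        "username": username,
        "password": password_b64,
    }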