From 3cb3723d92058b3e0f5e581545d4b572db953cb0 Mon Sep 17 00:00:00 2001 From: Eric Lammertsma Date: Thu, 5 Feb 2026 17:27:12 -0500 Subject: [PATCH 1/4] Updated attachment UI dropdown item labels for clarity. --- surfsense_web/components/assistant-ui/attachment.tsx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/surfsense_web/components/assistant-ui/attachment.tsx b/surfsense_web/components/assistant-ui/attachment.tsx index c1a147046..3a64d3a6c 100644 --- a/surfsense_web/components/assistant-ui/attachment.tsx +++ b/surfsense_web/components/assistant-ui/attachment.tsx @@ -351,14 +351,14 @@ export const ComposerAddAttachment: FC = () => { - + - Add attachment + Add attachment to this chat - Upload Documents + Upload documents to Search Space From 16692385b491035fd14cc77578b40b255a6dbace Mon Sep 17 00:00:00 2001 From: Eric Lammertsma Date: Thu, 5 Feb 2026 23:09:05 -0500 Subject: [PATCH 2/4] [FIX] typo in pricing section --- surfsense_web/components/pricing/pricing-section.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surfsense_web/components/pricing/pricing-section.tsx b/surfsense_web/components/pricing/pricing-section.tsx index 117be15ec..cf27b8842 100644 --- a/surfsense_web/components/pricing/pricing-section.tsx +++ b/surfsense_web/components/pricing/pricing-section.tsx @@ -42,7 +42,7 @@ const demoPlans = [ "Planned: Centralized billing", "Priority support", ], - description: "The AIknowledge base for individuals and teams", + description: "The AI knowledge base for individuals and teams", buttonText: "Upgrade", href: "/contact", isPopular: true, From 585cf972776fd184b9cd7f3ed0b7f2024cb1cea8 Mon Sep 17 00:00:00 2001 From: Eric Lammertsma Date: Thu, 5 Feb 2026 23:10:50 -0500 Subject: [PATCH 3/4] fix: remove trailing periods from feature descriptions in pricing section --- surfsense_web/components/pricing/pricing-section.tsx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/surfsense_web/components/pricing/pricing-section.tsx b/surfsense_web/components/pricing/pricing-section.tsx index cf27b8842..7528aeb0b 100644 --- a/surfsense_web/components/pricing/pricing-section.tsx +++ b/surfsense_web/components/pricing/pricing-section.tsx @@ -12,11 +12,11 @@ const demoPlans = [ features: [ "Open source on GitHub", "Upload and chat with 300+ pages of content", - "Connects with 8 popular sources, like Drive and Notion.", + "Connects with 8 popular sources, like Drive and Notion", "Includes limited access to ChatGPT, Claude, and DeepSeek models", - "Supports 100+ more LLMs, including Gemini, Llama and many more.", - "50+ File extensions supported.", - "Generate podcasts in seconds.", + "Supports 100+ more LLMs, including Gemini, Llama and many more", + "50+ File extensions supported", + "Generate podcasts in seconds", "Cross-Browser Extension for dynamic webpages including authenticated content", "Community support on Discord", ], @@ -33,8 +33,8 @@ const demoPlans = [ billingText: "billed annually", features: [ "Everything in Free", - "Upload and chat with 5,000+ pages of content", - "Connects with 15+ external sources, like Slack and Airtable.", + "Upload and chat with 5,000+ pages of content per user", + "Connects with 15+ external sources, like Slack and Airtable", "Includes extended access to ChatGPT, Claude, and DeepSeek models", "Collaboration and commenting features", "Shared BYOK (Bring Your Own Key)", From 1511c26ef59e10d12a49884aaa5a6dfcf6cfd5f4 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Thu, 5 Feb 2026 20:44:13 -0800 Subject: [PATCH 4/4] feat: add residential proxy configuration for web crawling and YouTube transcript fetching --- surfsense_backend/.env.example | 9 ++ .../93_add_image_generations_table.py | 3 +- .../app/agents/new_chat/tools/link_preview.py | 14 ++- .../agents/new_chat/tools/scrape_webpage.py | 106 +++++++++++++++++- surfsense_backend/app/config/__init__.py | 8 ++ 
.../app/connectors/webcrawler_connector.py | 10 +- .../app/tasks/celery_tasks/connector_tasks.py | 4 +- .../app/tasks/connector_indexers/base.py | 4 +- .../connector_indexers/notion_indexer.py | 2 +- .../connector_indexers/webcrawler_indexer.py | 2 +- .../document_processors/youtube_processor.py | 19 +++- surfsense_backend/app/utils/proxy_config.py | 86 ++++++++++++++ 12 files changed, 251 insertions(+), 16 deletions(-) create mode 100644 surfsense_backend/app/utils/proxy_config.py diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 628329917..c4facc84d 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -143,6 +143,15 @@ STT_SERVICE=local/base PAGES_LIMIT=500 +# Residential Proxy Configuration (anonymous-proxies.net) +# Used for web crawling, link previews, and YouTube transcript fetching to avoid IP bans. +# Leave commented out to disable proxying. +# RESIDENTIAL_PROXY_USERNAME=your_proxy_username +# RESIDENTIAL_PROXY_PASSWORD=your_proxy_password +# RESIDENTIAL_PROXY_HOSTNAME=rotating.dnsproxifier.com:31230 +# RESIDENTIAL_PROXY_LOCATION= +# RESIDENTIAL_PROXY_TYPE=1 + FIRECRAWL_API_KEY=fcr-01J0000000000000000000000 # File Parser Service diff --git a/surfsense_backend/alembic/versions/93_add_image_generations_table.py b/surfsense_backend/alembic/versions/93_add_image_generations_table.py index eba9d7c86..d2cee5af4 100644 --- a/surfsense_backend/alembic/versions/93_add_image_generations_table.py +++ b/surfsense_backend/alembic/versions/93_add_image_generations_table.py @@ -13,8 +13,7 @@ Changes: from collections.abc import Sequence import sqlalchemy as sa -from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM -from sqlalchemy.dialects.postgresql import JSONB, UUID +from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM, JSONB, UUID from alembic import op diff --git a/surfsense_backend/app/agents/new_chat/tools/link_preview.py b/surfsense_backend/app/agents/new_chat/tools/link_preview.py index 
3e2070a14..bf7b4af38 100644 --- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py +++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py @@ -17,6 +17,8 @@ from fake_useragent import UserAgent from langchain_core.tools import tool from playwright.async_api import async_playwright +from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url + logger = logging.getLogger(__name__) @@ -186,9 +188,15 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None: ua = UserAgent() user_agent = ua.random + # Use residential proxy if configured + playwright_proxy = get_playwright_proxy() + # Use Playwright to fetch the page async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) + launch_kwargs: dict = {"headless": True} + if playwright_proxy: + launch_kwargs["proxy"] = playwright_proxy + browser = await p.chromium.launch(**launch_kwargs) context = await browser.new_context(user_agent=user_agent) page = await context.new_page() @@ -283,12 +291,16 @@ def create_link_preview_tool(): ua = UserAgent() user_agent = ua.random + # Use residential proxy if configured + proxy_url = get_residential_proxy_url() + # Use a browser-like User-Agent to fetch Open Graph metadata. # We're only fetching publicly available metadata (title, description, thumbnail) # that websites intentionally expose via OG tags for link preview purposes. 
async with httpx.AsyncClient( timeout=10.0, follow_redirects=True, + proxy=proxy_url, headers={ "User-Agent": user_agent, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", diff --git a/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py b/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py index 24f15edba..e3c58c857 100644 --- a/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py +++ b/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py @@ -2,17 +2,26 @@ Web scraping tool for the SurfSense agent. This module provides a tool for scraping and extracting content from webpages -using the existing WebCrawlerConnector. The scraped content can be used by -the agent to answer questions about web pages. +using the existing WebCrawlerConnector. For YouTube URLs, it fetches the +transcript directly via the YouTubeTranscriptApi instead of crawling the page. """ import hashlib +import logging from typing import Any from urllib.parse import urlparse +import aiohttp +from fake_useragent import UserAgent from langchain_core.tools import tool +from requests import Session +from youtube_transcript_api import YouTubeTranscriptApi from app.connectors.webcrawler_connector import WebCrawlerConnector +from app.tasks.document_processors.youtube_processor import get_youtube_video_id +from app.utils.proxy_config import get_requests_proxies + +logger = logging.getLogger(__name__) def extract_domain(url: str) -> str: @@ -57,6 +66,89 @@ def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]: return truncated + "\n\n[Content truncated...]", True +async def _scrape_youtube_video( + url: str, video_id: str, max_length: int +) -> dict[str, Any]: + """ + Fetch YouTube video metadata and transcript via the YouTubeTranscriptApi. + + Returns a result dict in the same shape as the regular scrape_webpage output. 
+ """ + scrape_id = generate_scrape_id(url) + domain = "youtube.com" + + # --- Video metadata via oEmbed --- + residential_proxies = get_requests_proxies() + + params = { + "format": "json", + "url": f"https://www.youtube.com/watch?v={video_id}", + } + oembed_url = "https://www.youtube.com/oembed" + + try: + async with ( + aiohttp.ClientSession() as http_session, + http_session.get( + oembed_url, + params=params, + proxy=residential_proxies["http"] if residential_proxies else None, + ) as response, + ): + video_data = await response.json() + except Exception: + video_data = {} + + title = video_data.get("title", "YouTube Video") + author = video_data.get("author_name", "Unknown") + + # --- Transcript via YouTubeTranscriptApi --- + try: + ua = UserAgent() + http_client = Session() + http_client.headers.update({"User-Agent": ua.random}) + if residential_proxies: + http_client.proxies.update(residential_proxies) + ytt_api = YouTubeTranscriptApi(http_client=http_client) + captions = ytt_api.fetch(video_id) + + transcript_segments = [] + for line in captions: + start_time = line.start + duration = line.duration + text = line.text + timestamp = f"[{start_time:.2f}s-{start_time + duration:.2f}s]" + transcript_segments.append(f"{timestamp} {text}") + transcript_text = "\n".join(transcript_segments) + except Exception as e: + logger.warning(f"[scrape_webpage] No transcript for video {video_id}: {e}") + transcript_text = f"No captions available for this video. 
Error: {e!s}" + + # Build combined content + content = f"# {title}\n\n**Author:** {author}\n**Video ID:** {video_id}\n\n## Transcript\n\n{transcript_text}" + + # Truncate if needed + content, was_truncated = truncate_content(content, max_length) + word_count = len(content.split()) + + description = f"YouTube video by {author}" + + return { + "id": scrape_id, + "assetId": url, + "kind": "article", + "href": url, + "title": title, + "description": description, + "content": content, + "domain": domain, + "word_count": word_count, + "was_truncated": was_truncated, + "crawler_type": "youtube_transcript", + "author": author, + } + + def create_scrape_webpage_tool(firecrawl_api_key: str | None = None): """ Factory function to create the scrape_webpage tool. @@ -79,7 +171,8 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None): Use this tool when the user wants you to read, summarize, or answer questions about a specific webpage's content. This tool actually - fetches and reads the full page content. + fetches and reads the full page content. For YouTube video URLs it + fetches the transcript directly instead of crawling the page. 
Common triggers: - "Read this article and summarize it" @@ -114,6 +207,11 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None): url = f"https://{url}" try: + # Check if this is a YouTube URL and use transcript API instead + video_id = get_youtube_video_id(url) + if video_id: + return await _scrape_youtube_video(url, video_id, max_length) + # Create webcrawler connector connector = WebCrawlerConnector(firecrawl_api_key=firecrawl_api_key) @@ -184,7 +282,7 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None): except Exception as e: error_message = str(e) - print(f"[scrape_webpage] Error scraping {url}: {error_message}") + logger.error(f"[scrape_webpage] Error scraping {url}: {error_message}") return { "id": scrape_id, "assetId": url, diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index bb299e583..e102c414d 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -360,6 +360,14 @@ class Config: # LlamaCloud API Key LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") + # Residential Proxy Configuration (anonymous-proxies.net) + # Used for web crawling and YouTube transcript fetching to avoid IP bans. 
+ RESIDENTIAL_PROXY_USERNAME = os.getenv("RESIDENTIAL_PROXY_USERNAME") + RESIDENTIAL_PROXY_PASSWORD = os.getenv("RESIDENTIAL_PROXY_PASSWORD") + RESIDENTIAL_PROXY_HOSTNAME = os.getenv("RESIDENTIAL_PROXY_HOSTNAME") + RESIDENTIAL_PROXY_LOCATION = os.getenv("RESIDENTIAL_PROXY_LOCATION", "") + RESIDENTIAL_PROXY_TYPE = int(os.getenv("RESIDENTIAL_PROXY_TYPE", "1")) + # Litellm TTS Configuration TTS_SERVICE = os.getenv("TTS_SERVICE") TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE") diff --git a/surfsense_backend/app/connectors/webcrawler_connector.py b/surfsense_backend/app/connectors/webcrawler_connector.py index 5d6ea98c8..a5fb33e1a 100644 --- a/surfsense_backend/app/connectors/webcrawler_connector.py +++ b/surfsense_backend/app/connectors/webcrawler_connector.py @@ -14,6 +14,8 @@ from fake_useragent import UserAgent from firecrawl import AsyncFirecrawlApp from playwright.async_api import async_playwright +from app.utils.proxy_config import get_playwright_proxy + logger = logging.getLogger(__name__) @@ -165,9 +167,15 @@ class WebCrawlerConnector: ua = UserAgent() user_agent = ua.random + # Use residential proxy if configured + playwright_proxy = get_playwright_proxy() + # Use Playwright to fetch the page async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) + launch_kwargs: dict = {"headless": True} + if playwright_proxy: + launch_kwargs["proxy"] = playwright_proxy + browser = await p.chromium.launch(**launch_kwargs) context = await browser.new_context(user_agent=user_agent) page = await context.new_page() diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index 760651589..a35528a93 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -15,12 +15,12 @@ logger = logging.getLogger(__name__) def _handle_greenlet_error(e: Exception, task_name: str, connector_id: 
int) -> None: """ Handle greenlet_spawn errors with detailed logging for debugging. - + The 'greenlet_spawn has not been called' error occurs when: 1. SQLAlchemy lazy-loads a relationship outside of an async context 2. A sync operation is called from an async context (or vice versa) 3. Session objects are accessed after the session is closed - + This helper logs detailed context to help identify the root cause. """ error_str = str(e) diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py index b801b67d6..5dc438b9b 100644 --- a/surfsense_backend/app/tasks/connector_indexers/base.py +++ b/surfsense_backend/app/tasks/connector_indexers/base.py @@ -52,8 +52,8 @@ def parse_date_flexible(date_str: str) -> datetime: # Try ISO format as fallback try: return datetime.fromisoformat(date_str.replace("Z", "+00:00")) - except ValueError: - raise ValueError(f"Unable to parse date: {date_str}") + except ValueError as err: + raise ValueError(f"Unable to parse date: {date_str}") from err async def check_duplicate_document_by_hash( diff --git a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py index ed300898c..ba494bb9f 100644 --- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py @@ -217,7 +217,7 @@ async def index_notion_pages( ) await task_logger.log_task_failure( log_entry, - f"Failed to get Notion pages: Notion API limitation", + "Failed to get Notion pages: Notion API limitation", f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.", {"error_type": "UnsupportedBlockType", "is_known_limitation": True}, ) diff --git a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py index a2f0898ba..82ef8870d 
100644 --- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py @@ -138,7 +138,7 @@ async def index_crawled_urls( f"No URLs provided for indexing. Connector ID: {connector_id}, " f"Connector name: {connector.name}, " f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, " - f"INITIAL_URLS raw value: {repr(raw_initial_urls)}" + f"INITIAL_URLS raw value: {raw_initial_urls!r}" ) await task_logger.log_task_failure( log_entry, diff --git a/surfsense_backend/app/tasks/document_processors/youtube_processor.py b/surfsense_backend/app/tasks/document_processors/youtube_processor.py index 7251fb22f..e5599e78b 100644 --- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py +++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py @@ -6,6 +6,8 @@ import logging from urllib.parse import parse_qs, urlparse import aiohttp +from fake_useragent import UserAgent +from requests import Session from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from youtube_transcript_api import YouTubeTranscriptApi @@ -19,6 +21,7 @@ from app.utils.document_converters import ( generate_document_summary, generate_unique_identifier_hash, ) +from app.utils.proxy_config import get_requests_proxies from .base import ( check_document_by_unique_identifier, @@ -114,9 +117,16 @@ async def add_youtube_video_document( } oembed_url = "https://www.youtube.com/oembed" + # Build residential proxy URL (if configured) + residential_proxies = get_requests_proxies() + async with ( aiohttp.ClientSession() as http_session, - http_session.get(oembed_url, params=params) as response, + http_session.get( + oembed_url, + params=params, + proxy=residential_proxies["http"] if residential_proxies else None, + ) as response, ): video_data = await response.json() @@ -138,7 +148,12 @@ async def add_youtube_video_document( ) try: - ytt_api = 
YouTubeTranscriptApi() + ua = UserAgent() + http_client = Session() + http_client.headers.update({"User-Agent": ua.random}) + if residential_proxies: + http_client.proxies.update(residential_proxies) + ytt_api = YouTubeTranscriptApi(http_client=http_client) captions = ytt_api.fetch(video_id) # Include complete caption information with timestamps transcript_segments = [] diff --git a/surfsense_backend/app/utils/proxy_config.py b/surfsense_backend/app/utils/proxy_config.py new file mode 100644 index 000000000..de377e366 --- /dev/null +++ b/surfsense_backend/app/utils/proxy_config.py @@ -0,0 +1,86 @@ +""" +Residential proxy configuration utility. + +Reads proxy credentials from the application Config and provides helper +functions that return proxy configs in the format expected by different +HTTP libraries (requests, httpx, aiohttp, Playwright). +""" + +import base64 +import json +import logging + +from app.config import Config + +logger = logging.getLogger(__name__) + + +def _build_password_b64() -> str | None: + """ + Build the base64-encoded password dict required by anonymous-proxies.net. + + Returns ``None`` when the required config values are not set. + """ + password = Config.RESIDENTIAL_PROXY_PASSWORD + if not password: + return None + + password_dict = { + "p": password, + "l": Config.RESIDENTIAL_PROXY_LOCATION, + "t": Config.RESIDENTIAL_PROXY_TYPE, + } + return base64.b64encode(json.dumps(password_dict).encode("utf-8")).decode("utf-8") + + +def get_residential_proxy_url() -> str | None: + """ + Return the fully-formed residential proxy URL, or ``None`` when not + configured. 
+ + The URL format is:: + + http://<username>:<base64_password>@<hostname>/ + """ + username = Config.RESIDENTIAL_PROXY_USERNAME + hostname = Config.RESIDENTIAL_PROXY_HOSTNAME + password_b64 = _build_password_b64() + + if not all([username, hostname, password_b64]): + return None + + return f"http://{username}:{password_b64}@{hostname}/" + + +def get_requests_proxies() -> dict[str, str] | None: + """ + Return a ``{"http": …, "https": …}`` dict suitable for + ``requests.Session.proxies`` and ``aiohttp`` ``proxy=`` kwarg, + or ``None`` when not configured. + """ + proxy_url = get_residential_proxy_url() + if proxy_url is None: + return None + return {"http": proxy_url, "https": proxy_url} + + +def get_playwright_proxy() -> dict[str, str] | None: + """ + Return a Playwright-compatible proxy dict:: + + {"server": "http://host:port", "username": "…", "password": "…"} + + or ``None`` when not configured. + """ + username = Config.RESIDENTIAL_PROXY_USERNAME + hostname = Config.RESIDENTIAL_PROXY_HOSTNAME + password_b64 = _build_password_b64() + + if not all([username, hostname, password_b64]): + return None + + return { + "server": f"http://{hostname}", + "username": username, + "password": password_b64, + }