Merge pull request #789 from MODSetter/dev

feat: rotating proxy support
This commit is contained in:
Rohan Verma 2026-02-05 20:47:54 -08:00 committed by GitHub
commit ac35f9d674
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 261 additions and 26 deletions

View file

@ -143,6 +143,15 @@ STT_SERVICE=local/base
PAGES_LIMIT=500 PAGES_LIMIT=500
# Residential Proxy Configuration (anonymous-proxies.net)
# Used for web crawling, link previews, and YouTube transcript fetching to avoid IP bans.
# Leave commented out to disable proxying.
# RESIDENTIAL_PROXY_USERNAME=your_proxy_username
# RESIDENTIAL_PROXY_PASSWORD=your_proxy_password
# RESIDENTIAL_PROXY_HOSTNAME=rotating.dnsproxifier.com:31230
# RESIDENTIAL_PROXY_LOCATION=
# RESIDENTIAL_PROXY_TYPE=1
FIRECRAWL_API_KEY=fcr-01J0000000000000000000000 FIRECRAWL_API_KEY=fcr-01J0000000000000000000000
# File Parser Service # File Parser Service

View file

@ -13,8 +13,7 @@ Changes:
from collections.abc import Sequence from collections.abc import Sequence
import sqlalchemy as sa import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM, JSONB, UUID
from sqlalchemy.dialects.postgresql import JSONB, UUID
from alembic import op from alembic import op

View file

@ -17,6 +17,8 @@ from fake_useragent import UserAgent
from langchain_core.tools import tool from langchain_core.tools import tool
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -186,9 +188,15 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
ua = UserAgent() ua = UserAgent()
user_agent = ua.random user_agent = ua.random
# Use residential proxy if configured
playwright_proxy = get_playwright_proxy()
# Use Playwright to fetch the page # Use Playwright to fetch the page
async with async_playwright() as p: async with async_playwright() as p:
browser = await p.chromium.launch(headless=True) launch_kwargs: dict = {"headless": True}
if playwright_proxy:
launch_kwargs["proxy"] = playwright_proxy
browser = await p.chromium.launch(**launch_kwargs)
context = await browser.new_context(user_agent=user_agent) context = await browser.new_context(user_agent=user_agent)
page = await context.new_page() page = await context.new_page()
@ -283,12 +291,16 @@ def create_link_preview_tool():
ua = UserAgent() ua = UserAgent()
user_agent = ua.random user_agent = ua.random
# Use residential proxy if configured
proxy_url = get_residential_proxy_url()
# Use a browser-like User-Agent to fetch Open Graph metadata. # Use a browser-like User-Agent to fetch Open Graph metadata.
# We're only fetching publicly available metadata (title, description, thumbnail) # We're only fetching publicly available metadata (title, description, thumbnail)
# that websites intentionally expose via OG tags for link preview purposes. # that websites intentionally expose via OG tags for link preview purposes.
async with httpx.AsyncClient( async with httpx.AsyncClient(
timeout=10.0, timeout=10.0,
follow_redirects=True, follow_redirects=True,
proxy=proxy_url,
headers={ headers={
"User-Agent": user_agent, "User-Agent": user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",

View file

@ -2,17 +2,26 @@
Web scraping tool for the SurfSense agent. Web scraping tool for the SurfSense agent.
This module provides a tool for scraping and extracting content from webpages This module provides a tool for scraping and extracting content from webpages
using the existing WebCrawlerConnector. The scraped content can be used by using the existing WebCrawlerConnector. For YouTube URLs, it fetches the
the agent to answer questions about web pages. transcript directly via the YouTubeTranscriptApi instead of crawling the page.
""" """
import hashlib import hashlib
import logging
from typing import Any from typing import Any
from urllib.parse import urlparse from urllib.parse import urlparse
import aiohttp
from fake_useragent import UserAgent
from langchain_core.tools import tool from langchain_core.tools import tool
from requests import Session
from youtube_transcript_api import YouTubeTranscriptApi
from app.connectors.webcrawler_connector import WebCrawlerConnector from app.connectors.webcrawler_connector import WebCrawlerConnector
from app.tasks.document_processors.youtube_processor import get_youtube_video_id
from app.utils.proxy_config import get_requests_proxies
logger = logging.getLogger(__name__)
def extract_domain(url: str) -> str: def extract_domain(url: str) -> str:
@ -57,6 +66,89 @@ def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]:
return truncated + "\n\n[Content truncated...]", True return truncated + "\n\n[Content truncated...]", True
async def _scrape_youtube_video(
    url: str, video_id: str, max_length: int
) -> dict[str, Any]:
    """
    Fetch YouTube video metadata and transcript via the YouTubeTranscriptApi.

    Metadata (title, author) comes from YouTube's oEmbed endpoint and the
    transcript from ``YouTubeTranscriptApi``; both requests go through the
    residential proxy when one is configured. Either step may fail without
    failing the whole scrape: missing metadata falls back to placeholder
    values and a missing transcript is reported inside the content.

    Args:
        url: The URL as requested by the caller (echoed in the result).
        video_id: The YouTube video ID extracted from ``url``.
        max_length: Maximum content length passed to ``truncate_content``.

    Returns:
        A result dict in the same shape as the regular scrape_webpage output.
    """
    import asyncio

    scrape_id = generate_scrape_id(url)
    domain = "youtube.com"

    # --- Video metadata via oEmbed ---
    residential_proxies = get_requests_proxies()

    params = {
        "format": "json",
        "url": f"https://www.youtube.com/watch?v={video_id}",
    }
    oembed_url = "https://www.youtube.com/oembed"

    try:
        async with (
            aiohttp.ClientSession() as http_session,
            http_session.get(
                oembed_url,
                params=params,
                # aiohttp takes a single proxy URL rather than a scheme mapping.
                proxy=residential_proxies["http"] if residential_proxies else None,
            ) as response,
        ):
            video_data = await response.json()
    except Exception as e:
        # Metadata is best-effort: record the failure and fall back to defaults.
        logger.warning(
            f"[scrape_webpage] Could not fetch oEmbed metadata for video {video_id}: {e}"
        )
        video_data = {}

    title = video_data.get("title", "YouTube Video")
    author = video_data.get("author_name", "Unknown")

    # --- Transcript via YouTubeTranscriptApi ---
    def _fetch_transcript() -> str:
        """Download the captions and render them as timestamped lines."""
        # The context manager closes the session so pooled connections
        # are released after every scrape.
        with Session() as http_client:
            http_client.headers.update({"User-Agent": UserAgent().random})
            if residential_proxies:
                http_client.proxies.update(residential_proxies)
            captions = YouTubeTranscriptApi(http_client=http_client).fetch(video_id)
            return "\n".join(
                f"[{line.start:.2f}s-{line.start + line.duration:.2f}s] {line.text}"
                for line in captions
            )

    try:
        # The transcript client performs blocking HTTP calls; run it in a
        # worker thread so the event loop stays responsive.
        transcript_text = await asyncio.to_thread(_fetch_transcript)
    except Exception as e:
        logger.warning(f"[scrape_webpage] No transcript for video {video_id}: {e}")
        transcript_text = f"No captions available for this video. Error: {e!s}"

    # Build combined content
    content = f"# {title}\n\n**Author:** {author}\n**Video ID:** {video_id}\n\n## Transcript\n\n{transcript_text}"

    # Truncate if needed
    content, was_truncated = truncate_content(content, max_length)
    word_count = len(content.split())

    description = f"YouTube video by {author}"

    return {
        "id": scrape_id,
        "assetId": url,
        "kind": "article",
        "href": url,
        "title": title,
        "description": description,
        "content": content,
        "domain": domain,
        "word_count": word_count,
        "was_truncated": was_truncated,
        "crawler_type": "youtube_transcript",
        "author": author,
    }
def create_scrape_webpage_tool(firecrawl_api_key: str | None = None): def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
""" """
Factory function to create the scrape_webpage tool. Factory function to create the scrape_webpage tool.
@ -79,7 +171,8 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
Use this tool when the user wants you to read, summarize, or answer Use this tool when the user wants you to read, summarize, or answer
questions about a specific webpage's content. This tool actually questions about a specific webpage's content. This tool actually
fetches and reads the full page content. fetches and reads the full page content. For YouTube video URLs it
fetches the transcript directly instead of crawling the page.
Common triggers: Common triggers:
- "Read this article and summarize it" - "Read this article and summarize it"
@ -114,6 +207,11 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
url = f"https://{url}" url = f"https://{url}"
try: try:
# Check if this is a YouTube URL and use transcript API instead
video_id = get_youtube_video_id(url)
if video_id:
return await _scrape_youtube_video(url, video_id, max_length)
# Create webcrawler connector # Create webcrawler connector
connector = WebCrawlerConnector(firecrawl_api_key=firecrawl_api_key) connector = WebCrawlerConnector(firecrawl_api_key=firecrawl_api_key)
@ -184,7 +282,7 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
except Exception as e: except Exception as e:
error_message = str(e) error_message = str(e)
print(f"[scrape_webpage] Error scraping {url}: {error_message}") logger.error(f"[scrape_webpage] Error scraping {url}: {error_message}")
return { return {
"id": scrape_id, "id": scrape_id,
"assetId": url, "assetId": url,

View file

@ -360,6 +360,14 @@ class Config:
# LlamaCloud API Key # LlamaCloud API Key
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
# Residential Proxy Configuration (anonymous-proxies.net)
# Used for web crawling, link previews, and YouTube transcript fetching to avoid IP bans.
RESIDENTIAL_PROXY_USERNAME = os.getenv("RESIDENTIAL_PROXY_USERNAME")
RESIDENTIAL_PROXY_PASSWORD = os.getenv("RESIDENTIAL_PROXY_PASSWORD")
RESIDENTIAL_PROXY_HOSTNAME = os.getenv("RESIDENTIAL_PROXY_HOSTNAME")
RESIDENTIAL_PROXY_LOCATION = os.getenv("RESIDENTIAL_PROXY_LOCATION", "")
RESIDENTIAL_PROXY_TYPE = int(os.getenv("RESIDENTIAL_PROXY_TYPE", "1"))
# Litellm TTS Configuration # Litellm TTS Configuration
TTS_SERVICE = os.getenv("TTS_SERVICE") TTS_SERVICE = os.getenv("TTS_SERVICE")
TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE") TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")

View file

@ -14,6 +14,8 @@ from fake_useragent import UserAgent
from firecrawl import AsyncFirecrawlApp from firecrawl import AsyncFirecrawlApp
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
from app.utils.proxy_config import get_playwright_proxy
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -165,9 +167,15 @@ class WebCrawlerConnector:
ua = UserAgent() ua = UserAgent()
user_agent = ua.random user_agent = ua.random
# Use residential proxy if configured
playwright_proxy = get_playwright_proxy()
# Use Playwright to fetch the page # Use Playwright to fetch the page
async with async_playwright() as p: async with async_playwright() as p:
browser = await p.chromium.launch(headless=True) launch_kwargs: dict = {"headless": True}
if playwright_proxy:
launch_kwargs["proxy"] = playwright_proxy
browser = await p.chromium.launch(**launch_kwargs)
context = await browser.new_context(user_agent=user_agent) context = await browser.new_context(user_agent=user_agent)
page = await context.new_page() page = await context.new_page()

View file

@ -15,12 +15,12 @@ logger = logging.getLogger(__name__)
def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None: def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
""" """
Handle greenlet_spawn errors with detailed logging for debugging. Handle greenlet_spawn errors with detailed logging for debugging.
The 'greenlet_spawn has not been called' error occurs when: The 'greenlet_spawn has not been called' error occurs when:
1. SQLAlchemy lazy-loads a relationship outside of an async context 1. SQLAlchemy lazy-loads a relationship outside of an async context
2. A sync operation is called from an async context (or vice versa) 2. A sync operation is called from an async context (or vice versa)
3. Session objects are accessed after the session is closed 3. Session objects are accessed after the session is closed
This helper logs detailed context to help identify the root cause. This helper logs detailed context to help identify the root cause.
""" """
error_str = str(e) error_str = str(e)

View file

@ -52,8 +52,8 @@ def parse_date_flexible(date_str: str) -> datetime:
# Try ISO format as fallback # Try ISO format as fallback
try: try:
return datetime.fromisoformat(date_str.replace("Z", "+00:00")) return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
except ValueError: except ValueError as err:
raise ValueError(f"Unable to parse date: {date_str}") raise ValueError(f"Unable to parse date: {date_str}") from err
async def check_duplicate_document_by_hash( async def check_duplicate_document_by_hash(

View file

@ -217,7 +217,7 @@ async def index_notion_pages(
) )
await task_logger.log_task_failure( await task_logger.log_task_failure(
log_entry, log_entry,
f"Failed to get Notion pages: Notion API limitation", "Failed to get Notion pages: Notion API limitation",
f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.", f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.",
{"error_type": "UnsupportedBlockType", "is_known_limitation": True}, {"error_type": "UnsupportedBlockType", "is_known_limitation": True},
) )

View file

@ -138,7 +138,7 @@ async def index_crawled_urls(
f"No URLs provided for indexing. Connector ID: {connector_id}, " f"No URLs provided for indexing. Connector ID: {connector_id}, "
f"Connector name: {connector.name}, " f"Connector name: {connector.name}, "
f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, " f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, "
f"INITIAL_URLS raw value: {repr(raw_initial_urls)}" f"INITIAL_URLS raw value: {raw_initial_urls!r}"
) )
await task_logger.log_task_failure( await task_logger.log_task_failure(
log_entry, log_entry,

View file

@ -6,6 +6,8 @@ import logging
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
import aiohttp import aiohttp
from fake_useragent import UserAgent
from requests import Session
from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from youtube_transcript_api import YouTubeTranscriptApi from youtube_transcript_api import YouTubeTranscriptApi
@ -19,6 +21,7 @@ from app.utils.document_converters import (
generate_document_summary, generate_document_summary,
generate_unique_identifier_hash, generate_unique_identifier_hash,
) )
from app.utils.proxy_config import get_requests_proxies
from .base import ( from .base import (
check_document_by_unique_identifier, check_document_by_unique_identifier,
@ -114,9 +117,16 @@ async def add_youtube_video_document(
} }
oembed_url = "https://www.youtube.com/oembed" oembed_url = "https://www.youtube.com/oembed"
# Build residential proxy URL (if configured)
residential_proxies = get_requests_proxies()
async with ( async with (
aiohttp.ClientSession() as http_session, aiohttp.ClientSession() as http_session,
http_session.get(oembed_url, params=params) as response, http_session.get(
oembed_url,
params=params,
proxy=residential_proxies["http"] if residential_proxies else None,
) as response,
): ):
video_data = await response.json() video_data = await response.json()
@ -138,7 +148,12 @@ async def add_youtube_video_document(
) )
try: try:
ytt_api = YouTubeTranscriptApi() ua = UserAgent()
http_client = Session()
http_client.headers.update({"User-Agent": ua.random})
if residential_proxies:
http_client.proxies.update(residential_proxies)
ytt_api = YouTubeTranscriptApi(http_client=http_client)
captions = ytt_api.fetch(video_id) captions = ytt_api.fetch(video_id)
# Include complete caption information with timestamps # Include complete caption information with timestamps
transcript_segments = [] transcript_segments = []

View file

@ -0,0 +1,86 @@
"""
Residential proxy configuration utility.
Reads proxy credentials from the application Config and provides helper
functions that return proxy configs in the format expected by different
HTTP libraries (requests, httpx, aiohttp, Playwright).
"""
import base64
import json
import logging
from app.config import Config
logger = logging.getLogger(__name__)
def _build_password_b64() -> str | None:
    """
    Encode the proxy password payload for anonymous-proxies.net.

    The payload is a JSON object holding the password (``p``), the location
    (``l``) and the proxy type (``t``) read from ``Config``; it is serialized
    and then base64-encoded.

    Returns ``None`` when no proxy password is configured.
    """
    raw_password = Config.RESIDENTIAL_PROXY_PASSWORD
    if raw_password:
        payload = json.dumps(
            {
                "p": raw_password,
                "l": Config.RESIDENTIAL_PROXY_LOCATION,
                "t": Config.RESIDENTIAL_PROXY_TYPE,
            }
        )
        return base64.b64encode(payload.encode("utf-8")).decode("utf-8")
    return None
def get_residential_proxy_url() -> str | None:
    """
    Return the fully-formed residential proxy URL, or ``None`` when not
    configured.

    The URL format is::

        http://<username>:<base64_password>@<hostname>/

    The username and the base64 password are percent-encoded before being
    placed in the URL. Standard base64 output can contain ``/``, ``+`` and
    ``=``, and an unescaped ``/`` (or an ``@`` / ``:`` in the username) would
    end the authority component early and yield a malformed proxy URL.
    """
    from urllib.parse import quote

    username = Config.RESIDENTIAL_PROXY_USERNAME
    hostname = Config.RESIDENTIAL_PROXY_HOSTNAME
    password_b64 = _build_password_b64()

    if not all([username, hostname, password_b64]):
        return None

    # safe="" so that every reserved character, including "/", is escaped.
    userinfo = f"{quote(username, safe='')}:{quote(password_b64, safe='')}"
    return f"http://{userinfo}@{hostname}/"
def get_requests_proxies() -> dict[str, str] | None:
    """
    Return a ``{"http": <proxy_url>, "https": <proxy_url>}`` mapping, or
    ``None`` when the residential proxy is not configured.

    The mapping can be passed to ``requests.Session.proxies``. Callers that
    use ``aiohttp`` pass one entry of it as the ``proxy=`` kwarg; both
    entries hold the same proxy URL.
    """
    url = get_residential_proxy_url()
    return None if url is None else {scheme: url for scheme in ("http", "https")}
def get_playwright_proxy() -> dict[str, str] | None:
    """
    Build the proxy settings dict handed to Playwright's browser launch.

    The result has the form::

        {"server": "http://<hostname>", "username": "<username>", "password": "<base64_password>"}

    ``None`` is returned when the username, hostname or password is missing
    from the configuration.
    """
    proxy_user = Config.RESIDENTIAL_PROXY_USERNAME
    proxy_host = Config.RESIDENTIAL_PROXY_HOSTNAME
    encoded_password = _build_password_b64()

    if proxy_user and proxy_host and encoded_password:
        return {
            "server": f"http://{proxy_host}",
            "username": proxy_user,
            "password": encoded_password,
        }
    return None

View file

@ -351,14 +351,14 @@ export const ComposerAddAttachment: FC = () => {
<PlusIcon className="aui-attachment-add-icon size-5 stroke-[1.5px]" /> <PlusIcon className="aui-attachment-add-icon size-5 stroke-[1.5px]" />
</TooltipIconButton> </TooltipIconButton>
</DropdownMenuTrigger> </DropdownMenuTrigger>
<DropdownMenuContent align="start" className="w-48 bg-background border-border"> <DropdownMenuContent align="start" className="w-72 bg-background border-border">
<DropdownMenuItem onSelect={handleChatAttachment} className="cursor-pointer"> <DropdownMenuItem onSelect={handleChatAttachment} className="cursor-pointer">
<Paperclip className="size-4" /> <Paperclip className="size-4" />
<span>Add attachment</span> <span>Add attachment to this chat</span>
</DropdownMenuItem> </DropdownMenuItem>
<DropdownMenuItem onClick={handleFileUpload} className="cursor-pointer"> <DropdownMenuItem onClick={handleFileUpload} className="cursor-pointer">
<Upload className="size-4" /> <Upload className="size-4" />
<span>Upload Documents</span> <span>Upload documents to Search Space</span>
</DropdownMenuItem> </DropdownMenuItem>
</DropdownMenuContent> </DropdownMenuContent>
</DropdownMenu> </DropdownMenu>

View file

@ -12,11 +12,11 @@ const demoPlans = [
features: [ features: [
"Open source on GitHub", "Open source on GitHub",
"Upload and chat with 300+ pages of content", "Upload and chat with 300+ pages of content",
"Connects with 8 popular sources, like Drive and Notion.", "Connects with 8 popular sources, like Drive and Notion",
"Includes limited access to ChatGPT, Claude, and DeepSeek models", "Includes limited access to ChatGPT, Claude, and DeepSeek models",
"Supports 100+ more LLMs, including Gemini, Llama and many more.", "Supports 100+ more LLMs, including Gemini, Llama and many more",
"50+ File extensions supported.", "50+ File extensions supported",
"Generate podcasts in seconds.", "Generate podcasts in seconds",
"Cross-Browser Extension for dynamic webpages including authenticated content", "Cross-Browser Extension for dynamic webpages including authenticated content",
"Community support on Discord", "Community support on Discord",
], ],
@ -33,8 +33,8 @@ const demoPlans = [
billingText: "billed annually", billingText: "billed annually",
features: [ features: [
"Everything in Free", "Everything in Free",
"Upload and chat with 5,000+ pages of content", "Upload and chat with 5,000+ pages of content per user",
"Connects with 15+ external sources, like Slack and Airtable.", "Connects with 15+ external sources, like Slack and Airtable",
"Includes extended access to ChatGPT, Claude, and DeepSeek models", "Includes extended access to ChatGPT, Claude, and DeepSeek models",
"Collaboration and commenting features", "Collaboration and commenting features",
"Shared BYOK (Bring Your Own Key)", "Shared BYOK (Bring Your Own Key)",
@ -42,7 +42,7 @@ const demoPlans = [
"Planned: Centralized billing", "Planned: Centralized billing",
"Priority support", "Priority support",
], ],
description: "The AIknowledge base for individuals and teams", description: "The AI knowledge base for individuals and teams",
buttonText: "Upgrade", buttonText: "Upgrade",
href: "/contact", href: "/contact",
isPopular: true, isPopular: true,