feat: add residential proxy configuration for web crawling and YouTube transcript fetching

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-02-05 20:44:13 -08:00
parent eaa0060def
commit 1511c26ef5
12 changed files with 251 additions and 16 deletions

View file

@ -143,6 +143,15 @@ STT_SERVICE=local/base
PAGES_LIMIT=500
# Residential Proxy Configuration (anonymous-proxies.net)
# Used for web crawling, link previews, and YouTube transcript fetching to avoid IP bans.
# Leave commented out to disable proxying.
# RESIDENTIAL_PROXY_USERNAME=your_proxy_username
# RESIDENTIAL_PROXY_PASSWORD=your_proxy_password
# RESIDENTIAL_PROXY_HOSTNAME=rotating.dnsproxifier.com:31230
# RESIDENTIAL_PROXY_LOCATION=
# RESIDENTIAL_PROXY_TYPE=1
FIRECRAWL_API_KEY=fcr-01J0000000000000000000000
# File Parser Service

View file

@ -13,8 +13,7 @@ Changes:
from collections.abc import Sequence
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM
from sqlalchemy.dialects.postgresql import JSONB, UUID
from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM, JSONB, UUID
from alembic import op

View file

@ -17,6 +17,8 @@ from fake_useragent import UserAgent
from langchain_core.tools import tool
from playwright.async_api import async_playwright
from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
logger = logging.getLogger(__name__)
@ -186,9 +188,15 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
ua = UserAgent()
user_agent = ua.random
# Use residential proxy if configured
playwright_proxy = get_playwright_proxy()
# Use Playwright to fetch the page
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
launch_kwargs: dict = {"headless": True}
if playwright_proxy:
launch_kwargs["proxy"] = playwright_proxy
browser = await p.chromium.launch(**launch_kwargs)
context = await browser.new_context(user_agent=user_agent)
page = await context.new_page()
@ -283,12 +291,16 @@ def create_link_preview_tool():
ua = UserAgent()
user_agent = ua.random
# Use residential proxy if configured
proxy_url = get_residential_proxy_url()
# Use a browser-like User-Agent to fetch Open Graph metadata.
# We're only fetching publicly available metadata (title, description, thumbnail)
# that websites intentionally expose via OG tags for link preview purposes.
async with httpx.AsyncClient(
timeout=10.0,
follow_redirects=True,
proxy=proxy_url,
headers={
"User-Agent": user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",

View file

@ -2,17 +2,26 @@
Web scraping tool for the SurfSense agent.
This module provides a tool for scraping and extracting content from webpages
using the existing WebCrawlerConnector. The scraped content can be used by
the agent to answer questions about web pages.
using the existing WebCrawlerConnector. For YouTube URLs, it fetches the
transcript directly via the YouTubeTranscriptApi instead of crawling the page.
"""
import hashlib
import logging
from typing import Any
from urllib.parse import urlparse
import aiohttp
from fake_useragent import UserAgent
from langchain_core.tools import tool
from requests import Session
from youtube_transcript_api import YouTubeTranscriptApi
from app.connectors.webcrawler_connector import WebCrawlerConnector
from app.tasks.document_processors.youtube_processor import get_youtube_video_id
from app.utils.proxy_config import get_requests_proxies
logger = logging.getLogger(__name__)
def extract_domain(url: str) -> str:
@ -57,6 +66,89 @@ def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]:
return truncated + "\n\n[Content truncated...]", True
def _fetch_youtube_transcript_text(
    video_id: str, proxies: dict[str, str] | None
) -> str:
    """
    Fetch a video's captions and render them as timestamped lines.

    This helper is synchronous (it drives a ``requests`` session), so async
    callers should run it in a worker thread.

    Args:
        video_id: The YouTube video ID to fetch captions for.
        proxies: Optional ``requests``-style proxy mapping applied to the
            session; skipped when falsy.

    Returns:
        One ``[<start>s-<end>s] <text>`` line per caption segment, joined
        with newlines.

    Raises:
        Whatever the transcript fetch raises; nothing is caught here.
    """
    # The context manager closes the session on exit instead of leaving it
    # open after every call.
    with Session() as http_client:
        http_client.headers.update({"User-Agent": UserAgent().random})
        if proxies:
            http_client.proxies.update(proxies)

        captions = YouTubeTranscriptApi(http_client=http_client).fetch(video_id)
        return "\n".join(
            f"[{line.start:.2f}s-{line.start + line.duration:.2f}s] {line.text}"
            for line in captions
        )


async def _scrape_youtube_video(
    url: str, video_id: str, max_length: int
) -> dict[str, Any]:
    """
    Fetch YouTube video metadata and transcript via the YouTubeTranscriptApi.

    Metadata (title, author) comes from YouTube's oEmbed endpoint and is
    best-effort: on failure the placeholders "YouTube Video" / "Unknown" are
    used. A transcript failure is also non-fatal; the error text is embedded
    in the content instead.

    Args:
        url: The original URL, used for the scrape id, ``assetId`` and ``href``.
        video_id: The YouTube video ID extracted from ``url``.
        max_length: Maximum content length passed to ``truncate_content``.

    Returns:
        A result dict in the same shape as the regular scrape_webpage output.
    """
    # Function-scope import so the block does not rely on a module-level
    # ``import asyncio`` being present.
    import asyncio

    scrape_id = generate_scrape_id(url)
    domain = "youtube.com"

    # --- Video metadata via oEmbed ---
    residential_proxies = get_requests_proxies()

    params = {
        "format": "json",
        "url": f"https://www.youtube.com/watch?v={video_id}",
    }
    oembed_url = "https://www.youtube.com/oembed"

    try:
        async with (
            aiohttp.ClientSession() as http_session,
            http_session.get(
                oembed_url,
                params=params,
                proxy=residential_proxies["http"] if residential_proxies else None,
            ) as response,
        ):
            video_data = await response.json()
    except Exception as e:
        # Metadata is optional, so keep going, but leave a trace of the
        # failure rather than swallowing it silently.
        logger.warning(
            f"[scrape_webpage] Could not fetch oEmbed metadata for video {video_id}: {e}"
        )
        video_data = {}

    # Guard against a JSON payload that is not an object (e.g. null or a list).
    if not isinstance(video_data, dict):
        video_data = {}

    title = video_data.get("title", "YouTube Video")
    author = video_data.get("author_name", "Unknown")

    # --- Transcript via YouTubeTranscriptApi ---
    try:
        # The transcript client does blocking HTTP through ``requests``; run
        # it in a worker thread so the event loop stays responsive.
        transcript_text = await asyncio.to_thread(
            _fetch_youtube_transcript_text, video_id, residential_proxies
        )
    except Exception as e:
        logger.warning(f"[scrape_webpage] No transcript for video {video_id}: {e}")
        transcript_text = f"No captions available for this video. Error: {e!s}"

    # Build combined content
    content = f"# {title}\n\n**Author:** {author}\n**Video ID:** {video_id}\n\n## Transcript\n\n{transcript_text}"

    # Truncate if needed
    content, was_truncated = truncate_content(content, max_length)
    word_count = len(content.split())

    description = f"YouTube video by {author}"

    return {
        "id": scrape_id,
        "assetId": url,
        "kind": "article",
        "href": url,
        "title": title,
        "description": description,
        "content": content,
        "domain": domain,
        "word_count": word_count,
        "was_truncated": was_truncated,
        "crawler_type": "youtube_transcript",
        "author": author,
    }
def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
"""
Factory function to create the scrape_webpage tool.
@ -79,7 +171,8 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
Use this tool when the user wants you to read, summarize, or answer
questions about a specific webpage's content. This tool actually
fetches and reads the full page content.
fetches and reads the full page content. For YouTube video URLs it
fetches the transcript directly instead of crawling the page.
Common triggers:
- "Read this article and summarize it"
@ -114,6 +207,11 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
url = f"https://{url}"
try:
# Check if this is a YouTube URL and use transcript API instead
video_id = get_youtube_video_id(url)
if video_id:
return await _scrape_youtube_video(url, video_id, max_length)
# Create webcrawler connector
connector = WebCrawlerConnector(firecrawl_api_key=firecrawl_api_key)
@ -184,7 +282,7 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
except Exception as e:
error_message = str(e)
print(f"[scrape_webpage] Error scraping {url}: {error_message}")
logger.error(f"[scrape_webpage] Error scraping {url}: {error_message}")
return {
"id": scrape_id,
"assetId": url,

View file

@ -360,6 +360,14 @@ class Config:
# LlamaCloud API Key
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
# Residential Proxy Configuration (anonymous-proxies.net)
# Used for web crawling and YouTube transcript fetching to avoid IP bans.
RESIDENTIAL_PROXY_USERNAME = os.getenv("RESIDENTIAL_PROXY_USERNAME")
RESIDENTIAL_PROXY_PASSWORD = os.getenv("RESIDENTIAL_PROXY_PASSWORD")
RESIDENTIAL_PROXY_HOSTNAME = os.getenv("RESIDENTIAL_PROXY_HOSTNAME")
RESIDENTIAL_PROXY_LOCATION = os.getenv("RESIDENTIAL_PROXY_LOCATION", "")
RESIDENTIAL_PROXY_TYPE = int(os.getenv("RESIDENTIAL_PROXY_TYPE", "1"))
# Litellm TTS Configuration
TTS_SERVICE = os.getenv("TTS_SERVICE")
TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")

View file

@ -14,6 +14,8 @@ from fake_useragent import UserAgent
from firecrawl import AsyncFirecrawlApp
from playwright.async_api import async_playwright
from app.utils.proxy_config import get_playwright_proxy
logger = logging.getLogger(__name__)
@ -165,9 +167,15 @@ class WebCrawlerConnector:
ua = UserAgent()
user_agent = ua.random
# Use residential proxy if configured
playwright_proxy = get_playwright_proxy()
# Use Playwright to fetch the page
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
launch_kwargs: dict = {"headless": True}
if playwright_proxy:
launch_kwargs["proxy"] = playwright_proxy
browser = await p.chromium.launch(**launch_kwargs)
context = await browser.new_context(user_agent=user_agent)
page = await context.new_page()

View file

@ -15,12 +15,12 @@ logger = logging.getLogger(__name__)
def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
"""
Handle greenlet_spawn errors with detailed logging for debugging.
The 'greenlet_spawn has not been called' error occurs when:
1. SQLAlchemy lazy-loads a relationship outside of an async context
2. A sync operation is called from an async context (or vice versa)
3. Session objects are accessed after the session is closed
This helper logs detailed context to help identify the root cause.
"""
error_str = str(e)

View file

@ -52,8 +52,8 @@ def parse_date_flexible(date_str: str) -> datetime:
# Try ISO format as fallback
try:
return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
except ValueError:
raise ValueError(f"Unable to parse date: {date_str}")
except ValueError as err:
raise ValueError(f"Unable to parse date: {date_str}") from err
async def check_duplicate_document_by_hash(

View file

@ -217,7 +217,7 @@ async def index_notion_pages(
)
await task_logger.log_task_failure(
log_entry,
f"Failed to get Notion pages: Notion API limitation",
"Failed to get Notion pages: Notion API limitation",
f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.",
{"error_type": "UnsupportedBlockType", "is_known_limitation": True},
)

View file

@ -138,7 +138,7 @@ async def index_crawled_urls(
f"No URLs provided for indexing. Connector ID: {connector_id}, "
f"Connector name: {connector.name}, "
f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, "
f"INITIAL_URLS raw value: {repr(raw_initial_urls)}"
f"INITIAL_URLS raw value: {raw_initial_urls!r}"
)
await task_logger.log_task_failure(
log_entry,

View file

@ -6,6 +6,8 @@ import logging
from urllib.parse import parse_qs, urlparse
import aiohttp
from fake_useragent import UserAgent
from requests import Session
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from youtube_transcript_api import YouTubeTranscriptApi
@ -19,6 +21,7 @@ from app.utils.document_converters import (
generate_document_summary,
generate_unique_identifier_hash,
)
from app.utils.proxy_config import get_requests_proxies
from .base import (
check_document_by_unique_identifier,
@ -114,9 +117,16 @@ async def add_youtube_video_document(
}
oembed_url = "https://www.youtube.com/oembed"
# Build residential proxy URL (if configured)
residential_proxies = get_requests_proxies()
async with (
aiohttp.ClientSession() as http_session,
http_session.get(oembed_url, params=params) as response,
http_session.get(
oembed_url,
params=params,
proxy=residential_proxies["http"] if residential_proxies else None,
) as response,
):
video_data = await response.json()
@ -138,7 +148,12 @@ async def add_youtube_video_document(
)
try:
ytt_api = YouTubeTranscriptApi()
ua = UserAgent()
http_client = Session()
http_client.headers.update({"User-Agent": ua.random})
if residential_proxies:
http_client.proxies.update(residential_proxies)
ytt_api = YouTubeTranscriptApi(http_client=http_client)
captions = ytt_api.fetch(video_id)
# Include complete caption information with timestamps
transcript_segments = []

View file

@ -0,0 +1,86 @@
"""
Residential proxy configuration utility.
Reads proxy credentials from the application Config and provides helper
functions that return proxy configs in the format expected by different
HTTP libraries (requests, httpx, aiohttp, Playwright).
"""
import base64
import json
import logging
from app.config import Config
logger = logging.getLogger(__name__)
def _build_password_b64() -> str | None:
    """
    Encode the proxy password payload required by anonymous-proxies.net.

    The payload is a JSON object carrying the configured password (``p``),
    location (``l``) and proxy type (``t``); it is serialised and then
    base64-encoded.

    Returns ``None`` when no proxy password is configured.
    """
    secret = Config.RESIDENTIAL_PROXY_PASSWORD
    if secret:
        payload = json.dumps(
            {
                "p": secret,
                "l": Config.RESIDENTIAL_PROXY_LOCATION,
                "t": Config.RESIDENTIAL_PROXY_TYPE,
            }
        )
        return base64.b64encode(payload.encode("utf-8")).decode("utf-8")
    return None
def get_residential_proxy_url() -> str | None:
    """
    Return the fully-formed residential proxy URL, or ``None`` when not
    configured.

    The URL format is::

        http://<username>:<base64_password>@<hostname>/

    The username and base64 password are percent-encoded before being
    embedded. Standard base64 output may contain ``/``, ``+`` and ``=``, and
    an unescaped ``/`` in the userinfo part would end the URL authority early
    and produce a malformed proxy URL.
    """
    # Function-scope import so the block does not rely on a module-level
    # ``urllib.parse`` import being present.
    from urllib.parse import quote

    username = Config.RESIDENTIAL_PROXY_USERNAME
    hostname = Config.RESIDENTIAL_PROXY_HOSTNAME
    password_b64 = _build_password_b64()

    if not all([username, hostname, password_b64]):
        return None

    # safe="" escapes every reserved character, "/" included. HTTP clients
    # are expected to percent-decode the userinfo when they build the proxy
    # credentials, so the proxy still receives the original values.
    user_part = quote(username, safe="")
    password_part = quote(password_b64, safe="")
    return f"http://{user_part}:{password_part}@{hostname}/"
def get_requests_proxies() -> dict[str, str] | None:
    """
    Build the scheme-to-proxy mapping for the residential proxy.

    The result has the form ``{"http": <proxy_url>, "https": <proxy_url>}``,
    with both keys pointing at the URL from ``get_residential_proxy_url()``.
    It is intended for ``requests.Session.proxies``; a single entry can be
    passed as the ``aiohttp`` ``proxy=`` kwarg.

    Returns ``None`` when the proxy is not configured.
    """
    url = get_residential_proxy_url()
    return None if url is None else dict.fromkeys(("http", "https"), url)
def get_playwright_proxy() -> dict[str, str] | None:
    """
    Build the proxy settings dict handed to Playwright's browser launch.

    Shape::

        {
            "server": "http://<hostname>",
            "username": "<username>",
            "password": "<base64_password>",
        }

    Returns ``None`` unless the username, hostname and password are all
    configured.
    """
    user = Config.RESIDENTIAL_PROXY_USERNAME
    host = Config.RESIDENTIAL_PROXY_HOSTNAME
    encoded_password = _build_password_b64()

    if not (user and host and encoded_password):
        return None

    return {
        "server": f"http://{host}",
        "username": user,
        "password": encoded_password,
    }