mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
feat: add residential proxy configuration for web crawling and YouTube transcript fetching
This commit is contained in:
parent
eaa0060def
commit
1511c26ef5
12 changed files with 251 additions and 16 deletions
|
|
@ -143,6 +143,15 @@ STT_SERVICE=local/base
|
|||
PAGES_LIMIT=500
|
||||
|
||||
|
||||
# Residential Proxy Configuration (anonymous-proxies.net)
|
||||
# Used for web crawling, link previews, and YouTube transcript fetching to avoid IP bans.
|
||||
# Leave commented out to disable proxying.
|
||||
# RESIDENTIAL_PROXY_USERNAME=your_proxy_username
|
||||
# RESIDENTIAL_PROXY_PASSWORD=your_proxy_password
|
||||
# RESIDENTIAL_PROXY_HOSTNAME=rotating.dnsproxifier.com:31230
|
||||
# RESIDENTIAL_PROXY_LOCATION=
|
||||
# RESIDENTIAL_PROXY_TYPE=1
|
||||
|
||||
FIRECRAWL_API_KEY=fcr-01J0000000000000000000000
|
||||
|
||||
# File Parser Service
|
||||
|
|
|
|||
|
|
@ -13,8 +13,7 @@ Changes:
|
|||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM
|
||||
from sqlalchemy.dialects.postgresql import JSONB, UUID
|
||||
from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM, JSONB, UUID
|
||||
|
||||
from alembic import op
|
||||
|
||||
|
|
|
|||
|
|
@ -17,6 +17,8 @@ from fake_useragent import UserAgent
|
|||
from langchain_core.tools import tool
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
|
@ -186,9 +188,15 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
|
|||
ua = UserAgent()
|
||||
user_agent = ua.random
|
||||
|
||||
# Use residential proxy if configured
|
||||
playwright_proxy = get_playwright_proxy()
|
||||
|
||||
# Use Playwright to fetch the page
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
launch_kwargs: dict = {"headless": True}
|
||||
if playwright_proxy:
|
||||
launch_kwargs["proxy"] = playwright_proxy
|
||||
browser = await p.chromium.launch(**launch_kwargs)
|
||||
context = await browser.new_context(user_agent=user_agent)
|
||||
page = await context.new_page()
|
||||
|
||||
|
|
@ -283,12 +291,16 @@ def create_link_preview_tool():
|
|||
ua = UserAgent()
|
||||
user_agent = ua.random
|
||||
|
||||
# Use residential proxy if configured
|
||||
proxy_url = get_residential_proxy_url()
|
||||
|
||||
# Use a browser-like User-Agent to fetch Open Graph metadata.
|
||||
# We're only fetching publicly available metadata (title, description, thumbnail)
|
||||
# that websites intentionally expose via OG tags for link preview purposes.
|
||||
async with httpx.AsyncClient(
|
||||
timeout=10.0,
|
||||
follow_redirects=True,
|
||||
proxy=proxy_url,
|
||||
headers={
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
||||
|
|
|
|||
|
|
@ -2,17 +2,26 @@
|
|||
Web scraping tool for the SurfSense agent.
|
||||
|
||||
This module provides a tool for scraping and extracting content from webpages
|
||||
using the existing WebCrawlerConnector. The scraped content can be used by
|
||||
the agent to answer questions about web pages.
|
||||
using the existing WebCrawlerConnector. For YouTube URLs, it fetches the
|
||||
transcript directly via the YouTubeTranscriptApi instead of crawling the page.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
from fake_useragent import UserAgent
|
||||
from langchain_core.tools import tool
|
||||
from requests import Session
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
from app.connectors.webcrawler_connector import WebCrawlerConnector
|
||||
from app.tasks.document_processors.youtube_processor import get_youtube_video_id
|
||||
from app.utils.proxy_config import get_requests_proxies
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_domain(url: str) -> str:
|
||||
|
|
@ -57,6 +66,89 @@ def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]:
|
|||
return truncated + "\n\n[Content truncated...]", True
|
||||
|
||||
|
||||
async def _scrape_youtube_video(
|
||||
url: str, video_id: str, max_length: int
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Fetch YouTube video metadata and transcript via the YouTubeTranscriptApi.
|
||||
|
||||
Returns a result dict in the same shape as the regular scrape_webpage output.
|
||||
"""
|
||||
scrape_id = generate_scrape_id(url)
|
||||
domain = "youtube.com"
|
||||
|
||||
# --- Video metadata via oEmbed ---
|
||||
residential_proxies = get_requests_proxies()
|
||||
|
||||
params = {
|
||||
"format": "json",
|
||||
"url": f"https://www.youtube.com/watch?v={video_id}",
|
||||
}
|
||||
oembed_url = "https://www.youtube.com/oembed"
|
||||
|
||||
try:
|
||||
async with (
|
||||
aiohttp.ClientSession() as http_session,
|
||||
http_session.get(
|
||||
oembed_url,
|
||||
params=params,
|
||||
proxy=residential_proxies["http"] if residential_proxies else None,
|
||||
) as response,
|
||||
):
|
||||
video_data = await response.json()
|
||||
except Exception:
|
||||
video_data = {}
|
||||
|
||||
title = video_data.get("title", "YouTube Video")
|
||||
author = video_data.get("author_name", "Unknown")
|
||||
|
||||
# --- Transcript via YouTubeTranscriptApi ---
|
||||
try:
|
||||
ua = UserAgent()
|
||||
http_client = Session()
|
||||
http_client.headers.update({"User-Agent": ua.random})
|
||||
if residential_proxies:
|
||||
http_client.proxies.update(residential_proxies)
|
||||
ytt_api = YouTubeTranscriptApi(http_client=http_client)
|
||||
captions = ytt_api.fetch(video_id)
|
||||
|
||||
transcript_segments = []
|
||||
for line in captions:
|
||||
start_time = line.start
|
||||
duration = line.duration
|
||||
text = line.text
|
||||
timestamp = f"[{start_time:.2f}s-{start_time + duration:.2f}s]"
|
||||
transcript_segments.append(f"{timestamp} {text}")
|
||||
transcript_text = "\n".join(transcript_segments)
|
||||
except Exception as e:
|
||||
logger.warning(f"[scrape_webpage] No transcript for video {video_id}: {e}")
|
||||
transcript_text = f"No captions available for this video. Error: {e!s}"
|
||||
|
||||
# Build combined content
|
||||
content = f"# {title}\n\n**Author:** {author}\n**Video ID:** {video_id}\n\n## Transcript\n\n{transcript_text}"
|
||||
|
||||
# Truncate if needed
|
||||
content, was_truncated = truncate_content(content, max_length)
|
||||
word_count = len(content.split())
|
||||
|
||||
description = f"YouTube video by {author}"
|
||||
|
||||
return {
|
||||
"id": scrape_id,
|
||||
"assetId": url,
|
||||
"kind": "article",
|
||||
"href": url,
|
||||
"title": title,
|
||||
"description": description,
|
||||
"content": content,
|
||||
"domain": domain,
|
||||
"word_count": word_count,
|
||||
"was_truncated": was_truncated,
|
||||
"crawler_type": "youtube_transcript",
|
||||
"author": author,
|
||||
}
|
||||
|
||||
|
||||
def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
|
||||
"""
|
||||
Factory function to create the scrape_webpage tool.
|
||||
|
|
@ -79,7 +171,8 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
|
|||
|
||||
Use this tool when the user wants you to read, summarize, or answer
|
||||
questions about a specific webpage's content. This tool actually
|
||||
fetches and reads the full page content.
|
||||
fetches and reads the full page content. For YouTube video URLs it
|
||||
fetches the transcript directly instead of crawling the page.
|
||||
|
||||
Common triggers:
|
||||
- "Read this article and summarize it"
|
||||
|
|
@ -114,6 +207,11 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
|
|||
url = f"https://{url}"
|
||||
|
||||
try:
|
||||
# Check if this is a YouTube URL and use transcript API instead
|
||||
video_id = get_youtube_video_id(url)
|
||||
if video_id:
|
||||
return await _scrape_youtube_video(url, video_id, max_length)
|
||||
|
||||
# Create webcrawler connector
|
||||
connector = WebCrawlerConnector(firecrawl_api_key=firecrawl_api_key)
|
||||
|
||||
|
|
@ -184,7 +282,7 @@ def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
|
|||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
print(f"[scrape_webpage] Error scraping {url}: {error_message}")
|
||||
logger.error(f"[scrape_webpage] Error scraping {url}: {error_message}")
|
||||
return {
|
||||
"id": scrape_id,
|
||||
"assetId": url,
|
||||
|
|
|
|||
|
|
@ -360,6 +360,14 @@ class Config:
|
|||
# LlamaCloud API Key
|
||||
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
|
||||
|
||||
# Residential Proxy Configuration (anonymous-proxies.net)
|
||||
# Used for web crawling and YouTube transcript fetching to avoid IP bans.
|
||||
RESIDENTIAL_PROXY_USERNAME = os.getenv("RESIDENTIAL_PROXY_USERNAME")
|
||||
RESIDENTIAL_PROXY_PASSWORD = os.getenv("RESIDENTIAL_PROXY_PASSWORD")
|
||||
RESIDENTIAL_PROXY_HOSTNAME = os.getenv("RESIDENTIAL_PROXY_HOSTNAME")
|
||||
RESIDENTIAL_PROXY_LOCATION = os.getenv("RESIDENTIAL_PROXY_LOCATION", "")
|
||||
RESIDENTIAL_PROXY_TYPE = int(os.getenv("RESIDENTIAL_PROXY_TYPE", "1"))
|
||||
|
||||
# Litellm TTS Configuration
|
||||
TTS_SERVICE = os.getenv("TTS_SERVICE")
|
||||
TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")
|
||||
|
|
|
|||
|
|
@ -14,6 +14,8 @@ from fake_useragent import UserAgent
|
|||
from firecrawl import AsyncFirecrawlApp
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
from app.utils.proxy_config import get_playwright_proxy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
|
@ -165,9 +167,15 @@ class WebCrawlerConnector:
|
|||
ua = UserAgent()
|
||||
user_agent = ua.random
|
||||
|
||||
# Use residential proxy if configured
|
||||
playwright_proxy = get_playwright_proxy()
|
||||
|
||||
# Use Playwright to fetch the page
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
launch_kwargs: dict = {"headless": True}
|
||||
if playwright_proxy:
|
||||
launch_kwargs["proxy"] = playwright_proxy
|
||||
browser = await p.chromium.launch(**launch_kwargs)
|
||||
context = await browser.new_context(user_agent=user_agent)
|
||||
page = await context.new_page()
|
||||
|
||||
|
|
|
|||
|
|
@ -15,12 +15,12 @@ logger = logging.getLogger(__name__)
|
|||
def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
|
||||
"""
|
||||
Handle greenlet_spawn errors with detailed logging for debugging.
|
||||
|
||||
|
||||
The 'greenlet_spawn has not been called' error occurs when:
|
||||
1. SQLAlchemy lazy-loads a relationship outside of an async context
|
||||
2. A sync operation is called from an async context (or vice versa)
|
||||
3. Session objects are accessed after the session is closed
|
||||
|
||||
|
||||
This helper logs detailed context to help identify the root cause.
|
||||
"""
|
||||
error_str = str(e)
|
||||
|
|
|
|||
|
|
@ -52,8 +52,8 @@ def parse_date_flexible(date_str: str) -> datetime:
|
|||
# Try ISO format as fallback
|
||||
try:
|
||||
return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
|
||||
except ValueError:
|
||||
raise ValueError(f"Unable to parse date: {date_str}")
|
||||
except ValueError as err:
|
||||
raise ValueError(f"Unable to parse date: {date_str}") from err
|
||||
|
||||
|
||||
async def check_duplicate_document_by_hash(
|
||||
|
|
|
|||
|
|
@ -217,7 +217,7 @@ async def index_notion_pages(
|
|||
)
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Failed to get Notion pages: Notion API limitation",
|
||||
"Failed to get Notion pages: Notion API limitation",
|
||||
f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.",
|
||||
{"error_type": "UnsupportedBlockType", "is_known_limitation": True},
|
||||
)
|
||||
|
|
|
|||
|
|
@ -138,7 +138,7 @@ async def index_crawled_urls(
|
|||
f"No URLs provided for indexing. Connector ID: {connector_id}, "
|
||||
f"Connector name: {connector.name}, "
|
||||
f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, "
|
||||
f"INITIAL_URLS raw value: {repr(raw_initial_urls)}"
|
||||
f"INITIAL_URLS raw value: {raw_initial_urls!r}"
|
||||
)
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
|
|
|
|||
|
|
@ -6,6 +6,8 @@ import logging
|
|||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
import aiohttp
|
||||
from fake_useragent import UserAgent
|
||||
from requests import Session
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
|
@ -19,6 +21,7 @@ from app.utils.document_converters import (
|
|||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
from app.utils.proxy_config import get_requests_proxies
|
||||
|
||||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
|
|
@ -114,9 +117,16 @@ async def add_youtube_video_document(
|
|||
}
|
||||
oembed_url = "https://www.youtube.com/oembed"
|
||||
|
||||
# Build residential proxy URL (if configured)
|
||||
residential_proxies = get_requests_proxies()
|
||||
|
||||
async with (
|
||||
aiohttp.ClientSession() as http_session,
|
||||
http_session.get(oembed_url, params=params) as response,
|
||||
http_session.get(
|
||||
oembed_url,
|
||||
params=params,
|
||||
proxy=residential_proxies["http"] if residential_proxies else None,
|
||||
) as response,
|
||||
):
|
||||
video_data = await response.json()
|
||||
|
||||
|
|
@ -138,7 +148,12 @@ async def add_youtube_video_document(
|
|||
)
|
||||
|
||||
try:
|
||||
ytt_api = YouTubeTranscriptApi()
|
||||
ua = UserAgent()
|
||||
http_client = Session()
|
||||
http_client.headers.update({"User-Agent": ua.random})
|
||||
if residential_proxies:
|
||||
http_client.proxies.update(residential_proxies)
|
||||
ytt_api = YouTubeTranscriptApi(http_client=http_client)
|
||||
captions = ytt_api.fetch(video_id)
|
||||
# Include complete caption information with timestamps
|
||||
transcript_segments = []
|
||||
|
|
|
|||
86
surfsense_backend/app/utils/proxy_config.py
Normal file
86
surfsense_backend/app/utils/proxy_config.py
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
"""
|
||||
Residential proxy configuration utility.
|
||||
|
||||
Reads proxy credentials from the application Config and provides helper
|
||||
functions that return proxy configs in the format expected by different
|
||||
HTTP libraries (requests, httpx, aiohttp, Playwright).
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
|
||||
from app.config import Config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _build_password_b64() -> str | None:
|
||||
"""
|
||||
Build the base64-encoded password dict required by anonymous-proxies.net.
|
||||
|
||||
Returns ``None`` when the required config values are not set.
|
||||
"""
|
||||
password = Config.RESIDENTIAL_PROXY_PASSWORD
|
||||
if not password:
|
||||
return None
|
||||
|
||||
password_dict = {
|
||||
"p": password,
|
||||
"l": Config.RESIDENTIAL_PROXY_LOCATION,
|
||||
"t": Config.RESIDENTIAL_PROXY_TYPE,
|
||||
}
|
||||
return base64.b64encode(json.dumps(password_dict).encode("utf-8")).decode("utf-8")
|
||||
|
||||
|
||||
def get_residential_proxy_url() -> str | None:
|
||||
"""
|
||||
Return the fully-formed residential proxy URL, or ``None`` when not
|
||||
configured.
|
||||
|
||||
The URL format is::
|
||||
|
||||
http://<username>:<base64_password>@<hostname>/
|
||||
"""
|
||||
username = Config.RESIDENTIAL_PROXY_USERNAME
|
||||
hostname = Config.RESIDENTIAL_PROXY_HOSTNAME
|
||||
password_b64 = _build_password_b64()
|
||||
|
||||
if not all([username, hostname, password_b64]):
|
||||
return None
|
||||
|
||||
return f"http://{username}:{password_b64}@{hostname}/"
|
||||
|
||||
|
||||
def get_requests_proxies() -> dict[str, str] | None:
|
||||
"""
|
||||
Return a ``{"http": …, "https": …}`` dict suitable for
|
||||
``requests.Session.proxies`` and ``aiohttp`` ``proxy=`` kwarg,
|
||||
or ``None`` when not configured.
|
||||
"""
|
||||
proxy_url = get_residential_proxy_url()
|
||||
if proxy_url is None:
|
||||
return None
|
||||
return {"http": proxy_url, "https": proxy_url}
|
||||
|
||||
|
||||
def get_playwright_proxy() -> dict[str, str] | None:
|
||||
"""
|
||||
Return a Playwright-compatible proxy dict::
|
||||
|
||||
{"server": "http://host:port", "username": "…", "password": "…"}
|
||||
|
||||
or ``None`` when not configured.
|
||||
"""
|
||||
username = Config.RESIDENTIAL_PROXY_USERNAME
|
||||
hostname = Config.RESIDENTIAL_PROXY_HOSTNAME
|
||||
password_b64 = _build_password_b64()
|
||||
|
||||
if not all([username, hostname, password_b64]):
|
||||
return None
|
||||
|
||||
return {
|
||||
"server": f"http://{hostname}",
|
||||
"username": username,
|
||||
"password": password_b64,
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue