feat: add residential proxy configuration for web crawling and YouTube transcript fetching

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-02-05 20:44:13 -08:00
parent eaa0060def
commit 1511c26ef5
12 changed files with 251 additions and 16 deletions

View file

@ -15,12 +15,12 @@ logger = logging.getLogger(__name__)
def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
"""
Handle greenlet_spawn errors with detailed logging for debugging.
The 'greenlet_spawn has not been called' error occurs when:
1. SQLAlchemy lazy-loads a relationship outside of an async context
2. A sync operation is called from an async context (or vice versa)
3. Session objects are accessed after the session is closed
This helper logs detailed context to help identify the root cause.
"""
error_str = str(e)

View file

@ -52,8 +52,8 @@ def parse_date_flexible(date_str: str) -> datetime:
# Try ISO format as fallback
try:
return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
except ValueError:
raise ValueError(f"Unable to parse date: {date_str}")
except ValueError as err:
raise ValueError(f"Unable to parse date: {date_str}") from err
async def check_duplicate_document_by_hash(

View file

@ -217,7 +217,7 @@ async def index_notion_pages(
)
await task_logger.log_task_failure(
log_entry,
f"Failed to get Notion pages: Notion API limitation",
"Failed to get Notion pages: Notion API limitation",
f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.",
{"error_type": "UnsupportedBlockType", "is_known_limitation": True},
)

View file

@ -138,7 +138,7 @@ async def index_crawled_urls(
f"No URLs provided for indexing. Connector ID: {connector_id}, "
f"Connector name: {connector.name}, "
f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, "
f"INITIAL_URLS raw value: {repr(raw_initial_urls)}"
f"INITIAL_URLS raw value: {raw_initial_urls!r}"
)
await task_logger.log_task_failure(
log_entry,

View file

@ -6,6 +6,8 @@ import logging
from urllib.parse import parse_qs, urlparse
import aiohttp
from fake_useragent import UserAgent
from requests import Session
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from youtube_transcript_api import YouTubeTranscriptApi
@ -19,6 +21,7 @@ from app.utils.document_converters import (
generate_document_summary,
generate_unique_identifier_hash,
)
from app.utils.proxy_config import get_requests_proxies
from .base import (
check_document_by_unique_identifier,
@ -114,9 +117,16 @@ async def add_youtube_video_document(
}
oembed_url = "https://www.youtube.com/oembed"
# Fetch the residential proxy mapping (if configured)
residential_proxies = get_requests_proxies()
async with (
aiohttp.ClientSession() as http_session,
http_session.get(oembed_url, params=params) as response,
http_session.get(
oembed_url,
params=params,
proxy=residential_proxies["http"] if residential_proxies else None,
) as response,
):
video_data = await response.json()
@ -138,7 +148,12 @@ async def add_youtube_video_document(
)
try:
ytt_api = YouTubeTranscriptApi()
ua = UserAgent()
http_client = Session()
http_client.headers.update({"User-Agent": ua.random})
if residential_proxies:
http_client.proxies.update(residential_proxies)
ytt_api = YouTubeTranscriptApi(http_client=http_client)
captions = ytt_api.fetch(video_id)
# Include complete caption information with timestamps
transcript_segments = []