feat: add residential proxy configuration for web crawling and YouTube transcript fetching

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-02-05 20:44:13 -08:00
parent eaa0060def
commit 1511c26ef5
12 changed files with 251 additions and 16 deletions

View file

@ -6,6 +6,8 @@ import logging
from urllib.parse import parse_qs, urlparse
import aiohttp
from fake_useragent import UserAgent
from requests import Session
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from youtube_transcript_api import YouTubeTranscriptApi
@ -19,6 +21,7 @@ from app.utils.document_converters import (
generate_document_summary,
generate_unique_identifier_hash,
)
from app.utils.proxy_config import get_requests_proxies
from .base import (
check_document_by_unique_identifier,
@ -114,9 +117,16 @@ async def add_youtube_video_document(
}
oembed_url = "https://www.youtube.com/oembed"
# Look up the residential proxy mapping (falsy when no proxy is configured)
residential_proxies = get_requests_proxies()
async with (
aiohttp.ClientSession() as http_session,
http_session.get(oembed_url, params=params) as response,
http_session.get(
oembed_url,
params=params,
proxy=residential_proxies["http"] if residential_proxies else None,
) as response,
):
video_data = await response.json()
@ -138,7 +148,12 @@ async def add_youtube_video_document(
)
try:
ytt_api = YouTubeTranscriptApi()
ua = UserAgent()
http_client = Session()
http_client.headers.update({"User-Agent": ua.random})
if residential_proxies:
http_client.proxies.update(residential_proxies)
ytt_api = YouTubeTranscriptApi(http_client=http_client)
captions = ytt_api.fetch(video_id)
# Include complete caption information with timestamps
transcript_segments = []