mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 16:56:22 +02:00
feat: add residential proxy configuration for web crawling and YouTube transcript fetching
This commit is contained in:
parent
eaa0060def
commit
1511c26ef5
12 changed files with 251 additions and 16 deletions
|
|
@ -6,6 +6,8 @@ import logging
|
|||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
import aiohttp
|
||||
from fake_useragent import UserAgent
|
||||
from requests import Session
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
|
@ -19,6 +21,7 @@ from app.utils.document_converters import (
|
|||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
from app.utils.proxy_config import get_requests_proxies
|
||||
|
||||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
|
|
@ -114,9 +117,16 @@ async def add_youtube_video_document(
|
|||
}
|
||||
oembed_url = "https://www.youtube.com/oembed"
|
||||
|
||||
# Get the residential proxy settings (if configured) as a requests-style proxies mapping
|
||||
residential_proxies = get_requests_proxies()
|
||||
|
||||
async with (
|
||||
aiohttp.ClientSession() as http_session,
|
||||
http_session.get(oembed_url, params=params) as response,
|
||||
http_session.get(
|
||||
oembed_url,
|
||||
params=params,
|
||||
proxy=residential_proxies["http"] if residential_proxies else None,
|
||||
) as response,
|
||||
):
|
||||
video_data = await response.json()
|
||||
|
||||
|
|
@ -138,7 +148,12 @@ async def add_youtube_video_document(
|
|||
)
|
||||
|
||||
try:
|
||||
ytt_api = YouTubeTranscriptApi()
|
||||
ua = UserAgent()
|
||||
http_client = Session()
|
||||
http_client.headers.update({"User-Agent": ua.random})
|
||||
if residential_proxies:
|
||||
http_client.proxies.update(residential_proxies)
|
||||
ytt_api = YouTubeTranscriptApi(http_client=http_client)
|
||||
captions = ytt_api.fetch(video_id)
|
||||
# Include complete caption information with timestamps
|
||||
transcript_segments = []
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue