mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
feat: add residential proxy configuration for web crawling and YouTube transcript fetching
This commit is contained in:
parent
eaa0060def
commit
1511c26ef5
12 changed files with 251 additions and 16 deletions
|
|
@ -15,12 +15,12 @@ logger = logging.getLogger(__name__)
|
|||
def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
|
||||
"""
|
||||
Handle greenlet_spawn errors with detailed logging for debugging.
|
||||
|
||||
|
||||
The 'greenlet_spawn has not been called' error occurs when:
|
||||
1. SQLAlchemy lazy-loads a relationship outside of an async context
|
||||
2. A sync operation is called from an async context (or vice versa)
|
||||
3. Session objects are accessed after the session is closed
|
||||
|
||||
|
||||
This helper logs detailed context to help identify the root cause.
|
||||
"""
|
||||
error_str = str(e)
|
||||
|
|
|
|||
|
|
@ -52,8 +52,8 @@ def parse_date_flexible(date_str: str) -> datetime:
|
|||
# Try ISO format as fallback
|
||||
try:
|
||||
return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
|
||||
except ValueError:
|
||||
raise ValueError(f"Unable to parse date: {date_str}")
|
||||
except ValueError as err:
|
||||
raise ValueError(f"Unable to parse date: {date_str}") from err
|
||||
|
||||
|
||||
async def check_duplicate_document_by_hash(
|
||||
|
|
|
|||
|
|
@ -217,7 +217,7 @@ async def index_notion_pages(
|
|||
)
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Failed to get Notion pages: Notion API limitation",
|
||||
"Failed to get Notion pages: Notion API limitation",
|
||||
f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.",
|
||||
{"error_type": "UnsupportedBlockType", "is_known_limitation": True},
|
||||
)
|
||||
|
|
|
|||
|
|
@ -138,7 +138,7 @@ async def index_crawled_urls(
|
|||
f"No URLs provided for indexing. Connector ID: {connector_id}, "
|
||||
f"Connector name: {connector.name}, "
|
||||
f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, "
|
||||
f"INITIAL_URLS raw value: {repr(raw_initial_urls)}"
|
||||
f"INITIAL_URLS raw value: {raw_initial_urls!r}"
|
||||
)
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
|
|
|
|||
|
|
@ -6,6 +6,8 @@ import logging
|
|||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
import aiohttp
|
||||
from fake_useragent import UserAgent
|
||||
from requests import Session
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
|
@ -19,6 +21,7 @@ from app.utils.document_converters import (
|
|||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
from app.utils.proxy_config import get_requests_proxies
|
||||
|
||||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
|
|
@ -114,9 +117,16 @@ async def add_youtube_video_document(
|
|||
}
|
||||
oembed_url = "https://www.youtube.com/oembed"
|
||||
|
||||
# Look up the residential proxy mapping (falsy when no proxy is configured)
|
||||
residential_proxies = get_requests_proxies()
|
||||
|
||||
async with (
|
||||
aiohttp.ClientSession() as http_session,
|
||||
http_session.get(oembed_url, params=params) as response,
|
||||
http_session.get(
|
||||
oembed_url,
|
||||
params=params,
|
||||
proxy=residential_proxies["http"] if residential_proxies else None,
|
||||
) as response,
|
||||
):
|
||||
video_data = await response.json()
|
||||
|
||||
|
|
@ -138,7 +148,12 @@ async def add_youtube_video_document(
|
|||
)
|
||||
|
||||
try:
|
||||
ytt_api = YouTubeTranscriptApi()
|
||||
ua = UserAgent()
|
||||
http_client = Session()
|
||||
http_client.headers.update({"User-Agent": ua.random})
|
||||
if residential_proxies:
|
||||
http_client.proxies.update(residential_proxies)
|
||||
ytt_api = YouTubeTranscriptApi(http_client=http_client)
|
||||
captions = ytt_api.fetch(video_id)
|
||||
# Include complete caption information with timestamps
|
||||
transcript_segments = []
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue