diff --git a/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py b/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py index e3c58c857..014126927 100644 --- a/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py +++ b/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py @@ -110,7 +110,17 @@ async def _scrape_youtube_video( if residential_proxies: http_client.proxies.update(residential_proxies) ytt_api = YouTubeTranscriptApi(http_client=http_client) - captions = ytt_api.fetch(video_id) + + # List all available transcripts and pick the first one + # (the video's primary language) instead of defaulting to English + transcript_list = ytt_api.list(video_id) + transcript = next(iter(transcript_list)) + captions = transcript.fetch() + + logger.info( + f"[scrape_webpage] Fetched transcript for {video_id} " + f"in {transcript.language} ({transcript.language_code})" + ) transcript_segments = [] for line in captions: diff --git a/surfsense_backend/app/tasks/document_processors/youtube_processor.py b/surfsense_backend/app/tasks/document_processors/youtube_processor.py index 9dac6d554..6d087b6d0 100644 --- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py +++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py @@ -244,7 +244,13 @@ async def add_youtube_video_document( if residential_proxies: http_client.proxies.update(residential_proxies) ytt_api = YouTubeTranscriptApi(http_client=http_client) - captions = ytt_api.fetch(video_id) + + # List all available transcripts and pick the first one + # (the video's primary language) instead of defaulting to English + transcript_list = ytt_api.list(video_id) + transcript = next(iter(transcript_list)) + captions = transcript.fetch() + # Include complete caption information with timestamps transcript_segments = [] for line in captions: @@ -257,11 +263,14 @@ async def add_youtube_video_document( await task_logger.log_task_progress( log_entry, - f"Transcript fetched successfully: {len(captions)} segments", + f"Transcript fetched successfully: {len(captions)} segments ({transcript.language})", { "stage": "transcript_fetched", "segments_count": len(captions), "transcript_length": len(transcript_text), + "language": transcript.language, + "language_code": transcript.language_code, + "is_generated": transcript.is_generated, }, ) except Exception as e: