feat: update YouTube transcript fetching to select primary language transcripts

2026-06-30 21:59:46 +02:00 · 2026-02-06 14:02:46 -08:00 · 2026-02-06 14:02:46 -08:00 · cdc217dbe2
commit cdc217dbe2
parent d97068882a
2 changed files with 22 additions and 3 deletions
--- a/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py
+++ b/surfsense_backend/app/agents/new_chat/tools/scrape_webpage.py
@ -110,7 +110,17 @@ async def _scrape_youtube_video(
        if residential_proxies:
            http_client.proxies.update(residential_proxies)
        ytt_api = YouTubeTranscriptApi(http_client=http_client)
-        captions = ytt_api.fetch(video_id)
+
        # List all available transcripts and pick the first one returned
        # (assumed to be the video's primary language) instead of defaulting to English
        transcript_list = ytt_api.list(video_id)
        transcript = next(iter(transcript_list))
        captions = transcript.fetch()
        logger.info(
            f"[scrape_webpage] Fetched transcript for {video_id} "
            f"in {transcript.language} ({transcript.language_code})"
        )
        transcript_segments = []
        for line in captions:
--- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py
@ -244,7 +244,13 @@ async def add_youtube_video_document(
            if residential_proxies:
                http_client.proxies.update(residential_proxies)
            ytt_api = YouTubeTranscriptApi(http_client=http_client)
-            captions = ytt_api.fetch(video_id)
+
            # List all available transcripts and pick the first one returned
            # (assumed to be the video's primary language) instead of defaulting to English
            transcript_list = ytt_api.list(video_id)
            transcript = next(iter(transcript_list))
            captions = transcript.fetch()
            # Include complete caption information with timestamps
            transcript_segments = []
            for line in captions:
@ -257,11 +263,14 @@ async def add_youtube_video_document(
            await task_logger.log_task_progress(
                log_entry,
-                f"Transcript fetched successfully: {len(captions)} segments",
+                f"Transcript fetched successfully: {len(captions)} segments ({transcript.language})",
                {
                    "stage": "transcript_fetched",
                    "segments_count": len(captions),
                    "transcript_length": len(transcript_text),
                    "language": transcript.language,
                    "language_code": transcript.language_code,
                    "is_generated": transcript.is_generated,
                },
            )
        except Exception as e: