feat: update YouTube transcript fetching to select primary language transcripts

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-02-06 14:02:46 -08:00
parent d97068882a
commit cdc217dbe2
2 changed files with 22 additions and 3 deletions

View file

@@ -244,7 +244,13 @@ async def add_youtube_video_document(
if residential_proxies:
http_client.proxies.update(residential_proxies)
ytt_api = YouTubeTranscriptApi(http_client=http_client)
captions = ytt_api.fetch(video_id)
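# List all available transcripts and pick the first one returned instead of
# defaulting to English. NOTE(review): this assumes the first listed
# transcript is in the video's primary language — confirm the listing order.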
transcript_list = ytt_api.list(video_id)
transcript = next(iter(transcript_list))
captions = transcript.fetch()
# Include complete caption information with timestamps
transcript_segments = []
for line in captions:
@@ -257,11 +263,14 @@ async def add_youtube_video_document(
await task_logger.log_task_progress(
log_entry,
f"Transcript fetched successfully: {len(captions)} segments",
f"Transcript fetched successfully: {len(captions)} segments ({transcript.language})",
{
"stage": "transcript_fetched",
"segments_count": len(captions),
"transcript_length": len(transcript_text),
"language": transcript.language,
"language_code": transcript.language_code,
"is_generated": transcript.is_generated,
},
)
except Exception as e: