mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 16:56:22 +02:00
feat: update YouTube transcript fetching to select primary language transcripts
This commit is contained in:
parent
d97068882a
commit
cdc217dbe2
2 changed files with 22 additions and 3 deletions
|
|
@ -244,7 +244,13 @@ async def add_youtube_video_document(
|
|||
if residential_proxies:
|
||||
http_client.proxies.update(residential_proxies)
|
||||
ytt_api = YouTubeTranscriptApi(http_client=http_client)
|
||||
captions = ytt_api.fetch(video_id)
|
||||
|
||||
# List all available transcripts and pick the first one returned
|
||||
# (assumed to be the video's primary language; the listing order is defined by the library) instead of defaulting to English
|
||||
transcript_list = ytt_api.list(video_id)
|
||||
transcript = next(iter(transcript_list))
|
||||
captions = transcript.fetch()
|
||||
|
||||
# Include complete caption information with timestamps
|
||||
transcript_segments = []
|
||||
for line in captions:
|
||||
|
|
@ -257,11 +263,14 @@ async def add_youtube_video_document(
|
|||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Transcript fetched successfully: {len(captions)} segments",
|
||||
f"Transcript fetched successfully: {len(captions)} segments ({transcript.language})",
|
||||
{
|
||||
"stage": "transcript_fetched",
|
||||
"segments_count": len(captions),
|
||||
"transcript_length": len(transcript_text),
|
||||
"language": transcript.language,
|
||||
"language_code": transcript.language_code,
|
||||
"is_generated": transcript.is_generated,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue