mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-01 11:56:25 +02:00
feat: update YouTube transcript fetching to select primary language transcripts
This commit is contained in:
parent
d97068882a
commit
cdc217dbe2
2 changed files with 22 additions and 3 deletions
|
|
@ -110,7 +110,17 @@ async def _scrape_youtube_video(
|
|||
if residential_proxies:
|
||||
http_client.proxies.update(residential_proxies)
|
||||
ytt_api = YouTubeTranscriptApi(http_client=http_client)
|
||||
captions = ytt_api.fetch(video_id)
|
||||
|
||||
# List all available transcripts and pick the first one
|
||||
# (the video's primary language) instead of defaulting to English
|
||||
transcript_list = ytt_api.list(video_id)
|
||||
transcript = next(iter(transcript_list))
|
||||
captions = transcript.fetch()
|
||||
|
||||
logger.info(
|
||||
f"[scrape_webpage] Fetched transcript for {video_id} "
|
||||
f"in {transcript.language} ({transcript.language_code})"
|
||||
)
|
||||
|
||||
transcript_segments = []
|
||||
for line in captions:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue