mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-30 21:59:46 +02:00
feat: update YouTube transcript fetching to select primary language transcripts
This commit is contained in:
parent
d97068882a
commit
cdc217dbe2
2 changed files with 22 additions and 3 deletions
|
|
@@ -110,7 +110,17 @@ async def _scrape_youtube_video(
|
||||||
if residential_proxies:
|
if residential_proxies:
|
||||||
http_client.proxies.update(residential_proxies)
|
http_client.proxies.update(residential_proxies)
|
||||||
ytt_api = YouTubeTranscriptApi(http_client=http_client)
|
ytt_api = YouTubeTranscriptApi(http_client=http_client)
|
||||||
captions = ytt_api.fetch(video_id)
|
|
||||||
|
# List all available transcripts and pick the first one
|
||||||
|
# (the video's primary language) instead of defaulting to English
|
||||||
|
transcript_list = ytt_api.list(video_id)
|
||||||
|
transcript = next(iter(transcript_list))
|
||||||
|
captions = transcript.fetch()
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"[scrape_webpage] Fetched transcript for {video_id} "
|
||||||
|
f"in {transcript.language} ({transcript.language_code})"
|
||||||
|
)
|
||||||
|
|
||||||
transcript_segments = []
|
transcript_segments = []
|
||||||
for line in captions:
|
for line in captions:
|
||||||
|
|
|
||||||
|
|
@@ -244,7 +244,13 @@ async def add_youtube_video_document(
|
||||||
if residential_proxies:
|
if residential_proxies:
|
||||||
http_client.proxies.update(residential_proxies)
|
http_client.proxies.update(residential_proxies)
|
||||||
ytt_api = YouTubeTranscriptApi(http_client=http_client)
|
ytt_api = YouTubeTranscriptApi(http_client=http_client)
|
||||||
captions = ytt_api.fetch(video_id)
|
|
||||||
|
# List all available transcripts and pick the first one
|
||||||
|
# (the video's primary language) instead of defaulting to English
|
||||||
|
transcript_list = ytt_api.list(video_id)
|
||||||
|
transcript = next(iter(transcript_list))
|
||||||
|
captions = transcript.fetch()
|
||||||
|
|
||||||
# Include complete caption information with timestamps
|
# Include complete caption information with timestamps
|
||||||
transcript_segments = []
|
transcript_segments = []
|
||||||
for line in captions:
|
for line in captions:
|
||||||
|
|
@@ -257,11 +263,14 @@ async def add_youtube_video_document(
|
||||||
|
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Transcript fetched successfully: {len(captions)} segments",
|
f"Transcript fetched successfully: {len(captions)} segments ({transcript.language})",
|
||||||
{
|
{
|
||||||
"stage": "transcript_fetched",
|
"stage": "transcript_fetched",
|
||||||
"segments_count": len(captions),
|
"segments_count": len(captions),
|
||||||
"transcript_length": len(transcript_text),
|
"transcript_length": len(transcript_text),
|
||||||
|
"language": transcript.language,
|
||||||
|
"language_code": transcript.language_code,
|
||||||
|
"is_generated": transcript.is_generated,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue