webclaw/crates/webclaw-core/src/youtube.rs
Valerio afe4d3077d feat: v0.1.2 — TLS fallback, Safari default, Reddit fix, YouTube transcript infra
- Switch default profile to Safari26/Mac (best CF pass rate)
- Auto-fallback to plain client on connection error or 403
- Fixes: ycombinator.com, producthunt.com, and similar CF-strict sites
- Reddit .json endpoint uses plain client (TLS fingerprint was blocked)
- YouTube caption track extraction + timed text parser (core, not yet wired)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-25 18:50:07 +01:00

296 lines
9.1 KiB
Rust

use once_cell::sync::Lazy;
/// YouTube video metadata extraction from `ytInitialPlayerResponse` embedded JSON.
///
/// YouTube embeds the full player config (title, author, view count, description,
/// duration, upload date) in a `<script>` tag as a JS variable assignment. This
/// module parses that blob and formats it as structured markdown, giving LLMs a
/// clean representation without needing the YouTube API.
use regex::Regex;
use tracing::debug;
/// Regex to find the ytInitialPlayerResponse assignment in a <script> block.
/// YouTube uses: `var ytInitialPlayerResponse = {...};`
static YT_PLAYER_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"var\s+ytInitialPlayerResponse\s*=\s*(\{.+?\})\s*;").unwrap());
/// Check if a URL is a YouTube video page.
pub fn is_youtube_url(url: &str) -> bool {
let lower = url.to_lowercase();
lower.contains("youtube.com/watch") || lower.contains("youtu.be/")
}
/// Extracted YouTube video metadata.
#[derive(Debug)]
struct VideoMeta {
title: String,
author: String,
view_count: String,
upload_date: String,
description: String,
duration: String,
}
/// Try to extract YouTube video metadata from the page HTML.
/// Returns structured markdown if successful, None if the page doesn't contain
/// ytInitialPlayerResponse or parsing fails.
pub fn try_extract(html: &str) -> Option<String> {
let json_str = YT_PLAYER_RE.captures(html)?.get(1)?.as_str();
let value: serde_json::Value = serde_json::from_str(json_str).ok()?;
let video_details = value.get("videoDetails")?;
let microformat = value
.get("microformat")
.and_then(|m| m.get("playerMicroformatRenderer"));
let title = video_details
.get("title")
.and_then(|v| v.as_str())
.unwrap_or("Untitled")
.to_string();
let author = video_details
.get("author")
.and_then(|v| v.as_str())
.unwrap_or("Unknown")
.to_string();
let view_count = video_details
.get("viewCount")
.and_then(|v| v.as_str())
.map(format_view_count)
.unwrap_or_else(|| "N/A".to_string());
let upload_date = microformat
.and_then(|m| m.get("uploadDate"))
.or_else(|| microformat.and_then(|m| m.get("publishDate")))
.and_then(|v| v.as_str())
.unwrap_or("Unknown")
.to_string();
let description = video_details
.get("shortDescription")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let duration_secs = video_details
.get("lengthSeconds")
.and_then(|v| v.as_str())
.and_then(|s| s.parse::<u64>().ok())
.unwrap_or(0);
let duration = format_duration(duration_secs);
let meta = VideoMeta {
title,
author,
view_count,
upload_date,
description,
duration,
};
debug!(
title = %meta.title,
author = %meta.author,
"extracted YouTube video metadata"
);
Some(format_markdown(&meta))
}
/// Format seconds into human-readable duration (e.g., "1:23:45" or "12:34").
fn format_duration(total_secs: u64) -> String {
let hours = total_secs / 3600;
let minutes = (total_secs % 3600) / 60;
let seconds = total_secs % 60;
if hours > 0 {
format!("{hours}:{minutes:02}:{seconds:02}")
} else {
format!("{minutes}:{seconds:02}")
}
}
/// Format a raw view count string with commas (e.g., "1234567" -> "1,234,567").
fn format_view_count(raw: &str) -> String {
let Ok(n) = raw.parse::<u64>() else {
return raw.to_string();
};
if n >= 1_000_000 {
format!("{:.1}M", n as f64 / 1_000_000.0)
} else if n >= 1_000 {
format!("{:.1}K", n as f64 / 1_000.0)
} else {
n.to_string()
}
}
/// A caption track URL extracted from ytInitialPlayerResponse.
#[derive(Debug, Clone)]
pub struct CaptionTrack {
pub url: String,
pub lang: String,
pub name: String,
}
/// Extract caption track URLs from ytInitialPlayerResponse JSON.
/// Returns empty vec if no captions are available.
pub fn extract_caption_tracks(html: &str) -> Vec<CaptionTrack> {
let Some(json_str) = YT_PLAYER_RE.captures(html).and_then(|c| c.get(1)) else {
return vec![];
};
let Ok(value) = serde_json::from_str::<serde_json::Value>(json_str.as_str()) else {
return vec![];
};
let Some(tracks) = value
.get("captions")
.and_then(|c| c.get("playerCaptionsTracklistRenderer"))
.and_then(|r| r.get("captionTracks"))
.and_then(|t| t.as_array())
else {
return vec![];
};
tracks
.iter()
.filter_map(|t| {
let url = t.get("baseUrl")?.as_str()?.to_string();
let lang = t
.get("languageCode")
.and_then(|v| v.as_str())
.unwrap_or("en")
.to_string();
let name = t
.get("name")
.and_then(|v| v.get("simpleText"))
.and_then(|v| v.as_str())
.unwrap_or(&lang)
.to_string();
Some(CaptionTrack { url, lang, name })
})
.collect()
}
/// Parse YouTube timed text XML into plain transcript text.
/// The XML format is: `<transcript><text start="0" dur="1.5">Hello</text>...</transcript>`
pub fn parse_timed_text(xml: &str) -> String {
// Simple regex-based parsing to avoid adding an XML crate dependency.
// Extract text content between <text ...>...</text> tags.
static TEXT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<text[^>]*>([^<]*)</text>").unwrap());
let mut lines: Vec<String> = Vec::new();
for cap in TEXT_RE.captures_iter(xml) {
let text = cap[1].trim();
if text.is_empty() {
continue;
}
// Decode XML entities
let decoded = text
.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
.replace("&quot;", "\"")
.replace("&#39;", "'")
.replace("&apos;", "'")
.replace("\n", " ");
lines.push(decoded);
}
lines.join(" ")
}
/// Format extracted metadata into structured markdown.
fn format_markdown(meta: &VideoMeta) -> String {
let mut md = format!("# {}\n\n", meta.title);
md.push_str(&format!(
"**Channel:** {} | **Views:** {} | **Published:** {} | **Duration:** {}\n\n",
meta.author, meta.view_count, meta.upload_date, meta.duration
));
if !meta.description.is_empty() {
md.push_str("## Description\n\n");
md.push_str(&meta.description);
md.push('\n');
}
md
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn detects_youtube_urls() {
assert!(is_youtube_url(
"https://www.youtube.com/watch?v=dQw4w9WgXcQ"
));
assert!(is_youtube_url("https://youtube.com/watch?v=abc123"));
assert!(is_youtube_url("https://youtu.be/dQw4w9WgXcQ"));
assert!(!is_youtube_url("https://example.com"));
assert!(!is_youtube_url("https://vimeo.com/123456"));
}
#[test]
fn format_duration_short() {
assert_eq!(format_duration(0), "0:00");
assert_eq!(format_duration(65), "1:05");
assert_eq!(format_duration(3661), "1:01:01");
assert_eq!(format_duration(754), "12:34");
}
#[test]
fn format_view_count_values() {
assert_eq!(format_view_count("500"), "500");
assert_eq!(format_view_count("1500"), "1.5K");
assert_eq!(format_view_count("1234567"), "1.2M");
}
#[test]
fn extracts_from_mock_html() {
let html = r#"
<html><head><title>Test Video</title></head>
<body>
<script>
var ytInitialPlayerResponse = {"videoDetails":{"title":"Rust in 100 Seconds","author":"Fireship","viewCount":"5432100","shortDescription":"Learn Rust in 100 seconds.","lengthSeconds":"120"},"microformat":{"playerMicroformatRenderer":{"uploadDate":"2023-01-15"}}};
</script>
</body></html>
"#;
let result = try_extract(html).unwrap();
assert!(result.contains("# Rust in 100 Seconds"));
assert!(result.contains("**Channel:** Fireship"));
assert!(result.contains("5.4M"));
assert!(result.contains("2023-01-15"));
assert!(result.contains("2:00"));
assert!(result.contains("Learn Rust in 100 seconds."));
}
#[test]
fn returns_none_for_non_youtube_html() {
let html = "<html><body><p>Hello world</p></body></html>";
assert!(try_extract(html).is_none());
}
#[test]
fn handles_missing_optional_fields() {
let html = r#"
<html><body>
<script>
var ytInitialPlayerResponse = {"videoDetails":{"title":"Minimal Video","author":"Someone","viewCount":"100","shortDescription":"","lengthSeconds":"60"}};
</script>
</body></html>
"#;
let result = try_extract(html).unwrap();
assert!(result.contains("# Minimal Video"));
assert!(result.contains("**Channel:** Someone"));
// Upload date should be "Unknown" when microformat is missing
assert!(result.contains("Unknown"));
}
}