mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-21 02:28:27 +02:00
- Switch default profile to Safari26/Mac (best CF pass rate) - Auto-fallback to plain client on connection error or 403 - Fixes: ycombinator.com, producthunt.com, and similar CF-strict sites - Reddit .json endpoint uses plain client (TLS fingerprint was blocked) - YouTube caption track extraction + timed text parser (core, not yet wired) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
296 lines
9.1 KiB
Rust
296 lines
9.1 KiB
Rust
use once_cell::sync::Lazy;
|
|
/// YouTube video metadata extraction from `ytInitialPlayerResponse` embedded JSON.
|
|
///
|
|
/// YouTube embeds the full player config (title, author, view count, description,
|
|
/// duration, upload date) in a `<script>` tag as a JS variable assignment. This
|
|
/// module parses that blob and formats it as structured markdown, giving LLMs a
|
|
/// clean representation without needing the YouTube API.
|
|
use regex::Regex;
|
|
use tracing::debug;
|
|
|
|
/// Regex to find the ytInitialPlayerResponse assignment in a <script> block.
|
|
/// YouTube uses: `var ytInitialPlayerResponse = {...};`
|
|
static YT_PLAYER_RE: Lazy<Regex> =
|
|
Lazy::new(|| Regex::new(r"var\s+ytInitialPlayerResponse\s*=\s*(\{.+?\})\s*;").unwrap());
|
|
|
|
/// Check if a URL is a YouTube video page.
|
|
pub fn is_youtube_url(url: &str) -> bool {
|
|
let lower = url.to_lowercase();
|
|
lower.contains("youtube.com/watch") || lower.contains("youtu.be/")
|
|
}
|
|
|
|
/// Extracted YouTube video metadata.
|
|
#[derive(Debug)]
|
|
struct VideoMeta {
|
|
title: String,
|
|
author: String,
|
|
view_count: String,
|
|
upload_date: String,
|
|
description: String,
|
|
duration: String,
|
|
}
|
|
|
|
/// Try to extract YouTube video metadata from the page HTML.
|
|
/// Returns structured markdown if successful, None if the page doesn't contain
|
|
/// ytInitialPlayerResponse or parsing fails.
|
|
pub fn try_extract(html: &str) -> Option<String> {
|
|
let json_str = YT_PLAYER_RE.captures(html)?.get(1)?.as_str();
|
|
|
|
let value: serde_json::Value = serde_json::from_str(json_str).ok()?;
|
|
|
|
let video_details = value.get("videoDetails")?;
|
|
let microformat = value
|
|
.get("microformat")
|
|
.and_then(|m| m.get("playerMicroformatRenderer"));
|
|
|
|
let title = video_details
|
|
.get("title")
|
|
.and_then(|v| v.as_str())
|
|
.unwrap_or("Untitled")
|
|
.to_string();
|
|
|
|
let author = video_details
|
|
.get("author")
|
|
.and_then(|v| v.as_str())
|
|
.unwrap_or("Unknown")
|
|
.to_string();
|
|
|
|
let view_count = video_details
|
|
.get("viewCount")
|
|
.and_then(|v| v.as_str())
|
|
.map(format_view_count)
|
|
.unwrap_or_else(|| "N/A".to_string());
|
|
|
|
let upload_date = microformat
|
|
.and_then(|m| m.get("uploadDate"))
|
|
.or_else(|| microformat.and_then(|m| m.get("publishDate")))
|
|
.and_then(|v| v.as_str())
|
|
.unwrap_or("Unknown")
|
|
.to_string();
|
|
|
|
let description = video_details
|
|
.get("shortDescription")
|
|
.and_then(|v| v.as_str())
|
|
.unwrap_or("")
|
|
.to_string();
|
|
|
|
let duration_secs = video_details
|
|
.get("lengthSeconds")
|
|
.and_then(|v| v.as_str())
|
|
.and_then(|s| s.parse::<u64>().ok())
|
|
.unwrap_or(0);
|
|
let duration = format_duration(duration_secs);
|
|
|
|
let meta = VideoMeta {
|
|
title,
|
|
author,
|
|
view_count,
|
|
upload_date,
|
|
description,
|
|
duration,
|
|
};
|
|
|
|
debug!(
|
|
title = %meta.title,
|
|
author = %meta.author,
|
|
"extracted YouTube video metadata"
|
|
);
|
|
|
|
Some(format_markdown(&meta))
|
|
}
|
|
|
|
/// Format seconds into human-readable duration (e.g., "1:23:45" or "12:34").
|
|
fn format_duration(total_secs: u64) -> String {
|
|
let hours = total_secs / 3600;
|
|
let minutes = (total_secs % 3600) / 60;
|
|
let seconds = total_secs % 60;
|
|
|
|
if hours > 0 {
|
|
format!("{hours}:{minutes:02}:{seconds:02}")
|
|
} else {
|
|
format!("{minutes}:{seconds:02}")
|
|
}
|
|
}
|
|
|
|
/// Format a raw view count string with commas (e.g., "1234567" -> "1,234,567").
|
|
fn format_view_count(raw: &str) -> String {
|
|
let Ok(n) = raw.parse::<u64>() else {
|
|
return raw.to_string();
|
|
};
|
|
|
|
if n >= 1_000_000 {
|
|
format!("{:.1}M", n as f64 / 1_000_000.0)
|
|
} else if n >= 1_000 {
|
|
format!("{:.1}K", n as f64 / 1_000.0)
|
|
} else {
|
|
n.to_string()
|
|
}
|
|
}
|
|
|
|
/// A caption track URL extracted from ytInitialPlayerResponse.
|
|
#[derive(Debug, Clone)]
|
|
pub struct CaptionTrack {
|
|
pub url: String,
|
|
pub lang: String,
|
|
pub name: String,
|
|
}
|
|
|
|
/// Extract caption track URLs from ytInitialPlayerResponse JSON.
|
|
/// Returns empty vec if no captions are available.
|
|
pub fn extract_caption_tracks(html: &str) -> Vec<CaptionTrack> {
|
|
let Some(json_str) = YT_PLAYER_RE.captures(html).and_then(|c| c.get(1)) else {
|
|
return vec![];
|
|
};
|
|
|
|
let Ok(value) = serde_json::from_str::<serde_json::Value>(json_str.as_str()) else {
|
|
return vec![];
|
|
};
|
|
|
|
let Some(tracks) = value
|
|
.get("captions")
|
|
.and_then(|c| c.get("playerCaptionsTracklistRenderer"))
|
|
.and_then(|r| r.get("captionTracks"))
|
|
.and_then(|t| t.as_array())
|
|
else {
|
|
return vec![];
|
|
};
|
|
|
|
tracks
|
|
.iter()
|
|
.filter_map(|t| {
|
|
let url = t.get("baseUrl")?.as_str()?.to_string();
|
|
let lang = t
|
|
.get("languageCode")
|
|
.and_then(|v| v.as_str())
|
|
.unwrap_or("en")
|
|
.to_string();
|
|
let name = t
|
|
.get("name")
|
|
.and_then(|v| v.get("simpleText"))
|
|
.and_then(|v| v.as_str())
|
|
.unwrap_or(&lang)
|
|
.to_string();
|
|
Some(CaptionTrack { url, lang, name })
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
/// Parse YouTube timed text XML into plain transcript text.
|
|
/// The XML format is: `<transcript><text start="0" dur="1.5">Hello</text>...</transcript>`
|
|
pub fn parse_timed_text(xml: &str) -> String {
|
|
// Simple regex-based parsing to avoid adding an XML crate dependency.
|
|
// Extract text content between <text ...>...</text> tags.
|
|
static TEXT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<text[^>]*>([^<]*)</text>").unwrap());
|
|
|
|
let mut lines: Vec<String> = Vec::new();
|
|
for cap in TEXT_RE.captures_iter(xml) {
|
|
let text = cap[1].trim();
|
|
if text.is_empty() {
|
|
continue;
|
|
}
|
|
// Decode XML entities
|
|
let decoded = text
|
|
.replace("&", "&")
|
|
.replace("<", "<")
|
|
.replace(">", ">")
|
|
.replace(""", "\"")
|
|
.replace("'", "'")
|
|
.replace("'", "'")
|
|
.replace("\n", " ");
|
|
lines.push(decoded);
|
|
}
|
|
|
|
lines.join(" ")
|
|
}
|
|
|
|
/// Format extracted metadata into structured markdown.
|
|
fn format_markdown(meta: &VideoMeta) -> String {
|
|
let mut md = format!("# {}\n\n", meta.title);
|
|
|
|
md.push_str(&format!(
|
|
"**Channel:** {} | **Views:** {} | **Published:** {} | **Duration:** {}\n\n",
|
|
meta.author, meta.view_count, meta.upload_date, meta.duration
|
|
));
|
|
|
|
if !meta.description.is_empty() {
|
|
md.push_str("## Description\n\n");
|
|
md.push_str(&meta.description);
|
|
md.push('\n');
|
|
}
|
|
|
|
md
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn detects_youtube_urls() {
|
|
assert!(is_youtube_url(
|
|
"https://www.youtube.com/watch?v=dQw4w9WgXcQ"
|
|
));
|
|
assert!(is_youtube_url("https://youtube.com/watch?v=abc123"));
|
|
assert!(is_youtube_url("https://youtu.be/dQw4w9WgXcQ"));
|
|
assert!(!is_youtube_url("https://example.com"));
|
|
assert!(!is_youtube_url("https://vimeo.com/123456"));
|
|
}
|
|
|
|
#[test]
|
|
fn format_duration_short() {
|
|
assert_eq!(format_duration(0), "0:00");
|
|
assert_eq!(format_duration(65), "1:05");
|
|
assert_eq!(format_duration(3661), "1:01:01");
|
|
assert_eq!(format_duration(754), "12:34");
|
|
}
|
|
|
|
#[test]
|
|
fn format_view_count_values() {
|
|
assert_eq!(format_view_count("500"), "500");
|
|
assert_eq!(format_view_count("1500"), "1.5K");
|
|
assert_eq!(format_view_count("1234567"), "1.2M");
|
|
}
|
|
|
|
#[test]
|
|
fn extracts_from_mock_html() {
|
|
let html = r#"
|
|
<html><head><title>Test Video</title></head>
|
|
<body>
|
|
<script>
|
|
var ytInitialPlayerResponse = {"videoDetails":{"title":"Rust in 100 Seconds","author":"Fireship","viewCount":"5432100","shortDescription":"Learn Rust in 100 seconds.","lengthSeconds":"120"},"microformat":{"playerMicroformatRenderer":{"uploadDate":"2023-01-15"}}};
|
|
</script>
|
|
</body></html>
|
|
"#;
|
|
|
|
let result = try_extract(html).unwrap();
|
|
assert!(result.contains("# Rust in 100 Seconds"));
|
|
assert!(result.contains("**Channel:** Fireship"));
|
|
assert!(result.contains("5.4M"));
|
|
assert!(result.contains("2023-01-15"));
|
|
assert!(result.contains("2:00"));
|
|
assert!(result.contains("Learn Rust in 100 seconds."));
|
|
}
|
|
|
|
#[test]
|
|
fn returns_none_for_non_youtube_html() {
|
|
let html = "<html><body><p>Hello world</p></body></html>";
|
|
assert!(try_extract(html).is_none());
|
|
}
|
|
|
|
#[test]
|
|
fn handles_missing_optional_fields() {
|
|
let html = r#"
|
|
<html><body>
|
|
<script>
|
|
var ytInitialPlayerResponse = {"videoDetails":{"title":"Minimal Video","author":"Someone","viewCount":"100","shortDescription":"","lengthSeconds":"60"}};
|
|
</script>
|
|
</body></html>
|
|
"#;
|
|
|
|
let result = try_extract(html).unwrap();
|
|
assert!(result.contains("# Minimal Video"));
|
|
assert!(result.contains("**Channel:** Someone"));
|
|
// Upload date should be "Unknown" when microformat is missing
|
|
assert!(result.contains("Unknown"));
|
|
}
|
|
}
|