diff --git a/CHANGELOG.md b/CHANGELOG.md index 207e41d..951d8d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,18 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). -## [0.1.2] — 2026-03-24 +## [0.1.2] — 2026-03-25 + +### Changed +- Default TLS profile switched from Chrome145/Win to Safari26/Mac (highest pass rate across CF-protected sites) +- Plain client fallback: when impersonated TLS gets connection error or 403, automatically retries without impersonation (fixes ycombinator.com, producthunt.com, and similar sites) ### Fixed - Reddit scraping: use plain HTTP client for `.json` endpoint (TLS fingerprinting was getting blocked) +### Added +- YouTube transcript extraction infrastructure in webclaw-core (caption track parsing, timed text XML parser) — wired up when cloud API launches + --- ## [0.1.1] — 2026-03-24 diff --git a/crates/webclaw-core/src/youtube.rs b/crates/webclaw-core/src/youtube.rs index 61f4914..896a7db 100644 --- a/crates/webclaw-core/src/youtube.rs +++ b/crates/webclaw-core/src/youtube.rs @@ -127,6 +127,82 @@ fn format_view_count(raw: &str) -> String { } } +/// A caption track URL extracted from ytInitialPlayerResponse. +#[derive(Debug, Clone)] +pub struct CaptionTrack { + pub url: String, + pub lang: String, + pub name: String, +} + +/// Extract caption track URLs from ytInitialPlayerResponse JSON. +/// Returns empty vec if no captions are available. +pub fn extract_caption_tracks(html: &str) -> Vec { + let Some(json_str) = YT_PLAYER_RE.captures(html).and_then(|c| c.get(1)) else { + return vec![]; + }; + + let Ok(value) = serde_json::from_str::(json_str.as_str()) else { + return vec![]; + }; + + let Some(tracks) = value + .get("captions") + .and_then(|c| c.get("playerCaptionsTracklistRenderer")) + .and_then(|r| r.get("captionTracks")) + .and_then(|t| t.as_array()) + else { + return vec![]; + }; + + tracks + .iter() + .filter_map(|t| { + let url = t.get("baseUrl")?.as_str()?.to_string(); + let lang = t + .get("languageCode") + .and_then(|v| v.as_str()) + .unwrap_or("en") + .to_string(); + let name = t + .get("name") + .and_then(|v| v.get("simpleText")) + .and_then(|v| v.as_str()) + .unwrap_or(&lang) + .to_string(); + Some(CaptionTrack { url, lang, name }) + }) + .collect() +} + +/// Parse YouTube timed text XML into plain transcript text. +/// The XML format is: `Hello...` +pub fn parse_timed_text(xml: &str) -> String { + // Simple regex-based parsing to avoid adding an XML crate dependency. + // Extract text content between ... tags. + static TEXT_RE: Lazy = Lazy::new(|| Regex::new(r"]*>([^<]*)").unwrap()); + + let mut lines: Vec = Vec::new(); + for cap in TEXT_RE.captures_iter(xml) { + let text = cap[1].trim(); + if text.is_empty() { + continue; + } + // Decode XML entities + let decoded = text + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", "\"") + .replace("'", "'") + .replace("'", "'") + .replace("\n", " "); + lines.push(decoded); + } + + lines.join(" ") +} + /// Format extracted metadata into structured markdown. fn format_markdown(meta: &VideoMeta) -> String { let mut md = format!("# {}\n\n", meta.title); diff --git a/crates/webclaw-fetch/src/browser.rs b/crates/webclaw-fetch/src/browser.rs index 1a71a31..c35fab5 100644 --- a/crates/webclaw-fetch/src/browser.rs +++ b/crates/webclaw-fetch/src/browser.rs @@ -83,8 +83,8 @@ pub fn extra_profiles() -> Vec { pub fn latest_chrome() -> ImpersonateProfile { ImpersonateProfile { - browser: Impersonate::ChromeV145, - os: ImpersonateOS::Windows, + browser: Impersonate::SafariV26, + os: ImpersonateOS::MacOS, } } diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index 0cbacb2..ef6c249 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -206,7 +206,11 @@ impl FetchClient { Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into()))) } - /// Single fetch attempt (no retry). + /// Single fetch attempt with automatic plain-client fallback. + /// + /// If the TLS-impersonated client fails with a connection error or gets a 403, + /// retries with a plain client (no impersonation). Some sites (e.g. ycombinator.com) + /// reject forged TLS fingerprints but accept default rustls connections. async fn fetch_once(&self, url: &str) -> Result { let start = Instant::now(); @@ -222,8 +226,47 @@ impl FetchClient { ClientPool::Rotating { clients } => pick_random(clients), }; - let response = client.get(url).send().await?; + // Try impersonated client first + let needs_plain_fallback = match client.get(url).send().await { + Ok(response) => { + let status = response.status().as_u16(); + if status == 403 { + debug!(url, "impersonated client got 403, trying plain fallback"); + true + } else { + return Self::response_to_result(response, start).await; + } + } + Err(_e) => { + debug!( + url, + "impersonated client connection failed, trying plain fallback" + ); + true + } + }; + // Plain client fallback (no TLS impersonation) + if needs_plain_fallback { + let plain = primp::Client::builder() + .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36") + .cookie_store(true) + .timeout(Duration::from_secs(30)) + .build() + .map_err(|e| FetchError::Build(format!("plain client: {e}")))?; + + let response = plain.get(url).send().await?; + return Self::response_to_result(response, start).await; + } + + unreachable!() + } + + /// Convert a primp Response into a FetchResult. + async fn response_to_result( + response: primp::Response, + start: Instant, + ) -> Result { let status = response.status().as_u16(); let final_url = response.url().to_string(); @@ -301,7 +344,31 @@ impl FetchClient { let start = Instant::now(); let client = self.pick_client(url); - let response = client.get(url).send().await?; + + // Try impersonated client, fall back to plain on connection error or 403 + let response = match client.get(url).send().await { + Ok(resp) if resp.status().as_u16() == 403 => { + debug!(url, "impersonated client got 403, trying plain fallback"); + let plain = primp::Client::builder() + .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36") + .cookie_store(true) + .timeout(Duration::from_secs(30)) + .build() + .map_err(|e| FetchError::Build(format!("plain fallback: {e}")))?; + plain.get(url).send().await? + } + Ok(resp) => resp, + Err(_e) => { + debug!(url, "impersonated client failed, trying plain fallback"); + let plain = primp::Client::builder() + .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36") + .cookie_store(true) + .timeout(Duration::from_secs(30)) + .build() + .map_err(|e| FetchError::Build(format!("plain fallback: {e}")))?; + plain.get(url).send().await? + } + }; let status = response.status().as_u16(); let final_url = response.url().to_string(); @@ -351,6 +418,14 @@ impl FetchClient { } let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?; + + // YouTube transcript: caption URLs are IP-signed and expire immediately, + // so the timedtext endpoint returns empty responses. The innertube + // get_transcript API requires cookies/consent. Transcript extraction + // will be enabled via the cloud API (JS rendering + cookie jar). + // The extraction functions exist in webclaw_core::youtube but are not + // wired up here until we have a reliable fetch path. + Ok(extraction) } }