mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat: v0.1.2 — TLS fallback, Safari default, Reddit fix, YouTube transcript infra
- Switch default profile to Safari26/Mac (best CF pass rate) - Auto-fallback to plain client on connection error or 403 - Fixes: ycombinator.com, producthunt.com, and similar CF-strict sites - Reddit .json endpoint uses plain client (TLS fingerprint was blocked) - YouTube caption track extraction + timed text parser (core, not yet wired) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c90c0b6066
commit
afe4d3077d
4 changed files with 164 additions and 6 deletions
|
|
@ -3,11 +3,18 @@
|
|||
All notable changes to webclaw are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.1.2] — 2026-03-24
|
||||
## [0.1.2] — 2026-03-25
|
||||
|
||||
### Changed
|
||||
- Default TLS profile switched from Chrome145/Win to Safari26/Mac (highest pass rate across CF-protected sites)
|
||||
- Plain client fallback: when impersonated TLS gets connection error or 403, automatically retries without impersonation (fixes ycombinator.com, producthunt.com, and similar sites)
|
||||
|
||||
### Fixed
|
||||
- Reddit scraping: use plain HTTP client for `.json` endpoint (TLS fingerprinting was getting blocked)
|
||||
|
||||
### Added
|
||||
- YouTube transcript extraction infrastructure in webclaw-core (caption track parsing, timed text XML parser) — not yet wired into the fetch path; will be enabled when the cloud API launches
|
||||
|
||||
---
|
||||
|
||||
## [0.1.1] — 2026-03-24
|
||||
|
|
|
|||
|
|
@ -127,6 +127,82 @@ fn format_view_count(raw: &str) -> String {
|
|||
}
|
||||
}
|
||||
|
||||
/// A caption track URL extracted from ytInitialPlayerResponse.
#[derive(Debug, Clone)]
pub struct CaptionTrack {
    // Direct URL of the timed-text resource (the track's `baseUrl` field).
    pub url: String,
    // Language code from `languageCode`; "en" is used when the field is absent.
    pub lang: String,
    // Human-readable track name from `name.simpleText`; falls back to `lang`.
    pub name: String,
}
|
||||
|
||||
/// Extract caption track URLs from ytInitialPlayerResponse JSON.
|
||||
/// Returns empty vec if no captions are available.
|
||||
pub fn extract_caption_tracks(html: &str) -> Vec<CaptionTrack> {
|
||||
let Some(json_str) = YT_PLAYER_RE.captures(html).and_then(|c| c.get(1)) else {
|
||||
return vec![];
|
||||
};
|
||||
|
||||
let Ok(value) = serde_json::from_str::<serde_json::Value>(json_str.as_str()) else {
|
||||
return vec![];
|
||||
};
|
||||
|
||||
let Some(tracks) = value
|
||||
.get("captions")
|
||||
.and_then(|c| c.get("playerCaptionsTracklistRenderer"))
|
||||
.and_then(|r| r.get("captionTracks"))
|
||||
.and_then(|t| t.as_array())
|
||||
else {
|
||||
return vec![];
|
||||
};
|
||||
|
||||
tracks
|
||||
.iter()
|
||||
.filter_map(|t| {
|
||||
let url = t.get("baseUrl")?.as_str()?.to_string();
|
||||
let lang = t
|
||||
.get("languageCode")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("en")
|
||||
.to_string();
|
||||
let name = t
|
||||
.get("name")
|
||||
.and_then(|v| v.get("simpleText"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or(&lang)
|
||||
.to_string();
|
||||
Some(CaptionTrack { url, lang, name })
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Parse YouTube timed text XML into plain transcript text.
///
/// The XML format is: `<transcript><text start="0" dur="1.5">Hello</text>...</transcript>`.
/// Each segment is trimmed, XML-entity-decoded, newline-flattened, and the
/// non-empty segments are joined with single spaces. Returns an empty string
/// when no non-empty segments are found.
pub fn parse_timed_text(xml: &str) -> String {
    // Hand-rolled scan over `<text ...>...</text>` pairs to avoid adding an
    // XML crate dependency (the format is trivially regular: attributes never
    // contain '>', and segment bodies never contain '<').
    let mut lines: Vec<String> = Vec::new();
    let mut rest = xml;

    while let Some(open) = rest.find("<text") {
        rest = &rest[open + "<text".len()..];
        // End of the opening tag.
        let Some(gt) = rest.find('>') else { break };
        rest = &rest[gt + 1..];
        // Capture up to the next '<'; it must begin the closing </text> tag,
        // mirroring the `<text[^>]*>([^<]*)</text>` pattern this replaces.
        let Some(lt) = rest.find('<') else { break };
        let (content, tail) = rest.split_at(lt);
        if tail.starts_with("</text>") {
            let text = content.trim();
            if !text.is_empty() {
                lines.push(decode_xml_entities(text));
            }
        }
        rest = tail;
    }

    lines.join(" ")
}

/// Decode the XML entities YouTube emits in timed-text payloads and flatten
/// embedded newlines to spaces.
fn decode_xml_entities(text: &str) -> String {
    // `&amp;` is decoded LAST so that double-escaped input such as `&amp;lt;`
    // correctly yields the literal text `&lt;` instead of `<`.
    text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace('\n', " ")
        .replace("&amp;", "&")
}
|
||||
|
||||
/// Format extracted metadata into structured markdown.
|
||||
fn format_markdown(meta: &VideoMeta) -> String {
|
||||
let mut md = format!("# {}\n\n", meta.title);
|
||||
|
|
|
|||
|
|
@ -83,8 +83,8 @@ pub fn extra_profiles() -> Vec<ImpersonateProfile> {
|
|||
|
||||
pub fn latest_chrome() -> ImpersonateProfile {
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::ChromeV145,
|
||||
os: ImpersonateOS::Windows,
|
||||
browser: Impersonate::SafariV26,
|
||||
os: ImpersonateOS::MacOS,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -206,7 +206,11 @@ impl FetchClient {
|
|||
Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
|
||||
}
|
||||
|
||||
/// Single fetch attempt (no retry).
|
||||
/// Single fetch attempt with automatic plain-client fallback.
|
||||
///
|
||||
/// If the TLS-impersonated client fails with a connection error or gets a 403,
|
||||
/// retries with a plain client (no impersonation). Some sites (e.g. ycombinator.com)
|
||||
/// reject forged TLS fingerprints but accept default rustls connections.
|
||||
async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||
let start = Instant::now();
|
||||
|
||||
|
|
@ -222,8 +226,47 @@ impl FetchClient {
|
|||
ClientPool::Rotating { clients } => pick_random(clients),
|
||||
};
|
||||
|
||||
let response = client.get(url).send().await?;
|
||||
// Try impersonated client first
|
||||
let needs_plain_fallback = match client.get(url).send().await {
|
||||
Ok(response) => {
|
||||
let status = response.status().as_u16();
|
||||
if status == 403 {
|
||||
debug!(url, "impersonated client got 403, trying plain fallback");
|
||||
true
|
||||
} else {
|
||||
return Self::response_to_result(response, start).await;
|
||||
}
|
||||
}
|
||||
Err(_e) => {
|
||||
debug!(
|
||||
url,
|
||||
"impersonated client connection failed, trying plain fallback"
|
||||
);
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
// Plain client fallback (no TLS impersonation)
|
||||
if needs_plain_fallback {
|
||||
let plain = primp::Client::builder()
|
||||
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36")
|
||||
.cookie_store(true)
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(format!("plain client: {e}")))?;
|
||||
|
||||
let response = plain.get(url).send().await?;
|
||||
return Self::response_to_result(response, start).await;
|
||||
}
|
||||
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
/// Convert a primp Response into a FetchResult.
|
||||
async fn response_to_result(
|
||||
response: primp::Response,
|
||||
start: Instant,
|
||||
) -> Result<FetchResult, FetchError> {
|
||||
let status = response.status().as_u16();
|
||||
let final_url = response.url().to_string();
|
||||
|
||||
|
|
@ -301,7 +344,31 @@ impl FetchClient {
|
|||
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
let response = client.get(url).send().await?;
|
||||
|
||||
// Try impersonated client, fall back to plain on connection error or 403
|
||||
let response = match client.get(url).send().await {
|
||||
Ok(resp) if resp.status().as_u16() == 403 => {
|
||||
debug!(url, "impersonated client got 403, trying plain fallback");
|
||||
let plain = primp::Client::builder()
|
||||
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36")
|
||||
.cookie_store(true)
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(format!("plain fallback: {e}")))?;
|
||||
plain.get(url).send().await?
|
||||
}
|
||||
Ok(resp) => resp,
|
||||
Err(_e) => {
|
||||
debug!(url, "impersonated client failed, trying plain fallback");
|
||||
let plain = primp::Client::builder()
|
||||
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36")
|
||||
.cookie_store(true)
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(format!("plain fallback: {e}")))?;
|
||||
plain.get(url).send().await?
|
||||
}
|
||||
};
|
||||
|
||||
let status = response.status().as_u16();
|
||||
let final_url = response.url().to_string();
|
||||
|
|
@ -351,6 +418,14 @@ impl FetchClient {
|
|||
}
|
||||
|
||||
let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
|
||||
|
||||
// YouTube transcript: caption URLs are IP-signed and expire immediately,
|
||||
// so the timedtext endpoint returns empty responses. The innertube
|
||||
// get_transcript API requires cookies/consent. Transcript extraction
|
||||
// will be enabled via the cloud API (JS rendering + cookie jar).
|
||||
// The extraction functions exist in webclaw_core::youtube but are not
|
||||
// wired up here until we have a reliable fetch path.
|
||||
|
||||
Ok(extraction)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue