mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-08 22:25:12 +02:00
feat: v0.1.2 — TLS fallback, Safari default, Reddit fix, YouTube transcript infra
- Switch default profile to Safari26/Mac (best CF pass rate)
- Auto-fallback to plain client on connection error or 403
- Fixes: ycombinator.com, producthunt.com, and similar CF-strict sites
- Reddit .json endpoint uses plain client (TLS fingerprint was blocked)
- YouTube caption track extraction + timed text parser (core, not yet wired)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c90c0b6066
commit
afe4d3077d
4 changed files with 164 additions and 6 deletions
|
|
@@ -3,11 +3,18 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
## [0.1.2] — 2026-03-24
|
## [0.1.2] — 2026-03-25
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Default TLS profile switched from Chrome145/Win to Safari26/Mac (highest pass rate across CF-protected sites)
|
||||||
|
- Plain client fallback: when impersonated TLS gets connection error or 403, automatically retries without impersonation (fixes ycombinator.com, producthunt.com, and similar sites)
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Reddit scraping: use plain HTTP client for `.json` endpoint (TLS fingerprinting was getting blocked)
|
- Reddit scraping: use plain HTTP client for `.json` endpoint (TLS fingerprinting was getting blocked)
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- YouTube transcript extraction infrastructure in webclaw-core (caption track parsing, timed text XML parser) — wired up when cloud API launches
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## [0.1.1] — 2026-03-24
|
## [0.1.1] — 2026-03-24
|
||||||
|
|
|
||||||
|
|
@@ -127,6 +127,82 @@ fn format_view_count(raw: &str) -> String {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A single caption (subtitle) track listed in `ytInitialPlayerResponse`.
///
/// Instances are produced by `extract_caption_tracks`, which copies each field
/// out of one entry of the player response's `captionTracks` array.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CaptionTrack {
    /// The entry's `baseUrl` string (where the timed-text document lives).
    pub url: String,
    /// The entry's `languageCode`; `"en"` when the key is absent.
    pub lang: String,
    /// The entry's `name.simpleText`; falls back to the value of `lang`.
    pub name: String,
}
|
||||||
|
|
||||||
|
/// Extract caption track URLs from ytInitialPlayerResponse JSON.
|
||||||
|
/// Returns empty vec if no captions are available.
|
||||||
|
pub fn extract_caption_tracks(html: &str) -> Vec<CaptionTrack> {
|
||||||
|
let Some(json_str) = YT_PLAYER_RE.captures(html).and_then(|c| c.get(1)) else {
|
||||||
|
return vec![];
|
||||||
|
};
|
||||||
|
|
||||||
|
let Ok(value) = serde_json::from_str::<serde_json::Value>(json_str.as_str()) else {
|
||||||
|
return vec![];
|
||||||
|
};
|
||||||
|
|
||||||
|
let Some(tracks) = value
|
||||||
|
.get("captions")
|
||||||
|
.and_then(|c| c.get("playerCaptionsTracklistRenderer"))
|
||||||
|
.and_then(|r| r.get("captionTracks"))
|
||||||
|
.and_then(|t| t.as_array())
|
||||||
|
else {
|
||||||
|
return vec![];
|
||||||
|
};
|
||||||
|
|
||||||
|
tracks
|
||||||
|
.iter()
|
||||||
|
.filter_map(|t| {
|
||||||
|
let url = t.get("baseUrl")?.as_str()?.to_string();
|
||||||
|
let lang = t
|
||||||
|
.get("languageCode")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("en")
|
||||||
|
.to_string();
|
||||||
|
let name = t
|
||||||
|
.get("name")
|
||||||
|
.and_then(|v| v.get("simpleText"))
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or(&lang)
|
||||||
|
.to_string();
|
||||||
|
Some(CaptionTrack { url, lang, name })
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse YouTube timed text XML into plain transcript text.
|
||||||
|
/// The XML format is: `<transcript><text start="0" dur="1.5">Hello</text>...</transcript>`
|
||||||
|
pub fn parse_timed_text(xml: &str) -> String {
|
||||||
|
// Simple regex-based parsing to avoid adding an XML crate dependency.
|
||||||
|
// Extract text content between <text ...>...</text> tags.
|
||||||
|
static TEXT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<text[^>]*>([^<]*)</text>").unwrap());
|
||||||
|
|
||||||
|
let mut lines: Vec<String> = Vec::new();
|
||||||
|
for cap in TEXT_RE.captures_iter(xml) {
|
||||||
|
let text = cap[1].trim();
|
||||||
|
if text.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Decode XML entities
|
||||||
|
let decoded = text
|
||||||
|
.replace("&", "&")
|
||||||
|
.replace("<", "<")
|
||||||
|
.replace(">", ">")
|
||||||
|
.replace(""", "\"")
|
||||||
|
.replace("'", "'")
|
||||||
|
.replace("'", "'")
|
||||||
|
.replace("\n", " ");
|
||||||
|
lines.push(decoded);
|
||||||
|
}
|
||||||
|
|
||||||
|
lines.join(" ")
|
||||||
|
}
|
||||||
|
|
||||||
/// Format extracted metadata into structured markdown.
|
/// Format extracted metadata into structured markdown.
|
||||||
fn format_markdown(meta: &VideoMeta) -> String {
|
fn format_markdown(meta: &VideoMeta) -> String {
|
||||||
let mut md = format!("# {}\n\n", meta.title);
|
let mut md = format!("# {}\n\n", meta.title);
|
||||||
|
|
|
||||||
|
|
@@ -83,8 +83,8 @@ pub fn extra_profiles() -> Vec<ImpersonateProfile> {
|
||||||
|
|
||||||
pub fn latest_chrome() -> ImpersonateProfile {
|
pub fn latest_chrome() -> ImpersonateProfile {
|
||||||
ImpersonateProfile {
|
ImpersonateProfile {
|
||||||
browser: Impersonate::ChromeV145,
|
browser: Impersonate::SafariV26,
|
||||||
os: ImpersonateOS::Windows,
|
os: ImpersonateOS::MacOS,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -206,7 +206,11 @@ impl FetchClient {
|
||||||
Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
|
Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Single fetch attempt (no retry).
|
/// Single fetch attempt with automatic plain-client fallback.
|
||||||
|
///
|
||||||
|
/// If the TLS-impersonated client fails with a connection error or gets a 403,
|
||||||
|
/// retries with a plain client (no impersonation). Some sites (e.g. ycombinator.com)
|
||||||
|
/// reject forged TLS fingerprints but accept default rustls connections.
|
||||||
async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
|
async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
|
|
@@ -222,8 +226,47 @@ impl FetchClient {
|
||||||
ClientPool::Rotating { clients } => pick_random(clients),
|
ClientPool::Rotating { clients } => pick_random(clients),
|
||||||
};
|
};
|
||||||
|
|
||||||
let response = client.get(url).send().await?;
|
// Try impersonated client first
|
||||||
|
let needs_plain_fallback = match client.get(url).send().await {
|
||||||
|
Ok(response) => {
|
||||||
|
let status = response.status().as_u16();
|
||||||
|
if status == 403 {
|
||||||
|
debug!(url, "impersonated client got 403, trying plain fallback");
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
return Self::response_to_result(response, start).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_e) => {
|
||||||
|
debug!(
|
||||||
|
url,
|
||||||
|
"impersonated client connection failed, trying plain fallback"
|
||||||
|
);
|
||||||
|
true
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Plain client fallback (no TLS impersonation)
|
||||||
|
if needs_plain_fallback {
|
||||||
|
let plain = primp::Client::builder()
|
||||||
|
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36")
|
||||||
|
.cookie_store(true)
|
||||||
|
.timeout(Duration::from_secs(30))
|
||||||
|
.build()
|
||||||
|
.map_err(|e| FetchError::Build(format!("plain client: {e}")))?;
|
||||||
|
|
||||||
|
let response = plain.get(url).send().await?;
|
||||||
|
return Self::response_to_result(response, start).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
unreachable!()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert a primp Response into a FetchResult.
|
||||||
|
async fn response_to_result(
|
||||||
|
response: primp::Response,
|
||||||
|
start: Instant,
|
||||||
|
) -> Result<FetchResult, FetchError> {
|
||||||
let status = response.status().as_u16();
|
let status = response.status().as_u16();
|
||||||
let final_url = response.url().to_string();
|
let final_url = response.url().to_string();
|
||||||
|
|
||||||
|
|
@@ -301,7 +344,31 @@ impl FetchClient {
|
||||||
|
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
let client = self.pick_client(url);
|
let client = self.pick_client(url);
|
||||||
let response = client.get(url).send().await?;
|
|
||||||
|
// Try impersonated client, fall back to plain on connection error or 403
|
||||||
|
let response = match client.get(url).send().await {
|
||||||
|
Ok(resp) if resp.status().as_u16() == 403 => {
|
||||||
|
debug!(url, "impersonated client got 403, trying plain fallback");
|
||||||
|
let plain = primp::Client::builder()
|
||||||
|
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36")
|
||||||
|
.cookie_store(true)
|
||||||
|
.timeout(Duration::from_secs(30))
|
||||||
|
.build()
|
||||||
|
.map_err(|e| FetchError::Build(format!("plain fallback: {e}")))?;
|
||||||
|
plain.get(url).send().await?
|
||||||
|
}
|
||||||
|
Ok(resp) => resp,
|
||||||
|
Err(_e) => {
|
||||||
|
debug!(url, "impersonated client failed, trying plain fallback");
|
||||||
|
let plain = primp::Client::builder()
|
||||||
|
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36")
|
||||||
|
.cookie_store(true)
|
||||||
|
.timeout(Duration::from_secs(30))
|
||||||
|
.build()
|
||||||
|
.map_err(|e| FetchError::Build(format!("plain fallback: {e}")))?;
|
||||||
|
plain.get(url).send().await?
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let status = response.status().as_u16();
|
let status = response.status().as_u16();
|
||||||
let final_url = response.url().to_string();
|
let final_url = response.url().to_string();
|
||||||
|
|
@@ -351,6 +418,14 @@ impl FetchClient {
|
||||||
}
|
}
|
||||||
|
|
||||||
let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
|
let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
|
||||||
|
|
||||||
|
// YouTube transcript: caption URLs are IP-signed and expire immediately,
|
||||||
|
// so the timedtext endpoint returns empty responses. The innertube
|
||||||
|
// get_transcript API requires cookies/consent. Transcript extraction
|
||||||
|
// will be enabled via the cloud API (JS rendering + cookie jar).
|
||||||
|
// The extraction functions exist in webclaw_core::youtube but are not
|
||||||
|
// wired up here until we have a reliable fetch path.
|
||||||
|
|
||||||
Ok(extraction)
|
Ok(extraction)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue