From 907966a983f26bbea52b1cbe278fe22d24d6e542 Mon Sep 17 00:00:00 2001 From: Valerio Date: Tue, 24 Mar 2026 18:43:47 +0100 Subject: [PATCH] fix: use plain client for Reddit JSON endpoint Reddit blocks TLS-fingerprinted clients on their .json API but accepts standard requests with a browser User-Agent. Switch to a non-impersonated primp client for the Reddit fallback path. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/webclaw-fetch/src/client.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index db8a2c7..0cbacb2 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -274,13 +274,19 @@ impl FetchClient { url: &str, options: &webclaw_core::ExtractionOptions, ) -> Result { - // Reddit fallback: use their JSON API to get post + full comment tree + // Reddit fallback: use their JSON API to get post + full comment tree. + // Uses a plain reqwest client — Reddit's JSON endpoint blocks TLS-fingerprinted clients + // but accepts standard requests with a browser User-Agent. if crate::reddit::is_reddit_url(url) { let json_url = crate::reddit::json_url(url); debug!("reddit detected, fetching {json_url}"); - let client = self.pick_client(&json_url); - let response = client.get(&json_url).send().await?; + let plain = primp::Client::builder() + .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36") + .timeout(std::time::Duration::from_secs(15)) + .build() + .map_err(|e| FetchError::Build(format!("reddit client: {e}")))?; + let response = plain.get(&json_url).send().await?; if response.status().is_success() { let bytes = response .bytes()