mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
fix: use plain client for Reddit JSON endpoint
Reddit blocks TLS-fingerprinted clients on their .json API but accepts standard requests with a browser User-Agent. Switch to a non-impersonated primp client for the Reddit fallback path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
dff458d2f5
commit
907966a983
1 changed files with 9 additions and 3 deletions
|
|
@@ -274,13 +274,19 @@ impl FetchClient {
|
|||
url: &str,
|
||||
options: &webclaw_core::ExtractionOptions,
|
||||
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
||||
// Reddit fallback: use their JSON API to get post + full comment tree
|
||||
// Reddit fallback: use their JSON API to get post + full comment tree.
|
||||
// Uses a plain (non-impersonated) primp client — Reddit's JSON endpoint blocks TLS-fingerprinted clients
|
||||
// but accepts standard requests with a browser User-Agent.
|
||||
if crate::reddit::is_reddit_url(url) {
|
||||
let json_url = crate::reddit::json_url(url);
|
||||
debug!("reddit detected, fetching {json_url}");
|
||||
|
||||
let client = self.pick_client(&json_url);
|
||||
let response = client.get(&json_url).send().await?;
|
||||
let plain = primp::Client::builder()
|
||||
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36")
|
||||
.timeout(std::time::Duration::from_secs(15))
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(format!("reddit client: {e}")))?;
|
||||
let response = plain.get(&json_url).send().await?;
|
||||
if response.status().is_success() {
|
||||
let bytes = response
|
||||
.bytes()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue