mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
Compare commits
6 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a5c3433372 | ||
|
|
966981bc42 | ||
|
|
866fa88aa0 | ||
|
|
b413d702b2 | ||
|
|
98a177dec4 | ||
|
|
e1af2da509 |
7 changed files with 99 additions and 19 deletions
18
CHANGELOG.md
18
CHANGELOG.md
|
|
@ -3,6 +3,24 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [0.5.6] — 2026-04-23
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API with an identifiable bot `User-Agent`, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again.
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Regression introduced in 0.5.4 where the production server's `/v1/scrape` bypassed the Reddit `.json` shortcut and Akamai cookie warmup that `fetch_and_extract` had been providing. Both helpers now live in `fetch_smart` and every caller path picks them up.
|
||||||
|
- Panic in the markdown converter (`markdown.rs:925`) on single-pipe `|` lines. The `[1..len-1]` slice on a 1-char input became `[1..0]` and failed Rust's `begin <= end` slice-bounds check. Now guarded by a `len() >= 2` check.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.5.5] — 2026-04-23
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- `webclaw --browser safari-ios` on the CLI. Pairs with `--proxy` for DataDome-fronted sites that reject desktop profiles.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.5.4] — 2026-04-23
|
## [0.5.4] — 2026-04-23
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
|
||||||
|
|
@ -79,7 +79,7 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R
|
||||||
- **webclaw-fetch uses wreq 6.x** (BoringSSL). No `[patch.crates-io]` forks needed; wreq handles TLS internally.
|
- **webclaw-fetch uses wreq 6.x** (BoringSSL). No `[patch.crates-io]` forks needed; wreq handles TLS internally.
|
||||||
- **No special RUSTFLAGS** — `.cargo/config.toml` is currently empty of build flags. Don't add any.
|
- **No special RUSTFLAGS** — `.cargo/config.toml` is currently empty of build flags. Don't add any.
|
||||||
- **webclaw-llm uses plain reqwest**. LLM APIs don't need TLS fingerprinting, so no wreq dep.
|
- **webclaw-llm uses plain reqwest**. LLM APIs don't need TLS fingerprinting, so no wreq dep.
|
||||||
- **Vertical extractors take `&dyn Fetcher`**, not `&FetchClient`. This lets the production server plug in a `TlsSidecarFetcher` that routes through the Go tls-sidecar instead of in-process wreq.
|
- **Vertical extractors take `&dyn Fetcher`**, not `&FetchClient`. This lets the production server plug in a `ProductionFetcher` that adds domain_hints routing and antibot escalation on top of the same wreq client.
|
||||||
- **qwen3 thinking tags** (`<think>`) are stripped at both provider and consumer levels.
|
- **qwen3 thinking tags** (`<think>`) are stripped at both provider and consumer levels.
|
||||||
|
|
||||||
## Build & Test
|
## Build & Test
|
||||||
|
|
|
||||||
14
Cargo.lock
generated
14
Cargo.lock
generated
|
|
@ -3219,7 +3219,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.5.3"
|
version = "0.5.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3240,7 +3240,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.5.3"
|
version = "0.5.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3258,7 +3258,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.5.3"
|
version = "0.5.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
|
|
@ -3284,7 +3284,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.5.3"
|
version = "0.5.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3297,7 +3297,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.5.3"
|
version = "0.5.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dirs",
|
"dirs",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3317,7 +3317,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.5.3"
|
version = "0.5.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
@ -3326,7 +3326,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-server"
|
name = "webclaw-server"
|
||||||
version = "0.5.3"
|
version = "0.5.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"axum",
|
"axum",
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
||||||
members = ["crates/*"]
|
members = ["crates/*"]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.5.4"
|
version = "0.5.6"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "AGPL-3.0"
|
license = "AGPL-3.0"
|
||||||
repository = "https://github.com/0xMassi/webclaw"
|
repository = "https://github.com/0xMassi/webclaw"
|
||||||
|
|
|
||||||
|
|
@ -351,6 +351,9 @@ enum OutputFormat {
|
||||||
enum Browser {
|
enum Browser {
|
||||||
Chrome,
|
Chrome,
|
||||||
Firefox,
|
Firefox,
|
||||||
|
/// Safari iOS 26. Pair with a country-matched residential proxy for sites
|
||||||
|
/// that reject non-mobile profiles.
|
||||||
|
SafariIos,
|
||||||
Random,
|
Random,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -377,6 +380,7 @@ impl From<Browser> for BrowserProfile {
|
||||||
match b {
|
match b {
|
||||||
Browser::Chrome => BrowserProfile::Chrome,
|
Browser::Chrome => BrowserProfile::Chrome,
|
||||||
Browser::Firefox => BrowserProfile::Firefox,
|
Browser::Firefox => BrowserProfile::Firefox,
|
||||||
|
Browser::SafariIos => BrowserProfile::SafariIos,
|
||||||
Browser::Random => BrowserProfile::Random,
|
Browser::Random => BrowserProfile::Random,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -920,8 +920,10 @@ fn strip_markdown(md: &str) -> String {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs
|
// Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs.
|
||||||
if trimmed.starts_with('|') && trimmed.ends_with('|') {
|
// Require at least 2 chars so the slice `[1..len-1]` stays in bounds; this skips single-pipe rows
|
||||||
|
// (which aren't real tables anyway); a lone `|` previously panicked at `begin <= end`.
|
||||||
|
if trimmed.len() >= 2 && trimmed.starts_with('|') && trimmed.ends_with('|') {
|
||||||
let inner = &trimmed[1..trimmed.len() - 1];
|
let inner = &trimmed[1..trimmed.len() - 1];
|
||||||
let cells: Vec<&str> = inner.split('|').map(|c| c.trim()).collect();
|
let cells: Vec<&str> = inner.split('|').map(|c| c.trim()).collect();
|
||||||
lines.push(cells.join("\t"));
|
lines.push(cells.join("\t"));
|
||||||
|
|
|
||||||
|
|
@ -261,10 +261,65 @@ impl FetchClient {
|
||||||
self.cloud.as_deref()
|
self.cloud.as_deref()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Fetch a URL with per-site rescue paths: Reddit URLs are rewritten to the
|
||||||
|
/// `.json` API, and Akamai-style challenge responses trigger a homepage
|
||||||
|
/// cookie warmup and a retry. Returns the same `FetchResult` shape as
|
||||||
|
/// [`Self::fetch`] so every caller (CLI, MCP, OSS server, production
|
||||||
|
/// server) benefits without shape churn.
|
||||||
|
///
|
||||||
|
/// This is the method most callers want. Use plain [`Self::fetch`] only
|
||||||
|
/// when you need literal no-rescue behavior (e.g. inside the rescue
|
||||||
|
/// logic itself to avoid recursion).
|
||||||
|
pub async fn fetch_smart(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||||
|
// Reddit: the HTML page shows a verification interstitial for most
|
||||||
|
// client IPs, but appending `.json` returns the post + comment tree
|
||||||
|
// publicly. `parse_reddit_json` in downstream code knows how to read
|
||||||
|
// the result; here we just do the URL swap at the fetch layer.
|
||||||
|
if crate::reddit::is_reddit_url(url) && !url.ends_with(".json") {
|
||||||
|
let json_url = crate::reddit::json_url(url);
|
||||||
|
// Reddit's public .json API serves JSON to identifiable bot
|
||||||
|
// User-Agents and blocks browser UAs with a verification wall.
|
||||||
|
// Override our Chrome-profile UA for this specific call.
|
||||||
|
let ua = concat!(
|
||||||
|
"Webclaw/",
|
||||||
|
env!("CARGO_PKG_VERSION"),
|
||||||
|
" (+https://webclaw.io)"
|
||||||
|
);
|
||||||
|
if let Ok(resp) = self
|
||||||
|
.fetch_with_headers(&json_url, &[("user-agent", ua)])
|
||||||
|
.await
|
||||||
|
&& resp.status == 200
|
||||||
|
{
|
||||||
|
let first = resp.html.trim_start().as_bytes().first().copied();
|
||||||
|
if matches!(first, Some(b'{') | Some(b'[')) {
|
||||||
|
return Ok(resp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// If the .json fetch failed or returned HTML, fall through.
|
||||||
|
}
|
||||||
|
|
||||||
|
let resp = self.fetch(url).await?;
|
||||||
|
|
||||||
|
// Akamai / bazadebezolkohpepadr challenge: visit the homepage to
|
||||||
|
// collect warmup cookies (_abck, bm_sz, etc.), then retry.
|
||||||
|
if is_challenge_html(&resp.html)
|
||||||
|
&& let Some(homepage) = extract_homepage(url)
|
||||||
|
{
|
||||||
|
debug!("challenge detected, warming cookies via {homepage}");
|
||||||
|
let _ = self.fetch(&homepage).await;
|
||||||
|
if let Ok(retry) = self.fetch(url).await {
|
||||||
|
return Ok(retry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(resp)
|
||||||
|
}
|
||||||
|
|
||||||
/// Fetch a URL and return the raw HTML + response metadata.
|
/// Fetch a URL and return the raw HTML + response metadata.
|
||||||
///
|
///
|
||||||
/// Automatically retries on transient failures (network errors, 5xx, 429)
|
/// Automatically retries on transient failures (network errors, 5xx, 429)
|
||||||
/// with exponential backoff: 0s, 1s (2 attempts total).
|
/// with exponential backoff: 0s, 1s (2 attempts total). No per-site
|
||||||
|
/// rescue logic; use [`Self::fetch_smart`] for that.
|
||||||
#[instrument(skip(self), fields(url = %url))]
|
#[instrument(skip(self), fields(url = %url))]
|
||||||
pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
|
pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||||
let delays = [Duration::ZERO, Duration::from_secs(1)];
|
let delays = [Duration::ZERO, Duration::from_secs(1)];
|
||||||
|
|
@ -713,22 +768,23 @@ fn is_pdf_content_type(headers: &http::HeaderMap) -> bool {
|
||||||
|
|
||||||
/// Detect if a response looks like a bot protection challenge page.
|
/// Detect if a response looks like a bot protection challenge page.
|
||||||
fn is_challenge_response(response: &Response) -> bool {
|
fn is_challenge_response(response: &Response) -> bool {
|
||||||
let len = response.body().len();
|
is_challenge_html(response.text().as_ref())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same as `is_challenge_response`, operating on a body string directly
|
||||||
|
/// so callers holding a `FetchResult` can reuse the heuristic.
|
||||||
|
fn is_challenge_html(html: &str) -> bool {
|
||||||
|
let len = html.len();
|
||||||
if len > 15_000 || len == 0 {
|
if len > 15_000 || len == 0 {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
let lower = html.to_lowercase();
|
||||||
let text = response.text();
|
|
||||||
let lower = text.to_lowercase();
|
|
||||||
|
|
||||||
if lower.contains("<title>challenge page</title>") {
|
if lower.contains("<title>challenge page</title>") {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if lower.contains("bazadebezolkohpepadr") && len < 5_000 {
|
if lower.contains("bazadebezolkohpepadr") && len < 5_000 {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue