diff --git a/CHANGELOG.md b/CHANGELOG.md index 938a0b4..7cfd1e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,20 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.5.1] — 2026-04-22 + +### Added +- **`webclaw_fetch::Fetcher` trait.** Vertical extractors now consume `&dyn Fetcher` instead of `&FetchClient` directly. The trait exposes three methods (`fetch`, `fetch_with_headers`, `cloud`) covering everything extractors need. Callers that already held a `FetchClient` keep working unchanged: `FetchClient` implements `Fetcher`, blanket impls cover `&T` and `Arc<T>`, so `&client` coerces to `&dyn Fetcher` automatically. + + The motivation is the split between OSS (wreq-backed, in-process TLS fingerprinting) and the production API server at api.webclaw.io (which cannot use in-process fingerprinting per the architecture rule, and must delegate HTTP through the Go tls-sidecar). Before this trait, adding vertical routes to the production server would have required importing wreq into its dependency graph, violating the separation. Now the production server can provide its own `TlsSidecarFetcher` implementation and pass it to the same extractor dispatcher the OSS server uses. + + Backwards compatible. No behavior change for CLI, MCP, or OSS self-host. + +### Changed +- All 28 extractor `extract()` signatures migrated from `client: &FetchClient` to `client: &dyn Fetcher`. The dispatcher functions (`extractors::dispatch_by_url`, `extractors::dispatch_by_name`) and the cloud escalation helpers (`cloud::smart_fetch`, `cloud::smart_fetch_html`) follow the same change. Tests and call sites are unchanged because `&FetchClient` auto-coerces. 
+ +--- + ## [0.5.0] — 2026-04-22 ### Added diff --git a/Cargo.lock b/Cargo.lock index 3603981..bad52e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3199,7 +3199,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.5.0" +version = "0.5.1" dependencies = [ "clap", "dotenvy", @@ -3220,7 +3220,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.5.0" +version = "0.5.1" dependencies = [ "ego-tree", "once_cell", @@ -3238,8 +3238,9 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.5.0" +version = "0.5.1" dependencies = [ + "async-trait", "bytes", "calamine", "http", @@ -3262,7 +3263,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.5.0" +version = "0.5.1" dependencies = [ "async-trait", "reqwest", @@ -3275,7 +3276,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.5.0" +version = "0.5.1" dependencies = [ "dirs", "dotenvy", @@ -3295,7 +3296,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.5.0" +version = "0.5.1" dependencies = [ "pdf-extract", "thiserror", @@ -3304,7 +3305,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.5.0" +version = "0.5.1" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index e8b2677..92152f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.5.0" +version = "0.5.1" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-fetch/Cargo.toml b/crates/webclaw-fetch/Cargo.toml index 2ec9b9d..a47ba7e 100644 --- a/crates/webclaw-fetch/Cargo.toml +++ b/crates/webclaw-fetch/Cargo.toml @@ -12,6 +12,7 @@ serde = { workspace = true } thiserror = { workspace = true } tracing = { workspace = true } tokio = { workspace = true } +async-trait = "0.1" wreq = { version = "6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] } http = "1" bytes = "1" diff --git 
a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index 7ce16d7..8fd5ff5 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -599,6 +599,36 @@ impl FetchClient { } } +// --------------------------------------------------------------------------- +// Fetcher trait implementation +// +// Vertical extractors consume the [`crate::fetcher::Fetcher`] trait +// rather than `FetchClient` directly, which is what lets the production +// API server swap in a tls-sidecar-backed implementation without +// pulling wreq into its dependency graph. For everyone else (CLI, MCP, +// self-hosted OSS server) this impl means "pass the FetchClient you +// already have; nothing changes". +// --------------------------------------------------------------------------- + +#[async_trait::async_trait] +impl crate::fetcher::Fetcher for FetchClient { + async fn fetch(&self, url: &str) -> Result { + FetchClient::fetch(self, url).await + } + + async fn fetch_with_headers( + &self, + url: &str, + headers: &[(&str, &str)], + ) -> Result { + FetchClient::fetch_with_headers(self, url, headers).await + } + + fn cloud(&self) -> Option<&crate::cloud::CloudClient> { + FetchClient::cloud(self) + } +} + /// Collect the browser variants to use based on the browser profile. fn collect_variants(profile: &BrowserProfile) -> Vec { match profile { diff --git a/crates/webclaw-fetch/src/cloud.rs b/crates/webclaw-fetch/src/cloud.rs index c70a75e..3bad383 100644 --- a/crates/webclaw-fetch/src/cloud.rs +++ b/crates/webclaw-fetch/src/cloud.rs @@ -66,7 +66,9 @@ use serde_json::{Value, json}; use thiserror::Error; use tracing::{debug, info, warn}; -use crate::client::FetchClient; +// Client type isn't needed here anymore now that smart_fetch* takes +// `&dyn Fetcher`. Kept as a comment for historical context: this +// module used to import FetchClient directly before v0.5.1. 
// --------------------------------------------------------------------------- // URLs + defaults — keep in one place so "change the signup link" is a @@ -506,7 +508,7 @@ pub enum SmartFetchResult { /// Prefer [`smart_fetch_html`] for new callers — it surfaces the typed /// [`CloudError`] so you can render precise UX. pub async fn smart_fetch( - client: &FetchClient, + client: &dyn crate::fetcher::Fetcher, cloud: Option<&CloudClient>, url: &str, include_selectors: &[String], @@ -613,7 +615,7 @@ pub struct FetchedHtml { /// Designed for the vertical-extractor pattern where the caller has /// its own parser and just needs bytes. pub async fn smart_fetch_html( - client: &FetchClient, + client: &dyn crate::fetcher::Fetcher, cloud: Option<&CloudClient>, url: &str, ) -> Result { diff --git a/crates/webclaw-fetch/src/extractors/amazon_product.rs b/crates/webclaw-fetch/src/extractors/amazon_product.rs index 7f022fb..fed6b9f 100644 --- a/crates/webclaw-fetch/src/extractors/amazon_product.rs +++ b/crates/webclaw-fetch/src/extractors/amazon_product.rs @@ -32,9 +32,9 @@ use regex::Regex; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::cloud::{self, CloudError}; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "amazon_product", @@ -59,7 +59,7 @@ pub fn matches(url: &str) -> bool { parse_asin(url).is_some() } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let asin = parse_asin(url) .ok_or_else(|| FetchError::Build(format!("amazon_product: no ASIN in '{url}'")))?; diff --git a/crates/webclaw-fetch/src/extractors/arxiv.rs b/crates/webclaw-fetch/src/extractors/arxiv.rs index cbcb3d1..c2b85c0 100644 --- a/crates/webclaw-fetch/src/extractors/arxiv.rs +++ b/crates/webclaw-fetch/src/extractors/arxiv.rs @@ -10,8 +10,8 @@ use quick_xml::events::Event; use serde_json::{Value, json}; use 
super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "arxiv", @@ -32,7 +32,7 @@ pub fn matches(url: &str) -> bool { url.contains("/abs/") || url.contains("/pdf/") } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let id = parse_id(url) .ok_or_else(|| FetchError::Build(format!("arxiv: cannot parse id from '{url}'")))?; diff --git a/crates/webclaw-fetch/src/extractors/crates_io.rs b/crates/webclaw-fetch/src/extractors/crates_io.rs index 915b1c3..719579f 100644 --- a/crates/webclaw-fetch/src/extractors/crates_io.rs +++ b/crates/webclaw-fetch/src/extractors/crates_io.rs @@ -9,8 +9,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "crates_io", @@ -30,7 +30,7 @@ pub fn matches(url: &str) -> bool { url.contains("/crates/") } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let name = parse_name(url) .ok_or_else(|| FetchError::Build(format!("crates.io: cannot parse name from '{url}'")))?; diff --git a/crates/webclaw-fetch/src/extractors/dev_to.rs b/crates/webclaw-fetch/src/extractors/dev_to.rs index 49372ce..86199d8 100644 --- a/crates/webclaw-fetch/src/extractors/dev_to.rs +++ b/crates/webclaw-fetch/src/extractors/dev_to.rs @@ -8,8 +8,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "dev_to", @@ -61,7 +61,7 @@ const RESERVED_FIRST_SEGS: &[&str] = &[ "t", ]; -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub 
async fn extract(client: &dyn Fetcher, url: &str) -> Result { let (username, slug) = parse_username_slug(url).ok_or_else(|| { FetchError::Build(format!("dev_to: cannot parse username/slug from '{url}'")) })?; diff --git a/crates/webclaw-fetch/src/extractors/docker_hub.rs b/crates/webclaw-fetch/src/extractors/docker_hub.rs index 15c928c..bce9315 100644 --- a/crates/webclaw-fetch/src/extractors/docker_hub.rs +++ b/crates/webclaw-fetch/src/extractors/docker_hub.rs @@ -8,8 +8,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "docker_hub", @@ -29,7 +29,7 @@ pub fn matches(url: &str) -> bool { url.contains("/_/") || url.contains("/r/") } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let (namespace, name) = parse_repo(url) .ok_or_else(|| FetchError::Build(format!("docker_hub: cannot parse repo from '{url}'")))?; diff --git a/crates/webclaw-fetch/src/extractors/ebay_listing.rs b/crates/webclaw-fetch/src/extractors/ebay_listing.rs index 14c36ef..dbc85ab 100644 --- a/crates/webclaw-fetch/src/extractors/ebay_listing.rs +++ b/crates/webclaw-fetch/src/extractors/ebay_listing.rs @@ -14,9 +14,9 @@ use regex::Regex; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::cloud::{self, CloudError}; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "ebay_listing", @@ -39,7 +39,7 @@ pub fn matches(url: &str) -> bool { parse_item_id(url).is_some() } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let item_id = parse_item_id(url) .ok_or_else(|| FetchError::Build(format!("ebay_listing: no item id in '{url}'")))?; diff --git 
a/crates/webclaw-fetch/src/extractors/ecommerce_product.rs b/crates/webclaw-fetch/src/extractors/ecommerce_product.rs index 099a8fb..019fb68 100644 --- a/crates/webclaw-fetch/src/extractors/ecommerce_product.rs +++ b/crates/webclaw-fetch/src/extractors/ecommerce_product.rs @@ -42,8 +42,8 @@ use regex::Regex; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "ecommerce_product", @@ -69,7 +69,7 @@ pub fn matches(url: &str) -> bool { !host_of(url).is_empty() } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let resp = client.fetch(url).await?; if !(200..300).contains(&resp.status) { return Err(FetchError::Build(format!( diff --git a/crates/webclaw-fetch/src/extractors/etsy_listing.rs b/crates/webclaw-fetch/src/extractors/etsy_listing.rs index 060c3b6..ea9ed0b 100644 --- a/crates/webclaw-fetch/src/extractors/etsy_listing.rs +++ b/crates/webclaw-fetch/src/extractors/etsy_listing.rs @@ -26,9 +26,9 @@ use regex::Regex; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::cloud::{self, CloudError}; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "etsy_listing", @@ -49,7 +49,7 @@ pub fn matches(url: &str) -> bool { parse_listing_id(url).is_some() } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let listing_id = parse_listing_id(url) .ok_or_else(|| FetchError::Build(format!("etsy_listing: no listing id in '{url}'")))?; diff --git a/crates/webclaw-fetch/src/extractors/github_issue.rs b/crates/webclaw-fetch/src/extractors/github_issue.rs index 436faa9..9a64f21 100644 --- a/crates/webclaw-fetch/src/extractors/github_issue.rs +++ 
b/crates/webclaw-fetch/src/extractors/github_issue.rs @@ -10,8 +10,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "github_issue", @@ -34,7 +34,7 @@ pub fn matches(url: &str) -> bool { parse_issue(url).is_some() } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let (owner, repo, number) = parse_issue(url).ok_or_else(|| { FetchError::Build(format!("github_issue: cannot parse issue URL '{url}'")) })?; diff --git a/crates/webclaw-fetch/src/extractors/github_pr.rs b/crates/webclaw-fetch/src/extractors/github_pr.rs index 9d4b95a..266d3cd 100644 --- a/crates/webclaw-fetch/src/extractors/github_pr.rs +++ b/crates/webclaw-fetch/src/extractors/github_pr.rs @@ -9,8 +9,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "github_pr", @@ -33,7 +33,7 @@ pub fn matches(url: &str) -> bool { parse_pr(url).is_some() } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let (owner, repo, number) = parse_pr(url).ok_or_else(|| { FetchError::Build(format!("github_pr: cannot parse pull-request URL '{url}'")) })?; diff --git a/crates/webclaw-fetch/src/extractors/github_release.rs b/crates/webclaw-fetch/src/extractors/github_release.rs index b019550..7699d09 100644 --- a/crates/webclaw-fetch/src/extractors/github_release.rs +++ b/crates/webclaw-fetch/src/extractors/github_release.rs @@ -8,8 +8,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use 
crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "github_release", @@ -32,7 +32,7 @@ pub fn matches(url: &str) -> bool { parse_release(url).is_some() } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let (owner, repo, tag) = parse_release(url).ok_or_else(|| { FetchError::Build(format!("github_release: cannot parse release URL '{url}'")) })?; diff --git a/crates/webclaw-fetch/src/extractors/github_repo.rs b/crates/webclaw-fetch/src/extractors/github_repo.rs index d89d06a..2a62aa3 100644 --- a/crates/webclaw-fetch/src/extractors/github_repo.rs +++ b/crates/webclaw-fetch/src/extractors/github_repo.rs @@ -10,8 +10,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "github_repo", @@ -70,7 +70,7 @@ const RESERVED_OWNERS: &[&str] = &[ "about", ]; -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let (owner, repo) = parse_owner_repo(url).ok_or_else(|| { FetchError::Build(format!("github_repo: cannot parse owner/repo from '{url}'")) })?; diff --git a/crates/webclaw-fetch/src/extractors/hackernews.rs b/crates/webclaw-fetch/src/extractors/hackernews.rs index 7adaa1c..91d4520 100644 --- a/crates/webclaw-fetch/src/extractors/hackernews.rs +++ b/crates/webclaw-fetch/src/extractors/hackernews.rs @@ -10,8 +10,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "hackernews", @@ -40,7 +40,7 @@ pub fn matches(url: &str) -> bool { false } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn 
extract(client: &dyn Fetcher, url: &str) -> Result { let id = parse_item_id(url).ok_or_else(|| { FetchError::Build(format!("hackernews: cannot parse item id from '{url}'")) })?; diff --git a/crates/webclaw-fetch/src/extractors/huggingface_dataset.rs b/crates/webclaw-fetch/src/extractors/huggingface_dataset.rs index cb1f524..e1f84f7 100644 --- a/crates/webclaw-fetch/src/extractors/huggingface_dataset.rs +++ b/crates/webclaw-fetch/src/extractors/huggingface_dataset.rs @@ -7,8 +7,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "huggingface_dataset", @@ -38,7 +38,7 @@ pub fn matches(url: &str) -> bool { segs.first().copied() == Some("datasets") && (segs.len() == 2 || segs.len() == 3) } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let dataset_path = parse_dataset_path(url).ok_or_else(|| { FetchError::Build(format!( "hf_dataset: cannot parse dataset path from '{url}'" diff --git a/crates/webclaw-fetch/src/extractors/huggingface_model.rs b/crates/webclaw-fetch/src/extractors/huggingface_model.rs index decc68a..4c549e0 100644 --- a/crates/webclaw-fetch/src/extractors/huggingface_model.rs +++ b/crates/webclaw-fetch/src/extractors/huggingface_model.rs @@ -9,8 +9,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "huggingface_model", @@ -61,7 +61,7 @@ const RESERVED_NAMESPACES: &[&str] = &[ "search", ]; -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let (owner, name) = parse_owner_name(url).ok_or_else(|| { FetchError::Build(format!("hf model: 
cannot parse owner/name from '{url}'")) })?; diff --git a/crates/webclaw-fetch/src/extractors/instagram_post.rs b/crates/webclaw-fetch/src/extractors/instagram_post.rs index 05c9b8a..8847e36 100644 --- a/crates/webclaw-fetch/src/extractors/instagram_post.rs +++ b/crates/webclaw-fetch/src/extractors/instagram_post.rs @@ -11,8 +11,8 @@ use serde_json::{Value, json}; use std::sync::OnceLock; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "instagram_post", @@ -33,7 +33,7 @@ pub fn matches(url: &str) -> bool { parse_shortcode(url).is_some() } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let (kind, shortcode) = parse_shortcode(url).ok_or_else(|| { FetchError::Build(format!( "instagram_post: cannot parse shortcode from '{url}'" diff --git a/crates/webclaw-fetch/src/extractors/instagram_profile.rs b/crates/webclaw-fetch/src/extractors/instagram_profile.rs index 4524090..9a92b4c 100644 --- a/crates/webclaw-fetch/src/extractors/instagram_profile.rs +++ b/crates/webclaw-fetch/src/extractors/instagram_profile.rs @@ -23,8 +23,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "instagram_profile", @@ -80,7 +80,7 @@ const RESERVED: &[&str] = &[ "signup", ]; -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let username = parse_username(url).ok_or_else(|| { FetchError::Build(format!( "instagram_profile: cannot parse username from '{url}'" @@ -198,7 +198,7 @@ fn classify(n: &MediaNode) -> &'static str { /// pull whatever OG tags we can. 
Returns less data and explicitly /// flags `data_completeness: "og_only"` so callers know. async fn og_fallback( - client: &FetchClient, + client: &dyn Fetcher, username: &str, original_url: &str, api_status: u16, diff --git a/crates/webclaw-fetch/src/extractors/linkedin_post.rs b/crates/webclaw-fetch/src/extractors/linkedin_post.rs index 2d6a399..ed7e07b 100644 --- a/crates/webclaw-fetch/src/extractors/linkedin_post.rs +++ b/crates/webclaw-fetch/src/extractors/linkedin_post.rs @@ -14,8 +14,8 @@ use serde_json::{Value, json}; use std::sync::OnceLock; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "linkedin_post", @@ -36,7 +36,7 @@ pub fn matches(url: &str) -> bool { url.contains("/feed/update/urn:li:") || url.contains("/posts/") } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let urn = extract_urn(url).ok_or_else(|| { FetchError::Build(format!( "linkedin_post: cannot extract URN from '{url}' (expected /feed/update/urn:li:... or /posts/{{slug}}-{{id}})" diff --git a/crates/webclaw-fetch/src/extractors/mod.rs b/crates/webclaw-fetch/src/extractors/mod.rs index 5d06158..91ef8d0 100644 --- a/crates/webclaw-fetch/src/extractors/mod.rs +++ b/crates/webclaw-fetch/src/extractors/mod.rs @@ -46,8 +46,8 @@ pub mod youtube_video; use serde::Serialize; use serde_json::Value; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; /// Public catalog entry for `/v1/extractors`. Stable shape — clients /// rely on `name` to pick the right `/v1/scrape/{name}` route. @@ -102,7 +102,7 @@ pub fn list() -> Vec { /// one that claims the URL. Used by `/v1/scrape` when the caller doesn't /// pick a vertical explicitly. 
pub async fn dispatch_by_url( - client: &FetchClient, + client: &dyn Fetcher, url: &str, ) -> Option> { if reddit::matches(url) { @@ -281,7 +281,7 @@ pub async fn dispatch_by_url( /// users get a clear "wrong route" error instead of a confusing parse /// failure deep in the extractor. pub async fn dispatch_by_name( - client: &FetchClient, + client: &dyn Fetcher, name: &str, url: &str, ) -> Result { diff --git a/crates/webclaw-fetch/src/extractors/npm.rs b/crates/webclaw-fetch/src/extractors/npm.rs index 4343890..f84da0e 100644 --- a/crates/webclaw-fetch/src/extractors/npm.rs +++ b/crates/webclaw-fetch/src/extractors/npm.rs @@ -13,8 +13,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "npm", @@ -31,7 +31,7 @@ pub fn matches(url: &str) -> bool { url.contains("/package/") } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let name = parse_name(url) .ok_or_else(|| FetchError::Build(format!("npm: cannot parse name from '{url}'")))?; @@ -94,7 +94,7 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result Result { +async fn fetch_weekly_downloads(client: &dyn Fetcher, name: &str) -> Result { let url = format!( "https://api.npmjs.org/downloads/point/last-week/{}", urlencode_segment(name) diff --git a/crates/webclaw-fetch/src/extractors/pypi.rs b/crates/webclaw-fetch/src/extractors/pypi.rs index f6b7c64..33a4d1c 100644 --- a/crates/webclaw-fetch/src/extractors/pypi.rs +++ b/crates/webclaw-fetch/src/extractors/pypi.rs @@ -9,8 +9,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "pypi", @@ -30,7 +30,7 @@ pub fn 
matches(url: &str) -> bool { url.contains("/project/") } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let (name, version) = parse_project(url).ok_or_else(|| { FetchError::Build(format!("pypi: cannot parse package name from '{url}'")) })?; diff --git a/crates/webclaw-fetch/src/extractors/reddit.rs b/crates/webclaw-fetch/src/extractors/reddit.rs index 2d084dc..13cdc16 100644 --- a/crates/webclaw-fetch/src/extractors/reddit.rs +++ b/crates/webclaw-fetch/src/extractors/reddit.rs @@ -9,8 +9,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "reddit", @@ -32,7 +32,7 @@ pub fn matches(url: &str) -> bool { is_reddit_host && url.contains("/comments/") } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let json_url = build_json_url(url); let resp = client.fetch(&json_url).await?; if resp.status != 200 { diff --git a/crates/webclaw-fetch/src/extractors/shopify_collection.rs b/crates/webclaw-fetch/src/extractors/shopify_collection.rs index 095f7dd..23d57c6 100644 --- a/crates/webclaw-fetch/src/extractors/shopify_collection.rs +++ b/crates/webclaw-fetch/src/extractors/shopify_collection.rs @@ -15,8 +15,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "shopify_collection", @@ -49,7 +49,7 @@ const NON_SHOPIFY_HOSTS: &[&str] = &[ "github.com", ]; -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let (coll_meta_url, coll_products_url) = build_json_urls(url); // Step 1: 
collection metadata. Shopify returns 200 on missing diff --git a/crates/webclaw-fetch/src/extractors/shopify_product.rs b/crates/webclaw-fetch/src/extractors/shopify_product.rs index 19f0438..b52ef36 100644 --- a/crates/webclaw-fetch/src/extractors/shopify_product.rs +++ b/crates/webclaw-fetch/src/extractors/shopify_product.rs @@ -21,8 +21,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "shopify_product", @@ -65,7 +65,7 @@ const NON_SHOPIFY_HOSTS: &[&str] = &[ "github.com", // /products is a marketing page ]; -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let json_url = build_json_url(url); let resp = client.fetch(&json_url).await?; if resp.status == 404 { diff --git a/crates/webclaw-fetch/src/extractors/stackoverflow.rs b/crates/webclaw-fetch/src/extractors/stackoverflow.rs index d74b511..03597a3 100644 --- a/crates/webclaw-fetch/src/extractors/stackoverflow.rs +++ b/crates/webclaw-fetch/src/extractors/stackoverflow.rs @@ -13,8 +13,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "stackoverflow", @@ -31,7 +31,7 @@ pub fn matches(url: &str) -> bool { parse_question_id(url).is_some() } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let id = parse_question_id(url).ok_or_else(|| { FetchError::Build(format!( "stackoverflow: cannot parse question id from '{url}'" diff --git a/crates/webclaw-fetch/src/extractors/substack_post.rs b/crates/webclaw-fetch/src/extractors/substack_post.rs index 0571f3d..c5b5019 100644 --- 
a/crates/webclaw-fetch/src/extractors/substack_post.rs +++ b/crates/webclaw-fetch/src/extractors/substack_post.rs @@ -28,9 +28,9 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::cloud::{self, CloudError}; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "substack_post", @@ -49,7 +49,7 @@ pub fn matches(url: &str) -> bool { url.contains("/p/") } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let slug = parse_slug(url).ok_or_else(|| { FetchError::Build(format!("substack_post: cannot parse slug from '{url}'")) })?; @@ -149,7 +149,7 @@ fn build_api_payload(url: &str, api_url: &str, slug: &str, p: Post) -> Value { // --------------------------------------------------------------------------- async fn html_fallback( - client: &FetchClient, + client: &dyn Fetcher, url: &str, api_url: &str, slug: &str, diff --git a/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs b/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs index ae97c67..8b77a29 100644 --- a/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs +++ b/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs @@ -32,9 +32,9 @@ use regex::Regex; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::cloud::{self, CloudError}; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "trustpilot_reviews", @@ -51,7 +51,7 @@ pub fn matches(url: &str) -> bool { url.contains("/review/") } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let fetched = cloud::smart_fetch_html(client, client.cloud(), url) .await .map_err(cloud_to_fetch_err)?; diff --git 
a/crates/webclaw-fetch/src/extractors/woocommerce_product.rs b/crates/webclaw-fetch/src/extractors/woocommerce_product.rs index 73f1109..db6dd78 100644 --- a/crates/webclaw-fetch/src/extractors/woocommerce_product.rs +++ b/crates/webclaw-fetch/src/extractors/woocommerce_product.rs @@ -15,8 +15,8 @@ use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "woocommerce_product", @@ -42,7 +42,7 @@ pub fn matches(url: &str) -> bool { || url.contains("/produit/") // common fr locale } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let slug = parse_slug(url).ok_or_else(|| { FetchError::Build(format!( "woocommerce_product: cannot parse slug from '{url}'" diff --git a/crates/webclaw-fetch/src/extractors/youtube_video.rs b/crates/webclaw-fetch/src/extractors/youtube_video.rs index 81079f4..2551ff8 100644 --- a/crates/webclaw-fetch/src/extractors/youtube_video.rs +++ b/crates/webclaw-fetch/src/extractors/youtube_video.rs @@ -25,8 +25,8 @@ use regex::Regex; use serde_json::{Value, json}; use super::ExtractorInfo; -use crate::client::FetchClient; use crate::error::FetchError; +use crate::fetcher::Fetcher; pub const INFO: ExtractorInfo = ExtractorInfo { name: "youtube_video", @@ -45,7 +45,7 @@ pub fn matches(url: &str) -> bool { || url.contains("youtube-nocookie.com/embed/") } -pub async fn extract(client: &FetchClient, url: &str) -> Result { +pub async fn extract(client: &dyn Fetcher, url: &str) -> Result { let video_id = parse_video_id(url).ok_or_else(|| { FetchError::Build(format!("youtube_video: cannot parse video id from '{url}'")) })?; diff --git a/crates/webclaw-fetch/src/fetcher.rs b/crates/webclaw-fetch/src/fetcher.rs new file mode 100644 index 0000000..fabcf44 --- /dev/null +++ b/crates/webclaw-fetch/src/fetcher.rs 
@@ -0,0 +1,118 @@
+//! Pluggable fetcher abstraction for vertical extractors.
+//!
+//! Extractors call the network through this trait instead of hard-
+//! coding [`FetchClient`]. The OSS CLI / MCP / self-hosted server all
+//! pass `&FetchClient` (wreq-backed BoringSSL). The production API
+//! server, which must not use in-process TLS fingerprinting, provides
+//! its own implementation that routes through the Go tls-sidecar.
+//!
+//! Both paths expose the same [`FetchResult`] shape and the same
+//! optional cloud-escalation client, so extractor logic stays
+//! identical across environments.
+//!
+//! ## Choosing an implementation
+//!
+//! - CLI, MCP, self-hosted `webclaw-server`: build a [`FetchClient`]
+//!   with [`FetchClient::with_cloud`] to attach cloud fallback, pass
+//!   it to extractors as `&client`.
+//! - `api.webclaw.io` production server: build a `TlsSidecarFetcher`
+//!   (in `server/src/engine/`) that delegates to `engine::tls_client`
+//!   and wraps it in `Arc<dyn Fetcher>` for handler injection.
+//!
+//! ## Why a trait and not a free function
+//!
+//! Extractors need state beyond a single fetch: the cloud client for
+//! antibot escalation, and in the future per-user proxy pools, tenant
+//! headers, circuit breakers. A trait keeps that state encapsulated
+//! behind the fetch interface instead of threading it through every
+//! extractor signature.
+
+use async_trait::async_trait;
+
+use crate::client::FetchResult;
+use crate::cloud::CloudClient;
+use crate::error::FetchError;
+
+/// HTTP fetch surface used by vertical extractors.
+///
+/// Implementations must be `Send + Sync` because extractor dispatchers
+/// run them inside tokio tasks, potentially across many requests.
+#[async_trait]
+pub trait Fetcher: Send + Sync {
+    /// Fetch a URL and return the raw response body + metadata. The
+    /// body is in `FetchResult::html` regardless of the actual content
+    /// type — JSON API endpoints put JSON there, HTML pages put HTML.
+    /// Extractors branch on response status and body shape.
+    async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError>;
+
+    /// Fetch with additional request headers. Needed for endpoints
+    /// that authenticate via a specific header (Instagram's
+    /// `x-ig-app-id`, for example). Default implementation routes to
+    /// [`Self::fetch`] so implementers without header support stay
+    /// functional, though the `Option` field they'd set won't
+    /// be populated on the request.
+    async fn fetch_with_headers(
+        &self,
+        url: &str,
+        _headers: &[(&str, &str)],
+    ) -> Result<FetchResult, FetchError> {
+        self.fetch(url).await
+    }
+
+    /// Optional cloud-escalation client for antibot bypass. Returning
+    /// `Some` tells extractors they can call into the hosted API when
+    /// local fetch hits a challenge page. Returning `None` makes
+    /// cloud-gated extractors emit [`CloudError::NotConfigured`] with
+    /// an actionable signup link.
+    ///
+    /// The default implementation returns `None` because not every
+    /// deployment wants cloud fallback (self-hosts that don't have a
+    /// webclaw.io subscription, for instance).
+    ///
+    /// [`CloudError::NotConfigured`]: crate::cloud::CloudError::NotConfigured
+    fn cloud(&self) -> Option<&CloudClient> {
+        None
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Blanket impls: make `&T` and `Arc<T>` behave like the wrapped `T`.
+// ---------------------------------------------------------------------------
+
+#[async_trait]
+impl<T: Fetcher + ?Sized> Fetcher for &T {
+    async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
+        (**self).fetch(url).await
+    }
+
+    async fn fetch_with_headers(
+        &self,
+        url: &str,
+        headers: &[(&str, &str)],
+    ) -> Result<FetchResult, FetchError> {
+        (**self).fetch_with_headers(url, headers).await
+    }
+
+    fn cloud(&self) -> Option<&CloudClient> {
+        (**self).cloud()
+    }
+}
+
+#[async_trait]
+impl<T: Fetcher + ?Sized> Fetcher for std::sync::Arc<T> {
+    async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
+        (**self).fetch(url).await
+    }
+
+    async fn fetch_with_headers(
+        &self,
+        url: &str,
+        headers: &[(&str, &str)],
+    ) -> Result<FetchResult, FetchError> {
+        (**self).fetch_with_headers(url, headers).await
+    }
+
+    fn cloud(&self) -> Option<&CloudClient> {
+        (**self).cloud()
+    }
+}
diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs
index 3a4781e..83664a1 100644
--- a/crates/webclaw-fetch/src/lib.rs
+++ b/crates/webclaw-fetch/src/lib.rs
@@ -8,6 +8,7 @@ pub mod crawler;
 pub mod document;
 pub mod error;
 pub mod extractors;
+pub mod fetcher;
 pub mod linkedin;
 pub mod proxy;
 pub mod reddit;
@@ -18,6 +19,7 @@ pub use browser::BrowserProfile;
 pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
 pub use crawler::{CrawlConfig, CrawlResult, CrawlState, Crawler, PageResult};
 pub use error::FetchError;
+pub use fetcher::Fetcher;
 pub use http::HeaderMap;
 pub use proxy::{parse_proxy_file, parse_proxy_line};
 pub use sitemap::SitemapEntry;