mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-21 02:28:27 +02:00
Adds `webclaw_fetch::Fetcher` trait. All 28 vertical extractors now
take `client: &dyn Fetcher` instead of `client: &FetchClient` directly.
Backwards-compatible: FetchClient implements Fetcher, blanket impls
cover `&T` and `Arc<T>`, so existing CLI / MCP / self-hosted-server
callers keep working unchanged.
Motivation: the production API server (api.webclaw.io) must not do
in-process TLS fingerprinting; it delegates all HTTP to the Go
tls-sidecar. Before this trait, exposing /v1/scrape/{vertical} on
production would have required importing wreq into the server's
dep graph, violating the CLAUDE.md rule. Now production can provide
its own TlsSidecarFetcher implementation and pass it to the same
dispatcher the OSS server uses.
Changes:
- New `crates/webclaw-fetch/src/fetcher.rs` defining the trait plus
blanket impls for `&T` and `Arc<T>`.
- `FetchClient` gains a tiny impl block in client.rs that forwards to
its existing public methods.
- All 28 extractor signatures migrated from `&FetchClient` to
`&dyn Fetcher` (sed-driven bulk rewrite, no semantic change).
- `cloud::smart_fetch` and `cloud::smart_fetch_html` take `&dyn Fetcher`.
- `extractors::dispatch_by_url` and `extractors::dispatch_by_name`
take `&dyn Fetcher`.
- `async-trait 0.1` added to webclaw-fetch deps (Rust 1.75+ has
native async-fn-in-trait but dyn dispatch still needs async_trait).
- Version bumped to 0.5.1, CHANGELOG updated.
Tests: 215 passing in webclaw-fetch (no new tests needed — the existing
extractor tests exercise the trait methods transparently).
Clippy: clean workspace-wide.
565 lines
19 KiB
Rust
565 lines
19 KiB
Rust
//! Substack post extractor.
|
|
//!
|
|
//! Every Substack publication exposes `/api/v1/posts/{slug}` that
|
|
//! returns the full post as JSON: body HTML, cover image, author,
|
|
//! publication info, reactions, paywall state. No auth on public
|
|
//! posts.
|
|
//!
|
|
//! Works on both `*.substack.com` subdomains and custom domains
|
|
//! (e.g. `simonwillison.net` uses Substack too). Detection is
|
|
//! "URL has `/p/{slug}`" because that's the canonical Substack post
|
|
//! path. Explicit-call only because the `/p/{slug}` URL shape is
|
|
//! used by non-Substack sites too.
|
|
//!
|
|
//! ## Fallback
|
|
//!
|
|
//! The API endpoint is rate-limited aggressively on popular publications
|
|
//! and occasionally returns 403 on custom domains with Cloudflare in
|
|
//! front. When that happens we escalate to an HTML fetch (via
|
|
//! `smart_fetch_html`, so antibot-protected custom domains still work)
|
|
//! and extract OG tags + Article JSON-LD for a degraded-but-useful
|
|
//! payload. The response shape stays stable across both paths; a
|
|
//! `data_source` field tells the caller which branch ran.
|
|
|
|
use std::sync::OnceLock;
|
|
|
|
use regex::Regex;
|
|
use serde::Deserialize;
|
|
use serde_json::{Value, json};
|
|
|
|
use super::ExtractorInfo;
|
|
use crate::cloud::{self, CloudError};
|
|
use crate::error::FetchError;
|
|
use crate::fetcher::Fetcher;
|
|
|
|
/// Registry metadata for the Substack post extractor: its machine name,
/// human-readable label, description, and the URL shapes it is meant for.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "substack_post",
    label: "Substack post",
    description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API. Falls back to OG + JSON-LD HTML parsing when the API is rate-limited.",
    // Hosted publications and custom domains share the same `/p/{slug}` path.
    url_patterns: &[
        "https://{pub}.substack.com/p/{slug}",
        "https://{custom-domain}/p/{slug}",
    ],
};
|
|
|
|
/// Returns `true` when `url` is an `http://` or `https://` URL that
/// contains a `/p/` segment, the canonical Substack post path.
///
/// This is a purely textual check: it does not verify that the site is
/// actually Substack, nor that a slug follows `/p/`.
pub fn matches(url: &str) -> bool {
    let is_http = ["http://", "https://"]
        .iter()
        .any(|&scheme| url.starts_with(scheme));
    is_http && url.contains("/p/")
}
|
|
|
|
pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchError> {
|
|
let slug = parse_slug(url).ok_or_else(|| {
|
|
FetchError::Build(format!("substack_post: cannot parse slug from '{url}'"))
|
|
})?;
|
|
let host = host_of(url);
|
|
if host.is_empty() {
|
|
return Err(FetchError::Build(format!(
|
|
"substack_post: empty host in '{url}'"
|
|
)));
|
|
}
|
|
let scheme = if url.starts_with("http://") {
|
|
"http"
|
|
} else {
|
|
"https"
|
|
};
|
|
let api_url = format!("{scheme}://{host}/api/v1/posts/{slug}");
|
|
|
|
// 1. Try the public API. 200 = full payload; 404 = real miss; any
|
|
// other status hands off to the HTML fallback so a transient rate
|
|
// limit or a hardened custom domain doesn't fail the whole call.
|
|
let resp = client.fetch(&api_url).await?;
|
|
match resp.status {
|
|
200 => match serde_json::from_str::<Post>(&resp.html) {
|
|
Ok(p) => Ok(build_api_payload(url, &api_url, &slug, p)),
|
|
Err(e) => {
|
|
// API returned 200 but the body isn't the Post shape we
|
|
// expect. Could be a custom-domain site that exposes
|
|
// something else at /api/v1/posts/. Fall back to HTML
|
|
// rather than hard-failing.
|
|
html_fallback(
|
|
client,
|
|
url,
|
|
&api_url,
|
|
&slug,
|
|
Some(format!(
|
|
"api returned 200 but body was not Substack JSON ({e})"
|
|
)),
|
|
)
|
|
.await
|
|
}
|
|
},
|
|
404 => Err(FetchError::Build(format!(
|
|
"substack_post: '{slug}' not found on {host} (got 404). \
|
|
If the publication isn't actually on Substack, use /v1/scrape instead."
|
|
))),
|
|
_ => {
|
|
// Rate limit, 403, 5xx, whatever: try HTML.
|
|
let reason = format!("api returned status {} for {api_url}", resp.status);
|
|
html_fallback(client, url, &api_url, &slug, Some(reason)).await
|
|
}
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// API-path payload builder
|
|
// ---------------------------------------------------------------------------
|
|
|
|
fn build_api_payload(url: &str, api_url: &str, slug: &str, p: Post) -> Value {
|
|
json!({
|
|
"url": url,
|
|
"api_url": api_url,
|
|
"data_source": "api",
|
|
"id": p.id,
|
|
"type": p.r#type,
|
|
"slug": p.slug.or_else(|| Some(slug.to_string())),
|
|
"title": p.title,
|
|
"subtitle": p.subtitle,
|
|
"description": p.description,
|
|
"canonical_url": p.canonical_url,
|
|
"post_date": p.post_date,
|
|
"updated_at": p.updated_at,
|
|
"audience": p.audience,
|
|
"has_paywall": matches!(p.audience.as_deref(), Some("only_paid") | Some("founding")),
|
|
"is_free_preview": p.is_free_preview,
|
|
"cover_image": p.cover_image,
|
|
"word_count": p.wordcount,
|
|
"reactions": p.reactions,
|
|
"comment_count": p.comment_count,
|
|
"body_html": p.body_html,
|
|
"body_text": p.truncated_body_text.or(p.body_text),
|
|
"publication": json!({
|
|
"id": p.publication.as_ref().and_then(|pub_| pub_.id),
|
|
"name": p.publication.as_ref().and_then(|pub_| pub_.name.clone()),
|
|
"subdomain": p.publication.as_ref().and_then(|pub_| pub_.subdomain.clone()),
|
|
"custom_domain":p.publication.as_ref().and_then(|pub_| pub_.custom_domain.clone()),
|
|
}),
|
|
"authors": p.published_bylines.iter().map(|a| json!({
|
|
"id": a.id,
|
|
"name": a.name,
|
|
"handle": a.handle,
|
|
"photo": a.photo_url,
|
|
})).collect::<Vec<_>>(),
|
|
})
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// HTML fallback: OG + Article JSON-LD
|
|
// ---------------------------------------------------------------------------
|
|
|
|
async fn html_fallback(
|
|
client: &dyn Fetcher,
|
|
url: &str,
|
|
api_url: &str,
|
|
slug: &str,
|
|
fallback_reason: Option<String>,
|
|
) -> Result<Value, FetchError> {
|
|
let fetched = cloud::smart_fetch_html(client, client.cloud(), url)
|
|
.await
|
|
.map_err(cloud_to_fetch_err)?;
|
|
|
|
let mut data = parse_html(&fetched.html, url, api_url, slug);
|
|
if let Some(obj) = data.as_object_mut() {
|
|
obj.insert(
|
|
"fetch_source".into(),
|
|
match fetched.source {
|
|
cloud::FetchSource::Local => json!("local"),
|
|
cloud::FetchSource::Cloud => json!("cloud"),
|
|
},
|
|
);
|
|
if let Some(reason) = fallback_reason {
|
|
obj.insert("fallback_reason".into(), json!(reason));
|
|
}
|
|
}
|
|
Ok(data)
|
|
}
|
|
|
|
/// Pure HTML parser. Pulls title, subtitle, description, cover image,
|
|
/// publish date, and authors from OG tags and Article JSON-LD. Kept
|
|
/// public so tests can exercise it with fixtures.
|
|
pub fn parse_html(html: &str, url: &str, api_url: &str, slug: &str) -> Value {
|
|
let article = find_article_jsonld(html);
|
|
|
|
let title = article
|
|
.as_ref()
|
|
.and_then(|v| get_text(v, "headline"))
|
|
.or_else(|| og(html, "title"));
|
|
let description = article
|
|
.as_ref()
|
|
.and_then(|v| get_text(v, "description"))
|
|
.or_else(|| og(html, "description"));
|
|
let cover_image = article
|
|
.as_ref()
|
|
.and_then(get_first_image)
|
|
.or_else(|| og(html, "image"));
|
|
let post_date = article
|
|
.as_ref()
|
|
.and_then(|v| get_text(v, "datePublished"))
|
|
.or_else(|| meta_property(html, "article:published_time"));
|
|
let updated_at = article.as_ref().and_then(|v| get_text(v, "dateModified"));
|
|
let publication_name = og(html, "site_name");
|
|
let authors = article.as_ref().map(extract_authors).unwrap_or_default();
|
|
|
|
json!({
|
|
"url": url,
|
|
"api_url": api_url,
|
|
"data_source": "html_fallback",
|
|
"slug": slug,
|
|
"title": title,
|
|
"subtitle": None::<String>,
|
|
"description": description,
|
|
"canonical_url": canonical_url(html).or_else(|| Some(url.to_string())),
|
|
"post_date": post_date,
|
|
"updated_at": updated_at,
|
|
"cover_image": cover_image,
|
|
"body_html": None::<String>,
|
|
"body_text": None::<String>,
|
|
"word_count": None::<i64>,
|
|
"comment_count": None::<i64>,
|
|
"reactions": Value::Null,
|
|
"has_paywall": None::<bool>,
|
|
"is_free_preview": None::<bool>,
|
|
"publication": json!({
|
|
"name": publication_name,
|
|
}),
|
|
"authors": authors,
|
|
})
|
|
}
|
|
|
|
fn extract_authors(v: &Value) -> Vec<Value> {
|
|
let Some(a) = v.get("author") else {
|
|
return Vec::new();
|
|
};
|
|
let one = |val: &Value| -> Option<Value> {
|
|
match val {
|
|
Value::String(s) => Some(json!({"name": s})),
|
|
Value::Object(_) => {
|
|
let name = val.get("name").and_then(|n| n.as_str())?;
|
|
let handle = val
|
|
.get("url")
|
|
.and_then(|u| u.as_str())
|
|
.and_then(handle_from_author_url);
|
|
Some(json!({
|
|
"name": name,
|
|
"handle": handle,
|
|
}))
|
|
}
|
|
_ => None,
|
|
}
|
|
};
|
|
match a {
|
|
Value::Array(arr) => arr.iter().filter_map(one).collect(),
|
|
_ => one(a).into_iter().collect(),
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// URL helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Return the authority part of `url` (host plus any port), borrowed from
/// the input.
///
/// Everything after the first `://` is considered, and the host ends at
/// the first `/`, `?` or `#`, so a URL without a path such as
/// `https://example.com?ref=x` still yields `example.com` rather than a
/// host with the query glued on. Input without a scheme is treated as
/// starting directly with the host. Returns `""` when there is no host.
fn host_of(url: &str) -> &str {
    let after_scheme = url.split_once("://").map_or(url, |(_, rest)| rest);
    after_scheme.split(['/', '?', '#']).next().unwrap_or("")
}
|
|
|
|
/// Extract the post slug: the path segment that follows the first `/p/`.
///
/// The query string and fragment are removed *before* `/p/` is located, so
/// a `/p/` that only appears inside `?next=/p/foo` or after `#` is not
/// mistaken for a post path. Trailing slashes and any further path
/// segments (`/p/my-post/comments`) are ignored.
///
/// Returns `None` when there is no `/p/` in the path or the segment after
/// it is empty.
fn parse_slug(url: &str) -> Option<String> {
    let path_part = url.split(['?', '#']).next().unwrap_or(url);
    let (_, after) = path_part.split_once("/p/")?;
    let slug = after.split('/').next().unwrap_or("");
    (!slug.is_empty()).then(|| slug.to_string())
}
|
|
|
|
/// Extract the Substack handle from an author URL like
/// `https://substack.com/@handle` or `https://pub.substack.com/@handle`.
///
/// The `@` must begin a path segment (or the whole string, for a bare
/// `@handle`). An `@` elsewhere, such as the userinfo separator in
/// `https://user@host/...` or an address like `mailto:a@b.com`, is not a
/// handle marker. Query string and fragment are ignored.
///
/// Returns `None` when the URL has no `@` segment (e.g. a non-Substack
/// author page) so we don't synthesise a fake handle.
fn handle_from_author_url(u: &str) -> Option<String> {
    let path = u.split(['?', '#']).next().unwrap_or(u);
    let tail = match path.strip_prefix('@') {
        Some(rest) => rest,
        None => path.rsplit_once("/@")?.1,
    };
    let handle = tail.split('/').next().unwrap_or("");
    (!handle.is_empty()).then(|| handle.to_string())
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// HTML tag helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
fn og(html: &str, prop: &str) -> Option<String> {
|
|
static RE: OnceLock<Regex> = OnceLock::new();
|
|
let re = RE.get_or_init(|| {
|
|
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
|
});
|
|
for c in re.captures_iter(html) {
|
|
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
|
return c.get(2).map(|m| m.as_str().to_string());
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Pull `<meta property="article:published_time" content="...">` and
|
|
/// similar structured meta tags.
|
|
fn meta_property(html: &str, prop: &str) -> Option<String> {
|
|
static RE: OnceLock<Regex> = OnceLock::new();
|
|
let re = RE.get_or_init(|| {
|
|
Regex::new(r#"(?i)<meta[^>]+property="([^"]+)"[^>]+content="([^"]+)""#).unwrap()
|
|
});
|
|
for c in re.captures_iter(html) {
|
|
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
|
return c.get(2).map(|m| m.as_str().to_string());
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
fn canonical_url(html: &str) -> Option<String> {
|
|
static RE: OnceLock<Regex> = OnceLock::new();
|
|
let re = RE
|
|
.get_or_init(|| Regex::new(r#"(?i)<link[^>]+rel="canonical"[^>]+href="([^"]+)""#).unwrap());
|
|
re.captures(html)
|
|
.and_then(|c| c.get(1))
|
|
.map(|m| m.as_str().to_string())
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// JSON-LD walkers (Article / NewsArticle)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
fn find_article_jsonld(html: &str) -> Option<Value> {
|
|
let blocks = webclaw_core::structured_data::extract_json_ld(html);
|
|
for b in blocks {
|
|
if let Some(found) = find_article_in(&b) {
|
|
return Some(found);
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
fn find_article_in(v: &Value) -> Option<Value> {
|
|
if is_article_type(v) {
|
|
return Some(v.clone());
|
|
}
|
|
if let Some(graph) = v.get("@graph").and_then(|g| g.as_array()) {
|
|
for item in graph {
|
|
if let Some(found) = find_article_in(item) {
|
|
return Some(found);
|
|
}
|
|
}
|
|
}
|
|
if let Some(arr) = v.as_array() {
|
|
for item in arr {
|
|
if let Some(found) = find_article_in(item) {
|
|
return Some(found);
|
|
}
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
fn is_article_type(v: &Value) -> bool {
|
|
let Some(t) = v.get("@type") else {
|
|
return false;
|
|
};
|
|
let is_art = |s: &str| {
|
|
matches!(
|
|
s,
|
|
"Article" | "NewsArticle" | "BlogPosting" | "SocialMediaPosting"
|
|
)
|
|
};
|
|
match t {
|
|
Value::String(s) => is_art(s),
|
|
Value::Array(arr) => arr.iter().any(|x| x.as_str().is_some_and(is_art)),
|
|
_ => false,
|
|
}
|
|
}
|
|
|
|
fn get_text(v: &Value, key: &str) -> Option<String> {
|
|
v.get(key).and_then(|x| match x {
|
|
Value::String(s) => Some(s.clone()),
|
|
Value::Number(n) => Some(n.to_string()),
|
|
_ => None,
|
|
})
|
|
}
|
|
|
|
fn get_first_image(v: &Value) -> Option<String> {
|
|
match v.get("image")? {
|
|
Value::String(s) => Some(s.clone()),
|
|
Value::Array(arr) => arr.iter().find_map(|x| match x {
|
|
Value::String(s) => Some(s.clone()),
|
|
Value::Object(_) => x.get("url").and_then(|u| u.as_str()).map(String::from),
|
|
_ => None,
|
|
}),
|
|
Value::Object(o) => o.get("url").and_then(|u| u.as_str()).map(String::from),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
|
|
FetchError::Build(e.to_string())
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Substack API types (subset)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
#[derive(Deserialize)]
|
|
struct Post {
|
|
id: Option<i64>,
|
|
r#type: Option<String>,
|
|
slug: Option<String>,
|
|
title: Option<String>,
|
|
subtitle: Option<String>,
|
|
description: Option<String>,
|
|
canonical_url: Option<String>,
|
|
post_date: Option<String>,
|
|
updated_at: Option<String>,
|
|
audience: Option<String>,
|
|
is_free_preview: Option<bool>,
|
|
cover_image: Option<String>,
|
|
wordcount: Option<i64>,
|
|
reactions: Option<serde_json::Value>,
|
|
comment_count: Option<i64>,
|
|
body_html: Option<String>,
|
|
body_text: Option<String>,
|
|
truncated_body_text: Option<String>,
|
|
publication: Option<Publication>,
|
|
#[serde(default, rename = "publishedBylines")]
|
|
published_bylines: Vec<Byline>,
|
|
}
|
|
|
|
#[derive(Deserialize)]
|
|
struct Publication {
|
|
id: Option<i64>,
|
|
name: Option<String>,
|
|
subdomain: Option<String>,
|
|
custom_domain: Option<String>,
|
|
}
|
|
|
|
#[derive(Deserialize)]
|
|
struct Byline {
|
|
id: Option<i64>,
|
|
name: Option<String>,
|
|
handle: Option<String>,
|
|
photo_url: Option<String>,
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// `matches` accepts http(s) URLs containing `/p/` and rejects the rest.
    #[test]
    fn matches_post_urls() {
        let accepted = [
            "https://stratechery.substack.com/p/the-tech-letter",
            "https://simonwillison.net/p/2024-08-01-something",
        ];
        for url in accepted {
            assert!(matches(url), "expected a match for {url}");
        }

        let rejected = ["https://example.com/", "ftp://example.com/p/foo"];
        for url in rejected {
            assert!(!matches(url), "expected no match for {url}");
        }
    }

    /// Query strings and trailing slashes never leak into the slug.
    #[test]
    fn parse_slug_strips_query_and_trailing_slash() {
        let variants = [
            "https://example.substack.com/p/my-post",
            "https://example.substack.com/p/my-post/",
            "https://example.substack.com/p/my-post?ref=123",
        ];
        for url in variants {
            assert_eq!(parse_slug(url), Some("my-post".into()), "url: {url}");
        }
    }

    /// With no JSON-LD present, every field comes from meta/link tags.
    #[test]
    fn parse_html_extracts_from_og_tags() {
        let html = r##"
<html><head>
<meta property="og:title" content="My Great Post">
<meta property="og:description" content="A short summary.">
<meta property="og:image" content="https://cdn.substack.com/cover.jpg">
<meta property="og:site_name" content="My Publication">
<meta property="article:published_time" content="2025-09-01T10:00:00Z">
<link rel="canonical" href="https://mypub.substack.com/p/my-post">
</head></html>"##;
        let page_url = "https://mypub.substack.com/p/my-post";
        let api_url = "https://mypub.substack.com/api/v1/posts/my-post";

        let v = parse_html(html, page_url, api_url, "my-post");

        assert_eq!(v["data_source"], "html_fallback");
        assert_eq!(v["title"], "My Great Post");
        assert_eq!(v["description"], "A short summary.");
        assert_eq!(v["cover_image"], "https://cdn.substack.com/cover.jpg");
        assert_eq!(v["post_date"], "2025-09-01T10:00:00Z");
        assert_eq!(v["publication"]["name"], "My Publication");
        assert_eq!(v["canonical_url"], "https://mypub.substack.com/p/my-post");
    }

    /// JSON-LD values take precedence over OG tags when both exist.
    #[test]
    fn parse_html_prefers_jsonld_when_present() {
        let html = r##"
<html><head>
<meta property="og:title" content="OG Title">
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"NewsArticle",
"headline":"JSON-LD Title",
"description":"JSON-LD desc.",
"image":"https://cdn.substack.com/hero.jpg",
"datePublished":"2025-10-12T08:30:00Z",
"dateModified":"2025-10-12T09:00:00Z",
"author":[{"@type":"Person","name":"Alice Author","url":"https://substack.com/@alice"}]}
</script>
</head></html>"##;

        let v = parse_html(
            html,
            "https://example.com/p/a",
            "https://example.com/api/v1/posts/a",
            "a",
        );

        assert_eq!(v["title"], "JSON-LD Title");
        assert_eq!(v["description"], "JSON-LD desc.");
        assert_eq!(v["cover_image"], "https://cdn.substack.com/hero.jpg");
        assert_eq!(v["post_date"], "2025-10-12T08:30:00Z");
        assert_eq!(v["updated_at"], "2025-10-12T09:00:00Z");

        let first_author = &v["authors"][0];
        assert_eq!(first_author["name"], "Alice Author");
        assert_eq!(first_author["handle"], "alice");
    }

    /// Handles come only from `@handle` path segments.
    #[test]
    fn handle_from_author_url_pulls_handle() {
        let cases: [(&str, Option<&str>); 3] = [
            ("https://substack.com/@alice", Some("alice")),
            ("https://mypub.substack.com/@bob/", Some("bob")),
            ("https://not-substack.com/author/carol", None),
        ];
        for (input, expected) in cases {
            assert_eq!(
                handle_from_author_url(input),
                expected.map(String::from),
                "input: {input}"
            );
        }
    }
}
|