feat(extractors): wave 6a, 5 easy verticals (27 total)

Adds 5 structured extractors that hit public APIs with stable shapes: - github_issue: /repos/{o}/{r}/issues/{n} (rejects PRs, points to github_pr) - shopify_collection: /collections/{handle}.json + products.json - woocommerce_product: /wp-json/wc/store/v1/products?slug={slug} - substack_post: /api/v1/posts/{slug} (works on custom domains too) - youtube_video: ytInitialPlayerResponse blob from /watch HTML Auto-dispatched: github_issue, youtube_video (unique hosts and stable URL shapes). Explicit-call: shopify_collection, woocommerce_product, substack_post (URL shapes overlap with non-target sites). Tests: 82 total passing in webclaw-fetch (12 new), clippy clean.
2026-06-27 03:19:38 +02:00 · 2026-04-22 16:33:35 +02:00 · 2026-04-22 16:33:35 +02:00 · 8cc727c2f2
commit 8cc727c2f2
parent d8c9274a9c
6 changed files with 1175 additions and 1 deletions
--- a/crates/webclaw-fetch/src/extractors/substack_post.rs
+++ b/crates/webclaw-fetch/src/extractors/substack_post.rs
@ -0,0 +1,213 @@
+//! Substack post extractor.
+//!
+//! Every Substack publication exposes `/api/v1/posts/{slug}` that
+//! returns the full post as JSON: body HTML, cover image, author,
+//! publication info, reactions, paywall state. No auth on public
+//! posts.
+//!
+//! Works on both `*.substack.com` subdomains and custom domains
+//! (e.g. `www.lennysnewsletter.com` is served by Substack). Detection
+//! is "URL has `/p/{slug}`" because that's the canonical Substack post
+//! path. Explicit-call only because the `/p/{slug}` URL shape is
+//! used by non-Substack sites too.
+
+use serde::Deserialize;
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+/// Descriptive metadata for the Substack post extractor: its machine
+/// name, a human-readable label, a one-line summary of the returned
+/// fields, and the URL shapes it is intended for.
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "substack_post",
+    label: "Substack post",
+    description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API.",
+    url_patterns: &[
+        // Publication hosted on a `substack.com` subdomain.
+        "https://{pub}.substack.com/p/{slug}",
+        // Publication served from its own domain.
+        "https://{custom-domain}/p/{slug}",
+    ],
+};
+
+/// Returns `true` when `url` looks like a Substack post URL.
+///
+/// A URL qualifies when it uses the `http` or `https` scheme and its
+/// *path* contains `/p/` followed by a non-empty slug segment. The query
+/// string and fragment are ignored, so `https://host/?next=/p/foo` does
+/// not match, and neither does a bare `https://host/p/` with no slug.
+///
+/// This is a shape check only: plenty of non-Substack sites also use
+/// `/p/{slug}` paths.
+pub fn matches(url: &str) -> bool {
+    let rest = match url
+        .strip_prefix("https://")
+        .or_else(|| url.strip_prefix("http://"))
+    {
+        Some(rest) => rest,
+        None => return false,
+    };
+
+    // Only the part before the first `?` / `#` can contain the path.
+    let before_query = rest.split(['?', '#']).next().unwrap_or(rest);
+
+    // The path starts at the first `/` after the authority.
+    let path = match before_query.find('/') {
+        Some(idx) => &before_query[idx..],
+        None => return false,
+    };
+
+    // The text right after the first `/p/` must begin a non-empty segment.
+    match path.split("/p/").nth(1) {
+        Some(after) => !after.is_empty() && !after.starts_with('/'),
+        None => false,
+    }
+}
+
+/// Fetches a Substack post through the publication's JSON API and
+/// returns a normalized JSON object describing it.
+///
+/// The slug and host are taken from `url`, and the request goes to
+/// `{scheme}://{host}/api/v1/posts/{slug}`, reusing `http` when the input
+/// URL used it and `https` otherwise.
+///
+/// # Errors
+///
+/// * `FetchError::Build` when no slug or host can be derived from `url`,
+///   or when the API answers with any status other than 200 (404 gets a
+///   dedicated message).
+/// * `FetchError::BodyDecode` when `resp.html` does not deserialize into
+///   the expected post JSON.
+/// * Whatever error `client.fetch` returns is propagated unchanged.
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let slug = match parse_slug(url) {
+        Some(slug) => slug,
+        None => {
+            return Err(FetchError::Build(format!(
+                "substack_post: cannot parse slug from '{url}'"
+            )));
+        }
+    };
+
+    let host = host_of(url);
+    if host.is_empty() {
+        return Err(FetchError::Build(format!(
+            "substack_post: empty host in '{url}'"
+        )));
+    }
+
+    // Keep plain HTTP only when the caller explicitly asked for it.
+    let scheme = if url.starts_with("http://") { "http" } else { "https" };
+    let api_url = format!("{scheme}://{host}/api/v1/posts/{slug}");
+
+    let resp = client.fetch(&api_url).await?;
+    if resp.status == 404 {
+        return Err(FetchError::Build(format!(
+            "substack_post: '{slug}' not found on {host} (got 404). \
+             If the publication isn't actually on Substack, use /v1/scrape instead."
+        )));
+    }
+    if resp.status != 200 {
+        return Err(FetchError::Build(format!(
+            "substack returned status {} for {api_url}",
+            resp.status
+        )));
+    }
+
+    let post: Post = match serde_json::from_str(&resp.html) {
+        Ok(post) => post,
+        Err(e) => {
+            return Err(FetchError::BodyDecode(format!(
+                "substack_post: '{host}' didn't return Substack JSON, likely not a Substack ({e})"
+            )));
+        }
+    };
+
+    // Audiences treated as paywalled.
+    let has_paywall = matches!(
+        post.audience.as_deref(),
+        Some("only_paid") | Some("founding")
+    );
+
+    // Publication details are optional; every field falls back to null.
+    let publication = post.publication.as_ref();
+    let publication_json = json!({
+        "id":           publication.and_then(|info| info.id),
+        "name":         publication.and_then(|info| info.name.clone()),
+        "subdomain":    publication.and_then(|info| info.subdomain.clone()),
+        "custom_domain":publication.and_then(|info| info.custom_domain.clone()),
+    });
+
+    let authors: Vec<Value> = post
+        .published_bylines
+        .iter()
+        .map(|byline| {
+            json!({
+                "id":     byline.id,
+                "name":   byline.name,
+                "handle": byline.handle,
+                "photo":  byline.photo_url,
+            })
+        })
+        .collect();
+
+    // The truncated text wins when present; otherwise use `body_text`.
+    let body_text = post.truncated_body_text.or(post.body_text);
+
+    Ok(json!({
+        "url":                  url,
+        "api_url":              api_url,
+        "id":                   post.id,
+        "type":                 post.r#type,
+        "slug":                 post.slug,
+        "title":                post.title,
+        "subtitle":             post.subtitle,
+        "description":          post.description,
+        "canonical_url":        post.canonical_url,
+        "post_date":            post.post_date,
+        "updated_at":           post.updated_at,
+        "audience":             post.audience,
+        "has_paywall":          has_paywall,
+        "is_free_preview":      post.is_free_preview,
+        "cover_image":          post.cover_image,
+        "word_count":           post.wordcount,
+        "reactions":            post.reactions,
+        "comment_count":        post.comment_count,
+        "body_html":            post.body_html,
+        "body_text":            body_text,
+        "publication":          publication_json,
+        "authors":              authors,
+    }))
+}
+
+// ---------------------------------------------------------------------------
+// URL helpers
+// ---------------------------------------------------------------------------
+
+/// Returns the authority part (host plus optional port) of `url`.
+///
+/// Everything up to and including the first `://` is dropped; when there
+/// is no `://`, the whole input is treated as starting at the authority.
+/// The authority ends at the first `/`, `?` or `#`, so a URL with no path
+/// but a query or fragment (`https://example.com?x=1`) still yields just
+/// `example.com`. The result is an empty string when no authority is
+/// present.
+fn host_of(url: &str) -> &str {
+    let rest = url.split("://").nth(1).unwrap_or(url);
+    rest.split(['/', '?', '#']).next().unwrap_or("")
+}
+
+/// Extracts the post slug from a `/p/{slug}` URL.
+///
+/// Only the path component is searched: the scheme and authority are
+/// skipped, and anything from the first `?` or `#` onward is discarded
+/// before looking for `/p/`. This keeps a `/p/` inside a query string
+/// (`?next=/p/foo`) or a host literally named `p` from being mistaken
+/// for a post path. The slug is the single path segment after the first
+/// `/p/`; trailing slashes and deeper segments are ignored.
+///
+/// Returns `None` when the URL has no path, no `/p/` in its path, or an
+/// empty slug.
+fn parse_slug(url: &str) -> Option<String> {
+    // Drop the scheme if there is one; scheme-less input is still accepted.
+    let rest = url.split_once("://").map_or(url, |(_, rest)| rest);
+    let before_query = rest.split(['?', '#']).next().unwrap_or(rest);
+
+    // The path starts at the first `/` after the authority.
+    let path = &before_query[before_query.find('/')?..];
+
+    let after = path.split("/p/").nth(1)?;
+    let slug = after.split('/').next().unwrap_or("");
+    if slug.is_empty() {
+        None
+    } else {
+        Some(slug.to_string())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Substack API types (subset)
+// ---------------------------------------------------------------------------
+
+/// Subset of the JSON object returned by `/api/v1/posts/{slug}` that
+/// `extract` reads.
+///
+/// Every scalar field is optional, so a key missing from the response
+/// deserializes to `None` instead of failing the whole parse.
+#[derive(Deserialize)]
+struct Post {
+    id: Option<i64>,
+    // `type` is a Rust keyword, hence the raw identifier.
+    r#type: Option<String>,
+    slug: Option<String>,
+    title: Option<String>,
+    subtitle: Option<String>,
+    description: Option<String>,
+    canonical_url: Option<String>,
+    post_date: Option<String>,
+    updated_at: Option<String>,
+    // `extract` reports "only_paid" and "founding" as paywalled.
+    audience: Option<String>,
+    is_free_preview: Option<bool>,
+    cover_image: Option<String>,
+    // Exposed to callers as `word_count`.
+    wordcount: Option<i64>,
+    // Kept as raw JSON and passed through unchanged.
+    reactions: Option<serde_json::Value>,
+    comment_count: Option<i64>,
+    body_html: Option<String>,
+    body_text: Option<String>,
+    // Preferred over `body_text` when both are present.
+    truncated_body_text: Option<String>,
+    publication: Option<Publication>,
+    // The JSON key is camelCase; a missing key yields an empty list.
+    #[serde(default, rename = "publishedBylines")]
+    published_bylines: Vec<Byline>,
+}
+
+/// Publication details nested under the post's `publication` key.
+/// All fields are optional and surface as `null` in the extractor output
+/// when absent.
+#[derive(Deserialize)]
+struct Publication {
+    id: Option<i64>,
+    name: Option<String>,
+    subdomain: Option<String>,
+    custom_domain: Option<String>,
+}
+
+/// One entry of the post's `publishedBylines` array, describing an author.
+#[derive(Deserialize)]
+struct Byline {
+    id: Option<i64>,
+    name: Option<String>,
+    handle: Option<String>,
+    // Exposed to callers as `photo`.
+    photo_url: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// `matches` accepts http(s) URLs with a `/p/{slug}` path, on Substack
+    /// subdomains and on other hosts alike, and rejects other schemes.
+    #[test]
+    fn matches_post_urls() {
+        assert!(matches(
+            "https://stratechery.substack.com/p/the-tech-letter"
+        ));
+        assert!(matches("https://simonwillison.net/p/2024-08-01-something"));
+        assert!(!matches("https://example.com/"));
+        assert!(!matches("ftp://example.com/p/foo"));
+    }
+
+    /// The slug is the single path segment after `/p/`, with any query
+    /// string, fragment, trailing slash, or deeper segment removed.
+    #[test]
+    fn parse_slug_strips_query_and_trailing_slash() {
+        assert_eq!(
+            parse_slug("https://example.substack.com/p/my-post"),
+            Some("my-post".into())
+        );
+        assert_eq!(
+            parse_slug("https://example.substack.com/p/my-post/"),
+            Some("my-post".into())
+        );
+        assert_eq!(
+            parse_slug("https://example.substack.com/p/my-post?ref=123"),
+            Some("my-post".into())
+        );
+        assert_eq!(
+            parse_slug("https://example.substack.com/p/my-post#comments"),
+            Some("my-post".into())
+        );
+        assert_eq!(
+            parse_slug("https://example.substack.com/p/my-post/comments"),
+            Some("my-post".into())
+        );
+    }
+
+    /// URLs with no `/p/` segment, or with nothing after it, have no slug.
+    #[test]
+    fn parse_slug_rejects_urls_without_slug() {
+        assert_eq!(parse_slug("https://example.substack.com/about"), None);
+        assert_eq!(parse_slug("https://example.substack.com/p/"), None);
+    }
+
+    /// The host keeps an explicit port and drops the scheme and path.
+    #[test]
+    fn host_of_keeps_port_and_drops_path() {
+        assert_eq!(
+            host_of("https://example.substack.com/p/my-post"),
+            "example.substack.com"
+        );
+        assert_eq!(host_of("http://localhost:8080/p/x"), "localhost:8080");
+    }
+}