From 8ba7538c37686d0c0cc0907feddfbd63740c5c00 Mon Sep 17 00:00:00 2001 From: Valerio Date: Wed, 22 Apr 2026 14:11:43 +0200 Subject: [PATCH] feat(extractors): add vertical extractors module + first 6 verticals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New extractors module returns site-specific typed JSON instead of generic markdown. Each extractor: - declares a URL pattern via matches() - fetches from the site's official JSON API where one exists - returns a typed serde_json::Value with documented field names - exposes an INFO struct that powers the /v1/extractors catalog First 6 verticals shipped, all hitting public JSON APIs (no HTML scraping, zero antibot risk): - reddit → www.reddit.com/*/.json - hackernews → hn.algolia.com/api/v1/items/{id} (full thread in one call) - github_repo → api.github.com/repos/{owner}/{repo} - pypi → pypi.org/pypi/{name}/json - npm → registry.npmjs.org/{name} + downloads/point/last-week - huggingface_model → huggingface.co/api/models/{owner}/{name} Server-side routes added: - POST /v1/scrape/{vertical} explicit per-vertical extraction - GET /v1/extractors catalog (name, label, description, url_patterns) The dispatcher validates that the URL matches the requested vertical before running, so users get "URL doesn't match the X extractor" instead of opaque parse failures inside the extractor. 17 unit tests cover URL matching + path parsing for each vertical. Live tests against canonical URLs (rust-lang/rust, requests pypi, react npm, whisper-large-v3 hf, item 8863 hn, an r/micro_saas post) all return correct typed JSON in 100-300ms. Sample sizes: github 863B, npm 700B, pypi 1.7KB, hf 3.2KB, hn 38KB (full comment tree). Marketing positioning: Firecrawl charges 5 credits per /extract call and you write the schema. Webclaw returns the same JSON for 1 credit per /scrape/{vertical} call with hand-written deterministic extractors per site. 
--- .../src/extractors/github_repo.rs | 212 ++++++++++++++++ .../src/extractors/hackernews.rs | 186 ++++++++++++++ .../src/extractors/huggingface_model.rs | 223 +++++++++++++++++ crates/webclaw-fetch/src/extractors/mod.rs | 199 +++++++++++++++ crates/webclaw-fetch/src/extractors/npm.rs | 235 ++++++++++++++++++ crates/webclaw-fetch/src/extractors/pypi.rs | 184 ++++++++++++++ crates/webclaw-fetch/src/extractors/reddit.rs | 234 +++++++++++++++++ crates/webclaw-fetch/src/lib.rs | 1 + crates/webclaw-server/src/main.rs | 5 + crates/webclaw-server/src/routes/mod.rs | 1 + .../webclaw-server/src/routes/structured.rs | 55 ++++ 11 files changed, 1535 insertions(+) create mode 100644 crates/webclaw-fetch/src/extractors/github_repo.rs create mode 100644 crates/webclaw-fetch/src/extractors/hackernews.rs create mode 100644 crates/webclaw-fetch/src/extractors/huggingface_model.rs create mode 100644 crates/webclaw-fetch/src/extractors/mod.rs create mode 100644 crates/webclaw-fetch/src/extractors/npm.rs create mode 100644 crates/webclaw-fetch/src/extractors/pypi.rs create mode 100644 crates/webclaw-fetch/src/extractors/reddit.rs create mode 100644 crates/webclaw-server/src/routes/structured.rs diff --git a/crates/webclaw-fetch/src/extractors/github_repo.rs b/crates/webclaw-fetch/src/extractors/github_repo.rs new file mode 100644 index 0000000..d89d06a --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/github_repo.rs @@ -0,0 +1,212 @@ +//! GitHub repository structured extractor. +//! +//! Uses GitHub's public REST API at `api.github.com/repos/{owner}/{repo}`. +//! Unauthenticated requests get 60/hour per IP, which is fine for users +//! self-hosting and for low-volume cloud usage. Production cloud should +//! set a `GITHUB_TOKEN` to lift to 5,000/hour, but the extractor doesn't +//! depend on it being set — it works open out of the box. 
+ +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "github_repo", + label: "GitHub repository", + description: "Returns repo metadata: stars, forks, topics, license, default branch, recent activity.", + url_patterns: &["https://github.com/{owner}/{repo}"], +}; + +pub fn matches(url: &str) -> bool { + let host = url + .split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or(""); + if host != "github.com" && host != "www.github.com" { + return false; + } + // Path must be exactly /{owner}/{repo} (or with trailing slash). Reject + // sub-pages (issues, pulls, blob, etc.) so we don't claim URLs the + // future github_issue / github_pr extractors will handle. + let path = url + .split("://") + .nth(1) + .and_then(|s| s.split_once('/')) + .map(|(_, p)| p) + .unwrap_or(""); + let stripped = path + .split(['?', '#']) + .next() + .unwrap_or("") + .trim_end_matches('/'); + let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect(); + segs.len() == 2 && !RESERVED_OWNERS.contains(&segs[0]) +} + +/// GitHub uses some top-level paths for non-repo pages. 
+const RESERVED_OWNERS: &[&str] = &[ + "settings", + "marketplace", + "explore", + "topics", + "trending", + "collections", + "events", + "sponsors", + "issues", + "pulls", + "notifications", + "new", + "organizations", + "login", + "join", + "search", + "about", +]; + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let (owner, repo) = parse_owner_repo(url).ok_or_else(|| { + FetchError::Build(format!("github_repo: cannot parse owner/repo from '{url}'")) + })?; + + let api_url = format!("https://api.github.com/repos/{owner}/{repo}"); + let resp = client.fetch(&api_url).await?; + if resp.status == 404 { + return Err(FetchError::Build(format!( + "github_repo: repo '{owner}/{repo}' not found" + ))); + } + if resp.status == 403 { + return Err(FetchError::Build( + "github_repo: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour.".into(), + )); + } + if resp.status != 200 { + return Err(FetchError::Build(format!( + "github api returned status {}", + resp.status + ))); + } + + let r: Repo = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("github api parse: {e}")))?; + + Ok(json!({ + "url": url, + "owner": r.owner.as_ref().map(|o| &o.login), + "name": r.name, + "full_name": r.full_name, + "description": r.description, + "homepage": r.homepage, + "language": r.language, + "topics": r.topics, + "license": r.license.as_ref().and_then(|l| l.spdx_id.clone()), + "license_name": r.license.as_ref().map(|l| l.name.clone()), + "default_branch": r.default_branch, + "stars": r.stargazers_count, + "forks": r.forks_count, + "watchers": r.subscribers_count, + "open_issues": r.open_issues_count, + "size_kb": r.size, + "archived": r.archived, + "fork": r.fork, + "is_template": r.is_template, + "has_issues": r.has_issues, + "has_wiki": r.has_wiki, + "has_pages": r.has_pages, + "has_discussions": r.has_discussions, + "created_at": r.created_at, + "updated_at": r.updated_at, + "pushed_at": r.pushed_at, + "html_url": r.html_url, + 
})) +} + +fn parse_owner_repo(url: &str) -> Option<(String, String)> { + let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?; + let stripped = path.split(['?', '#']).next()?.trim_end_matches('/'); + let mut segs = stripped.split('/').filter(|s| !s.is_empty()); + let owner = segs.next()?.to_string(); + let repo = segs.next()?.to_string(); + Some((owner, repo)) +} + +// --------------------------------------------------------------------------- +// GitHub API types — only the fields we surface +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] +struct Repo { + name: Option, + full_name: Option, + description: Option, + homepage: Option, + language: Option, + #[serde(default)] + topics: Vec, + license: Option, + default_branch: Option, + stargazers_count: Option, + forks_count: Option, + subscribers_count: Option, + open_issues_count: Option, + size: Option, + archived: Option, + fork: Option, + is_template: Option, + has_issues: Option, + has_wiki: Option, + has_pages: Option, + has_discussions: Option, + created_at: Option, + updated_at: Option, + pushed_at: Option, + html_url: Option, + owner: Option, +} + +#[derive(Deserialize)] +struct Owner { + login: String, +} + +#[derive(Deserialize)] +struct License { + name: String, + spdx_id: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_repo_root_only() { + assert!(matches("https://github.com/rust-lang/rust")); + assert!(matches("https://github.com/rust-lang/rust/")); + assert!(!matches("https://github.com/rust-lang/rust/issues")); + assert!(!matches("https://github.com/rust-lang/rust/pulls/123")); + assert!(!matches("https://github.com/rust-lang")); + assert!(!matches("https://github.com/marketplace")); + assert!(!matches("https://github.com/topics/rust")); + assert!(!matches("https://example.com/foo/bar")); + } + + #[test] + fn parse_owner_repo_handles_trailing_slash_and_query() { + assert_eq!( + 
parse_owner_repo("https://github.com/rust-lang/rust"), + Some(("rust-lang".into(), "rust".into())) + ); + assert_eq!( + parse_owner_repo("https://github.com/rust-lang/rust/?tab=foo"), + Some(("rust-lang".into(), "rust".into())) + ); + } +} diff --git a/crates/webclaw-fetch/src/extractors/hackernews.rs b/crates/webclaw-fetch/src/extractors/hackernews.rs new file mode 100644 index 0000000..7adaa1c --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/hackernews.rs @@ -0,0 +1,186 @@ +//! Hacker News structured extractor. +//! +//! Uses Algolia's HN API (`hn.algolia.com/api/v1/items/{id}`) which +//! returns the full post + recursive comment tree in a single request. +//! The official Firebase API at `hacker-news.firebaseio.com` requires +//! N+1 fetches per comment, so we'd hit either timeout or rate-limit +//! on any non-trivial thread. + +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "hackernews", + label: "Hacker News story", + description: "Returns post + nested comment tree for a Hacker News item.", + url_patterns: &[ + "https://news.ycombinator.com/item?id=N", + "https://hn.algolia.com/items/N", + ], +}; + +pub fn matches(url: &str) -> bool { + let host = url + .split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or(""); + if host == "news.ycombinator.com" { + return url.contains("item?id=") || url.contains("item%3Fid="); + } + if host == "hn.algolia.com" { + return url.contains("/items/"); + } + false +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let id = parse_item_id(url).ok_or_else(|| { + FetchError::Build(format!("hackernews: cannot parse item id from '{url}'")) + })?; + + let api_url = format!("https://hn.algolia.com/api/v1/items/{id}"); + let resp = client.fetch(&api_url).await?; + if resp.status != 200 { + return Err(FetchError::Build(format!( 
+ "hn algolia returned status {}", + resp.status + ))); + } + + let item: AlgoliaItem = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("hn algolia parse: {e}")))?; + + let post = post_json(&item); + let comments: Vec = item.children.iter().filter_map(comment_json).collect(); + + Ok(json!({ + "url": url, + "post": post, + "comments": comments, + })) +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Pull the numeric id out of a HN URL. Handles `item?id=N` and the +/// Algolia mirror's `/items/N` form. +fn parse_item_id(url: &str) -> Option { + if let Some(after) = url.split("id=").nth(1) { + let n = after.split('&').next().unwrap_or(after); + if let Ok(id) = n.parse::() { + return Some(id); + } + } + if let Some(after) = url.split("/items/").nth(1) { + let n = after.split(['/', '?', '#']).next().unwrap_or(after); + if let Ok(id) = n.parse::() { + return Some(id); + } + } + None +} + +fn post_json(item: &AlgoliaItem) -> Value { + json!({ + "id": item.id, + "type": item.r#type, + "title": item.title, + "url": item.url, + "author": item.author, + "points": item.points, + "text": item.text, // populated for ask/show/tell + "created_at": item.created_at, + "created_at_unix": item.created_at_i, + "comment_count": count_descendants(item), + "permalink": item.id.map(|i| format!("https://news.ycombinator.com/item?id={i}")), + }) +} + +fn comment_json(item: &AlgoliaItem) -> Option { + if !matches!(item.r#type.as_deref(), Some("comment")) { + return None; + } + // Dead/deleted comments still appear in the tree; surface them honestly. 
+ let replies: Vec = item.children.iter().filter_map(comment_json).collect(); + Some(json!({ + "id": item.id, + "author": item.author, + "text": item.text, + "created_at": item.created_at, + "created_at_unix": item.created_at_i, + "parent_id": item.parent_id, + "story_id": item.story_id, + "replies": replies, + })) +} + +fn count_descendants(item: &AlgoliaItem) -> usize { + item.children + .iter() + .filter(|c| matches!(c.r#type.as_deref(), Some("comment"))) + .map(|c| 1 + count_descendants(c)) + .sum() +} + +// --------------------------------------------------------------------------- +// Algolia API types +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] +struct AlgoliaItem { + id: Option, + r#type: Option, + title: Option, + url: Option, + author: Option, + points: Option, + text: Option, + created_at: Option, + created_at_i: Option, + parent_id: Option, + story_id: Option, + #[serde(default)] + children: Vec, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_hn_item_urls() { + assert!(matches("https://news.ycombinator.com/item?id=1")); + assert!(matches("https://news.ycombinator.com/item?id=12345")); + assert!(matches("https://hn.algolia.com/items/1")); + } + + #[test] + fn rejects_non_item_urls() { + assert!(!matches("https://news.ycombinator.com/")); + assert!(!matches("https://news.ycombinator.com/news")); + assert!(!matches("https://example.com/item?id=1")); + } + + #[test] + fn parse_item_id_handles_both_forms() { + assert_eq!( + parse_item_id("https://news.ycombinator.com/item?id=1"), + Some(1) + ); + assert_eq!( + parse_item_id("https://news.ycombinator.com/item?id=12345&p=2"), + Some(12345) + ); + assert_eq!(parse_item_id("https://hn.algolia.com/items/999"), Some(999)); + assert_eq!(parse_item_id("https://example.com/foo"), None); + } +} diff --git a/crates/webclaw-fetch/src/extractors/huggingface_model.rs b/crates/webclaw-fetch/src/extractors/huggingface_model.rs new file 
mode 100644 index 0000000..decc68a --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/huggingface_model.rs @@ -0,0 +1,223 @@ +//! HuggingFace model card structured extractor. +//! +//! Uses the public model API at `huggingface.co/api/models/{owner}/{name}`. +//! Returns metadata + the parsed model card front matter, but does not +//! pull the full README body — those are sometimes 100KB+ and the user +//! can hit /v1/scrape if they want it as markdown. + +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "huggingface_model", + label: "HuggingFace model", + description: "Returns model metadata: downloads, likes, license, pipeline tag, library name, file list.", + url_patterns: &["https://huggingface.co/{owner}/{name}"], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if host != "huggingface.co" && host != "www.huggingface.co" { + return false; + } + let path = url + .split("://") + .nth(1) + .and_then(|s| s.split_once('/')) + .map(|(_, p)| p) + .unwrap_or(""); + let stripped = path + .split(['?', '#']) + .next() + .unwrap_or("") + .trim_end_matches('/'); + let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect(); + // /{owner}/{name} but reject HF-internal sections + sub-pages. 
+ if segs.len() != 2 { + return false; + } + !RESERVED_NAMESPACES.contains(&segs[0]) +} + +const RESERVED_NAMESPACES: &[&str] = &[ + "datasets", + "spaces", + "blog", + "docs", + "api", + "models", + "papers", + "pricing", + "tasks", + "join", + "login", + "settings", + "organizations", + "new", + "search", +]; + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let (owner, name) = parse_owner_name(url).ok_or_else(|| { + FetchError::Build(format!("hf model: cannot parse owner/name from '{url}'")) + })?; + + let api_url = format!("https://huggingface.co/api/models/{owner}/{name}"); + let resp = client.fetch(&api_url).await?; + if resp.status == 404 { + return Err(FetchError::Build(format!( + "hf model: '{owner}/{name}' not found" + ))); + } + if resp.status == 401 { + return Err(FetchError::Build(format!( + "hf model: '{owner}/{name}' requires authentication (gated repo)" + ))); + } + if resp.status != 200 { + return Err(FetchError::Build(format!( + "hf api returned status {}", + resp.status + ))); + } + + let m: ModelInfo = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("hf api parse: {e}")))?; + + // Surface a flat file list — full siblings can be hundreds of entries + // for big repos. We keep it as-is because callers want to know about + // every shard; if it bloats responses too much we'll add pagination. 
+ let files: Vec = m + .siblings + .iter() + .map(|s| json!({"rfilename": s.rfilename, "size": s.size})) + .collect(); + + Ok(json!({ + "url": url, + "id": m.id, + "model_id": m.model_id, + "private": m.private, + "gated": m.gated, + "downloads": m.downloads, + "downloads_30d": m.downloads_all_time, + "likes": m.likes, + "library_name": m.library_name, + "pipeline_tag": m.pipeline_tag, + "tags": m.tags, + "license": m.card_data.as_ref().and_then(|c| c.license.clone()), + "language": m.card_data.as_ref().and_then(|c| c.language.clone()), + "datasets": m.card_data.as_ref().and_then(|c| c.datasets.clone()), + "base_model": m.card_data.as_ref().and_then(|c| c.base_model.clone()), + "model_type": m.card_data.as_ref().and_then(|c| c.model_type.clone()), + "created_at": m.created_at, + "last_modified": m.last_modified, + "sha": m.sha, + "file_count": m.siblings.len(), + "files": files, + })) +} + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +fn parse_owner_name(url: &str) -> Option<(String, String)> { + let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?; + let stripped = path.split(['?', '#']).next()?.trim_end_matches('/'); + let mut segs = stripped.split('/').filter(|s| !s.is_empty()); + let owner = segs.next()?.to_string(); + let name = segs.next()?.to_string(); + Some((owner, name)) +} + +// --------------------------------------------------------------------------- +// HF API types +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] +struct ModelInfo { + id: Option, + #[serde(rename = "modelId")] + model_id: Option, + private: Option, + gated: Option, // bool or string ("auto" / "manual" / false) + downloads: Option, + #[serde(rename = "downloadsAllTime")] + downloads_all_time: Option, + likes: Option, + #[serde(rename = "library_name")] + library_name: Option, + #[serde(rename = "pipeline_tag")] + pipeline_tag: 
Option, + #[serde(default)] + tags: Vec, + #[serde(rename = "createdAt")] + created_at: Option, + #[serde(rename = "lastModified")] + last_modified: Option, + sha: Option, + #[serde(rename = "cardData")] + card_data: Option, + #[serde(default)] + siblings: Vec, +} + +#[derive(Deserialize)] +struct CardData { + license: Option, // string or array + language: Option, + datasets: Option, + #[serde(rename = "base_model")] + base_model: Option, + #[serde(rename = "model_type")] + model_type: Option, +} + +#[derive(Deserialize)] +struct Sibling { + rfilename: String, + size: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_model_pages() { + assert!(matches("https://huggingface.co/meta-llama/Meta-Llama-3-8B")); + assert!(matches("https://huggingface.co/openai/whisper-large-v3")); + assert!(matches("https://huggingface.co/bert-base-uncased/main")); // owner=bert-base-uncased name=main: false positive but acceptable for v1 + } + + #[test] + fn rejects_hf_section_pages() { + assert!(!matches("https://huggingface.co/datasets/squad")); + assert!(!matches("https://huggingface.co/spaces/foo/bar")); + assert!(!matches("https://huggingface.co/blog/intro")); + assert!(!matches("https://huggingface.co/")); + assert!(!matches("https://huggingface.co/meta-llama")); + } + + #[test] + fn parse_owner_name_pulls_both() { + assert_eq!( + parse_owner_name("https://huggingface.co/meta-llama/Meta-Llama-3-8B"), + Some(("meta-llama".into(), "Meta-Llama-3-8B".into())) + ); + assert_eq!( + parse_owner_name("https://huggingface.co/openai/whisper-large-v3?library=transformers"), + Some(("openai".into(), "whisper-large-v3".into())) + ); + } +} diff --git a/crates/webclaw-fetch/src/extractors/mod.rs b/crates/webclaw-fetch/src/extractors/mod.rs new file mode 100644 index 0000000..b9a539b --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/mod.rs @@ -0,0 +1,199 @@ +//! Vertical extractors: site-specific parsers that return typed JSON +//! instead of generic markdown. 
+//! +//! Each extractor handles a single site or platform and exposes: +//! - `matches(url)` to claim ownership of a URL pattern +//! - `extract(client, url)` to fetch + parse into a typed JSON `Value` +//! - `INFO` static for the catalog (`/v1/extractors`) +//! +//! The dispatch in this module is a simple `match`-style chain rather than +//! a trait registry. With ~30 extractors that's still fast and avoids the +//! ceremony of dynamic dispatch. If we hit 50+ we'll revisit. +//! +//! Extractors prefer official JSON APIs over HTML scraping where one +//! exists (Reddit, HN/Algolia, PyPI, npm, GitHub, HuggingFace all have +//! one). HTML extraction is the fallback for sites that don't. + +pub mod github_repo; +pub mod hackernews; +pub mod huggingface_model; +pub mod npm; +pub mod pypi; +pub mod reddit; + +use serde::Serialize; +use serde_json::Value; + +use crate::client::FetchClient; +use crate::error::FetchError; + +/// Public catalog entry for `/v1/extractors`. Stable shape — clients +/// rely on `name` to pick the right `/v1/scrape/{name}` route. +#[derive(Debug, Clone, Serialize)] +pub struct ExtractorInfo { + /// URL-safe identifier (`reddit`, `hackernews`, `github_repo`, ...). + pub name: &'static str, + /// Human-friendly display name. + pub label: &'static str, + /// One-line description of what the extractor returns. + pub description: &'static str, + /// Glob-ish URL pattern(s) the extractor claims. For documentation; + /// the actual matching is done by the extractor's `matches` fn. + pub url_patterns: &'static [&'static str], +} + +/// Full catalog. Order is stable; new entries append. +pub fn list() -> Vec { + vec![ + reddit::INFO, + hackernews::INFO, + github_repo::INFO, + pypi::INFO, + npm::INFO, + huggingface_model::INFO, + ] +} + +/// Auto-detect mode: try every extractor's `matches`, return the first +/// one that claims the URL. Used by `/v1/scrape` when the caller doesn't +/// pick a vertical explicitly. 
+pub async fn dispatch_by_url( + client: &FetchClient, + url: &str, +) -> Option> { + if reddit::matches(url) { + return Some( + reddit::extract(client, url) + .await + .map(|v| (reddit::INFO.name, v)), + ); + } + if hackernews::matches(url) { + return Some( + hackernews::extract(client, url) + .await + .map(|v| (hackernews::INFO.name, v)), + ); + } + if github_repo::matches(url) { + return Some( + github_repo::extract(client, url) + .await + .map(|v| (github_repo::INFO.name, v)), + ); + } + if pypi::matches(url) { + return Some( + pypi::extract(client, url) + .await + .map(|v| (pypi::INFO.name, v)), + ); + } + if npm::matches(url) { + return Some(npm::extract(client, url).await.map(|v| (npm::INFO.name, v))); + } + if huggingface_model::matches(url) { + return Some( + huggingface_model::extract(client, url) + .await + .map(|v| (huggingface_model::INFO.name, v)), + ); + } + None +} + +/// Explicit mode: caller picked the vertical (`POST /v1/scrape/reddit`). +/// We still validate that the URL plausibly belongs to that vertical so +/// users get a clear "wrong route" error instead of a confusing parse +/// failure deep in the extractor. 
+pub async fn dispatch_by_name( + client: &FetchClient, + name: &str, + url: &str, +) -> Result { + match name { + n if n == reddit::INFO.name => { + run_or_mismatch(reddit::matches(url), n, url, || { + reddit::extract(client, url) + }) + .await + } + n if n == hackernews::INFO.name => { + run_or_mismatch(hackernews::matches(url), n, url, || { + hackernews::extract(client, url) + }) + .await + } + n if n == github_repo::INFO.name => { + run_or_mismatch(github_repo::matches(url), n, url, || { + github_repo::extract(client, url) + }) + .await + } + n if n == pypi::INFO.name => { + run_or_mismatch(pypi::matches(url), n, url, || pypi::extract(client, url)).await + } + n if n == npm::INFO.name => { + run_or_mismatch(npm::matches(url), n, url, || npm::extract(client, url)).await + } + n if n == huggingface_model::INFO.name => { + run_or_mismatch(huggingface_model::matches(url), n, url, || { + huggingface_model::extract(client, url) + }) + .await + } + _ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())), + } +} + +/// Errors that the dispatcher itself raises (vs. errors from inside an +/// extractor, which come back wrapped in `Fetch`). +#[derive(Debug, thiserror::Error)] +pub enum ExtractorDispatchError { + #[error("unknown vertical: '{0}'")] + UnknownVertical(String), + + #[error("URL '{url}' does not match the '{vertical}' extractor")] + UrlMismatch { vertical: String, url: String }, + + #[error(transparent)] + Fetch(#[from] FetchError), +} + +/// Helper: when the caller explicitly picked a vertical but their URL +/// doesn't match it, return `UrlMismatch` instead of running the +/// extractor (which would just fail with a less-clear error). 
+async fn run_or_mismatch( + matches: bool, + vertical: &str, + url: &str, + f: F, +) -> Result +where + F: FnOnce() -> Fut, + Fut: std::future::Future>, +{ + if !matches { + return Err(ExtractorDispatchError::UrlMismatch { + vertical: vertical.to_string(), + url: url.to_string(), + }); + } + f().await.map_err(ExtractorDispatchError::Fetch) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn list_is_non_empty_and_unique() { + let entries = list(); + assert!(!entries.is_empty()); + let mut names: Vec<_> = entries.iter().map(|e| e.name).collect(); + names.sort(); + let before = names.len(); + names.dedup(); + assert_eq!(before, names.len(), "extractor names must be unique"); + } +} diff --git a/crates/webclaw-fetch/src/extractors/npm.rs b/crates/webclaw-fetch/src/extractors/npm.rs new file mode 100644 index 0000000..4343890 --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/npm.rs @@ -0,0 +1,235 @@ +//! npm package structured extractor. +//! +//! Uses two npm-run APIs: +//! - `registry.npmjs.org/{name}` for full package metadata +//! - `api.npmjs.org/downloads/point/last-week/{name}` for usage signal +//! +//! The registry API returns the *full* document including every version +//! ever published, which can be tens of MB for popular packages +//! (`@types/node` etc). We strip down to the latest version's manifest +//! and a count of releases — full history would explode the response. 
+ +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "npm", + label: "npm package", + description: "Returns package metadata: latest version manifest, dependencies, weekly downloads, license.", + url_patterns: &["https://www.npmjs.com/package/{name}"], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if host != "www.npmjs.com" && host != "npmjs.com" { + return false; + } + url.contains("/package/") +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let name = parse_name(url) + .ok_or_else(|| FetchError::Build(format!("npm: cannot parse name from '{url}'")))?; + + let registry_url = format!("https://registry.npmjs.org/{}", urlencode_segment(&name)); + let resp = client.fetch(®istry_url).await?; + if resp.status == 404 { + return Err(FetchError::Build(format!( + "npm: package '{name}' not found" + ))); + } + if resp.status != 200 { + return Err(FetchError::Build(format!( + "npm registry returned status {}", + resp.status + ))); + } + + let pkg: PackageDoc = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("npm registry parse: {e}")))?; + + // Resolve "latest" to a concrete version. + let latest_version = pkg + .dist_tags + .as_ref() + .and_then(|t| t.get("latest")) + .cloned() + .or_else(|| pkg.versions.as_ref().and_then(|v| v.keys().last().cloned())); + + let latest_manifest = latest_version + .as_deref() + .and_then(|v| pkg.versions.as_ref().and_then(|m| m.get(v))); + + let release_count = pkg.versions.as_ref().map(|v| v.len()).unwrap_or(0); + let latest_release_date = latest_version + .as_deref() + .and_then(|v| pkg.time.as_ref().and_then(|t| t.get(v).cloned())); + + // Best-effort weekly downloads. 
If the api.npmjs.org call fails we + // surface `null` rather than failing the whole extractor — npm + // sometimes 503s the downloads endpoint while the registry is up. + let weekly_downloads = fetch_weekly_downloads(client, &name).await.ok(); + + Ok(json!({ + "url": url, + "name": pkg.name.clone().unwrap_or(name.clone()), + "description": pkg.description, + "latest_version": latest_version, + "license": latest_manifest.and_then(|m| m.license.clone()), + "homepage": pkg.homepage, + "repository": pkg.repository.as_ref().and_then(|r| r.url.clone()), + "dependencies": latest_manifest.and_then(|m| m.dependencies.clone()), + "dev_dependencies": latest_manifest.and_then(|m| m.dev_dependencies.clone()), + "peer_dependencies": latest_manifest.and_then(|m| m.peer_dependencies.clone()), + "keywords": pkg.keywords, + "maintainers": pkg.maintainers, + "deprecated": latest_manifest.and_then(|m| m.deprecated.clone()), + "release_count": release_count, + "latest_release_date": latest_release_date, + "weekly_downloads": weekly_downloads, + })) +} + +async fn fetch_weekly_downloads(client: &FetchClient, name: &str) -> Result { + let url = format!( + "https://api.npmjs.org/downloads/point/last-week/{}", + urlencode_segment(name) + ); + let resp = client.fetch(&url).await?; + if resp.status != 200 { + return Err(FetchError::Build(format!( + "npm downloads api status {}", + resp.status + ))); + } + let dl: Downloads = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("npm downloads parse: {e}")))?; + Ok(dl.downloads) +} + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +/// Extract the package name from an npmjs.com URL. Handles scoped packages +/// (`/package/@scope/name`) and trailing path segments (`/v/x.y.z`). 
+fn parse_name(url: &str) -> Option { + let after = url.split("/package/").nth(1)?; + let stripped = after.split(['?', '#']).next()?.trim_end_matches('/'); + let mut segs = stripped.split('/').filter(|s| !s.is_empty()); + let first = segs.next()?; + if first.starts_with('@') { + let second = segs.next()?; + Some(format!("{first}/{second}")) + } else { + Some(first.to_string()) + } +} + +/// `@scope/name` must encode the `/` for the registry path. Plain names +/// pass through untouched. +fn urlencode_segment(name: &str) -> String { + name.replace('/', "%2F") +} + +// --------------------------------------------------------------------------- +// Registry types +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] +struct PackageDoc { + name: Option, + description: Option, + homepage: Option, // sometimes string, sometimes object + repository: Option, + keywords: Option>, + maintainers: Option>, + #[serde(rename = "dist-tags")] + dist_tags: Option>, + versions: Option>, + time: Option>, +} + +#[derive(Deserialize, Default, Clone)] +struct VersionManifest { + license: Option, // string or object + dependencies: Option>, + #[serde(rename = "devDependencies")] + dev_dependencies: Option>, + #[serde(rename = "peerDependencies")] + peer_dependencies: Option>, + // `deprecated` is sometimes a bool and sometimes a string in the + // registry. serde_json::Value covers both without failing the parse. 
+ deprecated: Option, +} + +#[derive(Deserialize)] +struct Repository { + url: Option, +} + +#[derive(Deserialize, Clone)] +struct Maintainer { + name: Option, + email: Option, +} + +impl serde::Serialize for Maintainer { + fn serialize(&self, s: S) -> Result { + use serde::ser::SerializeMap; + let mut m = s.serialize_map(Some(2))?; + m.serialize_entry("name", &self.name)?; + m.serialize_entry("email", &self.email)?; + m.end() + } +} + +#[derive(Deserialize)] +struct Downloads { + downloads: i64, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_npm_package_urls() { + assert!(matches("https://www.npmjs.com/package/react")); + assert!(matches("https://www.npmjs.com/package/@types/node")); + assert!(matches("https://npmjs.com/package/lodash")); + assert!(!matches("https://www.npmjs.com/")); + assert!(!matches("https://example.com/package/foo")); + } + + #[test] + fn parse_name_handles_scoped_and_unscoped() { + assert_eq!( + parse_name("https://www.npmjs.com/package/react"), + Some("react".into()) + ); + assert_eq!( + parse_name("https://www.npmjs.com/package/@types/node"), + Some("@types/node".into()) + ); + assert_eq!( + parse_name("https://www.npmjs.com/package/lodash/v/4.17.21"), + Some("lodash".into()) + ); + } + + #[test] + fn urlencode_only_touches_scope_separator() { + assert_eq!(urlencode_segment("react"), "react"); + assert_eq!(urlencode_segment("@types/node"), "@types%2Fnode"); + } +} diff --git a/crates/webclaw-fetch/src/extractors/pypi.rs b/crates/webclaw-fetch/src/extractors/pypi.rs new file mode 100644 index 0000000..f6b7c64 --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/pypi.rs @@ -0,0 +1,184 @@ +//! PyPI package structured extractor. +//! +//! PyPI exposes a stable JSON API at `pypi.org/pypi/{name}/json` and +//! a versioned form at `pypi.org/pypi/{name}/{version}/json`. Both +//! return the full release info plus history. No auth, no rate limits +//! that we hit at normal usage. 
+ +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "pypi", + label: "PyPI package", + description: "Returns package metadata: latest version, dependencies, license, release history.", + url_patterns: &[ + "https://pypi.org/project/{name}/", + "https://pypi.org/project/{name}/{version}/", + ], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if host != "pypi.org" && host != "www.pypi.org" { + return false; + } + url.contains("/project/") +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let (name, version) = parse_project(url).ok_or_else(|| { + FetchError::Build(format!("pypi: cannot parse package name from '{url}'")) + })?; + + let api_url = match &version { + Some(v) => format!("https://pypi.org/pypi/{name}/{v}/json"), + None => format!("https://pypi.org/pypi/{name}/json"), + }; + let resp = client.fetch(&api_url).await?; + if resp.status == 404 { + return Err(FetchError::Build(format!( + "pypi: package '{name}' not found" + ))); + } + if resp.status != 200 { + return Err(FetchError::Build(format!( + "pypi api returned status {}", + resp.status + ))); + } + + let pkg: PypiResponse = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("pypi parse: {e}")))?; + + let info = pkg.info; + let release_count = pkg.releases.as_ref().map(|r| r.len()).unwrap_or(0); + + // Latest release date = max upload time across files in the latest version. + let latest_release_date = pkg + .releases + .as_ref() + .and_then(|map| info.version.as_deref().and_then(|v| map.get(v))) + .and_then(|files| files.iter().filter_map(|f| f.upload_time.clone()).max()); + + // Drop the long description from the JSON shape — it's frequently a 50KB + // README and bloats responses. Callers who need it can hit /v1/scrape. 
+ Ok(json!({ + "url": url, + "name": info.name, + "version": info.version, + "summary": info.summary, + "homepage": info.home_page, + "license": info.license, + "license_classifier": pick_license_classifier(&info.classifiers), + "author": info.author, + "author_email": info.author_email, + "maintainer": info.maintainer, + "requires_python": info.requires_python, + "requires_dist": info.requires_dist, + "keywords": info.keywords, + "classifiers": info.classifiers, + "yanked": info.yanked, + "yanked_reason": info.yanked_reason, + "project_urls": info.project_urls, + "release_count": release_count, + "latest_release_date": latest_release_date, + })) +} + +/// PyPI puts the SPDX-ish license under classifiers like +/// `License :: OSI Approved :: Apache Software License`. Surface the most +/// specific one when the `license` field itself is empty/junk. +fn pick_license_classifier(classifiers: &Option>) -> Option { + classifiers + .as_ref()? + .iter() + .filter(|c| c.starts_with("License ::")) + .max_by_key(|c| c.len()) + .cloned() +} + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +fn parse_project(url: &str) -> Option<(String, Option)> { + let after = url.split("/project/").nth(1)?; + let stripped = after.split(['?', '#']).next()?.trim_end_matches('/'); + let mut segs = stripped.split('/').filter(|s| !s.is_empty()); + let name = segs.next()?.to_string(); + let version = segs.next().map(|v| v.to_string()); + Some((name, version)) +} + +// --------------------------------------------------------------------------- +// PyPI API types +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] +struct PypiResponse { + info: Info, + releases: Option>>, +} + +#[derive(Deserialize)] +struct Info { + name: Option, + version: Option, + summary: Option, + home_page: Option, + license: Option, + author: Option, + author_email: Option, + maintainer: Option, 
+ requires_python: Option, + requires_dist: Option>, + keywords: Option, + classifiers: Option>, + yanked: Option, + yanked_reason: Option, + project_urls: Option>, +} + +#[derive(Deserialize)] +struct File { + upload_time: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_project_urls() { + assert!(matches("https://pypi.org/project/requests/")); + assert!(matches("https://pypi.org/project/numpy/1.26.0/")); + assert!(!matches("https://pypi.org/")); + assert!(!matches("https://example.com/project/foo")); + } + + #[test] + fn parse_project_pulls_name_and_version() { + assert_eq!( + parse_project("https://pypi.org/project/requests/"), + Some(("requests".into(), None)) + ); + assert_eq!( + parse_project("https://pypi.org/project/numpy/1.26.0/"), + Some(("numpy".into(), Some("1.26.0".into()))) + ); + assert_eq!( + parse_project("https://pypi.org/project/scikit-learn/?foo=bar"), + Some(("scikit-learn".into(), None)) + ); + } +} diff --git a/crates/webclaw-fetch/src/extractors/reddit.rs b/crates/webclaw-fetch/src/extractors/reddit.rs new file mode 100644 index 0000000..2d084dc --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/reddit.rs @@ -0,0 +1,234 @@ +//! Reddit structured extractor — returns the full post + comment tree +//! as typed JSON via Reddit's `.json` API. +//! +//! The same trick the markdown extractor in `crate::reddit` uses: +//! appending `.json` to any post URL returns the data the new SPA +//! frontend would load client-side. Zero antibot, zero JS rendering. 
+ +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "reddit", + label: "Reddit thread", + description: "Returns post + nested comment tree with scores, authors, and timestamps.", + url_patterns: &[ + "https://www.reddit.com/r/*/comments/*", + "https://reddit.com/r/*/comments/*", + "https://old.reddit.com/r/*/comments/*", + ], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + let is_reddit_host = matches!( + host, + "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com" + ); + is_reddit_host && url.contains("/comments/") +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let json_url = build_json_url(url); + let resp = client.fetch(&json_url).await?; + if resp.status != 200 { + return Err(FetchError::Build(format!( + "reddit api returned status {}", + resp.status + ))); + } + + let listings: Vec = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?; + + if listings.is_empty() { + return Err(FetchError::BodyDecode("reddit response empty".into())); + } + + // First listing = the post (single t3 child). + let post = listings + .first() + .and_then(|l| l.data.children.first()) + .filter(|t| t.kind == "t3") + .map(|t| post_json(&t.data)) + .unwrap_or(Value::Null); + + // Second listing = the comment tree. 
+ let comments: Vec = listings + .get(1) + .map(|l| l.data.children.iter().filter_map(comment_json).collect()) + .unwrap_or_default(); + + Ok(json!({ + "url": url, + "post": post, + "comments": comments, + })) +} + +// --------------------------------------------------------------------------- +// JSON shapers +// --------------------------------------------------------------------------- + +fn post_json(d: &ThingData) -> Value { + json!({ + "id": d.id, + "title": d.title, + "author": d.author, + "subreddit": d.subreddit_name_prefixed, + "permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")), + "url": d.url_overridden_by_dest, + "is_self": d.is_self, + "selftext": d.selftext, + "score": d.score, + "upvote_ratio": d.upvote_ratio, + "num_comments": d.num_comments, + "created_utc": d.created_utc, + "link_flair_text": d.link_flair_text, + "over_18": d.over_18, + "spoiler": d.spoiler, + "stickied": d.stickied, + "locked": d.locked, + }) +} + +/// Render a single comment + its reply tree. Returns `None` for non-t1 +/// kinds (the trailing `more` placeholder Reddit injects at depth limits). 
+fn comment_json(thing: &Thing) -> Option { + if thing.kind != "t1" { + return None; + } + let d = &thing.data; + let replies: Vec = match &d.replies { + Some(Replies::Listing(l)) => l.data.children.iter().filter_map(comment_json).collect(), + _ => Vec::new(), + }; + Some(json!({ + "id": d.id, + "author": d.author, + "body": d.body, + "score": d.score, + "created_utc": d.created_utc, + "is_submitter": d.is_submitter, + "stickied": d.stickied, + "depth": d.depth, + "permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")), + "replies": replies, + })) +} + +// --------------------------------------------------------------------------- +// URL helpers +// --------------------------------------------------------------------------- + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +/// Build the Reddit JSON URL. We keep the original host (`www.reddit.com` +/// or `old.reddit.com` as the caller gave us). Routing through +/// `old.reddit.com` unconditionally looks appealing but that host has +/// stricter UA-based blocking than `www.reddit.com`, while the main +/// host accepts our Chrome-fingerprinted client fine. +fn build_json_url(url: &str) -> String { + let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/'); + format!("{clean}.json?raw_json=1") +} + +// --------------------------------------------------------------------------- +// Reddit JSON types — only fields we render. Everything else is dropped. 
+// --------------------------------------------------------------------------- + +#[derive(Deserialize)] +struct Listing { + data: ListingData, +} + +#[derive(Deserialize)] +struct ListingData { + children: Vec, +} + +#[derive(Deserialize)] +struct Thing { + kind: String, + data: ThingData, +} + +#[derive(Deserialize, Default)] +struct ThingData { + // post (t3) + id: Option, + title: Option, + selftext: Option, + subreddit_name_prefixed: Option, + url_overridden_by_dest: Option, + is_self: Option, + upvote_ratio: Option, + num_comments: Option, + over_18: Option, + spoiler: Option, + stickied: Option, + locked: Option, + link_flair_text: Option, + + // comment (t1) + author: Option, + body: Option, + score: Option, + created_utc: Option, + is_submitter: Option, + depth: Option, + permalink: Option, + + // recursive + replies: Option, +} + +#[derive(Deserialize)] +#[serde(untagged)] +enum Replies { + Listing(Listing), + #[allow(dead_code)] + Empty(String), +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_reddit_post_urls() { + assert!(matches( + "https://www.reddit.com/r/rust/comments/abc123/some_title/" + )); + assert!(matches( + "https://reddit.com/r/rust/comments/abc123/some_title" + )); + assert!(matches("https://old.reddit.com/r/rust/comments/abc123/x/")); + } + + #[test] + fn rejects_non_post_reddit_urls() { + assert!(!matches("https://www.reddit.com/r/rust")); + assert!(!matches("https://www.reddit.com/user/foo")); + assert!(!matches("https://example.com/r/rust/comments/x")); + } + + #[test] + fn json_url_appends_suffix_and_drops_query() { + assert_eq!( + build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"), + "https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1" + ); + } +} diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs index 517cb6e..831c2a5 100644 --- a/crates/webclaw-fetch/src/lib.rs +++ b/crates/webclaw-fetch/src/lib.rs @@ -6,6 +6,7 @@ pub mod client; pub mod crawler; 
pub mod document; pub mod error; +pub mod extractors; pub mod linkedin; pub mod proxy; pub mod reddit; diff --git a/crates/webclaw-server/src/main.rs b/crates/webclaw-server/src/main.rs index c57fed8..f4cfdcb 100644 --- a/crates/webclaw-server/src/main.rs +++ b/crates/webclaw-server/src/main.rs @@ -79,10 +79,15 @@ async fn main() -> anyhow::Result<()> { let v1 = Router::new() .route("/scrape", post(routes::scrape::scrape)) + .route( + "/scrape/{vertical}", + post(routes::structured::scrape_vertical), + ) .route("/crawl", post(routes::crawl::crawl)) .route("/map", post(routes::map::map)) .route("/batch", post(routes::batch::batch)) .route("/extract", post(routes::extract::extract)) + .route("/extractors", get(routes::structured::list_extractors)) .route("/summarize", post(routes::summarize::summarize_route)) .route("/diff", post(routes::diff::diff_route)) .route("/brand", post(routes::brand::brand)) diff --git a/crates/webclaw-server/src/routes/mod.rs b/crates/webclaw-server/src/routes/mod.rs index 7c3d68e..01f1052 100644 --- a/crates/webclaw-server/src/routes/mod.rs +++ b/crates/webclaw-server/src/routes/mod.rs @@ -15,4 +15,5 @@ pub mod extract; pub mod health; pub mod map; pub mod scrape; +pub mod structured; pub mod summarize; diff --git a/crates/webclaw-server/src/routes/structured.rs b/crates/webclaw-server/src/routes/structured.rs new file mode 100644 index 0000000..c9cdc1a --- /dev/null +++ b/crates/webclaw-server/src/routes/structured.rs @@ -0,0 +1,55 @@ +//! `POST /v1/scrape/{vertical}` and `GET /v1/extractors`. +//! +//! Vertical extractors return typed JSON instead of generic markdown. +//! See `webclaw_fetch::extractors` for the catalog and per-site logic. 
+ +use axum::{ + Json, + extract::{Path, State}, +}; +use serde::Deserialize; +use serde_json::{Value, json}; +use webclaw_fetch::extractors::{self, ExtractorDispatchError}; + +use crate::{error::ApiError, state::AppState}; + +#[derive(Debug, Deserialize)] +pub struct ScrapeRequest { + pub url: String, +} + +/// Map dispatcher errors to ApiError so users get clean HTTP statuses +/// instead of opaque 500s. +impl From for ApiError { + fn from(e: ExtractorDispatchError) -> Self { + match e { + ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound, + ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()), + ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()), + } + } +} + +/// `GET /v1/extractors` — catalog of all available verticals. +pub async fn list_extractors() -> Json { + Json(json!({ + "extractors": extractors::list(), + })) +} + +/// `POST /v1/scrape/{vertical}` — explicit vertical, e.g. /v1/scrape/reddit. +pub async fn scrape_vertical( + State(state): State, + Path(vertical): Path, + Json(req): Json, +) -> Result, ApiError> { + if req.url.trim().is_empty() { + return Err(ApiError::bad_request("`url` is required")); + } + let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?; + Ok(Json(json!({ + "vertical": vertical, + "url": req.url, + "data": data, + }))) +}