feat(extractors): add vertical extractors module + first 6 verticals

New extractors module returns site-specific typed JSON instead of
generic markdown. Each extractor:
- declares a URL pattern via matches()
- fetches from the site's official JSON API where one exists
- returns a typed serde_json::Value with documented field names
- exposes an INFO struct that powers the /v1/extractors catalog

First 6 verticals shipped, all hitting public JSON APIs (no HTML
scraping, zero antibot risk):

- reddit       → www.reddit.com/*/.json
- hackernews   → hn.algolia.com/api/v1/items/{id} (full thread in one call)
- github_repo  → api.github.com/repos/{owner}/{repo}
- pypi         → pypi.org/pypi/{name}/json
- npm          → registry.npmjs.org/{name} + downloads/point/last-week
- huggingface_model → huggingface.co/api/models/{owner}/{name}

Server-side routes added:
- POST /v1/scrape/{vertical}  explicit per-vertical extraction
- GET  /v1/extractors         catalog (name, label, description, url_patterns)

The dispatcher validates that URL matches the requested vertical
before running, so users get "URL doesn't match the X extractor"
instead of opaque parse failures inside the extractor.

17 unit tests cover URL matching + path parsing for each vertical.
Live tests against canonical URLs (rust-lang/rust, requests pypi,
react npm, whisper-large-v3 hf, item 8863 hn, an r/micro_saas post)
all return correct typed JSON in 100-300ms. Sample sizes: github
863B, npm 700B, pypi 1.7KB, hf 3.2KB, hn 38KB (full comment tree).

Marketing positioning: Firecrawl charges 5 credits per /extract call
and makes you write the schema yourself. Webclaw returns the same JSON
for 1 credit per /scrape/{vertical} call, using hand-written
deterministic extractors per site.
This commit is contained in:
Valerio 2026-04-22 14:11:43 +02:00
parent ccdb6d364b
commit 8ba7538c37
11 changed files with 1535 additions and 0 deletions

View file

@ -0,0 +1,212 @@
//! GitHub repository structured extractor.
//!
//! Uses GitHub's public REST API at `api.github.com/repos/{owner}/{repo}`.
//! Unauthenticated requests get 60/hour per IP, which is fine for users
//! self-hosting and for low-volume cloud usage. Production cloud should
//! set a `GITHUB_TOKEN` to lift to 5,000/hour, but the extractor doesn't
//! depend on it being set — it works open out of the box.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry for `GET /v1/extractors`; `name` is also the
/// `/v1/scrape/{vertical}` route segment for this extractor.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "github_repo",
    label: "GitHub repository",
    description: "Returns repo metadata: stars, forks, topics, license, default branch, recent activity.",
    url_patterns: &["https://github.com/{owner}/{repo}"],
};
/// True when `url` is a repository root on github.com.
///
/// Accepts exactly `https://github.com/{owner}/{repo}` (optionally with a
/// trailing slash, query string, or fragment). Sub-pages (issues, pulls,
/// blob, ...) are rejected so future github_issue / github_pr extractors
/// can claim them, as are GitHub's reserved top-level namespaces.
pub fn matches(url: &str) -> bool {
    let after_scheme = url.split("://").nth(1);
    let host = after_scheme.unwrap_or(url).split('/').next().unwrap_or("");
    if !matches!(host, "github.com" | "www.github.com") {
        return false;
    }
    // Everything after the first '/' following the host is the path.
    let path = match after_scheme.and_then(|rest| rest.split_once('/')) {
        Some((_, p)) => p,
        None => "",
    };
    // Drop query/fragment and a trailing slash before segmenting.
    let clean = path
        .split(['?', '#'])
        .next()
        .unwrap_or("")
        .trim_end_matches('/');
    let parts: Vec<&str> = clean.split('/').filter(|p| !p.is_empty()).collect();
    parts.len() == 2 && !RESERVED_OWNERS.contains(&parts[0])
}
/// GitHub uses some top-level paths for non-repo pages.
const RESERVED_OWNERS: &[&str] = &[
    "settings",
    "marketplace",
    "explore",
    "topics",
    "trending",
    "collections",
    "events",
    "sponsors",
    "issues",
    "pulls",
    "notifications",
    "new",
    "organizations",
    "login",
    "join",
    "search",
    "about",
];
/// Fetch repository metadata from the GitHub REST API and shape it into
/// our stable JSON contract.
///
/// # Errors
/// `FetchError::Build` for unparsable URLs, 404 (unknown repo), 403
/// (rate limited), or any other non-200 status; `FetchError::BodyDecode`
/// when the API response is not valid JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (owner, repo) = parse_owner_repo(url).ok_or_else(|| {
        FetchError::Build(format!("github_repo: cannot parse owner/repo from '{url}'"))
    })?;
    let endpoint = format!("https://api.github.com/repos/{owner}/{repo}");
    let resp = client.fetch(&endpoint).await?;
    // The interesting statuses are mutually exclusive; map each to a
    // targeted error before attempting to parse.
    match resp.status {
        200 => {}
        404 => {
            return Err(FetchError::Build(format!(
                "github_repo: repo '{owner}/{repo}' not found"
            )));
        }
        403 => {
            return Err(FetchError::Build(
                "github_repo: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour.".into(),
            ));
        }
        other => {
            return Err(FetchError::Build(format!(
                "github api returned status {other}"
            )));
        }
    }
    let data: Repo = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("github api parse: {e}")))?;
    Ok(json!({
        "url": url,
        "owner": data.owner.as_ref().map(|o| &o.login),
        "name": data.name,
        "full_name": data.full_name,
        "description": data.description,
        "homepage": data.homepage,
        "language": data.language,
        "topics": data.topics,
        "license": data.license.as_ref().and_then(|l| l.spdx_id.clone()),
        "license_name": data.license.as_ref().map(|l| l.name.clone()),
        "default_branch": data.default_branch,
        "stars": data.stargazers_count,
        "forks": data.forks_count,
        "watchers": data.subscribers_count,
        "open_issues": data.open_issues_count,
        "size_kb": data.size,
        "archived": data.archived,
        "fork": data.fork,
        "is_template": data.is_template,
        "has_issues": data.has_issues,
        "has_wiki": data.has_wiki,
        "has_pages": data.has_pages,
        "has_discussions": data.has_discussions,
        "created_at": data.created_at,
        "updated_at": data.updated_at,
        "pushed_at": data.pushed_at,
        "html_url": data.html_url,
    }))
}
/// Pull `(owner, repo)` out of a github.com URL: the first two non-empty
/// path segments, after stripping query/fragment and a trailing slash.
/// Returns `None` when fewer than two segments are present.
fn parse_owner_repo(url: &str) -> Option<(String, String)> {
    let (_, path) = url.split("://").nth(1)?.split_once('/')?;
    let clean = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut parts = clean.split('/').filter(|p| !p.is_empty());
    match (parts.next(), parts.next()) {
        (Some(owner), Some(repo)) => Some((owner.to_string(), repo.to_string())),
        _ => None,
    }
}
// ---------------------------------------------------------------------------
// GitHub API types — only the fields we surface
// ---------------------------------------------------------------------------
/// Subset of GitHub's "get a repository" API response that we read.
/// Every field is `Option` so upstream schema drift degrades to `null`s
/// in our output instead of a hard parse failure.
#[derive(Deserialize)]
struct Repo {
    name: Option<String>,
    full_name: Option<String>,
    description: Option<String>,
    homepage: Option<String>,
    language: Option<String>,
    // Absent `topics` deserializes as an empty list.
    #[serde(default)]
    topics: Vec<String>,
    license: Option<License>,
    default_branch: Option<String>,
    stargazers_count: Option<i64>,
    forks_count: Option<i64>,
    // Surfaced as "watchers" by extract().
    subscribers_count: Option<i64>,
    open_issues_count: Option<i64>,
    // Surfaced as "size_kb" by extract().
    size: Option<i64>,
    archived: Option<bool>,
    fork: Option<bool>,
    is_template: Option<bool>,
    has_issues: Option<bool>,
    has_wiki: Option<bool>,
    has_pages: Option<bool>,
    has_discussions: Option<bool>,
    created_at: Option<String>,
    updated_at: Option<String>,
    pushed_at: Option<String>,
    html_url: Option<String>,
    owner: Option<Owner>,
}
/// Repo owner object; only the login is surfaced.
#[derive(Deserialize)]
struct Owner {
    login: String,
}
/// License object; `spdx_id` may be absent in the API response.
#[derive(Deserialize)]
struct License {
    name: String,
    spdx_id: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `matches` must claim only repo roots: reserved namespaces,
    /// sub-pages, single-segment paths, and foreign hosts are rejected.
    #[test]
    fn matches_repo_root_only() {
        assert!(matches("https://github.com/rust-lang/rust"));
        assert!(matches("https://github.com/rust-lang/rust/"));
        assert!(!matches("https://github.com/rust-lang/rust/issues"));
        assert!(!matches("https://github.com/rust-lang/rust/pulls/123"));
        assert!(!matches("https://github.com/rust-lang"));
        assert!(!matches("https://github.com/marketplace"));
        assert!(!matches("https://github.com/topics/rust"));
        assert!(!matches("https://example.com/foo/bar"));
        // Reserved namespaces stay rejected even with two path segments,
        // and github subdomains are not github.com.
        assert!(!matches("https://github.com/settings/repositories"));
        assert!(!matches("https://gist.github.com/user/abc123"));
    }

    #[test]
    fn parse_owner_repo_handles_trailing_slash_and_query() {
        assert_eq!(
            parse_owner_repo("https://github.com/rust-lang/rust"),
            Some(("rust-lang".into(), "rust".into()))
        );
        assert_eq!(
            parse_owner_repo("https://github.com/rust-lang/rust/?tab=foo"),
            Some(("rust-lang".into(), "rust".into()))
        );
        // Fragments are stripped just like query strings.
        assert_eq!(
            parse_owner_repo("https://github.com/rust-lang/rust#readme"),
            Some(("rust-lang".into(), "rust".into()))
        );
    }
}

View file

@ -0,0 +1,186 @@
//! Hacker News structured extractor.
//!
//! Uses Algolia's HN API (`hn.algolia.com/api/v1/items/{id}`) which
//! returns the full post + recursive comment tree in a single request.
//! The official Firebase API at `hacker-news.firebaseio.com` requires
//! N+1 fetches per comment, so we'd hit either timeout or rate-limit
//! on any non-trivial thread.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry for `GET /v1/extractors`; `name` is also the
/// `/v1/scrape/{vertical}` route segment for this extractor.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "hackernews",
    label: "Hacker News story",
    description: "Returns post + nested comment tree for a Hacker News item.",
    url_patterns: &[
        "https://news.ycombinator.com/item?id=N",
        "https://hn.algolia.com/items/N",
    ],
};
/// True when `url` points at a single HN item: the canonical
/// `news.ycombinator.com/item?id=N` form (also percent-encoded `%3F`)
/// or the Algolia mirror's `hn.algolia.com/items/N` form.
pub fn matches(url: &str) -> bool {
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    match host {
        "news.ycombinator.com" => url.contains("item?id=") || url.contains("item%3Fid="),
        "hn.algolia.com" => url.contains("/items/"),
        _ => false,
    }
}
/// Fetch a full HN thread (post + recursive comment tree) via Algolia
/// and shape it into `{url, post, comments}`.
///
/// # Errors
/// `FetchError::Build` for unparsable URLs or non-200 statuses;
/// `FetchError::BodyDecode` when the response is not valid JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    // Resolve the item id first so a malformed URL fails fast with a clear error.
    let id = parse_item_id(url).ok_or_else(|| {
        FetchError::Build(format!("hackernews: cannot parse item id from '{url}'"))
    })?;
    let endpoint = format!("https://hn.algolia.com/api/v1/items/{id}");
    let resp = client.fetch(&endpoint).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "hn algolia returned status {}",
            resp.status
        )));
    }
    let item: AlgoliaItem = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("hn algolia parse: {e}")))?;
    // Top-level children are the root comments; nested replies are
    // handled recursively inside `comment_json`.
    let thread: Vec<Value> = item.children.iter().filter_map(comment_json).collect();
    Ok(json!({
        "url": url,
        "post": post_json(&item),
        "comments": thread,
    }))
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/// Pull the numeric id out of a HN URL. Handles `item?id=N` and the
/// Algolia mirror's `/items/N` form. Returns `None` when no numeric id
/// can be found.
fn parse_item_id(url: &str) -> Option<u64> {
    // `item?id=N` form. Terminate the id on '&' *and* '#': HN links are
    // frequently shared with a comment fragment (`item?id=123#up_123`),
    // which previously left the fragment glued to the digits, failed the
    // parse, and made the extractor reject a perfectly valid URL.
    if let Some(after) = url.split("id=").nth(1) {
        let n = after.split(['&', '#']).next().unwrap_or(after);
        if let Ok(id) = n.parse::<u64>() {
            return Some(id);
        }
    }
    // Algolia mirror `/items/N` form.
    if let Some(after) = url.split("/items/").nth(1) {
        let n = after.split(['/', '?', '#']).next().unwrap_or(after);
        if let Ok(id) = n.parse::<u64>() {
            return Some(id);
        }
    }
    None
}
/// Shape the top-level Algolia item into the `post` object of our
/// response. `comment_count` is computed by walking the comment tree,
/// and `permalink` reconstructs the canonical news.ycombinator.com URL.
fn post_json(item: &AlgoliaItem) -> Value {
    json!({
        "id": item.id,
        "type": item.r#type,
        "title": item.title,
        "url": item.url,
        "author": item.author,
        "points": item.points,
        "text": item.text, // populated for ask/show/tell
        "created_at": item.created_at,
        "created_at_unix": item.created_at_i,
        "comment_count": count_descendants(item),
        "permalink": item.id.map(|i| format!("https://news.ycombinator.com/item?id={i}")),
    })
}
/// Recursively shape one comment (and its reply subtree) into JSON.
/// Returns `None` for children whose `type` is not "comment", which
/// the callers' `filter_map` then drops from the tree.
fn comment_json(item: &AlgoliaItem) -> Option<Value> {
    if !matches!(item.r#type.as_deref(), Some("comment")) {
        return None;
    }
    // Dead/deleted comments still appear in the tree; surface them honestly.
    let replies: Vec<Value> = item.children.iter().filter_map(comment_json).collect();
    Some(json!({
        "id": item.id,
        "author": item.author,
        "text": item.text,
        "created_at": item.created_at,
        "created_at_unix": item.created_at_i,
        "parent_id": item.parent_id,
        "story_id": item.story_id,
        "replies": replies,
    }))
}
/// Count all comments below `item`, recursively. Non-comment children
/// contribute nothing (and their subtrees are not descended into —
/// same as the original filter+map+sum formulation).
fn count_descendants(item: &AlgoliaItem) -> usize {
    item.children.iter().fold(0, |total, child| {
        if matches!(child.r#type.as_deref(), Some("comment")) {
            total + 1 + count_descendants(child)
        } else {
            total
        }
    })
}
// ---------------------------------------------------------------------------
// Algolia API types
// ---------------------------------------------------------------------------
/// One node of the Algolia item tree — the story and every comment share
/// this shape. All fields are optional; `children` defaults to empty so
/// leaf comments parse cleanly.
#[derive(Deserialize)]
struct AlgoliaItem {
    id: Option<u64>,
    // e.g. "comment" — that value gates comment_json / count_descendants.
    r#type: Option<String>,
    title: Option<String>,
    url: Option<String>,
    author: Option<String>,
    points: Option<i64>,
    text: Option<String>,
    created_at: Option<String>,
    created_at_i: Option<i64>,
    parent_id: Option<u64>,
    story_id: Option<u64>,
    #[serde(default)]
    children: Vec<AlgoliaItem>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_hn_item_urls() {
        assert!(matches("https://news.ycombinator.com/item?id=1"));
        assert!(matches("https://news.ycombinator.com/item?id=12345"));
        assert!(matches("https://hn.algolia.com/items/1"));
        // Query strings after the id segment don't affect matching.
        assert!(matches("https://hn.algolia.com/items/123?query=x"));
    }

    #[test]
    fn rejects_non_item_urls() {
        assert!(!matches("https://news.ycombinator.com/"));
        assert!(!matches("https://news.ycombinator.com/news"));
        assert!(!matches("https://example.com/item?id=1"));
    }

    #[test]
    fn parse_item_id_handles_both_forms() {
        assert_eq!(
            parse_item_id("https://news.ycombinator.com/item?id=1"),
            Some(1)
        );
        assert_eq!(
            parse_item_id("https://news.ycombinator.com/item?id=12345&p=2"),
            Some(12345)
        );
        assert_eq!(parse_item_id("https://hn.algolia.com/items/999"), Some(999));
        // Trailing query on the Algolia form is stripped before parsing.
        assert_eq!(
            parse_item_id("https://hn.algolia.com/items/999?query=x"),
            Some(999)
        );
        assert_eq!(parse_item_id("https://example.com/foo"), None);
    }
}

View file

@ -0,0 +1,223 @@
//! HuggingFace model card structured extractor.
//!
//! Uses the public model API at `huggingface.co/api/models/{owner}/{name}`.
//! Returns metadata + the parsed model card front matter, but does not
//! pull the full README body — those are sometimes 100KB+ and the user
//! can hit /v1/scrape if they want it as markdown.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry for `GET /v1/extractors`; `name` is also the
/// `/v1/scrape/{vertical}` route segment for this extractor.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "huggingface_model",
    label: "HuggingFace model",
    description: "Returns model metadata: downloads, likes, license, pipeline tag, library name, file list.",
    url_patterns: &["https://huggingface.co/{owner}/{name}"],
};
/// True when `url` looks like a model page: exactly `/{owner}/{name}`
/// on huggingface.co, excluding HF-internal sections (`datasets`,
/// `spaces`, `docs`, ...) listed in `RESERVED_NAMESPACES`.
pub fn matches(url: &str) -> bool {
    // Host extraction inlined (same logic as `host_of`): strip the
    // scheme, then take everything up to the first '/'.
    let after_scheme = url.split("://").nth(1);
    let host = after_scheme.unwrap_or(url).split('/').next().unwrap_or("");
    if !(host == "huggingface.co" || host == "www.huggingface.co") {
        return false;
    }
    let path = match after_scheme.and_then(|rest| rest.split_once('/')) {
        Some((_, p)) => p,
        None => "",
    };
    let clean = path
        .split(['?', '#'])
        .next()
        .unwrap_or("")
        .trim_end_matches('/');
    let parts: Vec<&str> = clean.split('/').filter(|p| !p.is_empty()).collect();
    // /{owner}/{name} with a non-reserved first segment.
    parts.len() == 2 && !RESERVED_NAMESPACES.contains(&parts[0])
}
/// Top-level paths huggingface.co uses for non-model pages.
const RESERVED_NAMESPACES: &[&str] = &[
    "datasets",
    "spaces",
    "blog",
    "docs",
    "api",
    "models",
    "papers",
    "pricing",
    "tasks",
    "join",
    "login",
    "settings",
    "organizations",
    "new",
    "search",
];
/// Fetch model metadata from the HF Hub API and shape it into our
/// stable JSON contract.
///
/// # Errors
/// `FetchError::Build` for unparsable URLs, 404 (unknown model),
/// 401 (gated repo), or any other non-200 status;
/// `FetchError::BodyDecode` when the API response isn't valid JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (owner, name) = parse_owner_name(url).ok_or_else(|| {
        FetchError::Build(format!("hf model: cannot parse owner/name from '{url}'"))
    })?;
    let api_url = format!("https://huggingface.co/api/models/{owner}/{name}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "hf model: '{owner}/{name}' not found"
        )));
    }
    if resp.status == 401 {
        return Err(FetchError::Build(format!(
            "hf model: '{owner}/{name}' requires authentication (gated repo)"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "hf api returned status {}",
            resp.status
        )));
    }
    let m: ModelInfo = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("hf api parse: {e}")))?;
    // Surface a flat file list — full siblings can be hundreds of entries
    // for big repos. We keep it as-is because callers want to know about
    // every shard; if it bloats responses too much we'll add pagination.
    let files: Vec<Value> = m
        .siblings
        .iter()
        .map(|s| json!({"rfilename": s.rfilename, "size": s.size}))
        .collect();
    Ok(json!({
        "url": url,
        "id": m.id,
        "model_id": m.model_id,
        "private": m.private,
        "gated": m.gated,
        // Hub semantics: `downloads` is the rolling ~30-day counter,
        // `downloadsAllTime` the lifetime total. The lifetime number was
        // previously exposed under the key "downloads_30d", which
        // mislabeled it — fixed to "downloads_all_time".
        "downloads": m.downloads,
        "downloads_all_time": m.downloads_all_time,
        "likes": m.likes,
        "library_name": m.library_name,
        "pipeline_tag": m.pipeline_tag,
        "tags": m.tags,
        "license": m.card_data.as_ref().and_then(|c| c.license.clone()),
        "language": m.card_data.as_ref().and_then(|c| c.language.clone()),
        "datasets": m.card_data.as_ref().and_then(|c| c.datasets.clone()),
        "base_model": m.card_data.as_ref().and_then(|c| c.base_model.clone()),
        "model_type": m.card_data.as_ref().and_then(|c| c.model_type.clone()),
        "created_at": m.created_at,
        "last_modified": m.last_modified,
        "sha": m.sha,
        "file_count": m.siblings.len(),
        "files": files,
    }))
}
/// Return the host portion of `url`: strip the scheme if present, then
/// take everything up to the first '/'. Scheme-less inputs are treated
/// as starting at the host.
fn host_of(url: &str) -> &str {
    let authority = url.split("://").nth(1).unwrap_or(url);
    authority.split('/').next().unwrap_or("")
}
/// Pull `(owner, name)` — the first two non-empty path segments — out of
/// a huggingface.co URL, ignoring query, fragment, and trailing slash.
fn parse_owner_name(url: &str) -> Option<(String, String)> {
    let (_, path) = url.split("://").nth(1)?.split_once('/')?;
    let clean = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut parts = clean.split('/').filter(|p| !p.is_empty());
    match (parts.next(), parts.next()) {
        (Some(owner), Some(name)) => Some((owner.to_string(), name.to_string())),
        _ => None,
    }
}
// ---------------------------------------------------------------------------
// HF API types
// ---------------------------------------------------------------------------
/// Subset of the HF Hub model API response we read. All fields optional
/// so upstream schema drift degrades to `null`s instead of parse failure.
#[derive(Deserialize)]
struct ModelInfo {
    id: Option<String>,
    #[serde(rename = "modelId")]
    model_id: Option<String>,
    private: Option<bool>,
    gated: Option<serde_json::Value>, // bool or string ("auto" / "manual" / false)
    downloads: Option<i64>,
    #[serde(rename = "downloadsAllTime")]
    downloads_all_time: Option<i64>,
    likes: Option<i64>,
    // rename matches the field name; redundant but harmless.
    #[serde(rename = "library_name")]
    library_name: Option<String>,
    #[serde(rename = "pipeline_tag")]
    pipeline_tag: Option<String>,
    #[serde(default)]
    tags: Vec<String>,
    #[serde(rename = "createdAt")]
    created_at: Option<String>,
    #[serde(rename = "lastModified")]
    last_modified: Option<String>,
    sha: Option<String>,
    #[serde(rename = "cardData")]
    card_data: Option<CardData>,
    // Repo file entries; defaults to empty when absent.
    #[serde(default)]
    siblings: Vec<Sibling>,
}
/// Parsed model-card front matter. Several fields are free-form in
/// practice, hence `serde_json::Value`.
#[derive(Deserialize)]
struct CardData {
    license: Option<serde_json::Value>, // string or array
    language: Option<serde_json::Value>,
    datasets: Option<serde_json::Value>,
    #[serde(rename = "base_model")]
    base_model: Option<serde_json::Value>,
    #[serde(rename = "model_type")]
    model_type: Option<String>,
}
/// One file in the repo; `size` is not always populated by the API.
#[derive(Deserialize)]
struct Sibling {
    rfilename: String,
    size: Option<i64>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_model_pages() {
        assert!(matches("https://huggingface.co/meta-llama/Meta-Llama-3-8B"));
        assert!(matches("https://huggingface.co/openai/whisper-large-v3"));
        // Known limitation: a root-level model's sub-page parses as
        // owner/name and is wrongly claimed.
        assert!(matches("https://huggingface.co/bert-base-uncased/main")); // owner=bert-base-uncased name=main: false positive but acceptable for v1
    }

    #[test]
    fn rejects_hf_section_pages() {
        assert!(!matches("https://huggingface.co/datasets/squad"));
        assert!(!matches("https://huggingface.co/spaces/foo/bar"));
        assert!(!matches("https://huggingface.co/blog/intro"));
        assert!(!matches("https://huggingface.co/"));
        assert!(!matches("https://huggingface.co/meta-llama"));
    }

    #[test]
    fn parse_owner_name_pulls_both() {
        assert_eq!(
            parse_owner_name("https://huggingface.co/meta-llama/Meta-Llama-3-8B"),
            Some(("meta-llama".into(), "Meta-Llama-3-8B".into()))
        );
        // Query strings are stripped before segmenting.
        assert_eq!(
            parse_owner_name("https://huggingface.co/openai/whisper-large-v3?library=transformers"),
            Some(("openai".into(), "whisper-large-v3".into()))
        );
    }
}

View file

@ -0,0 +1,199 @@
//! Vertical extractors: site-specific parsers that return typed JSON
//! instead of generic markdown.
//!
//! Each extractor handles a single site or platform and exposes:
//! - `matches(url)` to claim ownership of a URL pattern
//! - `extract(client, url)` to fetch + parse into a typed JSON `Value`
//! - `INFO` static for the catalog (`/v1/extractors`)
//!
//! The dispatch in this module is a simple `match`-style chain rather than
//! a trait registry. With ~30 extractors that's still fast and avoids the
//! ceremony of dynamic dispatch. If we hit 50+ we'll revisit.
//!
//! Extractors prefer official JSON APIs over HTML scraping where one
//! exists (Reddit, HN/Algolia, PyPI, npm, GitHub, HuggingFace all have
//! one). HTML extraction is the fallback for sites that don't.
pub mod github_repo;
pub mod hackernews;
pub mod huggingface_model;
pub mod npm;
pub mod pypi;
pub mod reddit;
use serde::Serialize;
use serde_json::Value;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Public catalog entry for `/v1/extractors`. Stable shape — clients
/// rely on `name` to pick the right `/v1/scrape/{name}` route.
#[derive(Debug, Clone, Serialize)]
pub struct ExtractorInfo {
    /// URL-safe identifier (`reddit`, `hackernews`, `github_repo`, ...).
    /// Doubles as the `/v1/scrape/{name}` route segment.
    pub name: &'static str,
    /// Human-friendly display name.
    pub label: &'static str,
    /// One-line description of what the extractor returns.
    pub description: &'static str,
    /// Glob-ish URL pattern(s) the extractor claims. For documentation;
    /// the actual matching is done by the extractor's `matches` fn.
    pub url_patterns: &'static [&'static str],
}
/// Full catalog. Order is stable; new entries append. This is also the
/// priority order `dispatch_by_url` tries extractors in.
pub fn list() -> Vec<ExtractorInfo> {
    [
        reddit::INFO,
        hackernews::INFO,
        github_repo::INFO,
        pypi::INFO,
        npm::INFO,
        huggingface_model::INFO,
    ]
    .to_vec()
}
/// Auto-detect mode: try every extractor's `matches`, return the first
/// one that claims the URL. Used by `/v1/scrape` when the caller doesn't
/// pick a vertical explicitly. Returns `None` when no extractor claims
/// the URL.
pub async fn dispatch_by_url(
    client: &FetchClient,
    url: &str,
) -> Option<Result<(&'static str, Value), FetchError>> {
    // One arm per extractor; first claim wins. The macro keeps the
    // match/extract/name plumbing identical across all verticals.
    macro_rules! claim {
        ($m:ident) => {
            if $m::matches(url) {
                return Some($m::extract(client, url).await.map(|v| ($m::INFO.name, v)));
            }
        };
    }
    claim!(reddit);
    claim!(hackernews);
    claim!(github_repo);
    claim!(pypi);
    claim!(npm);
    claim!(huggingface_model);
    None
}
/// Explicit mode: caller picked the vertical (`POST /v1/scrape/reddit`).
/// We still validate that the URL plausibly belongs to that vertical so
/// users get a clear "wrong route" error instead of a confusing parse
/// failure deep in the extractor.
pub async fn dispatch_by_name(
    client: &FetchClient,
    name: &str,
    url: &str,
) -> Result<Value, ExtractorDispatchError> {
    if name == reddit::INFO.name {
        run_or_mismatch(reddit::matches(url), name, url, || reddit::extract(client, url)).await
    } else if name == hackernews::INFO.name {
        run_or_mismatch(hackernews::matches(url), name, url, || {
            hackernews::extract(client, url)
        })
        .await
    } else if name == github_repo::INFO.name {
        run_or_mismatch(github_repo::matches(url), name, url, || {
            github_repo::extract(client, url)
        })
        .await
    } else if name == pypi::INFO.name {
        run_or_mismatch(pypi::matches(url), name, url, || pypi::extract(client, url)).await
    } else if name == npm::INFO.name {
        run_or_mismatch(npm::matches(url), name, url, || npm::extract(client, url)).await
    } else if name == huggingface_model::INFO.name {
        run_or_mismatch(huggingface_model::matches(url), name, url, || {
            huggingface_model::extract(client, url)
        })
        .await
    } else {
        Err(ExtractorDispatchError::UnknownVertical(name.to_string()))
    }
}
/// Errors that the dispatcher itself raises (vs. errors from inside an
/// extractor, which come back wrapped in `Fetch`).
#[derive(Debug, thiserror::Error)]
pub enum ExtractorDispatchError {
    /// The `{vertical}` route segment doesn't name any known extractor.
    #[error("unknown vertical: '{0}'")]
    UnknownVertical(String),
    /// The caller picked a vertical explicitly, but the URL fails that
    /// extractor's `matches` check.
    #[error("URL '{url}' does not match the '{vertical}' extractor")]
    UrlMismatch { vertical: String, url: String },
    /// An error raised inside the extractor (network, status, parse).
    #[error(transparent)]
    Fetch(#[from] FetchError),
}
/// Helper: when the caller explicitly picked a vertical but their URL
/// doesn't match it, return `UrlMismatch` instead of running the
/// extractor (which would just fail with a less-clear error).
async fn run_or_mismatch<F, Fut>(
    matches: bool,
    vertical: &str,
    url: &str,
    f: F,
) -> Result<Value, ExtractorDispatchError>
where
    F: FnOnce() -> Fut,
    Fut: std::future::Future<Output = Result<Value, FetchError>>,
{
    if matches {
        f().await.map_err(ExtractorDispatchError::Fetch)
    } else {
        Err(ExtractorDispatchError::UrlMismatch {
            vertical: vertical.to_string(),
            url: url.to_string(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Guards against copy-paste mistakes when registering a new
    /// extractor: the catalog must be non-empty and every `name` unique,
    /// since `name` doubles as the `/v1/scrape/{name}` route key.
    #[test]
    fn list_is_non_empty_and_unique() {
        let entries = list();
        assert!(!entries.is_empty());
        let mut names: Vec<_> = entries.iter().map(|e| e.name).collect();
        names.sort();
        let before = names.len();
        names.dedup();
        assert_eq!(before, names.len(), "extractor names must be unique");
    }
}

View file

@ -0,0 +1,235 @@
//! npm package structured extractor.
//!
//! Uses two npm-run APIs:
//! - `registry.npmjs.org/{name}` for full package metadata
//! - `api.npmjs.org/downloads/point/last-week/{name}` for usage signal
//!
//! The registry API returns the *full* document including every version
//! ever published, which can be tens of MB for popular packages
//! (`@types/node` etc). We strip down to the latest version's manifest
//! and a count of releases — full history would explode the response.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry for `GET /v1/extractors`; `name` is also the
/// `/v1/scrape/{vertical}` route segment for this extractor.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "npm",
    label: "npm package",
    description: "Returns package metadata: latest version manifest, dependencies, weekly downloads, license.",
    url_patterns: &["https://www.npmjs.com/package/{name}"],
};
/// True when `url` is an npmjs.com package page (any path containing
/// `/package/` on www.npmjs.com or npmjs.com).
pub fn matches(url: &str) -> bool {
    // Host extraction inlined (same logic as `host_of`).
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    (host == "www.npmjs.com" || host == "npmjs.com") && url.contains("/package/")
}
/// Fetch npm package metadata for `url`: one registry call for the full
/// package document plus a best-effort weekly-downloads lookup, shaped
/// into a flat JSON object (latest manifest only — see module docs).
///
/// # Errors
/// `FetchError::Build` for unparsable URLs, 404, or other non-200
/// registry statuses; `FetchError::BodyDecode` when the registry
/// response isn't valid JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let name = parse_name(url)
        .ok_or_else(|| FetchError::Build(format!("npm: cannot parse name from '{url}'")))?;
    let registry_url = format!("https://registry.npmjs.org/{}", urlencode_segment(&name));
    let resp = client.fetch(&registry_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "npm: package '{name}' not found"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "npm registry returned status {}",
            resp.status
        )));
    }
    let pkg: PackageDoc = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("npm registry parse: {e}")))?;
    // Resolve "latest" to a concrete version.
    // NOTE(review): the fallback takes the lexicographically-last key of
    // the versions BTreeMap, which mis-orders semver ("10.0.0" < "9.0.0").
    // It only runs when dist-tags are missing, which is rare; a
    // semver-aware sort would fix it properly.
    let latest_version = pkg
        .dist_tags
        .as_ref()
        .and_then(|t| t.get("latest"))
        .cloned()
        .or_else(|| pkg.versions.as_ref().and_then(|v| v.keys().last().cloned()));
    let latest_manifest = latest_version
        .as_deref()
        .and_then(|v| pkg.versions.as_ref().and_then(|m| m.get(v)));
    let release_count = pkg.versions.as_ref().map(|v| v.len()).unwrap_or(0);
    // Publish timestamp of the resolved latest version, when present.
    let latest_release_date = latest_version
        .as_deref()
        .and_then(|v| pkg.time.as_ref().and_then(|t| t.get(v).cloned()));
    // Best-effort weekly downloads. If the api.npmjs.org call fails we
    // surface `null` rather than failing the whole extractor — npm
    // sometimes 503s the downloads endpoint while the registry is up.
    let weekly_downloads = fetch_weekly_downloads(client, &name).await.ok();
    Ok(json!({
        "url": url,
        // Lazy fallback to the URL-derived name; the previous `unwrap_or`
        // cloned eagerly on every call even when the registry name existed.
        "name": pkg.name.clone().unwrap_or_else(|| name.clone()),
        "description": pkg.description,
        "latest_version": latest_version,
        "license": latest_manifest.and_then(|m| m.license.clone()),
        "homepage": pkg.homepage,
        "repository": pkg.repository.as_ref().and_then(|r| r.url.clone()),
        "dependencies": latest_manifest.and_then(|m| m.dependencies.clone()),
        "dev_dependencies": latest_manifest.and_then(|m| m.dev_dependencies.clone()),
        "peer_dependencies": latest_manifest.and_then(|m| m.peer_dependencies.clone()),
        "keywords": pkg.keywords,
        "maintainers": pkg.maintainers,
        "deprecated": latest_manifest.and_then(|m| m.deprecated.clone()),
        "release_count": release_count,
        "latest_release_date": latest_release_date,
        "weekly_downloads": weekly_downloads,
    }))
}
/// Best-effort weekly download count from api.npmjs.org. Callers treat
/// any error as "unknown" (`None` in the final JSON).
async fn fetch_weekly_downloads(client: &FetchClient, name: &str) -> Result<i64, FetchError> {
    let endpoint = format!(
        "https://api.npmjs.org/downloads/point/last-week/{}",
        urlencode_segment(name)
    );
    let resp = client.fetch(&endpoint).await?;
    match resp.status {
        200 => serde_json::from_str::<Downloads>(&resp.html)
            .map(|d| d.downloads)
            .map_err(|e| FetchError::BodyDecode(format!("npm downloads parse: {e}"))),
        other => Err(FetchError::Build(format!(
            "npm downloads api status {other}"
        ))),
    }
}
/// Return the host portion of `url`: strip the scheme if present, then
/// take everything up to the first '/'. Scheme-less inputs are treated
/// as starting at the host.
fn host_of(url: &str) -> &str {
    let authority = url.split("://").nth(1).unwrap_or(url);
    authority.split('/').next().unwrap_or("")
}
/// Extract the package name from an npmjs.com URL. Handles scoped
/// packages (`/package/@scope/name`) and trailing path segments
/// (`/v/x.y.z`); query strings and fragments are stripped first.
fn parse_name(url: &str) -> Option<String> {
    let after = url.split("/package/").nth(1)?;
    let clean = after.split(['?', '#']).next()?.trim_end_matches('/');
    let mut parts = clean.split('/').filter(|p| !p.is_empty());
    let head = parts.next()?;
    if !head.starts_with('@') {
        return Some(head.to_string());
    }
    // Scoped package: the name spans two path segments (`@scope/name`).
    parts.next().map(|tail| format!("{head}/{tail}"))
}
/// `@scope/name` must encode the `/` for the registry path. Plain names
/// (no '/') pass through untouched; '@' needs no escaping.
fn urlencode_segment(name: &str) -> String {
    name.split('/').collect::<Vec<_>>().join("%2F")
}
// ---------------------------------------------------------------------------
// Registry types
// ---------------------------------------------------------------------------
/// Subset of the full registry package document. The registry returns
/// every version ever published; `extract` only reads what it surfaces.
#[derive(Deserialize)]
struct PackageDoc {
    name: Option<String>,
    description: Option<String>,
    homepage: Option<serde_json::Value>, // sometimes string, sometimes object
    repository: Option<Repository>,
    keywords: Option<Vec<String>>,
    maintainers: Option<Vec<Maintainer>>,
    #[serde(rename = "dist-tags")]
    dist_tags: Option<std::collections::BTreeMap<String, String>>,
    versions: Option<std::collections::BTreeMap<String, VersionManifest>>,
    // Timestamps looked up by version string in extract().
    time: Option<std::collections::BTreeMap<String, String>>,
}
/// Per-version manifest — only the fields extract() reads.
#[derive(Deserialize, Default, Clone)]
struct VersionManifest {
    license: Option<serde_json::Value>, // string or object
    dependencies: Option<std::collections::BTreeMap<String, String>>,
    #[serde(rename = "devDependencies")]
    dev_dependencies: Option<std::collections::BTreeMap<String, String>>,
    #[serde(rename = "peerDependencies")]
    peer_dependencies: Option<std::collections::BTreeMap<String, String>>,
    // `deprecated` is sometimes a bool and sometimes a string in the
    // registry. serde_json::Value covers both without failing the parse.
    deprecated: Option<serde_json::Value>,
}
#[derive(Deserialize)]
struct Repository {
    url: Option<String>,
}
#[derive(Deserialize, Clone)]
struct Maintainer {
    name: Option<String>,
    email: Option<String>,
}
// Manual Serialize so Maintainer can be embedded in the json! output.
// NOTE(review): a derived Serialize would emit the same two-key map;
// kept as-is.
impl serde::Serialize for Maintainer {
    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
        use serde::ser::SerializeMap;
        let mut m = s.serialize_map(Some(2))?;
        m.serialize_entry("name", &self.name)?;
        m.serialize_entry("email", &self.email)?;
        m.end()
    }
}
/// Shape of the api.npmjs.org downloads/point response.
#[derive(Deserialize)]
struct Downloads {
    downloads: i64,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_npm_package_urls() {
        assert!(matches("https://www.npmjs.com/package/react"));
        assert!(matches("https://www.npmjs.com/package/@types/node"));
        // Bare npmjs.com host is accepted too.
        assert!(matches("https://npmjs.com/package/lodash"));
        assert!(!matches("https://www.npmjs.com/"));
        assert!(!matches("https://example.com/package/foo"));
    }

    #[test]
    fn parse_name_handles_scoped_and_unscoped() {
        assert_eq!(
            parse_name("https://www.npmjs.com/package/react"),
            Some("react".into())
        );
        // Scoped: the name spans two path segments.
        assert_eq!(
            parse_name("https://www.npmjs.com/package/@types/node"),
            Some("@types/node".into())
        );
        // Trailing /v/x.y.z segments are ignored.
        assert_eq!(
            parse_name("https://www.npmjs.com/package/lodash/v/4.17.21"),
            Some("lodash".into())
        );
    }

    #[test]
    fn urlencode_only_touches_scope_separator() {
        assert_eq!(urlencode_segment("react"), "react");
        assert_eq!(urlencode_segment("@types/node"), "@types%2Fnode");
    }
}

View file

@ -0,0 +1,184 @@
//! PyPI package structured extractor.
//!
//! PyPI exposes a stable JSON API at `pypi.org/pypi/{name}/json` and
//! a versioned form at `pypi.org/pypi/{name}/{version}/json`. Both
//! return the full release info plus history. No auth, no rate limits
//! that we hit at normal usage.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry surfaced by `GET /v1/extractors`.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "pypi",
    label: "PyPI package",
    description: "Returns package metadata: latest version, dependencies, license, release history.",
    url_patterns: &[
        "https://pypi.org/project/{name}/",
        "https://pypi.org/project/{name}/{version}/",
    ],
};
/// True when `url` points at a pypi.org project page
/// (`/project/{name}/…`).
pub fn matches(url: &str) -> bool {
    let is_pypi_host = matches!(host_of(url), "pypi.org" | "www.pypi.org");
    is_pypi_host && url.contains("/project/")
}
/// Fetch PyPI's JSON API for the package named in `url` (optionally
/// pinned to a version) and shape it into Webclaw's typed JSON.
///
/// # Errors
/// - `FetchError::Build` when the URL has no parseable package name,
///   the package is missing (404), or the API returns any other
///   non-200 status.
/// - `FetchError::BodyDecode` when the API body fails to parse.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (name, version) = parse_project(url).ok_or_else(|| {
        FetchError::Build(format!("pypi: cannot parse package name from '{url}'"))
    })?;
    // Versioned and unversioned endpoints return the same shape.
    let api_url = match &version {
        Some(v) => format!("https://pypi.org/pypi/{name}/{v}/json"),
        None => format!("https://pypi.org/pypi/{name}/json"),
    };
    let resp = client.fetch(&api_url).await?;
    // 404 gets a friendlier message than the generic status error.
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "pypi: package '{name}' not found"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "pypi api returned status {}",
            resp.status
        )));
    }
    let pkg: PypiResponse = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("pypi parse: {e}")))?;
    let info = pkg.info;
    let release_count = pkg.releases.as_ref().map(|r| r.len()).unwrap_or(0);
    // Latest release date = max upload time across files in the latest version.
    // (Upload times are ISO-8601 strings, so a lexicographic max works.)
    let latest_release_date = pkg
        .releases
        .as_ref()
        .and_then(|map| info.version.as_deref().and_then(|v| map.get(v)))
        .and_then(|files| files.iter().filter_map(|f| f.upload_time.clone()).max());
    // Drop the long description from the JSON shape — it's frequently a 50KB
    // README and bloats responses. Callers who need it can hit /v1/scrape.
    Ok(json!({
        "url": url,
        "name": info.name,
        "version": info.version,
        "summary": info.summary,
        "homepage": info.home_page,
        "license": info.license,
        "license_classifier": pick_license_classifier(&info.classifiers),
        "author": info.author,
        "author_email": info.author_email,
        "maintainer": info.maintainer,
        "requires_python": info.requires_python,
        "requires_dist": info.requires_dist,
        "keywords": info.keywords,
        "classifiers": info.classifiers,
        "yanked": info.yanked,
        "yanked_reason": info.yanked_reason,
        "project_urls": info.project_urls,
        "release_count": release_count,
        "latest_release_date": latest_release_date,
    }))
}
/// PyPI encodes the license in trove classifiers such as
/// `License :: OSI Approved :: Apache Software License`. Pick the most
/// specific (longest) `License ::` entry as a fallback for when the
/// `license` field itself is empty/junk.
fn pick_license_classifier(classifiers: &Option<Vec<String>>) -> Option<String> {
    let mut best: Option<&String> = None;
    for classifier in classifiers.as_ref()? {
        if !classifier.starts_with("License ::") {
            continue;
        }
        // `>=` keeps the last of equal-length entries, matching
        // `Iterator::max_by_key` semantics.
        if best.is_none_or(|current| classifier.len() >= current.len()) {
            best = Some(classifier);
        }
    }
    best.cloned()
}
/// Extract the host portion of a URL without pulling in a URL parser.
/// Scheme-less input is treated as starting at the host.
fn host_of(url: &str) -> &str {
    // Drop the scheme if one is present; otherwise keep the raw input.
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    // The host is everything up to the first path separator.
    after_scheme.split('/').next().unwrap_or("")
}
/// Pull `(name, optional version)` out of a `/project/{name}/{version}/`
/// URL. Query string and fragment are ignored; returns `None` when no
/// package name follows the `/project/` marker.
fn parse_project(url: &str) -> Option<(String, Option<String>)> {
    let tail = url.split("/project/").nth(1)?;
    // Cut at `?`/`#`, then drop a trailing slash before segmenting.
    let path = tail
        .split(['?', '#'])
        .next()
        .unwrap_or(tail)
        .trim_end_matches('/');
    let mut parts = path.split('/').filter(|seg| !seg.is_empty());
    let name = parts.next()?.to_owned();
    let version = parts.next().map(str::to_string);
    Some((name, version))
}
// ---------------------------------------------------------------------------
// PyPI API types
// ---------------------------------------------------------------------------
/// Top-level shape of `pypi.org/pypi/{name}/json`.
#[derive(Deserialize)]
struct PypiResponse {
    info: Info,
    // Version string → uploaded files; optional because not every
    // response carries it.
    releases: Option<std::collections::BTreeMap<String, Vec<File>>>,
}
/// The `info` object: metadata for the resolved version. Every field is
/// optional — older packages frequently omit them.
#[derive(Deserialize)]
struct Info {
    name: Option<String>,
    version: Option<String>,
    summary: Option<String>,
    home_page: Option<String>,
    license: Option<String>,
    author: Option<String>,
    author_email: Option<String>,
    maintainer: Option<String>,
    requires_python: Option<String>,
    requires_dist: Option<Vec<String>>,
    // A single string on PyPI's side, not a list.
    keywords: Option<String>,
    classifiers: Option<Vec<String>>,
    yanked: Option<bool>,
    yanked_reason: Option<String>,
    project_urls: Option<std::collections::BTreeMap<String, String>>,
}
/// One uploaded distribution file; only the upload timestamp is used
/// (to compute the latest release date).
#[derive(Deserialize)]
struct File {
    upload_time: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_project_urls() {
        // Project pages, with and without a pinned version.
        assert!(matches("https://pypi.org/project/requests/"));
        assert!(matches("https://pypi.org/project/numpy/1.26.0/"));
        // Site root and foreign hosts must be rejected.
        assert!(!matches("https://pypi.org/"));
        assert!(!matches("https://example.com/project/foo"));
    }

    #[test]
    fn parse_project_pulls_name_and_version() {
        let cases = [
            ("https://pypi.org/project/requests/", ("requests", None)),
            ("https://pypi.org/project/numpy/1.26.0/", ("numpy", Some("1.26.0"))),
            // Query strings must not leak into the parsed name.
            ("https://pypi.org/project/scikit-learn/?foo=bar", ("scikit-learn", None)),
        ];
        for (url, (name, version)) in cases {
            assert_eq!(
                parse_project(url),
                Some((name.to_string(), version.map(str::to_string))),
                "url: {url}"
            );
        }
    }
}

View file

@ -0,0 +1,234 @@
//! Reddit structured extractor — returns the full post + comment tree
//! as typed JSON via Reddit's `.json` API.
//!
//! The same trick the markdown extractor in `crate::reddit` uses:
//! appending `.json` to any post URL returns the data the new SPA
//! frontend would load client-side. Zero antibot, zero JS rendering.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry surfaced by `GET /v1/extractors`.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "reddit",
    label: "Reddit thread",
    description: "Returns post + nested comment tree with scores, authors, and timestamps.",
    url_patterns: &[
        "https://www.reddit.com/r/*/comments/*",
        "https://reddit.com/r/*/comments/*",
        "https://old.reddit.com/r/*/comments/*",
    ],
};
/// True for Reddit post (comment-thread) URLs on any first-party host.
pub fn matches(url: &str) -> bool {
    match host_of(url) {
        // All of Reddit's first-party hosts serve the same threads.
        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com"
        | "new.reddit.com" => url.contains("/comments/"),
        _ => false,
    }
}
/// Fetch the thread's `.json` form and shape it into
/// `{ url, post, comments }`. `post` is `null` when the first listing
/// doesn't carry the expected single `t3` child.
///
/// # Errors
/// - `FetchError::Build` on any non-200 status from Reddit.
/// - `FetchError::BodyDecode` when the body fails to parse or carries
///   no listings at all.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let json_url = build_json_url(url);
    let resp = client.fetch(&json_url).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "reddit api returned status {}",
            resp.status
        )));
    }
    let listings: Vec<Listing> = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?;
    if listings.is_empty() {
        return Err(FetchError::BodyDecode("reddit response empty".into()));
    }
    // First listing = the post (single t3 child).
    let post = listings
        .first()
        .and_then(|l| l.data.children.first())
        .filter(|t| t.kind == "t3")
        .map(|t| post_json(&t.data))
        .unwrap_or(Value::Null);
    // Second listing = the comment tree.
    let comments: Vec<Value> = listings
        .get(1)
        .map(|l| l.data.children.iter().filter_map(comment_json).collect())
        .unwrap_or_default();
    Ok(json!({
        "url": url,
        "post": post,
        "comments": comments,
    }))
}
// ---------------------------------------------------------------------------
// JSON shapers
// ---------------------------------------------------------------------------
/// Shape a post (`t3`) thing into the documented response JSON.
/// The relative `permalink` is expanded to an absolute
/// `www.reddit.com` URL.
fn post_json(d: &ThingData) -> Value {
    json!({
        "id": d.id,
        "title": d.title,
        "author": d.author,
        "subreddit": d.subreddit_name_prefixed,
        "permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
        // For link posts this is the external target URL.
        "url": d.url_overridden_by_dest,
        "is_self": d.is_self,
        "selftext": d.selftext,
        "score": d.score,
        "upvote_ratio": d.upvote_ratio,
        "num_comments": d.num_comments,
        "created_utc": d.created_utc,
        "link_flair_text": d.link_flair_text,
        "over_18": d.over_18,
        "spoiler": d.spoiler,
        "stickied": d.stickied,
        "locked": d.locked,
    })
}
/// Render a single comment + its reply tree. Returns `None` for non-t1
/// kinds (the trailing `more` placeholder Reddit injects at depth limits).
fn comment_json(thing: &Thing) -> Option<Value> {
    if thing.kind != "t1" {
        return None;
    }
    let d = &thing.data;
    // Recurse into nested listings; anything else means "no replies".
    let replies: Vec<Value> = match &d.replies {
        Some(Replies::Listing(l)) => l.data.children.iter().filter_map(comment_json).collect(),
        _ => Vec::new(),
    };
    Some(json!({
        "id": d.id,
        "author": d.author,
        "body": d.body,
        "score": d.score,
        "created_utc": d.created_utc,
        "is_submitter": d.is_submitter,
        "stickied": d.stickied,
        "depth": d.depth,
        "permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
        "replies": replies,
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Extract the host portion of a URL without pulling in a URL parser.
/// Scheme-less input is treated as starting at the host.
fn host_of(url: &str) -> &str {
    // Everything after the scheme, or the raw input when there is none.
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    // Host ends at the first path separator.
    after_scheme.split('/').next().unwrap_or("")
}
/// Build the Reddit JSON URL. We keep the original host (`www.reddit.com`
/// or `old.reddit.com` as the caller gave us). Routing through
/// `old.reddit.com` unconditionally looks appealing but that host has
/// stricter UA-based blocking than `www.reddit.com`, while the main
/// host accepts our Chrome-fingerprinted client fine.
///
/// Both the query string and the fragment are stripped before
/// appending `.json` — a trailing `#fragment` would otherwise end up
/// embedded in the path and break the API call. (Same `['?', '#']`
/// handling as the pypi extractor's URL parsing.)
fn build_json_url(url: &str) -> String {
    let clean = url
        .split(['?', '#'])
        .next()
        .unwrap_or(url)
        .trim_end_matches('/');
    format!("{clean}.json?raw_json=1")
}
// ---------------------------------------------------------------------------
// Reddit JSON types — only fields we render. Everything else is dropped.
// ---------------------------------------------------------------------------
/// A Reddit `Listing` wrapper (`{"kind": "Listing", "data": {...}}`);
/// only the `data` payload is read.
#[derive(Deserialize)]
struct Listing {
    data: ListingData,
}
/// Payload of a listing: the array of things it contains.
#[derive(Deserialize)]
struct ListingData {
    children: Vec<Thing>,
}
/// A Reddit "thing": `kind` is a type tag (`t3` = post, `t1` = comment,
/// `more` = truncation placeholder), `data` its payload.
#[derive(Deserialize)]
struct Thing {
    kind: String,
    data: ThingData,
}
/// Union of post (`t3`) and comment (`t1`) fields. Every field is
/// optional so one struct can deserialize either kind; the shapers
/// pick the fields that apply.
#[derive(Deserialize, Default)]
struct ThingData {
    // post (t3)
    id: Option<String>,
    title: Option<String>,
    selftext: Option<String>,
    subreddit_name_prefixed: Option<String>,
    url_overridden_by_dest: Option<String>,
    is_self: Option<bool>,
    upvote_ratio: Option<f64>,
    num_comments: Option<i64>,
    over_18: Option<bool>,
    spoiler: Option<bool>,
    stickied: Option<bool>,
    locked: Option<bool>,
    link_flair_text: Option<String>,
    // comment (t1)
    author: Option<String>,
    body: Option<String>,
    score: Option<i64>,
    created_utc: Option<f64>,
    is_submitter: Option<bool>,
    depth: Option<i64>,
    permalink: Option<String>,
    // recursive
    replies: Option<Replies>,
}
/// A comment's `replies` field is either a nested listing or a
/// non-listing placeholder (apparently an empty string when there are
/// no replies — the payload is never read), hence the untagged enum.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
    Listing(Listing),
    #[allow(dead_code)]
    Empty(String),
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_reddit_post_urls() {
        // Thread URLs on any first-party host, with or without a slug.
        for url in [
            "https://www.reddit.com/r/rust/comments/abc123/some_title/",
            "https://reddit.com/r/rust/comments/abc123/some_title",
            "https://old.reddit.com/r/rust/comments/abc123/x/",
        ] {
            assert!(matches(url), "should match: {url}");
        }
    }

    #[test]
    fn rejects_non_post_reddit_urls() {
        // Subreddit fronts, user pages, and foreign hosts are all out.
        for url in [
            "https://www.reddit.com/r/rust",
            "https://www.reddit.com/user/foo",
            "https://example.com/r/rust/comments/x",
        ] {
            assert!(!matches(url), "should not match: {url}");
        }
    }

    #[test]
    fn json_url_appends_suffix_and_drops_query() {
        assert_eq!(
            build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"),
            "https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
        );
    }
}

View file

@ -6,6 +6,7 @@ pub mod client;
pub mod crawler;
pub mod document;
pub mod error;
pub mod extractors;
pub mod linkedin;
pub mod proxy;
pub mod reddit;

View file

@ -79,10 +79,15 @@ async fn main() -> anyhow::Result<()> {
let v1 = Router::new()
.route("/scrape", post(routes::scrape::scrape))
.route(
"/scrape/{vertical}",
post(routes::structured::scrape_vertical),
)
.route("/crawl", post(routes::crawl::crawl))
.route("/map", post(routes::map::map))
.route("/batch", post(routes::batch::batch))
.route("/extract", post(routes::extract::extract))
.route("/extractors", get(routes::structured::list_extractors))
.route("/summarize", post(routes::summarize::summarize_route))
.route("/diff", post(routes::diff::diff_route))
.route("/brand", post(routes::brand::brand))

View file

@ -15,4 +15,5 @@ pub mod extract;
pub mod health;
pub mod map;
pub mod scrape;
pub mod structured;
pub mod summarize;

View file

@ -0,0 +1,55 @@
//! `POST /v1/scrape/{vertical}` and `GET /v1/extractors`.
//!
//! Vertical extractors return typed JSON instead of generic markdown.
//! See `webclaw_fetch::extractors` for the catalog and per-site logic.
use axum::{
Json,
extract::{Path, State},
};
use serde::Deserialize;
use serde_json::{Value, json};
use webclaw_fetch::extractors::{self, ExtractorDispatchError};
use crate::{error::ApiError, state::AppState};
/// Request body for `POST /v1/scrape/{vertical}`.
#[derive(Debug, Deserialize)]
pub struct ScrapeRequest {
    /// Target URL; must match the requested vertical's URL patterns.
    pub url: String,
}
/// Map dispatcher errors to ApiError so users get clean HTTP statuses
/// instead of opaque 500s.
impl From<ExtractorDispatchError> for ApiError {
    fn from(e: ExtractorDispatchError) -> Self {
        match e {
            // No extractor registered under that name → not found.
            ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
            // Vertical exists but the URL doesn't fit it → bad request,
            // carrying the dispatcher's explanatory message.
            ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
            // Upstream fetch/parse failures surface as fetch errors.
            ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()),
        }
    }
}
/// `GET /v1/extractors` — catalog of all available verticals.
///
/// Returns `{ "extractors": [...] }` built from each vertical's `INFO`
/// metadata. Infallible and stateless.
pub async fn list_extractors() -> Json<Value> {
    Json(json!({
        "extractors": extractors::list(),
    }))
}
/// `POST /v1/scrape/{vertical}` — explicit vertical, e.g. /v1/scrape/reddit.
///
/// # Errors
/// Bad request when `url` is empty, plus whatever the dispatcher maps
/// via `From<ExtractorDispatchError>` (unknown vertical, URL mismatch,
/// fetch failure).
pub async fn scrape_vertical(
    State(state): State<AppState>,
    Path(vertical): Path<String>,
    Json(req): Json<ScrapeRequest>,
) -> Result<Json<Value>, ApiError> {
    // Normalize once: validate and dispatch on the same trimmed value,
    // so " https://…" doesn't pass the empty-check yet fail URL
    // matching inside the dispatcher over stray whitespace.
    let url = req.url.trim();
    if url.is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }
    let data = extractors::dispatch_by_name(state.fetch(), &vertical, url).await?;
    Ok(Json(json!({
        "vertical": vertical,
        // Echo the trimmed URL we actually dispatched on.
        "url": url,
        "data": data,
    })))
}