mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat(extractors): add vertical extractors module + first 6 verticals
New extractors module returns site-specific typed JSON instead of
generic markdown. Each extractor:
- declares a URL pattern via matches()
- fetches from the site's official JSON API where one exists
- returns a typed serde_json::Value with documented field names
- exposes an INFO struct that powers the /v1/extractors catalog
First 6 verticals shipped, all hitting public JSON APIs (no HTML
scraping, zero antibot risk):
- reddit → www.reddit.com/*/.json
- hackernews → hn.algolia.com/api/v1/items/{id} (full thread in one call)
- github_repo → api.github.com/repos/{owner}/{repo}
- pypi → pypi.org/pypi/{name}/json
- npm → registry.npmjs.org/{name} + downloads/point/last-week
- huggingface_model → huggingface.co/api/models/{owner}/{name}
Server-side routes added:
- POST /v1/scrape/{vertical} explicit per-vertical extraction
- GET /v1/extractors catalog (name, label, description, url_patterns)
The dispatcher validates that URL matches the requested vertical
before running, so users get "URL doesn't match the X extractor"
instead of opaque parse failures inside the extractor.
17 unit tests cover URL matching + path parsing for each vertical.
Live tests against canonical URLs (rust-lang/rust, requests pypi,
react npm, whisper-large-v3 hf, item 8863 hn, an r/micro_saas post)
all return correct typed JSON in 100-300ms. Sample sizes: github
863B, npm 700B, pypi 1.7KB, hf 3.2KB, hn 38KB (full comment tree).
Marketing positioning: Firecrawl charges 5 credits per /extract call
and you write the schema. Webclaw returns the same JSON in 1 credit
per /scrape/{vertical} call with hand-written deterministic
extractors per site.
This commit is contained in:
parent
ccdb6d364b
commit
8ba7538c37
11 changed files with 1535 additions and 0 deletions
212
crates/webclaw-fetch/src/extractors/github_repo.rs
Normal file
212
crates/webclaw-fetch/src/extractors/github_repo.rs
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
//! GitHub repository structured extractor.
|
||||
//!
|
||||
//! Uses GitHub's public REST API at `api.github.com/repos/{owner}/{repo}`.
|
||||
//! Unauthenticated requests get 60/hour per IP, which is fine for users
|
||||
//! self-hosting and for low-volume cloud usage. Production cloud should
|
||||
//! set a `GITHUB_TOKEN` to lift to 5,000/hour, but the extractor doesn't
|
||||
//! depend on it being set — it works open out of the box.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
// Catalog entry surfaced by GET /v1/extractors; `name` doubles as the
// route segment for POST /v1/scrape/{vertical}.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "github_repo",
    label: "GitHub repository",
    description: "Returns repo metadata: stars, forks, topics, license, default branch, recent activity.",
    url_patterns: &["https://github.com/{owner}/{repo}"],
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("");
|
||||
if host != "github.com" && host != "www.github.com" {
|
||||
return false;
|
||||
}
|
||||
// Path must be exactly /{owner}/{repo} (or with trailing slash). Reject
|
||||
// sub-pages (issues, pulls, blob, etc.) so we don't claim URLs the
|
||||
// future github_issue / github_pr extractors will handle.
|
||||
let path = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.and_then(|s| s.split_once('/'))
|
||||
.map(|(_, p)| p)
|
||||
.unwrap_or("");
|
||||
let stripped = path
|
||||
.split(['?', '#'])
|
||||
.next()
|
||||
.unwrap_or("")
|
||||
.trim_end_matches('/');
|
||||
let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
|
||||
segs.len() == 2 && !RESERVED_OWNERS.contains(&segs[0])
|
||||
}
|
||||
|
||||
/// GitHub uses some top-level paths for non-repo pages.
// First path segment values that can never be an owner name; `matches`
// rejects URLs whose owner segment appears here. Keep sorted-ish by
// eye; additions are cheap (linear scan over a short slice).
const RESERVED_OWNERS: &[&str] = &[
    "settings",
    "marketplace",
    "explore",
    "topics",
    "trending",
    "collections",
    "events",
    "sponsors",
    "issues",
    "pulls",
    "notifications",
    "new",
    "organizations",
    "login",
    "join",
    "search",
    "about",
];
|
||||
|
||||
/// Fetch repo metadata for `url` from `api.github.com/repos/{owner}/{repo}`
/// and flatten it into the documented typed-JSON shape.
///
/// # Errors
/// - `FetchError::Build` when the URL has no parseable owner/repo, the
///   repo 404s, the request is 403 (treated as rate limiting), or any
///   other non-200 status comes back.
/// - `FetchError::BodyDecode` when the body is not JSON we can
///   deserialize into the `Repo` subset.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (owner, repo) = parse_owner_repo(url).ok_or_else(|| {
        FetchError::Build(format!("github_repo: cannot parse owner/repo from '{url}'"))
    })?;

    let api_url = format!("https://api.github.com/repos/{owner}/{repo}");
    let resp = client.fetch(&api_url).await?;
    // Map the two common failure statuses to actionable messages before
    // the generic non-200 catch-all.
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "github_repo: repo '{owner}/{repo}' not found"
        )));
    }
    if resp.status == 403 {
        // NOTE(review): GitHub also returns 403 for blocked/forbidden
        // repos, not only rate limiting — this message assumes the
        // rate-limit case. Confirm whether that distinction matters.
        return Err(FetchError::Build(
            "github_repo: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour.".into(),
        ));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "github api returned status {}",
            resp.status
        )));
    }

    // `resp.html` carries the raw response body (JSON here, despite
    // the field name).
    let r: Repo = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("github api parse: {e}")))?;

    // Flat, stable output shape; missing API fields serialize as null.
    Ok(json!({
        "url": url,
        "owner": r.owner.as_ref().map(|o| &o.login),
        "name": r.name,
        "full_name": r.full_name,
        "description": r.description,
        "homepage": r.homepage,
        "language": r.language,
        "topics": r.topics,
        // `license` is the SPDX id; `license_name` the human label.
        "license": r.license.as_ref().and_then(|l| l.spdx_id.clone()),
        "license_name": r.license.as_ref().map(|l| l.name.clone()),
        "default_branch": r.default_branch,
        "stars": r.stargazers_count,
        "forks": r.forks_count,
        "watchers": r.subscribers_count,
        "open_issues": r.open_issues_count,
        "size_kb": r.size,
        "archived": r.archived,
        "fork": r.fork,
        "is_template": r.is_template,
        "has_issues": r.has_issues,
        "has_wiki": r.has_wiki,
        "has_pages": r.has_pages,
        "has_discussions": r.has_discussions,
        "created_at": r.created_at,
        "updated_at": r.updated_at,
        "pushed_at": r.pushed_at,
        "html_url": r.html_url,
    }))
}
|
||||
|
||||
/// Extract `(owner, repo)` from a GitHub URL, ignoring query string,
/// fragment and trailing slash. Returns `None` when the URL has no
/// scheme or fewer than two path segments.
fn parse_owner_repo(url: &str) -> Option<(String, String)> {
    let (_, path) = url.split("://").nth(1)?.split_once('/')?;
    let clean = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut parts = clean.split('/').filter(|seg| !seg.is_empty());
    match (parts.next(), parts.next()) {
        (Some(owner), Some(repo)) => Some((owner.to_string(), repo.to_string())),
        _ => None,
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// GitHub API types — only the fields we surface
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Subset of GitHub's repository object — only fields surfaced in the
// output JSON. Everything is Option so missing API fields degrade to
// null instead of failing deserialization.
#[derive(Deserialize)]
struct Repo {
    name: Option<String>,
    full_name: Option<String>,
    description: Option<String>,
    homepage: Option<String>,
    language: Option<String>,
    // Absent for repos with no topics; default to empty.
    #[serde(default)]
    topics: Vec<String>,
    license: Option<License>,
    default_branch: Option<String>,
    stargazers_count: Option<i64>,
    forks_count: Option<i64>,
    // Surfaced as "watchers" in extract()'s output.
    subscribers_count: Option<i64>,
    open_issues_count: Option<i64>,
    // Surfaced as "size_kb" — assumed to be kilobytes per the output
    // key; TODO confirm against GitHub API docs.
    size: Option<i64>,
    archived: Option<bool>,
    fork: Option<bool>,
    is_template: Option<bool>,
    has_issues: Option<bool>,
    has_wiki: Option<bool>,
    has_pages: Option<bool>,
    has_discussions: Option<bool>,
    created_at: Option<String>,
    updated_at: Option<String>,
    pushed_at: Option<String>,
    html_url: Option<String>,
    owner: Option<Owner>,
}
|
||||
|
||||
// Repo owner — only the login (username/org name) is surfaced.
#[derive(Deserialize)]
struct Owner {
    login: String,
}
|
||||
|
||||
// Repo license. `spdx_id` can be absent/null for custom licenses.
#[derive(Deserialize)]
struct License {
    name: String,
    spdx_id: Option<String>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // `matches` must claim only /{owner}/{repo} roots: sub-pages,
    // single-segment paths, reserved namespaces and foreign hosts are
    // all rejected.
    #[test]
    fn matches_repo_root_only() {
        assert!(matches("https://github.com/rust-lang/rust"));
        assert!(matches("https://github.com/rust-lang/rust/"));
        assert!(!matches("https://github.com/rust-lang/rust/issues"));
        assert!(!matches("https://github.com/rust-lang/rust/pulls/123"));
        assert!(!matches("https://github.com/rust-lang"));
        assert!(!matches("https://github.com/marketplace"));
        assert!(!matches("https://github.com/topics/rust"));
        assert!(!matches("https://example.com/foo/bar"));
    }

    // Trailing slash and query string must not leak into the repo name.
    #[test]
    fn parse_owner_repo_handles_trailing_slash_and_query() {
        assert_eq!(
            parse_owner_repo("https://github.com/rust-lang/rust"),
            Some(("rust-lang".into(), "rust".into()))
        );
        assert_eq!(
            parse_owner_repo("https://github.com/rust-lang/rust/?tab=foo"),
            Some(("rust-lang".into(), "rust".into()))
        );
    }
}
|
||||
186
crates/webclaw-fetch/src/extractors/hackernews.rs
Normal file
186
crates/webclaw-fetch/src/extractors/hackernews.rs
Normal file
|
|
@ -0,0 +1,186 @@
|
|||
//! Hacker News structured extractor.
|
||||
//!
|
||||
//! Uses Algolia's HN API (`hn.algolia.com/api/v1/items/{id}`) which
|
||||
//! returns the full post + recursive comment tree in a single request.
|
||||
//! The official Firebase API at `hacker-news.firebaseio.com` requires
|
||||
//! N+1 fetches per comment, so we'd hit either timeout or rate-limit
|
||||
//! on any non-trivial thread.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
// Catalog entry surfaced by GET /v1/extractors; `name` doubles as the
// route segment for POST /v1/scrape/{vertical}.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "hackernews",
    label: "Hacker News story",
    description: "Returns post + nested comment tree for a Hacker News item.",
    url_patterns: &[
        "https://news.ycombinator.com/item?id=N",
        "https://hn.algolia.com/items/N",
    ],
};
|
||||
|
||||
/// True for Hacker News item URLs: the canonical
/// `news.ycombinator.com/item?id=N` form (also its `%3F`-encoded
/// variant) or Algolia's `hn.algolia.com/items/N` mirror form.
pub fn matches(url: &str) -> bool {
    let without_scheme = url.split("://").nth(1).unwrap_or(url);
    let host = without_scheme.split('/').next().unwrap_or("");
    match host {
        "news.ycombinator.com" => url.contains("item?id=") || url.contains("item%3Fid="),
        "hn.algolia.com" => url.contains("/items/"),
        _ => false,
    }
}
|
||||
|
||||
/// Fetch a HN item via Algolia (`hn.algolia.com/api/v1/items/{id}`)
/// and return `{url, post, comments}` where `comments` is the nested
/// reply tree.
///
/// # Errors
/// - `FetchError::Build` when no item id can be parsed from `url` or
///   the API returns a non-200 status.
/// - `FetchError::BodyDecode` when the body is not parseable as an
///   `AlgoliaItem`.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let id = parse_item_id(url).ok_or_else(|| {
        FetchError::Build(format!("hackernews: cannot parse item id from '{url}'"))
    })?;

    let api_url = format!("https://hn.algolia.com/api/v1/items/{id}");
    let resp = client.fetch(&api_url).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "hn algolia returned status {}",
            resp.status
        )));
    }

    // `resp.html` carries the raw response body (JSON here).
    let item: AlgoliaItem = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("hn algolia parse: {e}")))?;

    // Top-level node is the story/ask/show post; its children are the
    // root comments, recursively converted.
    let post = post_json(&item);
    let comments: Vec<Value> = item.children.iter().filter_map(comment_json).collect();

    Ok(json!({
        "url": url,
        "post": post,
        "comments": comments,
    }))
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Pull the numeric id out of a HN URL. Handles `item?id=N` and the
/// Algolia mirror's `/items/N` form.
///
/// Returns `None` when neither form yields a parseable `u64`.
fn parse_item_id(url: &str) -> Option<u64> {
    if let Some(after) = url.split("id=").nth(1) {
        // Stop at the next query parameter OR a fragment: the original
        // only split on '&', so `item?id=123#foo` failed to parse and
        // the whole extraction errored out.
        let n = after.split(['&', '#']).next().unwrap_or(after);
        if let Ok(id) = n.parse::<u64>() {
            return Some(id);
        }
    }
    if let Some(after) = url.split("/items/").nth(1) {
        let n = after.split(['/', '?', '#']).next().unwrap_or(after);
        if let Ok(id) = n.parse::<u64>() {
            return Some(id);
        }
    }
    None
}
|
||||
|
||||
// Convert the top-level Algolia item into the `post` object of the
// output. Missing fields serialize as null.
fn post_json(item: &AlgoliaItem) -> Value {
    json!({
        "id": item.id,
        "type": item.r#type,
        "title": item.title,
        "url": item.url,
        "author": item.author,
        "points": item.points,
        "text": item.text, // populated for ask/show/tell
        "created_at": item.created_at,
        "created_at_unix": item.created_at_i,
        // Recursive count of comment-type descendants, not just direct
        // children.
        "comment_count": count_descendants(item),
        // Canonical HN link, reconstructed from the id (null if the
        // API omitted the id).
        "permalink": item.id.map(|i| format!("https://news.ycombinator.com/item?id={i}")),
    })
}
|
||||
|
||||
// Convert one Algolia child node into a comment object, recursing into
// its replies. Returns None for non-comment nodes (e.g. poll options)
// so `filter_map` drops them.
fn comment_json(item: &AlgoliaItem) -> Option<Value> {
    if !matches!(item.r#type.as_deref(), Some("comment")) {
        return None;
    }
    // Dead/deleted comments still appear in the tree; surface them honestly.
    let replies: Vec<Value> = item.children.iter().filter_map(comment_json).collect();
    Some(json!({
        "id": item.id,
        "author": item.author,
        "text": item.text,
        "created_at": item.created_at,
        "created_at_unix": item.created_at_i,
        "parent_id": item.parent_id,
        "story_id": item.story_id,
        "replies": replies,
    }))
}
|
||||
|
||||
fn count_descendants(item: &AlgoliaItem) -> usize {
|
||||
item.children
|
||||
.iter()
|
||||
.filter(|c| matches!(c.r#type.as_deref(), Some("comment")))
|
||||
.map(|c| 1 + count_descendants(c))
|
||||
.sum()
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Algolia API types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// One node of Algolia's recursive item tree: the story at the root,
// comments below. All scalars are Option so partial/deleted items
// still deserialize.
#[derive(Deserialize)]
struct AlgoliaItem {
    id: Option<u64>,
    // "story" / "comment" / "poll" etc.; raw identifier needed because
    // `type` is a Rust keyword.
    r#type: Option<String>,
    title: Option<String>,
    url: Option<String>,
    author: Option<String>,
    points: Option<i64>,
    text: Option<String>,
    created_at: Option<String>,
    created_at_i: Option<i64>,
    parent_id: Option<u64>,
    story_id: Option<u64>,
    // Leaf nodes have no `children` key; default to empty.
    #[serde(default)]
    children: Vec<AlgoliaItem>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Both the canonical item URL form and the Algolia mirror form are
    // claimed.
    #[test]
    fn matches_hn_item_urls() {
        assert!(matches("https://news.ycombinator.com/item?id=1"));
        assert!(matches("https://news.ycombinator.com/item?id=12345"));
        assert!(matches("https://hn.algolia.com/items/1"));
    }

    // Front pages and foreign hosts must not be claimed.
    #[test]
    fn rejects_non_item_urls() {
        assert!(!matches("https://news.ycombinator.com/"));
        assert!(!matches("https://news.ycombinator.com/news"));
        assert!(!matches("https://example.com/item?id=1"));
    }

    // The id parser must handle query-param and path forms, and ignore
    // trailing parameters.
    #[test]
    fn parse_item_id_handles_both_forms() {
        assert_eq!(
            parse_item_id("https://news.ycombinator.com/item?id=1"),
            Some(1)
        );
        assert_eq!(
            parse_item_id("https://news.ycombinator.com/item?id=12345&p=2"),
            Some(12345)
        );
        assert_eq!(parse_item_id("https://hn.algolia.com/items/999"), Some(999));
        assert_eq!(parse_item_id("https://example.com/foo"), None);
    }
}
|
||||
223
crates/webclaw-fetch/src/extractors/huggingface_model.rs
Normal file
223
crates/webclaw-fetch/src/extractors/huggingface_model.rs
Normal file
|
|
@ -0,0 +1,223 @@
|
|||
//! HuggingFace model card structured extractor.
|
||||
//!
|
||||
//! Uses the public model API at `huggingface.co/api/models/{owner}/{name}`.
|
||||
//! Returns metadata + the parsed model card front matter, but does not
|
||||
//! pull the full README body — those are sometimes 100KB+ and the user
|
||||
//! can hit /v1/scrape if they want it as markdown.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
// Catalog entry surfaced by GET /v1/extractors; `name` doubles as the
// route segment for POST /v1/scrape/{vertical}.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "huggingface_model",
    label: "HuggingFace model",
    description: "Returns model metadata: downloads, likes, license, pipeline tag, library name, file list.",
    url_patterns: &["https://huggingface.co/{owner}/{name}"],
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
if host != "huggingface.co" && host != "www.huggingface.co" {
|
||||
return false;
|
||||
}
|
||||
let path = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.and_then(|s| s.split_once('/'))
|
||||
.map(|(_, p)| p)
|
||||
.unwrap_or("");
|
||||
let stripped = path
|
||||
.split(['?', '#'])
|
||||
.next()
|
||||
.unwrap_or("")
|
||||
.trim_end_matches('/');
|
||||
let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
|
||||
// /{owner}/{name} but reject HF-internal sections + sub-pages.
|
||||
if segs.len() != 2 {
|
||||
return false;
|
||||
}
|
||||
!RESERVED_NAMESPACES.contains(&segs[0])
|
||||
}
|
||||
|
||||
// First path segment values that are HF site sections rather than
// model owners; `matches` rejects URLs whose owner segment appears
// here.
const RESERVED_NAMESPACES: &[&str] = &[
    "datasets",
    "spaces",
    "blog",
    "docs",
    "api",
    "models",
    "papers",
    "pricing",
    "tasks",
    "join",
    "login",
    "settings",
    "organizations",
    "new",
    "search",
];
|
||||
|
||||
/// Fetch model metadata for `url` from
/// `huggingface.co/api/models/{owner}/{name}` and flatten it into the
/// documented typed-JSON shape (metadata + card front matter + file
/// list; the README body is intentionally not fetched).
///
/// # Errors
/// - `FetchError::Build` when the URL has no parseable owner/name, the
///   model 404s, the repo is gated (401), or any other non-200 status
///   comes back.
/// - `FetchError::BodyDecode` when the body is not JSON we can
///   deserialize into the `ModelInfo` subset.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (owner, name) = parse_owner_name(url).ok_or_else(|| {
        FetchError::Build(format!("hf model: cannot parse owner/name from '{url}'"))
    })?;

    let api_url = format!("https://huggingface.co/api/models/{owner}/{name}");
    let resp = client.fetch(&api_url).await?;
    // Specific failure statuses first, then the generic catch-all.
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "hf model: '{owner}/{name}' not found"
        )));
    }
    if resp.status == 401 {
        return Err(FetchError::Build(format!(
            "hf model: '{owner}/{name}' requires authentication (gated repo)"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "hf api returned status {}",
            resp.status
        )));
    }

    // `resp.html` carries the raw response body (JSON here).
    let m: ModelInfo = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("hf api parse: {e}")))?;

    // Surface a flat file list — full siblings can be hundreds of entries
    // for big repos. We keep it as-is because callers want to know about
    // every shard; if it bloats responses too much we'll add pagination.
    let files: Vec<Value> = m
        .siblings
        .iter()
        .map(|s| json!({"rfilename": s.rfilename, "size": s.size}))
        .collect();

    Ok(json!({
        "url": url,
        "id": m.id,
        "model_id": m.model_id,
        "private": m.private,
        "gated": m.gated,
        "downloads": m.downloads,
        // NOTE(review): this key says 30d but the value comes from the
        // API's `downloadsAllTime` field — looks mislabeled. The key is
        // part of the public JSON contract, so confirm intent before
        // renaming.
        "downloads_30d": m.downloads_all_time,
        "likes": m.likes,
        "library_name": m.library_name,
        "pipeline_tag": m.pipeline_tag,
        "tags": m.tags,
        // Card front-matter values can be scalar or array; passed
        // through as-is.
        "license": m.card_data.as_ref().and_then(|c| c.license.clone()),
        "language": m.card_data.as_ref().and_then(|c| c.language.clone()),
        "datasets": m.card_data.as_ref().and_then(|c| c.datasets.clone()),
        "base_model": m.card_data.as_ref().and_then(|c| c.base_model.clone()),
        "model_type": m.card_data.as_ref().and_then(|c| c.model_type.clone()),
        "created_at": m.created_at,
        "last_modified": m.last_modified,
        "sha": m.sha,
        "file_count": m.siblings.len(),
        "files": files,
    }))
}
|
||||
|
||||
/// Host portion of `url`: the text between the scheme (if any) and the
/// first `/`. With no scheme present the whole string is treated as
/// starting at the host.
fn host_of(url: &str) -> &str {
    let rest = url.split("://").nth(1).unwrap_or(url);
    rest.split('/').next().unwrap_or("")
}
|
||||
|
||||
/// Extract `(owner, name)` from a HuggingFace model URL, ignoring
/// query string, fragment and trailing slash. Returns `None` when the
/// URL has no scheme or fewer than two path segments.
fn parse_owner_name(url: &str) -> Option<(String, String)> {
    let (_, path) = url.split("://").nth(1)?.split_once('/')?;
    let clean = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut parts = clean.split('/').filter(|seg| !seg.is_empty());
    match (parts.next(), parts.next()) {
        (Some(owner), Some(name)) => Some((owner.to_string(), name.to_string())),
        _ => None,
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HF API types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ModelInfo {
|
||||
id: Option<String>,
|
||||
#[serde(rename = "modelId")]
|
||||
model_id: Option<String>,
|
||||
private: Option<bool>,
|
||||
gated: Option<serde_json::Value>, // bool or string ("auto" / "manual" / false)
|
||||
downloads: Option<i64>,
|
||||
#[serde(rename = "downloadsAllTime")]
|
||||
downloads_all_time: Option<i64>,
|
||||
likes: Option<i64>,
|
||||
#[serde(rename = "library_name")]
|
||||
library_name: Option<String>,
|
||||
#[serde(rename = "pipeline_tag")]
|
||||
pipeline_tag: Option<String>,
|
||||
#[serde(default)]
|
||||
tags: Vec<String>,
|
||||
#[serde(rename = "createdAt")]
|
||||
created_at: Option<String>,
|
||||
#[serde(rename = "lastModified")]
|
||||
last_modified: Option<String>,
|
||||
sha: Option<String>,
|
||||
#[serde(rename = "cardData")]
|
||||
card_data: Option<CardData>,
|
||||
#[serde(default)]
|
||||
siblings: Vec<Sibling>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct CardData {
|
||||
license: Option<serde_json::Value>, // string or array
|
||||
language: Option<serde_json::Value>,
|
||||
datasets: Option<serde_json::Value>,
|
||||
#[serde(rename = "base_model")]
|
||||
base_model: Option<serde_json::Value>,
|
||||
#[serde(rename = "model_type")]
|
||||
model_type: Option<String>,
|
||||
}
|
||||
|
||||
// One file in the model repo. `size` is not always populated by the
// API, hence Option.
#[derive(Deserialize)]
struct Sibling {
    rfilename: String,
    size: Option<i64>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Plain /{owner}/{name} pages are claimed; deeper paths and
    // reserved sections are not.
    #[test]
    fn matches_model_pages() {
        assert!(matches("https://huggingface.co/meta-llama/Meta-Llama-3-8B"));
        assert!(matches("https://huggingface.co/openai/whisper-large-v3"));
        assert!(matches("https://huggingface.co/bert-base-uncased/main")); // owner=bert-base-uncased name=main: false positive but acceptable for v1
    }

    // HF-internal sections and single-segment paths are rejected.
    #[test]
    fn rejects_hf_section_pages() {
        assert!(!matches("https://huggingface.co/datasets/squad"));
        assert!(!matches("https://huggingface.co/spaces/foo/bar"));
        assert!(!matches("https://huggingface.co/blog/intro"));
        assert!(!matches("https://huggingface.co/"));
        assert!(!matches("https://huggingface.co/meta-llama"));
    }

    // Query string must not leak into the model name.
    #[test]
    fn parse_owner_name_pulls_both() {
        assert_eq!(
            parse_owner_name("https://huggingface.co/meta-llama/Meta-Llama-3-8B"),
            Some(("meta-llama".into(), "Meta-Llama-3-8B".into()))
        );
        assert_eq!(
            parse_owner_name("https://huggingface.co/openai/whisper-large-v3?library=transformers"),
            Some(("openai".into(), "whisper-large-v3".into()))
        );
    }
}
|
||||
199
crates/webclaw-fetch/src/extractors/mod.rs
Normal file
199
crates/webclaw-fetch/src/extractors/mod.rs
Normal file
|
|
@ -0,0 +1,199 @@
|
|||
//! Vertical extractors: site-specific parsers that return typed JSON
|
||||
//! instead of generic markdown.
|
||||
//!
|
||||
//! Each extractor handles a single site or platform and exposes:
|
||||
//! - `matches(url)` to claim ownership of a URL pattern
|
||||
//! - `extract(client, url)` to fetch + parse into a typed JSON `Value`
|
||||
//! - `INFO` static for the catalog (`/v1/extractors`)
|
||||
//!
|
||||
//! The dispatch in this module is a simple `match`-style chain rather than
|
||||
//! a trait registry. With ~30 extractors that's still fast and avoids the
|
||||
//! ceremony of dynamic dispatch. If we hit 50+ we'll revisit.
|
||||
//!
|
||||
//! Extractors prefer official JSON APIs over HTML scraping where one
|
||||
//! exists (Reddit, HN/Algolia, PyPI, npm, GitHub, HuggingFace all have
|
||||
//! one). HTML extraction is the fallback for sites that don't.
|
||||
|
||||
pub mod github_repo;
|
||||
pub mod hackernews;
|
||||
pub mod huggingface_model;
|
||||
pub mod npm;
|
||||
pub mod pypi;
|
||||
pub mod reddit;
|
||||
|
||||
use serde::Serialize;
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Public catalog entry for `/v1/extractors`. Stable shape — clients
/// rely on `name` to pick the right `/v1/scrape/{name}` route.
///
/// All fields are `&'static` so each extractor can expose its entry as
/// a `const INFO`.
#[derive(Debug, Clone, Serialize)]
pub struct ExtractorInfo {
    /// URL-safe identifier (`reddit`, `hackernews`, `github_repo`, ...).
    pub name: &'static str,
    /// Human-friendly display name.
    pub label: &'static str,
    /// One-line description of what the extractor returns.
    pub description: &'static str,
    /// Glob-ish URL pattern(s) the extractor claims. For documentation;
    /// the actual matching is done by the extractor's `matches` fn.
    pub url_patterns: &'static [&'static str],
}
|
||||
|
||||
/// Full catalog. Order is stable; new entries append.
|
||||
pub fn list() -> Vec<ExtractorInfo> {
|
||||
vec![
|
||||
reddit::INFO,
|
||||
hackernews::INFO,
|
||||
github_repo::INFO,
|
||||
pypi::INFO,
|
||||
npm::INFO,
|
||||
huggingface_model::INFO,
|
||||
]
|
||||
}
|
||||
|
||||
/// Auto-detect mode: try every extractor's `matches`, return the first
/// one that claims the URL. Used by `/v1/scrape` when the caller doesn't
/// pick a vertical explicitly.
///
/// Returns `None` when no extractor claims the URL (caller falls back
/// to generic scraping), `Some(Ok((name, json)))` on success, and
/// `Some(Err(_))` when the claiming extractor itself failed.
///
/// NOTE: the order of this chain defines extractor precedence — it
/// mirrors the order in `list()`. Keep the two in sync when adding
/// verticals.
pub async fn dispatch_by_url(
    client: &FetchClient,
    url: &str,
) -> Option<Result<(&'static str, Value), FetchError>> {
    if reddit::matches(url) {
        return Some(
            reddit::extract(client, url)
                .await
                .map(|v| (reddit::INFO.name, v)),
        );
    }
    if hackernews::matches(url) {
        return Some(
            hackernews::extract(client, url)
                .await
                .map(|v| (hackernews::INFO.name, v)),
        );
    }
    if github_repo::matches(url) {
        return Some(
            github_repo::extract(client, url)
                .await
                .map(|v| (github_repo::INFO.name, v)),
        );
    }
    if pypi::matches(url) {
        return Some(
            pypi::extract(client, url)
                .await
                .map(|v| (pypi::INFO.name, v)),
        );
    }
    if npm::matches(url) {
        return Some(npm::extract(client, url).await.map(|v| (npm::INFO.name, v)));
    }
    if huggingface_model::matches(url) {
        return Some(
            huggingface_model::extract(client, url)
                .await
                .map(|v| (huggingface_model::INFO.name, v)),
        );
    }
    None
}
|
||||
|
||||
/// Explicit mode: caller picked the vertical (`POST /v1/scrape/reddit`).
/// We still validate that the URL plausibly belongs to that vertical so
/// users get a clear "wrong route" error instead of a confusing parse
/// failure deep in the extractor.
///
/// # Errors
/// - `UnknownVertical` when `name` is not a catalog entry.
/// - `UrlMismatch` when the URL fails the vertical's `matches` check.
/// - `Fetch` wrapping any error raised inside the extractor itself.
pub async fn dispatch_by_name(
    client: &FetchClient,
    name: &str,
    url: &str,
) -> Result<Value, ExtractorDispatchError> {
    // Match guards compare against each INFO.name so route names and
    // catalog names cannot drift apart.
    match name {
        n if n == reddit::INFO.name => {
            run_or_mismatch(reddit::matches(url), n, url, || {
                reddit::extract(client, url)
            })
            .await
        }
        n if n == hackernews::INFO.name => {
            run_or_mismatch(hackernews::matches(url), n, url, || {
                hackernews::extract(client, url)
            })
            .await
        }
        n if n == github_repo::INFO.name => {
            run_or_mismatch(github_repo::matches(url), n, url, || {
                github_repo::extract(client, url)
            })
            .await
        }
        n if n == pypi::INFO.name => {
            run_or_mismatch(pypi::matches(url), n, url, || pypi::extract(client, url)).await
        }
        n if n == npm::INFO.name => {
            run_or_mismatch(npm::matches(url), n, url, || npm::extract(client, url)).await
        }
        n if n == huggingface_model::INFO.name => {
            run_or_mismatch(huggingface_model::matches(url), n, url, || {
                huggingface_model::extract(client, url)
            })
            .await
        }
        _ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())),
    }
}
|
||||
|
||||
/// Errors that the dispatcher itself raises (vs. errors from inside an
/// extractor, which come back wrapped in `Fetch`).
#[derive(Debug, thiserror::Error)]
pub enum ExtractorDispatchError {
    /// `name` did not match any catalog entry.
    #[error("unknown vertical: '{0}'")]
    UnknownVertical(String),

    /// Caller picked a vertical whose `matches` rejected the URL.
    #[error("URL '{url}' does not match the '{vertical}' extractor")]
    UrlMismatch { vertical: String, url: String },

    /// An error raised inside the selected extractor.
    #[error(transparent)]
    Fetch(#[from] FetchError),
}
|
||||
|
||||
/// Helper: when the caller explicitly picked a vertical but their URL
|
||||
/// doesn't match it, return `UrlMismatch` instead of running the
|
||||
/// extractor (which would just fail with a less-clear error).
|
||||
async fn run_or_mismatch<F, Fut>(
|
||||
matches: bool,
|
||||
vertical: &str,
|
||||
url: &str,
|
||||
f: F,
|
||||
) -> Result<Value, ExtractorDispatchError>
|
||||
where
|
||||
F: FnOnce() -> Fut,
|
||||
Fut: std::future::Future<Output = Result<Value, FetchError>>,
|
||||
{
|
||||
if !matches {
|
||||
return Err(ExtractorDispatchError::UrlMismatch {
|
||||
vertical: vertical.to_string(),
|
||||
url: url.to_string(),
|
||||
});
|
||||
}
|
||||
f().await.map_err(ExtractorDispatchError::Fetch)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // The catalog must have entries and no duplicate names, since
    // `name` is the routing key for /v1/scrape/{name}.
    #[test]
    fn list_is_non_empty_and_unique() {
        let entries = list();
        assert!(!entries.is_empty());
        let mut names: Vec<_> = entries.iter().map(|e| e.name).collect();
        names.sort();
        let before = names.len();
        names.dedup();
        assert_eq!(before, names.len(), "extractor names must be unique");
    }
}
|
||||
235
crates/webclaw-fetch/src/extractors/npm.rs
Normal file
235
crates/webclaw-fetch/src/extractors/npm.rs
Normal file
|
|
@ -0,0 +1,235 @@
|
|||
//! npm package structured extractor.
|
||||
//!
|
||||
//! Uses two npm-run APIs:
|
||||
//! - `registry.npmjs.org/{name}` for full package metadata
|
||||
//! - `api.npmjs.org/downloads/point/last-week/{name}` for usage signal
|
||||
//!
|
||||
//! The registry API returns the *full* document including every version
|
||||
//! ever published, which can be tens of MB for popular packages
|
||||
//! (`@types/node` etc). We strip down to the latest version's manifest
|
||||
//! and a count of releases — full history would explode the response.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
// Catalog entry surfaced by GET /v1/extractors; `name` doubles as the
// route segment for POST /v1/scrape/{vertical}.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "npm",
    label: "npm package",
    description: "Returns package metadata: latest version manifest, dependencies, weekly downloads, license.",
    url_patterns: &["https://www.npmjs.com/package/{name}"],
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
if host != "www.npmjs.com" && host != "npmjs.com" {
|
||||
return false;
|
||||
}
|
||||
url.contains("/package/")
|
||||
}
|
||||
|
||||
/// Fetch npm package metadata for `url` and shape it into the documented
/// typed JSON response.
///
/// Two upstream calls: the registry document (required) and the weekly
/// downloads endpoint (best-effort — `weekly_downloads` is `null` on
/// failure). Errors: `FetchError::Build` for unparseable URLs, 404s and
/// other non-200 statuses; `FetchError::BodyDecode` for malformed JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let name = parse_name(url)
        .ok_or_else(|| FetchError::Build(format!("npm: cannot parse name from '{url}'")))?;

    let registry_url = format!("https://registry.npmjs.org/{}", urlencode_segment(&name));
    let resp = client.fetch(&registry_url).await?;
    // 404 gets a dedicated message so callers can tell "no such package"
    // from transient upstream failures.
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "npm: package '{name}' not found"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "npm registry returned status {}",
            resp.status
        )));
    }

    let pkg: PackageDoc = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("npm registry parse: {e}")))?;

    // Resolve "latest" to a concrete version. If dist-tags is absent, fall
    // back to the last key of the BTreeMap-ordered versions map.
    let latest_version = pkg
        .dist_tags
        .as_ref()
        .and_then(|t| t.get("latest"))
        .cloned()
        .or_else(|| pkg.versions.as_ref().and_then(|v| v.keys().last().cloned()));

    // Manifest of the resolved latest version, if present in `versions`.
    let latest_manifest = latest_version
        .as_deref()
        .and_then(|v| pkg.versions.as_ref().and_then(|m| m.get(v)));

    let release_count = pkg.versions.as_ref().map(|v| v.len()).unwrap_or(0);
    // Publish timestamp of the latest version, looked up in the `time` map.
    let latest_release_date = latest_version
        .as_deref()
        .and_then(|v| pkg.time.as_ref().and_then(|t| t.get(v).cloned()));

    // Best-effort weekly downloads. If the api.npmjs.org call fails we
    // surface `null` rather than failing the whole extractor — npm
    // sometimes 503s the downloads endpoint while the registry is up.
    let weekly_downloads = fetch_weekly_downloads(client, &name).await.ok();

    Ok(json!({
        "url": url,
        "name": pkg.name.clone().unwrap_or(name.clone()),
        "description": pkg.description,
        "latest_version": latest_version,
        "license": latest_manifest.and_then(|m| m.license.clone()),
        "homepage": pkg.homepage,
        "repository": pkg.repository.as_ref().and_then(|r| r.url.clone()),
        "dependencies": latest_manifest.and_then(|m| m.dependencies.clone()),
        "dev_dependencies": latest_manifest.and_then(|m| m.dev_dependencies.clone()),
        "peer_dependencies": latest_manifest.and_then(|m| m.peer_dependencies.clone()),
        "keywords": pkg.keywords,
        "maintainers": pkg.maintainers,
        "deprecated": latest_manifest.and_then(|m| m.deprecated.clone()),
        "release_count": release_count,
        "latest_release_date": latest_release_date,
        "weekly_downloads": weekly_downloads,
    }))
}
|
||||
|
||||
/// Hit `api.npmjs.org/downloads/point/last-week/{name}` and return the
/// raw weekly download count. Any non-200 status is an error; the caller
/// (`extract`) treats failure as "downloads unavailable", not fatal.
async fn fetch_weekly_downloads(client: &FetchClient, name: &str) -> Result<i64, FetchError> {
    let url = format!(
        "https://api.npmjs.org/downloads/point/last-week/{}",
        urlencode_segment(name)
    );
    let resp = client.fetch(&url).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "npm downloads api status {}",
            resp.status
        )));
    }
    let dl: Downloads = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("npm downloads parse: {e}")))?;
    Ok(dl.downloads)
}
|
||||
|
||||
/// Best-effort host extraction: drop the scheme, then take everything up
/// to the first `/`. Not a full URL parser — good enough for host checks.
fn host_of(url: &str) -> &str {
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    match after_scheme.find('/') {
        Some(slash) => &after_scheme[..slash],
        None => after_scheme,
    }
}
|
||||
|
||||
/// Extract the package name from an npmjs.com URL. Handles scoped packages
/// (`/package/@scope/name`) and trailing path segments (`/v/x.y.z`).
fn parse_name(url: &str) -> Option<String> {
    let after = url.split("/package/").nth(1)?;
    // Drop query string / fragment, then any trailing slash.
    let path = after.split(['?', '#']).next()?.trim_end_matches('/');
    let mut parts = path.split('/').filter(|seg| !seg.is_empty());
    match parts.next()? {
        // Scoped package: the name spans two path segments.
        scope if scope.starts_with('@') => {
            let pkg = parts.next()?;
            Some(format!("{scope}/{pkg}"))
        }
        plain => Some(plain.to_owned()),
    }
}
|
||||
|
||||
/// `@scope/name` must encode the `/` for the registry path. Plain names
/// pass through untouched.
fn urlencode_segment(name: &str) -> String {
    name.split('/').collect::<Vec<_>>().join("%2F")
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Registry types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Subset of the registry.npmjs.org package document. Only the fields we
/// surface are declared; everything else in the (potentially huge)
/// document is dropped by serde during deserialization.
#[derive(Deserialize)]
struct PackageDoc {
    name: Option<String>,
    description: Option<String>,
    homepage: Option<serde_json::Value>, // sometimes string, sometimes object
    repository: Option<Repository>,
    keywords: Option<Vec<String>>,
    maintainers: Option<Vec<Maintainer>>,
    // Maps a tag ("latest", ...) to a concrete version string.
    #[serde(rename = "dist-tags")]
    dist_tags: Option<std::collections::BTreeMap<String, String>>,
    // Every published version's manifest, keyed by version string.
    versions: Option<std::collections::BTreeMap<String, VersionManifest>>,
    // Publish timestamps keyed by version string.
    time: Option<std::collections::BTreeMap<String, String>>,
}

/// Per-version manifest fields surfaced for the resolved latest version.
#[derive(Deserialize, Default, Clone)]
struct VersionManifest {
    license: Option<serde_json::Value>, // string or object
    dependencies: Option<std::collections::BTreeMap<String, String>>,
    #[serde(rename = "devDependencies")]
    dev_dependencies: Option<std::collections::BTreeMap<String, String>>,
    #[serde(rename = "peerDependencies")]
    peer_dependencies: Option<std::collections::BTreeMap<String, String>>,
    // `deprecated` is sometimes a bool and sometimes a string in the
    // registry. serde_json::Value covers both without failing the parse.
    deprecated: Option<serde_json::Value>,
}

/// `repository` object from the registry document; only the URL is kept.
#[derive(Deserialize)]
struct Repository {
    url: Option<String>,
}
|
||||
|
||||
#[derive(Deserialize, Clone)]
|
||||
struct Maintainer {
|
||||
name: Option<String>,
|
||||
email: Option<String>,
|
||||
}
|
||||
|
||||
impl serde::Serialize for Maintainer {
|
||||
fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
|
||||
use serde::ser::SerializeMap;
|
||||
let mut m = s.serialize_map(Some(2))?;
|
||||
m.serialize_entry("name", &self.name)?;
|
||||
m.serialize_entry("email", &self.email)?;
|
||||
m.end()
|
||||
}
|
||||
}
|
||||
|
||||
/// Response shape of api.npmjs.org's point-downloads endpoint; only the
/// raw count is needed.
#[derive(Deserialize)]
struct Downloads {
    downloads: i64,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_npm_package_urls() {
        for ok in [
            "https://www.npmjs.com/package/react",
            "https://www.npmjs.com/package/@types/node",
            "https://npmjs.com/package/lodash",
        ] {
            assert!(matches(ok), "should match: {ok}");
        }
        for bad in ["https://www.npmjs.com/", "https://example.com/package/foo"] {
            assert!(!matches(bad), "should not match: {bad}");
        }
    }

    #[test]
    fn parse_name_handles_scoped_and_unscoped() {
        let cases = [
            ("https://www.npmjs.com/package/react", "react"),
            ("https://www.npmjs.com/package/@types/node", "@types/node"),
            ("https://www.npmjs.com/package/lodash/v/4.17.21", "lodash"),
        ];
        for (url, want) in cases {
            assert_eq!(parse_name(url).as_deref(), Some(want));
        }
    }

    #[test]
    fn urlencode_only_touches_scope_separator() {
        assert_eq!(urlencode_segment("react"), "react");
        assert_eq!(urlencode_segment("@types/node"), "@types%2Fnode");
    }
}
|
||||
184
crates/webclaw-fetch/src/extractors/pypi.rs
Normal file
184
crates/webclaw-fetch/src/extractors/pypi.rs
Normal file
|
|
@ -0,0 +1,184 @@
|
|||
//! PyPI package structured extractor.
|
||||
//!
|
||||
//! PyPI exposes a stable JSON API at `pypi.org/pypi/{name}/json` and
|
||||
//! a versioned form at `pypi.org/pypi/{name}/{version}/json`. Both
|
||||
//! return the full release info plus history. No auth, no rate limits
|
||||
//! that we hit at normal usage.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry surfaced by `GET /v1/extractors`. `url_patterns` is
/// user-facing documentation only; the matching logic lives in `matches()`.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "pypi",
    label: "PyPI package",
    description: "Returns package metadata: latest version, dependencies, license, release history.",
    url_patterns: &[
        "https://pypi.org/project/{name}/",
        "https://pypi.org/project/{name}/{version}/",
    ],
};
|
||||
|
||||
/// True when `url` points at a pypi.org project page.
pub fn matches(url: &str) -> bool {
    // Host extraction inlined: drop the scheme, take up to the first '/'.
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    matches!(host, "pypi.org" | "www.pypi.org") && url.contains("/project/")
}
|
||||
|
||||
/// Fetch PyPI metadata for `url` and shape it into the documented typed
/// JSON response.
///
/// Uses the versioned API form when the URL pins a version, otherwise the
/// latest-release form. Errors: `FetchError::Build` for unparseable URLs,
/// 404s and other non-200 statuses; `FetchError::BodyDecode` for
/// malformed API JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (name, version) = parse_project(url).ok_or_else(|| {
        FetchError::Build(format!("pypi: cannot parse package name from '{url}'"))
    })?;

    let api_url = match &version {
        Some(v) => format!("https://pypi.org/pypi/{name}/{v}/json"),
        None => format!("https://pypi.org/pypi/{name}/json"),
    };
    let resp = client.fetch(&api_url).await?;
    // 404 gets a dedicated message so callers can tell "no such package"
    // from transient upstream failures.
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "pypi: package '{name}' not found"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "pypi api returned status {}",
            resp.status
        )));
    }

    let pkg: PypiResponse = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("pypi parse: {e}")))?;

    let info = pkg.info;
    let release_count = pkg.releases.as_ref().map(|r| r.len()).unwrap_or(0);

    // Latest release date = max upload time across files in the latest version.
    let latest_release_date = pkg
        .releases
        .as_ref()
        .and_then(|map| info.version.as_deref().and_then(|v| map.get(v)))
        .and_then(|files| files.iter().filter_map(|f| f.upload_time.clone()).max());

    // Drop the long description from the JSON shape — it's frequently a 50KB
    // README and bloats responses. Callers who need it can hit /v1/scrape.
    Ok(json!({
        "url": url,
        "name": info.name,
        "version": info.version,
        "summary": info.summary,
        "homepage": info.home_page,
        "license": info.license,
        "license_classifier": pick_license_classifier(&info.classifiers),
        "author": info.author,
        "author_email": info.author_email,
        "maintainer": info.maintainer,
        "requires_python": info.requires_python,
        "requires_dist": info.requires_dist,
        "keywords": info.keywords,
        "classifiers": info.classifiers,
        "yanked": info.yanked,
        "yanked_reason": info.yanked_reason,
        "project_urls": info.project_urls,
        "release_count": release_count,
        "latest_release_date": latest_release_date,
    }))
}
|
||||
|
||||
/// PyPI puts the SPDX-ish license under classifiers like
/// `License :: OSI Approved :: Apache Software License`. Surface the most
/// specific one when the `license` field itself is empty/junk.
fn pick_license_classifier(classifiers: &Option<Vec<String>>) -> Option<String> {
    let list = classifiers.as_ref()?;
    let mut best: Option<&String> = None;
    for c in list {
        if !c.starts_with("License ::") {
            continue;
        }
        // Longest classifier wins; `>=` keeps the later entry on ties,
        // matching `max_by_key` semantics.
        if best.map_or(true, |b| c.len() >= b.len()) {
            best = Some(c);
        }
    }
    best.cloned()
}
|
||||
|
||||
/// Best-effort host extraction: strip the scheme if present, then keep
/// everything before the first `/`.
fn host_of(url: &str) -> &str {
    let rest = url.split("://").nth(1).unwrap_or(url);
    match rest.find('/') {
        Some(i) => &rest[..i],
        None => rest,
    }
}
|
||||
|
||||
/// Split a `/project/{name}[/{version}]` URL into its parts. Query
/// strings, fragments, and trailing slashes are stripped first.
fn parse_project(url: &str) -> Option<(String, Option<String>)> {
    let tail = url.split("/project/").nth(1)?;
    let path = tail.split(['?', '#']).next()?.trim_end_matches('/');
    let mut parts = path.split('/').filter(|seg| !seg.is_empty());
    let name = parts.next()?.to_owned();
    let version = parts.next().map(|v| v.to_owned());
    Some((name, version))
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// PyPI API types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Top-level shape of `pypi.org/pypi/{name}/json`.
#[derive(Deserialize)]
struct PypiResponse {
    info: Info,
    // Release files keyed by version string.
    releases: Option<std::collections::BTreeMap<String, Vec<File>>>,
}

/// The `info` object — package-level metadata. Every field is optional so
/// a sparse document never fails the parse.
#[derive(Deserialize)]
struct Info {
    name: Option<String>,
    version: Option<String>,
    summary: Option<String>,
    home_page: Option<String>,
    license: Option<String>,
    author: Option<String>,
    author_email: Option<String>,
    maintainer: Option<String>,
    requires_python: Option<String>,
    // Dependency specifier strings.
    requires_dist: Option<Vec<String>>,
    keywords: Option<String>,
    classifiers: Option<Vec<String>>,
    yanked: Option<bool>,
    yanked_reason: Option<String>,
    project_urls: Option<std::collections::BTreeMap<String, String>>,
}

/// A single release artifact; only the upload timestamp is used
/// (for `latest_release_date`).
#[derive(Deserialize)]
struct File {
    upload_time: Option<String>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_project_urls() {
        for ok in [
            "https://pypi.org/project/requests/",
            "https://pypi.org/project/numpy/1.26.0/",
        ] {
            assert!(matches(ok), "should match: {ok}");
        }
        for bad in ["https://pypi.org/", "https://example.com/project/foo"] {
            assert!(!matches(bad), "should not match: {bad}");
        }
    }

    #[test]
    fn parse_project_pulls_name_and_version() {
        let cases: [(&str, (&str, Option<&str>)); 3] = [
            ("https://pypi.org/project/requests/", ("requests", None)),
            ("https://pypi.org/project/numpy/1.26.0/", ("numpy", Some("1.26.0"))),
            ("https://pypi.org/project/scikit-learn/?foo=bar", ("scikit-learn", None)),
        ];
        for (url, (name, version)) in cases {
            assert_eq!(
                parse_project(url),
                Some((name.to_string(), version.map(str::to_string)))
            );
        }
    }
}
|
||||
234
crates/webclaw-fetch/src/extractors/reddit.rs
Normal file
234
crates/webclaw-fetch/src/extractors/reddit.rs
Normal file
|
|
@ -0,0 +1,234 @@
|
|||
//! Reddit structured extractor — returns the full post + comment tree
|
||||
//! as typed JSON via Reddit's `.json` API.
|
||||
//!
|
||||
//! The same trick the markdown extractor in `crate::reddit` uses:
|
||||
//! appending `.json` to any post URL returns the data the new SPA
|
||||
//! frontend would load client-side. Zero antibot, zero JS rendering.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry surfaced by `GET /v1/extractors`. `url_patterns` is
/// user-facing documentation only; the matching logic lives in `matches()`.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "reddit",
    label: "Reddit thread",
    description: "Returns post + nested comment tree with scores, authors, and timestamps.",
    url_patterns: &[
        "https://www.reddit.com/r/*/comments/*",
        "https://reddit.com/r/*/comments/*",
        "https://old.reddit.com/r/*/comments/*",
    ],
};
|
||||
|
||||
/// True when `url` is a comment-thread URL on a known Reddit host.
pub fn matches(url: &str) -> bool {
    // Host extraction inlined: drop the scheme, take up to the first '/'.
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    let reddit_host = matches!(
        host,
        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
    );
    reddit_host && url.contains("/comments/")
}
|
||||
|
||||
/// Fetch a Reddit thread and return the post plus its full comment tree
/// as typed JSON.
///
/// Appends `.json` to the post URL (see `build_json_url`) and parses the
/// two-listing response: listing 0 holds the post, listing 1 the
/// comments. Errors: `FetchError::Build` on non-200 statuses,
/// `FetchError::BodyDecode` on malformed or empty responses.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let json_url = build_json_url(url);
    let resp = client.fetch(&json_url).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "reddit api returned status {}",
            resp.status
        )));
    }

    let listings: Vec<Listing> = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?;

    if listings.is_empty() {
        return Err(FetchError::BodyDecode("reddit response empty".into()));
    }

    // First listing = the post (single t3 child). A missing or non-t3
    // first child degrades to `"post": null` rather than erroring.
    let post = listings
        .first()
        .and_then(|l| l.data.children.first())
        .filter(|t| t.kind == "t3")
        .map(|t| post_json(&t.data))
        .unwrap_or(Value::Null);

    // Second listing = the comment tree.
    let comments: Vec<Value> = listings
        .get(1)
        .map(|l| l.data.children.iter().filter_map(comment_json).collect())
        .unwrap_or_default();

    Ok(json!({
        "url": url,
        "post": post,
        "comments": comments,
    }))
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// JSON shapers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Shape a post (`t3`) thing into the documented JSON fields.
fn post_json(d: &ThingData) -> Value {
    json!({
        "id": d.id,
        "title": d.title,
        "author": d.author,
        "subreddit": d.subreddit_name_prefixed,
        // Reddit's permalink field is site-relative; make it absolute.
        "permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
        "url": d.url_overridden_by_dest,
        "is_self": d.is_self,
        "selftext": d.selftext,
        "score": d.score,
        "upvote_ratio": d.upvote_ratio,
        "num_comments": d.num_comments,
        "created_utc": d.created_utc,
        "link_flair_text": d.link_flair_text,
        "over_18": d.over_18,
        "spoiler": d.spoiler,
        "stickied": d.stickied,
        "locked": d.locked,
    })
}
|
||||
|
||||
/// Render a single comment + its reply tree. Returns `None` for non-t1
/// kinds (the trailing `more` placeholder Reddit injects at depth limits).
fn comment_json(thing: &Thing) -> Option<Value> {
    if thing.kind != "t1" {
        return None;
    }
    let d = &thing.data;
    // Recurse into replies when they deserialize as a listing; the
    // string form (see `Replies::Empty`) yields no children.
    let replies: Vec<Value> = match &d.replies {
        Some(Replies::Listing(l)) => l.data.children.iter().filter_map(comment_json).collect(),
        _ => Vec::new(),
    };
    Some(json!({
        "id": d.id,
        "author": d.author,
        "body": d.body,
        "score": d.score,
        "created_utc": d.created_utc,
        "is_submitter": d.is_submitter,
        "stickied": d.stickied,
        "depth": d.depth,
        // Site-relative permalink made absolute.
        "permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
        "replies": replies,
    }))
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// URL helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Best-effort host extraction: strip the scheme if present, then keep
/// everything before the first `/`.
fn host_of(url: &str) -> &str {
    let remainder = url.split("://").nth(1).unwrap_or(url);
    match remainder.find('/') {
        Some(pos) => &remainder[..pos],
        None => remainder,
    }
}
|
||||
|
||||
/// Build the Reddit JSON URL. We keep the original host (`www.reddit.com`
/// or `old.reddit.com` as the caller gave us). Routing through
/// `old.reddit.com` unconditionally looks appealing but that host has
/// stricter UA-based blocking than `www.reddit.com`, while the main
/// host accepts our Chrome-fingerprinted client fine.
fn build_json_url(url: &str) -> String {
    // Strip BOTH the query string and any fragment before appending
    // `.json` — a trailing `#fragment` (previously kept) would end up in
    // the middle of the request path and break the API call.
    let clean = url
        .split(['?', '#'])
        .next()
        .unwrap_or(url)
        .trim_end_matches('/');
    // raw_json=1 asks Reddit not to HTML-escape body text in the response.
    format!("{clean}.json?raw_json=1")
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reddit JSON types — only fields we render. Everything else is dropped.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Top-level Reddit listing wrapper (`{"kind": "Listing", "data": ...}`).
#[derive(Deserialize)]
struct Listing {
    data: ListingData,
}

#[derive(Deserialize)]
struct ListingData {
    children: Vec<Thing>,
}

/// A tagged Reddit object: `kind` is `"t3"` for posts and `"t1"` for
/// comments; other kinds (e.g. `"more"`) are skipped by the shapers.
#[derive(Deserialize)]
struct Thing {
    kind: String,
    data: ThingData,
}

/// Union of the post and comment fields we render. Everything is optional
/// because posts and comments populate different subsets.
#[derive(Deserialize, Default)]
struct ThingData {
    // post (t3)
    id: Option<String>,
    title: Option<String>,
    selftext: Option<String>,
    subreddit_name_prefixed: Option<String>,
    url_overridden_by_dest: Option<String>,
    is_self: Option<bool>,
    upvote_ratio: Option<f64>,
    num_comments: Option<i64>,
    over_18: Option<bool>,
    spoiler: Option<bool>,
    stickied: Option<bool>,
    locked: Option<bool>,
    link_flair_text: Option<String>,

    // comment (t1)
    author: Option<String>,
    body: Option<String>,
    score: Option<i64>,
    created_utc: Option<f64>,
    is_submitter: Option<bool>,
    depth: Option<i64>,
    permalink: Option<String>,

    // recursive
    replies: Option<Replies>,
}

/// `replies` is sometimes a nested listing and sometimes a bare string
/// (presumably `""` for "no replies" — TODO confirm against live data);
/// the untagged enum accepts both shapes without failing the parse.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
    Listing(Listing),
    #[allow(dead_code)]
    Empty(String),
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_reddit_post_urls() {
        for ok in [
            "https://www.reddit.com/r/rust/comments/abc123/some_title/",
            "https://reddit.com/r/rust/comments/abc123/some_title",
            "https://old.reddit.com/r/rust/comments/abc123/x/",
        ] {
            assert!(matches(ok), "should match: {ok}");
        }
    }

    #[test]
    fn rejects_non_post_reddit_urls() {
        for bad in [
            "https://www.reddit.com/r/rust",
            "https://www.reddit.com/user/foo",
            "https://example.com/r/rust/comments/x",
        ] {
            assert!(!matches(bad), "should not match: {bad}");
        }
    }

    #[test]
    fn json_url_appends_suffix_and_drops_query() {
        assert_eq!(
            build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"),
            "https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
        );
    }
}
|
||||
|
|
@ -6,6 +6,7 @@ pub mod client;
|
|||
pub mod crawler;
|
||||
pub mod document;
|
||||
pub mod error;
|
||||
pub mod extractors;
|
||||
pub mod linkedin;
|
||||
pub mod proxy;
|
||||
pub mod reddit;
|
||||
|
|
|
|||
|
|
@ -79,10 +79,15 @@ async fn main() -> anyhow::Result<()> {
|
|||
|
||||
let v1 = Router::new()
|
||||
.route("/scrape", post(routes::scrape::scrape))
|
||||
.route(
|
||||
"/scrape/{vertical}",
|
||||
post(routes::structured::scrape_vertical),
|
||||
)
|
||||
.route("/crawl", post(routes::crawl::crawl))
|
||||
.route("/map", post(routes::map::map))
|
||||
.route("/batch", post(routes::batch::batch))
|
||||
.route("/extract", post(routes::extract::extract))
|
||||
.route("/extractors", get(routes::structured::list_extractors))
|
||||
.route("/summarize", post(routes::summarize::summarize_route))
|
||||
.route("/diff", post(routes::diff::diff_route))
|
||||
.route("/brand", post(routes::brand::brand))
|
||||
|
|
|
|||
|
|
@ -15,4 +15,5 @@ pub mod extract;
|
|||
pub mod health;
|
||||
pub mod map;
|
||||
pub mod scrape;
|
||||
pub mod structured;
|
||||
pub mod summarize;
|
||||
|
|
|
|||
55
crates/webclaw-server/src/routes/structured.rs
Normal file
55
crates/webclaw-server/src/routes/structured.rs
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
//! `POST /v1/scrape/{vertical}` and `GET /v1/extractors`.
|
||||
//!
|
||||
//! Vertical extractors return typed JSON instead of generic markdown.
|
||||
//! See `webclaw_fetch::extractors` for the catalog and per-site logic.
|
||||
|
||||
use axum::{
|
||||
Json,
|
||||
extract::{Path, State},
|
||||
};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use webclaw_fetch::extractors::{self, ExtractorDispatchError};
|
||||
|
||||
use crate::{error::ApiError, state::AppState};
|
||||
|
||||
/// Request body for `POST /v1/scrape/{vertical}`.
#[derive(Debug, Deserialize)]
pub struct ScrapeRequest {
    // Target URL; the dispatcher rejects it if it doesn't match the
    // requested vertical's URL patterns.
    pub url: String,
}
|
||||
|
||||
/// Map dispatcher errors to ApiError so users get clean HTTP statuses
/// instead of opaque 500s.
impl From<ExtractorDispatchError> for ApiError {
    fn from(e: ExtractorDispatchError) -> Self {
        match e {
            // Unknown vertical name in the path → 404.
            ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
            // URL doesn't match the requested vertical → 400 with the
            // dispatcher's explanatory message.
            ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
            ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()),
        }
    }
}
|
||||
|
||||
/// `GET /v1/extractors` — catalog of all available verticals.
|
||||
pub async fn list_extractors() -> Json<Value> {
|
||||
Json(json!({
|
||||
"extractors": extractors::list(),
|
||||
}))
|
||||
}
|
||||
|
||||
/// `POST /v1/scrape/{vertical}` — explicit vertical, e.g. /v1/scrape/reddit.
///
/// Validates that `url` is non-empty, then hands off to the extractor
/// dispatcher, which also verifies the URL matches the vertical before
/// running. Dispatcher errors convert into HTTP statuses via
/// `From<ExtractorDispatchError> for ApiError`.
pub async fn scrape_vertical(
    State(state): State<AppState>,
    Path(vertical): Path<String>,
    Json(req): Json<ScrapeRequest>,
) -> Result<Json<Value>, ApiError> {
    if req.url.trim().is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }
    let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?;
    Ok(Json(json!({
        "vertical": vertical,
        "url": req.url,
        "data": data,
    })))
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue