mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-25 03:08:06 +02:00
Adds `webclaw_fetch::Fetcher` trait. All 28 vertical extractors now
take `client: &dyn Fetcher` instead of `client: &FetchClient` directly.
Backwards-compatible: FetchClient implements Fetcher, blanket impls
cover `&T` and `Arc<T>`, so existing CLI / MCP / self-hosted-server
callers keep working unchanged.
Motivation: the production API server (api.webclaw.io) must not do
in-process TLS fingerprinting; it delegates all HTTP to the Go
tls-sidecar. Before this trait, exposing /v1/scrape/{vertical} on
production would have required importing wreq into the server's
dep graph, violating the CLAUDE.md rule. Now production can provide
its own TlsSidecarFetcher implementation and pass it to the same
dispatcher the OSS server uses.
Changes:
- New `crates/webclaw-fetch/src/fetcher.rs` defining the trait plus
blanket impls for `&T` and `Arc<T>`.
- `FetchClient` gains a tiny impl block in client.rs that forwards to
its existing public methods.
- All 28 extractor signatures migrated from `&FetchClient` to
`&dyn Fetcher` (sed-driven bulk rewrite, no semantic change).
- `cloud::smart_fetch` and `cloud::smart_fetch_html` take `&dyn Fetcher`.
- `extractors::dispatch_by_url` and `extractors::dispatch_by_name`
take `&dyn Fetcher`.
- `async-trait 0.1` added to webclaw-fetch deps (Rust 1.75+ has
native async-fn-in-trait but dyn dispatch still needs async_trait).
- Version bumped to 0.5.1, CHANGELOG updated.
Tests: 215 passing in webclaw-fetch (no new tests needed — the existing
extractor tests exercise the trait methods transparently).
Clippy: clean workspace-wide.
223 lines
7.1 KiB
Rust
223 lines
7.1 KiB
Rust
//! HuggingFace model card structured extractor.
|
|
//!
|
|
//! Uses the public model API at `huggingface.co/api/models/{owner}/{name}`.
|
|
//! Returns metadata + the parsed model card front matter, but does not
|
|
//! pull the full README body — those are sometimes 100KB+ and the user
|
|
//! can hit /v1/scrape if they want it as markdown.
|
|
|
|
use serde::Deserialize;
|
|
use serde_json::{Value, json};
|
|
|
|
use super::ExtractorInfo;
|
|
use crate::error::FetchError;
|
|
use crate::fetcher::Fetcher;
|
|
|
|
pub const INFO: ExtractorInfo = ExtractorInfo {
|
|
name: "huggingface_model",
|
|
label: "HuggingFace model",
|
|
description: "Returns model metadata: downloads, likes, license, pipeline tag, library name, file list.",
|
|
url_patterns: &["https://huggingface.co/{owner}/{name}"],
|
|
};
|
|
|
|
pub fn matches(url: &str) -> bool {
|
|
let host = host_of(url);
|
|
if host != "huggingface.co" && host != "www.huggingface.co" {
|
|
return false;
|
|
}
|
|
let path = url
|
|
.split("://")
|
|
.nth(1)
|
|
.and_then(|s| s.split_once('/'))
|
|
.map(|(_, p)| p)
|
|
.unwrap_or("");
|
|
let stripped = path
|
|
.split(['?', '#'])
|
|
.next()
|
|
.unwrap_or("")
|
|
.trim_end_matches('/');
|
|
let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
|
|
// /{owner}/{name} but reject HF-internal sections + sub-pages.
|
|
if segs.len() != 2 {
|
|
return false;
|
|
}
|
|
!RESERVED_NAMESPACES.contains(&segs[0])
|
|
}
|
|
|
|
const RESERVED_NAMESPACES: &[&str] = &[
|
|
"datasets",
|
|
"spaces",
|
|
"blog",
|
|
"docs",
|
|
"api",
|
|
"models",
|
|
"papers",
|
|
"pricing",
|
|
"tasks",
|
|
"join",
|
|
"login",
|
|
"settings",
|
|
"organizations",
|
|
"new",
|
|
"search",
|
|
];
|
|
|
|
pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchError> {
|
|
let (owner, name) = parse_owner_name(url).ok_or_else(|| {
|
|
FetchError::Build(format!("hf model: cannot parse owner/name from '{url}'"))
|
|
})?;
|
|
|
|
let api_url = format!("https://huggingface.co/api/models/{owner}/{name}");
|
|
let resp = client.fetch(&api_url).await?;
|
|
if resp.status == 404 {
|
|
return Err(FetchError::Build(format!(
|
|
"hf model: '{owner}/{name}' not found"
|
|
)));
|
|
}
|
|
if resp.status == 401 {
|
|
return Err(FetchError::Build(format!(
|
|
"hf model: '{owner}/{name}' requires authentication (gated repo)"
|
|
)));
|
|
}
|
|
if resp.status != 200 {
|
|
return Err(FetchError::Build(format!(
|
|
"hf api returned status {}",
|
|
resp.status
|
|
)));
|
|
}
|
|
|
|
let m: ModelInfo = serde_json::from_str(&resp.html)
|
|
.map_err(|e| FetchError::BodyDecode(format!("hf api parse: {e}")))?;
|
|
|
|
// Surface a flat file list — full siblings can be hundreds of entries
|
|
// for big repos. We keep it as-is because callers want to know about
|
|
// every shard; if it bloats responses too much we'll add pagination.
|
|
let files: Vec<Value> = m
|
|
.siblings
|
|
.iter()
|
|
.map(|s| json!({"rfilename": s.rfilename, "size": s.size}))
|
|
.collect();
|
|
|
|
Ok(json!({
|
|
"url": url,
|
|
"id": m.id,
|
|
"model_id": m.model_id,
|
|
"private": m.private,
|
|
"gated": m.gated,
|
|
"downloads": m.downloads,
|
|
"downloads_30d": m.downloads_all_time,
|
|
"likes": m.likes,
|
|
"library_name": m.library_name,
|
|
"pipeline_tag": m.pipeline_tag,
|
|
"tags": m.tags,
|
|
"license": m.card_data.as_ref().and_then(|c| c.license.clone()),
|
|
"language": m.card_data.as_ref().and_then(|c| c.language.clone()),
|
|
"datasets": m.card_data.as_ref().and_then(|c| c.datasets.clone()),
|
|
"base_model": m.card_data.as_ref().and_then(|c| c.base_model.clone()),
|
|
"model_type": m.card_data.as_ref().and_then(|c| c.model_type.clone()),
|
|
"created_at": m.created_at,
|
|
"last_modified": m.last_modified,
|
|
"sha": m.sha,
|
|
"file_count": m.siblings.len(),
|
|
"files": files,
|
|
}))
|
|
}
|
|
|
|
fn host_of(url: &str) -> &str {
|
|
url.split("://")
|
|
.nth(1)
|
|
.unwrap_or(url)
|
|
.split('/')
|
|
.next()
|
|
.unwrap_or("")
|
|
}
|
|
|
|
fn parse_owner_name(url: &str) -> Option<(String, String)> {
|
|
let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
|
|
let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
|
|
let mut segs = stripped.split('/').filter(|s| !s.is_empty());
|
|
let owner = segs.next()?.to_string();
|
|
let name = segs.next()?.to_string();
|
|
Some((owner, name))
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// HF API types
|
|
// ---------------------------------------------------------------------------
|
|
|
|
#[derive(Deserialize)]
|
|
struct ModelInfo {
|
|
id: Option<String>,
|
|
#[serde(rename = "modelId")]
|
|
model_id: Option<String>,
|
|
private: Option<bool>,
|
|
gated: Option<serde_json::Value>, // bool or string ("auto" / "manual" / false)
|
|
downloads: Option<i64>,
|
|
#[serde(rename = "downloadsAllTime")]
|
|
downloads_all_time: Option<i64>,
|
|
likes: Option<i64>,
|
|
#[serde(rename = "library_name")]
|
|
library_name: Option<String>,
|
|
#[serde(rename = "pipeline_tag")]
|
|
pipeline_tag: Option<String>,
|
|
#[serde(default)]
|
|
tags: Vec<String>,
|
|
#[serde(rename = "createdAt")]
|
|
created_at: Option<String>,
|
|
#[serde(rename = "lastModified")]
|
|
last_modified: Option<String>,
|
|
sha: Option<String>,
|
|
#[serde(rename = "cardData")]
|
|
card_data: Option<CardData>,
|
|
#[serde(default)]
|
|
siblings: Vec<Sibling>,
|
|
}
|
|
|
|
#[derive(Deserialize)]
|
|
struct CardData {
|
|
license: Option<serde_json::Value>, // string or array
|
|
language: Option<serde_json::Value>,
|
|
datasets: Option<serde_json::Value>,
|
|
#[serde(rename = "base_model")]
|
|
base_model: Option<serde_json::Value>,
|
|
#[serde(rename = "model_type")]
|
|
model_type: Option<String>,
|
|
}
|
|
|
|
#[derive(Deserialize)]
|
|
struct Sibling {
|
|
rfilename: String,
|
|
size: Option<i64>,
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn matches_model_pages() {
|
|
assert!(matches("https://huggingface.co/meta-llama/Meta-Llama-3-8B"));
|
|
assert!(matches("https://huggingface.co/openai/whisper-large-v3"));
|
|
assert!(matches("https://huggingface.co/bert-base-uncased/main")); // owner=bert-base-uncased name=main: false positive but acceptable for v1
|
|
}
|
|
|
|
#[test]
|
|
fn rejects_hf_section_pages() {
|
|
assert!(!matches("https://huggingface.co/datasets/squad"));
|
|
assert!(!matches("https://huggingface.co/spaces/foo/bar"));
|
|
assert!(!matches("https://huggingface.co/blog/intro"));
|
|
assert!(!matches("https://huggingface.co/"));
|
|
assert!(!matches("https://huggingface.co/meta-llama"));
|
|
}
|
|
|
|
#[test]
|
|
fn parse_owner_name_pulls_both() {
|
|
assert_eq!(
|
|
parse_owner_name("https://huggingface.co/meta-llama/Meta-Llama-3-8B"),
|
|
Some(("meta-llama".into(), "Meta-Llama-3-8B".into()))
|
|
);
|
|
assert_eq!(
|
|
parse_owner_name("https://huggingface.co/openai/whisper-large-v3?library=transformers"),
|
|
Some(("openai".into(), "whisper-large-v3".into()))
|
|
);
|
|
}
|
|
}
|