mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat(extractors): wave 2 — 8 more verticals (14 total)
Adds 8 more vertical extractors using public JSON APIs. All hit
deterministic endpoints with no antibot risk. Live tests pass
against canonical URLs for each.
AI / ML ecosystem (3):
- crates_io → crates.io/api/v1/crates/{name}
- huggingface_dataset → huggingface.co/api/datasets/{path} (handles both
legacy /datasets/{name} and canonical {owner}/{name})
- arxiv → export.arxiv.org/api/query (Atom XML parsed by quick-xml)
Code / version control (2):
- github_pr → api.github.com/repos/{owner}/{repo}/pulls/{number}
- github_release → api.github.com/repos/{owner}/{repo}/releases/tags/{tag}
Infrastructure (1):
- docker_hub → hub.docker.com/v2/repositories/{namespace}/{name}
(official-image shorthand /_/nginx normalized to library/nginx)
Community / publishing (2):
- dev_to → dev.to/api/articles/{username}/{slug}
- stackoverflow → api.stackexchange.com/2.3/questions/{id} + answers,
filter=withbody for rendered HTML, sort=votes for
consistent top-answers ordering
Live test results (real URLs):
- serde: 942M downloads, 838B response
- 'Attention Is All You Need': abstract + authors, 1.8KB
- nginx official: 12.9B pulls, 21k stars, 17KB
- openai/gsm8k: 822k downloads, 1.7KB
- rust-lang/rust#138000: merged by RalfJung, +3/-2, 1KB
- webclaw v0.4.0: 2.4KB
- a real dev.to article: 2.2KB body, 3.1KB total
- python yield Q&A: score 13133, 51 answers, 104KB
Catalog now exposes 14 extractors via GET /v1/extractors. Total
unit tests across the module: 34 passing. Clippy clean. Fmt clean.
Marketing positioning sharpens: 14 dedicated extractors, all
deterministic, all 1-credit-per-call. Firecrawl's /extract is
5 credits per call and you write the schema yourself.
This commit is contained in:
parent
86182ef28a
commit
b041f3cddd
9 changed files with 1710 additions and 0 deletions
314
crates/webclaw-fetch/src/extractors/arxiv.rs
Normal file
314
crates/webclaw-fetch/src/extractors/arxiv.rs
Normal file
|
|
@ -0,0 +1,314 @@
|
|||
//! ArXiv paper structured extractor.
|
||||
//!
|
||||
//! Uses the public ArXiv API at `export.arxiv.org/api/query?id_list={id}`
|
||||
//! which returns Atom XML. We parse just enough to surface title, authors,
|
||||
//! abstract, categories, and the canonical PDF link. No HTML scraping
|
||||
//! required and no auth.
|
||||
|
||||
use quick_xml::Reader;
|
||||
use quick_xml::events::Event;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry describing this extractor (name, human label, and the
/// URL shapes it handles).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "arxiv",
    label: "ArXiv paper",
    description: "Returns paper metadata: title, authors, abstract, categories, primary category, PDF URL.",
    url_patterns: &[
        "https://arxiv.org/abs/{id}",
        "https://arxiv.org/abs/{id}v{n}",
        "https://arxiv.org/pdf/{id}",
    ],
};
|
||||
|
||||
/// True when `url` is an arxiv.org abstract or PDF page.
pub fn matches(url: &str) -> bool {
    // Host = everything between "://" and the first '/'.
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    (host == "arxiv.org" || host == "www.arxiv.org")
        && (url.contains("/abs/") || url.contains("/pdf/"))
}
|
||||
|
||||
/// Fetch structured metadata for an arxiv paper URL.
///
/// Resolves the paper id from `url`, queries the public export API in a
/// single round-trip, and returns a JSON object with title, authors,
/// abstract, categories, and links.
///
/// # Errors
/// - `FetchError::Build` when no id can be parsed from `url` or the API
///   returns a non-200 status.
/// - `FetchError::BodyDecode` when the Atom body has no `<entry>` or the
///   entry carries neither title nor abstract (withdrawn / invalid id).
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let id = parse_id(url)
        .ok_or_else(|| FetchError::Build(format!("arxiv: cannot parse id from '{url}'")))?;

    // The export API answers Atom XML for a comma-separated id list; we
    // always pass exactly one id.
    let api_url = format!("https://export.arxiv.org/api/query?id_list={id}");
    let resp = client.fetch(&api_url).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "arxiv api returned status {}",
            resp.status
        )));
    }

    let entry = parse_atom_entry(&resp.html)
        .ok_or_else(|| FetchError::BodyDecode("arxiv: no <entry> in response".into()))?;
    // A 200 response can still carry an empty entry; treat that as an error
    // rather than returning a shell of nulls.
    if entry.title.is_none() && entry.summary.is_none() {
        return Err(FetchError::BodyDecode(format!(
            "arxiv: paper '{id}' returned empty entry (likely withdrawn or invalid id)"
        )));
    }

    Ok(json!({
        "url": url,
        "id": id,
        "arxiv_id": entry.id,
        "title": entry.title,
        "authors": entry.authors,
        // The feed hard-wraps the abstract; collapse runs of whitespace.
        "abstract": entry.summary.map(|s| collapse_whitespace(&s)),
        "published": entry.published,
        "updated": entry.updated,
        "primary_category": entry.primary_category,
        "categories": entry.categories,
        "doi": entry.doi,
        "comment": entry.comment,
        "pdf_url": entry.pdf_url,
        "abs_url": entry.abs_url,
    }))
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Host portion of `url`: the text between "://" and the first '/'.
/// When the string has no scheme, the whole input is treated as the
/// authority part.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    let _scheme = pieces.next();
    let authority = pieces.next().unwrap_or(url);
    authority.split('/').next().unwrap_or("")
}
|
||||
|
||||
/// Parse an arxiv id from a URL. Strips the version suffix (`v2`, `v3`)
/// and the `.pdf` extension when present.
///
/// Returns `None` when the URL has no `/abs/` or `/pdf/` segment or the
/// id would be empty.
fn parse_id(url: &str) -> Option<String> {
    let after = url
        .split("/abs/")
        .nth(1)
        .or_else(|| url.split("/pdf/").nth(1))?;
    // Drop query/fragment and any trailing slash, then a single ".pdf"
    // extension (strip_suffix removes at most one occurrence, unlike
    // trim_end_matches which would also eat "x.pdf.pdf" fully).
    let stripped = after.split(['?', '#']).next()?.trim_end_matches('/');
    let stripped = stripped.strip_suffix(".pdf").unwrap_or(stripped);
    // Strip an optional version suffix, e.g. "2401.12345v2" → "2401.12345".
    // Require at least one digit after the 'v': `all()` on an empty
    // iterator is vacuously true, so without the emptiness check a bare
    // trailing 'v' would be silently removed.
    let no_version = match stripped.rfind('v') {
        Some(i)
            if !stripped[i + 1..].is_empty()
                && stripped[i + 1..].chars().all(|c| c.is_ascii_digit()) =>
        {
            &stripped[..i]
        }
        _ => stripped,
    };
    if no_version.is_empty() {
        None
    } else {
        Some(no_version.to_string())
    }
}
|
||||
|
||||
/// Normalize all runs of whitespace (spaces, tabs, newlines) to single
/// spaces and drop leading/trailing whitespace.
fn collapse_whitespace(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
|
||||
|
||||
/// Fields collected from the first `<entry>` of an ArXiv Atom feed.
///
/// Everything is optional or defaultable because a 200 response can
/// still carry an empty entry (withdrawn or invalid id).
#[derive(Default)]
struct AtomEntry {
    // <id> inside the entry: the canonical arxiv URL for the paper.
    id: Option<String>,
    title: Option<String>,
    // Raw abstract text; whitespace is collapsed later in `extract`.
    summary: Option<String>,
    published: Option<String>,
    updated: Option<String>,
    // First <category term="..."> encountered in the entry.
    primary_category: Option<String>,
    // Every <category term="...">, including the primary.
    categories: Vec<String>,
    // <author><name> values in feed order.
    authors: Vec<String>,
    // Optional DOI / author-comment metadata.
    doi: Option<String>,
    comment: Option<String>,
    // <link type="application/pdf"> and <link rel="alternate"> hrefs.
    pdf_url: Option<String>,
    abs_url: Option<String>,
}
|
||||
|
||||
/// Parse the first `<entry>` block of an ArXiv Atom feed.
|
||||
fn parse_atom_entry(xml: &str) -> Option<AtomEntry> {
|
||||
let mut reader = Reader::from_str(xml);
|
||||
let mut buf = Vec::new();
|
||||
|
||||
// States
|
||||
let mut in_entry = false;
|
||||
let mut current: Option<&'static str> = None;
|
||||
let mut in_author = false;
|
||||
let mut in_author_name = false;
|
||||
let mut entry = AtomEntry::default();
|
||||
|
||||
loop {
|
||||
match reader.read_event_into(&mut buf) {
|
||||
Ok(Event::Start(ref e)) => {
|
||||
let local = e.local_name();
|
||||
match local.as_ref() {
|
||||
b"entry" => in_entry = true,
|
||||
b"id" if in_entry && !in_author => current = Some("id"),
|
||||
b"title" if in_entry => current = Some("title"),
|
||||
b"summary" if in_entry => current = Some("summary"),
|
||||
b"published" if in_entry => current = Some("published"),
|
||||
b"updated" if in_entry => current = Some("updated"),
|
||||
b"author" if in_entry => in_author = true,
|
||||
b"name" if in_author => {
|
||||
in_author_name = true;
|
||||
current = Some("author_name");
|
||||
}
|
||||
b"category" if in_entry => {
|
||||
// primary_category is namespaced (arxiv:primary_category)
|
||||
// category is plain. quick-xml gives us local-name only,
|
||||
// so we treat both as categories and take the first as
|
||||
// primary.
|
||||
for attr in e.attributes().flatten() {
|
||||
if attr.key.as_ref() == b"term"
|
||||
&& let Ok(v) = attr.unescape_value()
|
||||
{
|
||||
let term = v.to_string();
|
||||
if entry.primary_category.is_none() {
|
||||
entry.primary_category = Some(term.clone());
|
||||
}
|
||||
entry.categories.push(term);
|
||||
}
|
||||
}
|
||||
}
|
||||
b"link" if in_entry => {
|
||||
let mut href = None;
|
||||
let mut rel = None;
|
||||
let mut typ = None;
|
||||
for attr in e.attributes().flatten() {
|
||||
match attr.key.as_ref() {
|
||||
b"href" => href = attr.unescape_value().ok().map(|s| s.to_string()),
|
||||
b"rel" => rel = attr.unescape_value().ok().map(|s| s.to_string()),
|
||||
b"type" => typ = attr.unescape_value().ok().map(|s| s.to_string()),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
if let Some(h) = href {
|
||||
if typ.as_deref() == Some("application/pdf") {
|
||||
entry.pdf_url = Some(h.clone());
|
||||
}
|
||||
if rel.as_deref() == Some("alternate") {
|
||||
entry.abs_url = Some(h);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => current = None,
|
||||
}
|
||||
}
|
||||
Ok(Event::Empty(ref e)) => {
|
||||
// Self-closing tags (<link href="..." />). Same handling as Start.
|
||||
let local = e.local_name();
|
||||
if (local.as_ref() == b"link" || local.as_ref() == b"category") && in_entry {
|
||||
let mut href = None;
|
||||
let mut rel = None;
|
||||
let mut typ = None;
|
||||
let mut term = None;
|
||||
for attr in e.attributes().flatten() {
|
||||
match attr.key.as_ref() {
|
||||
b"href" => href = attr.unescape_value().ok().map(|s| s.to_string()),
|
||||
b"rel" => rel = attr.unescape_value().ok().map(|s| s.to_string()),
|
||||
b"type" => typ = attr.unescape_value().ok().map(|s| s.to_string()),
|
||||
b"term" => term = attr.unescape_value().ok().map(|s| s.to_string()),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
if let Some(t) = term {
|
||||
if entry.primary_category.is_none() {
|
||||
entry.primary_category = Some(t.clone());
|
||||
}
|
||||
entry.categories.push(t);
|
||||
}
|
||||
if let Some(h) = href {
|
||||
if typ.as_deref() == Some("application/pdf") {
|
||||
entry.pdf_url = Some(h.clone());
|
||||
}
|
||||
if rel.as_deref() == Some("alternate") {
|
||||
entry.abs_url = Some(h);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::Text(ref e)) => {
|
||||
if let (Some(field), Ok(text)) = (current, e.unescape()) {
|
||||
let text = text.to_string();
|
||||
match field {
|
||||
"id" => entry.id = Some(text.trim().to_string()),
|
||||
"title" => entry.title = append_text(entry.title.take(), &text),
|
||||
"summary" => entry.summary = append_text(entry.summary.take(), &text),
|
||||
"published" => entry.published = Some(text.trim().to_string()),
|
||||
"updated" => entry.updated = Some(text.trim().to_string()),
|
||||
"author_name" => entry.authors.push(text.trim().to_string()),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::End(ref e)) => {
|
||||
let local = e.local_name();
|
||||
match local.as_ref() {
|
||||
b"entry" => break,
|
||||
b"author" => in_author = false,
|
||||
b"name" => in_author_name = false,
|
||||
_ => {}
|
||||
}
|
||||
if !in_author_name {
|
||||
current = None;
|
||||
}
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
Err(_) => return None,
|
||||
_ => {}
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
if in_entry { Some(entry) } else { None }
|
||||
}
|
||||
|
||||
/// Concatenate text fragments (long fields can be split across multiple
/// text events if they contain entities or CDATA).
fn append_text(prev: Option<String>, next: &str) -> Option<String> {
    let mut combined = prev.unwrap_or_default();
    combined.push_str(next);
    Some(combined)
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // URL matching accepts abs/pdf pages on arxiv.org only.
    #[test]
    fn matches_arxiv_urls() {
        assert!(matches("https://arxiv.org/abs/2401.12345"));
        assert!(matches("https://arxiv.org/abs/2401.12345v2"));
        assert!(matches("https://arxiv.org/pdf/2401.12345.pdf"));
        assert!(!matches("https://arxiv.org/"));
        assert!(!matches("https://example.com/abs/foo"));
    }

    // Id parsing drops the version suffix and the .pdf extension.
    #[test]
    fn parse_id_strips_version_and_extension() {
        assert_eq!(
            parse_id("https://arxiv.org/abs/2401.12345"),
            Some("2401.12345".into())
        );
        assert_eq!(
            parse_id("https://arxiv.org/abs/2401.12345v3"),
            Some("2401.12345".into())
        );
        assert_eq!(
            parse_id("https://arxiv.org/pdf/2401.12345v2.pdf"),
            Some("2401.12345".into())
        );
    }

    // Runs of mixed whitespace collapse to single spaces.
    #[test]
    fn collapse_whitespace_handles_newlines_and_tabs() {
        assert_eq!(collapse_whitespace("a b\n\tc "), "a b c");
    }
}
|
||||
168
crates/webclaw-fetch/src/extractors/crates_io.rs
Normal file
168
crates/webclaw-fetch/src/extractors/crates_io.rs
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
//! crates.io structured extractor.
|
||||
//!
|
||||
//! Uses the public JSON API at `crates.io/api/v1/crates/{name}`. No
|
||||
//! auth, no rate limit at normal usage. The response includes both
|
||||
//! the crate metadata and the full version list, which we summarize
|
||||
//! down to a count + latest release info to keep the payload small.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry describing this extractor (name, human label, and the
/// URL shapes it handles).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "crates_io",
    label: "crates.io package",
    description: "Returns crate metadata: latest version, dependencies, downloads, license, repository.",
    url_patterns: &[
        "https://crates.io/crates/{name}",
        "https://crates.io/crates/{name}/{version}",
    ],
};
|
||||
|
||||
/// True when `url` is a crates.io crate page.
pub fn matches(url: &str) -> bool {
    // Host = everything between "://" and the first '/'.
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    (host == "crates.io" || host == "www.crates.io") && url.contains("/crates/")
}
|
||||
|
||||
/// Fetch structured metadata for a crates.io crate URL.
///
/// One round-trip to the public JSON API; the full version list in the
/// response is summarized to a count plus latest-release fields.
///
/// # Errors
/// - `FetchError::Build` when the name cannot be parsed, the crate is
///   missing (404), or the API returns another non-200 status.
/// - `FetchError::BodyDecode` when the JSON body fails to parse.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let name = parse_name(url)
        .ok_or_else(|| FetchError::Build(format!("crates.io: cannot parse name from '{url}'")))?;

    let api_url = format!("https://crates.io/api/v1/crates/{name}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "crates.io: crate '{name}' not found"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "crates.io api returned status {}",
            resp.status
        )));
    }

    let body: CratesResponse = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("crates.io parse: {e}")))?;

    let c = body.crate_;
    // First non-yanked entry in the API's version list, falling back to
    // the first entry of any kind when everything is yanked.
    // NOTE(review): assumes the API lists versions newest-first — confirm.
    let latest_version = body
        .versions
        .iter()
        .find(|v| !v.yanked.unwrap_or(false))
        .or_else(|| body.versions.first());

    Ok(json!({
        "url": url,
        "name": c.id,
        "description": c.description,
        "homepage": c.homepage,
        "documentation": c.documentation,
        "repository": c.repository,
        "max_stable_version": c.max_stable_version,
        "max_version": c.max_version,
        "newest_version": c.newest_version,
        "downloads": c.downloads,
        "recent_downloads": c.recent_downloads,
        "categories": c.categories,
        "keywords": c.keywords,
        "release_count": body.versions.len(),
        "latest_release_date": latest_version.and_then(|v| v.created_at.clone()),
        "latest_license": latest_version.and_then(|v| v.license.clone()),
        "latest_rust_version": latest_version.and_then(|v| v.rust_version.clone()),
        "latest_yanked": latest_version.and_then(|v| v.yanked),
        "created_at": c.created_at,
        "updated_at": c.updated_at,
    }))
}
|
||||
|
||||
/// Host portion of `url`: the text between "://" and the first '/'.
/// When the string has no scheme, the whole input is treated as the
/// authority part.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    let _scheme = pieces.next();
    let authority = pieces.next().unwrap_or(url);
    authority.split('/').next().unwrap_or("")
}
|
||||
|
||||
/// Crate name from a crates.io URL: the first non-empty path segment
/// after "/crates/", ignoring query string, fragment, and version path.
fn parse_name(url: &str) -> Option<String> {
    let after = url.split("/crates/").nth(1)?;
    let path = after.split(['?', '#']).next()?.trim_end_matches('/');
    path.split('/')
        .find(|seg| !seg.is_empty())
        .map(str::to_string)
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// crates.io API types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Top-level crates.io response; only the fields we use are declared,
/// serde ignores the rest.
#[derive(Deserialize)]
struct CratesResponse {
    // "crate" is a Rust keyword, hence the rename.
    #[serde(rename = "crate")]
    crate_: CrateInfo,
    #[serde(default)]
    versions: Vec<VersionInfo>,
}

/// Crate-level metadata. Every field is optional so the extractor
/// survives missing/changed fields in the API payload.
#[derive(Deserialize)]
struct CrateInfo {
    id: Option<String>,
    description: Option<String>,
    homepage: Option<String>,
    documentation: Option<String>,
    repository: Option<String>,
    max_stable_version: Option<String>,
    max_version: Option<String>,
    newest_version: Option<String>,
    downloads: Option<i64>,
    recent_downloads: Option<i64>,
    #[serde(default)]
    categories: Vec<String>,
    #[serde(default)]
    keywords: Vec<String>,
    created_at: Option<String>,
    updated_at: Option<String>,
}

/// Per-version fields used for the latest-release summary in `extract`.
#[derive(Deserialize)]
struct VersionInfo {
    license: Option<String>,
    rust_version: Option<String>,
    yanked: Option<bool>,
    created_at: Option<String>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Matching accepts crate pages on crates.io only.
    #[test]
    fn matches_crate_pages() {
        assert!(matches("https://crates.io/crates/serde"));
        assert!(matches("https://crates.io/crates/tokio/1.45.0"));
        assert!(!matches("https://crates.io/"));
        assert!(!matches("https://example.com/crates/foo"));
    }

    // Name parsing ignores version segments and query strings.
    #[test]
    fn parse_name_handles_versioned_urls() {
        assert_eq!(
            parse_name("https://crates.io/crates/serde"),
            Some("serde".into())
        );
        assert_eq!(
            parse_name("https://crates.io/crates/tokio/1.45.0"),
            Some("tokio".into())
        );
        assert_eq!(
            parse_name("https://crates.io/crates/scraper/?foo=bar"),
            Some("scraper".into())
        );
    }
}
|
||||
188
crates/webclaw-fetch/src/extractors/dev_to.rs
Normal file
188
crates/webclaw-fetch/src/extractors/dev_to.rs
Normal file
|
|
@ -0,0 +1,188 @@
|
|||
//! dev.to article structured extractor.
|
||||
//!
|
||||
//! `dev.to/api/articles/{username}/{slug}` returns the full article body,
|
||||
//! tags, reaction count, comment count, and reading time. Anonymous
|
||||
//! access works fine for published posts.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry describing this extractor (name, human label, and the
/// URL shape it handles).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "dev_to",
    label: "dev.to article",
    description: "Returns article metadata + body: title, body markdown, tags, reactions, comments, reading time.",
    url_patterns: &["https://dev.to/{username}/{slug}"],
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
if host != "dev.to" && host != "www.dev.to" {
|
||||
return false;
|
||||
}
|
||||
let path = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.and_then(|s| s.split_once('/'))
|
||||
.map(|(_, p)| p)
|
||||
.unwrap_or("");
|
||||
let stripped = path
|
||||
.split(['?', '#'])
|
||||
.next()
|
||||
.unwrap_or("")
|
||||
.trim_end_matches('/');
|
||||
let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
|
||||
// Need exactly /{username}/{slug}, with username starting with non-reserved.
|
||||
segs.len() == 2 && !RESERVED_FIRST_SEGS.contains(&segs[0])
|
||||
}
|
||||
|
||||
/// First path segments on dev.to that are site pages rather than
/// usernames; a URL whose first segment is listed here is never an
/// article.
const RESERVED_FIRST_SEGS: &[&str] = &[
    "api",
    "tags",
    "search",
    "settings",
    "enter",
    "signup",
    "about",
    "code-of-conduct",
    "privacy",
    "terms",
    "contact",
    "sponsorships",
    "sponsors",
    "shop",
    "videos",
    "listings",
    "podcasts",
    "p",
    "t",
];
|
||||
|
||||
/// Fetch a dev.to article (metadata + full markdown body) for a
/// `/{username}/{slug}` URL.
///
/// # Errors
/// - `FetchError::Build` when the username/slug cannot be parsed, the
///   article is missing (404), or the API returns another non-200 status.
/// - `FetchError::BodyDecode` when the JSON body fails to parse.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (username, slug) = parse_username_slug(url).ok_or_else(|| {
        FetchError::Build(format!("dev_to: cannot parse username/slug from '{url}'"))
    })?;

    let api_url = format!("https://dev.to/api/articles/{username}/{slug}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "dev_to: article '{username}/{slug}' not found"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "dev.to api returned status {}",
            resp.status
        )));
    }

    let a: Article = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("dev.to parse: {e}")))?;

    Ok(json!({
        "url": url,
        "id": a.id,
        "title": a.title,
        "description": a.description,
        "body_markdown": a.body_markdown,
        "url_canonical": a.canonical_url,
        "published_at": a.published_at,
        "edited_at": a.edited_at,
        "reading_time_min": a.reading_time_minutes,
        "tags": a.tag_list,
        "positive_reactions": a.positive_reactions_count,
        "public_reactions": a.public_reactions_count,
        "comments_count": a.comments_count,
        "page_views_count": a.page_views_count,
        "cover_image": a.cover_image,
        // Author sub-object; `user` may be absent, so each field goes
        // through as_ref()/and_then rather than unwrapping.
        "author": json!({
            "username": a.user.as_ref().and_then(|u| u.username.clone()),
            "name": a.user.as_ref().and_then(|u| u.name.clone()),
            "twitter": a.user.as_ref().and_then(|u| u.twitter_username.clone()),
            "github": a.user.as_ref().and_then(|u| u.github_username.clone()),
            "website": a.user.as_ref().and_then(|u| u.website_url.clone()),
        }),
    }))
}
|
||||
|
||||
/// Host portion of `url`: the text between "://" and the first '/'.
/// When the string has no scheme, the whole input is treated as the
/// authority part.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    let _scheme = pieces.next();
    let authority = pieces.next().unwrap_or(url);
    authority.split('/').next().unwrap_or("")
}
|
||||
|
||||
/// Split a dev.to article URL into `(username, slug)` — the first two
/// non-empty path segments, with query/fragment and trailing slash
/// removed. Returns `None` when fewer than two segments are present.
fn parse_username_slug(url: &str) -> Option<(String, String)> {
    let (_, raw_path) = url.split("://").nth(1)?.split_once('/')?;
    let path = raw_path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut segments = path.split('/').filter(|s| !s.is_empty());
    match (segments.next(), segments.next()) {
        (Some(user), Some(slug)) => Some((user.to_owned(), slug.to_owned())),
        _ => None,
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// dev.to API types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Subset of the dev.to article payload surfaced by `extract`; every
/// field is optional to tolerate schema drift.
#[derive(Deserialize)]
struct Article {
    id: Option<i64>,
    title: Option<String>,
    description: Option<String>,
    body_markdown: Option<String>,
    canonical_url: Option<String>,
    published_at: Option<String>,
    edited_at: Option<String>,
    reading_time_minutes: Option<i64>,
    tag_list: Option<serde_json::Value>, // string OR array depending on endpoint
    positive_reactions_count: Option<i64>,
    public_reactions_count: Option<i64>,
    comments_count: Option<i64>,
    page_views_count: Option<i64>,
    cover_image: Option<String>,
    user: Option<UserRef>,
}

/// Embedded author record on the article payload.
#[derive(Deserialize)]
struct UserRef {
    username: Option<String>,
    name: Option<String>,
    twitter_username: Option<String>,
    github_username: Option<String>,
    website_url: Option<String>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Only two-segment article paths on dev.to match; reserved site
    // paths and profile pages do not.
    #[test]
    fn matches_article_urls() {
        assert!(matches("https://dev.to/ben/welcome-thread"));
        assert!(matches("https://dev.to/0xmassi/some-post-1abc"));
        assert!(!matches("https://dev.to/"));
        assert!(!matches("https://dev.to/api/articles/foo/bar"));
        assert!(!matches("https://dev.to/tags/rust"));
        assert!(!matches("https://dev.to/ben")); // user profile, not article
        assert!(!matches("https://example.com/ben/post"));
    }

    // Username/slug parsing handles trailing slash and query strings.
    #[test]
    fn parse_pulls_username_and_slug() {
        assert_eq!(
            parse_username_slug("https://dev.to/ben/welcome-thread"),
            Some(("ben".into(), "welcome-thread".into()))
        );
        assert_eq!(
            parse_username_slug("https://dev.to/0xmassi/some-post-1abc/?foo=bar"),
            Some(("0xmassi".into(), "some-post-1abc".into()))
        );
    }
}
|
||||
150
crates/webclaw-fetch/src/extractors/docker_hub.rs
Normal file
150
crates/webclaw-fetch/src/extractors/docker_hub.rs
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
//! Docker Hub repository structured extractor.
|
||||
//!
|
||||
//! Uses the v2 JSON API at `hub.docker.com/v2/repositories/{namespace}/{name}`.
|
||||
//! Anonymous access is allowed for public images. The official-image
|
||||
//! shorthand (e.g. `nginx`, `redis`) is normalized to `library/{name}`.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry describing this extractor (name, human label, and the
/// URL shapes it handles).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "docker_hub",
    label: "Docker Hub repository",
    description: "Returns image metadata: pull count, star count, last_updated, official flag, description.",
    url_patterns: &[
        "https://hub.docker.com/_/{name}",
        "https://hub.docker.com/r/{namespace}/{name}",
    ],
};
|
||||
|
||||
/// True when `url` is a Docker Hub image page — either the official
/// shorthand `/_/{name}` or a personal `/r/{namespace}/{name}` repo.
pub fn matches(url: &str) -> bool {
    // Host = everything between "://" and the first '/'.
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    host == "hub.docker.com" && (url.contains("/_/") || url.contains("/r/"))
}
|
||||
|
||||
/// Fetch Docker Hub repository metadata for an image URL.
///
/// The official-image shorthand `/_/{name}` is normalized to the
/// `library/{name}` namespace before querying the v2 API.
///
/// # Errors
/// - `FetchError::Build` when the repo cannot be parsed, is missing
///   (404), or the API returns another non-200 status.
/// - `FetchError::BodyDecode` when the JSON body fails to parse.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (namespace, name) = parse_repo(url)
        .ok_or_else(|| FetchError::Build(format!("docker_hub: cannot parse repo from '{url}'")))?;

    let api_url = format!("https://hub.docker.com/v2/repositories/{namespace}/{name}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "docker_hub: repo '{namespace}/{name}' not found"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "docker_hub api returned status {}",
            resp.status
        )));
    }

    let r: RepoResponse = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("docker_hub parse: {e}")))?;

    Ok(json!({
        "url": url,
        "namespace": r.namespace,
        "name": r.name,
        "full_name": format!("{namespace}/{name}"),
        "pull_count": r.pull_count,
        "star_count": r.star_count,
        "description": r.description,
        "full_description": r.full_description,
        "last_updated": r.last_updated,
        "date_registered": r.date_registered,
        // Official images all live in the "library" namespace.
        "is_official": namespace == "library",
        "is_private": r.is_private,
        "status_description":r.status_description,
        "categories": r.categories,
    }))
}
|
||||
|
||||
/// Host portion of `url`: the text between "://" and the first '/'.
/// When the string has no scheme, the whole input is treated as the
/// authority part.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    let _scheme = pieces.next();
    let authority = pieces.next().unwrap_or(url);
    authority.split('/').next().unwrap_or("")
}
|
||||
|
||||
/// Parse `(namespace, name)` from a Docker Hub URL. The official-image
/// shorthand `/_/nginx` maps to `(library, nginx)`; personal repos
/// `/r/foo/bar` map to `(foo, bar)`.
fn parse_repo(url: &str) -> Option<(String, String)> {
    // Official-image shorthand: /_/{name} → (library, name).
    if let Some(after) = url.split("/_/").nth(1) {
        let path = after.split(['?', '#']).next()?.trim_end_matches('/');
        return path
            .split('/')
            .next()
            .filter(|seg| !seg.is_empty())
            .map(|name| ("library".to_string(), name.to_string()));
    }
    // Personal repos: /r/{namespace}/{name}.
    let after = url.split("/r/").nth(1)?;
    let path = after.split(['?', '#']).next()?.trim_end_matches('/');
    let mut segments = path.split('/').filter(|s| !s.is_empty());
    Some((segments.next()?.to_string(), segments.next()?.to_string()))
}
|
||||
|
||||
/// Subset of the Docker Hub v2 repository payload surfaced by `extract`;
/// every field optional to tolerate schema drift.
#[derive(Deserialize)]
struct RepoResponse {
    namespace: Option<String>,
    name: Option<String>,
    pull_count: Option<i64>,
    star_count: Option<i64>,
    description: Option<String>,
    full_description: Option<String>,
    last_updated: Option<String>,
    date_registered: Option<String>,
    is_private: Option<bool>,
    status_description: Option<String>,
    #[serde(default)]
    categories: Vec<DockerCategory>,
}

/// Category tag attached to a repository. Serialize is derived as well
/// because the list is re-emitted verbatim in `extract`'s json! output.
#[derive(Deserialize, serde::Serialize)]
struct DockerCategory {
    name: Option<String>,
    slug: Option<String>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Matching accepts both official-shorthand and personal repo URLs,
    // hub.docker.com host only.
    #[test]
    fn matches_docker_urls() {
        assert!(matches("https://hub.docker.com/_/nginx"));
        assert!(matches("https://hub.docker.com/r/grafana/grafana"));
        assert!(!matches("https://hub.docker.com/"));
        assert!(!matches("https://example.com/_/nginx"));
    }

    // Repo parsing normalizes /_/name to library/name and strips extra
    // path segments / query strings.
    #[test]
    fn parse_repo_handles_official_and_personal() {
        assert_eq!(
            parse_repo("https://hub.docker.com/_/nginx"),
            Some(("library".into(), "nginx".into()))
        );
        assert_eq!(
            parse_repo("https://hub.docker.com/_/nginx/tags"),
            Some(("library".into(), "nginx".into()))
        );
        assert_eq!(
            parse_repo("https://hub.docker.com/r/grafana/grafana"),
            Some(("grafana".into(), "grafana".into()))
        );
        assert_eq!(
            parse_repo("https://hub.docker.com/r/grafana/grafana/?foo=bar"),
            Some(("grafana".into(), "grafana".into()))
        );
    }
}
|
||||
189
crates/webclaw-fetch/src/extractors/github_pr.rs
Normal file
189
crates/webclaw-fetch/src/extractors/github_pr.rs
Normal file
|
|
@ -0,0 +1,189 @@
|
|||
//! GitHub pull request structured extractor.
|
||||
//!
|
||||
//! Uses `api.github.com/repos/{owner}/{repo}/pulls/{number}`. Returns
|
||||
//! the PR metadata + a counted summary of comments and review activity.
|
||||
//! Full diff and per-comment bodies require additional calls — left for
|
||||
//! a follow-up enhancement so the v1 stays one network round-trip.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry describing this extractor (name, human label, and the
/// URL shape it handles).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "github_pr",
    label: "GitHub pull request",
    description: "Returns PR metadata: title, body, state, author, labels, additions/deletions, file count.",
    url_patterns: &["https://github.com/{owner}/{repo}/pull/{number}"],
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("");
|
||||
if host != "github.com" && host != "www.github.com" {
|
||||
return false;
|
||||
}
|
||||
parse_pr(url).is_some()
|
||||
}
|
||||
|
||||
/// Fetch pull-request metadata from the GitHub REST API for a
/// `/{owner}/{repo}/pull/{number}` URL. One network round-trip; the
/// full diff and per-comment bodies are intentionally not fetched.
///
/// # Errors
/// - `FetchError::Build` when the URL cannot be parsed, the PR is
///   missing (404), the caller is rate limited (403), or the API
///   returns another non-200 status.
/// - `FetchError::BodyDecode` when the JSON body fails to parse.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (owner, repo, number) = parse_pr(url).ok_or_else(|| {
        FetchError::Build(format!("github_pr: cannot parse pull-request URL '{url}'"))
    })?;

    let api_url = format!("https://api.github.com/repos/{owner}/{repo}/pulls/{number}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "github_pr: pull request '{owner}/{repo}#{number}' not found"
        )));
    }
    // GitHub signals anonymous rate limiting with 403.
    if resp.status == 403 {
        return Err(FetchError::Build(
            "github_pr: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour.".into(),
        ));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "github api returned status {}",
            resp.status
        )));
    }

    let p: PullRequest = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("github pr parse: {e}")))?;

    Ok(json!({
        "url": url,
        "owner": owner,
        "repo": repo,
        "number": p.number,
        "title": p.title,
        "body": p.body,
        "state": p.state,
        "draft": p.draft,
        "merged": p.merged,
        "merged_at": p.merged_at,
        "merge_commit_sha": p.merge_commit_sha,
        "author": p.user.as_ref().and_then(|u| u.login.clone()),
        "labels": p.labels.iter().filter_map(|l| l.name.clone()).collect::<Vec<_>>(),
        "milestone": p.milestone.as_ref().and_then(|m| m.title.clone()),
        "head_ref": p.head.as_ref().and_then(|r| r.ref_name.clone()),
        "base_ref": p.base.as_ref().and_then(|r| r.ref_name.clone()),
        "head_sha": p.head.as_ref().and_then(|r| r.sha.clone()),
        "additions": p.additions,
        "deletions": p.deletions,
        "changed_files": p.changed_files,
        "commits": p.commits,
        "comments": p.comments,
        "review_comments":p.review_comments,
        "created_at": p.created_at,
        "updated_at": p.updated_at,
        "closed_at": p.closed_at,
        "html_url": p.html_url,
    }))
}
|
||||
|
||||
/// Parse `/{owner}/{repo}/pull/{number}` (or the `/pulls/` spelling) from a
/// URL with an explicit scheme. Query string, fragment, and trailing path
/// segments such as `/files` are ignored. `None` when the shape doesn't fit.
fn parse_pr(url: &str) -> Option<(String, String, u64)> {
    let path = url.split("://").nth(1)?.split_once('/')?.1;
    let clean = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut segs = clean.split('/').filter(|s| !s.is_empty());
    let owner = segs.next()?;
    let repo = segs.next()?;
    let kind = segs.next()?;
    if kind != "pull" && kind != "pulls" {
        return None;
    }
    let number = segs.next()?.parse::<u64>().ok()?;
    Some((owner.to_string(), repo.to_string(), number))
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// GitHub PR API types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Deserialization targets for the GitHub pull-request payload. Every field
// is Option (or a defaulted Vec) so upstream schema drift surfaces as JSON
// nulls / empty lists in our output instead of a hard parse failure.
#[derive(Deserialize)]
struct PullRequest {
    number: Option<i64>,
    title: Option<String>,
    body: Option<String>,
    state: Option<String>,
    draft: Option<bool>,
    merged: Option<bool>,
    merged_at: Option<String>,
    merge_commit_sha: Option<String>,
    user: Option<UserRef>,
    #[serde(default)]
    labels: Vec<LabelRef>,
    milestone: Option<Milestone>,
    head: Option<GitRef>,
    base: Option<GitRef>,
    additions: Option<i64>,
    deletions: Option<i64>,
    changed_files: Option<i64>,
    commits: Option<i64>,
    comments: Option<i64>,
    review_comments: Option<i64>,
    created_at: Option<String>,
    updated_at: Option<String>,
    closed_at: Option<String>,
    html_url: Option<String>,
}

// Minimal projection of a GitHub user object — only the login is used.
#[derive(Deserialize)]
struct UserRef {
    login: Option<String>,
}

// Minimal projection of a label object — only the display name is used.
#[derive(Deserialize)]
struct LabelRef {
    name: Option<String>,
}

// Minimal projection of a milestone object — only the title is used.
#[derive(Deserialize)]
struct Milestone {
    title: Option<String>,
}

// Head/base branch reference. `ref` is a Rust keyword, hence the rename.
#[derive(Deserialize)]
struct GitRef {
    #[serde(rename = "ref")]
    ref_name: Option<String>,
    sha: Option<String>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Matching accepts /pull/{n} pages (including sub-pages like /files) and
    // rejects repo roots, issue pages, and bare org pages.
    #[test]
    fn matches_pr_urls() {
        assert!(matches("https://github.com/rust-lang/rust/pull/12345"));
        assert!(matches(
            "https://github.com/rust-lang/rust/pull/12345/files"
        ));
        assert!(!matches("https://github.com/rust-lang/rust"));
        assert!(!matches("https://github.com/rust-lang/rust/issues/100"));
        assert!(!matches("https://github.com/rust-lang"));
    }

    // Parsing ignores trailing path segments after the PR number.
    #[test]
    fn parse_pr_extracts_owner_repo_number() {
        assert_eq!(
            parse_pr("https://github.com/rust-lang/rust/pull/12345"),
            Some(("rust-lang".into(), "rust".into(), 12345))
        );
        assert_eq!(
            parse_pr("https://github.com/rust-lang/rust/pull/12345/files"),
            Some(("rust-lang".into(), "rust".into(), 12345))
        );
    }
}
|
||||
179
crates/webclaw-fetch/src/extractors/github_release.rs
Normal file
179
crates/webclaw-fetch/src/extractors/github_release.rs
Normal file
|
|
@ -0,0 +1,179 @@
|
|||
//! GitHub release structured extractor.
|
||||
//!
|
||||
//! `api.github.com/repos/{owner}/{repo}/releases/tags/{tag}`. Returns
|
||||
//! the release notes body, asset list with download counts, and
|
||||
//! prerelease flag.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry for the GitHub release extractor, surfaced via the
/// extractor listing endpoint.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "github_release",
    label: "GitHub release",
    description: "Returns release metadata: tag, name, body (release notes), assets with download counts.",
    url_patterns: &["https://github.com/{owner}/{repo}/releases/tag/{tag}"],
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("");
|
||||
if host != "github.com" && host != "www.github.com" {
|
||||
return false;
|
||||
}
|
||||
parse_release(url).is_some()
|
||||
}
|
||||
|
||||
/// Fetch release metadata from the GitHub REST API and flatten it into a
/// single JSON object, including the full asset list and a pre-summed
/// `total_downloads`. One network round-trip:
/// `GET api.github.com/repos/{owner}/{repo}/releases/tags/{tag}`.
///
/// Errors: `FetchError::Build` for unparseable URLs, 404 (missing release),
/// 403 (rate limit) and any other non-200 status; `FetchError::BodyDecode`
/// when the response body is not the expected JSON shape.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (owner, repo, tag) = parse_release(url).ok_or_else(|| {
        FetchError::Build(format!("github_release: cannot parse release URL '{url}'"))
    })?;

    let api_url = format!("https://api.github.com/repos/{owner}/{repo}/releases/tags/{tag}");
    let resp = client.fetch(&api_url).await?;
    // 404 and 403 get dedicated messages; on this endpoint 403 is almost
    // always the anonymous rate limit rather than a permissions problem.
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "github_release: release '{owner}/{repo}@{tag}' not found"
        )));
    }
    if resp.status == 403 {
        return Err(FetchError::Build(
            "github_release: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour."
                .into(),
        ));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "github api returned status {}",
            resp.status
        )));
    }

    let r: Release = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("github release parse: {e}")))?;

    // Project each asset down to the fields callers care about.
    let assets: Vec<Value> = r
        .assets
        .iter()
        .map(|a| {
            json!({
                "name": a.name,
                "size": a.size,
                "download_count": a.download_count,
                "browser_download_url": a.browser_download_url,
                "content_type": a.content_type,
                "created_at": a.created_at,
                "updated_at": a.updated_at,
            })
        })
        .collect();

    Ok(json!({
        "url": url,
        "owner": owner,
        "repo": repo,
        "tag_name": r.tag_name,
        "name": r.name,
        "body": r.body,
        "draft": r.draft,
        "prerelease": r.prerelease,
        "author": r.author.as_ref().and_then(|u| u.login.clone()),
        "created_at": r.created_at,
        "published_at": r.published_at,
        "asset_count": assets.len(),
        // Missing per-asset counts are treated as zero in the total.
        "total_downloads": r.assets.iter().map(|a| a.download_count.unwrap_or(0)).sum::<i64>(),
        "assets": assets,
        "html_url": r.html_url,
    }))
}
|
||||
|
||||
/// Parse `/{owner}/{repo}/releases/tag/{tag}` from a URL with an explicit
/// scheme. Query string, fragment, and trailing slashes are stripped first;
/// anything after the tag segment is ignored. Returns `None` when the path
/// doesn't fit that shape.
fn parse_release(url: &str) -> Option<(String, String, String)> {
    let path = url.split("://").nth(1)?.split_once('/')?.1;
    let clean = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut segs = clean.split('/').filter(|s| !s.is_empty());
    let owner = segs.next()?.to_string();
    let repo = segs.next()?.to_string();
    if segs.next()? != "releases" || segs.next()? != "tag" {
        return None;
    }
    let tag = segs.next()?.to_string();
    Some((owner, repo, tag))
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// GitHub Release API types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Deserialization targets for the GitHub release payload. Every field is
// Option (or a defaulted Vec) so upstream schema drift surfaces as JSON
// nulls / empty lists in our output instead of a hard parse failure.
#[derive(Deserialize)]
struct Release {
    tag_name: Option<String>,
    name: Option<String>,
    body: Option<String>,
    draft: Option<bool>,
    prerelease: Option<bool>,
    author: Option<UserRef>,
    created_at: Option<String>,
    published_at: Option<String>,
    html_url: Option<String>,
    #[serde(default)]
    assets: Vec<Asset>,
}

// Minimal projection of a GitHub user object — only the login is used.
#[derive(Deserialize)]
struct UserRef {
    login: Option<String>,
}

// One downloadable artifact attached to the release.
#[derive(Deserialize)]
struct Asset {
    name: Option<String>,
    size: Option<i64>,
    download_count: Option<i64>,
    browser_download_url: Option<String>,
    content_type: Option<String>,
    created_at: Option<String>,
    updated_at: Option<String>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Matching accepts /releases/tag/{tag} pages only — not the release
    // index, repo roots, or PR pages.
    #[test]
    fn matches_release_urls() {
        assert!(matches(
            "https://github.com/rust-lang/rust/releases/tag/1.85.0"
        ));
        assert!(matches(
            "https://github.com/0xMassi/webclaw/releases/tag/v0.4.0"
        ));
        assert!(!matches("https://github.com/rust-lang/rust"));
        assert!(!matches("https://github.com/rust-lang/rust/releases"));
        assert!(!matches("https://github.com/rust-lang/rust/pull/100"));
    }

    // Parsing strips trailing slash + query string before segmenting.
    #[test]
    fn parse_release_extracts_owner_repo_tag() {
        assert_eq!(
            parse_release("https://github.com/0xMassi/webclaw/releases/tag/v0.4.0"),
            Some(("0xMassi".into(), "webclaw".into(), "v0.4.0".into()))
        );
        assert_eq!(
            parse_release("https://github.com/rust-lang/rust/releases/tag/1.85.0/?foo=bar"),
            Some(("rust-lang".into(), "rust".into(), "1.85.0".into()))
        );
    }
}
|
||||
189
crates/webclaw-fetch/src/extractors/huggingface_dataset.rs
Normal file
189
crates/webclaw-fetch/src/extractors/huggingface_dataset.rs
Normal file
|
|
@ -0,0 +1,189 @@
|
|||
//! HuggingFace dataset structured extractor.
|
||||
//!
|
||||
//! Same shape as the model extractor but hits the dataset endpoint.
|
||||
//! `huggingface.co/api/datasets/{owner}/{name}`.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry for the HuggingFace dataset extractor, surfaced via the
/// extractor listing endpoint.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "huggingface_dataset",
    label: "HuggingFace dataset",
    description: "Returns dataset metadata: downloads, likes, license, language, task categories, file list.",
    url_patterns: &["https://huggingface.co/datasets/{owner}/{name}"],
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
if host != "huggingface.co" && host != "www.huggingface.co" {
|
||||
return false;
|
||||
}
|
||||
let path = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.and_then(|s| s.split_once('/'))
|
||||
.map(|(_, p)| p)
|
||||
.unwrap_or("");
|
||||
let stripped = path
|
||||
.split(['?', '#'])
|
||||
.next()
|
||||
.unwrap_or("")
|
||||
.trim_end_matches('/');
|
||||
let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
|
||||
// /datasets/{name} (legacy top-level) or /datasets/{owner}/{name} (canonical).
|
||||
segs.first().copied() == Some("datasets") && (segs.len() == 2 || segs.len() == 3)
|
||||
}
|
||||
|
||||
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
|
||||
let dataset_path = parse_dataset_path(url).ok_or_else(|| {
|
||||
FetchError::Build(format!(
|
||||
"hf_dataset: cannot parse dataset path from '{url}'"
|
||||
))
|
||||
})?;
|
||||
|
||||
let api_url = format!("https://huggingface.co/api/datasets/{dataset_path}");
|
||||
let resp = client.fetch(&api_url).await?;
|
||||
if resp.status == 404 {
|
||||
return Err(FetchError::Build(format!(
|
||||
"hf_dataset: '{dataset_path}' not found"
|
||||
)));
|
||||
}
|
||||
if resp.status == 401 {
|
||||
return Err(FetchError::Build(format!(
|
||||
"hf_dataset: '{dataset_path}' requires authentication (gated)"
|
||||
)));
|
||||
}
|
||||
if resp.status != 200 {
|
||||
return Err(FetchError::Build(format!(
|
||||
"hf_dataset api returned status {}",
|
||||
resp.status
|
||||
)));
|
||||
}
|
||||
|
||||
let d: DatasetInfo = serde_json::from_str(&resp.html)
|
||||
.map_err(|e| FetchError::BodyDecode(format!("hf_dataset parse: {e}")))?;
|
||||
|
||||
let files: Vec<Value> = d
|
||||
.siblings
|
||||
.iter()
|
||||
.map(|s| json!({"rfilename": s.rfilename, "size": s.size}))
|
||||
.collect();
|
||||
|
||||
Ok(json!({
|
||||
"url": url,
|
||||
"id": d.id,
|
||||
"private": d.private,
|
||||
"gated": d.gated,
|
||||
"downloads": d.downloads,
|
||||
"downloads_30d": d.downloads_all_time,
|
||||
"likes": d.likes,
|
||||
"tags": d.tags,
|
||||
"license": d.card_data.as_ref().and_then(|c| c.license.clone()),
|
||||
"language": d.card_data.as_ref().and_then(|c| c.language.clone()),
|
||||
"task_categories": d.card_data.as_ref().and_then(|c| c.task_categories.clone()),
|
||||
"size_categories": d.card_data.as_ref().and_then(|c| c.size_categories.clone()),
|
||||
"annotations_creators": d.card_data.as_ref().and_then(|c| c.annotations_creators.clone()),
|
||||
"configs": d.card_data.as_ref().and_then(|c| c.configs.clone()),
|
||||
"created_at": d.created_at,
|
||||
"last_modified": d.last_modified,
|
||||
"sha": d.sha,
|
||||
"file_count": d.siblings.len(),
|
||||
"files": files,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Authority (host) portion of `url`: everything between the scheme
/// separator and the first '/'. A schemeless input is treated as starting
/// at the host.
fn host_of(url: &str) -> &str {
    let without_scheme = url.split("://").nth(1).unwrap_or(url);
    without_scheme.split('/').next().unwrap_or("")
}
|
||||
|
||||
/// API path suffix for a dataset URL: `name` for a legacy top-level dataset
/// (`/datasets/squad`) or `owner/name` for the canonical form. Query string,
/// fragment, and trailing slashes are stripped; path segments past the
/// second are ignored.
fn parse_dataset_path(url: &str) -> Option<String> {
    let path = url.split("://").nth(1)?.split_once('/')?.1;
    let clean = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut segs = clean.split('/').filter(|s| !s.is_empty());
    if segs.next()? != "datasets" {
        return None;
    }
    let owner_or_name = segs.next()?;
    Some(match segs.next() {
        Some(name) => format!("{owner_or_name}/{name}"),
        None => owner_or_name.to_string(),
    })
}
|
||||
|
||||
// Deserialization targets for the HF dataset payload. Every field is Option
// (or a defaulted Vec) so upstream schema drift degrades to JSON nulls in
// our output instead of a parse failure.
#[derive(Deserialize)]
struct DatasetInfo {
    id: Option<String>,
    private: Option<bool>,
    // Raw Value: gated status is not a plain bool upstream — TODO confirm
    // the full range of shapes against live responses.
    gated: Option<serde_json::Value>,
    downloads: Option<i64>,
    #[serde(rename = "downloadsAllTime")]
    downloads_all_time: Option<i64>,
    likes: Option<i64>,
    #[serde(default)]
    tags: Vec<String>,
    #[serde(rename = "createdAt")]
    created_at: Option<String>,
    #[serde(rename = "lastModified")]
    last_modified: Option<String>,
    sha: Option<String>,
    #[serde(rename = "cardData")]
    card_data: Option<DatasetCard>,
    #[serde(default)]
    siblings: Vec<Sibling>,
}

// Dataset card metadata. Fields are kept as raw Values because the card is
// authored YAML and the same key may hold a scalar or a list.
#[derive(Deserialize)]
struct DatasetCard {
    license: Option<serde_json::Value>,
    language: Option<serde_json::Value>,
    task_categories: Option<serde_json::Value>,
    size_categories: Option<serde_json::Value>,
    annotations_creators: Option<serde_json::Value>,
    configs: Option<serde_json::Value>,
}

// One file in the repository listing. `rfilename` is required; size may be
// absent from the API response.
#[derive(Deserialize)]
struct Sibling {
    rfilename: String,
    size: Option<i64>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Matching accepts both dataset URL shapes and rejects model pages and
    // the bare /datasets/ index.
    #[test]
    fn matches_dataset_pages() {
        assert!(matches("https://huggingface.co/datasets/squad")); // legacy top-level
        assert!(matches("https://huggingface.co/datasets/openai/gsm8k")); // canonical owner/name
        assert!(!matches("https://huggingface.co/openai/whisper-large-v3"));
        assert!(!matches("https://huggingface.co/datasets/"));
    }

    // Parsing returns the API suffix; query strings are stripped first.
    #[test]
    fn parse_dataset_path_works() {
        assert_eq!(
            parse_dataset_path("https://huggingface.co/datasets/squad"),
            Some("squad".into())
        );
        assert_eq!(
            parse_dataset_path("https://huggingface.co/datasets/openai/gsm8k"),
            Some("openai/gsm8k".into())
        );
        assert_eq!(
            parse_dataset_path("https://huggingface.co/datasets/openai/gsm8k/?lib=transformers"),
            Some("openai/gsm8k".into())
        );
    }
}
|
||||
|
|
@ -14,12 +14,20 @@
|
|||
//! exists (Reddit, HN/Algolia, PyPI, npm, GitHub, HuggingFace all have
|
||||
//! one). HTML extraction is the fallback for sites that don't.
|
||||
|
||||
pub mod arxiv;
|
||||
pub mod crates_io;
|
||||
pub mod dev_to;
|
||||
pub mod docker_hub;
|
||||
pub mod github_pr;
|
||||
pub mod github_release;
|
||||
pub mod github_repo;
|
||||
pub mod hackernews;
|
||||
pub mod huggingface_dataset;
|
||||
pub mod huggingface_model;
|
||||
pub mod npm;
|
||||
pub mod pypi;
|
||||
pub mod reddit;
|
||||
pub mod stackoverflow;
|
||||
|
||||
use serde::Serialize;
|
||||
use serde_json::Value;
|
||||
|
|
@ -48,9 +56,17 @@ pub fn list() -> Vec<ExtractorInfo> {
|
|||
reddit::INFO,
|
||||
hackernews::INFO,
|
||||
github_repo::INFO,
|
||||
github_pr::INFO,
|
||||
github_release::INFO,
|
||||
pypi::INFO,
|
||||
npm::INFO,
|
||||
crates_io::INFO,
|
||||
huggingface_model::INFO,
|
||||
huggingface_dataset::INFO,
|
||||
arxiv::INFO,
|
||||
docker_hub::INFO,
|
||||
dev_to::INFO,
|
||||
stackoverflow::INFO,
|
||||
]
|
||||
}
|
||||
|
||||
|
|
@ -92,6 +108,27 @@ pub async fn dispatch_by_url(
|
|||
if npm::matches(url) {
|
||||
return Some(npm::extract(client, url).await.map(|v| (npm::INFO.name, v)));
|
||||
}
|
||||
if github_pr::matches(url) {
|
||||
return Some(
|
||||
github_pr::extract(client, url)
|
||||
.await
|
||||
.map(|v| (github_pr::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
if github_release::matches(url) {
|
||||
return Some(
|
||||
github_release::extract(client, url)
|
||||
.await
|
||||
.map(|v| (github_release::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
if crates_io::matches(url) {
|
||||
return Some(
|
||||
crates_io::extract(client, url)
|
||||
.await
|
||||
.map(|v| (crates_io::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
if huggingface_model::matches(url) {
|
||||
return Some(
|
||||
huggingface_model::extract(client, url)
|
||||
|
|
@ -99,6 +136,41 @@ pub async fn dispatch_by_url(
|
|||
.map(|v| (huggingface_model::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
if huggingface_dataset::matches(url) {
|
||||
return Some(
|
||||
huggingface_dataset::extract(client, url)
|
||||
.await
|
||||
.map(|v| (huggingface_dataset::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
if arxiv::matches(url) {
|
||||
return Some(
|
||||
arxiv::extract(client, url)
|
||||
.await
|
||||
.map(|v| (arxiv::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
if docker_hub::matches(url) {
|
||||
return Some(
|
||||
docker_hub::extract(client, url)
|
||||
.await
|
||||
.map(|v| (docker_hub::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
if dev_to::matches(url) {
|
||||
return Some(
|
||||
dev_to::extract(client, url)
|
||||
.await
|
||||
.map(|v| (dev_to::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
if stackoverflow::matches(url) {
|
||||
return Some(
|
||||
stackoverflow::extract(client, url)
|
||||
.await
|
||||
.map(|v| (stackoverflow::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
|
|
@ -136,12 +208,57 @@ pub async fn dispatch_by_name(
|
|||
n if n == npm::INFO.name => {
|
||||
run_or_mismatch(npm::matches(url), n, url, || npm::extract(client, url)).await
|
||||
}
|
||||
n if n == github_pr::INFO.name => {
|
||||
run_or_mismatch(github_pr::matches(url), n, url, || {
|
||||
github_pr::extract(client, url)
|
||||
})
|
||||
.await
|
||||
}
|
||||
n if n == github_release::INFO.name => {
|
||||
run_or_mismatch(github_release::matches(url), n, url, || {
|
||||
github_release::extract(client, url)
|
||||
})
|
||||
.await
|
||||
}
|
||||
n if n == crates_io::INFO.name => {
|
||||
run_or_mismatch(crates_io::matches(url), n, url, || {
|
||||
crates_io::extract(client, url)
|
||||
})
|
||||
.await
|
||||
}
|
||||
n if n == huggingface_model::INFO.name => {
|
||||
run_or_mismatch(huggingface_model::matches(url), n, url, || {
|
||||
huggingface_model::extract(client, url)
|
||||
})
|
||||
.await
|
||||
}
|
||||
n if n == huggingface_dataset::INFO.name => {
|
||||
run_or_mismatch(huggingface_dataset::matches(url), n, url, || {
|
||||
huggingface_dataset::extract(client, url)
|
||||
})
|
||||
.await
|
||||
}
|
||||
n if n == arxiv::INFO.name => {
|
||||
run_or_mismatch(arxiv::matches(url), n, url, || arxiv::extract(client, url)).await
|
||||
}
|
||||
n if n == docker_hub::INFO.name => {
|
||||
run_or_mismatch(docker_hub::matches(url), n, url, || {
|
||||
docker_hub::extract(client, url)
|
||||
})
|
||||
.await
|
||||
}
|
||||
n if n == dev_to::INFO.name => {
|
||||
run_or_mismatch(dev_to::matches(url), n, url, || {
|
||||
dev_to::extract(client, url)
|
||||
})
|
||||
.await
|
||||
}
|
||||
n if n == stackoverflow::INFO.name => {
|
||||
run_or_mismatch(stackoverflow::matches(url), n, url, || {
|
||||
stackoverflow::extract(client, url)
|
||||
})
|
||||
.await
|
||||
}
|
||||
_ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())),
|
||||
}
|
||||
}
|
||||
|
|
|
|||
216
crates/webclaw-fetch/src/extractors/stackoverflow.rs
Normal file
216
crates/webclaw-fetch/src/extractors/stackoverflow.rs
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
//! Stack Overflow Q&A structured extractor.
|
||||
//!
|
||||
//! Uses the Stack Exchange API at `api.stackexchange.com/2.3/questions/{id}`
|
||||
//! with `site=stackoverflow`. Two calls: one for the question, one for
|
||||
//! its answers. Both come pre-filtered to include the rendered HTML body
|
||||
//! so we don't re-parse the question page itself.
|
||||
//!
|
||||
//! Anonymous access caps at 300 requests per IP per day. Production
|
||||
//! cloud should set `STACKAPPS_KEY` to lift to 10,000/day, but we don't
|
||||
//! require it to work out of the box.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry for the Stack Overflow extractor, surfaced via the
/// extractor listing endpoint.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "stackoverflow",
    label: "Stack Overflow Q&A",
    description: "Returns question + answers: title, body, tags, votes, accepted answer, top answers.",
    url_patterns: &["https://stackoverflow.com/questions/{id}/{slug}"],
};
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
if host != "stackoverflow.com" && host != "www.stackoverflow.com" {
|
||||
return false;
|
||||
}
|
||||
parse_question_id(url).is_some()
|
||||
}
|
||||
|
||||
/// Fetch a question and its answers from the Stack Exchange API and merge
/// them into one JSON object. Two network calls: one for the question, one
/// for its answers (sorted by votes, descending).
///
/// The answers call is best-effort: a non-200 status there degrades to an
/// empty `top_answers` list instead of failing the whole extraction.
///
/// Errors: `FetchError::Build` for unparseable URLs, non-200 on the
/// question call, or an empty result set; `FetchError::BodyDecode` when
/// either payload is not the expected JSON shape.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let id = parse_question_id(url).ok_or_else(|| {
        FetchError::Build(format!(
            "stackoverflow: cannot parse question id from '{url}'"
        ))
    })?;

    // Filter `withbody` includes the rendered HTML body for both questions
    // and answers. Stack Exchange's filter system is documented at
    // api.stackexchange.com/docs/filters.
    let q_url = format!(
        "https://api.stackexchange.com/2.3/questions/{id}?site=stackoverflow&filter=withbody"
    );
    let q_resp = client.fetch(&q_url).await?;
    if q_resp.status != 200 {
        return Err(FetchError::Build(format!(
            "stackexchange api returned status {}",
            q_resp.status
        )));
    }
    let q_body: QResponse = serde_json::from_str(&q_resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("stackoverflow q parse: {e}")))?;
    // The API wraps results in an `items` array; a valid id yields one item.
    let q = q_body
        .items
        .first()
        .ok_or_else(|| FetchError::Build(format!("stackoverflow: question {id} not found")))?;

    let a_url = format!(
        "https://api.stackexchange.com/2.3/questions/{id}/answers?site=stackoverflow&filter=withbody&order=desc&sort=votes"
    );
    let a_resp = client.fetch(&a_url).await?;
    let answers = if a_resp.status == 200 {
        let a_body: AResponse = serde_json::from_str(&a_resp.html)
            .map_err(|e| FetchError::BodyDecode(format!("stackoverflow a parse: {e}")))?;
        a_body
            .items
            .iter()
            .map(|a| {
                json!({
                    "answer_id": a.answer_id,
                    "is_accepted": a.is_accepted,
                    "score": a.score,
                    "body": a.body,
                    "creation_date": a.creation_date,
                    "last_edit_date": a.last_edit_date,
                    "author": a.owner.as_ref().and_then(|o| o.display_name.clone()),
                    "author_rep": a.owner.as_ref().and_then(|o| o.reputation),
                })
            })
            .collect::<Vec<_>>()
    } else {
        Vec::new()
    };

    // Surface the accepted answer separately (it may not be the top-voted
    // one); it also remains in `top_answers`.
    let accepted = answers
        .iter()
        .find(|a| {
            a.get("is_accepted")
                .and_then(|v| v.as_bool())
                .unwrap_or(false)
        })
        .cloned();

    Ok(json!({
        "url": url,
        "question_id": q.question_id,
        "title": q.title,
        "body": q.body,
        "tags": q.tags,
        "score": q.score,
        "view_count": q.view_count,
        "answer_count": q.answer_count,
        "is_answered": q.is_answered,
        "accepted_answer_id": q.accepted_answer_id,
        "creation_date": q.creation_date,
        "last_activity_date": q.last_activity_date,
        "author": q.owner.as_ref().and_then(|o| o.display_name.clone()),
        "author_rep": q.owner.as_ref().and_then(|o| o.reputation),
        "link": q.link,
        "accepted_answer": accepted,
        "top_answers": answers,
    }))
}
|
||||
|
||||
/// Authority (host) portion of `url`: everything between the scheme
/// separator and the first '/'. A schemeless input is treated as starting
/// at the host.
fn host_of(url: &str) -> &str {
    let without_scheme = url.split("://").nth(1).unwrap_or(url);
    without_scheme.split('/').next().unwrap_or("")
}
|
||||
|
||||
/// Parse the question id from a URL of the form `/questions/{id}/{slug}`.
///
/// Anchored to the URL's path: the previous implementation split on the
/// literal `"/questions/"` anywhere in the string, so a query string that
/// merely contained `/questions/123/` (e.g. a search URL) was misread as
/// question 123. Segmenting the path also aligns this parser with the
/// scheme-requiring URL parsers in the sibling extractors.
fn parse_question_id(url: &str) -> Option<u64> {
    let path = url.split("://").nth(1)?.split_once('/')?.1;
    let clean = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut segs = clean.split('/').filter(|s| !s.is_empty());
    if segs.next()? != "questions" {
        return None;
    }
    segs.next()?.parse::<u64>().ok()
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Stack Exchange API types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Deserialization targets for the Stack Exchange API. Results arrive
// wrapped in an `items` array; per-item fields are Option (or defaulted)
// so schema drift degrades to JSON nulls in our output.
#[derive(Deserialize)]
struct QResponse {
    #[serde(default)]
    items: Vec<Question>,
}

#[derive(Deserialize)]
struct Question {
    question_id: Option<u64>,
    title: Option<String>,
    // Rendered HTML body — present because the request uses filter=withbody.
    body: Option<String>,
    #[serde(default)]
    tags: Vec<String>,
    score: Option<i64>,
    view_count: Option<i64>,
    answer_count: Option<i64>,
    is_answered: Option<bool>,
    accepted_answer_id: Option<u64>,
    // Unix epoch seconds, as delivered by the API.
    creation_date: Option<i64>,
    last_activity_date: Option<i64>,
    owner: Option<Owner>,
    link: Option<String>,
}

#[derive(Deserialize)]
struct AResponse {
    #[serde(default)]
    items: Vec<Answer>,
}

#[derive(Deserialize)]
struct Answer {
    answer_id: Option<u64>,
    is_accepted: Option<bool>,
    score: Option<i64>,
    // Rendered HTML body — present because the request uses filter=withbody.
    body: Option<String>,
    creation_date: Option<i64>,
    last_edit_date: Option<i64>,
    owner: Option<Owner>,
}

// Minimal projection of the shallow-user object attached to posts.
#[derive(Deserialize)]
struct Owner {
    display_name: Option<String>,
    reputation: Option<i64>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Matching requires the stackoverflow.com host and a /questions/{id}
    // path; other hosts and non-question paths are rejected.
    #[test]
    fn matches_question_urls() {
        assert!(matches(
            "https://stackoverflow.com/questions/12345/some-slug"
        ));
        assert!(matches(
            "https://stackoverflow.com/questions/12345/some-slug?answertab=votes"
        ));
        assert!(!matches("https://stackoverflow.com/"));
        assert!(!matches("https://stackoverflow.com/questions"));
        assert!(!matches("https://stackoverflow.com/users/100"));
        assert!(!matches("https://example.com/questions/12345/x"));
    }

    // Parsing tolerates a slug and a query string after the numeric id.
    #[test]
    fn parse_question_id_handles_slug_and_query() {
        assert_eq!(
            parse_question_id("https://stackoverflow.com/questions/12345/some-slug"),
            Some(12345)
        );
        assert_eq!(
            parse_question_id("https://stackoverflow.com/questions/12345/some-slug?tab=newest"),
            Some(12345)
        );
        assert_eq!(parse_question_id("https://stackoverflow.com/foo"), None);
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue