feat(extractors): wave 2 — 8 more verticals (14 total)

Adds 8 more vertical extractors using public JSON APIs. All hit
deterministic endpoints with no antibot risk. Live tests pass
against canonical URLs for each.

AI / ML ecosystem (3):
- crates_io           → crates.io/api/v1/crates/{name}
- huggingface_dataset → huggingface.co/api/datasets/{path} (handles both
                        legacy /datasets/{name} and canonical {owner}/{name})
- arxiv               → export.arxiv.org/api/query (Atom XML parsed by quick-xml)

Code / version control (2):
- github_pr      → api.github.com/repos/{owner}/{repo}/pulls/{number}
- github_release → api.github.com/repos/{owner}/{repo}/releases/tags/{tag}

Infrastructure (1):
- docker_hub → hub.docker.com/v2/repositories/{namespace}/{name}
              (official-image shorthand /_/nginx normalized to library/nginx)

Community / publishing (2):
- dev_to        → dev.to/api/articles/{username}/{slug}
- stackoverflow → api.stackexchange.com/2.3/questions/{id} + answers,
                  filter=withbody for rendered HTML, sort=votes for
                  consistent top-answers ordering

Live test results (real URLs):
- serde:                 942M downloads, 838B response
- 'Attention Is All You Need': abstract + authors, 1.8KB
- nginx official:        12.9B pulls, 21k stars, 17KB
- openai/gsm8k:          822k downloads, 1.7KB
- rust-lang/rust#138000: merged by RalfJung, +3/-2, 1KB
- webclaw v0.4.0:        2.4KB
- a real dev.to article: 2.2KB body, 3.1KB total
- python yield Q&A:      score 13133, 51 answers, 104KB

Catalog now exposes 14 extractors via GET /v1/extractors. Total
unit tests across the module: 34 passing. Clippy clean. Fmt clean.

Marketing positioning sharpens: 14 dedicated extractors, all
deterministic, all 1-credit-per-call. Firecrawl's /extract is
5 credits per call and you write the schema yourself.
This commit is contained in:
Valerio 2026-04-22 14:20:21 +02:00
parent 86182ef28a
commit b041f3cddd
9 changed files with 1710 additions and 0 deletions

View file

@ -0,0 +1,314 @@
//! ArXiv paper structured extractor.
//!
//! Uses the public ArXiv API at `export.arxiv.org/api/query?id_list={id}`
//! which returns Atom XML. We parse just enough to surface title, authors,
//! abstract, categories, and the canonical PDF link. No HTML scraping
//! required and no auth.
use quick_xml::Reader;
use quick_xml::events::Event;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "arxiv",
label: "ArXiv paper",
description: "Returns paper metadata: title, authors, abstract, categories, primary category, PDF URL.",
url_patterns: &[
"https://arxiv.org/abs/{id}",
"https://arxiv.org/abs/{id}v{n}",
"https://arxiv.org/pdf/{id}",
],
};
pub fn matches(url: &str) -> bool {
let host = host_of(url);
if host != "arxiv.org" && host != "www.arxiv.org" {
return false;
}
url.contains("/abs/") || url.contains("/pdf/")
}
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let id = parse_id(url)
.ok_or_else(|| FetchError::Build(format!("arxiv: cannot parse id from '{url}'")))?;
let api_url = format!("https://export.arxiv.org/api/query?id_list={id}");
let resp = client.fetch(&api_url).await?;
if resp.status != 200 {
return Err(FetchError::Build(format!(
"arxiv api returned status {}",
resp.status
)));
}
let entry = parse_atom_entry(&resp.html)
.ok_or_else(|| FetchError::BodyDecode("arxiv: no <entry> in response".into()))?;
if entry.title.is_none() && entry.summary.is_none() {
return Err(FetchError::BodyDecode(format!(
"arxiv: paper '{id}' returned empty entry (likely withdrawn or invalid id)"
)));
}
Ok(json!({
"url": url,
"id": id,
"arxiv_id": entry.id,
"title": entry.title,
"authors": entry.authors,
"abstract": entry.summary.map(|s| collapse_whitespace(&s)),
"published": entry.published,
"updated": entry.updated,
"primary_category": entry.primary_category,
"categories": entry.categories,
"doi": entry.doi,
"comment": entry.comment,
"pdf_url": entry.pdf_url,
"abs_url": entry.abs_url,
}))
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
fn host_of(url: &str) -> &str {
url.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("")
}
/// Parse an arxiv id from a URL. Strips the version suffix (`v2`, `v3`)
/// and the `.pdf` extension when present.
fn parse_id(url: &str) -> Option<String> {
let after = url
.split("/abs/")
.nth(1)
.or_else(|| url.split("/pdf/").nth(1))?;
let stripped = after
.split(['?', '#'])
.next()?
.trim_end_matches('/')
.trim_end_matches(".pdf");
// Strip optional version suffix, e.g. "2401.12345v2" → "2401.12345"
let no_version = match stripped.rfind('v') {
Some(i) if stripped[i + 1..].chars().all(|c| c.is_ascii_digit()) => &stripped[..i],
_ => stripped,
};
if no_version.is_empty() {
None
} else {
Some(no_version.to_string())
}
}
fn collapse_whitespace(s: &str) -> String {
s.split_whitespace().collect::<Vec<_>>().join(" ")
}
#[derive(Default)]
struct AtomEntry {
id: Option<String>,
title: Option<String>,
summary: Option<String>,
published: Option<String>,
updated: Option<String>,
primary_category: Option<String>,
categories: Vec<String>,
authors: Vec<String>,
doi: Option<String>,
comment: Option<String>,
pdf_url: Option<String>,
abs_url: Option<String>,
}
/// Parse the first `<entry>` block of an ArXiv Atom feed.
fn parse_atom_entry(xml: &str) -> Option<AtomEntry> {
let mut reader = Reader::from_str(xml);
let mut buf = Vec::new();
// States
let mut in_entry = false;
let mut current: Option<&'static str> = None;
let mut in_author = false;
let mut in_author_name = false;
let mut entry = AtomEntry::default();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) => {
let local = e.local_name();
match local.as_ref() {
b"entry" => in_entry = true,
b"id" if in_entry && !in_author => current = Some("id"),
b"title" if in_entry => current = Some("title"),
b"summary" if in_entry => current = Some("summary"),
b"published" if in_entry => current = Some("published"),
b"updated" if in_entry => current = Some("updated"),
b"author" if in_entry => in_author = true,
b"name" if in_author => {
in_author_name = true;
current = Some("author_name");
}
b"category" if in_entry => {
// primary_category is namespaced (arxiv:primary_category)
// category is plain. quick-xml gives us local-name only,
// so we treat both as categories and take the first as
// primary.
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"term"
&& let Ok(v) = attr.unescape_value()
{
let term = v.to_string();
if entry.primary_category.is_none() {
entry.primary_category = Some(term.clone());
}
entry.categories.push(term);
}
}
}
b"link" if in_entry => {
let mut href = None;
let mut rel = None;
let mut typ = None;
for attr in e.attributes().flatten() {
match attr.key.as_ref() {
b"href" => href = attr.unescape_value().ok().map(|s| s.to_string()),
b"rel" => rel = attr.unescape_value().ok().map(|s| s.to_string()),
b"type" => typ = attr.unescape_value().ok().map(|s| s.to_string()),
_ => {}
}
}
if let Some(h) = href {
if typ.as_deref() == Some("application/pdf") {
entry.pdf_url = Some(h.clone());
}
if rel.as_deref() == Some("alternate") {
entry.abs_url = Some(h);
}
}
}
_ => current = None,
}
}
Ok(Event::Empty(ref e)) => {
// Self-closing tags (<link href="..." />). Same handling as Start.
let local = e.local_name();
if (local.as_ref() == b"link" || local.as_ref() == b"category") && in_entry {
let mut href = None;
let mut rel = None;
let mut typ = None;
let mut term = None;
for attr in e.attributes().flatten() {
match attr.key.as_ref() {
b"href" => href = attr.unescape_value().ok().map(|s| s.to_string()),
b"rel" => rel = attr.unescape_value().ok().map(|s| s.to_string()),
b"type" => typ = attr.unescape_value().ok().map(|s| s.to_string()),
b"term" => term = attr.unescape_value().ok().map(|s| s.to_string()),
_ => {}
}
}
if let Some(t) = term {
if entry.primary_category.is_none() {
entry.primary_category = Some(t.clone());
}
entry.categories.push(t);
}
if let Some(h) = href {
if typ.as_deref() == Some("application/pdf") {
entry.pdf_url = Some(h.clone());
}
if rel.as_deref() == Some("alternate") {
entry.abs_url = Some(h);
}
}
}
}
Ok(Event::Text(ref e)) => {
if let (Some(field), Ok(text)) = (current, e.unescape()) {
let text = text.to_string();
match field {
"id" => entry.id = Some(text.trim().to_string()),
"title" => entry.title = append_text(entry.title.take(), &text),
"summary" => entry.summary = append_text(entry.summary.take(), &text),
"published" => entry.published = Some(text.trim().to_string()),
"updated" => entry.updated = Some(text.trim().to_string()),
"author_name" => entry.authors.push(text.trim().to_string()),
_ => {}
}
}
}
Ok(Event::End(ref e)) => {
let local = e.local_name();
match local.as_ref() {
b"entry" => break,
b"author" => in_author = false,
b"name" => in_author_name = false,
_ => {}
}
if !in_author_name {
current = None;
}
}
Ok(Event::Eof) => break,
Err(_) => return None,
_ => {}
}
buf.clear();
}
if in_entry { Some(entry) } else { None }
}
/// Concatenate text fragments (long fields can be split across multiple
/// text events if they contain entities or CDATA).
fn append_text(prev: Option<String>, next: &str) -> Option<String> {
match prev {
Some(mut s) => {
s.push_str(next);
Some(s)
}
None => Some(next.to_string()),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matches_arxiv_urls() {
assert!(matches("https://arxiv.org/abs/2401.12345"));
assert!(matches("https://arxiv.org/abs/2401.12345v2"));
assert!(matches("https://arxiv.org/pdf/2401.12345.pdf"));
assert!(!matches("https://arxiv.org/"));
assert!(!matches("https://example.com/abs/foo"));
}
#[test]
fn parse_id_strips_version_and_extension() {
assert_eq!(
parse_id("https://arxiv.org/abs/2401.12345"),
Some("2401.12345".into())
);
assert_eq!(
parse_id("https://arxiv.org/abs/2401.12345v3"),
Some("2401.12345".into())
);
assert_eq!(
parse_id("https://arxiv.org/pdf/2401.12345v2.pdf"),
Some("2401.12345".into())
);
}
#[test]
fn collapse_whitespace_handles_newlines_and_tabs() {
assert_eq!(collapse_whitespace("a b\n\tc "), "a b c");
}
}

View file

@ -0,0 +1,168 @@
//! crates.io structured extractor.
//!
//! Uses the public JSON API at `crates.io/api/v1/crates/{name}`. No
//! auth, no rate limit at normal usage. The response includes both
//! the crate metadata and the full version list, which we summarize
//! down to a count + latest release info to keep the payload small.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "crates_io",
label: "crates.io package",
description: "Returns crate metadata: latest version, dependencies, downloads, license, repository.",
url_patterns: &[
"https://crates.io/crates/{name}",
"https://crates.io/crates/{name}/{version}",
],
};
pub fn matches(url: &str) -> bool {
let host = host_of(url);
if host != "crates.io" && host != "www.crates.io" {
return false;
}
url.contains("/crates/")
}
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let name = parse_name(url)
.ok_or_else(|| FetchError::Build(format!("crates.io: cannot parse name from '{url}'")))?;
let api_url = format!("https://crates.io/api/v1/crates/{name}");
let resp = client.fetch(&api_url).await?;
if resp.status == 404 {
return Err(FetchError::Build(format!(
"crates.io: crate '{name}' not found"
)));
}
if resp.status != 200 {
return Err(FetchError::Build(format!(
"crates.io api returned status {}",
resp.status
)));
}
let body: CratesResponse = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("crates.io parse: {e}")))?;
let c = body.crate_;
let latest_version = body
.versions
.iter()
.find(|v| !v.yanked.unwrap_or(false))
.or_else(|| body.versions.first());
Ok(json!({
"url": url,
"name": c.id,
"description": c.description,
"homepage": c.homepage,
"documentation": c.documentation,
"repository": c.repository,
"max_stable_version": c.max_stable_version,
"max_version": c.max_version,
"newest_version": c.newest_version,
"downloads": c.downloads,
"recent_downloads": c.recent_downloads,
"categories": c.categories,
"keywords": c.keywords,
"release_count": body.versions.len(),
"latest_release_date": latest_version.and_then(|v| v.created_at.clone()),
"latest_license": latest_version.and_then(|v| v.license.clone()),
"latest_rust_version": latest_version.and_then(|v| v.rust_version.clone()),
"latest_yanked": latest_version.and_then(|v| v.yanked),
"created_at": c.created_at,
"updated_at": c.updated_at,
}))
}
fn host_of(url: &str) -> &str {
url.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("")
}
fn parse_name(url: &str) -> Option<String> {
let after = url.split("/crates/").nth(1)?;
let stripped = after.split(['?', '#']).next()?.trim_end_matches('/');
let first = stripped.split('/').find(|s| !s.is_empty())?;
Some(first.to_string())
}
// ---------------------------------------------------------------------------
// crates.io API types
// ---------------------------------------------------------------------------
#[derive(Deserialize)]
struct CratesResponse {
#[serde(rename = "crate")]
crate_: CrateInfo,
#[serde(default)]
versions: Vec<VersionInfo>,
}
#[derive(Deserialize)]
struct CrateInfo {
id: Option<String>,
description: Option<String>,
homepage: Option<String>,
documentation: Option<String>,
repository: Option<String>,
max_stable_version: Option<String>,
max_version: Option<String>,
newest_version: Option<String>,
downloads: Option<i64>,
recent_downloads: Option<i64>,
#[serde(default)]
categories: Vec<String>,
#[serde(default)]
keywords: Vec<String>,
created_at: Option<String>,
updated_at: Option<String>,
}
#[derive(Deserialize)]
struct VersionInfo {
license: Option<String>,
rust_version: Option<String>,
yanked: Option<bool>,
created_at: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matches_crate_pages() {
assert!(matches("https://crates.io/crates/serde"));
assert!(matches("https://crates.io/crates/tokio/1.45.0"));
assert!(!matches("https://crates.io/"));
assert!(!matches("https://example.com/crates/foo"));
}
#[test]
fn parse_name_handles_versioned_urls() {
assert_eq!(
parse_name("https://crates.io/crates/serde"),
Some("serde".into())
);
assert_eq!(
parse_name("https://crates.io/crates/tokio/1.45.0"),
Some("tokio".into())
);
assert_eq!(
parse_name("https://crates.io/crates/scraper/?foo=bar"),
Some("scraper".into())
);
}
}

View file

@ -0,0 +1,188 @@
//! dev.to article structured extractor.
//!
//! `dev.to/api/articles/{username}/{slug}` returns the full article body,
//! tags, reaction count, comment count, and reading time. Anonymous
//! access works fine for published posts.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "dev_to",
label: "dev.to article",
description: "Returns article metadata + body: title, body markdown, tags, reactions, comments, reading time.",
url_patterns: &["https://dev.to/{username}/{slug}"],
};
pub fn matches(url: &str) -> bool {
let host = host_of(url);
if host != "dev.to" && host != "www.dev.to" {
return false;
}
let path = url
.split("://")
.nth(1)
.and_then(|s| s.split_once('/'))
.map(|(_, p)| p)
.unwrap_or("");
let stripped = path
.split(['?', '#'])
.next()
.unwrap_or("")
.trim_end_matches('/');
let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
// Need exactly /{username}/{slug}, with username starting with non-reserved.
segs.len() == 2 && !RESERVED_FIRST_SEGS.contains(&segs[0])
}
const RESERVED_FIRST_SEGS: &[&str] = &[
"api",
"tags",
"search",
"settings",
"enter",
"signup",
"about",
"code-of-conduct",
"privacy",
"terms",
"contact",
"sponsorships",
"sponsors",
"shop",
"videos",
"listings",
"podcasts",
"p",
"t",
];
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let (username, slug) = parse_username_slug(url).ok_or_else(|| {
FetchError::Build(format!("dev_to: cannot parse username/slug from '{url}'"))
})?;
let api_url = format!("https://dev.to/api/articles/{username}/{slug}");
let resp = client.fetch(&api_url).await?;
if resp.status == 404 {
return Err(FetchError::Build(format!(
"dev_to: article '{username}/{slug}' not found"
)));
}
if resp.status != 200 {
return Err(FetchError::Build(format!(
"dev.to api returned status {}",
resp.status
)));
}
let a: Article = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("dev.to parse: {e}")))?;
Ok(json!({
"url": url,
"id": a.id,
"title": a.title,
"description": a.description,
"body_markdown": a.body_markdown,
"url_canonical": a.canonical_url,
"published_at": a.published_at,
"edited_at": a.edited_at,
"reading_time_min": a.reading_time_minutes,
"tags": a.tag_list,
"positive_reactions": a.positive_reactions_count,
"public_reactions": a.public_reactions_count,
"comments_count": a.comments_count,
"page_views_count": a.page_views_count,
"cover_image": a.cover_image,
"author": json!({
"username": a.user.as_ref().and_then(|u| u.username.clone()),
"name": a.user.as_ref().and_then(|u| u.name.clone()),
"twitter": a.user.as_ref().and_then(|u| u.twitter_username.clone()),
"github": a.user.as_ref().and_then(|u| u.github_username.clone()),
"website": a.user.as_ref().and_then(|u| u.website_url.clone()),
}),
}))
}
fn host_of(url: &str) -> &str {
url.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("")
}
fn parse_username_slug(url: &str) -> Option<(String, String)> {
let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
let mut segs = stripped.split('/').filter(|s| !s.is_empty());
let username = segs.next()?;
let slug = segs.next()?;
Some((username.to_string(), slug.to_string()))
}
// ---------------------------------------------------------------------------
// dev.to API types
// ---------------------------------------------------------------------------
#[derive(Deserialize)]
struct Article {
id: Option<i64>,
title: Option<String>,
description: Option<String>,
body_markdown: Option<String>,
canonical_url: Option<String>,
published_at: Option<String>,
edited_at: Option<String>,
reading_time_minutes: Option<i64>,
tag_list: Option<serde_json::Value>, // string OR array depending on endpoint
positive_reactions_count: Option<i64>,
public_reactions_count: Option<i64>,
comments_count: Option<i64>,
page_views_count: Option<i64>,
cover_image: Option<String>,
user: Option<UserRef>,
}
#[derive(Deserialize)]
struct UserRef {
username: Option<String>,
name: Option<String>,
twitter_username: Option<String>,
github_username: Option<String>,
website_url: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matches_article_urls() {
assert!(matches("https://dev.to/ben/welcome-thread"));
assert!(matches("https://dev.to/0xmassi/some-post-1abc"));
assert!(!matches("https://dev.to/"));
assert!(!matches("https://dev.to/api/articles/foo/bar"));
assert!(!matches("https://dev.to/tags/rust"));
assert!(!matches("https://dev.to/ben")); // user profile, not article
assert!(!matches("https://example.com/ben/post"));
}
#[test]
fn parse_pulls_username_and_slug() {
assert_eq!(
parse_username_slug("https://dev.to/ben/welcome-thread"),
Some(("ben".into(), "welcome-thread".into()))
);
assert_eq!(
parse_username_slug("https://dev.to/0xmassi/some-post-1abc/?foo=bar"),
Some(("0xmassi".into(), "some-post-1abc".into()))
);
}
}

View file

@ -0,0 +1,150 @@
//! Docker Hub repository structured extractor.
//!
//! Uses the v2 JSON API at `hub.docker.com/v2/repositories/{namespace}/{name}`.
//! Anonymous access is allowed for public images. The official-image
//! shorthand (e.g. `nginx`, `redis`) is normalized to `library/{name}`.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "docker_hub",
label: "Docker Hub repository",
description: "Returns image metadata: pull count, star count, last_updated, official flag, description.",
url_patterns: &[
"https://hub.docker.com/_/{name}",
"https://hub.docker.com/r/{namespace}/{name}",
],
};
pub fn matches(url: &str) -> bool {
let host = host_of(url);
if host != "hub.docker.com" {
return false;
}
url.contains("/_/") || url.contains("/r/")
}
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let (namespace, name) = parse_repo(url)
.ok_or_else(|| FetchError::Build(format!("docker_hub: cannot parse repo from '{url}'")))?;
let api_url = format!("https://hub.docker.com/v2/repositories/{namespace}/{name}");
let resp = client.fetch(&api_url).await?;
if resp.status == 404 {
return Err(FetchError::Build(format!(
"docker_hub: repo '{namespace}/{name}' not found"
)));
}
if resp.status != 200 {
return Err(FetchError::Build(format!(
"docker_hub api returned status {}",
resp.status
)));
}
let r: RepoResponse = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("docker_hub parse: {e}")))?;
Ok(json!({
"url": url,
"namespace": r.namespace,
"name": r.name,
"full_name": format!("{namespace}/{name}"),
"pull_count": r.pull_count,
"star_count": r.star_count,
"description": r.description,
"full_description": r.full_description,
"last_updated": r.last_updated,
"date_registered": r.date_registered,
"is_official": namespace == "library",
"is_private": r.is_private,
"status_description":r.status_description,
"categories": r.categories,
}))
}
fn host_of(url: &str) -> &str {
url.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("")
}
/// Parse `(namespace, name)` from a Docker Hub URL. The official-image
/// shorthand `/_/nginx` maps to `(library, nginx)`. Personal repos
/// `/r/foo/bar` map to `(foo, bar)`.
fn parse_repo(url: &str) -> Option<(String, String)> {
if let Some(after) = url.split("/_/").nth(1) {
let stripped = after.split(['?', '#']).next()?.trim_end_matches('/');
let name = stripped.split('/').next().filter(|s| !s.is_empty())?;
return Some(("library".into(), name.to_string()));
}
let after = url.split("/r/").nth(1)?;
let stripped = after.split(['?', '#']).next()?.trim_end_matches('/');
let mut segs = stripped.split('/').filter(|s| !s.is_empty());
let ns = segs.next()?;
let nm = segs.next()?;
Some((ns.to_string(), nm.to_string()))
}
#[derive(Deserialize)]
struct RepoResponse {
namespace: Option<String>,
name: Option<String>,
pull_count: Option<i64>,
star_count: Option<i64>,
description: Option<String>,
full_description: Option<String>,
last_updated: Option<String>,
date_registered: Option<String>,
is_private: Option<bool>,
status_description: Option<String>,
#[serde(default)]
categories: Vec<DockerCategory>,
}
#[derive(Deserialize, serde::Serialize)]
struct DockerCategory {
name: Option<String>,
slug: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matches_docker_urls() {
assert!(matches("https://hub.docker.com/_/nginx"));
assert!(matches("https://hub.docker.com/r/grafana/grafana"));
assert!(!matches("https://hub.docker.com/"));
assert!(!matches("https://example.com/_/nginx"));
}
#[test]
fn parse_repo_handles_official_and_personal() {
assert_eq!(
parse_repo("https://hub.docker.com/_/nginx"),
Some(("library".into(), "nginx".into()))
);
assert_eq!(
parse_repo("https://hub.docker.com/_/nginx/tags"),
Some(("library".into(), "nginx".into()))
);
assert_eq!(
parse_repo("https://hub.docker.com/r/grafana/grafana"),
Some(("grafana".into(), "grafana".into()))
);
assert_eq!(
parse_repo("https://hub.docker.com/r/grafana/grafana/?foo=bar"),
Some(("grafana".into(), "grafana".into()))
);
}
}

View file

@ -0,0 +1,189 @@
//! GitHub pull request structured extractor.
//!
//! Uses `api.github.com/repos/{owner}/{repo}/pulls/{number}`. Returns
//! the PR metadata + a counted summary of comments and review activity.
//! Full diff and per-comment bodies require additional calls — left for
//! a follow-up enhancement so the v1 stays one network round-trip.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "github_pr",
label: "GitHub pull request",
description: "Returns PR metadata: title, body, state, author, labels, additions/deletions, file count.",
url_patterns: &["https://github.com/{owner}/{repo}/pull/{number}"],
};
pub fn matches(url: &str) -> bool {
let host = url
.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("");
if host != "github.com" && host != "www.github.com" {
return false;
}
parse_pr(url).is_some()
}
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let (owner, repo, number) = parse_pr(url).ok_or_else(|| {
FetchError::Build(format!("github_pr: cannot parse pull-request URL '{url}'"))
})?;
let api_url = format!("https://api.github.com/repos/{owner}/{repo}/pulls/{number}");
let resp = client.fetch(&api_url).await?;
if resp.status == 404 {
return Err(FetchError::Build(format!(
"github_pr: pull request '{owner}/{repo}#{number}' not found"
)));
}
if resp.status == 403 {
return Err(FetchError::Build(
"github_pr: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour.".into(),
));
}
if resp.status != 200 {
return Err(FetchError::Build(format!(
"github api returned status {}",
resp.status
)));
}
let p: PullRequest = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("github pr parse: {e}")))?;
Ok(json!({
"url": url,
"owner": owner,
"repo": repo,
"number": p.number,
"title": p.title,
"body": p.body,
"state": p.state,
"draft": p.draft,
"merged": p.merged,
"merged_at": p.merged_at,
"merge_commit_sha": p.merge_commit_sha,
"author": p.user.as_ref().and_then(|u| u.login.clone()),
"labels": p.labels.iter().filter_map(|l| l.name.clone()).collect::<Vec<_>>(),
"milestone": p.milestone.as_ref().and_then(|m| m.title.clone()),
"head_ref": p.head.as_ref().and_then(|r| r.ref_name.clone()),
"base_ref": p.base.as_ref().and_then(|r| r.ref_name.clone()),
"head_sha": p.head.as_ref().and_then(|r| r.sha.clone()),
"additions": p.additions,
"deletions": p.deletions,
"changed_files": p.changed_files,
"commits": p.commits,
"comments": p.comments,
"review_comments":p.review_comments,
"created_at": p.created_at,
"updated_at": p.updated_at,
"closed_at": p.closed_at,
"html_url": p.html_url,
}))
}
fn parse_pr(url: &str) -> Option<(String, String, u64)> {
let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
// /{owner}/{repo}/pull/{number} (or /pulls/{number} variant)
if segs.len() < 4 {
return None;
}
if segs[2] != "pull" && segs[2] != "pulls" {
return None;
}
let number: u64 = segs[3].parse().ok()?;
Some((segs[0].to_string(), segs[1].to_string(), number))
}
// ---------------------------------------------------------------------------
// GitHub PR API types
// ---------------------------------------------------------------------------
#[derive(Deserialize)]
struct PullRequest {
number: Option<i64>,
title: Option<String>,
body: Option<String>,
state: Option<String>,
draft: Option<bool>,
merged: Option<bool>,
merged_at: Option<String>,
merge_commit_sha: Option<String>,
user: Option<UserRef>,
#[serde(default)]
labels: Vec<LabelRef>,
milestone: Option<Milestone>,
head: Option<GitRef>,
base: Option<GitRef>,
additions: Option<i64>,
deletions: Option<i64>,
changed_files: Option<i64>,
commits: Option<i64>,
comments: Option<i64>,
review_comments: Option<i64>,
created_at: Option<String>,
updated_at: Option<String>,
closed_at: Option<String>,
html_url: Option<String>,
}
#[derive(Deserialize)]
struct UserRef {
login: Option<String>,
}
#[derive(Deserialize)]
struct LabelRef {
name: Option<String>,
}
#[derive(Deserialize)]
struct Milestone {
title: Option<String>,
}
#[derive(Deserialize)]
struct GitRef {
#[serde(rename = "ref")]
ref_name: Option<String>,
sha: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matches_pr_urls() {
assert!(matches("https://github.com/rust-lang/rust/pull/12345"));
assert!(matches(
"https://github.com/rust-lang/rust/pull/12345/files"
));
assert!(!matches("https://github.com/rust-lang/rust"));
assert!(!matches("https://github.com/rust-lang/rust/issues/100"));
assert!(!matches("https://github.com/rust-lang"));
}
#[test]
fn parse_pr_extracts_owner_repo_number() {
assert_eq!(
parse_pr("https://github.com/rust-lang/rust/pull/12345"),
Some(("rust-lang".into(), "rust".into(), 12345))
);
assert_eq!(
parse_pr("https://github.com/rust-lang/rust/pull/12345/files"),
Some(("rust-lang".into(), "rust".into(), 12345))
);
}
}

View file

@ -0,0 +1,179 @@
//! GitHub release structured extractor.
//!
//! `api.github.com/repos/{owner}/{repo}/releases/tags/{tag}`. Returns
//! the release notes body, asset list with download counts, and
//! prerelease flag.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "github_release",
label: "GitHub release",
description: "Returns release metadata: tag, name, body (release notes), assets with download counts.",
url_patterns: &["https://github.com/{owner}/{repo}/releases/tag/{tag}"],
};
pub fn matches(url: &str) -> bool {
let host = url
.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("");
if host != "github.com" && host != "www.github.com" {
return false;
}
parse_release(url).is_some()
}
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let (owner, repo, tag) = parse_release(url).ok_or_else(|| {
FetchError::Build(format!("github_release: cannot parse release URL '{url}'"))
})?;
let api_url = format!("https://api.github.com/repos/{owner}/{repo}/releases/tags/{tag}");
let resp = client.fetch(&api_url).await?;
if resp.status == 404 {
return Err(FetchError::Build(format!(
"github_release: release '{owner}/{repo}@{tag}' not found"
)));
}
if resp.status == 403 {
return Err(FetchError::Build(
"github_release: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour."
.into(),
));
}
if resp.status != 200 {
return Err(FetchError::Build(format!(
"github api returned status {}",
resp.status
)));
}
let r: Release = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("github release parse: {e}")))?;
let assets: Vec<Value> = r
.assets
.iter()
.map(|a| {
json!({
"name": a.name,
"size": a.size,
"download_count": a.download_count,
"browser_download_url": a.browser_download_url,
"content_type": a.content_type,
"created_at": a.created_at,
"updated_at": a.updated_at,
})
})
.collect();
Ok(json!({
"url": url,
"owner": owner,
"repo": repo,
"tag_name": r.tag_name,
"name": r.name,
"body": r.body,
"draft": r.draft,
"prerelease": r.prerelease,
"author": r.author.as_ref().and_then(|u| u.login.clone()),
"created_at": r.created_at,
"published_at": r.published_at,
"asset_count": assets.len(),
"total_downloads": r.assets.iter().map(|a| a.download_count.unwrap_or(0)).sum::<i64>(),
"assets": assets,
"html_url": r.html_url,
}))
}
/// Parse `(owner, repo, tag)` out of a `/{owner}/{repo}/releases/tag/{tag}`
/// URL. Query strings, fragments and trailing slashes are ignored.
///
/// Git tags may themselves contain `/` (e.g. `release/1.0`) and GitHub
/// serves such releases at the raw multi-segment path, so every segment
/// after `tag/` is joined back together instead of truncating to the
/// first one.
fn parse_release(url: &str) -> Option<(String, String, String)> {
    let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
    let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
    let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
    // /{owner}/{repo}/releases/tag/{tag…}
    if segs.len() < 5 || segs[2] != "releases" || segs[3] != "tag" {
        return None;
    }
    Some((
        segs[0].to_string(),
        segs[1].to_string(),
        segs[4..].join("/"),
    ))
}
// ---------------------------------------------------------------------------
// GitHub Release API types
// ---------------------------------------------------------------------------
/// Subset of the GitHub "get release by tag" response that we surface.
/// Every field is `Option` (or defaulted) so a missing/null key never
/// fails deserialization.
#[derive(Deserialize)]
struct Release {
    tag_name: Option<String>,
    name: Option<String>,
    // Release notes body; may be null.
    body: Option<String>,
    draft: Option<bool>,
    prerelease: Option<bool>,
    // Only `author.login` is consumed downstream.
    author: Option<UserRef>,
    created_at: Option<String>,
    published_at: Option<String>,
    html_url: Option<String>,
    // Missing `assets` key deserializes as an empty list.
    #[serde(default)]
    assets: Vec<Asset>,
}
/// Minimal user reference — only the login name is kept.
#[derive(Deserialize)]
struct UserRef {
    login: Option<String>,
}
/// One downloadable artifact attached to a release.
#[derive(Deserialize)]
struct Asset {
    name: Option<String>,
    // NOTE(review): presumably size in bytes as reported by the API — confirm.
    size: Option<i64>,
    download_count: Option<i64>,
    browser_download_url: Option<String>,
    content_type: Option<String>,
    created_at: Option<String>,
    updated_at: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;
    // URL gate: only `/releases/tag/{tag}` pages qualify; repo roots,
    // the releases index and PR pages must all be rejected.
    #[test]
    fn matches_release_urls() {
        assert!(matches(
            "https://github.com/rust-lang/rust/releases/tag/1.85.0"
        ));
        assert!(matches(
            "https://github.com/0xMassi/webclaw/releases/tag/v0.4.0"
        ));
        assert!(!matches("https://github.com/rust-lang/rust"));
        assert!(!matches("https://github.com/rust-lang/rust/releases"));
        assert!(!matches("https://github.com/rust-lang/rust/pull/100"));
    }
    // Owner/repo/tag extraction, including trailing-slash + query noise.
    #[test]
    fn parse_release_extracts_owner_repo_tag() {
        assert_eq!(
            parse_release("https://github.com/0xMassi/webclaw/releases/tag/v0.4.0"),
            Some(("0xMassi".into(), "webclaw".into(), "v0.4.0".into()))
        );
        assert_eq!(
            parse_release("https://github.com/rust-lang/rust/releases/tag/1.85.0/?foo=bar"),
            Some(("rust-lang".into(), "rust".into(), "1.85.0".into()))
        );
    }
}

View file

@ -0,0 +1,189 @@
//! HuggingFace dataset structured extractor.
//!
//! Same shape as the model extractor but hits the dataset endpoint.
//! `huggingface.co/api/datasets/{owner}/{name}`.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Static catalog metadata for this extractor (surfaced by the
/// extractors listing).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "huggingface_dataset",
    label: "HuggingFace dataset",
    description: "Returns dataset metadata: downloads, likes, license, language, task categories, file list.",
    url_patterns: &["https://huggingface.co/datasets/{owner}/{name}"],
};
/// A URL matches when its host is huggingface.co and the path is
/// `/datasets/{name}` (legacy top-level) or `/datasets/{owner}/{name}`
/// (canonical). Query strings, fragments and trailing slashes are ignored.
pub fn matches(url: &str) -> bool {
    if !matches!(host_of(url), "huggingface.co" | "www.huggingface.co") {
        return false;
    }
    // Path after the host, stripped of query/fragment and trailing slashes.
    let path = url
        .split("://")
        .nth(1)
        .and_then(|rest| rest.split_once('/'))
        .map_or("", |(_, p)| p);
    let clean = path
        .split(['?', '#'])
        .next()
        .unwrap_or("")
        .trim_end_matches('/');
    let mut segs = clean.split('/').filter(|s| !s.is_empty());
    if segs.next() != Some("datasets") {
        return false;
    }
    // Exactly one or two segments may follow `datasets`.
    matches!(segs.count(), 1 | 2)
}
/// Fetch dataset metadata from the HuggingFace Hub API and flatten it
/// into a JSON object.
///
/// # Errors
/// `FetchError::Build` for unparseable URLs, 404 (unknown dataset), 401
/// (gated dataset) and any other non-200 status; `FetchError::BodyDecode`
/// when the API body is not valid JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let dataset_path = parse_dataset_path(url).ok_or_else(|| {
        FetchError::Build(format!(
            "hf_dataset: cannot parse dataset path from '{url}'"
        ))
    })?;
    let api_url = format!("https://huggingface.co/api/datasets/{dataset_path}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "hf_dataset: '{dataset_path}' not found"
        )));
    }
    if resp.status == 401 {
        return Err(FetchError::Build(format!(
            "hf_dataset: '{dataset_path}' requires authentication (gated)"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "hf_dataset api returned status {}",
            resp.status
        )));
    }
    let d: DatasetInfo = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("hf_dataset parse: {e}")))?;
    let files: Vec<Value> = d
        .siblings
        .iter()
        .map(|s| json!({"rfilename": s.rfilename, "size": s.size}))
        .collect();
    Ok(json!({
        "url": url,
        "id": d.id,
        "private": d.private,
        "gated": d.gated,
        // The Hub's `downloads` field is the trailing-30-day count and
        // `downloadsAllTime` is the lifetime total. The previous mapping
        // put the all-time figure under `downloads_30d`, which was
        // backwards; `downloads_30d` now carries the 30-day window and
        // the lifetime total is exposed under its own key.
        // NOTE(review): `downloadsAllTime` may only be populated when the
        // API is queried with `expand[]=downloadsAllTime` — confirm it is
        // present on plain /api/datasets/{id} responses.
        "downloads": d.downloads,
        "downloads_30d": d.downloads,
        "downloads_all_time": d.downloads_all_time,
        "likes": d.likes,
        "tags": d.tags,
        "license": d.card_data.as_ref().and_then(|c| c.license.clone()),
        "language": d.card_data.as_ref().and_then(|c| c.language.clone()),
        "task_categories": d.card_data.as_ref().and_then(|c| c.task_categories.clone()),
        "size_categories": d.card_data.as_ref().and_then(|c| c.size_categories.clone()),
        "annotations_creators": d.card_data.as_ref().and_then(|c| c.annotations_creators.clone()),
        "configs": d.card_data.as_ref().and_then(|c| c.configs.clone()),
        "created_at": d.created_at,
        "last_modified": d.last_modified,
        "sha": d.sha,
        "file_count": d.siblings.len(),
        "files": files,
    }))
}
/// Host portion of `url`: everything between the scheme separator (if
/// any) and the first `/`. Falls back to treating the whole string as
/// scheme-less when no `://` is present.
fn host_of(url: &str) -> &str {
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    match after_scheme.find('/') {
        Some(idx) => &after_scheme[..idx],
        None => after_scheme,
    }
}
/// Returns the part to append to the API URL — either `name` (legacy
/// top-level dataset like `squad`) or `owner/name` (canonical form).
///
/// Rejects deeper paths (e.g. `/datasets/{owner}/{name}/tree/main`):
/// those are dataset *viewer* pages, not the dataset root, and silently
/// ignoring the extra segments would let a viewer URL slip through to
/// the API. This keeps the parser in lockstep with `matches`, which
/// only accepts two or three path segments.
fn parse_dataset_path(url: &str) -> Option<String> {
    let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
    let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut segs = stripped.split('/').filter(|s| !s.is_empty());
    if segs.next() != Some("datasets") {
        return None;
    }
    let first = segs.next()?.to_string();
    let dataset_path = match segs.next() {
        Some(second) => format!("{first}/{second}"),
        None => first,
    };
    // Anything left over means this was a sub-page, not the dataset root.
    if segs.next().is_some() {
        return None;
    }
    Some(dataset_path)
}
/// Subset of the Hub `/api/datasets/{id}` response that we surface.
/// Fields are `Option` (or defaulted) so partial responses still
/// deserialize.
#[derive(Deserialize)]
struct DatasetInfo {
    id: Option<String>,
    private: Option<bool>,
    // Kept as raw `Value` — the API is not consistent about the type
    // here (NOTE(review): looks like bool-or-string; confirm).
    gated: Option<serde_json::Value>,
    downloads: Option<i64>,
    #[serde(rename = "downloadsAllTime")]
    downloads_all_time: Option<i64>,
    likes: Option<i64>,
    #[serde(default)]
    tags: Vec<String>,
    #[serde(rename = "createdAt")]
    created_at: Option<String>,
    #[serde(rename = "lastModified")]
    last_modified: Option<String>,
    sha: Option<String>,
    #[serde(rename = "cardData")]
    card_data: Option<DatasetCard>,
    // Repository file listing; a missing key becomes an empty vec.
    #[serde(default)]
    siblings: Vec<Sibling>,
}
/// Dataset card (README front-matter) fields. Kept as raw `Value`s
/// because cards use either a scalar or a list for each of these.
#[derive(Deserialize)]
struct DatasetCard {
    license: Option<serde_json::Value>,
    language: Option<serde_json::Value>,
    task_categories: Option<serde_json::Value>,
    size_categories: Option<serde_json::Value>,
    annotations_creators: Option<serde_json::Value>,
    configs: Option<serde_json::Value>,
}
/// One file in the dataset repo; `rfilename` is the repo-relative path.
#[derive(Deserialize)]
struct Sibling {
    rfilename: String,
    size: Option<i64>,
}
#[cfg(test)]
mod tests {
    use super::*;
    // URL gate: both dataset URL shapes match; model pages and the bare
    // /datasets/ index must be rejected.
    #[test]
    fn matches_dataset_pages() {
        assert!(matches("https://huggingface.co/datasets/squad")); // legacy top-level
        assert!(matches("https://huggingface.co/datasets/openai/gsm8k")); // canonical owner/name
        assert!(!matches("https://huggingface.co/openai/whisper-large-v3"));
        assert!(!matches("https://huggingface.co/datasets/"));
    }
    // API path extraction for both shapes, including query-string noise.
    #[test]
    fn parse_dataset_path_works() {
        assert_eq!(
            parse_dataset_path("https://huggingface.co/datasets/squad"),
            Some("squad".into())
        );
        assert_eq!(
            parse_dataset_path("https://huggingface.co/datasets/openai/gsm8k"),
            Some("openai/gsm8k".into())
        );
        assert_eq!(
            parse_dataset_path("https://huggingface.co/datasets/openai/gsm8k/?lib=transformers"),
            Some("openai/gsm8k".into())
        );
    }
}

View file

@ -14,12 +14,20 @@
//! exists (Reddit, HN/Algolia, PyPI, npm, GitHub, HuggingFace all have
//! one). HTML extraction is the fallback for sites that don't.
pub mod arxiv;
pub mod crates_io;
pub mod dev_to;
pub mod docker_hub;
pub mod github_pr;
pub mod github_release;
pub mod github_repo;
pub mod hackernews;
pub mod huggingface_dataset;
pub mod huggingface_model;
pub mod npm;
pub mod pypi;
pub mod reddit;
pub mod stackoverflow;
use serde::Serialize;
use serde_json::Value;
@ -48,9 +56,17 @@ pub fn list() -> Vec<ExtractorInfo> {
reddit::INFO,
hackernews::INFO,
github_repo::INFO,
github_pr::INFO,
github_release::INFO,
pypi::INFO,
npm::INFO,
crates_io::INFO,
huggingface_model::INFO,
huggingface_dataset::INFO,
arxiv::INFO,
docker_hub::INFO,
dev_to::INFO,
stackoverflow::INFO,
]
}
@ -92,6 +108,27 @@ pub async fn dispatch_by_url(
if npm::matches(url) {
return Some(npm::extract(client, url).await.map(|v| (npm::INFO.name, v)));
}
if github_pr::matches(url) {
return Some(
github_pr::extract(client, url)
.await
.map(|v| (github_pr::INFO.name, v)),
);
}
if github_release::matches(url) {
return Some(
github_release::extract(client, url)
.await
.map(|v| (github_release::INFO.name, v)),
);
}
if crates_io::matches(url) {
return Some(
crates_io::extract(client, url)
.await
.map(|v| (crates_io::INFO.name, v)),
);
}
if huggingface_model::matches(url) {
return Some(
huggingface_model::extract(client, url)
@ -99,6 +136,41 @@ pub async fn dispatch_by_url(
.map(|v| (huggingface_model::INFO.name, v)),
);
}
if huggingface_dataset::matches(url) {
return Some(
huggingface_dataset::extract(client, url)
.await
.map(|v| (huggingface_dataset::INFO.name, v)),
);
}
if arxiv::matches(url) {
return Some(
arxiv::extract(client, url)
.await
.map(|v| (arxiv::INFO.name, v)),
);
}
if docker_hub::matches(url) {
return Some(
docker_hub::extract(client, url)
.await
.map(|v| (docker_hub::INFO.name, v)),
);
}
if dev_to::matches(url) {
return Some(
dev_to::extract(client, url)
.await
.map(|v| (dev_to::INFO.name, v)),
);
}
if stackoverflow::matches(url) {
return Some(
stackoverflow::extract(client, url)
.await
.map(|v| (stackoverflow::INFO.name, v)),
);
}
None
}
@ -136,12 +208,57 @@ pub async fn dispatch_by_name(
n if n == npm::INFO.name => {
run_or_mismatch(npm::matches(url), n, url, || npm::extract(client, url)).await
}
n if n == github_pr::INFO.name => {
run_or_mismatch(github_pr::matches(url), n, url, || {
github_pr::extract(client, url)
})
.await
}
n if n == github_release::INFO.name => {
run_or_mismatch(github_release::matches(url), n, url, || {
github_release::extract(client, url)
})
.await
}
n if n == crates_io::INFO.name => {
run_or_mismatch(crates_io::matches(url), n, url, || {
crates_io::extract(client, url)
})
.await
}
n if n == huggingface_model::INFO.name => {
run_or_mismatch(huggingface_model::matches(url), n, url, || {
huggingface_model::extract(client, url)
})
.await
}
n if n == huggingface_dataset::INFO.name => {
run_or_mismatch(huggingface_dataset::matches(url), n, url, || {
huggingface_dataset::extract(client, url)
})
.await
}
n if n == arxiv::INFO.name => {
run_or_mismatch(arxiv::matches(url), n, url, || arxiv::extract(client, url)).await
}
n if n == docker_hub::INFO.name => {
run_or_mismatch(docker_hub::matches(url), n, url, || {
docker_hub::extract(client, url)
})
.await
}
n if n == dev_to::INFO.name => {
run_or_mismatch(dev_to::matches(url), n, url, || {
dev_to::extract(client, url)
})
.await
}
n if n == stackoverflow::INFO.name => {
run_or_mismatch(stackoverflow::matches(url), n, url, || {
stackoverflow::extract(client, url)
})
.await
}
_ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())),
}
}

View file

@ -0,0 +1,216 @@
//! Stack Overflow Q&A structured extractor.
//!
//! Uses the Stack Exchange API at `api.stackexchange.com/2.3/questions/{id}`
//! with `site=stackoverflow`. Two calls: one for the question, one for
//! its answers. Both come pre-filtered to include the rendered HTML body
//! so we don't re-parse the question page itself.
//!
//! Anonymous access caps at 300 requests per IP per day. Production
//! cloud should set `STACKAPPS_KEY` to lift to 10,000/day, but we don't
//! require it to work out of the box.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Static catalog metadata for this extractor (surfaced by the
/// extractors listing).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "stackoverflow",
    label: "Stack Overflow Q&A",
    description: "Returns question + answers: title, body, tags, votes, accepted answer, top answers.",
    url_patterns: &["https://stackoverflow.com/questions/{id}/{slug}"],
};
/// A URL matches when its host is stackoverflow.com and the path carries
/// a parseable numeric question id (`/questions/{id}/...`).
pub fn matches(url: &str) -> bool {
    matches!(host_of(url), "stackoverflow.com" | "www.stackoverflow.com")
        && parse_question_id(url).is_some()
}
/// Fetch a Stack Overflow question plus its answers and flatten them
/// into a single JSON object.
///
/// Makes two Stack Exchange API calls: one for the question, one for its
/// answers (sorted by votes, descending). The `withbody` filter makes
/// both responses include the rendered HTML body.
///
/// # Errors
/// `FetchError::Build` for unparseable URLs, missing questions and
/// non-200 question responses; `FetchError::BodyDecode` for malformed
/// JSON. A non-200 *answers* response is tolerated: the question is
/// still returned with an empty answer list.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let id = parse_question_id(url).ok_or_else(|| {
        FetchError::Build(format!(
            "stackoverflow: cannot parse question id from '{url}'"
        ))
    })?;
    // Filter `withbody` includes the rendered HTML body for both questions
    // and answers. Stack Exchange's filter system is documented at
    // api.stackexchange.com/docs/filters.
    let q_url = format!(
        "https://api.stackexchange.com/2.3/questions/{id}?site=stackoverflow&filter=withbody"
    );
    let q_resp = client.fetch(&q_url).await?;
    if q_resp.status != 200 {
        return Err(FetchError::Build(format!(
            "stackexchange api returned status {}",
            q_resp.status
        )));
    }
    let q_body: QResponse = serde_json::from_str(&q_resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("stackoverflow q parse: {e}")))?;
    let q = q_body
        .items
        .first()
        .ok_or_else(|| FetchError::Build(format!("stackoverflow: question {id} not found")))?;
    // `pagesize=100` (the API maximum) instead of the default 30, so
    // heavily answered questions return more than the first page and the
    // accepted answer is far less likely to fall outside the window.
    let a_url = format!(
        "https://api.stackexchange.com/2.3/questions/{id}/answers?site=stackoverflow&filter=withbody&order=desc&sort=votes&pagesize=100"
    );
    let a_resp = client.fetch(&a_url).await?;
    let answers = if a_resp.status == 200 {
        let a_body: AResponse = serde_json::from_str(&a_resp.html)
            .map_err(|e| FetchError::BodyDecode(format!("stackoverflow a parse: {e}")))?;
        a_body
            .items
            .iter()
            .map(|a| {
                json!({
                    "answer_id": a.answer_id,
                    "is_accepted": a.is_accepted,
                    "score": a.score,
                    "body": a.body,
                    "creation_date": a.creation_date,
                    "last_edit_date": a.last_edit_date,
                    "author": a.owner.as_ref().and_then(|o| o.display_name.clone()),
                    "author_rep": a.owner.as_ref().and_then(|o| o.reputation),
                })
            })
            .collect::<Vec<_>>()
    } else {
        // Best-effort: a failed answers call still yields the question.
        Vec::new()
    };
    // The accepted answer (if any) is duplicated under its own key so
    // callers don't have to scan `top_answers` for it. It can still be
    // null if the question has >100 answers and the accepted one is not
    // among the top-voted page.
    let accepted = answers
        .iter()
        .find(|a| {
            a.get("is_accepted")
                .and_then(|v| v.as_bool())
                .unwrap_or(false)
        })
        .cloned();
    Ok(json!({
        "url": url,
        "question_id": q.question_id,
        "title": q.title,
        "body": q.body,
        "tags": q.tags,
        "score": q.score,
        "view_count": q.view_count,
        "answer_count": q.answer_count,
        "is_answered": q.is_answered,
        "accepted_answer_id": q.accepted_answer_id,
        "creation_date": q.creation_date,
        "last_activity_date": q.last_activity_date,
        "author": q.owner.as_ref().and_then(|o| o.display_name.clone()),
        "author_rep": q.owner.as_ref().and_then(|o| o.reputation),
        "link": q.link,
        "accepted_answer": accepted,
        "top_answers": answers,
    }))
}
/// Hostname of `url` (port retained, scheme optional): the text between
/// the `://` separator — if present — and the first `/`.
fn host_of(url: &str) -> &str {
    let rest = url.split("://").nth(1).unwrap_or(url);
    match rest.split('/').next() {
        Some(host) => host,
        None => "",
    }
}
/// Parse the numeric question id out of a `/questions/{id}/{slug}` URL.
/// Returns `None` when the path has no `/questions/` segment or the text
/// that follows it is not a valid integer.
fn parse_question_id(url: &str) -> Option<u64> {
    let (_, tail) = url.split_once("/questions/")?;
    // The id runs until the next path separator, query or fragment.
    let id_segment = tail.split(['?', '#', '/']).next().unwrap_or("");
    id_segment.parse::<u64>().ok()
}
// ---------------------------------------------------------------------------
// Stack Exchange API types
// ---------------------------------------------------------------------------
/// Envelope for the question endpoint; `items` holds at most the one
/// requested question. Missing key deserializes as an empty vec.
#[derive(Deserialize)]
struct QResponse {
    #[serde(default)]
    items: Vec<Question>,
}
/// Subset of the Stack Exchange question object that we surface.
/// All fields optional so partial responses still deserialize.
#[derive(Deserialize)]
struct Question {
    question_id: Option<u64>,
    title: Option<String>,
    // Rendered HTML (present because we request `filter=withbody`).
    body: Option<String>,
    #[serde(default)]
    tags: Vec<String>,
    score: Option<i64>,
    view_count: Option<i64>,
    answer_count: Option<i64>,
    is_answered: Option<bool>,
    accepted_answer_id: Option<u64>,
    // NOTE(review): presumably Unix epoch seconds, per SE API convention — confirm.
    creation_date: Option<i64>,
    last_activity_date: Option<i64>,
    owner: Option<Owner>,
    link: Option<String>,
}
/// Envelope for the answers endpoint.
#[derive(Deserialize)]
struct AResponse {
    #[serde(default)]
    items: Vec<Answer>,
}
/// Subset of the Stack Exchange answer object that we surface.
#[derive(Deserialize)]
struct Answer {
    answer_id: Option<u64>,
    is_accepted: Option<bool>,
    score: Option<i64>,
    // Rendered HTML (present because we request `filter=withbody`).
    body: Option<String>,
    creation_date: Option<i64>,
    last_edit_date: Option<i64>,
    owner: Option<Owner>,
}
/// Post author — only display name and reputation are consumed.
#[derive(Deserialize)]
struct Owner {
    display_name: Option<String>,
    reputation: Option<i64>,
}
#[cfg(test)]
mod tests {
    use super::*;
    // URL gate: question pages (with or without query string) match; the
    // site root, the bare /questions index, user pages and foreign hosts
    // must all be rejected.
    #[test]
    fn matches_question_urls() {
        assert!(matches(
            "https://stackoverflow.com/questions/12345/some-slug"
        ));
        assert!(matches(
            "https://stackoverflow.com/questions/12345/some-slug?answertab=votes"
        ));
        assert!(!matches("https://stackoverflow.com/"));
        assert!(!matches("https://stackoverflow.com/questions"));
        assert!(!matches("https://stackoverflow.com/users/100"));
        assert!(!matches("https://example.com/questions/12345/x"));
    }
    // Id extraction survives slugs, query strings and non-question paths.
    #[test]
    fn parse_question_id_handles_slug_and_query() {
        assert_eq!(
            parse_question_id("https://stackoverflow.com/questions/12345/some-slug"),
            Some(12345)
        );
        assert_eq!(
            parse_question_id("https://stackoverflow.com/questions/12345/some-slug?tab=newest"),
            Some(12345)
        );
        assert_eq!(parse_question_id("https://stackoverflow.com/foo"), None);
    }
}