diff --git a/crates/webclaw-fetch/src/extractors/arxiv.rs b/crates/webclaw-fetch/src/extractors/arxiv.rs
new file mode 100644
index 0000000..cbcb3d1
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/arxiv.rs
@@ -0,0 +1,314 @@
+//! ArXiv paper structured extractor.
+//!
+//! Uses the public ArXiv API at `export.arxiv.org/api/query?id_list={id}`
+//! which returns Atom XML. We parse just enough to surface title, authors,
+//! abstract, categories, and the canonical PDF link. No HTML scraping
+//! required and no auth.
+
+use quick_xml::Reader;
+use quick_xml::events::Event;
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "arxiv",
+    label: "ArXiv paper",
+    description: "Returns paper metadata: title, authors, abstract, categories, primary category, PDF URL.",
+    url_patterns: &[
+        "https://arxiv.org/abs/{id}",
+        "https://arxiv.org/abs/{id}v{n}",
+        "https://arxiv.org/pdf/{id}",
+    ],
+};
+
+/// Quick URL screen: arxiv.org host plus an `/abs/` or `/pdf/` path.
+pub fn matches(url: &str) -> bool {
+    let host = host_of(url);
+    if host != "arxiv.org" && host != "www.arxiv.org" {
+        return false;
+    }
+    url.contains("/abs/") || url.contains("/pdf/")
+}
+
+/// Fetch paper metadata for `url` from the export API in one round-trip.
+///
+/// # Errors
+/// `FetchError::Build` when the id cannot be parsed or the API returns a
+/// non-200 status; `FetchError::BodyDecode` when the Atom payload has no
+/// usable `<entry>` (withdrawn or invalid id).
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let id = parse_id(url)
+        .ok_or_else(|| FetchError::Build(format!("arxiv: cannot parse id from '{url}'")))?;
+
+    let api_url = format!("https://export.arxiv.org/api/query?id_list={id}");
+    let resp = client.fetch(&api_url).await?;
+    if resp.status != 200 {
+        return Err(FetchError::Build(format!(
+            "arxiv api returned status {}",
+            resp.status
+        )));
+    }
+
+    let entry = parse_atom_entry(&resp.html)
+        .ok_or_else(|| FetchError::BodyDecode("arxiv: no <entry> in response".into()))?;
+    if entry.title.is_none() && entry.summary.is_none() {
+        return Err(FetchError::BodyDecode(format!(
+            "arxiv: paper '{id}' returned empty entry (likely withdrawn or invalid id)"
+        )));
+    }
+
+    Ok(json!({
+        "url": url,
+        "id": id,
+        "arxiv_id": entry.id,
+        "title": entry.title,
+        "authors": entry.authors,
+        "abstract": entry.summary.map(|s| collapse_whitespace(&s)),
+        "published": entry.published,
+        "updated": entry.updated,
+        "primary_category": entry.primary_category,
+        "categories": entry.categories,
+        "doi": entry.doi,
+        "comment": entry.comment,
+        "pdf_url": entry.pdf_url,
+        "abs_url": entry.abs_url,
+    }))
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+fn host_of(url: &str) -> &str {
+    url.split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split('/')
+        .next()
+        .unwrap_or("")
+}
+
+/// Parse an arxiv id from a URL. Strips the version suffix (`v2`, `v3`)
+/// and the `.pdf` extension when present.
+fn parse_id(url: &str) -> Option<String> {
+    let after = url
+        .split("/abs/")
+        .nth(1)
+        .or_else(|| url.split("/pdf/").nth(1))?;
+    let stripped = after
+        .split(['?', '#'])
+        .next()?
+        .trim_end_matches('/')
+        .trim_end_matches(".pdf");
+    // Strip optional version suffix, e.g. "2401.12345v2" → "2401.12345".
+    // The non-empty check keeps an id that merely *ends* in 'v' intact.
+    let no_version = match stripped.rfind('v') {
+        Some(i)
+            if !stripped[i + 1..].is_empty()
+                && stripped[i + 1..].chars().all(|c| c.is_ascii_digit()) =>
+        {
+            &stripped[..i]
+        }
+        _ => stripped,
+    };
+    if no_version.is_empty() {
+        None
+    } else {
+        Some(no_version.to_string())
+    }
+}
+
+/// Squash all runs of whitespace (incl. newlines in Atom abstracts) to a
+/// single space.
+fn collapse_whitespace(s: &str) -> String {
+    s.split_whitespace().collect::<Vec<_>>().join(" ")
+}
+
+/// Fields pulled out of the first Atom `<entry>`.
+#[derive(Default)]
+struct AtomEntry {
+    id: Option<String>,
+    title: Option<String>,
+    summary: Option<String>,
+    published: Option<String>,
+    updated: Option<String>,
+    primary_category: Option<String>,
+    categories: Vec<String>,
+    authors: Vec<String>,
+    doi: Option<String>,
+    comment: Option<String>,
+    pdf_url: Option<String>,
+    abs_url: Option<String>,
+}
+
+/// Parse the first `<entry>` block of an ArXiv Atom feed.
+fn parse_atom_entry(xml: &str) -> Option<AtomEntry> {
+    let mut reader = Reader::from_str(xml);
+    let mut buf = Vec::new();
+
+    // States
+    let mut in_entry = false;
+    let mut current: Option<&'static str> = None;
+    let mut in_author = false;
+    let mut in_author_name = false;
+    let mut entry = AtomEntry::default();
+
+    loop {
+        match reader.read_event_into(&mut buf) {
+            Ok(Event::Start(ref e)) => {
+                let local = e.local_name();
+                match local.as_ref() {
+                    b"entry" => in_entry = true,
+                    b"id" if in_entry && !in_author => current = Some("id"),
+                    b"title" if in_entry => current = Some("title"),
+                    b"summary" if in_entry => current = Some("summary"),
+                    b"published" if in_entry => current = Some("published"),
+                    b"updated" if in_entry => current = Some("updated"),
+                    // arxiv:doi / arxiv:comment — AtomEntry declares these and
+                    // extract() emits them, so capture their text too.
+                    b"doi" if in_entry => current = Some("doi"),
+                    b"comment" if in_entry => current = Some("comment"),
+                    b"author" if in_entry => in_author = true,
+                    b"name" if in_author => {
+                        in_author_name = true;
+                        current = Some("author_name");
+                    }
+                    b"category" if in_entry => {
+                        // primary_category is namespaced (arxiv:primary_category)
+                        // category is plain. quick-xml gives us local-name only,
+                        // so we treat both as categories and take the first as
+                        // primary.
+                        for attr in e.attributes().flatten() {
+                            if attr.key.as_ref() == b"term"
+                                && let Ok(v) = attr.unescape_value()
+                            {
+                                let term = v.to_string();
+                                if entry.primary_category.is_none() {
+                                    entry.primary_category = Some(term.clone());
+                                }
+                                entry.categories.push(term);
+                            }
+                        }
+                    }
+                    b"link" if in_entry => {
+                        let mut href = None;
+                        let mut rel = None;
+                        let mut typ = None;
+                        for attr in e.attributes().flatten() {
+                            match attr.key.as_ref() {
+                                b"href" => href = attr.unescape_value().ok().map(|s| s.to_string()),
+                                b"rel" => rel = attr.unescape_value().ok().map(|s| s.to_string()),
+                                b"type" => typ = attr.unescape_value().ok().map(|s| s.to_string()),
+                                _ => {}
+                            }
+                        }
+                        if let Some(h) = href {
+                            if typ.as_deref() == Some("application/pdf") {
+                                entry.pdf_url = Some(h.clone());
+                            }
+                            if rel.as_deref() == Some("alternate") {
+                                entry.abs_url = Some(h);
+                            }
+                        }
+                    }
+                    _ => current = None,
+                }
+            }
+            Ok(Event::Empty(ref e)) => {
+                // Self-closing tags (<link .../>, <category .../>). Same
+                // handling as Start.
+                let local = e.local_name();
+                if (local.as_ref() == b"link" || local.as_ref() == b"category") && in_entry {
+                    let mut href = None;
+                    let mut rel = None;
+                    let mut typ = None;
+                    let mut term = None;
+                    for attr in e.attributes().flatten() {
+                        match attr.key.as_ref() {
+                            b"href" => href = attr.unescape_value().ok().map(|s| s.to_string()),
+                            b"rel" => rel = attr.unescape_value().ok().map(|s| s.to_string()),
+                            b"type" => typ = attr.unescape_value().ok().map(|s| s.to_string()),
+                            b"term" => term = attr.unescape_value().ok().map(|s| s.to_string()),
+                            _ => {}
+                        }
+                    }
+                    if let Some(t) = term {
+                        if entry.primary_category.is_none() {
+                            entry.primary_category = Some(t.clone());
+                        }
+                        entry.categories.push(t);
+                    }
+                    if let Some(h) = href {
+                        if typ.as_deref() == Some("application/pdf") {
+                            entry.pdf_url = Some(h.clone());
+                        }
+                        if rel.as_deref() == Some("alternate") {
+                            entry.abs_url = Some(h);
+                        }
+                    }
+                }
+            }
+            Ok(Event::Text(ref e)) => {
+                if let (Some(field), Ok(text)) = (current, e.unescape()) {
+                    let text = text.to_string();
+                    match field {
+                        "id" => entry.id = Some(text.trim().to_string()),
+                        "title" => entry.title = append_text(entry.title.take(), &text),
+                        "summary" => entry.summary = append_text(entry.summary.take(), &text),
+                        "published" => entry.published = Some(text.trim().to_string()),
+                        "updated" => entry.updated = Some(text.trim().to_string()),
+                        "doi" => entry.doi = Some(text.trim().to_string()),
+                        "comment" => entry.comment = append_text(entry.comment.take(), &text),
+                        "author_name" => entry.authors.push(text.trim().to_string()),
+                        _ => {}
+                    }
+                }
+            }
+            Ok(Event::End(ref e)) => {
+                let local = e.local_name();
+                match local.as_ref() {
+                    b"entry" => break,
+                    b"author" => in_author = false,
+                    b"name" => in_author_name = false,
+                    _ => {}
+                }
+                if !in_author_name {
+                    current = None;
+                }
+            }
+            Ok(Event::Eof) => break,
+            Err(_) => return None,
+            _ => {}
+        }
+        buf.clear();
+    }
+
+    if in_entry { Some(entry) } else { None }
+}
+
+/// Concatenate text fragments (long fields can be split across multiple
+/// text events if they contain entities or CDATA).
+fn append_text(prev: Option<String>, next: &str) -> Option<String> {
+    match prev {
+        Some(mut s) => {
+            s.push_str(next);
+            Some(s)
+        }
+        None => Some(next.to_string()),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_arxiv_urls() {
+        assert!(matches("https://arxiv.org/abs/2401.12345"));
+        assert!(matches("https://arxiv.org/abs/2401.12345v2"));
+        assert!(matches("https://arxiv.org/pdf/2401.12345.pdf"));
+        assert!(!matches("https://arxiv.org/"));
+        assert!(!matches("https://example.com/abs/foo"));
+    }
+
+    #[test]
+    fn parse_id_strips_version_and_extension() {
+        assert_eq!(
+            parse_id("https://arxiv.org/abs/2401.12345"),
+            Some("2401.12345".into())
+        );
+        assert_eq!(
+            parse_id("https://arxiv.org/abs/2401.12345v3"),
+            Some("2401.12345".into())
+        );
+        assert_eq!(
+            parse_id("https://arxiv.org/pdf/2401.12345v2.pdf"),
+            Some("2401.12345".into())
+        );
+    }
+
+    #[test]
+    fn collapse_whitespace_handles_newlines_and_tabs() {
+        assert_eq!(collapse_whitespace("a  b\n\tc "), "a b c");
+    }
+}
diff --git
a/crates/webclaw-fetch/src/extractors/crates_io.rs b/crates/webclaw-fetch/src/extractors/crates_io.rs
new file mode 100644
index 0000000..915b1c3
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/crates_io.rs
@@ -0,0 +1,168 @@
+//! crates.io structured extractor.
+//!
+//! Uses the public JSON API at `crates.io/api/v1/crates/{name}`. No
+//! auth, no rate limit at normal usage. The response includes both
+//! the crate metadata and the full version list, which we summarize
+//! down to a count + latest release info to keep the payload small.
+
+use serde::Deserialize;
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "crates_io",
+    label: "crates.io package",
+    description: "Returns crate metadata: latest version, dependencies, downloads, license, repository.",
+    url_patterns: &[
+        "https://crates.io/crates/{name}",
+        "https://crates.io/crates/{name}/{version}",
+    ],
+};
+
+/// Quick URL screen: crates.io host plus a `/crates/` path.
+pub fn matches(url: &str) -> bool {
+    let host = host_of(url);
+    if host != "crates.io" && host != "www.crates.io" {
+        return false;
+    }
+    url.contains("/crates/")
+}
+
+/// Fetch crate metadata for `url` in one API round-trip.
+///
+/// # Errors
+/// `FetchError::Build` on unparsable URL, 404, or non-200 status;
+/// `FetchError::BodyDecode` when the JSON body cannot be deserialized.
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let name = parse_name(url)
+        .ok_or_else(|| FetchError::Build(format!("crates.io: cannot parse name from '{url}'")))?;
+
+    let api_url = format!("https://crates.io/api/v1/crates/{name}");
+    let resp = client.fetch(&api_url).await?;
+    if resp.status == 404 {
+        return Err(FetchError::Build(format!(
+            "crates.io: crate '{name}' not found"
+        )));
+    }
+    if resp.status != 200 {
+        return Err(FetchError::Build(format!(
+            "crates.io api returned status {}",
+            resp.status
+        )));
+    }
+
+    let body: CratesResponse = serde_json::from_str(&resp.html)
+        .map_err(|e| FetchError::BodyDecode(format!("crates.io parse: {e}")))?;
+
+    let c = body.crate_;
+    // First non-yanked version is the latest release; fall back to the
+    // newest entry even if yanked so the summary fields stay populated.
+    let latest_version = body
+        .versions
+        .iter()
+        .find(|v| !v.yanked.unwrap_or(false))
+        .or_else(|| body.versions.first());
+
+    Ok(json!({
+        "url": url,
+        "name": c.id,
+        "description": c.description,
+        "homepage": c.homepage,
+        "documentation": c.documentation,
+        "repository": c.repository,
+        "max_stable_version": c.max_stable_version,
+        "max_version": c.max_version,
+        "newest_version": c.newest_version,
+        "downloads": c.downloads,
+        "recent_downloads": c.recent_downloads,
+        "categories": c.categories,
+        "keywords": c.keywords,
+        "release_count": body.versions.len(),
+        "latest_release_date": latest_version.and_then(|v| v.created_at.clone()),
+        "latest_license": latest_version.and_then(|v| v.license.clone()),
+        "latest_rust_version": latest_version.and_then(|v| v.rust_version.clone()),
+        "latest_yanked": latest_version.and_then(|v| v.yanked),
+        "created_at": c.created_at,
+        "updated_at": c.updated_at,
+    }))
+}
+
+fn host_of(url: &str) -> &str {
+    url.split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split('/')
+        .next()
+        .unwrap_or("")
+}
+
+/// First path segment after `/crates/`, with query/fragment stripped.
+fn parse_name(url: &str) -> Option<String> {
+    let after = url.split("/crates/").nth(1)?;
+    let stripped = after.split(['?', '#']).next()?.trim_end_matches('/');
+    let first = stripped.split('/').find(|s| !s.is_empty())?;
+    Some(first.to_string())
+}
+
+// ---------------------------------------------------------------------------
+// crates.io API types
+// ---------------------------------------------------------------------------
+
+#[derive(Deserialize)]
+struct CratesResponse {
+    #[serde(rename = "crate")]
+    crate_: CrateInfo,
+    #[serde(default)]
+    versions: Vec<VersionInfo>,
+}
+
+#[derive(Deserialize)]
+struct CrateInfo {
+    id: Option<String>,
+    description: Option<String>,
+    homepage: Option<String>,
+    documentation: Option<String>,
+    repository: Option<String>,
+    max_stable_version: Option<String>,
+    max_version: Option<String>,
+    newest_version: Option<String>,
+    downloads: Option<u64>,
+    recent_downloads: Option<u64>,
+    #[serde(default)]
+    categories: Vec<String>,
+    #[serde(default)]
+    keywords: Vec<String>,
+    created_at: Option<String>,
+    updated_at: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct VersionInfo {
+    license: Option<String>,
+    rust_version: Option<String>,
+    yanked: Option<bool>,
+    created_at: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_crate_pages() {
+        assert!(matches("https://crates.io/crates/serde"));
+        assert!(matches("https://crates.io/crates/tokio/1.45.0"));
+        assert!(!matches("https://crates.io/"));
+        assert!(!matches("https://example.com/crates/foo"));
+    }
+
+    #[test]
+    fn parse_name_handles_versioned_urls() {
+        assert_eq!(
+            parse_name("https://crates.io/crates/serde"),
+            Some("serde".into())
+        );
+        assert_eq!(
+            parse_name("https://crates.io/crates/tokio/1.45.0"),
+            Some("tokio".into())
+        );
+        assert_eq!(
+            parse_name("https://crates.io/crates/scraper/?foo=bar"),
+            Some("scraper".into())
+        );
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/dev_to.rs b/crates/webclaw-fetch/src/extractors/dev_to.rs
new file mode 100644
index 0000000..49372ce
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/dev_to.rs
@@ -0,0 +1,188 @@
+//! dev.to article structured extractor.
+//!
+//! `dev.to/api/articles/{username}/{slug}` returns the full article body,
+//! tags, reaction count, comment count, and reading time. Anonymous
+//! access works fine for published posts.
+
+use serde::Deserialize;
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "dev_to",
+    label: "dev.to article",
+    description: "Returns article metadata + body: title, body markdown, tags, reactions, comments, reading time.",
+    url_patterns: &["https://dev.to/{username}/{slug}"],
+};
+
+/// dev.to host plus an exactly-two-segment path whose first segment is
+/// not a reserved site route.
+pub fn matches(url: &str) -> bool {
+    let host = host_of(url);
+    if host != "dev.to" && host != "www.dev.to" {
+        return false;
+    }
+    let path = url
+        .split("://")
+        .nth(1)
+        .and_then(|s| s.split_once('/'))
+        .map(|(_, p)| p)
+        .unwrap_or("");
+    let stripped = path
+        .split(['?', '#'])
+        .next()
+        .unwrap_or("")
+        .trim_end_matches('/');
+    let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
+    // Need exactly /{username}/{slug}, with username starting with non-reserved.
+    segs.len() == 2 && !RESERVED_FIRST_SEGS.contains(&segs[0])
+}
+
+const RESERVED_FIRST_SEGS: &[&str] = &[
+    "api",
+    "tags",
+    "search",
+    "settings",
+    "enter",
+    "signup",
+    "about",
+    "code-of-conduct",
+    "privacy",
+    "terms",
+    "contact",
+    "sponsorships",
+    "sponsors",
+    "shop",
+    "videos",
+    "listings",
+    "podcasts",
+    "p",
+    "t",
+];
+
+/// Fetch article metadata + body for `url` in one API round-trip.
+///
+/// # Errors
+/// `FetchError::Build` on unparsable URL, 404, or non-200 status;
+/// `FetchError::BodyDecode` on a malformed JSON body.
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let (username, slug) = parse_username_slug(url).ok_or_else(|| {
+        FetchError::Build(format!("dev_to: cannot parse username/slug from '{url}'"))
+    })?;
+
+    let api_url = format!("https://dev.to/api/articles/{username}/{slug}");
+    let resp = client.fetch(&api_url).await?;
+    if resp.status == 404 {
+        return Err(FetchError::Build(format!(
+            "dev_to: article '{username}/{slug}' not found"
+        )));
+    }
+    if resp.status != 200 {
+        return Err(FetchError::Build(format!(
+            "dev.to api returned status {}",
+            resp.status
+        )));
+    }
+
+    let a: Article = serde_json::from_str(&resp.html)
+        .map_err(|e| FetchError::BodyDecode(format!("dev.to parse: {e}")))?;
+
+    Ok(json!({
+        "url": url,
+        "id": a.id,
+        "title": a.title,
+        "description": a.description,
+        "body_markdown": a.body_markdown,
+        "url_canonical": a.canonical_url,
+        "published_at": a.published_at,
+        "edited_at": a.edited_at,
+        "reading_time_min": a.reading_time_minutes,
+        "tags": a.tag_list,
+        "positive_reactions": a.positive_reactions_count,
+        "public_reactions": a.public_reactions_count,
+        "comments_count": a.comments_count,
+        "page_views_count": a.page_views_count,
+        "cover_image": a.cover_image,
+        "author": json!({
+            "username": a.user.as_ref().and_then(|u| u.username.clone()),
+            "name": a.user.as_ref().and_then(|u| u.name.clone()),
+            "twitter": a.user.as_ref().and_then(|u| u.twitter_username.clone()),
+            "github": a.user.as_ref().and_then(|u| u.github_username.clone()),
+            "website": a.user.as_ref().and_then(|u| u.website_url.clone()),
+        }),
+    }))
+}
+
+fn host_of(url: &str) -> &str {
+    url.split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split('/')
+        .next()
+        .unwrap_or("")
+}
+
+/// First two path segments as `(username, slug)`.
+fn parse_username_slug(url: &str) -> Option<(String, String)> {
+    let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
+    let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
+    let mut segs = stripped.split('/').filter(|s| !s.is_empty());
+    let username = segs.next()?;
+    let slug = segs.next()?;
+    Some((username.to_string(), slug.to_string()))
+}
+
+// ---------------------------------------------------------------------------
+// dev.to API types
+// ---------------------------------------------------------------------------
+
+#[derive(Deserialize)]
+struct Article {
+    id: Option<u64>,
+    title: Option<String>,
+    description: Option<String>,
+    body_markdown: Option<String>,
+    canonical_url: Option<String>,
+    published_at: Option<String>,
+    edited_at: Option<String>,
+    reading_time_minutes: Option<u64>,
+    tag_list: Option<Value>, // string OR array depending on endpoint
+    positive_reactions_count: Option<u64>,
+    public_reactions_count: Option<u64>,
+    comments_count: Option<u64>,
+    page_views_count: Option<u64>,
+    cover_image: Option<String>,
+    user: Option<UserRef>,
+}
+
+#[derive(Deserialize)]
+struct UserRef {
+    username: Option<String>,
+    name: Option<String>,
+    twitter_username: Option<String>,
+    github_username: Option<String>,
+    website_url: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_article_urls() {
+        assert!(matches("https://dev.to/ben/welcome-thread"));
+        assert!(matches("https://dev.to/0xmassi/some-post-1abc"));
+        assert!(!matches("https://dev.to/"));
+        assert!(!matches("https://dev.to/api/articles/foo/bar"));
+        assert!(!matches("https://dev.to/tags/rust"));
+        assert!(!matches("https://dev.to/ben")); // user profile, not article
+        assert!(!matches("https://example.com/ben/post"));
+    }
+
+    #[test]
+    fn parse_pulls_username_and_slug() {
+        assert_eq!(
+            parse_username_slug("https://dev.to/ben/welcome-thread"),
+            Some(("ben".into(), "welcome-thread".into()))
+        );
+        assert_eq!(
+            parse_username_slug("https://dev.to/0xmassi/some-post-1abc/?foo=bar"),
+            Some(("0xmassi".into(), "some-post-1abc".into()))
+        );
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/docker_hub.rs b/crates/webclaw-fetch/src/extractors/docker_hub.rs
new file mode 100644
index 0000000..15c928c
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/docker_hub.rs
@@ -0,0 +1,150 @@
+//! Docker Hub repository structured extractor.
+//!
+//! Uses the v2 JSON API at `hub.docker.com/v2/repositories/{namespace}/{name}`.
+//! Anonymous access is allowed for public images. The official-image
+//! shorthand (e.g. `nginx`, `redis`) is normalized to `library/{name}`.
+
+use serde::Deserialize;
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "docker_hub",
+    label: "Docker Hub repository",
+    description: "Returns image metadata: pull count, star count, last_updated, official flag, description.",
+    url_patterns: &[
+        "https://hub.docker.com/_/{name}",
+        "https://hub.docker.com/r/{namespace}/{name}",
+    ],
+};
+
+/// hub.docker.com host plus an official (`/_/`) or personal (`/r/`) path.
+pub fn matches(url: &str) -> bool {
+    let host = host_of(url);
+    if host != "hub.docker.com" {
+        return false;
+    }
+    url.contains("/_/") || url.contains("/r/")
+}
+
+/// Fetch repository metadata for `url` in one API round-trip.
+///
+/// # Errors
+/// `FetchError::Build` on unparsable URL, 404, or non-200 status;
+/// `FetchError::BodyDecode` on a malformed JSON body.
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let (namespace, name) = parse_repo(url)
+        .ok_or_else(|| FetchError::Build(format!("docker_hub: cannot parse repo from '{url}'")))?;
+
+    let api_url = format!("https://hub.docker.com/v2/repositories/{namespace}/{name}");
+    let resp = client.fetch(&api_url).await?;
+    if resp.status == 404 {
+        return Err(FetchError::Build(format!(
+            "docker_hub: repo '{namespace}/{name}' not found"
+        )));
+    }
+    if resp.status != 200 {
+        return Err(FetchError::Build(format!(
+            "docker_hub api returned status {}",
+            resp.status
+        )));
+    }
+
+    let r: RepoResponse = serde_json::from_str(&resp.html)
+        .map_err(|e| FetchError::BodyDecode(format!("docker_hub parse: {e}")))?;
+
+    Ok(json!({
+        "url": url,
+        "namespace": r.namespace,
+        "name": r.name,
+        "full_name": format!("{namespace}/{name}"),
+        "pull_count": r.pull_count,
+        "star_count": r.star_count,
+        "description": r.description,
+        "full_description": r.full_description,
+        "last_updated": r.last_updated,
+        "date_registered": r.date_registered,
+        "is_official": namespace == "library",
+        "is_private": r.is_private,
+        "status_description": r.status_description,
+        "categories": r.categories,
+    }))
+}
+
+fn host_of(url: &str) -> &str {
+    url.split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split('/')
+        .next()
+        .unwrap_or("")
+}
+
+/// Parse `(namespace, name)` from a Docker Hub URL. The official-image
+/// shorthand `/_/nginx` maps to `(library, nginx)`. Personal repos
+/// `/r/foo/bar` map to `(foo, bar)`.
+fn parse_repo(url: &str) -> Option<(String, String)> {
+    if let Some(after) = url.split("/_/").nth(1) {
+        let stripped = after.split(['?', '#']).next()?.trim_end_matches('/');
+        let name = stripped.split('/').next().filter(|s| !s.is_empty())?;
+        return Some(("library".into(), name.to_string()));
+    }
+    let after = url.split("/r/").nth(1)?;
+    let stripped = after.split(['?', '#']).next()?.trim_end_matches('/');
+    let mut segs = stripped.split('/').filter(|s| !s.is_empty());
+    let ns = segs.next()?;
+    let nm = segs.next()?;
+    Some((ns.to_string(), nm.to_string()))
+}
+
+#[derive(Deserialize)]
+struct RepoResponse {
+    namespace: Option<String>,
+    name: Option<String>,
+    pull_count: Option<u64>,
+    star_count: Option<u64>,
+    description: Option<String>,
+    full_description: Option<String>,
+    last_updated: Option<String>,
+    date_registered: Option<String>,
+    is_private: Option<bool>,
+    status_description: Option<String>,
+    #[serde(default)]
+    categories: Vec<DockerCategory>,
+}
+
+// Serialize too: categories are passed straight through into the json! output.
+#[derive(Deserialize, serde::Serialize)]
+struct DockerCategory {
+    name: Option<String>,
+    slug: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_docker_urls() {
+        assert!(matches("https://hub.docker.com/_/nginx"));
+        assert!(matches("https://hub.docker.com/r/grafana/grafana"));
+        assert!(!matches("https://hub.docker.com/"));
+        assert!(!matches("https://example.com/_/nginx"));
+    }
+
+    #[test]
+    fn parse_repo_handles_official_and_personal() {
+        assert_eq!(
+            parse_repo("https://hub.docker.com/_/nginx"),
+            Some(("library".into(), "nginx".into()))
+        );
+        assert_eq!(
+            parse_repo("https://hub.docker.com/_/nginx/tags"),
+            Some(("library".into(), "nginx".into()))
+        );
+        assert_eq!(
+            parse_repo("https://hub.docker.com/r/grafana/grafana"),
+            Some(("grafana".into(), "grafana".into()))
+        );
+        assert_eq!(
+            parse_repo("https://hub.docker.com/r/grafana/grafana/?foo=bar"),
+            Some(("grafana".into(), "grafana".into()))
+        );
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/github_pr.rs b/crates/webclaw-fetch/src/extractors/github_pr.rs
new file mode 100644
index 0000000..9d4b95a
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/github_pr.rs
@@ -0,0 +1,189 @@
+//! GitHub pull request structured extractor.
+//!
+//! Uses `api.github.com/repos/{owner}/{repo}/pulls/{number}`. Returns
+//! the PR metadata + a counted summary of comments and review activity.
+//! Full diff and per-comment bodies require additional calls — left for
+//! a follow-up enhancement so the v1 stays one network round-trip.
+
+use serde::Deserialize;
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "github_pr",
+    label: "GitHub pull request",
+    description: "Returns PR metadata: title, body, state, author, labels, additions/deletions, file count.",
+    url_patterns: &["https://github.com/{owner}/{repo}/pull/{number}"],
+};
+
+/// github.com host plus a parsable `/pull/{number}` path.
+pub fn matches(url: &str) -> bool {
+    let host = url
+        .split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split('/')
+        .next()
+        .unwrap_or("");
+    if host != "github.com" && host != "www.github.com" {
+        return false;
+    }
+    parse_pr(url).is_some()
+}
+
+/// Fetch pull-request metadata for `url` in one API round-trip.
+///
+/// # Errors
+/// `FetchError::Build` on unparsable URL, 404, 403 (rate limit), or any
+/// other non-200 status; `FetchError::BodyDecode` on a malformed body.
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let (owner, repo, number) = parse_pr(url).ok_or_else(|| {
+        FetchError::Build(format!("github_pr: cannot parse pull-request URL '{url}'"))
+    })?;
+
+    let api_url = format!("https://api.github.com/repos/{owner}/{repo}/pulls/{number}");
+    let resp = client.fetch(&api_url).await?;
+    if resp.status == 404 {
+        return Err(FetchError::Build(format!(
+            "github_pr: pull request '{owner}/{repo}#{number}' not found"
+        )));
+    }
+    if resp.status == 403 {
+        return Err(FetchError::Build(
+            "github_pr: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour.".into(),
+        ));
+    }
+    if resp.status != 200 {
+        return Err(FetchError::Build(format!(
+            "github api returned status {}",
+            resp.status
+        )));
+    }
+
+    let p: PullRequest = serde_json::from_str(&resp.html)
+        .map_err(|e| FetchError::BodyDecode(format!("github pr parse: {e}")))?;
+
+    Ok(json!({
+        "url": url,
+        "owner": owner,
+        "repo": repo,
+        "number": p.number,
+        "title": p.title,
+        "body": p.body,
+        "state": p.state,
+        "draft": p.draft,
+        "merged": p.merged,
+        "merged_at": p.merged_at,
+        "merge_commit_sha": p.merge_commit_sha,
+        "author": p.user.as_ref().and_then(|u| u.login.clone()),
+        "labels": p.labels.iter().filter_map(|l| l.name.clone()).collect::<Vec<_>>(),
+        "milestone": p.milestone.as_ref().and_then(|m| m.title.clone()),
+        "head_ref": p.head.as_ref().and_then(|r| r.ref_name.clone()),
+        "base_ref": p.base.as_ref().and_then(|r| r.ref_name.clone()),
+        "head_sha": p.head.as_ref().and_then(|r| r.sha.clone()),
+        "additions": p.additions,
+        "deletions": p.deletions,
+        "changed_files": p.changed_files,
+        "commits": p.commits,
+        "comments": p.comments,
+        "review_comments": p.review_comments,
+        "created_at": p.created_at,
+        "updated_at": p.updated_at,
+        "closed_at": p.closed_at,
+        "html_url": p.html_url,
+    }))
+}
+
+fn parse_pr(url: &str) -> Option<(String, String, u64)> {
+    let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
+    let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
+    let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
+    // /{owner}/{repo}/pull/{number} (or /pulls/{number} variant)
+    if segs.len() < 4 {
+        return None;
+    }
+    if segs[2] != "pull" && segs[2] != "pulls" {
+        return None;
+    }
+    let number: u64 = segs[3].parse().ok()?;
+    Some((segs[0].to_string(), segs[1].to_string(), number))
+}
+
+// ---------------------------------------------------------------------------
+// GitHub PR API types
+// ---------------------------------------------------------------------------
+
+#[derive(Deserialize)]
+struct PullRequest {
+    number: Option<u64>,
+    title: Option<String>,
+    body: Option<String>,
+    state: Option<String>,
+    draft: Option<bool>,
+    merged: Option<bool>,
+    merged_at: Option<String>,
+    merge_commit_sha: Option<String>,
+    user: Option<UserRef>,
+    #[serde(default)]
+    labels: Vec<LabelRef>,
+    milestone: Option<Milestone>,
+    head: Option<GitRef>,
+    base: Option<GitRef>,
+    additions: Option<u64>,
+    deletions: Option<u64>,
+    changed_files: Option<u64>,
+    commits: Option<u64>,
+    comments: Option<u64>,
+    review_comments: Option<u64>,
+    created_at: Option<String>,
+    updated_at: Option<String>,
+    closed_at: Option<String>,
+    html_url: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct UserRef {
+    login: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct LabelRef {
+    name: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct Milestone {
+    title: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct GitRef {
+    #[serde(rename = "ref")]
+    ref_name: Option<String>,
+    sha: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_pr_urls() {
+        assert!(matches("https://github.com/rust-lang/rust/pull/12345"));
+        assert!(matches(
+            "https://github.com/rust-lang/rust/pull/12345/files"
+        ));
+        assert!(!matches("https://github.com/rust-lang/rust"));
+        assert!(!matches("https://github.com/rust-lang/rust/issues/100"));
+        assert!(!matches("https://github.com/rust-lang"));
+    }
+
+    #[test]
+    fn parse_pr_extracts_owner_repo_number() {
+        assert_eq!(
+            parse_pr("https://github.com/rust-lang/rust/pull/12345"),
+            Some(("rust-lang".into(), "rust".into(), 12345))
+        );
+        assert_eq!(
+            parse_pr("https://github.com/rust-lang/rust/pull/12345/files"),
+            Some(("rust-lang".into(), "rust".into(), 12345))
+        );
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/github_release.rs b/crates/webclaw-fetch/src/extractors/github_release.rs
new file mode 100644
index 0000000..b019550
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/github_release.rs
@@ -0,0 +1,179 @@
+//! GitHub release structured extractor.
+//!
+//!
`api.github.com/repos/{owner}/{repo}/releases/tags/{tag}`. Returns
+//! the release notes body, asset list with download counts, and
+//! prerelease flag.
+
+use serde::Deserialize;
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "github_release",
+    label: "GitHub release",
+    description: "Returns release metadata: tag, name, body (release notes), assets with download counts.",
+    url_patterns: &["https://github.com/{owner}/{repo}/releases/tag/{tag}"],
+};
+
+/// github.com host plus a parsable `/releases/tag/{tag}` path.
+pub fn matches(url: &str) -> bool {
+    let host = url
+        .split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split('/')
+        .next()
+        .unwrap_or("");
+    if host != "github.com" && host != "www.github.com" {
+        return false;
+    }
+    parse_release(url).is_some()
+}
+
+/// Fetch release metadata for `url` in one API round-trip.
+///
+/// # Errors
+/// `FetchError::Build` on unparsable URL, 404, 403 (rate limit), or any
+/// other non-200 status; `FetchError::BodyDecode` on a malformed body.
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let (owner, repo, tag) = parse_release(url).ok_or_else(|| {
+        FetchError::Build(format!("github_release: cannot parse release URL '{url}'"))
+    })?;
+
+    let api_url = format!("https://api.github.com/repos/{owner}/{repo}/releases/tags/{tag}");
+    let resp = client.fetch(&api_url).await?;
+    if resp.status == 404 {
+        return Err(FetchError::Build(format!(
+            "github_release: release '{owner}/{repo}@{tag}' not found"
+        )));
+    }
+    if resp.status == 403 {
+        return Err(FetchError::Build(
+            "github_release: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour."
+                .into(),
+        ));
+    }
+    if resp.status != 200 {
+        return Err(FetchError::Build(format!(
+            "github api returned status {}",
+            resp.status
+        )));
+    }
+
+    let r: Release = serde_json::from_str(&resp.html)
+        .map_err(|e| FetchError::BodyDecode(format!("github release parse: {e}")))?;
+
+    // Flatten assets to plain JSON objects so the payload needs no schema.
+    let assets: Vec<Value> = r
+        .assets
+        .iter()
+        .map(|a| {
+            json!({
+                "name": a.name,
+                "size": a.size,
+                "download_count": a.download_count,
+                "browser_download_url": a.browser_download_url,
+                "content_type": a.content_type,
+                "created_at": a.created_at,
+                "updated_at": a.updated_at,
+            })
+        })
+        .collect();
+
+    Ok(json!({
+        "url": url,
+        "owner": owner,
+        "repo": repo,
+        "tag_name": r.tag_name,
+        "name": r.name,
+        "body": r.body,
+        "draft": r.draft,
+        "prerelease": r.prerelease,
+        "author": r.author.as_ref().and_then(|u| u.login.clone()),
+        "created_at": r.created_at,
+        "published_at": r.published_at,
+        "asset_count": assets.len(),
+        "total_downloads": r.assets.iter().map(|a| a.download_count.unwrap_or(0)).sum::<u64>(),
+        "assets": assets,
+        "html_url": r.html_url,
+    }))
+}
+
+fn parse_release(url: &str) -> Option<(String, String, String)> {
+    let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
+    let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
+    let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
+    // /{owner}/{repo}/releases/tag/{tag}
+    if segs.len() < 5 {
+        return None;
+    }
+    if segs[2] != "releases" || segs[3] != "tag" {
+        return None;
+    }
+    Some((
+        segs[0].to_string(),
+        segs[1].to_string(),
+        segs[4].to_string(),
+    ))
+}
+
+// ---------------------------------------------------------------------------
+// GitHub Release API types
+// ---------------------------------------------------------------------------
+
+#[derive(Deserialize)]
+struct Release {
+    tag_name: Option<String>,
+    name: Option<String>,
+    body: Option<String>,
+    draft: Option<bool>,
+    prerelease: Option<bool>,
+    author: Option<UserRef>,
+    created_at: Option<String>,
+    published_at: Option<String>,
+    html_url: Option<String>,
+    #[serde(default)]
+    assets: Vec<Asset>,
+}
+
+#[derive(Deserialize)]
+struct UserRef {
+    login: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct Asset {
+    name: Option<String>,
+    size: Option<u64>,
+    download_count: Option<u64>,
+    browser_download_url: Option<String>,
+    content_type: Option<String>,
+    created_at: Option<String>,
+    updated_at: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_release_urls() {
+        assert!(matches(
+            "https://github.com/rust-lang/rust/releases/tag/1.85.0"
+        ));
+        assert!(matches(
+            "https://github.com/0xMassi/webclaw/releases/tag/v0.4.0"
+        ));
+        assert!(!matches("https://github.com/rust-lang/rust"));
+        assert!(!matches("https://github.com/rust-lang/rust/releases"));
+        assert!(!matches("https://github.com/rust-lang/rust/pull/100"));
+    }
+
+    #[test]
+    fn parse_release_extracts_owner_repo_tag() {
+        assert_eq!(
+            parse_release("https://github.com/0xMassi/webclaw/releases/tag/v0.4.0"),
+            Some(("0xMassi".into(), "webclaw".into(), "v0.4.0".into()))
+        );
+        assert_eq!(
+            parse_release("https://github.com/rust-lang/rust/releases/tag/1.85.0/?foo=bar"),
+            Some(("rust-lang".into(), "rust".into(), "1.85.0".into()))
+        );
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/huggingface_dataset.rs b/crates/webclaw-fetch/src/extractors/huggingface_dataset.rs
new file mode 100644
index 0000000..cb1f524
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/huggingface_dataset.rs
@@ -0,0 +1,189 @@
+//! HuggingFace dataset structured extractor.
+//!
+//! Same shape as the model extractor but hits the dataset endpoint.
+//! `huggingface.co/api/datasets/{owner}/{name}`.
+ +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "huggingface_dataset", + label: "HuggingFace dataset", + description: "Returns dataset metadata: downloads, likes, license, language, task categories, file list.", + url_patterns: &["https://huggingface.co/datasets/{owner}/{name}"], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if host != "huggingface.co" && host != "www.huggingface.co" { + return false; + } + let path = url + .split("://") + .nth(1) + .and_then(|s| s.split_once('/')) + .map(|(_, p)| p) + .unwrap_or(""); + let stripped = path + .split(['?', '#']) + .next() + .unwrap_or("") + .trim_end_matches('/'); + let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect(); + // /datasets/{name} (legacy top-level) or /datasets/{owner}/{name} (canonical). + segs.first().copied() == Some("datasets") && (segs.len() == 2 || segs.len() == 3) +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let dataset_path = parse_dataset_path(url).ok_or_else(|| { + FetchError::Build(format!( + "hf_dataset: cannot parse dataset path from '{url}'" + )) + })?; + + let api_url = format!("https://huggingface.co/api/datasets/{dataset_path}"); + let resp = client.fetch(&api_url).await?; + if resp.status == 404 { + return Err(FetchError::Build(format!( + "hf_dataset: '{dataset_path}' not found" + ))); + } + if resp.status == 401 { + return Err(FetchError::Build(format!( + "hf_dataset: '{dataset_path}' requires authentication (gated)" + ))); + } + if resp.status != 200 { + return Err(FetchError::Build(format!( + "hf_dataset api returned status {}", + resp.status + ))); + } + + let d: DatasetInfo = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("hf_dataset parse: {e}")))?; + + let files: Vec = d + .siblings + .iter() + .map(|s| 
json!({"rfilename": s.rfilename, "size": s.size})) + .collect(); + + Ok(json!({ + "url": url, + "id": d.id, + "private": d.private, + "gated": d.gated, + "downloads": d.downloads, + "downloads_30d": d.downloads_all_time, + "likes": d.likes, + "tags": d.tags, + "license": d.card_data.as_ref().and_then(|c| c.license.clone()), + "language": d.card_data.as_ref().and_then(|c| c.language.clone()), + "task_categories": d.card_data.as_ref().and_then(|c| c.task_categories.clone()), + "size_categories": d.card_data.as_ref().and_then(|c| c.size_categories.clone()), + "annotations_creators": d.card_data.as_ref().and_then(|c| c.annotations_creators.clone()), + "configs": d.card_data.as_ref().and_then(|c| c.configs.clone()), + "created_at": d.created_at, + "last_modified": d.last_modified, + "sha": d.sha, + "file_count": d.siblings.len(), + "files": files, + })) +} + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +/// Returns the part to append to the API URL — either `name` (legacy +/// top-level dataset like `squad`) or `owner/name` (canonical form). 
+fn parse_dataset_path(url: &str) -> Option { + let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?; + let stripped = path.split(['?', '#']).next()?.trim_end_matches('/'); + let mut segs = stripped.split('/').filter(|s| !s.is_empty()); + if segs.next() != Some("datasets") { + return None; + } + let first = segs.next()?.to_string(); + match segs.next() { + Some(second) => Some(format!("{first}/{second}")), + None => Some(first), + } +} + +#[derive(Deserialize)] +struct DatasetInfo { + id: Option, + private: Option, + gated: Option, + downloads: Option, + #[serde(rename = "downloadsAllTime")] + downloads_all_time: Option, + likes: Option, + #[serde(default)] + tags: Vec, + #[serde(rename = "createdAt")] + created_at: Option, + #[serde(rename = "lastModified")] + last_modified: Option, + sha: Option, + #[serde(rename = "cardData")] + card_data: Option, + #[serde(default)] + siblings: Vec, +} + +#[derive(Deserialize)] +struct DatasetCard { + license: Option, + language: Option, + task_categories: Option, + size_categories: Option, + annotations_creators: Option, + configs: Option, +} + +#[derive(Deserialize)] +struct Sibling { + rfilename: String, + size: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_dataset_pages() { + assert!(matches("https://huggingface.co/datasets/squad")); // legacy top-level + assert!(matches("https://huggingface.co/datasets/openai/gsm8k")); // canonical owner/name + assert!(!matches("https://huggingface.co/openai/whisper-large-v3")); + assert!(!matches("https://huggingface.co/datasets/")); + } + + #[test] + fn parse_dataset_path_works() { + assert_eq!( + parse_dataset_path("https://huggingface.co/datasets/squad"), + Some("squad".into()) + ); + assert_eq!( + parse_dataset_path("https://huggingface.co/datasets/openai/gsm8k"), + Some("openai/gsm8k".into()) + ); + assert_eq!( + parse_dataset_path("https://huggingface.co/datasets/openai/gsm8k/?lib=transformers"), + Some("openai/gsm8k".into()) + ); + 
} +} diff --git a/crates/webclaw-fetch/src/extractors/mod.rs b/crates/webclaw-fetch/src/extractors/mod.rs index b9a539b..8cf8ba5 100644 --- a/crates/webclaw-fetch/src/extractors/mod.rs +++ b/crates/webclaw-fetch/src/extractors/mod.rs @@ -14,12 +14,20 @@ //! exists (Reddit, HN/Algolia, PyPI, npm, GitHub, HuggingFace all have //! one). HTML extraction is the fallback for sites that don't. +pub mod arxiv; +pub mod crates_io; +pub mod dev_to; +pub mod docker_hub; +pub mod github_pr; +pub mod github_release; pub mod github_repo; pub mod hackernews; +pub mod huggingface_dataset; pub mod huggingface_model; pub mod npm; pub mod pypi; pub mod reddit; +pub mod stackoverflow; use serde::Serialize; use serde_json::Value; @@ -48,9 +56,17 @@ pub fn list() -> Vec { reddit::INFO, hackernews::INFO, github_repo::INFO, + github_pr::INFO, + github_release::INFO, pypi::INFO, npm::INFO, + crates_io::INFO, huggingface_model::INFO, + huggingface_dataset::INFO, + arxiv::INFO, + docker_hub::INFO, + dev_to::INFO, + stackoverflow::INFO, ] } @@ -92,6 +108,27 @@ pub async fn dispatch_by_url( if npm::matches(url) { return Some(npm::extract(client, url).await.map(|v| (npm::INFO.name, v))); } + if github_pr::matches(url) { + return Some( + github_pr::extract(client, url) + .await + .map(|v| (github_pr::INFO.name, v)), + ); + } + if github_release::matches(url) { + return Some( + github_release::extract(client, url) + .await + .map(|v| (github_release::INFO.name, v)), + ); + } + if crates_io::matches(url) { + return Some( + crates_io::extract(client, url) + .await + .map(|v| (crates_io::INFO.name, v)), + ); + } if huggingface_model::matches(url) { return Some( huggingface_model::extract(client, url) @@ -99,6 +136,41 @@ pub async fn dispatch_by_url( .map(|v| (huggingface_model::INFO.name, v)), ); } + if huggingface_dataset::matches(url) { + return Some( + huggingface_dataset::extract(client, url) + .await + .map(|v| (huggingface_dataset::INFO.name, v)), + ); + } + if arxiv::matches(url) { + return 
Some( + arxiv::extract(client, url) + .await + .map(|v| (arxiv::INFO.name, v)), + ); + } + if docker_hub::matches(url) { + return Some( + docker_hub::extract(client, url) + .await + .map(|v| (docker_hub::INFO.name, v)), + ); + } + if dev_to::matches(url) { + return Some( + dev_to::extract(client, url) + .await + .map(|v| (dev_to::INFO.name, v)), + ); + } + if stackoverflow::matches(url) { + return Some( + stackoverflow::extract(client, url) + .await + .map(|v| (stackoverflow::INFO.name, v)), + ); + } None } @@ -136,12 +208,57 @@ pub async fn dispatch_by_name( n if n == npm::INFO.name => { run_or_mismatch(npm::matches(url), n, url, || npm::extract(client, url)).await } + n if n == github_pr::INFO.name => { + run_or_mismatch(github_pr::matches(url), n, url, || { + github_pr::extract(client, url) + }) + .await + } + n if n == github_release::INFO.name => { + run_or_mismatch(github_release::matches(url), n, url, || { + github_release::extract(client, url) + }) + .await + } + n if n == crates_io::INFO.name => { + run_or_mismatch(crates_io::matches(url), n, url, || { + crates_io::extract(client, url) + }) + .await + } n if n == huggingface_model::INFO.name => { run_or_mismatch(huggingface_model::matches(url), n, url, || { huggingface_model::extract(client, url) }) .await } + n if n == huggingface_dataset::INFO.name => { + run_or_mismatch(huggingface_dataset::matches(url), n, url, || { + huggingface_dataset::extract(client, url) + }) + .await + } + n if n == arxiv::INFO.name => { + run_or_mismatch(arxiv::matches(url), n, url, || arxiv::extract(client, url)).await + } + n if n == docker_hub::INFO.name => { + run_or_mismatch(docker_hub::matches(url), n, url, || { + docker_hub::extract(client, url) + }) + .await + } + n if n == dev_to::INFO.name => { + run_or_mismatch(dev_to::matches(url), n, url, || { + dev_to::extract(client, url) + }) + .await + } + n if n == stackoverflow::INFO.name => { + run_or_mismatch(stackoverflow::matches(url), n, url, || { + 
stackoverflow::extract(client, url) + }) + .await + } _ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())), } } diff --git a/crates/webclaw-fetch/src/extractors/stackoverflow.rs b/crates/webclaw-fetch/src/extractors/stackoverflow.rs new file mode 100644 index 0000000..d74b511 --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/stackoverflow.rs @@ -0,0 +1,216 @@ +//! Stack Overflow Q&A structured extractor. +//! +//! Uses the Stack Exchange API at `api.stackexchange.com/2.3/questions/{id}` +//! with `site=stackoverflow`. Two calls: one for the question, one for +//! its answers. Both come pre-filtered to include the rendered HTML body +//! so we don't re-parse the question page itself. +//! +//! Anonymous access caps at 300 requests per IP per day. Production +//! cloud should set `STACKAPPS_KEY` to lift to 10,000/day, but we don't +//! require it to work out of the box. + +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "stackoverflow", + label: "Stack Overflow Q&A", + description: "Returns question + answers: title, body, tags, votes, accepted answer, top answers.", + url_patterns: &["https://stackoverflow.com/questions/{id}/{slug}"], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if host != "stackoverflow.com" && host != "www.stackoverflow.com" { + return false; + } + parse_question_id(url).is_some() +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let id = parse_question_id(url).ok_or_else(|| { + FetchError::Build(format!( + "stackoverflow: cannot parse question id from '{url}'" + )) + })?; + + // Filter `withbody` includes the rendered HTML body for both questions + // and answers. Stack Exchange's filter system is documented at + // api.stackexchange.com/docs/filters. 
+ let q_url = format!( + "https://api.stackexchange.com/2.3/questions/{id}?site=stackoverflow&filter=withbody" + ); + let q_resp = client.fetch(&q_url).await?; + if q_resp.status != 200 { + return Err(FetchError::Build(format!( + "stackexchange api returned status {}", + q_resp.status + ))); + } + let q_body: QResponse = serde_json::from_str(&q_resp.html) + .map_err(|e| FetchError::BodyDecode(format!("stackoverflow q parse: {e}")))?; + let q = q_body + .items + .first() + .ok_or_else(|| FetchError::Build(format!("stackoverflow: question {id} not found")))?; + + let a_url = format!( + "https://api.stackexchange.com/2.3/questions/{id}/answers?site=stackoverflow&filter=withbody&order=desc&sort=votes" + ); + let a_resp = client.fetch(&a_url).await?; + let answers = if a_resp.status == 200 { + let a_body: AResponse = serde_json::from_str(&a_resp.html) + .map_err(|e| FetchError::BodyDecode(format!("stackoverflow a parse: {e}")))?; + a_body + .items + .iter() + .map(|a| { + json!({ + "answer_id": a.answer_id, + "is_accepted": a.is_accepted, + "score": a.score, + "body": a.body, + "creation_date": a.creation_date, + "last_edit_date":a.last_edit_date, + "author": a.owner.as_ref().and_then(|o| o.display_name.clone()), + "author_rep": a.owner.as_ref().and_then(|o| o.reputation), + }) + }) + .collect::>() + } else { + Vec::new() + }; + + let accepted = answers + .iter() + .find(|a| { + a.get("is_accepted") + .and_then(|v| v.as_bool()) + .unwrap_or(false) + }) + .cloned(); + + Ok(json!({ + "url": url, + "question_id": q.question_id, + "title": q.title, + "body": q.body, + "tags": q.tags, + "score": q.score, + "view_count": q.view_count, + "answer_count": q.answer_count, + "is_answered": q.is_answered, + "accepted_answer_id": q.accepted_answer_id, + "creation_date": q.creation_date, + "last_activity_date": q.last_activity_date, + "author": q.owner.as_ref().and_then(|o| o.display_name.clone()), + "author_rep": q.owner.as_ref().and_then(|o| o.reputation), + "link": q.link, + 
"accepted_answer": accepted, + "top_answers": answers, + })) +} + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +/// Parse question id from a URL of the form `/questions/{id}/{slug}`. +fn parse_question_id(url: &str) -> Option { + let after = url.split("/questions/").nth(1)?; + let stripped = after.split(['?', '#']).next()?.trim_end_matches('/'); + let first = stripped.split('/').next()?; + first.parse::().ok() +} + +// --------------------------------------------------------------------------- +// Stack Exchange API types +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] +struct QResponse { + #[serde(default)] + items: Vec, +} + +#[derive(Deserialize)] +struct Question { + question_id: Option, + title: Option, + body: Option, + #[serde(default)] + tags: Vec, + score: Option, + view_count: Option, + answer_count: Option, + is_answered: Option, + accepted_answer_id: Option, + creation_date: Option, + last_activity_date: Option, + owner: Option, + link: Option, +} + +#[derive(Deserialize)] +struct AResponse { + #[serde(default)] + items: Vec, +} + +#[derive(Deserialize)] +struct Answer { + answer_id: Option, + is_accepted: Option, + score: Option, + body: Option, + creation_date: Option, + last_edit_date: Option, + owner: Option, +} + +#[derive(Deserialize)] +struct Owner { + display_name: Option, + reputation: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_question_urls() { + assert!(matches( + "https://stackoverflow.com/questions/12345/some-slug" + )); + assert!(matches( + "https://stackoverflow.com/questions/12345/some-slug?answertab=votes" + )); + assert!(!matches("https://stackoverflow.com/")); + assert!(!matches("https://stackoverflow.com/questions")); + assert!(!matches("https://stackoverflow.com/users/100")); + assert!(!matches("https://example.com/questions/12345/x")); + } + + #[test] + fn 
parse_question_id_handles_slug_and_query() { + assert_eq!( + parse_question_id("https://stackoverflow.com/questions/12345/some-slug"), + Some(12345) + ); + assert_eq!( + parse_question_id("https://stackoverflow.com/questions/12345/some-slug?tab=newest"), + Some(12345) + ); + assert_eq!(parse_question_id("https://stackoverflow.com/foo"), None); + } +}