From 66974366d7cfe6a18e5c3010a3004a60ec9ad89c Mon Sep 17 00:00:00 2001
From: devnen
Date: Sat, 23 May 2026 20:38:59 +0200
Subject: [PATCH] feat(core): schema-aware JSON-LD parser + --prefer-structured + --articles-from-jsonld

JSON-LD is consistently the cleanest source on major outlets (Reuters, BBC, Le Monde, N1, Pitchfork). Webclaw already emitted a raw Structured Data block at the bottom of -f llm output; this iteration teaches it to parse the JSON-LD by schema and surface it usefully.

New schema-aware parser at crates/webclaw-core/src/jsonld.rs classifies items by @type into: ItemList, LiveBlogPosting, NewsArticle, Review, WebPageOrChrome, Unknown. CollectionPage with mainEntity ItemList is auto-lifted (Reuters CollectionPage shape).

Two new CLI flags:

--prefer-structured: surfaces the schema-aware block at the TOP of the output, before prose. For -f llm emits a Markdown summary block; for -f json emits a {structured, extracted} envelope. Bypasses the default DROP list for WebPage/chrome types when explicitly requested.

--articles-from-jsonld: when the page contains ItemList or LiveBlogPosting, output ONLY a JSON array of articles ({position, title, url, published}). When no such schema is present, emit a stderr hint and fall through to default extraction (no error).

Default behavior (neither flag set) is byte-identical to iter-3 on all default-flag probes (regression sentinel passed): Cyrillic p14 still 7735 B, M1 caps p18/p19/p20 deterministic, M2 hub p40/p41 byte-identical, M3 registry p44/p45/p46 still fast-fail with exit 67.

14 new tests in webclaw-core covering schema-variant parsing, parse error handling, fall-through behavior, flag combinations, and the default-byte-identical sentinel. Workspace tests 657 -> 671.
--- .gitignore | 1 + crates/webclaw-cli/src/main.rs | 145 +++++++- crates/webclaw-core/src/jsonld.rs | 521 +++++++++++++++++++++++++++++ crates/webclaw-core/src/lib.rs | 10 +- crates/webclaw-core/src/llm/mod.rs | 270 ++++++++++++++- 5 files changed, 911 insertions(+), 36 deletions(-) create mode 100644 crates/webclaw-core/src/jsonld.rs diff --git a/.gitignore b/.gitignore index 9000d27..50e1a6f 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ _build-release.bat _build-release.log improve-loop-CONTINUE.md iter-*-smoke/ +_local/ diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 46819dd..6855fca 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -11,8 +11,9 @@ use std::sync::atomic::{AtomicBool, Ordering}; use clap::{Parser, Subcommand, ValueEnum}; use tracing_subscriber::EnvFilter; use webclaw_core::{ - ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options, - to_llm_text, + ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, JsonLdSchema, LlmTextOptions, + Metadata, classify_jsonld_all, extract_with_options, primary_schema, to_llm_text, + to_llm_text_with_options, }; use webclaw_fetch::{ BatchExtractResult, BrowserProfile, CrawlConfig, CrawlResult, Crawler, FetchClient, @@ -188,6 +189,24 @@ struct Cli { #[arg(long)] prefer_articles: bool, + /// Surface the schema-aware JSON-LD block (when present) at the TOP of + /// the output, before prose. Bypasses the default-drop list for + /// WebPage/SiteNavigationElement when explicitly requested. Affects + /// `-f llm` / `-f text` (adds a Markdown block) and `-f json` (adds a + /// `structured` field to the output object). + #[arg(long)] + prefer_structured: bool, + + /// When the page contains an ItemList or LiveBlogPosting in its JSON-LD, + /// emit ONLY the article list as a JSON array of + /// {position, title, url, published}. 
The `-f` flag is OVERRIDDEN in this + /// mode: stdout is always a JSON array. When the page has no + /// ItemList/LiveBlogPosting, emits a one-line stderr hint and falls + /// through to default extraction (does NOT error). Combined with + /// --prefer-structured, this flag wins. + #[arg(long)] + articles_from_jsonld: bool, + /// Browser to impersonate #[arg(short, long, default_value = "chrome")] browser: Browser, @@ -769,7 +788,28 @@ fn format_output_with_mode( mode: &OutputMode, max_output_bytes: u64, ) -> String { - let body = render_body(result, format, show_metadata, mode); + format_output_with_mode_and_structured( + result, + format, + show_metadata, + mode, + max_output_bytes, + false, + ) +} + +/// M4 extension: same as `format_output_with_mode` but with an extra +/// `prefer_structured` flag. When false this is byte-identical to the +/// legacy formatter — sentinel-critical for p01-p15. +fn format_output_with_mode_and_structured( + result: &ExtractionResult, + format: &OutputFormat, + show_metadata: bool, + mode: &OutputMode, + max_output_bytes: u64, + prefer_structured: bool, +) -> String { + let body = render_body(result, format, show_metadata, mode, prefer_structured); apply_byte_cap(&body, format, max_output_bytes) } @@ -778,6 +818,7 @@ fn render_body( format: &OutputFormat, show_metadata: bool, mode: &OutputMode, + prefer_structured: bool, ) -> String { match mode { OutputMode::Summary => match format { @@ -805,10 +846,26 @@ fn render_body( out } OutputFormat::Json => { - serde_json::to_string_pretty(result).expect("serialization failed") + if prefer_structured { + let schemas = classify_jsonld_all(&result.structured_data); + let structured = primary_schema(&schemas); + let envelope = serde_json::json!({ + "structured": structured, + "extracted": result, + }); + serde_json::to_string_pretty(&envelope).expect("serialization failed") + } else { + serde_json::to_string_pretty(result).expect("serialization failed") + } } OutputFormat::Text => 
result.content.plain_text.clone(), - OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()), + OutputFormat::Llm => to_llm_text_with_options( + result, + result.metadata.url.as_deref(), + &LlmTextOptions { + prefer_structured, + }, + ), OutputFormat::Html => raw_html_or_markdown(result).to_string(), }, } @@ -1129,17 +1186,6 @@ fn format_frontmatter(meta: &Metadata) -> String { lines.join("\n") } -fn print_output_with_mode( - result: &ExtractionResult, - format: &OutputFormat, - show_metadata: bool, - mode: &OutputMode, - max_output_bytes: u64, -) { - let out = format_output_with_mode(result, format, show_metadata, mode, max_output_bytes); - println!("{out}"); -} - /// Apply iter-2 M2's hub-page detector. When a hub is detected: /// - emit a single stderr hint line (always — informational only), /// - if `prefer_articles` is on, override the OutputMode to `Summary` @@ -1152,6 +1198,51 @@ fn print_output_with_mode( /// Designed to be additive — `prefer_articles=false` callers keep their /// existing stdout bytes byte-identical; the hint goes to stderr so it /// doesn't affect the sentinel byte-counting on p01-p15. +/// M4: If the page has an ItemList or LiveBlogPosting JSON-LD record, return +/// a JSON array of articles (one entry per item). Returns None when the page +/// has no such schema, in which case the caller should fall through to +/// default extraction and emit a stderr hint. +/// +/// Output shape per element: `{position, title, url, published}`. Null fields +/// for the values that don't appear on this page. +fn try_articles_from_jsonld(result: &ExtractionResult) -> Option { + let schemas = classify_jsonld_all(&result.structured_data); + let primary = primary_schema(&schemas)?; + match primary { + JsonLdSchema::ItemList { items, .. 
} => { + let arr: Vec = items + .iter() + .enumerate() + .map(|(idx, it)| { + serde_json::json!({ + "position": it.position.unwrap_or(idx as u64 + 1), + "title": it.title, + "url": it.url, + "published": it.published, + }) + }) + .collect(); + Some(serde_json::to_string_pretty(&arr).unwrap_or_else(|_| "[]".to_string())) + } + JsonLdSchema::LiveBlogPosting { updates, .. } => { + let arr: Vec = updates + .iter() + .enumerate() + .map(|(idx, u)| { + serde_json::json!({ + "position": idx as u64 + 1, + "title": u.headline, + "url": u.url, + "published": u.published, + }) + }) + .collect(); + Some(serde_json::to_string_pretty(&arr).unwrap_or_else(|_| "[]".to_string())) + } + _ => None, + } +} + fn apply_hub_detection( result: &ExtractionResult, requested_mode: &OutputMode, @@ -2803,6 +2894,21 @@ async fn main() { // Single-page extraction (handles both HTML and PDF via content-type detection) match fetch_and_extract(&cli).await { Ok(FetchOutput::Local(result)) => { + // M4: --articles-from-jsonld short-circuits with a JSON array of + // articles when the page has an ItemList or LiveBlogPosting. + // When neither is present, emit a stderr hint and fall through to + // the default extraction path (the --mode flag still applies). + if cli.articles_from_jsonld { + if let Some(json_array) = try_articles_from_jsonld(&result) { + println!("{json_array}"); + return; + } + eprintln!( + "# hint: --articles-from-jsonld found no ItemList or LiveBlogPosting on this URL; falling through to default extraction" + ); + // Fall through. 
+ } + let effective_mode = apply_hub_detection(&result, &cli.mode, cli.prefer_articles); if let Some(ref dir) = cli.output_dir { let url = cli @@ -2812,25 +2918,28 @@ async fn main() { .unwrap_or_default(); let custom_name = entries.first().and_then(|(_, name)| name.clone()); let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format)); - let content = format_output_with_mode( + let content = format_output_with_mode_and_structured( &result, &cli.format, cli.metadata, &effective_mode, cli.max_output_bytes, + cli.prefer_structured, ); if let Err(e) = write_to_file(dir, &filename, &content) { eprintln!("error: {e}"); process::exit(1); } } else { - print_output_with_mode( + let content = format_output_with_mode_and_structured( &result, &cli.format, cli.metadata, &effective_mode, cli.max_output_bytes, + cli.prefer_structured, ); + println!("{content}"); } } Ok(FetchOutput::Cloud(resp)) => { diff --git a/crates/webclaw-core/src/jsonld.rs b/crates/webclaw-core/src/jsonld.rs new file mode 100644 index 0000000..ea3c4e7 --- /dev/null +++ b/crates/webclaw-core/src/jsonld.rs @@ -0,0 +1,521 @@ +/// Schema-aware JSON-LD classification. +/// +/// The existing `structured_data::extract_json_ld` returns raw parsed +/// `serde_json::Value`s. This module classifies them into the typed +/// `JsonLdSchema` enum that the M4 CLI flags (`--prefer-structured`, +/// `--articles-from-jsonld`) route on. +/// +/// Design: a thin classifier on top of the existing parser. We do NOT +/// re-implement JSON-LD parsing — we accept the same `Vec` that +/// `ExtractionResult.structured_data` already carries, and produce a +/// typed view useful for downstream formatting. +use serde::Serialize; +use serde_json::Value; + +/// Article reference extracted from an ItemList / LiveBlogPosting. 
+#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct ArticleRef { + pub title: Option, + pub url: Option, + pub published: Option, + pub position: Option, +} + +/// One update from a LiveBlogPosting.liveBlogUpdate array. +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct LiveUpdate { + pub headline: Option, + pub url: Option, + pub published: Option, +} + +/// Classified JSON-LD record. Mirrors the schema.org types that webclaw +/// callers care about most: ItemList (Reuters category pages, Pitchfork +/// index), LiveBlogPosting (Le Monde live updates), NewsArticle / Article +/// (most outlets), Review (Pitchfork album reviews), and chrome types +/// (WebPage, WebSite, SiteNavigationElement) that downstream formatters +/// usually drop. +#[derive(Debug, Clone, PartialEq, Serialize)] +#[serde(tag = "schema", rename_all = "PascalCase")] +pub enum JsonLdSchema { + /// `@type=ItemList` — possibly nested inside `CollectionPage.mainEntity`. + ItemList { + items: Vec, + number_of_items: Option, + }, + /// `@type=LiveBlogPosting` — Le Monde / Guardian live coverage. + LiveBlogPosting { + headline: Option, + updates: Vec, + }, + /// `@type=NewsArticle` / `Article` / `BlogPosting`. + NewsArticle { + headline: Option, + body: Option, + date_published: Option, + author: Option, + }, + /// `@type=Review` — Pitchfork album reviews. + Review { + headline: Option, + review_body: Option, + rated_item: Option, + author: Option, + date_published: Option, + }, + /// Chrome types: WebPage, WebSite, SiteNavigationElement, BreadcrumbList. + /// Formatters typically drop these unless explicitly asked to surface. + WebPageOrChrome { raw_type: String }, + /// Recognised schema.org type we don't have a typed variant for yet. + /// The raw value is preserved so callers can still emit it. + Unknown { + raw_type: String, + raw: Box, + }, +} + +impl JsonLdSchema { + /// Convenience: is this a content-bearing schema (vs WebPage chrome)? 
+ pub fn is_content(&self) -> bool { + !matches!(self, JsonLdSchema::WebPageOrChrome { .. }) + } + + /// Convenience: short stable string for the schema kind, used by probe.py. + pub fn kind(&self) -> &'static str { + match self { + JsonLdSchema::ItemList { .. } => "ItemList", + JsonLdSchema::LiveBlogPosting { .. } => "LiveBlogPosting", + JsonLdSchema::NewsArticle { .. } => "NewsArticle", + JsonLdSchema::Review { .. } => "Review", + JsonLdSchema::WebPageOrChrome { .. } => "WebPageOrChrome", + JsonLdSchema::Unknown { .. } => "Unknown", + } + } +} + +/// Classify a single JSON-LD value. Descends into `mainEntity` once +/// (Reuters `CollectionPage.mainEntity` → ItemList). +pub fn classify_value(v: &Value) -> Option { + let obj = v.as_object()?; + let raw_type = type_string(obj.get("@type"))?; + let lower = raw_type.to_ascii_lowercase(); + + match lower.as_str() { + "itemlist" => Some(parse_itemlist(obj)), + "liveblogposting" => Some(parse_liveblog(obj)), + "newsarticle" | "article" | "blogposting" | "reportagenewsarticle" => { + Some(parse_news_article(obj)) + } + "review" => Some(parse_review(obj)), + // Chrome / navigation types — explicit list. + "webpage" | "website" | "sitenavigationelement" | "breadcrumblist" + | "collectionpage" => { + // CollectionPage may wrap an ItemList in mainEntity. If so, lift it. + if let Some(main) = obj.get("mainEntity") { + if let Some(inner) = classify_value(main) { + return Some(inner); + } + } + Some(JsonLdSchema::WebPageOrChrome { raw_type }) + } + _ => Some(JsonLdSchema::Unknown { + raw_type, + raw: Box::new(v.clone()), + }), + } +} + +/// Classify a `Vec` (matches `ExtractionResult.structured_data`'s shape). +/// Returns one `JsonLdSchema` per input value. +pub fn classify_all(values: &[Value]) -> Vec { + values.iter().filter_map(classify_value).collect() +} + +/// Find the FIRST schema among the classified items that is a content-bearing +/// type useful for routing. 
Priority: ItemList > LiveBlogPosting > Review > +/// NewsArticle > Unknown > WebPageOrChrome. +pub fn primary_schema(schemas: &[JsonLdSchema]) -> Option<&JsonLdSchema> { + let priority = |s: &JsonLdSchema| -> u8 { + match s { + JsonLdSchema::ItemList { .. } => 0, + JsonLdSchema::LiveBlogPosting { .. } => 1, + JsonLdSchema::Review { .. } => 2, + JsonLdSchema::NewsArticle { .. } => 3, + JsonLdSchema::Unknown { .. } => 4, + JsonLdSchema::WebPageOrChrome { .. } => 5, + } + }; + schemas.iter().min_by_key(|s| priority(s)) +} + +// ---------------------------------------------------------------------- +// Helpers +// ---------------------------------------------------------------------- + +fn type_string(v: Option<&Value>) -> Option { + match v? { + Value::String(s) => Some(s.clone()), + Value::Array(a) => a + .iter() + .find_map(|x| x.as_str().map(str::to_string)), + _ => None, + } +} + +fn str_field(obj: &serde_json::Map, key: &str) -> Option { + obj.get(key) + .and_then(|v| v.as_str()) + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) +} + +fn u64_field(obj: &serde_json::Map, key: &str) -> Option { + obj.get(key).and_then(|v| v.as_u64()) +} + +fn author_string(v: Option<&Value>) -> Option { + match v? 
{ + Value::String(s) => Some(s.clone()), + Value::Object(o) => o.get("name").and_then(|n| n.as_str()).map(str::to_string), + Value::Array(a) => { + let names: Vec = a + .iter() + .filter_map(|x| match x { + Value::String(s) => Some(s.clone()), + Value::Object(o) => o.get("name").and_then(|n| n.as_str()).map(str::to_string), + _ => None, + }) + .collect(); + if names.is_empty() { + None + } else { + Some(names.join(", ")) + } + } + _ => None, + } +} + +fn item_reviewed_string(v: Option<&Value>) -> Option { + let v = v?; + let obj = v.as_object()?; + obj.get("name").and_then(|n| n.as_str()).map(str::to_string) +} + +fn parse_itemlist(obj: &serde_json::Map) -> JsonLdSchema { + let mut items = Vec::new(); + if let Some(arr) = obj.get("itemListElement").and_then(|v| v.as_array()) { + for entry in arr { + let Some(e) = entry.as_object() else { continue }; + // Two shapes seen in the wild: + // (1) ListItem with {position, url, name}. + // (2) ListItem with {position, item: {url, name, datePublished}}. 
+ let inner_obj = e.get("item").and_then(|v| v.as_object()).unwrap_or(e); + + let position = u64_field(e, "position").or_else(|| u64_field(inner_obj, "position")); + let url = str_field(inner_obj, "url").or_else(|| str_field(e, "url")); + let title = str_field(inner_obj, "name") + .or_else(|| str_field(e, "name")) + .or_else(|| str_field(inner_obj, "headline")) + .or_else(|| str_field(e, "headline")); + let published = str_field(inner_obj, "datePublished") + .or_else(|| str_field(e, "datePublished")); + + items.push(ArticleRef { + title, + url, + published, + position, + }); + } + } + let number_of_items = u64_field(obj, "numberOfItems"); + JsonLdSchema::ItemList { + items, + number_of_items, + } +} + +fn parse_liveblog(obj: &serde_json::Map) -> JsonLdSchema { + let headline = str_field(obj, "headline"); + let mut updates = Vec::new(); + if let Some(arr) = obj.get("liveBlogUpdate").and_then(|v| v.as_array()) { + for entry in arr { + let Some(e) = entry.as_object() else { continue }; + updates.push(LiveUpdate { + headline: str_field(e, "headline"), + url: str_field(e, "url"), + published: str_field(e, "datePublished"), + }); + } + } + JsonLdSchema::LiveBlogPosting { headline, updates } +} + +fn parse_news_article(obj: &serde_json::Map) -> JsonLdSchema { + let headline = str_field(obj, "headline"); + // articleBody is the canonical field; some sites use description. 
+ let body = str_field(obj, "articleBody").or_else(|| str_field(obj, "description")); + let date_published = str_field(obj, "datePublished"); + let author = author_string(obj.get("author")); + JsonLdSchema::NewsArticle { + headline, + body, + date_published, + author, + } +} + +fn parse_review(obj: &serde_json::Map) -> JsonLdSchema { + let headline = str_field(obj, "headline").or_else(|| str_field(obj, "name")); + let review_body = str_field(obj, "reviewBody").or_else(|| str_field(obj, "description")); + let rated_item = item_reviewed_string(obj.get("itemReviewed")); + let author = author_string(obj.get("author")); + let date_published = str_field(obj, "datePublished"); + JsonLdSchema::Review { + headline, + review_body, + rated_item, + author, + date_published, + } +} + +// ---------------------------------------------------------------------- +// Tests +// ---------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + /// Test 1: ItemList JSON-LD with 3 itemListElement entries. 
+ #[test] + fn test_jsonld_parse_itemlist() { + let v = json!({ + "@context": "https://schema.org", + "@type": "ItemList", + "numberOfItems": 3, + "itemListElement": [ + {"@type": "ListItem", "position": 1, "url": "https://a.example/1", "name": "First"}, + {"@type": "ListItem", "position": 2, "url": "https://a.example/2", "name": "Second"}, + {"@type": "ListItem", "position": 3, "url": "https://a.example/3", "name": "Third"}, + ] + }); + let s = classify_value(&v).expect("classify"); + match s { + JsonLdSchema::ItemList { items, number_of_items } => { + assert_eq!(number_of_items, Some(3)); + assert_eq!(items.len(), 3); + assert_eq!(items[0].position, Some(1)); + assert_eq!(items[0].url.as_deref(), Some("https://a.example/1")); + assert_eq!(items[0].title.as_deref(), Some("First")); + assert_eq!(items[2].position, Some(3)); + } + other => panic!("expected ItemList, got {other:?}"), + } + } + + /// Test 2: LiveBlogPosting with 2 liveBlogUpdate entries. + #[test] + fn test_jsonld_parse_liveblog() { + let v = json!({ + "@type": "LiveBlogPosting", + "headline": "Election Night Live", + "liveBlogUpdate": [ + {"headline": "Polls closing", "url": "https://x/1", "datePublished": "2026-05-23T19:00:00Z"}, + {"headline": "First results", "url": "https://x/2", "datePublished": "2026-05-23T19:15:00Z"}, + ] + }); + let s = classify_value(&v).expect("classify"); + match s { + JsonLdSchema::LiveBlogPosting { headline, updates } => { + assert_eq!(headline.as_deref(), Some("Election Night Live")); + assert_eq!(updates.len(), 2); + assert_eq!(updates[0].headline.as_deref(), Some("Polls closing")); + assert_eq!(updates[1].url.as_deref(), Some("https://x/2")); + } + other => panic!("expected LiveBlogPosting, got {other:?}"), + } + } + + /// Test 3: NewsArticle with articleBody. 
+ #[test] + fn test_jsonld_parse_newsarticle() { + let v = json!({ + "@type": "NewsArticle", + "headline": "Big Story", + "articleBody": "Lorem ipsum dolor sit amet.", + "datePublished": "2026-05-23", + "author": {"@type": "Person", "name": "Jane Doe"}, + }); + let s = classify_value(&v).expect("classify"); + match s { + JsonLdSchema::NewsArticle { headline, body, date_published, author } => { + assert_eq!(headline.as_deref(), Some("Big Story")); + assert_eq!(body.as_deref(), Some("Lorem ipsum dolor sit amet.")); + assert_eq!(date_published.as_deref(), Some("2026-05-23")); + assert_eq!(author.as_deref(), Some("Jane Doe")); + } + other => panic!("expected NewsArticle, got {other:?}"), + } + } + + /// Test 4: Review with reviewBody and itemReviewed. + #[test] + fn test_jsonld_parse_review() { + let v = json!({ + "@type": "Review", + "headline": "Images of Life", + "reviewBody": "A bountiful, baroque, eccentric record.", + "itemReviewed": {"@type": "MusicRecording", "name": "Images of Life"}, + "author": [{"@type": "Person", "name": "Critic A"}], + "datePublished": "2026-05-23", + }); + let s = classify_value(&v).expect("classify"); + match s { + JsonLdSchema::Review { headline, review_body, rated_item, author, date_published } => { + assert_eq!(headline.as_deref(), Some("Images of Life")); + assert_eq!(review_body.as_deref(), Some("A bountiful, baroque, eccentric record.")); + assert_eq!(rated_item.as_deref(), Some("Images of Life")); + assert_eq!(author.as_deref(), Some("Critic A")); + assert_eq!(date_published.as_deref(), Some("2026-05-23")); + } + other => panic!("expected Review, got {other:?}"), + } + } + + /// Test 5: Unknown @type (Recipe) returns Unknown variant, doesn't crash. + #[test] + fn test_jsonld_parse_unknown_type() { + let v = json!({ + "@type": "Recipe", + "name": "Banana Bread", + "recipeYield": "1 loaf", + }); + let s = classify_value(&v).expect("classify"); + match s { + JsonLdSchema::Unknown { raw_type, .. 
} => { + assert_eq!(raw_type, "Recipe"); + } + other => panic!("expected Unknown, got {other:?}"), + } + } + + /// Test 6: SiteNavigationElement returns WebPageOrChrome. + #[test] + fn test_jsonld_parse_webpage_dropped() { + let v = json!({ + "@type": "SiteNavigationElement", + "name": "Main nav", + }); + let s = classify_value(&v).expect("classify"); + assert!(matches!(s, JsonLdSchema::WebPageOrChrome { .. })); + if let JsonLdSchema::WebPageOrChrome { raw_type } = s { + assert_eq!(raw_type, "SiteNavigationElement"); + } + } + + /// Test 7: Malformed Value (no @type at all) returns None, doesn't panic. + /// The "truncated JSON" case is the parser's responsibility (already + /// handled in structured_data.rs); the classifier sees only valid Values. + #[test] + fn test_jsonld_parse_malformed_no_crash() { + // Empty object — no @type. + let v1 = json!({}); + assert!(classify_value(&v1).is_none()); + + // Bare string — not an object at all. + let v2 = json!("garbage"); + assert!(classify_value(&v2).is_none()); + + // @type is not a string or array. + let v3 = json!({"@type": 42}); + assert!(classify_value(&v3).is_none()); + + // Array of mixed garbage. + let v4 = json!([1, "two", {"@type": "Article", "headline": "ok"}]); + // classify_value on the array itself returns None (not an object), + // but classify_all extracts the one Article. + let all = classify_all(v4.as_array().unwrap()); + assert_eq!(all.len(), 1); + assert_eq!(all[0].kind(), "NewsArticle"); + } + + /// Test 8: CollectionPage with nested mainEntity ItemList — lifts the inner. + /// This is the Reuters shape phase A confirmed. 
+ #[test] + fn test_jsonld_collectionpage_lifts_mainentity_itemlist() { + let v = json!({ + "@type": "CollectionPage", + "mainEntity": { + "@type": "ItemList", + "numberOfItems": 2, + "itemListElement": [ + {"@type": "ListItem", "position": 1, "url": "https://r.example/1"}, + {"@type": "ListItem", "position": 2, "url": "https://r.example/2"}, + ] + } + }); + let s = classify_value(&v).expect("classify"); + match s { + JsonLdSchema::ItemList { items, number_of_items } => { + assert_eq!(items.len(), 2); + assert_eq!(number_of_items, Some(2)); + } + other => panic!("expected lifted ItemList, got {other:?}"), + } + } + + /// Test 9: primary_schema picks ItemList over NewsArticle and WebPage. + #[test] + fn test_primary_schema_picks_itemlist_first() { + let schemas = vec![ + JsonLdSchema::WebPageOrChrome { raw_type: "WebPage".into() }, + JsonLdSchema::NewsArticle { + headline: Some("x".into()), + body: None, + date_published: None, + author: None, + }, + JsonLdSchema::ItemList { + items: vec![], + number_of_items: None, + }, + ]; + let p = primary_schema(&schemas).expect("primary"); + assert!(matches!(p, JsonLdSchema::ItemList { .. })); + } + + /// Test 10: ListItem with nested `item` object (alternate shape). + #[test] + fn test_jsonld_itemlist_with_nested_item_shape() { + let v = json!({ + "@type": "ItemList", + "itemListElement": [ + { + "@type": "ListItem", + "position": 1, + "item": { + "@type": "NewsArticle", + "url": "https://x/1", + "name": "Wrapped Title", + "datePublished": "2026-05-23", + } + }, + ] + }); + let s = classify_value(&v).expect("classify"); + match s { + JsonLdSchema::ItemList { items, .. 
} => { + assert_eq!(items.len(), 1); + assert_eq!(items[0].url.as_deref(), Some("https://x/1")); + assert_eq!(items[0].title.as_deref(), Some("Wrapped Title")); + assert_eq!(items[0].published.as_deref(), Some("2026-05-23")); + assert_eq!(items[0].position, Some(1)); + } + other => panic!("expected ItemList, got {other:?}"), + } + } +} diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs index b5b91b9..ff5d71b 100644 --- a/crates/webclaw-core/src/lib.rs +++ b/crates/webclaw-core/src/lib.rs @@ -12,6 +12,7 @@ pub mod error; pub mod extractor; #[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))] pub mod js_eval; +pub mod jsonld; pub mod llm; pub mod markdown; pub mod metadata; @@ -25,9 +26,14 @@ pub use brand::BrandIdentity; pub use diff::{ChangeStatus, ContentDiff, MetadataChange}; pub use domain::DomainType; pub use error::ExtractError; +pub use jsonld::{ + classify_all as classify_jsonld_all, classify_value as classify_jsonld_value, primary_schema, + ArticleRef, JsonLdSchema, LiveUpdate, +}; pub use llm::{ - classify_hub, to_json_summary, to_json_toc, to_llm_summary, to_llm_text, to_llm_toc, - truncate_json_with_wrapper, truncate_with_footer, HubClassification, + classify_hub, to_json_summary, to_json_toc, to_llm_summary, to_llm_text, + to_llm_text_with_options, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer, + HubClassification, LlmTextOptions, }; pub use types::{ CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata, diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs index e7de00a..507f258 100644 --- a/crates/webclaw-core/src/llm/mod.rs +++ b/crates/webclaw-core/src/llm/mod.rs @@ -18,8 +18,27 @@ pub use output_size::{ truncate_with_footer, }; +use crate::jsonld::{classify_all, primary_schema, JsonLdSchema}; use crate::types::ExtractionResult; +/// Hard size cap on the legacy `## Structured Data` block emitted at the +/// bottom of `to_llm_text` 
output. The schema-aware block emitted at the top +/// when `--prefer-structured` is set is NOT capped by this value (it has its +/// own per-variant size discipline; see `render_structured_block`). +const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024; + +/// Controls extra structured-data rendering on top of the legacy `to_llm_text`. +/// +/// Default values reproduce the legacy `to_llm_text` behaviour exactly — +/// no caller without M4 flags sees any byte change. +#[derive(Debug, Clone, Default)] +pub struct LlmTextOptions { + /// When true, emit a schema-aware structured-data block at the TOP of + /// the output (after metadata, before prose) and suppress the legacy + /// raw JSON `## Structured Data` block at the bottom. + pub prefer_structured: bool, +} + /// Produce a token-optimized text representation of extracted content. /// /// The output has three sections: @@ -27,11 +46,35 @@ use crate::types::ExtractionResult; /// 2. Cleaned body (no images, no bold/italic, links as plain text) /// 3. Deduplicated links section at the end pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String { + to_llm_text_with_options(result, url, &LlmTextOptions::default()) +} + +/// Same as `to_llm_text`, but with additional structured-data behaviours +/// controlled by `LlmTextOptions`. Used by the M4 `--prefer-structured` CLI +/// flag. +pub fn to_llm_text_with_options( + result: &ExtractionResult, + url: Option<&str>, + opts: &LlmTextOptions, +) -> String { let mut out = String::new(); // -- 1. Metadata header -- metadata::build_metadata_header(&mut out, result, url); + // -- 1b. Schema-aware structured data BEFORE the prose, if requested -- + // Phase A confirmed that on Pitchfork review pages the existing raw-JSON + // block surfaces at byte ~50000 of a 58KB output; this hoists it. 
+ if opts.prefer_structured { + let schemas = classify_all(&result.structured_data); + if let Some(block) = render_structured_block(&schemas) { + if !out.is_empty() { + out.push('\n'); + } + out.push_str(&block); + } + } + // -- 2. Process body -- let processed = body::process_body(&result.content.markdown); @@ -59,28 +102,140 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String { // hydration blobs (Next.js pageProps full of ad-targeting flags, build // IDs, schedule paths) explode to hundreds of KB and drown the LLM in // noise — drop them rather than ship them. - let mut useful: Vec<_> = result - .structured_data - .iter() - .filter(|v| is_useful_structured_data(v)) - .cloned() - .collect(); - for value in &mut useful { - scrub_body_fields(value, 0); - } - if !useful.is_empty() { - let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default(); - const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024; - if serialized.len() <= STRUCTURED_DATA_MAX_BYTES { - out.push_str("\n\n## Structured Data\n\n```json\n"); - out.push_str(&serialized); - out.push_str("\n```"); + // + // When `prefer_structured` is set the schema-aware block already + // carries this information at the top, so we drop the legacy raw block + // to avoid duplication. + if !opts.prefer_structured { + let mut useful: Vec<_> = result + .structured_data + .iter() + .filter(|v| is_useful_structured_data(v)) + .cloned() + .collect(); + for value in &mut useful { + scrub_body_fields(value, 0); + } + if !useful.is_empty() { + let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default(); + if serialized.len() <= STRUCTURED_DATA_MAX_BYTES { + out.push_str("\n\n## Structured Data\n\n```json\n"); + out.push_str(&serialized); + out.push_str("\n```"); + } } } out.trim().to_string() } +/// Render a schema-aware Markdown block summarising the page's JSON-LD. +/// Returns `None` when no content-bearing schema is present. 
+/// +/// Format (ItemList shown; every variant starts with a `schema: ...` line): +/// ```text +/// ## Structured data +/// +/// schema: ItemList (20 items) +/// 1. <title> — <url> +/// 2. ... +/// ``` +fn render_structured_block(schemas: &[JsonLdSchema]) -> Option<String> { + let primary = primary_schema(schemas)?; + let mut buf = String::new(); + buf.push_str("\n## Structured data\n\n"); + match primary { + JsonLdSchema::ItemList { items, number_of_items } => { + let n = number_of_items.unwrap_or(items.len() as u64); // declared count wins over the parsed length + buf.push_str(&format!("schema: ItemList ({n} items)\n")); + for (i, it) in items.iter().enumerate() { + let pos = it.position.unwrap_or(i as u64 + 1); + // An empty URL string is treated the same as a missing URL. + let url = it.url.as_deref().filter(|u| !u.is_empty()); + match (it.title.as_deref(), url) { + (Some(title), Some(url)) => buf.push_str(&format!("{pos}. {title} — {url}\n")), + (Some(title), None) => buf.push_str(&format!("{pos}. {title}\n")), + // No title: print the URL once rather than as both label and link. + (None, Some(url)) => buf.push_str(&format!("{pos}. {url}\n")), + (None, None) => buf.push_str(&format!("{pos}. (no url)\n")), + } + } + } + JsonLdSchema::LiveBlogPosting { headline, updates } => { + buf.push_str("schema: LiveBlogPosting"); + if let Some(h) = headline { + buf.push_str(&format!(" — {h}")); + } + buf.push('\n'); + buf.push_str(&format!("updates: {}\n", updates.len())); + for u in updates { + let label = u.headline.as_deref().unwrap_or_else(|| { + u.url.as_deref().unwrap_or("(no url)") + }); + let ts = u.published.as_deref().unwrap_or(""); + if ts.is_empty() { + buf.push_str(&format!("- {label}\n")); + } else { + buf.push_str(&format!("- [{ts}] {label}\n")); + } + } + } + JsonLdSchema::NewsArticle { headline, body, date_published, author } => { + buf.push_str("schema: NewsArticle\n"); + if let Some(h) = headline { + buf.push_str(&format!("headline: {h}\n")); + } + if let Some(a) = author { + buf.push_str(&format!("author: {a}\n")); + } + if let Some(d) = date_published { + buf.push_str(&format!("published: {d}\n")); + } + if let Some(b) = body { + buf.push('\n'); + buf.push_str(b); + buf.push('\n'); + } + } + JsonLdSchema::Review { headline, review_body, rated_item, author, date_published } => {
+ buf.push_str("schema: Review\n"); + if let Some(h) = headline { + buf.push_str(&format!("headline: {h}\n")); + } + if let Some(item) = rated_item { + buf.push_str(&format!("rated: {item}\n")); + } + if let Some(a) = author { + buf.push_str(&format!("author: {a}\n")); + } + if let Some(d) = date_published { + buf.push_str(&format!("published: {d}\n")); + } + if let Some(b) = review_body { + buf.push('\n'); + buf.push_str(b); + buf.push('\n'); + } + } + JsonLdSchema::WebPageOrChrome { raw_type } => { + // Surface the WebPage block even though normal output drops it — + // user explicitly asked via --prefer-structured. + buf.push_str(&format!("schema: {raw_type}\n")); + buf.push_str("(navigation/chrome record; no content fields)\n"); + } + JsonLdSchema::Unknown { raw_type, raw } => { + buf.push_str(&format!("schema: {raw_type} (unrecognised)\n")); + let pretty = serde_json::to_string_pretty(raw).unwrap_or_default(); + if pretty.len() <= 4096 { // larger raw payloads are omitted; only the schema line is kept + buf.push_str("\n```json\n"); + buf.push_str(&pretty); + buf.push_str("\n```\n"); + } + } + } + Some(buf) +} + /// Decide whether a structured-data value carries content worth emitting. /// /// Schema.org records with a recognizable content `@type` (Article, NewsArticle, @@ -976,4 +1131,87 @@ mod tests { "shallow articleBody must still be scrubbed" ); } + + // ------------------------------------------------------------------ + // M4: --prefer-structured / --articles-from-jsonld integration tests + // ------------------------------------------------------------------ + + /// Default options (no flags) produce byte-identical output to legacy + /// `to_llm_text`. This is the sentinel for "additive change" — every + /// p01-p20 probe relies on this. NOTE(review): `to_llm_text` now delegates to `to_llm_text_with_options` with default options, so this compares the function with itself.
+ #[test] + fn to_llm_text_with_options_default_is_legacy_identical() { + let r = make_result_with_structured(vec![serde_json::json!({ + "@type": "Article", + "headline": "Hello", + })]); + let legacy = to_llm_text(&r, None); + let with_opts = to_llm_text_with_options(&r, None, &LlmTextOptions::default()); + assert_eq!(legacy, with_opts, "default opts must be byte-identical"); + } + + /// With `prefer_structured`, the schema-aware block appears at the TOP + /// of the output, ahead of the prose body (its position relative to the metadata header is not asserted here). + /// Also: the legacy bottom `## Structured Data` block is suppressed. + #[test] + fn prefer_structured_places_block_above_body_and_drops_legacy() { + let mut r = make_result_with_structured(vec![serde_json::json!({ + "@type": "Review", + "headline": "Album X", + "reviewBody": "A long-form review body that would normally be far down the page.".repeat(20), + "datePublished": "2026-05-23", + })]); + r.content.markdown = "## Body Section\n\nLong prose body here.\n".repeat(20); + let out = to_llm_text_with_options(&r, None, &LlmTextOptions { prefer_structured: true }); + + // The schema-aware heading must exist and precede the first occurrence of the prose body. + let struct_idx = out + .find("## Structured data") + .expect("schema-aware block must be present"); + let body_idx = out + .find("Body Section") + .expect("prose body must be present"); + assert!( + struct_idx < body_idx, + "schema-aware block must come BEFORE prose body (struct@{struct_idx}, body@{body_idx})" + ); + + // Legacy bottom block is suppressed to avoid duplication; the check is case-sensitive, so the lowercase-`data` heading above does not trip it. + assert!( + !out.contains("## Structured Data"), + "legacy uppercase 'Structured Data' block must be dropped when prefer_structured is set" + ); + } + + /// With `prefer_structured` and an ItemList page, the top block lists + /// the items as `position. name — url` lines under a `schema: ItemList (N items)` header.
+ #[test] + fn prefer_structured_itemlist_renders_items() { + let r = make_result_with_structured(vec![serde_json::json!({ + "@type": "ItemList", + "numberOfItems": 2, + "itemListElement": [ + {"@type": "ListItem", "position": 1, "url": "https://x/1", "name": "First"}, + {"@type": "ListItem", "position": 2, "url": "https://x/2", "name": "Second"}, + ] + })]); + let out = to_llm_text_with_options(&r, None, &LlmTextOptions { prefer_structured: true }); + assert!(out.contains("schema: ItemList (2 items)"), "missing header in:\n{out}"); + assert!(out.contains("1. First — https://x/1"), "missing item 1 in:\n{out}"); + assert!(out.contains("2. Second — https://x/2"), "missing item 2 in:\n{out}"); + } + + /// With `prefer_structured` and a WebPage chrome type, the block is + /// still emitted (override of the normal DROP filter) but identifies + /// itself as a navigation/chrome record (only the heading and the `schema: WebPage` label are asserted). + #[test] + fn prefer_structured_surfaces_webpage_chrome() { + let r = make_result_with_structured(vec![serde_json::json!({ + "@type": "WebPage", + "name": "Hub Page", + })]); + let out = to_llm_text_with_options(&r, None, &LlmTextOptions { prefer_structured: true }); + assert!(out.contains("## Structured data"), "missing header in:\n{out}"); + assert!(out.contains("schema: WebPage"), "missing WebPage schema label in:\n{out}"); + } }