feat(core): schema-aware JSON-LD parser + --prefer-structured + --articles-from-jsonld

JSON-LD is consistently the cleanest source on major outlets (Reuters, BBC, Le Monde, N1, Pitchfork). Webclaw already emitted a raw Structured Data block at the bottom of -f llm output; this iteration teaches it to parse the JSON-LD by schema and surface it usefully. The new schema-aware parser at crates/webclaw-core/src/jsonld.rs classifies items by @type into: ItemList, LiveBlogPosting, NewsArticle, Review, WebPageOrChrome, Unknown. A CollectionPage whose mainEntity is an ItemList is auto-lifted (the Reuters CollectionPage shape). Two new CLI flags: (1) --prefer-structured surfaces the schema-aware block at the TOP of the output, before prose. For -f llm it emits a Markdown summary block; for -f json it emits a {structured, extracted} envelope. It bypasses the default DROP list for WebPage/chrome types when explicitly requested. (2) --articles-from-jsonld: when the page contains an ItemList or LiveBlogPosting, output ONLY a JSON array of articles ({position, title, url, published}). When no such schema is present, emit a stderr hint and fall through to default extraction (no error). Default behavior (neither flag set) is byte-identical to iter-3 on all default-flag probes (regression sentinel passed): Cyrillic p14 still 7735 B, M1 caps p18/p19/p20 deterministic, M2 hub p40/p41 byte-identical, M3 registry p44/p45/p46 still fast-fail with exit 67. 14 new tests in webclaw-core cover schema-variant parsing, parse error handling, fall-through behavior, flag combinations, and the default-byte-identical sentinel. Workspace tests 657 -> 671.
2026-06-16 23:45:13 +02:00 · 2026-05-23 20:38:59 +02:00 · 2026-05-23 20:38:59 +02:00 · 66974366d7
commit 66974366d7
parent e28b22adf7
5 changed files with 911 additions and 36 deletions
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -11,8 +11,9 @@ use std::sync::atomic::{AtomicBool, Ordering};
 use clap::{Parser, Subcommand, ValueEnum};
 use tracing_subscriber::EnvFilter;
 use webclaw_core::{
-    ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
-    to_llm_text,
+    ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, JsonLdSchema, LlmTextOptions,
+    Metadata, classify_jsonld_all, extract_with_options, primary_schema, to_llm_text,
+    to_llm_text_with_options,
 };
 use webclaw_fetch::{
    BatchExtractResult, BrowserProfile, CrawlConfig, CrawlResult, Crawler, FetchClient,
@ -188,6 +189,24 @@ struct Cli {
    #[arg(long)]
    prefer_articles: bool,

+    /// Surface the schema-aware JSON-LD block (when present) at the TOP of
+    /// the output, before prose. Bypasses the default-drop list for
+    /// WebPage/SiteNavigationElement when explicitly requested. Affects
+    /// `-f llm` (adds a Markdown block) and `-f json` (wraps the output in
+    /// a `{structured, extracted}` envelope).
+    #[arg(long)]
+    prefer_structured: bool,
+
+    /// When the page contains an ItemList or LiveBlogPosting in its JSON-LD,
+    /// emit ONLY the article list as a JSON array of
+    /// {position, title, url, published}. The `-f` flag is OVERRIDDEN in this
+    /// mode: stdout is always a JSON array. When the page has no
+    /// ItemList/LiveBlogPosting, emits a one-line stderr hint and falls
+    /// through to default extraction (does NOT error). Combined with
+    /// --prefer-structured, this flag wins.
+    #[arg(long)]
+    articles_from_jsonld: bool,
+
    /// Browser to impersonate
    #[arg(short, long, default_value = "chrome")]
    browser: Browser,
@ -769,7 +788,28 @@ fn format_output_with_mode(
    mode: &OutputMode,
    max_output_bytes: u64,
 ) -> String {
-    let body = render_body(result, format, show_metadata, mode);
+    format_output_with_mode_and_structured(
+        result,
+        format,
+        show_metadata,
+        mode,
+        max_output_bytes,
+        false,
+    )
+}
+
+/// M4 extension of `format_output_with_mode`: renders via `render_body` with
+/// the extra `prefer_structured` flag, then applies `apply_byte_cap`. When
+/// false this is byte-identical to the legacy formatter — sentinel-critical for p01-p15.
+fn format_output_with_mode_and_structured(
+    result: &ExtractionResult,
+    format: &OutputFormat,
+    show_metadata: bool,
+    mode: &OutputMode,
+    max_output_bytes: u64,
+    prefer_structured: bool,
+) -> String {
+    let body = render_body(result, format, show_metadata, mode, prefer_structured);
    apply_byte_cap(&body, format, max_output_bytes)
 }

@ -778,6 +818,7 @@ fn render_body(
    format: &OutputFormat,
    show_metadata: bool,
    mode: &OutputMode,
+    prefer_structured: bool,
 ) -> String {
    match mode {
        OutputMode::Summary => match format {
@ -805,10 +846,26 @@ fn render_body(
                out
            }
            OutputFormat::Json => {
-                serde_json::to_string_pretty(result).expect("serialization failed")
+                if prefer_structured {
+                    let schemas = classify_jsonld_all(&result.structured_data);
+                    let structured = primary_schema(&schemas);
+                    let envelope = serde_json::json!({
+                        "structured": structured,
+                        "extracted": result,
+                    });
+                    serde_json::to_string_pretty(&envelope).expect("serialization failed")
+                } else {
+                    serde_json::to_string_pretty(result).expect("serialization failed")
+                }
            }
            OutputFormat::Text => result.content.plain_text.clone(),
-            OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
+            OutputFormat::Llm => to_llm_text_with_options(
+                result,
+                result.metadata.url.as_deref(),
+                &LlmTextOptions {
+                    prefer_structured,
+                },
+            ),
            OutputFormat::Html => raw_html_or_markdown(result).to_string(),
        },
    }
@ -1129,17 +1186,51 @@ fn format_frontmatter(meta: &Metadata) -> String {
    lines.join("\n")
 }

-fn print_output_with_mode(
-    result: &ExtractionResult,
-    format: &OutputFormat,
-    show_metadata: bool,
-    mode: &OutputMode,
-    max_output_bytes: u64,
-) {
-    let out = format_output_with_mode(result, format, show_metadata, mode, max_output_bytes);
-    println!("{out}");
-}
-
+/// M4: If the page's primary JSON-LD schema (per `primary_schema`) is an
+/// ItemList or LiveBlogPosting, return a JSON array of articles (one entry
+/// per item). Returns None otherwise, in which case the caller should fall
+/// through to default extraction and emit a stderr hint.
+///
+/// Output shape per element: `{position, title, url, published}`. Null fields
+/// for the values that don't appear on this page.
+fn try_articles_from_jsonld(result: &ExtractionResult) -> Option<String> {
+    let schemas = classify_jsonld_all(&result.structured_data);
+    let primary = primary_schema(&schemas)?;
+    match primary {
+        JsonLdSchema::ItemList { items, .. } => {
+            let arr: Vec<serde_json::Value> = items
+                .iter()
+                .enumerate()
+                .map(|(idx, it)| {
+                    serde_json::json!({
+                        "position": it.position.unwrap_or(idx as u64 + 1),
+                        "title": it.title,
+                        "url": it.url,
+                        "published": it.published,
+                    })
+                })
+                .collect();
+            Some(serde_json::to_string_pretty(&arr).unwrap_or_else(|_| "[]".to_string()))
+        }
+        JsonLdSchema::LiveBlogPosting { updates, .. } => {
+            let arr: Vec<serde_json::Value> = updates
+                .iter()
+                .enumerate()
+                .map(|(idx, u)| {
+                    serde_json::json!({
+                        "position": idx as u64 + 1,
+                        "title": u.headline,
+                        "url": u.url,
+                        "published": u.published,
+                    })
+                })
+                .collect();
+            Some(serde_json::to_string_pretty(&arr).unwrap_or_else(|_| "[]".to_string()))
+        }
+        _ => None,
+    }
+}
+
 /// Apply iter-2 M2's hub-page detector. When a hub is detected:
 ///   - emit a single stderr hint line (always — informational only),
 ///   - if `prefer_articles` is on, override the OutputMode to `Summary`
@ -1152,6 +1243,6 @@ fn print_output_with_mode(
 /// Designed to be additive — `prefer_articles=false` callers keep their
 /// existing stdout bytes byte-identical; the hint goes to stderr so it
 /// doesn't affect the sentinel byte-counting on p01-p15.
 fn apply_hub_detection(
    result: &ExtractionResult,
    requested_mode: &OutputMode,
@ -2803,6 +2894,21 @@ async fn main() {
    // Single-page extraction (handles both HTML and PDF via content-type detection)
    match fetch_and_extract(&cli).await {
        Ok(FetchOutput::Local(result)) => {
+            // M4: --articles-from-jsonld short-circuits with a JSON array of
+            // articles when the page has an ItemList or LiveBlogPosting.
+            // When neither is present, emit a stderr hint and fall through to
+            // the default extraction path (the --mode flag still applies).
+            if cli.articles_from_jsonld {
+                if let Some(json_array) = try_articles_from_jsonld(&result) {
+                    println!("{json_array}");
+                    return;
+                }
+                eprintln!(
+                    "# hint: --articles-from-jsonld found no ItemList or LiveBlogPosting on this URL; falling through to default extraction"
+                );
+                // Fall through.
+            }
+
            let effective_mode = apply_hub_detection(&result, &cli.mode, cli.prefer_articles);
            if let Some(ref dir) = cli.output_dir {
                let url = cli
@ -2812,25 +2918,28 @@ async fn main() {
                    .unwrap_or_default();
                let custom_name = entries.first().and_then(|(_, name)| name.clone());
                let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format));
-                let content = format_output_with_mode(
+                let content = format_output_with_mode_and_structured(
                    &result,
                    &cli.format,
                    cli.metadata,
                    &effective_mode,
                    cli.max_output_bytes,
+                    cli.prefer_structured,
                );
                if let Err(e) = write_to_file(dir, &filename, &content) {
                    eprintln!("error: {e}");
                    process::exit(1);
                }
            } else {
-                print_output_with_mode(
+                let content = format_output_with_mode_and_structured(
                    &result,
                    &cli.format,
                    cli.metadata,
                    &effective_mode,
                    cli.max_output_bytes,
+                    cli.prefer_structured,
                );
+                println!("{content}");
            }
        }
        Ok(FetchOutput::Cloud(resp)) => {