feat(core): schema-aware JSON-LD parser + --prefer-structured + --articles-from-jsonld

JSON-LD is consistently the cleanest source on major outlets (Reuters, BBC, Le Monde, N1, Pitchfork). Webclaw already emitted a raw Structured Data block at the bottom of -f llm output; this iteration teaches it to parse the JSON-LD by schema and surface it usefully. A new schema-aware parser at crates/webclaw-core/src/jsonld.rs classifies items by @type into ItemList, LiveBlogPosting, NewsArticle, Review, WebPageOrChrome, and Unknown. A CollectionPage whose mainEntity is an ItemList is auto-lifted (the Reuters CollectionPage shape). Two new CLI flags: (1) --prefer-structured surfaces the schema-aware block at the TOP of the output, before prose; for -f llm it emits a Markdown summary block, and for -f json it emits a {structured, extracted} envelope. It bypasses the default DROP list for WebPage/chrome types when explicitly requested. (2) --articles-from-jsonld: when the page contains an ItemList or LiveBlogPosting, output ONLY a JSON array of articles ({position, title, url, published}); when no such schema is present, emit a stderr hint and fall through to default extraction (no error). Default behavior (neither flag set) is byte-identical to iter-3 on all default-flag probes (regression sentinel passed): Cyrillic p14 still 7735 B, M1 caps p18/p19/p20 deterministic, M2 hub p40/p41 byte-identical, M3 registry p44/p45/p46 still fast-fail with exit 67. 14 new tests in webclaw-core cover schema-variant parsing, parse error handling, fall-through behavior, flag combinations, and the default-byte-identical sentinel. Workspace tests 657 -> 671.
2026-07-25 07:41:01 +02:00 · 2026-05-23 20:38:59 +02:00 · 2026-05-23 20:38:59 +02:00 · 66974366d7
commit 66974366d7
parent e28b22adf7
5 changed files with 911 additions and 36 deletions
--- a/.gitignore
+++ b/.gitignore
@ -27,3 +27,4 @@ _build-release.bat
 _build-release.log
 improve-loop-CONTINUE.md
 iter-*-smoke/
+_local/
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -11,8 +11,9 @@ use std::sync::atomic::{AtomicBool, Ordering};
 use clap::{Parser, Subcommand, ValueEnum};
 use tracing_subscriber::EnvFilter;
 use webclaw_core::{
-    ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
-    to_llm_text,
+    ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, JsonLdSchema, LlmTextOptions,
+    Metadata, classify_jsonld_all, extract_with_options, primary_schema, to_llm_text,
+    to_llm_text_with_options,
 };
 use webclaw_fetch::{
    BatchExtractResult, BrowserProfile, CrawlConfig, CrawlResult, Crawler, FetchClient,
@ -188,6 +189,24 @@ struct Cli {
    #[arg(long)]
    prefer_articles: bool,

+    /// Surface the schema-aware JSON-LD block (when present) at the TOP of
+    /// the output, before prose. Bypasses the default-drop list for
+    /// WebPage/SiteNavigationElement when explicitly requested. Affects
+    /// `-f llm` (adds a Markdown block) and `-f json` (wraps the output in a
+    /// `{structured, extracted}` envelope); other formats are unchanged.
+    #[arg(long)]
+    prefer_structured: bool,
+
+    /// When the page contains an ItemList or LiveBlogPosting in its JSON-LD,
+    /// emit ONLY the article list as a JSON array of
+    /// {position, title, url, published}. The `-f` flag is OVERRIDDEN in this
+    /// mode: stdout is always a JSON array. When the page has no
+    /// ItemList/LiveBlogPosting, emits a one-line stderr hint and falls
+    /// through to default extraction (does NOT error). Combined with
+    /// --prefer-structured, this flag wins.
+    #[arg(long)]
+    articles_from_jsonld: bool,
+
    /// Browser to impersonate
    #[arg(short, long, default_value = "chrome")]
    browser: Browser,
@ -769,7 +788,28 @@ fn format_output_with_mode(
    mode: &OutputMode,
    max_output_bytes: u64,
 ) -> String {
-    let body = render_body(result, format, show_metadata, mode);
+    format_output_with_mode_and_structured(
+        result,
+        format,
+        show_metadata,
+        mode,
+        max_output_bytes,
+        false,
+    )
+}
+
+/// M4 extension: same as `format_output_with_mode` but with an extra
+/// `prefer_structured` flag. When false this is byte-identical to the
+/// legacy formatter — sentinel-critical for p01-p15.
+fn format_output_with_mode_and_structured(
+    result: &ExtractionResult,
+    format: &OutputFormat,
+    show_metadata: bool,
+    mode: &OutputMode,
+    max_output_bytes: u64,
+    prefer_structured: bool,
+) -> String {
+    let body = render_body(result, format, show_metadata, mode, prefer_structured);
    apply_byte_cap(&body, format, max_output_bytes)
 }

@ -778,6 +818,7 @@ fn render_body(
    format: &OutputFormat,
    show_metadata: bool,
    mode: &OutputMode,
+    prefer_structured: bool,
 ) -> String {
    match mode {
        OutputMode::Summary => match format {
@ -805,10 +846,26 @@ fn render_body(
                out
            }
            OutputFormat::Json => {
-                serde_json::to_string_pretty(result).expect("serialization failed")
+                if prefer_structured {
+                    let schemas = classify_jsonld_all(&result.structured_data);
+                    let structured = primary_schema(&schemas);
+                    let envelope = serde_json::json!({
+                        "structured": structured,
+                        "extracted": result,
+                    });
+                    serde_json::to_string_pretty(&envelope).expect("serialization failed")
+                } else {
+                    serde_json::to_string_pretty(result).expect("serialization failed")
+                }
            }
            OutputFormat::Text => result.content.plain_text.clone(),
-            OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
+            OutputFormat::Llm => to_llm_text_with_options(
+                result,
+                result.metadata.url.as_deref(),
+                &LlmTextOptions {
+                    prefer_structured,
+                },
+            ),
            OutputFormat::Html => raw_html_or_markdown(result).to_string(),
        },
    }
@ -1129,17 +1186,6 @@ fn format_frontmatter(meta: &Metadata) -> String {
    lines.join("\n")
 }

-fn print_output_with_mode(
-    result: &ExtractionResult,
-    format: &OutputFormat,
-    show_metadata: bool,
-    mode: &OutputMode,
-    max_output_bytes: u64,
-) {
-    let out = format_output_with_mode(result, format, show_metadata, mode, max_output_bytes);
-    println!("{out}");
-}
-
 /// Apply iter-2 M2's hub-page detector. When a hub is detected:
 ///   - emit a single stderr hint line (always — informational only),
 ///   - if `prefer_articles` is on, override the OutputMode to `Summary`
@ -1152,6 +1198,51 @@ fn print_output_with_mode(
 /// Designed to be additive — `prefer_articles=false` callers keep their
 /// existing stdout bytes byte-identical; the hint goes to stderr so it
 /// doesn't affect the sentinel byte-counting on p01-p15.
+/// M4: If the page has an ItemList or LiveBlogPosting JSON-LD record, return
+/// a JSON array of articles (one entry per item). Returns None when the page
+/// has no such schema, in which case the caller should fall through to
+/// default extraction and emit a stderr hint.
+///
+/// Output shape per element: `{position, title, url, published}`. `title`, `url`
+/// and `published` are null when absent; a missing `position` becomes the 1-based index.
+fn try_articles_from_jsonld(result: &ExtractionResult) -> Option<String> {
+    let schemas = classify_jsonld_all(&result.structured_data);
+    let primary = primary_schema(&schemas)?;
+    match primary {
+        JsonLdSchema::ItemList { items, .. } => {
+            let arr: Vec<serde_json::Value> = items
+                .iter()
+                .enumerate()
+                .map(|(idx, it)| {
+                    serde_json::json!({
+                        "position": it.position.unwrap_or(idx as u64 + 1),
+                        "title": it.title,
+                        "url": it.url,
+                        "published": it.published,
+                    })
+                })
+                .collect();
+            Some(serde_json::to_string_pretty(&arr).unwrap_or_else(|_| "[]".to_string()))
+        }
+        JsonLdSchema::LiveBlogPosting { updates, .. } => {
+            let arr: Vec<serde_json::Value> = updates
+                .iter()
+                .enumerate()
+                .map(|(idx, u)| {
+                    serde_json::json!({
+                        "position": idx as u64 + 1,
+                        "title": u.headline,
+                        "url": u.url,
+                        "published": u.published,
+                    })
+                })
+                .collect();
+            Some(serde_json::to_string_pretty(&arr).unwrap_or_else(|_| "[]".to_string()))
+        }
+        _ => None,
+    }
+}
+
 fn apply_hub_detection(
    result: &ExtractionResult,
    requested_mode: &OutputMode,
@ -2803,6 +2894,21 @@ async fn main() {
    // Single-page extraction (handles both HTML and PDF via content-type detection)
    match fetch_and_extract(&cli).await {
        Ok(FetchOutput::Local(result)) => {
+            // M4: --articles-from-jsonld short-circuits with a JSON array of
+            // articles when the page has an ItemList or LiveBlogPosting.
+            // When neither is present, emit a stderr hint and fall through to
+            // the default extraction path (the --mode flag still applies).
+            if cli.articles_from_jsonld {
+                if let Some(json_array) = try_articles_from_jsonld(&result) {
+                    println!("{json_array}");
+                    return;
+                }
+                eprintln!(
+                    "# hint: --articles-from-jsonld found no ItemList or LiveBlogPosting on this URL; falling through to default extraction"
+                );
+                // Fall through.
+            }
+
            let effective_mode = apply_hub_detection(&result, &cli.mode, cli.prefer_articles);
            if let Some(ref dir) = cli.output_dir {
                let url = cli
@ -2812,25 +2918,28 @@ async fn main() {
                    .unwrap_or_default();
                let custom_name = entries.first().and_then(|(_, name)| name.clone());
                let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format));
-                let content = format_output_with_mode(
+                let content = format_output_with_mode_and_structured(
                    &result,
                    &cli.format,
                    cli.metadata,
                    &effective_mode,
                    cli.max_output_bytes,
+                    cli.prefer_structured,
                );
                if let Err(e) = write_to_file(dir, &filename, &content) {
                    eprintln!("error: {e}");
                    process::exit(1);
                }
            } else {
-                print_output_with_mode(
+                let content = format_output_with_mode_and_structured(
                    &result,
                    &cli.format,
                    cli.metadata,
                    &effective_mode,
                    cli.max_output_bytes,
+                    cli.prefer_structured,
                );
+                println!("{content}");
            }
        }
        Ok(FetchOutput::Cloud(resp)) => {
--- a/crates/webclaw-core/src/jsonld.rs
+++ b/crates/webclaw-core/src/jsonld.rs
@ -0,0 +1,521 @@
+//! Schema-aware JSON-LD classification.
+//!
+//! The existing `structured_data::extract_json_ld` returns raw parsed
+//! `serde_json::Value`s. This module classifies them into the typed
+//! `JsonLdSchema` enum that the M4 CLI flags (`--prefer-structured`,
+//! `--articles-from-jsonld`) route on.
+//!
+//! Design: a thin classifier on top of the existing parser. We do NOT
+//! re-implement JSON-LD parsing — we accept the same `Vec<Value>` that
+//! `ExtractionResult.structured_data` already carries, and produce a
+//! typed view useful for downstream formatting.
+use serde::Serialize;
+use serde_json::Value;
+
+/// Article reference extracted from one `itemListElement` entry of an ItemList.
+#[derive(Debug, Clone, PartialEq, Serialize)]
+pub struct ArticleRef {
+    pub title: Option<String>,
+    pub url: Option<String>,
+    pub published: Option<String>,
+    pub position: Option<u64>,
+}
+
+/// One update from a LiveBlogPosting.liveBlogUpdate array.
+#[derive(Debug, Clone, PartialEq, Serialize)]
+pub struct LiveUpdate {
+    pub headline: Option<String>,
+    pub url: Option<String>,
+    pub published: Option<String>,
+}
+
+/// Classified JSON-LD record. Mirrors the schema.org types that webclaw
+/// callers care about most: ItemList (Reuters category pages, Pitchfork
+/// index), LiveBlogPosting (Le Monde live updates), NewsArticle / Article
+/// (most outlets), Review (Pitchfork album reviews), and chrome types
+/// (WebPage, WebSite, SiteNavigationElement) that downstream formatters
+/// usually drop.
+#[derive(Debug, Clone, PartialEq, Serialize)]
+#[serde(tag = "schema", rename_all = "PascalCase")]
+pub enum JsonLdSchema {
+    /// `@type=ItemList` — possibly nested inside `CollectionPage.mainEntity`.
+    ItemList {
+        items: Vec<ArticleRef>,
+        number_of_items: Option<u64>,
+    },
+    /// `@type=LiveBlogPosting` — Le Monde / Guardian live coverage.
+    LiveBlogPosting {
+        headline: Option<String>,
+        updates: Vec<LiveUpdate>,
+    },
+    /// `@type=NewsArticle` / `Article` / `BlogPosting` / `ReportageNewsArticle`.
+    NewsArticle {
+        headline: Option<String>,
+        body: Option<String>,
+        date_published: Option<String>,
+        author: Option<String>,
+    },
+    /// `@type=Review` — Pitchfork album reviews.
+    Review {
+        headline: Option<String>,
+        review_body: Option<String>,
+        rated_item: Option<String>,
+        author: Option<String>,
+        date_published: Option<String>,
+    },
+    /// Chrome types: WebPage, WebSite, SiteNavigationElement, BreadcrumbList,
+    /// CollectionPage. Formatters typically drop these unless asked to surface them.
+    WebPageOrChrome { raw_type: String },
+    /// Any other `@type` string, i.e. one we have no typed variant for yet.
+    /// The raw value is preserved so callers can still emit it.
+    Unknown {
+        raw_type: String,
+        raw: Box<serde_json::Value>,
+    },
+}
+
+impl JsonLdSchema {
+    /// Convenience: is this a content-bearing schema (vs WebPage chrome)?
+    pub fn is_content(&self) -> bool {
+        !matches!(self, JsonLdSchema::WebPageOrChrome { .. })
+    }
+
+    /// Convenience: short stable string for the schema kind, used by probe.py.
+    pub fn kind(&self) -> &'static str {
+        match self {
+            JsonLdSchema::ItemList { .. } => "ItemList",
+            JsonLdSchema::LiveBlogPosting { .. } => "LiveBlogPosting",
+            JsonLdSchema::NewsArticle { .. } => "NewsArticle",
+            JsonLdSchema::Review { .. } => "Review",
+            JsonLdSchema::WebPageOrChrome { .. } => "WebPageOrChrome",
+            JsonLdSchema::Unknown { .. } => "Unknown",
+        }
+    }
+}
+
+/// Classify a single JSON-LD value (`None` if it is not an object with a usable `@type`).
+/// Chrome types recurse into `mainEntity` and lift the result (Reuters `CollectionPage` → ItemList).
+pub fn classify_value(v: &Value) -> Option<JsonLdSchema> {
+    let obj = v.as_object()?;
+    let raw_type = type_string(obj.get("@type"))?;
+    let lower = raw_type.to_ascii_lowercase();
+
+    match lower.as_str() {
+        "itemlist" => Some(parse_itemlist(obj)),
+        "liveblogposting" => Some(parse_liveblog(obj)),
+        "newsarticle" | "article" | "blogposting" | "reportagenewsarticle" => {
+            Some(parse_news_article(obj))
+        }
+        "review" => Some(parse_review(obj)),
+        // Chrome / navigation types — explicit list.
+        "webpage" | "website" | "sitenavigationelement" | "breadcrumblist"
+        | "collectionpage" => {
+            // CollectionPage may wrap an ItemList in mainEntity. If so, lift it.
+            if let Some(main) = obj.get("mainEntity") {
+                if let Some(inner) = classify_value(main) {
+                    return Some(inner);
+                }
+            }
+            Some(JsonLdSchema::WebPageOrChrome { raw_type })
+        }
+        _ => Some(JsonLdSchema::Unknown {
+            raw_type,
+            raw: Box::new(v.clone()),
+        }),
+    }
+}
+
+/// Classify a `Vec<Value>` (matches `ExtractionResult.structured_data`'s shape).
+/// Returns one `JsonLdSchema` per classifiable value; values `classify_value` rejects are skipped.
+pub fn classify_all(values: &[Value]) -> Vec<JsonLdSchema> {
+    values.iter().filter_map(classify_value).collect()
+}
+
+/// Pick the highest-priority schema among the classified items (the first one on
+/// ties; `None` only for an empty slice). Priority: ItemList > LiveBlogPosting >
+/// Review > NewsArticle > Unknown > WebPageOrChrome.
+pub fn primary_schema(schemas: &[JsonLdSchema]) -> Option<&JsonLdSchema> {
+    let priority = |s: &JsonLdSchema| -> u8 {
+        match s {
+            JsonLdSchema::ItemList { .. } => 0,
+            JsonLdSchema::LiveBlogPosting { .. } => 1,
+            JsonLdSchema::Review { .. } => 2,
+            JsonLdSchema::NewsArticle { .. } => 3,
+            JsonLdSchema::Unknown { .. } => 4,
+            JsonLdSchema::WebPageOrChrome { .. } => 5,
+        }
+    };
+    schemas.iter().min_by_key(|s| priority(s))
+}
+
+// ----------------------------------------------------------------------
+// Helpers
+// ----------------------------------------------------------------------
+
+fn type_string(v: Option<&Value>) -> Option<String> {
+    match v? {
+        Value::String(s) => Some(s.clone()),
+        Value::Array(a) => a
+            .iter()
+            .find_map(|x| x.as_str().map(str::to_string)),
+        _ => None,
+    }
+}
+
+fn str_field(obj: &serde_json::Map<String, Value>, key: &str) -> Option<String> {
+    obj.get(key)
+        .and_then(|v| v.as_str())
+        .map(|s| s.trim().to_string())
+        .filter(|s| !s.is_empty())
+}
+
+fn u64_field(obj: &serde_json::Map<String, Value>, key: &str) -> Option<u64> {
+    obj.get(key).and_then(|v| v.as_u64())
+}
+
+fn author_string(v: Option<&Value>) -> Option<String> {
+    match v? {
+        Value::String(s) => Some(s.clone()),
+        Value::Object(o) => o.get("name").and_then(|n| n.as_str()).map(str::to_string),
+        Value::Array(a) => {
+            let names: Vec<String> = a
+                .iter()
+                .filter_map(|x| match x {
+                    Value::String(s) => Some(s.clone()),
+                    Value::Object(o) => o.get("name").and_then(|n| n.as_str()).map(str::to_string),
+                    _ => None,
+                })
+                .collect();
+            if names.is_empty() {
+                None
+            } else {
+                Some(names.join(", "))
+            }
+        }
+        _ => None,
+    }
+}
+
+fn item_reviewed_string(v: Option<&Value>) -> Option<String> {
+    let v = v?;
+    let obj = v.as_object()?;
+    obj.get("name").and_then(|n| n.as_str()).map(str::to_string)
+}
+
+fn parse_itemlist(obj: &serde_json::Map<String, Value>) -> JsonLdSchema {
+    let mut items = Vec::new();
+    if let Some(arr) = obj.get("itemListElement").and_then(|v| v.as_array()) {
+        for entry in arr {
+            let Some(e) = entry.as_object() else { continue };
+            // Two shapes seen in the wild:
+            // (1) ListItem with {position, url, name}.
+            // (2) ListItem with {position, item: {url, name, datePublished}}.
+            let inner_obj = e.get("item").and_then(|v| v.as_object()).unwrap_or(e);
+
+            let position = u64_field(e, "position").or_else(|| u64_field(inner_obj, "position"));
+            let url = str_field(inner_obj, "url").or_else(|| str_field(e, "url"));
+            let title = str_field(inner_obj, "name")
+                .or_else(|| str_field(e, "name"))
+                .or_else(|| str_field(inner_obj, "headline"))
+                .or_else(|| str_field(e, "headline"));
+            let published = str_field(inner_obj, "datePublished")
+                .or_else(|| str_field(e, "datePublished"));
+
+            items.push(ArticleRef {
+                title,
+                url,
+                published,
+                position,
+            });
+        }
+    }
+    let number_of_items = u64_field(obj, "numberOfItems");
+    JsonLdSchema::ItemList {
+        items,
+        number_of_items,
+    }
+}
+
+fn parse_liveblog(obj: &serde_json::Map<String, Value>) -> JsonLdSchema {
+    let headline = str_field(obj, "headline");
+    let mut updates = Vec::new();
+    if let Some(arr) = obj.get("liveBlogUpdate").and_then(|v| v.as_array()) {
+        for entry in arr {
+            let Some(e) = entry.as_object() else { continue };
+            updates.push(LiveUpdate {
+                headline: str_field(e, "headline"),
+                url: str_field(e, "url"),
+                published: str_field(e, "datePublished"),
+            });
+        }
+    }
+    JsonLdSchema::LiveBlogPosting { headline, updates }
+}
+
+fn parse_news_article(obj: &serde_json::Map<String, Value>) -> JsonLdSchema {
+    let headline = str_field(obj, "headline");
+    // articleBody is the canonical field; some sites use description.
+    let body = str_field(obj, "articleBody").or_else(|| str_field(obj, "description"));
+    let date_published = str_field(obj, "datePublished");
+    let author = author_string(obj.get("author"));
+    JsonLdSchema::NewsArticle {
+        headline,
+        body,
+        date_published,
+        author,
+    }
+}
+
+fn parse_review(obj: &serde_json::Map<String, Value>) -> JsonLdSchema {
+    let headline = str_field(obj, "headline").or_else(|| str_field(obj, "name"));
+    let review_body = str_field(obj, "reviewBody").or_else(|| str_field(obj, "description"));
+    let rated_item = item_reviewed_string(obj.get("itemReviewed"));
+    let author = author_string(obj.get("author"));
+    let date_published = str_field(obj, "datePublished");
+    JsonLdSchema::Review {
+        headline,
+        review_body,
+        rated_item,
+        author,
+        date_published,
+    }
+}
+
+// ----------------------------------------------------------------------
+// Tests
+// ----------------------------------------------------------------------
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    /// Test 1: ItemList JSON-LD with 3 itemListElement entries.
+    #[test]
+    fn test_jsonld_parse_itemlist() {
+        let v = json!({
+            "@context": "https://schema.org",
+            "@type": "ItemList",
+            "numberOfItems": 3,
+            "itemListElement": [
+                {"@type": "ListItem", "position": 1, "url": "https://a.example/1", "name": "First"},
+                {"@type": "ListItem", "position": 2, "url": "https://a.example/2", "name": "Second"},
+                {"@type": "ListItem", "position": 3, "url": "https://a.example/3", "name": "Third"},
+            ]
+        });
+        let s = classify_value(&v).expect("classify");
+        match s {
+            JsonLdSchema::ItemList { items, number_of_items } => {
+                assert_eq!(number_of_items, Some(3));
+                assert_eq!(items.len(), 3);
+                assert_eq!(items[0].position, Some(1));
+                assert_eq!(items[0].url.as_deref(), Some("https://a.example/1"));
+                assert_eq!(items[0].title.as_deref(), Some("First"));
+                assert_eq!(items[2].position, Some(3));
+            }
+            other => panic!("expected ItemList, got {other:?}"),
+        }
+    }
+
+    /// Test 2: LiveBlogPosting with 2 liveBlogUpdate entries.
+    #[test]
+    fn test_jsonld_parse_liveblog() {
+        let v = json!({
+            "@type": "LiveBlogPosting",
+            "headline": "Election Night Live",
+            "liveBlogUpdate": [
+                {"headline": "Polls closing", "url": "https://x/1", "datePublished": "2026-05-23T19:00:00Z"},
+                {"headline": "First results", "url": "https://x/2", "datePublished": "2026-05-23T19:15:00Z"},
+            ]
+        });
+        let s = classify_value(&v).expect("classify");
+        match s {
+            JsonLdSchema::LiveBlogPosting { headline, updates } => {
+                assert_eq!(headline.as_deref(), Some("Election Night Live"));
+                assert_eq!(updates.len(), 2);
+                assert_eq!(updates[0].headline.as_deref(), Some("Polls closing"));
+                assert_eq!(updates[1].url.as_deref(), Some("https://x/2"));
+            }
+            other => panic!("expected LiveBlogPosting, got {other:?}"),
+        }
+    }
+
+    /// Test 3: NewsArticle with articleBody.
+    #[test]
+    fn test_jsonld_parse_newsarticle() {
+        let v = json!({
+            "@type": "NewsArticle",
+            "headline": "Big Story",
+            "articleBody": "Lorem ipsum dolor sit amet.",
+            "datePublished": "2026-05-23",
+            "author": {"@type": "Person", "name": "Jane Doe"},
+        });
+        let s = classify_value(&v).expect("classify");
+        match s {
+            JsonLdSchema::NewsArticle { headline, body, date_published, author } => {
+                assert_eq!(headline.as_deref(), Some("Big Story"));
+                assert_eq!(body.as_deref(), Some("Lorem ipsum dolor sit amet."));
+                assert_eq!(date_published.as_deref(), Some("2026-05-23"));
+                assert_eq!(author.as_deref(), Some("Jane Doe"));
+            }
+            other => panic!("expected NewsArticle, got {other:?}"),
+        }
+    }
+
+    /// Test 4: Review with reviewBody and itemReviewed.
+    #[test]
+    fn test_jsonld_parse_review() {
+        let v = json!({
+            "@type": "Review",
+            "headline": "Images of Life",
+            "reviewBody": "A bountiful, baroque, eccentric record.",
+            "itemReviewed": {"@type": "MusicRecording", "name": "Images of Life"},
+            "author": [{"@type": "Person", "name": "Critic A"}],
+            "datePublished": "2026-05-23",
+        });
+        let s = classify_value(&v).expect("classify");
+        match s {
+            JsonLdSchema::Review { headline, review_body, rated_item, author, date_published } => {
+                assert_eq!(headline.as_deref(), Some("Images of Life"));
+                assert_eq!(review_body.as_deref(), Some("A bountiful, baroque, eccentric record."));
+                assert_eq!(rated_item.as_deref(), Some("Images of Life"));
+                assert_eq!(author.as_deref(), Some("Critic A"));
+                assert_eq!(date_published.as_deref(), Some("2026-05-23"));
+            }
+            other => panic!("expected Review, got {other:?}"),
+        }
+    }
+
+    /// Test 5: Unknown @type (Recipe) returns Unknown variant, doesn't crash.
+    #[test]
+    fn test_jsonld_parse_unknown_type() {
+        let v = json!({
+            "@type": "Recipe",
+            "name": "Banana Bread",
+            "recipeYield": "1 loaf",
+        });
+        let s = classify_value(&v).expect("classify");
+        match s {
+            JsonLdSchema::Unknown { raw_type, .. } => {
+                assert_eq!(raw_type, "Recipe");
+            }
+            other => panic!("expected Unknown, got {other:?}"),
+        }
+    }
+
+    /// Test 6: SiteNavigationElement returns WebPageOrChrome.
+    #[test]
+    fn test_jsonld_parse_webpage_dropped() {
+        let v = json!({
+            "@type": "SiteNavigationElement",
+            "name": "Main nav",
+        });
+        let s = classify_value(&v).expect("classify");
+        assert!(matches!(s, JsonLdSchema::WebPageOrChrome { .. }));
+        if let JsonLdSchema::WebPageOrChrome { raw_type } = s {
+            assert_eq!(raw_type, "SiteNavigationElement");
+        }
+    }
+
+    /// Test 7: Malformed Value (no @type at all) returns None, doesn't panic.
+    /// The "truncated JSON" case is the parser's responsibility (already
+    /// handled in structured_data.rs); the classifier sees only valid Values.
+    #[test]
+    fn test_jsonld_parse_malformed_no_crash() {
+        // Empty object — no @type.
+        let v1 = json!({});
+        assert!(classify_value(&v1).is_none());
+
+        // Bare string — not an object at all.
+        let v2 = json!("garbage");
+        assert!(classify_value(&v2).is_none());
+
+        // @type is not a string or array.
+        let v3 = json!({"@type": 42});
+        assert!(classify_value(&v3).is_none());
+
+        // Array of mixed garbage.
+        let v4 = json!([1, "two", {"@type": "Article", "headline": "ok"}]);
+        // classify_value on the array itself returns None (not an object),
+        // but classify_all extracts the one Article.
+        let all = classify_all(v4.as_array().unwrap());
+        assert_eq!(all.len(), 1);
+        assert_eq!(all[0].kind(), "NewsArticle");
+    }
+
+    /// Test 8: CollectionPage with nested mainEntity ItemList — lifts the inner.
+    /// This is the Reuters shape phase A confirmed.
+    #[test]
+    fn test_jsonld_collectionpage_lifts_mainentity_itemlist() {
+        let v = json!({
+            "@type": "CollectionPage",
+            "mainEntity": {
+                "@type": "ItemList",
+                "numberOfItems": 2,
+                "itemListElement": [
+                    {"@type": "ListItem", "position": 1, "url": "https://r.example/1"},
+                    {"@type": "ListItem", "position": 2, "url": "https://r.example/2"},
+                ]
+            }
+        });
+        let s = classify_value(&v).expect("classify");
+        match s {
+            JsonLdSchema::ItemList { items, number_of_items } => {
+                assert_eq!(items.len(), 2);
+                assert_eq!(number_of_items, Some(2));
+            }
+            other => panic!("expected lifted ItemList, got {other:?}"),
+        }
+    }
+
+    /// Test 9: primary_schema picks ItemList over NewsArticle and WebPage.
+    #[test]
+    fn test_primary_schema_picks_itemlist_first() {
+        let schemas = vec![
+            JsonLdSchema::WebPageOrChrome { raw_type: "WebPage".into() },
+            JsonLdSchema::NewsArticle {
+                headline: Some("x".into()),
+                body: None,
+                date_published: None,
+                author: None,
+            },
+            JsonLdSchema::ItemList {
+                items: vec![],
+                number_of_items: None,
+            },
+        ];
+        let p = primary_schema(&schemas).expect("primary");
+        assert!(matches!(p, JsonLdSchema::ItemList { .. }));
+    }
+
+    /// Test 10: ListItem with nested `item` object (alternate shape).
+    #[test]
+    fn test_jsonld_itemlist_with_nested_item_shape() {
+        let v = json!({
+            "@type": "ItemList",
+            "itemListElement": [
+                {
+                    "@type": "ListItem",
+                    "position": 1,
+                    "item": {
+                        "@type": "NewsArticle",
+                        "url": "https://x/1",
+                        "name": "Wrapped Title",
+                        "datePublished": "2026-05-23",
+                    }
+                },
+            ]
+        });
+        let s = classify_value(&v).expect("classify");
+        match s {
+            JsonLdSchema::ItemList { items, .. } => {
+                assert_eq!(items.len(), 1);
+                assert_eq!(items[0].url.as_deref(), Some("https://x/1"));
+                assert_eq!(items[0].title.as_deref(), Some("Wrapped Title"));
+                assert_eq!(items[0].published.as_deref(), Some("2026-05-23"));
+                assert_eq!(items[0].position, Some(1));
+            }
+            other => panic!("expected ItemList, got {other:?}"),
+        }
+    }
+}
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@ -12,6 +12,7 @@ pub mod error;
 pub mod extractor;
 #[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
 pub mod js_eval;
+pub mod jsonld;
 pub mod llm;
 pub mod markdown;
 pub mod metadata;
@ -25,9 +26,14 @@ pub use brand::BrandIdentity;
 pub use diff::{ChangeStatus, ContentDiff, MetadataChange};
 pub use domain::DomainType;
 pub use error::ExtractError;
+pub use jsonld::{
+    classify_all as classify_jsonld_all, classify_value as classify_jsonld_value, primary_schema,
+    ArticleRef, JsonLdSchema, LiveUpdate,
+};
 pub use llm::{
-    classify_hub, to_json_summary, to_json_toc, to_llm_summary, to_llm_text, to_llm_toc,
-    truncate_json_with_wrapper, truncate_with_footer, HubClassification,
+    classify_hub, to_json_summary, to_json_toc, to_llm_summary, to_llm_text,
+    to_llm_text_with_options, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer,
+    HubClassification, LlmTextOptions,
 };
 pub use types::{
    CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@ -18,8 +18,27 @@ pub use output_size::{
    truncate_with_footer,
 };

+use crate::jsonld::{classify_all, primary_schema, JsonLdSchema};
 use crate::types::ExtractionResult;

+/// Hard size cap on the legacy `## Structured Data` block emitted at the
+/// bottom of `to_llm_text` output. The schema-aware block emitted at the top
+/// when `--prefer-structured` is set is NOT capped by this value; only its
+/// raw-JSON fallback for unrecognised types is bounded (4096 bytes).
+const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
+
+/// Options for `to_llm_text_with_options`, layered on the legacy `to_llm_text`.
+///
+/// `LlmTextOptions::default()` leaves every flag off, which is exactly what
+/// `to_llm_text` passes, so default callers see no byte change.
+#[derive(Debug, Clone, Default)]
+pub struct LlmTextOptions {
+    /// When true, emit a schema-aware structured-data block after the metadata
+    /// header and before the prose. The legacy raw JSON `## Structured Data`
+    /// block is then always skipped, even if no schema-aware block rendered.
+    pub prefer_structured: bool,
+}
+
 /// Produce a token-optimized text representation of extracted content.
 ///
 /// The output has three sections:
@ -27,11 +46,35 @@ use crate::types::ExtractionResult;
 /// 2. Cleaned body (no images, no bold/italic, links as plain text)
 /// 3. Deduplicated links section at the end
 pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
+    to_llm_text_with_options(result, url, &LlmTextOptions::default())
+}
+
+/// Same as `to_llm_text`, but with additional structured-data behaviours
+/// controlled by `LlmTextOptions`. Used by the M4 `--prefer-structured` CLI
+/// flag.
+pub fn to_llm_text_with_options(
+    result: &ExtractionResult,
+    url: Option<&str>,
+    opts: &LlmTextOptions,
+) -> String {
    let mut out = String::new();

    // -- 1. Metadata header --
    metadata::build_metadata_header(&mut out, result, url);

+    // -- 1b. Schema-aware structured data BEFORE the prose, if requested --
+    // Phase A confirmed that on Pitchfork review pages the existing raw-JSON
+    // block surfaces at byte ~50000 of a 58KB output; this hoists it.
+    if opts.prefer_structured {
+        let schemas = classify_all(&result.structured_data);
+        if let Some(block) = render_structured_block(&schemas) {
+            if !out.is_empty() {
+                out.push('\n');
+            }
+            out.push_str(&block);
+        }
+    }
+
    // -- 2. Process body --
    let processed = body::process_body(&result.content.markdown);

@ -59,28 +102,140 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
    // hydration blobs (Next.js pageProps full of ad-targeting flags, build
    // IDs, schedule paths) explode to hundreds of KB and drown the LLM in
    // noise — drop them rather than ship them.
-    let mut useful: Vec<_> = result
-        .structured_data
-        .iter()
-        .filter(|v| is_useful_structured_data(v))
-        .cloned()
-        .collect();
-    for value in &mut useful {
-        scrub_body_fields(value, 0);
-    }
-    if !useful.is_empty() {
-        let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
-        const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
-        if serialized.len() <= STRUCTURED_DATA_MAX_BYTES {
-            out.push_str("\n\n## Structured Data\n\n```json\n");
-            out.push_str(&serialized);
-            out.push_str("\n```");
+    //
+    // When `prefer_structured` is set the schema-aware block already
+    // carries this information at the top, so we drop the legacy raw block
+    // to avoid duplication.
+    if !opts.prefer_structured {
+        let mut useful: Vec<_> = result
+            .structured_data
+            .iter()
+            .filter(|v| is_useful_structured_data(v))
+            .cloned()
+            .collect();
+        for value in &mut useful {
+            scrub_body_fields(value, 0);
+        }
+        if !useful.is_empty() {
+            let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
+            if serialized.len() <= STRUCTURED_DATA_MAX_BYTES {
+                out.push_str("\n\n## Structured Data\n\n```json\n");
+                out.push_str(&serialized);
+                out.push_str("\n```");
+            }
        }
    }

    out.trim().to_string()
 }

+/// Render a schema-aware Markdown block summarising the page's JSON-LD.
+///
+/// Only the record chosen by `primary_schema` is rendered; returns `None`
+/// when it chooses nothing. The block opens with a blank line and the
+/// `## Structured data` heading, then one `schema: <type>` line followed by
+/// that variant's fields. Article and review bodies are emitted uncapped;
+/// only the raw JSON of an unrecognised type is size-limited.
+///
+/// Format:
+/// ```text
+/// ## Structured data
+///
+/// schema: ItemList (20 items)
+/// 1. <name or url> — <url>
+/// 2. ...
+/// ```
+fn render_structured_block(schemas: &[JsonLdSchema]) -> Option<String> {
+    let primary = primary_schema(schemas)?;
+    let mut buf = String::from("\n## Structured data\n\n");
+    match primary {
+        JsonLdSchema::ItemList { items, number_of_items } => {
+            // Prefer the declared count when present; otherwise count parsed entries.
+            let n = number_of_items.unwrap_or(items.len() as u64);
+            buf.push_str(&format!("schema: ItemList ({n} items)\n"));
+            for (i, it) in items.iter().enumerate() {
+                // Entries without an explicit position are numbered from 1 by index.
+                let pos = it.position.unwrap_or(i as u64 + 1);
+                // Borrow the label (title, else url) instead of cloning a String per item.
+                let label = it.title.as_deref().or(it.url.as_deref()).unwrap_or("(no url)");
+                // The ` — <url>` suffix is only written when there is a URL to show.
+                match it.url.as_deref() {
+                    Some(url) if !url.is_empty() => {
+                        buf.push_str(&format!("{pos}. {label} — {url}\n"));
+                    }
+                    _ => buf.push_str(&format!("{pos}. {label}\n")),
+                }
+            }
+        }
+        JsonLdSchema::LiveBlogPosting { headline, updates } => {
+            buf.push_str("schema: LiveBlogPosting");
+            if let Some(h) = headline {
+                buf.push_str(&format!(" — {h}"));
+            }
+            buf.push('\n');
+            buf.push_str(&format!("updates: {}\n", updates.len()));
+            // One bullet per update, prefixed with its timestamp when known.
+            for u in updates {
+                let label = u.headline.as_deref().or(u.url.as_deref()).unwrap_or("(no url)");
+                match u.published.as_deref() {
+                    Some(ts) if !ts.is_empty() => buf.push_str(&format!("- [{ts}] {label}\n")),
+                    _ => buf.push_str(&format!("- {label}\n")),
+                }
+            }
+        }
+        JsonLdSchema::NewsArticle { headline, body, date_published, author } => {
+            buf.push_str("schema: NewsArticle\n");
+            push_schema_field(&mut buf, "headline", headline.as_ref());
+            push_schema_field(&mut buf, "author", author.as_ref());
+            push_schema_field(&mut buf, "published", date_published.as_ref());
+            push_schema_body(&mut buf, body.as_deref());
+        }
+        JsonLdSchema::Review { headline, review_body, rated_item, author, date_published } => {
+            buf.push_str("schema: Review\n");
+            push_schema_field(&mut buf, "headline", headline.as_ref());
+            push_schema_field(&mut buf, "rated", rated_item.as_ref());
+            push_schema_field(&mut buf, "author", author.as_ref());
+            push_schema_field(&mut buf, "published", date_published.as_ref());
+            push_schema_body(&mut buf, review_body.as_deref());
+        }
+        JsonLdSchema::WebPageOrChrome { raw_type } => {
+            // Surface the WebPage block even though normal output drops it —
+            // user explicitly asked via --prefer-structured.
+            buf.push_str(&format!("schema: {raw_type}\n"));
+            buf.push_str("(navigation/chrome record; no content fields)\n");
+        }
+        JsonLdSchema::Unknown { raw_type, raw } => {
+            buf.push_str(&format!("schema: {raw_type} (unrecognised)\n"));
+            // Oversized raw JSON is omitted entirely so it cannot swamp the output.
+            let pretty = serde_json::to_string_pretty(raw).unwrap_or_default();
+            if pretty.len() <= 4096 {
+                buf.push_str("\n```json\n");
+                buf.push_str(&pretty);
+                buf.push_str("\n```\n");
+            }
+        }
+    }
+    Some(buf)
+}
+
+/// Append a `key: value` line to `buf` when `value` is present; absent
+/// values write nothing, so optional fields never leave empty lines behind.
+fn push_schema_field(buf: &mut String, key: &str, value: Option<impl std::fmt::Display>) {
+    if let Some(v) = value {
+        buf.push_str(&format!("{key}: {v}\n"));
+    }
+}
+
+/// Append a long-form body, separated from the field lines above it by a
+/// blank line and terminated with a newline; a missing body writes nothing.
+fn push_schema_body(buf: &mut String, body: Option<&str>) {
+    if let Some(text) = body {
+        buf.push('\n');
+        buf.push_str(text);
+        buf.push('\n');
+    }
+}
+
 /// Decide whether a structured-data value carries content worth emitting.
 ///
 /// Schema.org records with a recognizable content `@type` (Article, NewsArticle,
@ -976,4 +1131,87 @@ mod tests {
            "shallow articleBody must still be scrubbed"
        );
    }
+
+    // ------------------------------------------------------------------
+    // M4: --prefer-structured / --articles-from-jsonld integration tests
+    // ------------------------------------------------------------------
+
+    /// Default options must match `to_llm_text` byte-for-byte. As that wrapper
+    /// delegates here, also pin that the M4 block is absent by default.
+    #[test]
+    fn to_llm_text_with_options_default_is_legacy_identical() {
+        let result = make_result_with_structured(vec![serde_json::json!({
+            "@type": "Article",
+            "headline": "Hello",
+        })]);
+        let legacy = to_llm_text(&result, None);
+        let with_opts = to_llm_text_with_options(&result, None, &LlmTextOptions::default());
+        assert_eq!(legacy, with_opts, "default opts must be byte-identical");
+        assert!(!legacy.contains("## Structured data"), "M4 block leaked into defaults");
+    }
+
+    /// `prefer_structured` hoists the schema-aware block ahead of the prose
+    /// body and stops the legacy bottom `## Structured Data` block appearing.
+    #[test]
+    fn prefer_structured_places_block_above_body_and_drops_legacy() {
+        let review = serde_json::json!({
+            "@type": "Review",
+            "headline": "Album X",
+            "reviewBody": "A long-form review body that would normally be far down the page.".repeat(20),
+            "datePublished": "2026-05-23",
+        });
+        let mut page = make_result_with_structured(vec![review]);
+        page.content.markdown = "## Body Section\n\nLong prose body here.\n".repeat(20);
+        let text = to_llm_text_with_options(&page, None, &LlmTextOptions { prefer_structured: true });
+
+        // Both sections must exist, with the structured one starting first.
+        let struct_idx = text
+            .find("## Structured data")
+            .expect("schema-aware block must be present");
+        let body_idx = text
+            .find("Body Section")
+            .expect("prose body must be present");
+        assert!(
+            struct_idx < body_idx,
+            "schema-aware block must come BEFORE prose body (struct@{struct_idx}, body@{body_idx})"
+        );
+
+        // The capital-D legacy heading must not occur anywhere in the output.
+        assert!(
+            !text.contains("## Structured Data"),
+            "legacy uppercase 'Structured Data' block must be dropped when prefer_structured is set"
+        );
+    }
+
+    /// An ItemList under `prefer_structured` renders a count header plus one line per item.
+    #[test]
+    fn prefer_structured_itemlist_renders_items() {
+        let item_list = serde_json::json!({
+            "@type": "ItemList",
+            "numberOfItems": 2,
+            "itemListElement": [
+                {"@type": "ListItem", "position": 1, "url": "https://x/1", "name": "First"},
+                {"@type": "ListItem", "position": 2, "url": "https://x/2", "name": "Second"},
+            ]
+        });
+        let page = make_result_with_structured(vec![item_list]);
+        let out = to_llm_text_with_options(&page, None, &LlmTextOptions { prefer_structured: true });
+        assert!(out.contains("schema: ItemList (2 items)"), "missing header in:\n{out}");
+        assert!(out.contains("1. First — https://x/1"), "missing item 1 in:\n{out}");
+        assert!(out.contains("2. Second — https://x/2"), "missing item 2 in:\n{out}");
+    }
+
+    /// A `WebPage` chrome record is dropped from normal output, but the explicit
+    /// `prefer_structured` opt-in still renders a block naming its schema type.
+    #[test]
+    fn prefer_structured_surfaces_webpage_chrome() {
+        let chrome = serde_json::json!({
+            "@type": "WebPage",
+            "name": "Hub Page",
+        });
+        let page = make_result_with_structured(vec![chrome]);
+        let out = to_llm_text_with_options(&page, None, &LlmTextOptions { prefer_structured: true });
+        assert!(out.contains("## Structured data"), "missing header in:\n{out}");
+        assert!(out.contains("schema: WebPage"), "missing WebPage schema label in:\n{out}");
+    }
 }