feat(core): word-count breakdown in header — article vs chrome split

Previously, `Word count: N` was a single number that conflated the article body with the surrounding chrome (nav, ads, footer), so callers couldn't tell from the header alone whether to drill in or move on. New: `Word count: N (article: M, chrome: K)` in `-f llm/text` output. For `-f json`, this adds `word_count_article` and `word_count_chrome` fields alongside the existing `word_count`. M (article body) is sourced from the JSON-LD body when M4's parser found one (`NewsArticle.articleBody` or `Review.reviewBody`); otherwise it is computed by `llm::body_word_count` (the M2-style heuristic — words outside markdown link patterns, using the same `body::process_body` output that `hub_detect` uses). `--mode summary` / `toc` / `sections` fall back to the simple `Word count: N` form (those modes don't extract body content, so the breakdown would be meaningless). Suppression piggybacks on the existing `include_status` toggle in `build_metadata_header_with_opts`. 9 new tests in webclaw-core (4 in `lib.rs::tests` for the population logic; 5 in `llm/metadata.rs::m12_tests` for the header formatter). Workspace test count: 701 -> 710.
2026-07-25 07:41:01 +02:00 · 2026-05-23 23:56:14 +02:00 · 2026-05-23 23:56:14 +02:00 · d5a3aa4bf9
commit d5a3aa4bf9
parent ade2a5143c
17 changed files with 519 additions and 7 deletions
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -3032,6 +3032,8 @@ mod tests {
                image: None,
                favicon: None,
                word_count: markdown.split_whitespace().count(),
+                word_count_article: 0,
+                word_count_chrome: 0,
                http_status: None,
            },
            content: Content {
--- a/crates/webclaw-core/src/diff.rs
+++ b/crates/webclaw-core/src/diff.rs
@ -148,6 +148,8 @@ mod tests {
                image: None,
                favicon: None,
                word_count,
+                word_count_article: 0,
+                word_count_chrome: 0,
                http_status: None,
            },
            content: Content {
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@ -31,10 +31,10 @@ pub use jsonld::{
    ArticleRef, JsonLdSchema, LiveUpdate,
 };
 pub use llm::{
-    classify_hub, classify_thin_body, collect_section_links, to_json_sections, to_json_summary,
-    to_json_toc, to_llm_sections, to_llm_summary, to_llm_text, to_llm_text_with_options,
-    to_llm_toc, truncate_json_with_wrapper, truncate_with_footer, HubClassification,
-    LlmTextOptions, ThinBodyClassification,
+    body_word_count, classify_hub, classify_thin_body, collect_section_links, to_json_sections,
+    to_json_summary, to_json_toc, to_llm_sections, to_llm_summary, to_llm_text,
+    to_llm_text_with_options, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer,
+    HubClassification, LlmTextOptions, ThinBodyClassification,
 };
 pub use types::{
    CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
@ -114,6 +114,11 @@ fn extract_with_options_inner(
        let doc = Html::parse_document(html);
        let mut meta = metadata::extract(&doc, url);
        meta.word_count = extractor::word_count(&yt_md);
+        // M12: YouTube fast path emits structured video metadata only
+        // (title, channel, view count, description). No chrome / nav /
+        // ads in the output — all words are "article" by definition.
+        meta.word_count_article = meta.word_count;
+        meta.word_count_chrome = 0;

        let plain_text = yt_md
            .lines()
@ -234,6 +239,16 @@ fn extract_with_options_inner(
    structured_data.extend(structured_data::extract_next_data(html));
    structured_data.extend(structured_data::extract_sveltekit(html));

+    // M12 (issue #7): split the total word_count into an article-body
+    // portion and a chrome portion. Computed once, here, AFTER all the
+    // word_count update paths above (data island, QuickJS, retry strategies)
+    // have settled. Sourced from JSON-LD articleBody/reviewBody when
+    // present, else the M2-style body word count on the extracted markdown.
+    let (article_wc, chrome_wc) =
+        compute_word_count_breakdown(&content.markdown, &structured_data, meta.word_count);
+    meta.word_count_article = article_wc;
+    meta.word_count_chrome = chrome_wc;
+
    Ok(ExtractionResult {
        metadata: meta,
        content,
@ -242,6 +257,60 @@ fn extract_with_options_inner(
    })
 }

+/// M12 helper: splits a page's total word count into an article-body
+/// portion and a chrome remainder, returned as `(article, chrome)`.
+///
+/// The article portion comes from the first source that applies:
+/// 1. The JSON-LD body of the schema picked by
+///    [`crate::jsonld::primary_schema`] — `body` on a `NewsArticle` or
+///    `review_body` on a `Review` — counted with `extractor::word_count`.
+/// 2. Otherwise [`llm::body_word_count`] on the extracted markdown (the
+///    "words outside markdown link patterns" estimator).
+///
+/// Invariant: `article + chrome == total_wc` for every returned pair.
+/// `article` is clamped to `total_wc` because the JSON-LD body can hold
+/// more words than the extracted markdown (the two are tokenized from
+/// different text, so the split is best-effort rather than an exact
+/// partition).
+///
+/// When `total_wc == 0` the result is `(0, 0)` and no JSON-LD
+/// classification is attempted. The matching `Metadata` fields carry only
+/// `#[serde(default)]`, so they are still serialized — as zeros — in that
+/// case; nothing is dropped from JSON output.
+fn compute_word_count_breakdown(
+    markdown: &str,
+    structured_data: &[serde_json::Value],
+    total_wc: usize,
+) -> (usize, usize) {
+    // Nothing to split, and no reason to pay for schema classification.
+    if total_wc == 0 {
+        return (0, 0);
+    }
+
+    // 1. JSON-LD articleBody / reviewBody — preferred when the primary
+    //    schema exposes one.
+    let schemas = jsonld::classify_all(structured_data);
+    let jsonld_body: Option<&str> = match jsonld::primary_schema(&schemas) {
+        Some(jsonld::JsonLdSchema::NewsArticle { body: Some(b), .. }) => Some(b.as_str()),
+        Some(jsonld::JsonLdSchema::Review {
+            review_body: Some(b),
+            ..
+        }) => Some(b.as_str()),
+        _ => None,
+    };
+
+    let article_raw = match jsonld_body {
+        Some(body_str) => extractor::word_count(body_str),
+        // 2. Fallback: body word count on the extracted markdown.
+        None => llm::body_word_count(markdown),
+    };
+
+    // Clamp before subtracting: `article_raw` may exceed `total_wc` (see the
+    // tokenization note above), and the clamp also guarantees the
+    // subtraction below cannot underflow.
+    let article = article_raw.min(total_wc);
+    let chrome = total_wc - article;
+    (article, chrome)
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@ -660,4 +729,209 @@ mod tests {
            "wasm path and threaded path must produce identical content"
        );
    }
+
+    // -----------------------------------------------------------------
+    // M12 (issue #7): word-count breakdown — article vs chrome split.
+    // Tests the POPULATION logic in `extract_with_options_inner` /
+    // `compute_word_count_breakdown`. Formatter behavior is tested in
+    // `crate::llm::metadata::m12_tests`.
+    // -----------------------------------------------------------------
+
+    /// M12 test 1: with a JSON-LD `NewsArticle.articleBody` on the page,
+    /// the article portion is populated (non-zero) and the chrome portion
+    /// is whatever is left of the total, so article + chrome == word_count.
+    #[test]
+    fn test_word_count_breakdown_with_jsonld_article_body() {
+        // The articleBody string is 20 words long. The same sentence is
+        // repeated in a <p>, surrounded by nav and footer chrome, so the
+        // extracted markdown can contain both kinds of words.
+        let html = r#"
+        <html lang="en">
+        <head>
+            <title>Tariffs hit consumers</title>
+            <script type="application/ld+json">
+            {
+              "@context": "https://schema.org",
+              "@type": "NewsArticle",
+              "headline": "Tariffs hit consumers",
+              "articleBody": "Tariffs are taxes on imports paid by consumers in the importing country, not the exporting one, economists explained today again."
+            }
+            </script>
+        </head>
+        <body>
+            <nav><a href="/">Home</a> | <a href="/markets">Markets</a> | <a href="/world">World</a></nav>
+            <article>
+                <h1>Tariffs hit consumers</h1>
+                <p>Tariffs are taxes on imports paid by consumers in the importing country, not the exporting one, economists explained today again.</p>
+            </article>
+            <footer>Subscribe to our newsletter for daily updates and breaking-news alerts</footer>
+        </body>
+        </html>"#;
+
+        let page_url = "https://news.example.com/tariffs";
+        let extracted = extract(html, Some(page_url)).unwrap();
+        let (total, article, chrome) = (
+            extracted.metadata.word_count,
+            extracted.metadata.word_count_article,
+            extracted.metadata.word_count_chrome,
+        );
+
+        assert!(total > 0, "extraction must produce a word count");
+
+        // How many words the scorer keeps is not pinned here; only the
+        // structural relationship between the three counters is.
+        assert_eq!(
+            article + chrome,
+            total,
+            "invariant: article + chrome == total. \
+             got article={}, chrome={}, total={}",
+            article,
+            chrome,
+            total
+        );
+        assert!(
+            article > 0,
+            "JSON-LD articleBody must populate article portion (>0); \
+             got article={}",
+            article
+        );
+    }
+
+    /// M12 test 2: without any JSON-LD body the article portion comes from
+    /// the markdown-based fallback (`llm::body_word_count`). It must be
+    /// non-zero for a page with real prose, and article + chrome must
+    /// still add up to the total.
+    #[test]
+    fn test_word_count_breakdown_without_jsonld_falls_back_to_heuristic() {
+        // Deliberately no <script type="application/ld+json"> block, so
+        // only the fallback estimator can produce the article count.
+        let html = r#"
+        <html lang="en">
+        <head><title>Plain article</title></head>
+        <body>
+            <article>
+                <h1>Plain article</h1>
+                <p>The economy expanded last quarter at an annualized rate of three percent
+                   driven primarily by consumer spending and a rebound in fixed investment,
+                   government statisticians reported on Thursday morning at the usual hour.</p>
+                <p>Analysts had broadly expected the print, but the composition of the gain
+                   surprised some who had bet that residential housing would drag the headline
+                   number into the low twos rather than the comfortable threes.</p>
+            </article>
+        </body>
+        </html>"#;
+
+        let page_url = "https://news.example.com/gdp";
+        let extracted = extract(html, Some(page_url)).unwrap();
+        let (total, article, chrome) = (
+            extracted.metadata.word_count,
+            extracted.metadata.word_count_article,
+            extracted.metadata.word_count_chrome,
+        );
+
+        assert!(total > 0, "extraction must produce a word count");
+        assert_eq!(
+            article + chrome,
+            total,
+            "invariant: article + chrome == total. \
+             got article={}, chrome={}, total={}",
+            article,
+            chrome,
+            total
+        );
+        assert!(
+            article > 0,
+            "fallback body heuristic must populate article portion (>0); \
+             got article={}",
+            article
+        );
+
+        // Guard the fixture itself: either nothing structured was found, or
+        // nothing that was found carries an article/review body.
+        assert!(
+            extracted.structured_data.is_empty()
+                || !crate::jsonld::classify_all(&extracted.structured_data)
+                    .iter()
+                    .any(|s| matches!(
+                        s,
+                        crate::jsonld::JsonLdSchema::NewsArticle { body: Some(_), .. }
+                            | crate::jsonld::JsonLdSchema::Review { review_body: Some(_), .. }
+                    )),
+            "fixture should have no JSON-LD article/review body — \
+             this test exercises the fallback path"
+        );
+    }
+
+    /// M12 test 3: the serialized JSON carries `word_count_article` and
+    /// `word_count_chrome` next to the pre-existing `word_count`, and the
+    /// three numbers satisfy article + chrome == total.
+    #[test]
+    fn test_word_count_breakdown_json_format_has_three_fields() {
+        let html = r#"
+        <html lang="en">
+        <head><title>JSON shape test</title></head>
+        <body>
+            <article>
+                <h1>JSON shape</h1>
+                <p>The body of this article has more than ten words so the
+                   fallback heuristic populates a positive article portion.
+                   The remaining chrome words come from any nav and footer.</p>
+            </article>
+        </body>
+        </html>"#;
+
+        let result = extract(html, Some("https://example.com/json")).unwrap();
+        let json: serde_json::Value =
+            serde_json::from_str(&serde_json::to_string(&result).unwrap()).unwrap();
+        let meta = &json["metadata"];
+
+        // All three keys must be present as unsigned integers. The
+        // breakdown fields are declared on `Metadata` with `#[serde(default)]`
+        // only (no skip-serializing guard), so a missing key is a shape
+        // regression rather than a legitimate zero — hence no
+        // `unwrap_or(0)` here.
+        let field = |key: &str| -> u64 {
+            meta.get(key)
+                .and_then(serde_json::Value::as_u64)
+                .unwrap_or_else(|| {
+                    panic!("json must carry unsigned integer field `{key}`; got: {meta}")
+                })
+        };
+        let total = field("word_count");
+        let article = field("word_count_article");
+        let chrome = field("word_count_chrome");
+
+        assert_eq!(
+            article + chrome,
+            total,
+            "invariant: article + chrome == word_count in JSON output. \
+             got article={article}, chrome={chrome}, total={total}; meta={meta}"
+        );
+        assert!(
+            article > 0 || total == 0,
+            "expect at least some article words when total > 0; \
+             got article={article}, total={total}"
+        );
+    }
+
+    /// M12 test 4: the summary, toc and sections renderings never show the
+    /// `(article: …, chrome: …)` parenthetical. Whether a given mode prints
+    /// a plain "Word count:" line at all is left to that mode; only the
+    /// absence of the breakdown is pinned here.
+    #[test]
+    fn test_word_count_omitted_or_simple_in_summary_mode() {
+        let html = r#"
+        <html lang="en">
+        <head><title>Hub-style page</title></head>
+        <body>
+            <nav>
+                <a href="/a">First section</a>
+                <a href="/b">Second section</a>
+                <a href="/c">Third section</a>
+            </nav>
+            <article>
+                <p>Short body for hub-style page; the summary mode emits a link list, not a metadata header.</p>
+            </article>
+        </body>
+        </html>"#;
+
+        let page_url = Some("https://example.com/hub");
+        let extracted = extract(html, page_url).unwrap();
+
+        // Render every link-list / outline mode once, keyed by a label
+        // that is used in the failure message.
+        let summary = crate::to_llm_summary(&extracted, page_url);
+        let toc = crate::to_llm_toc(&extracted, page_url);
+        let sections = crate::to_llm_sections(&extracted, page_url);
+        let renderings = [("summary", &summary), ("toc", &toc), ("sections", &sections)];
+
+        for (name, output) in renderings {
+            let has_breakdown = output.contains("(article:");
+            assert!(
+                !has_breakdown,
+                "{name} mode must NOT contain the article/chrome breakdown; \
+                 got: {output}"
+            );
+        }
+    }
 }
--- a/crates/webclaw-core/src/llm/hub_detect.rs
+++ b/crates/webclaw-core/src/llm/hub_detect.rs
@ -136,6 +136,8 @@ mod tests {
                image: None,
                favicon: None,
                word_count: 0,
+                word_count_article: 0,
+                word_count_chrome: 0,
                http_status: None,
            },
            content: Content {
--- a/crates/webclaw-core/src/llm/metadata.rs
+++ b/crates/webclaw-core/src/llm/metadata.rs
@ -68,6 +68,158 @@ pub(crate) fn build_metadata_header_with_opts(
        out.push_str(&format!("> Language: {l}\n"));
    }
    if meta.word_count > 0 {
-        out.push_str(&format!("> Word count: {}\n", meta.word_count));
+        // M12 (issue #7): split the total into an article-body portion and
+        // a chrome remainder so LLM callers can tell at a glance whether
+        // there's real content under the chrome. When the breakdown is
+        // available (article + chrome == total, set in
+        // `extract_with_options_inner::compute_word_count_breakdown`), emit
+        // the parenthetical; otherwise fall back to the legacy single-N
+        // form (e.g. local-file / --stdin / direct
+        // `extract_with_options` calls that leave the breakdown fields at
+        // their `Default` zero — same shape as the http_status fallback).
+        //
+        // `--mode summary` / `--mode toc` (`include_status=false`)
+        // intentionally fall back to the simple `Word count: N` form: the
+        // link-list / outline modes don't surface article content, so the
+        // breakdown's "did chrome eat the body?" question is irrelevant
+        // there. This piggybacks on the existing `include_status` toggle
+        // — same modes, same suppression intent (iter-7 next-prompt
+        // explicitly authorized either omit-or-simple). `--mode sections`
+        // builds its own header and doesn't reach this code at all.
+        let n = meta.word_count;
+        let m = meta.word_count_article;
+        let k = meta.word_count_chrome;
+        if include_status && m + k == n && (m > 0 || k > 0) {
+            out.push_str(&format!(
+                "> Word count: {n} (article: {m}, chrome: {k})\n"
+            ));
+        } else {
+            out.push_str(&format!("> Word count: {n}\n"));
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// M12 tests for the header word-count breakdown emission. The breakdown POPULATION
+// logic (jsonld articleBody → fallback heuristic) is tested in lib.rs::tests.
+// These tests pin the FORMATTER behavior given pre-populated Metadata fields.
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+mod m12_tests {
+    use super::*;
+    use crate::types::{Content, ExtractionResult, Metadata};
+
+    /// Builds a content-free `ExtractionResult` whose metadata carries the
+    /// given total / article / chrome word counts. Only the title and URL
+    /// are populated besides the counters.
+    fn make_result_with_wc(word_count: usize, article: usize, chrome: usize) -> ExtractionResult {
+        let metadata = Metadata {
+            title: Some("Test Page".into()),
+            description: None,
+            author: None,
+            published_date: None,
+            language: None,
+            url: Some("https://example.com/".into()),
+            site_name: None,
+            image: None,
+            favicon: None,
+            word_count,
+            word_count_article: article,
+            word_count_chrome: chrome,
+            http_status: None,
+        };
+        let content = Content {
+            markdown: String::new(),
+            plain_text: String::new(),
+            links: Vec::new(),
+            images: Vec::new(),
+            code_blocks: Vec::new(),
+            raw_html: None,
+        };
+        ExtractionResult {
+            metadata,
+            content,
+            domain_data: None,
+            structured_data: Vec::new(),
+        }
+    }
+
+    /// Renders the metadata header for a fixture with the given counters
+    /// and returns the text that `build_metadata_header` appended.
+    fn render_header(word_count: usize, article: usize, chrome: usize) -> String {
+        let mut rendered = String::new();
+        let fixture = make_result_with_wc(word_count, article, chrome);
+        build_metadata_header(&mut rendered, &fixture, None);
+        rendered
+    }
+
+    /// Formatter case 1: article + chrome adds up to the total and at least
+    /// one of them is non-zero, so the parenthetical form is printed and
+    /// the bare single-number line is not.
+    #[test]
+    fn header_emits_breakdown_when_article_plus_chrome_equals_total() {
+        let out = render_header(1000, 600, 400);
+        assert!(
+            out.contains("> Word count: 1000 (article: 600, chrome: 400)"),
+            "expected breakdown form, got: {out}"
+        );
+        assert!(
+            !out.contains("> Word count: 1000\n"),
+            "must not contain legacy form when breakdown is present, got: {out}"
+        );
+    }
+
+    /// Formatter case 2: both breakdown counters are zero while the total
+    /// is not, which is how fixtures that never set the breakdown look.
+    /// The header keeps the single-number form.
+    #[test]
+    fn header_falls_back_to_legacy_form_when_breakdown_unpopulated() {
+        let out = render_header(1000, 0, 0);
+        assert!(
+            out.contains("> Word count: 1000\n"),
+            "expected legacy single-N form, got: {out}"
+        );
+        assert!(
+            !out.contains("(article:"),
+            "must not contain parenthetical when fields are zero, got: {out}"
+        );
+    }
+
+    /// Formatter case 3: an all-article result (chrome = 0, article equal
+    /// to the total) still gets the parenthetical, with `chrome: 0`
+    /// spelled out.
+    #[test]
+    fn header_emits_breakdown_with_chrome_zero_when_article_equals_total() {
+        let out = render_header(500, 500, 0);
+        assert!(
+            out.contains("> Word count: 500 (article: 500, chrome: 0)"),
+            "expected breakdown with chrome=0, got: {out}"
+        );
+    }
+
+    /// Formatter case 4: a zero total suppresses the Word count line
+    /// altogether, in either form.
+    #[test]
+    fn header_omits_word_count_line_entirely_when_total_zero() {
+        let out = render_header(0, 0, 0);
+        assert!(
+            !out.contains("Word count"),
+            "expected no Word count line when total is 0, got: {out}"
+        );
+    }
+
+    /// Formatter case 5: counters that do not add up to the total are
+    /// never printed; the header shows the single-number form instead of
+    /// inconsistent arithmetic.
+    #[test]
+    fn header_falls_back_when_article_plus_chrome_mismatches_total() {
+        // 600 + 300 != 1000
+        let out = render_header(1000, 600, 300);
+        assert!(
+            out.contains("> Word count: 1000\n"),
+            "expected legacy form when breakdown invariant violated, got: {out}"
+        );
+        assert!(
+            !out.contains("(article:"),
+            "must not surface inconsistent breakdown, got: {out}"
+        );
    }
 }
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@ -17,6 +17,26 @@ mod thin_body;
 pub use hub_detect::{classify as classify_hub, HubClassification};
 pub use sections::{collect_section_links, to_json_sections, to_llm_sections};
 pub use thin_body::{classify as classify_thin_body, ThinBodyClassification};
+
+/// Counts the whitespace-separated words left in `markdown` after it has
+/// been run through `body::process_body`, i.e. the words of the processed
+/// body text rather than of the raw markdown.
+///
+/// Used by M12 (`word_count_article` / `word_count_chrome` breakdown) as the
+/// fallback estimator when no JSON-LD `articleBody` / `reviewBody` is
+/// available. Exposed publicly so `lib.rs::extract_with_options_inner` can
+/// populate `Metadata.word_count_article` without reaching across the
+/// `llm::body` `pub(crate)` boundary.
+///
+/// Returns `0` when the processed text is empty or whitespace-only.
+pub fn body_word_count(markdown: &str) -> usize {
+    // `split_whitespace` never yields empty slices, so no extra
+    // empty-token filtering is needed before counting.
+    body::process_body(markdown)
+        .text
+        .split_whitespace()
+        .count()
+}
 pub use output_size::{
    to_json_summary, to_json_toc, to_llm_summary, to_llm_toc, truncate_json_with_wrapper,
    truncate_with_footer,
@ -348,6 +368,12 @@ mod tests {
                image: None,
                favicon: None,
                word_count: 42,
+                // M12: default fixture leaves breakdown unset (zero); the
+                // header formatter falls back to the legacy `Word count: N`
+                // form when article+chrome != total. Tests that need the
+                // breakdown set these explicitly.
+                word_count_article: 0,
+                word_count_chrome: 0,
                http_status: None,
            },
            content: Content {
@ -602,6 +628,8 @@ mod tests {
                image: None,
                favicon: None,
                word_count: 0,
+                word_count_article: 0,
+                word_count_chrome: 0,
                http_status: None,
            },
            content: Content {
--- a/crates/webclaw-core/src/llm/output_size.rs
+++ b/crates/webclaw-core/src/llm/output_size.rs
@ -359,6 +359,8 @@ mod tests {
                image: None,
                favicon: None,
                word_count: 0,
+                word_count_article: 0,
+                word_count_chrome: 0,
                http_status: None,
            },
            content: Content {
--- a/crates/webclaw-core/src/llm/sections.rs
+++ b/crates/webclaw-core/src/llm/sections.rs
@ -316,6 +316,8 @@ mod tests {
                image: None,
                favicon: None,
                word_count: 0,
+                word_count_article: 0,
+                word_count_chrome: 0,
                http_status: None,
            },
            content: Content {
--- a/crates/webclaw-core/src/llm/thin_body.rs
+++ b/crates/webclaw-core/src/llm/thin_body.rs
@ -152,6 +152,8 @@ mod tests {
                image: None,
                favicon: None,
                word_count,
+                word_count_article: 0,
+                word_count_chrome: 0,
                http_status: Some(200),
            },
            content: Content {
--- a/crates/webclaw-core/src/metadata.rs
+++ b/crates/webclaw-core/src/metadata.rs
@ -51,8 +51,10 @@ pub fn extract(doc: &Html, url: Option<&str>) -> Metadata {
        site_name,
        image,
        favicon,
-        word_count: 0, // filled later by the extractor
-        http_status: None, // filled by webclaw-fetch when reachable; None for local-file / --stdin
+        word_count: 0,         // filled later by the extractor
+        word_count_article: 0, // filled by M12 breakdown in extract_with_options_inner
+        word_count_chrome: 0,  // filled by M12 breakdown in extract_with_options_inner
+        http_status: None,     // filled by webclaw-fetch when reachable; None for local-file / --stdin
    }
 }

--- a/crates/webclaw-core/src/types.rs
+++ b/crates/webclaw-core/src/types.rs
@ -27,6 +27,28 @@ pub struct Metadata {
    pub image: Option<String>,
    pub favicon: Option<String>,
    pub word_count: usize,
+    /// Article-body portion of `word_count`, computed in
+    /// `extract_with_options_inner`. Sourced from JSON-LD `articleBody`
+    /// (or `reviewBody`) via the M4 classifier when present; falls back
+    /// to the M2-style body-text heuristic (`llm::body_word_count` on the
+    /// extracted markdown) when JSON-LD body is absent. Whenever the
+    /// breakdown has been populated, `word_count_article +
+    /// word_count_chrome == word_count` holds (M is clamped to N if the
+    /// JSON-LD body has more words than the extracted markdown).
+    /// Serialized unconditionally (parallel to `word_count`) so callers
+    /// can rely on both M12 fields being present alongside the existing
+    /// total. On the no-breakdown path (local-file / --stdin / direct
+    /// `extract_with_options` callers) both fields stay zero even when
+    /// `word_count` is non-zero, so the sum invariant does not apply
+    /// there. `default = 0` for backward-compat on incoming JSON that
+    /// predates M12. M12 / issue #7.
+    #[serde(default)]
+    pub word_count_article: usize,
+    /// Chrome portion of `word_count` (= `word_count - word_count_article`).
+    /// "Chrome" means everything not in the article body: navigation,
+    /// related-link sidebars, footers, ad slots, ticker rows, link cards,
+    /// etc. Serialized unconditionally (parallel to `word_count`); zero
+    /// on the no-breakdown path. M12 / issue #7.
+    #[serde(default)]
+    pub word_count_chrome: usize,
    /// HTTP status code from the final response (after redirects). `None`
    /// when extraction was not preceded by an HTTP fetch — e.g. `--file`,
    /// `--stdin`, or any call into `extract_with_options` directly.
@ -43,6 +65,7 @@ pub struct Metadata {
    pub http_status: Option<u16>,
 }

+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Content {
    pub markdown: String,
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@ -899,6 +899,10 @@ fn pdf_to_extraction_result(
            image: None,
            favicon: None,
            word_count,
+            // M12: PDF text is body content end-to-end (no nav/chrome
+            // wrapper extracted around it). Treat all words as article.
+            word_count_article: word_count,
+            word_count_chrome: 0,
            http_status: None,
        },
        content: webclaw_core::Content {
--- a/crates/webclaw-fetch/src/document.rs
+++ b/crates/webclaw-fetch/src/document.rs
@ -110,6 +110,10 @@ pub fn extract_document(
            image: None,
            favicon: None,
            word_count,
+            // M12: document extractors (DOCX, CSV, etc.) emit body content
+            // only; no chrome envelope. All words are article.
+            word_count_article: word_count,
+            word_count_chrome: 0,
            http_status: None,
        },
        content: webclaw_core::Content {
--- a/crates/webclaw-fetch/src/linkedin.rs
+++ b/crates/webclaw-fetch/src/linkedin.rs
@ -216,6 +216,11 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult>
            image: None,
            favicon: None,
            word_count,
+            // M12: LinkedIn post payload IS the article body; comments
+            // and reactions are appended below but treated as article
+            // for this extractor (no separate chrome envelope).
+            word_count_article: word_count,
+            word_count_chrome: 0,
            http_status: None,
        },
        content: Content {
--- a/crates/webclaw-fetch/src/reddit.rs
+++ b/crates/webclaw-fetch/src/reddit.rs
@ -92,6 +92,10 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResul
            image: None,
            favicon: None,
            word_count,
+            // M12: Reddit JSON path emits post body + comments; no chrome
+            // wrapper. All words are article-side content.
+            word_count_article: word_count,
+            word_count_chrome: 0,
            http_status: None,
        },
        content: Content {
--- a/crates/webclaw-mcp/src/server.rs
+++ b/crates/webclaw-mcp/src/server.rs
@ -518,6 +518,8 @@ impl WebclawMcp {
                        image: None,
                        favicon: None,
                        word_count: markdown.split_whitespace().count(),
+                        word_count_article: 0,
+                        word_count_chrome: 0,
                        http_status: None,
                    },
                    domain_data: None,
--- a/crates/webclaw-server/src/routes/diff.rs
+++ b/crates/webclaw-server/src/routes/diff.rs
@ -65,6 +65,8 @@ fn empty_metadata() -> Metadata {
        image: None,
        favicon: None,
        word_count: 0,
+        word_count_article: 0,
+        word_count_chrome: 0,
        http_status: None,
    }
 }