Tariffs hit consumers
+Tariffs are taxes on imports paid by consumers in the importing country, not the exporting one, economists explained today again.
+diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 3409692..c07e1fc 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -3032,6 +3032,8 @@ mod tests { image: None, favicon: None, word_count: markdown.split_whitespace().count(), + word_count_article: 0, + word_count_chrome: 0, http_status: None, }, content: Content { diff --git a/crates/webclaw-core/src/diff.rs b/crates/webclaw-core/src/diff.rs index e888d1c..0c64e43 100644 --- a/crates/webclaw-core/src/diff.rs +++ b/crates/webclaw-core/src/diff.rs @@ -148,6 +148,8 @@ mod tests { image: None, favicon: None, word_count, + word_count_article: 0, + word_count_chrome: 0, http_status: None, }, content: Content { diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs index 357e808..4353024 100644 --- a/crates/webclaw-core/src/lib.rs +++ b/crates/webclaw-core/src/lib.rs @@ -31,10 +31,10 @@ pub use jsonld::{ ArticleRef, JsonLdSchema, LiveUpdate, }; pub use llm::{ - classify_hub, classify_thin_body, collect_section_links, to_json_sections, to_json_summary, - to_json_toc, to_llm_sections, to_llm_summary, to_llm_text, to_llm_text_with_options, - to_llm_toc, truncate_json_with_wrapper, truncate_with_footer, HubClassification, - LlmTextOptions, ThinBodyClassification, + body_word_count, classify_hub, classify_thin_body, collect_section_links, to_json_sections, + to_json_summary, to_json_toc, to_llm_sections, to_llm_summary, to_llm_text, + to_llm_text_with_options, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer, + HubClassification, LlmTextOptions, ThinBodyClassification, }; pub use types::{ CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata, @@ -114,6 +114,11 @@ fn extract_with_options_inner( let doc = Html::parse_document(html); let mut meta = metadata::extract(&doc, url); meta.word_count = extractor::word_count(&yt_md); + // M12: YouTube fast path emits structured video metadata only + // (title, channel, view count, description). No chrome / nav / + // ads in the output — all words are "article" by definition. + meta.word_count_article = meta.word_count; + meta.word_count_chrome = 0; let plain_text = yt_md .lines() @@ -234,6 +239,16 @@ fn extract_with_options_inner( structured_data.extend(structured_data::extract_next_data(html)); structured_data.extend(structured_data::extract_sveltekit(html)); + // M12 (issue #7): split the total word_count into an article-body + // portion and a chrome portion. Computed once, here, AFTER all the + // word_count update paths above (data island, QuickJS, retry strategies) + // have settled. Sourced from JSON-LD articleBody/reviewBody when + // present, else the M2-style body word count on the extracted markdown. + let (article_wc, chrome_wc) = + compute_word_count_breakdown(&content.markdown, &structured_data, meta.word_count); + meta.word_count_article = article_wc; + meta.word_count_chrome = chrome_wc; + Ok(ExtractionResult { metadata: meta, content, @@ -242,6 +257,60 @@ fn extract_with_options_inner( }) } +/// M12 helper: split a page's total word_count into an article-body portion +/// and a chrome remainder. +/// +/// Precedence: +/// 1. JSON-LD `articleBody` (NewsArticle) or `reviewBody` (Review) via +/// [`crate::jsonld::primary_schema`]. When present, the article portion +/// is the word count of that string. +/// 2. Fallback: [`llm::body_word_count`] on the extracted markdown — M2's +/// "words outside markdown link patterns" estimator (same pipeline +/// `hub_detect::count_body_words` uses for hub classification). +/// +/// Invariant: returns `(article, chrome)` such that `article + chrome == +/// total_wc`. `article` is clamped to `total_wc` if the JSON-LD body has +/// more words than the extracted markdown (tokenization differences are +/// expected — the breakdown is a best-effort split, not a perfect +/// partition). When `total_wc == 0`, returns `(0, 0)` so the +/// `skip_serializing_if = "is_zero_usize"` guard on the Metadata fields +/// drops them from JSON output. +fn compute_word_count_breakdown( + markdown: &str, + structured_data: &[serde_json::Value], + total_wc: usize, +) -> (usize, usize) { + if total_wc == 0 { + return (0, 0); + } + + // 1. JSON-LD articleBody / reviewBody — ground truth when present. + let schemas = jsonld::classify_all(structured_data); + let jsonld_body: Option<&str> = jsonld::primary_schema(&schemas).and_then(|s| match s { + jsonld::JsonLdSchema::NewsArticle { body: Some(b), .. } => Some(b.as_str()), + jsonld::JsonLdSchema::Review { + review_body: Some(b), + .. + } => Some(b.as_str()), + _ => None, + }); + + let article_raw = if let Some(body_str) = jsonld_body { + extractor::word_count(body_str) + } else { + // 2. Fallback: M2-style body word count on extracted markdown. + llm::body_word_count(markdown) + }; + + // Clamp so article + chrome == total_wc (invariant for the JSON shape + // and the header arithmetic). Tokenization mismatches (JSON-LD body + // vs extractor::word_count) can make article_raw > total_wc; that's + // not a bug, it's a representation gap — clamp and move on. + let article = article_raw.min(total_wc); + let chrome = total_wc - article; + (article, chrome) +} + #[cfg(test)] mod tests { use super::*; @@ -660,4 +729,209 @@ mod tests { "wasm path and threaded path must produce identical content" ); } + + // ----------------------------------------------------------------- + // M12 (issue #7): word-count breakdown — article vs chrome split. + // Tests the POPULATION logic in `extract_with_options_inner` / + // `compute_word_count_breakdown`. Formatter behavior is tested in + // `crate::llm::metadata::m12_tests`. + // ----------------------------------------------------------------- + + /// M12 test 1: a page with a JSON-LD `NewsArticle.articleBody` gets + /// the article portion sourced from the articleBody string. Chrome + /// is the remainder. Total invariant: article + chrome == word_count. + #[test] + fn test_word_count_breakdown_with_jsonld_article_body() { + // 20-word articleBody. Wrap in a
+ nav chrome so the + // extracted markdown has both article words AND chrome words. + let html = r#" + +
+Tariffs are taxes on imports paid by consumers in the importing country, not the exporting one, economists explained today again.
+