feat(core): word-count breakdown in header — article vs chrome split

The current `Word count: N` header line is a single number that conflates
the article body with surrounding chrome (nav, ads, footer), so callers
can't tell from the header alone whether to drill in or move on.

New: Word count: N (article: M, chrome: K) in -f llm/text output.
For -f json: adds word_count_article and word_count_chrome
fields alongside the existing word_count.

M (article body) is sourced from JSON-LD articleBody when M4's
parser found one (NewsArticle or Review.reviewBody); otherwise
computed by llm::body_word_count (the M2-style heuristic — words
outside markdown link patterns, the same body::process_body output
hub_detect uses).

--mode summary / toc fall back to the simple `Word count: N` form (these
modes don't extract body content, so the breakdown would be meaningless);
suppression piggybacks on the existing include_status toggle in
build_metadata_header_with_opts. --mode sections builds its own header
and never emits the breakdown.

9 new tests in webclaw-core (4 in lib.rs::tests for the population
logic; 5 in llm/metadata.rs::m12_tests for the header formatter).
Workspace 701 -> 710.
This commit is contained in:
devnen 2026-05-23 23:56:14 +02:00
parent ade2a5143c
commit d5a3aa4bf9
17 changed files with 519 additions and 7 deletions

View file

@ -3032,6 +3032,8 @@ mod tests {
image: None,
favicon: None,
word_count: markdown.split_whitespace().count(),
word_count_article: 0,
word_count_chrome: 0,
http_status: None,
},
content: Content {

View file

@ -148,6 +148,8 @@ mod tests {
image: None,
favicon: None,
word_count,
word_count_article: 0,
word_count_chrome: 0,
http_status: None,
},
content: Content {

View file

@ -31,10 +31,10 @@ pub use jsonld::{
ArticleRef, JsonLdSchema, LiveUpdate,
};
pub use llm::{
classify_hub, classify_thin_body, collect_section_links, to_json_sections, to_json_summary,
to_json_toc, to_llm_sections, to_llm_summary, to_llm_text, to_llm_text_with_options,
to_llm_toc, truncate_json_with_wrapper, truncate_with_footer, HubClassification,
LlmTextOptions, ThinBodyClassification,
body_word_count, classify_hub, classify_thin_body, collect_section_links, to_json_sections,
to_json_summary, to_json_toc, to_llm_sections, to_llm_summary, to_llm_text,
to_llm_text_with_options, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer,
HubClassification, LlmTextOptions, ThinBodyClassification,
};
pub use types::{
CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
@ -114,6 +114,11 @@ fn extract_with_options_inner(
let doc = Html::parse_document(html);
let mut meta = metadata::extract(&doc, url);
meta.word_count = extractor::word_count(&yt_md);
// M12: YouTube fast path emits structured video metadata only
// (title, channel, view count, description). No chrome / nav /
// ads in the output — all words are "article" by definition.
meta.word_count_article = meta.word_count;
meta.word_count_chrome = 0;
let plain_text = yt_md
.lines()
@ -234,6 +239,16 @@ fn extract_with_options_inner(
structured_data.extend(structured_data::extract_next_data(html));
structured_data.extend(structured_data::extract_sveltekit(html));
// M12 (issue #7): split the total word_count into an article-body
// portion and a chrome portion. Computed once, here, AFTER all the
// word_count update paths above (data island, QuickJS, retry strategies)
// have settled. Sourced from JSON-LD articleBody/reviewBody when
// present, else the M2-style body word count on the extracted markdown.
let (article_wc, chrome_wc) =
compute_word_count_breakdown(&content.markdown, &structured_data, meta.word_count);
meta.word_count_article = article_wc;
meta.word_count_chrome = chrome_wc;
Ok(ExtractionResult {
metadata: meta,
content,
@ -242,6 +257,60 @@ fn extract_with_options_inner(
})
}
/// M12 helper: split a page's total word count into an article-body portion
/// and a chrome remainder.
///
/// # Arguments
/// * `markdown` — the extracted markdown; only consulted on the fallback path.
/// * `structured_data` — structured-data values handed to
///   `jsonld::classify_all`.
/// * `total_wc` — the page's total word count (the value being split).
///
/// # Precedence
/// 1. JSON-LD `articleBody` (NewsArticle) or `reviewBody` (Review) of the
///    schema selected by [`crate::jsonld::primary_schema`], provided that
///    string contains at least one word. The article portion is the word
///    count of that string.
/// 2. Fallback: [`llm::body_word_count`] on the extracted markdown. This is
///    also used when a JSON-LD body is present but empty / whitespace-only,
///    so a blank `articleBody` cannot report a page as "all chrome".
///
/// # Returns
/// `(article, chrome)` such that `article + chrome == total_wc`. `article`
/// is clamped to `total_wc` when the chosen estimator reports more words
/// than the total (tokenization differences are expected — the breakdown is
/// a best-effort split, not a perfect partition). When `total_wc == 0` the
/// result is `(0, 0)`.
fn compute_word_count_breakdown(
    markdown: &str,
    structured_data: &[serde_json::Value],
    total_wc: usize,
) -> (usize, usize) {
    // Nothing to split; skip classification entirely.
    if total_wc == 0 {
        return (0, 0);
    }

    // 1. JSON-LD articleBody / reviewBody — preferred when it actually
    //    carries words. A zero-word body is treated as "no body".
    let schemas = jsonld::classify_all(structured_data);
    let jsonld_wc = jsonld::primary_schema(&schemas)
        .and_then(|s| match s {
            jsonld::JsonLdSchema::NewsArticle { body: Some(b), .. } => Some(b.as_str()),
            jsonld::JsonLdSchema::Review {
                review_body: Some(b),
                ..
            } => Some(b.as_str()),
            _ => None,
        })
        .map(|body| extractor::word_count(body))
        .filter(|&wc| wc > 0);

    // 2. Fallback: body word count on the extracted markdown. Evaluated
    //    lazily so the markdown pipeline only runs when it is needed.
    let article_raw = jsonld_wc.unwrap_or_else(|| llm::body_word_count(markdown));

    // Clamp so that article + chrome == total_wc and the subtraction below
    // can never underflow.
    let article = article_raw.min(total_wc);
    (article, total_wc - article)
}
#[cfg(test)]
mod tests {
use super::*;
@ -660,4 +729,209 @@ mod tests {
"wasm path and threaded path must produce identical content"
);
}
// -----------------------------------------------------------------
// M12 (issue #7): word-count breakdown — article vs chrome split.
// Tests the POPULATION logic in `extract_with_options_inner` /
// `compute_word_count_breakdown`. Formatter behavior is tested in
// `crate::llm::metadata::m12_tests`.
// -----------------------------------------------------------------
/// M12 test 1: a page carrying a JSON-LD `NewsArticle.articleBody` must end
/// up with a non-zero article portion, and the chrome portion must be the
/// remainder so that article + chrome == word_count.
#[test]
fn test_word_count_breakdown_with_jsonld_article_body() {
    // The articleBody below is 20 words long. The same sentence is repeated
    // in a <p>, with a nav and a footer around it, so the extracted markdown
    // can contain both article words and chrome words.
    let html = r#"
<html lang="en">
<head>
<title>Tariffs hit consumers</title>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "NewsArticle",
"headline": "Tariffs hit consumers",
"articleBody": "Tariffs are taxes on imports paid by consumers in the importing country, not the exporting one, economists explained today again."
}
</script>
</head>
<body>
<nav><a href="/">Home</a> | <a href="/markets">Markets</a> | <a href="/world">World</a></nav>
<article>
<h1>Tariffs hit consumers</h1>
<p>Tariffs are taxes on imports paid by consumers in the importing country, not the exporting one, economists explained today again.</p>
</article>
<footer>Subscribe to our newsletter for daily updates and breaking-news alerts</footer>
</body>
</html>"#;

    let extracted = extract(html, Some("https://news.example.com/tariffs")).unwrap();
    let (total, article, chrome) = (
        extracted.metadata.word_count,
        extracted.metadata.word_count_article,
        extracted.metadata.word_count_chrome,
    );

    assert!(total > 0, "extraction must produce a word count");

    // How many words the extractor keeps is up to the scorer, so the check
    // is structural (the split adds up) rather than a fixed number.
    assert_eq!(
        article + chrome,
        total,
        "invariant: article + chrome == total. \
         got article={}, chrome={}, total={}",
        article,
        chrome,
        total
    );
    assert!(
        article > 0,
        "JSON-LD articleBody must populate article portion (>0); \
         got article={}",
        article
    );
}
/// M12 test 2: with no JSON-LD body on the page, the article portion comes
/// from the fallback heuristic (`llm::body_word_count` on the extracted
/// markdown). It must still be non-zero for a page with real prose, and the
/// article + chrome == word_count invariant must hold.
#[test]
fn test_word_count_breakdown_without_jsonld_falls_back_to_heuristic() {
    use crate::jsonld::{classify_all, JsonLdSchema};

    // Deliberately no <script type="application/ld+json"> block, so the
    // breakdown has to come from the fallback path.
    let html = r#"
<html lang="en">
<head><title>Plain article</title></head>
<body>
<article>
<h1>Plain article</h1>
<p>The economy expanded last quarter at an annualized rate of three percent
driven primarily by consumer spending and a rebound in fixed investment,
government statisticians reported on Thursday morning at the usual hour.</p>
<p>Analysts had broadly expected the print, but the composition of the gain
surprised some who had bet that residential housing would drag the headline
number into the low twos rather than the comfortable threes.</p>
</article>
</body>
</html>"#;

    let extracted = extract(html, Some("https://news.example.com/gdp")).unwrap();
    let (total, article, chrome) = (
        extracted.metadata.word_count,
        extracted.metadata.word_count_article,
        extracted.metadata.word_count_chrome,
    );

    assert!(total > 0, "extraction must produce a word count");
    assert_eq!(
        article + chrome,
        total,
        "invariant: article + chrome == total. \
         got article={}, chrome={}, total={}",
        article,
        chrome,
        total
    );
    assert!(
        article > 0,
        "fallback body heuristic must populate article portion (>0); \
         got article={}",
        article
    );

    // Sanity check on the fixture itself: either nothing structured was
    // extracted, or none of the classified schemas carries an article /
    // review body.
    let fixture_is_body_free = extracted.structured_data.is_empty()
        || !classify_all(&extracted.structured_data).iter().any(|schema| {
            matches!(
                schema,
                JsonLdSchema::NewsArticle { body: Some(_), .. }
                    | JsonLdSchema::Review { review_body: Some(_), .. }
            )
        });
    assert!(
        fixture_is_body_free,
        "fixture should have no JSON-LD article/review body — \
         this test exercises the fallback path"
    );
}
/// M12 test 3: the serialized JSON metadata carries all three counters —
/// the pre-existing `word_count` plus the new `word_count_article` and
/// `word_count_chrome` — and they satisfy article + chrome == total.
///
/// The two new fields are declared with `#[serde(default)]` only, so they
/// are expected in the output even when their value is zero. Each field is
/// therefore looked up strictly: a missing or non-numeric field fails the
/// test with a message naming that field, instead of being silently read
/// as 0.
#[test]
fn test_word_count_breakdown_json_format_has_three_fields() {
    let html = r#"
<html lang="en">
<head><title>JSON shape test</title></head>
<body>
<article>
<h1>JSON shape</h1>
<p>The body of this article has more than ten words so the
fallback heuristic populates a positive article portion.
The remaining chrome words come from any nav and footer.</p>
</article>
</body>
</html>"#;
    let result = extract(html, Some("https://example.com/json")).unwrap();

    // Round-trip through the textual form so the assertions see exactly
    // what a JSON consumer would parse.
    let json: serde_json::Value =
        serde_json::from_str(&serde_json::to_string(&result).unwrap()).unwrap();
    let meta = &json["metadata"];

    // Strict accessor: the field must exist and be an unsigned integer.
    let counter = |key: &str| -> u64 {
        meta.get(key)
            .and_then(|value| value.as_u64())
            .unwrap_or_else(|| {
                panic!("json metadata must carry an integer `{key}` field; got: {meta}")
            })
    };

    let total = counter("word_count");
    let article = counter("word_count_article");
    let chrome = counter("word_count_chrome");

    assert_eq!(
        article + chrome,
        total,
        "invariant: article + chrome == word_count in JSON output. \
         got article={article}, chrome={chrome}, total={total}; meta={meta}"
    );
    assert!(
        article > 0 || total == 0,
        "expect at least some article words when total > 0; \
         got article={article}, total={total}"
    );
}
/// M12 test 4: the output of `to_llm_summary`, `to_llm_toc` and
/// `to_llm_sections` must never contain the article/chrome parenthetical.
/// This pins the contract that the link-list / outline modes stay free of
/// the breakdown, whatever header conventions they otherwise follow.
#[test]
fn test_word_count_omitted_or_simple_in_summary_mode() {
    const PAGE_URL: &str = "https://example.com/hub";

    let html = r#"
<html lang="en">
<head><title>Hub-style page</title></head>
<body>
<nav>
<a href="/a">First section</a>
<a href="/b">Second section</a>
<a href="/c">Third section</a>
</nav>
<article>
<p>Short body for hub-style page; the summary mode emits a link list, not a metadata header.</p>
</article>
</body>
</html>"#;
    let result = extract(html, Some(PAGE_URL)).unwrap();

    // Render each mode once, labelled for the failure message.
    let rendered = [
        ("summary", crate::to_llm_summary(&result, Some(PAGE_URL))),
        ("toc", crate::to_llm_toc(&result, Some(PAGE_URL))),
        ("sections", crate::to_llm_sections(&result, Some(PAGE_URL))),
    ];

    for (name, output) in &rendered {
        // A plain "Word count:" line is acceptable in these modes; only the
        // M12 parenthetical is forbidden.
        assert!(
            !output.contains("(article:"),
            "{name} mode must NOT contain the article/chrome breakdown; \
             got: {output}"
        );
    }
}
}

View file

@ -136,6 +136,8 @@ mod tests {
image: None,
favicon: None,
word_count: 0,
word_count_article: 0,
word_count_chrome: 0,
http_status: None,
},
content: Content {

View file

@ -68,6 +68,158 @@ pub(crate) fn build_metadata_header_with_opts(
out.push_str(&format!("> Language: {l}\n"));
}
if meta.word_count > 0 {
out.push_str(&format!("> Word count: {}\n", meta.word_count));
// M12 (issue #7): split the total into an article-body portion and
// a chrome remainder so LLM callers can tell at a glance whether
// there's real content under the chrome. When the breakdown is
// available (article + chrome == total, set in
// `extract_with_options_inner::compute_word_count_breakdown`), emit
// the parenthetical; otherwise fall back to the legacy single-N
// form (e.g. local-file / --stdin / direct
// `extract_with_options` calls that leave the breakdown fields at
// their `Default` zero — same shape as the http_status fallback).
//
// `--mode summary` / `--mode toc` (`include_status=false`)
// intentionally fall back to the simple `Word count: N` form: the
// link-list / outline modes don't surface article content, so the
// breakdown's "did chrome eat the body?" question is irrelevant
// there. This piggybacks on the existing `include_status` toggle
// — same modes, same suppression intent (iter-7 next-prompt
// explicitly authorized either omit-or-simple). `--mode sections`
// builds its own header and doesn't reach this code at all.
let n = meta.word_count;
let m = meta.word_count_article;
let k = meta.word_count_chrome;
if include_status && m + k == n && (m > 0 || k > 0) {
out.push_str(&format!(
"> Word count: {n} (article: {m}, chrome: {k})\n"
));
} else {
out.push_str(&format!("> Word count: {n}\n"));
}
}
}
// ---------------------------------------------------------------------------
// M12 tests for the header word-count breakdown emission. The breakdown POPULATION
// logic (jsonld articleBody → fallback heuristic) is tested in lib.rs::tests.
// These tests pin the FORMATTER behavior given pre-populated Metadata fields.
// ---------------------------------------------------------------------------
#[cfg(test)]
mod m12_tests {
    use super::*;
    use crate::types::{Content, ExtractionResult, Metadata};

    /// Build a minimal `ExtractionResult` whose only interesting fields are
    /// the three word counters; the content is left empty.
    fn make_result_with_wc(word_count: usize, article: usize, chrome: usize) -> ExtractionResult {
        let metadata = Metadata {
            title: Some("Test Page".into()),
            description: None,
            author: None,
            published_date: None,
            language: None,
            url: Some("https://example.com/".into()),
            site_name: None,
            image: None,
            favicon: None,
            word_count,
            word_count_article: article,
            word_count_chrome: chrome,
            http_status: None,
        };
        let content = Content {
            markdown: String::new(),
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        };
        ExtractionResult {
            metadata,
            content,
            domain_data: None,
            structured_data: vec![],
        }
    }

    /// Run `build_metadata_header` on a fresh buffer for the given counters
    /// and hand back whatever it wrote.
    fn render_header(word_count: usize, article: usize, chrome: usize) -> String {
        let result = make_result_with_wc(word_count, article, chrome);
        let mut buf = String::new();
        build_metadata_header(&mut buf, &result, None);
        buf
    }

    /// Formatter test 1: article + chrome == total with a non-zero part
    /// produces the parenthetical form, and only that form.
    #[test]
    fn header_emits_breakdown_when_article_plus_chrome_equals_total() {
        let out = render_header(1000, 600, 400);
        assert!(
            out.contains("> Word count: 1000 (article: 600, chrome: 400)"),
            "expected breakdown form, got: {out}"
        );
        assert!(
            !out.contains("> Word count: 1000\n"),
            "must not contain legacy form when breakdown is present, got: {out}"
        );
    }

    /// Formatter test 2: both breakdown fields at zero (the unpopulated
    /// default) yields the legacy single-number line with no parenthetical.
    #[test]
    fn header_falls_back_to_legacy_form_when_breakdown_unpopulated() {
        let out = render_header(1000, 0, 0);
        assert!(
            out.contains("> Word count: 1000\n"),
            "expected legacy single-N form, got: {out}"
        );
        assert!(
            !out.contains("(article:"),
            "must not contain parenthetical when fields are zero, got: {out}"
        );
    }

    /// Formatter test 3: an all-article page (chrome == 0, article == total)
    /// still gets the breakdown form, with `chrome: 0` spelled out.
    #[test]
    fn header_emits_breakdown_with_chrome_zero_when_article_equals_total() {
        let out = render_header(500, 500, 0);
        assert!(
            out.contains("> Word count: 500 (article: 500, chrome: 0)"),
            "expected breakdown with chrome=0, got: {out}"
        );
    }

    /// Formatter test 4: a zero total suppresses the word-count line
    /// altogether.
    #[test]
    fn header_omits_word_count_line_entirely_when_total_zero() {
        let out = render_header(0, 0, 0);
        assert!(
            !out.contains("Word count"),
            "expected no Word count line when total is 0, got: {out}"
        );
    }

    /// Formatter test 5: counters that do not add up to the total are never
    /// surfaced; the formatter emits the legacy single-number line instead.
    #[test]
    fn header_falls_back_when_article_plus_chrome_mismatches_total() {
        // 600 + 300 != 1000
        let out = render_header(1000, 600, 300);
        assert!(
            out.contains("> Word count: 1000\n"),
            "expected legacy form when breakdown invariant violated, got: {out}"
        );
        assert!(
            !out.contains("(article:"),
            "must not surface inconsistent breakdown, got: {out}"
        );
    }
}

View file

@ -17,6 +17,26 @@ mod thin_body;
pub use hub_detect::{classify as classify_hub, HubClassification};
pub use sections::{collect_section_links, to_json_sections, to_llm_sections};
pub use thin_body::{classify as classify_thin_body, ThinBodyClassification};
/// Count the whitespace-separated words left in `markdown` after it has
/// been run through `body::process_body`.
///
/// Used by M12 (`word_count_article` / `word_count_chrome` breakdown) as the
/// fallback estimator when no JSON-LD `articleBody` / `reviewBody` is
/// available. Exposed publicly so `lib.rs::extract_with_options_inner` can
/// populate `Metadata.word_count_article` without reaching across the
/// `llm::body` `pub(crate)` boundary.
///
/// Returns the number of tokens in the processed body text; `0` when that
/// text is empty or whitespace-only.
pub fn body_word_count(markdown: &str) -> usize {
    // `split_whitespace` never yields empty slices, so no extra emptiness
    // filter is needed before counting.
    body::process_body(markdown).text.split_whitespace().count()
}
pub use output_size::{
to_json_summary, to_json_toc, to_llm_summary, to_llm_toc, truncate_json_with_wrapper,
truncate_with_footer,
@ -348,6 +368,12 @@ mod tests {
image: None,
favicon: None,
word_count: 42,
// M12: default fixture leaves breakdown unset (zero); the
// header formatter falls back to the legacy `Word count: N`
// form when article+chrome != total. Tests that need the
// breakdown set these explicitly.
word_count_article: 0,
word_count_chrome: 0,
http_status: None,
},
content: Content {
@ -602,6 +628,8 @@ mod tests {
image: None,
favicon: None,
word_count: 0,
word_count_article: 0,
word_count_chrome: 0,
http_status: None,
},
content: Content {

View file

@ -359,6 +359,8 @@ mod tests {
image: None,
favicon: None,
word_count: 0,
word_count_article: 0,
word_count_chrome: 0,
http_status: None,
},
content: Content {

View file

@ -316,6 +316,8 @@ mod tests {
image: None,
favicon: None,
word_count: 0,
word_count_article: 0,
word_count_chrome: 0,
http_status: None,
},
content: Content {

View file

@ -152,6 +152,8 @@ mod tests {
image: None,
favicon: None,
word_count,
word_count_article: 0,
word_count_chrome: 0,
http_status: Some(200),
},
content: Content {

View file

@ -51,8 +51,10 @@ pub fn extract(doc: &Html, url: Option<&str>) -> Metadata {
site_name,
image,
favicon,
word_count: 0, // filled later by the extractor
http_status: None, // filled by webclaw-fetch when reachable; None for local-file / --stdin
word_count: 0, // filled later by the extractor
word_count_article: 0, // filled by M12 breakdown in extract_with_options_inner
word_count_chrome: 0, // filled by M12 breakdown in extract_with_options_inner
http_status: None, // filled by webclaw-fetch when reachable; None for local-file / --stdin
}
}

View file

@ -27,6 +27,28 @@ pub struct Metadata {
pub image: Option<String>,
pub favicon: Option<String>,
pub word_count: usize,
/// Article-body portion of `word_count`, computed in
/// `extract_with_options_inner`. Sourced from JSON-LD `articleBody`
/// (or `reviewBody`) via the M4 classifier when present; falls back
/// to the M2-style body-text heuristic (`llm::body_word_count` on the
/// extracted markdown) when JSON-LD body is absent. Always satisfies
/// `word_count_article + word_count_chrome == word_count` (M is
/// clamped to N if the JSON-LD body has more words than the extracted
/// markdown). Serialized unconditionally (parallel to `word_count`)
/// so callers can rely on both M12 fields being present alongside
/// the existing total; zero on the no-breakdown path (local-file /
/// --stdin / direct `extract_with_options` callers). `default = 0`
/// for backward-compat on incoming JSON that predates M12. M12 /
/// issue #7.
#[serde(default)]
pub word_count_article: usize,
/// Chrome portion of `word_count` (= `word_count - word_count_article`).
/// "Chrome" means everything not in the article body: navigation,
/// related-link sidebars, footers, ad slots, ticker rows, link cards,
/// etc. Serialized unconditionally (parallel to `word_count`); zero
/// on the no-breakdown path. M12 / issue #7.
#[serde(default)]
pub word_count_chrome: usize,
/// HTTP status code from the final response (after redirects). `None`
/// when extraction was not preceded by an HTTP fetch — e.g. `--file`,
/// `--stdin`, or any call into `extract_with_options` directly.
@ -43,6 +65,7 @@ pub struct Metadata {
pub http_status: Option<u16>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Content {
pub markdown: String,

View file

@ -899,6 +899,10 @@ fn pdf_to_extraction_result(
image: None,
favicon: None,
word_count,
// M12: PDF text is body content end-to-end (no nav/chrome
// wrapper extracted around it). Treat all words as article.
word_count_article: word_count,
word_count_chrome: 0,
http_status: None,
},
content: webclaw_core::Content {

View file

@ -110,6 +110,10 @@ pub fn extract_document(
image: None,
favicon: None,
word_count,
// M12: document extractors (DOCX, CSV, etc.) emit body content
// only; no chrome envelope. All words are article.
word_count_article: word_count,
word_count_chrome: 0,
http_status: None,
},
content: webclaw_core::Content {

View file

@ -216,6 +216,11 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult>
image: None,
favicon: None,
word_count,
// M12: LinkedIn post payload IS the article body; comments
// and reactions are appended below but treated as article
// for this extractor (no separate chrome envelope).
word_count_article: word_count,
word_count_chrome: 0,
http_status: None,
},
content: Content {

View file

@ -92,6 +92,10 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResul
image: None,
favicon: None,
word_count,
// M12: Reddit JSON path emits post body + comments; no chrome
// wrapper. All words are article-side content.
word_count_article: word_count,
word_count_chrome: 0,
http_status: None,
},
content: Content {

View file

@ -518,6 +518,8 @@ impl WebclawMcp {
image: None,
favicon: None,
word_count: markdown.split_whitespace().count(),
word_count_article: 0,
word_count_chrome: 0,
http_status: None,
},
domain_data: None,

View file

@ -65,6 +65,8 @@ fn empty_metadata() -> Metadata {
image: None,
favicon: None,
word_count: 0,
word_count_article: 0,
word_count_chrome: 0,
http_status: None,
}
}