mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-09 22:35:12 +02:00
feat(core): word-count breakdown in header — article vs chrome split
Current Word count: N is a single number conflating article body and surrounding chrome (nav, ads, footer). Callers couldn't tell from the header alone whether to drill or move on. New: Word count: N (article: M, chrome: K) in -f llm/text output. For -f json: adds word_count_article and word_count_chrome fields alongside the existing word_count. M (article body) is sourced from JSON-LD articleBody when M4's parser found one (NewsArticle or Review.reviewBody); otherwise computed by llm::body_word_count (the M2-style heuristic — words outside markdown link patterns, the same body::process_body output hub_detect uses). --mode summary / toc / sections fall back to the simple Word count: N form (the modes don't extract body content; the breakdown would be meaningless). Suppression piggybacks on the existing include_status toggle in build_metadata_header_with_opts. 9 new tests in webclaw-core (4 in lib.rs::tests for the population logic; 5 in llm/metadata.rs::m12_tests for the header formatter). Workspace 701 -> 710.
This commit is contained in:
parent
ade2a5143c
commit
d5a3aa4bf9
17 changed files with 519 additions and 7 deletions
|
|
@ -3032,6 +3032,8 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: markdown.split_whitespace().count(),
|
||||
word_count_article: 0,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
|
|
|
|||
|
|
@ -148,6 +148,8 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
word_count_article: 0,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
|
|
|
|||
|
|
@ -31,10 +31,10 @@ pub use jsonld::{
|
|||
ArticleRef, JsonLdSchema, LiveUpdate,
|
||||
};
|
||||
pub use llm::{
|
||||
classify_hub, classify_thin_body, collect_section_links, to_json_sections, to_json_summary,
|
||||
to_json_toc, to_llm_sections, to_llm_summary, to_llm_text, to_llm_text_with_options,
|
||||
to_llm_toc, truncate_json_with_wrapper, truncate_with_footer, HubClassification,
|
||||
LlmTextOptions, ThinBodyClassification,
|
||||
body_word_count, classify_hub, classify_thin_body, collect_section_links, to_json_sections,
|
||||
to_json_summary, to_json_toc, to_llm_sections, to_llm_summary, to_llm_text,
|
||||
to_llm_text_with_options, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer,
|
||||
HubClassification, LlmTextOptions, ThinBodyClassification,
|
||||
};
|
||||
pub use types::{
|
||||
CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
|
||||
|
|
@ -114,6 +114,11 @@ fn extract_with_options_inner(
|
|||
let doc = Html::parse_document(html);
|
||||
let mut meta = metadata::extract(&doc, url);
|
||||
meta.word_count = extractor::word_count(&yt_md);
|
||||
// M12: YouTube fast path emits structured video metadata only
|
||||
// (title, channel, view count, description). No chrome / nav /
|
||||
// ads in the output — all words are "article" by definition.
|
||||
meta.word_count_article = meta.word_count;
|
||||
meta.word_count_chrome = 0;
|
||||
|
||||
let plain_text = yt_md
|
||||
.lines()
|
||||
|
|
@ -234,6 +239,16 @@ fn extract_with_options_inner(
|
|||
structured_data.extend(structured_data::extract_next_data(html));
|
||||
structured_data.extend(structured_data::extract_sveltekit(html));
|
||||
|
||||
// M12 (issue #7): split the total word_count into an article-body
|
||||
// portion and a chrome portion. Computed once, here, AFTER all the
|
||||
// word_count update paths above (data island, QuickJS, retry strategies)
|
||||
// have settled. Sourced from JSON-LD articleBody/reviewBody when
|
||||
// present, else the M2-style body word count on the extracted markdown.
|
||||
let (article_wc, chrome_wc) =
|
||||
compute_word_count_breakdown(&content.markdown, &structured_data, meta.word_count);
|
||||
meta.word_count_article = article_wc;
|
||||
meta.word_count_chrome = chrome_wc;
|
||||
|
||||
Ok(ExtractionResult {
|
||||
metadata: meta,
|
||||
content,
|
||||
|
|
@ -242,6 +257,60 @@ fn extract_with_options_inner(
|
|||
})
|
||||
}
|
||||
|
||||
/// M12 helper: split a page's total word_count into an article-body portion
|
||||
/// and a chrome remainder.
|
||||
///
|
||||
/// Precedence:
|
||||
/// 1. JSON-LD `articleBody` (NewsArticle) or `reviewBody` (Review) via
|
||||
/// [`crate::jsonld::primary_schema`]. When present, the article portion
|
||||
/// is the word count of that string.
|
||||
/// 2. Fallback: [`llm::body_word_count`] on the extracted markdown — M2's
|
||||
/// "words outside markdown link patterns" estimator (same pipeline
|
||||
/// `hub_detect::count_body_words` uses for hub classification).
|
||||
///
|
||||
/// Invariant: returns `(article, chrome)` such that `article + chrome ==
|
||||
/// total_wc`. `article` is clamped to `total_wc` if the JSON-LD body has
|
||||
/// more words than the extracted markdown (tokenization differences are
|
||||
/// expected — the breakdown is a best-effort split, not a perfect
|
||||
/// partition). When `total_wc == 0`, returns `(0, 0)` so the
|
||||
/// `skip_serializing_if = "is_zero_usize"` guard on the Metadata fields
|
||||
/// drops them from JSON output.
|
||||
fn compute_word_count_breakdown(
|
||||
markdown: &str,
|
||||
structured_data: &[serde_json::Value],
|
||||
total_wc: usize,
|
||||
) -> (usize, usize) {
|
||||
if total_wc == 0 {
|
||||
return (0, 0);
|
||||
}
|
||||
|
||||
// 1. JSON-LD articleBody / reviewBody — ground truth when present.
|
||||
let schemas = jsonld::classify_all(structured_data);
|
||||
let jsonld_body: Option<&str> = jsonld::primary_schema(&schemas).and_then(|s| match s {
|
||||
jsonld::JsonLdSchema::NewsArticle { body: Some(b), .. } => Some(b.as_str()),
|
||||
jsonld::JsonLdSchema::Review {
|
||||
review_body: Some(b),
|
||||
..
|
||||
} => Some(b.as_str()),
|
||||
_ => None,
|
||||
});
|
||||
|
||||
let article_raw = if let Some(body_str) = jsonld_body {
|
||||
extractor::word_count(body_str)
|
||||
} else {
|
||||
// 2. Fallback: M2-style body word count on extracted markdown.
|
||||
llm::body_word_count(markdown)
|
||||
};
|
||||
|
||||
// Clamp so article + chrome == total_wc (invariant for the JSON shape
|
||||
// and the header arithmetic). Tokenization mismatches (JSON-LD body
|
||||
// vs extractor::word_count) can make article_raw > total_wc; that's
|
||||
// not a bug, it's a representation gap — clamp and move on.
|
||||
let article = article_raw.min(total_wc);
|
||||
let chrome = total_wc - article;
|
||||
(article, chrome)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
|
@ -660,4 +729,209 @@ mod tests {
|
|||
"wasm path and threaded path must produce identical content"
|
||||
);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// M12 (issue #7): word-count breakdown — article vs chrome split.
|
||||
// Tests the POPULATION logic in `extract_with_options_inner` /
|
||||
// `compute_word_count_breakdown`. Formatter behavior is tested in
|
||||
// `crate::llm::metadata::m12_tests`.
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
/// M12 test 1: with a JSON-LD `NewsArticle.articleBody` on the page, the
/// article portion must be populated (non-zero) and the chrome portion is
/// whatever remains, so that article + chrome == word_count.
#[test]
fn test_word_count_breakdown_with_jsonld_article_body() {
    // The fixture pairs a 20-word articleBody with a <p> copy of the same
    // text plus nav and footer chrome, so the page carries both article
    // words and chrome words.
    let html = r#"
<html lang="en">
<head>
<title>Tariffs hit consumers</title>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "NewsArticle",
"headline": "Tariffs hit consumers",
"articleBody": "Tariffs are taxes on imports paid by consumers in the importing country, not the exporting one, economists explained today again."
}
</script>
</head>
<body>
<nav><a href="/">Home</a> | <a href="/markets">Markets</a> | <a href="/world">World</a></nav>
<article>
<h1>Tariffs hit consumers</h1>
<p>Tariffs are taxes on imports paid by consumers in the importing country, not the exporting one, economists explained today again.</p>
</article>
<footer>Subscribe to our newsletter for daily updates and breaking-news alerts</footer>
</body>
</html>"#;

    let extraction = extract(html, Some("https://news.example.com/tariffs")).unwrap();
    let total = extraction.metadata.word_count;
    let article = extraction.metadata.word_count_article;
    let chrome = extraction.metadata.word_count_chrome;

    assert!(total > 0, "extraction must produce a word count");

    // How many words the scorer captures in the markdown is not pinned
    // here; the check is structural (the split adds up), not numeric.
    assert_eq!(
        article + chrome,
        total,
        "invariant: article + chrome == total. \
         got article={article}, chrome={chrome}, total={total}"
    );
    assert!(
        article > 0,
        "JSON-LD articleBody must populate article portion (>0); \
         got article={article}"
    );
}
|
||||
|
||||
/// M12 test 2: without any JSON-LD body the article portion comes from the
/// fallback heuristic (`llm::body_word_count` on the extracted markdown).
/// It must still be non-zero for a page with real body text, and the
/// article + chrome == word_count invariant must hold.
#[test]
fn test_word_count_breakdown_without_jsonld_falls_back_to_heuristic() {
    // The fixture deliberately has no <script type="application/ld+json">
    // block, which forces the fallback path.
    let html = r#"
<html lang="en">
<head><title>Plain article</title></head>
<body>
<article>
<h1>Plain article</h1>
<p>The economy expanded last quarter at an annualized rate of three percent
driven primarily by consumer spending and a rebound in fixed investment,
government statisticians reported on Thursday morning at the usual hour.</p>
<p>Analysts had broadly expected the print, but the composition of the gain
surprised some who had bet that residential housing would drag the headline
number into the low twos rather than the comfortable threes.</p>
</article>
</body>
</html>"#;

    let extraction = extract(html, Some("https://news.example.com/gdp")).unwrap();
    let total = extraction.metadata.word_count;
    let article = extraction.metadata.word_count_article;
    let chrome = extraction.metadata.word_count_chrome;

    assert!(total > 0, "extraction must produce a word count");
    assert_eq!(
        article + chrome,
        total,
        "invariant: article + chrome == total. \
         got article={article}, chrome={chrome}, total={total}"
    );
    assert!(
        article > 0,
        "fallback body heuristic must populate article portion (>0); \
         got article={article}"
    );

    // Sanity: the fixture must not expose a JSON-LD article/review body,
    // otherwise this test would silently exercise the JSON-LD path instead.
    // The classifier is only consulted when structured data is non-empty.
    let exposes_jsonld_body = !extraction.structured_data.is_empty()
        && crate::jsonld::classify_all(&extraction.structured_data)
            .iter()
            .any(|schema| {
                matches!(
                    schema,
                    crate::jsonld::JsonLdSchema::NewsArticle { body: Some(_), .. }
                        | crate::jsonld::JsonLdSchema::Review {
                            review_body: Some(_),
                            ..
                        }
                )
            });
    assert!(
        !exposes_jsonld_body,
        "fixture should have no JSON-LD article/review body — \
         this test exercises the fallback path"
    );
}
|
||||
|
||||
/// M12 test 3: the serialized JSON metadata carries `word_count_article`
/// and `word_count_chrome` next to the pre-existing `word_count`, and the
/// three numbers satisfy article + chrome == total.
///
/// The two breakdown fields on `Metadata` are annotated with
/// `#[serde(default)]` only, so they are expected in the output
/// unconditionally; a missing field is a test failure rather than being
/// silently read as zero.
#[test]
fn test_word_count_breakdown_json_format_has_three_fields() {
    let html = r#"
<html lang="en">
<head><title>JSON shape test</title></head>
<body>
<article>
<h1>JSON shape</h1>
<p>The body of this article has more than ten words so the
fallback heuristic populates a positive article portion.
The remaining chrome words come from any nav and footer.</p>
</article>
</body>
</html>"#;

    let result = extract(html, Some("https://example.com/json")).unwrap();
    // Round-trip through the JSON text so the assertions see exactly what a
    // `-f json` consumer would parse.
    let json: serde_json::Value =
        serde_json::from_str(&serde_json::to_string(&result).unwrap()).unwrap();
    let meta = &json["metadata"];

    // Existing field preserved.
    assert!(
        meta.get("word_count").is_some(),
        "json must keep word_count field; got: {meta}"
    );

    // All three counters must be present as non-negative integers.
    let counter = |name: &str| -> u64 {
        meta.get(name)
            .and_then(serde_json::Value::as_u64)
            .unwrap_or_else(|| {
                panic!("json metadata must carry an integer `{name}` field; got: {meta}")
            })
    };
    let total = counter("word_count");
    let article = counter("word_count_article");
    let chrome = counter("word_count_chrome");

    assert_eq!(
        article + chrome,
        total,
        "invariant: article + chrome == word_count in JSON output. \
         got article={article}, chrome={chrome}, total={total}; meta={meta}"
    );
    assert!(
        article > 0 || total == 0,
        "expect at least some article words when total > 0; \
         got article={article}, total={total}"
    );
}
|
||||
|
||||
/// M12 test 4: the `summary`, `toc` and `sections` outputs never carry the
/// M12 `(article: M, chrome: K)` parenthetical. Whether a given mode prints
/// a plain `Word count: N` line at all is left to that mode's own header
/// conventions; this test only pins the absence of the breakdown, keeping
/// the link-list / outline outputs free of it.
#[test]
fn test_word_count_omitted_or_simple_in_summary_mode() {
    let html = r#"
<html lang="en">
<head><title>Hub-style page</title></head>
<body>
<nav>
<a href="/a">First section</a>
<a href="/b">Second section</a>
<a href="/c">Third section</a>
</nav>
<article>
<p>Short body for hub-style page; the summary mode emits a link list, not a metadata header.</p>
</article>
</body>
</html>"#;

    let result = extract(html, Some("https://example.com/hub")).unwrap();
    let summary = crate::to_llm_summary(&result, Some("https://example.com/hub"));
    let toc = crate::to_llm_toc(&result, Some("https://example.com/hub"));
    let sections = crate::to_llm_sections(&result, Some("https://example.com/hub"));

    for (name, output) in [("summary", &summary), ("toc", &toc), ("sections", &sections)] {
        // A "Word count:" line may or may not exist in these modes, but it
        // must not carry the M12 parenthetical when it does.
        assert!(
            !output.contains("(article:"),
            "{name} mode must NOT contain the article/chrome breakdown; \
             got: {output}"
        );
    }
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -136,6 +136,8 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: 0,
|
||||
word_count_article: 0,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
|
|
|
|||
|
|
@ -68,6 +68,158 @@ pub(crate) fn build_metadata_header_with_opts(
|
|||
out.push_str(&format!("> Language: {l}\n"));
|
||||
}
|
||||
if meta.word_count > 0 {
|
||||
out.push_str(&format!("> Word count: {}\n", meta.word_count));
|
||||
// M12 (issue #7): split the total into an article-body portion and
|
||||
// a chrome remainder so LLM callers can tell at a glance whether
|
||||
// there's real content under the chrome. When the breakdown is
|
||||
// available (article + chrome == total, set in
|
||||
// `extract_with_options_inner::compute_word_count_breakdown`), emit
|
||||
// the parenthetical; otherwise fall back to the legacy single-N
|
||||
// form (e.g. local-file / --stdin / direct
|
||||
// `extract_with_options` calls that leave the breakdown fields at
|
||||
// their `Default` zero — same shape as the http_status fallback).
|
||||
//
|
||||
// `--mode summary` / `--mode toc` (`include_status=false`)
|
||||
// intentionally fall back to the simple `Word count: N` form: the
|
||||
// link-list / outline modes don't surface article content, so the
|
||||
// breakdown's "did chrome eat the body?" question is irrelevant
|
||||
// there. This piggybacks on the existing `include_status` toggle
|
||||
// — same modes, same suppression intent (iter-7 next-prompt
|
||||
// explicitly authorized either omit-or-simple). `--mode sections`
|
||||
// builds its own header and doesn't reach this code at all.
|
||||
let n = meta.word_count;
|
||||
let m = meta.word_count_article;
|
||||
let k = meta.word_count_chrome;
|
||||
if include_status && m + k == n && (m > 0 || k > 0) {
|
||||
out.push_str(&format!(
|
||||
"> Word count: {n} (article: {m}, chrome: {k})\n"
|
||||
));
|
||||
} else {
|
||||
out.push_str(&format!("> Word count: {n}\n"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// M12 tests for the header word-count breakdown emission. The breakdown POPULATION
|
||||
// logic (jsonld articleBody → fallback heuristic) is tested in lib.rs::tests.
|
||||
// These tests pin the FORMATTER behavior given pre-populated Metadata fields.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod m12_tests {
    use super::*;
    use crate::types::{Content, ExtractionResult, Metadata};

    /// Builds a minimal `ExtractionResult` whose only interesting fields are
    /// the three word-count values; everything else is empty or `None`
    /// apart from a fixed title and URL.
    fn make_result_with_wc(word_count: usize, article: usize, chrome: usize) -> ExtractionResult {
        let metadata = Metadata {
            title: Some("Test Page".into()),
            description: None,
            author: None,
            published_date: None,
            language: None,
            url: Some("https://example.com/".into()),
            site_name: None,
            image: None,
            favicon: None,
            word_count,
            word_count_article: article,
            word_count_chrome: chrome,
            http_status: None,
        };
        let content = Content {
            markdown: String::new(),
            plain_text: String::new(),
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        };
        ExtractionResult {
            metadata,
            content,
            domain_data: None,
            structured_data: Vec::new(),
        }
    }

    /// Renders the metadata header for a result with the given counts and
    /// returns it as a string.
    fn render_header(word_count: usize, article: usize, chrome: usize) -> String {
        let result = make_result_with_wc(word_count, article, chrome);
        let mut header = String::new();
        build_metadata_header(&mut header, &result, None);
        header
    }

    /// Phase B test 1: with M + K == N and at least one of them non-zero,
    /// the header uses the parenthetical breakdown form.
    #[test]
    fn header_emits_breakdown_when_article_plus_chrome_equals_total() {
        let header = render_header(1000, 600, 400);
        assert!(
            header.contains("> Word count: 1000 (article: 600, chrome: 400)"),
            "expected breakdown form, got: {header}"
        );
        assert!(
            !header.contains("> Word count: 1000\n"),
            "must not contain legacy form when breakdown is present, got: {header}"
        );
    }

    /// Phase B test 2: with both breakdown fields left at zero, the header
    /// keeps the legacy single-N form. This protects every fixture that
    /// does not pre-populate the breakdown.
    #[test]
    fn header_falls_back_to_legacy_form_when_breakdown_unpopulated() {
        let header = render_header(1000, 0, 0);
        assert!(
            header.contains("> Word count: 1000\n"),
            "expected legacy single-N form, got: {header}"
        );
        assert!(
            !header.contains("(article:"),
            "must not contain parenthetical when fields are zero, got: {header}"
        );
    }

    /// Phase B test 3: an all-article result (chrome == 0) still gets the
    /// breakdown form, so the header shape matches the populated fields.
    #[test]
    fn header_emits_breakdown_with_chrome_zero_when_article_equals_total() {
        let header = render_header(500, 500, 0);
        assert!(
            header.contains("> Word count: 500 (article: 500, chrome: 0)"),
            "expected breakdown with chrome=0, got: {header}"
        );
    }

    /// Phase B test 4: a zero total produces no Word count line at all.
    #[test]
    fn header_omits_word_count_line_entirely_when_total_zero() {
        let header = render_header(0, 0, 0);
        assert!(
            !header.contains("Word count"),
            "expected no Word count line when total is 0, got: {header}"
        );
    }

    /// Phase B test 5: when article + chrome does not add up to the total,
    /// the formatter prints the legacy single-N form instead of surfacing
    /// inconsistent arithmetic. Defensive guard.
    #[test]
    fn header_falls_back_when_article_plus_chrome_mismatches_total() {
        // 600 + 300 != 1000
        let header = render_header(1000, 600, 300);
        assert!(
            header.contains("> Word count: 1000\n"),
            "expected legacy form when breakdown invariant violated, got: {header}"
        );
        assert!(
            !header.contains("(article:"),
            "must not surface inconsistent breakdown, got: {header}"
        );
    }
}
|
||||
|
|
|
|||
|
|
@ -17,6 +17,26 @@ mod thin_body;
|
|||
pub use hub_detect::{classify as classify_hub, HubClassification};
|
||||
pub use sections::{collect_section_links, to_json_sections, to_llm_sections};
|
||||
pub use thin_body::{classify as classify_thin_body, ThinBodyClassification};
|
||||
|
||||
/// Count words in the body AFTER the same processing pipeline the LLM
|
||||
/// formatter applies (image / link-syntax / framework-blob stripping, dedup,
|
||||
/// whitespace collapse). Words inside markdown link patterns `[text](url)`
|
||||
/// are excluded by the pipeline — what's left is "real" article-body prose.
|
||||
///
|
||||
/// Used by M12 (`word_count_article` / `word_count_chrome` breakdown) as the
|
||||
/// fallback estimator when no JSON-LD `articleBody` / `reviewBody` is
|
||||
/// available. Mirrors `hub_detect::count_body_words` — same dependency on
|
||||
/// `body::process_body(...).text.split_whitespace().count()` — exposed
|
||||
/// publicly so `lib.rs::extract_with_options_inner` can populate the
|
||||
/// `Metadata.word_count_article` field without reaching across the
|
||||
/// `llm::body` `pub(crate)` boundary.
|
||||
pub fn body_word_count(markdown: &str) -> usize {
|
||||
body::process_body(markdown)
|
||||
.text
|
||||
.split_whitespace()
|
||||
.filter(|w| !w.is_empty())
|
||||
.count()
|
||||
}
|
||||
pub use output_size::{
|
||||
to_json_summary, to_json_toc, to_llm_summary, to_llm_toc, truncate_json_with_wrapper,
|
||||
truncate_with_footer,
|
||||
|
|
@ -348,6 +368,12 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: 42,
|
||||
// M12: default fixture leaves breakdown unset (zero); the
|
||||
// header formatter falls back to the legacy `Word count: N`
|
||||
// form when article+chrome != total. Tests that need the
|
||||
// breakdown set these explicitly.
|
||||
word_count_article: 0,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
|
|
@ -602,6 +628,8 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: 0,
|
||||
word_count_article: 0,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
|
|
|
|||
|
|
@ -359,6 +359,8 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: 0,
|
||||
word_count_article: 0,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
|
|
|
|||
|
|
@ -316,6 +316,8 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: 0,
|
||||
word_count_article: 0,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
|
|
|
|||
|
|
@ -152,6 +152,8 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
word_count_article: 0,
|
||||
word_count_chrome: 0,
|
||||
http_status: Some(200),
|
||||
},
|
||||
content: Content {
|
||||
|
|
|
|||
|
|
@ -51,8 +51,10 @@ pub fn extract(doc: &Html, url: Option<&str>) -> Metadata {
|
|||
site_name,
|
||||
image,
|
||||
favicon,
|
||||
word_count: 0, // filled later by the extractor
|
||||
http_status: None, // filled by webclaw-fetch when reachable; None for local-file / --stdin
|
||||
word_count: 0, // filled later by the extractor
|
||||
word_count_article: 0, // filled by M12 breakdown in extract_with_options_inner
|
||||
word_count_chrome: 0, // filled by M12 breakdown in extract_with_options_inner
|
||||
http_status: None, // filled by webclaw-fetch when reachable; None for local-file / --stdin
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,28 @@ pub struct Metadata {
|
|||
pub image: Option<String>,
|
||||
pub favicon: Option<String>,
|
||||
pub word_count: usize,
|
||||
/// Article-body portion of `word_count`, computed in
|
||||
/// `extract_with_options_inner`. Sourced from JSON-LD `articleBody`
|
||||
/// (or `reviewBody`) via the M4 classifier when present; falls back
|
||||
/// to the M2-style body-text heuristic (`llm::body_word_count` on the
|
||||
/// extracted markdown) when JSON-LD body is absent. Always satisfies
|
||||
/// `word_count_article + word_count_chrome == word_count` (M is
|
||||
/// clamped to N if the JSON-LD body has more words than the extracted
|
||||
/// markdown). Serialized unconditionally (parallel to `word_count`)
|
||||
/// so callers can rely on both M12 fields being present alongside
|
||||
/// the existing total; zero on the no-breakdown path (local-file /
|
||||
/// --stdin / direct `extract_with_options` callers). `default = 0`
|
||||
/// for backward-compat on incoming JSON that predates M12. M12 /
|
||||
/// issue #7.
|
||||
#[serde(default)]
|
||||
pub word_count_article: usize,
|
||||
/// Chrome portion of `word_count` (= `word_count - word_count_article`).
|
||||
/// "Chrome" means everything not in the article body: navigation,
|
||||
/// related-link sidebars, footers, ad slots, ticker rows, link cards,
|
||||
/// etc. Serialized unconditionally (parallel to `word_count`); zero
|
||||
/// on the no-breakdown path. M12 / issue #7.
|
||||
#[serde(default)]
|
||||
pub word_count_chrome: usize,
|
||||
/// HTTP status code from the final response (after redirects). `None`
|
||||
/// when extraction was not preceded by an HTTP fetch — e.g. `--file`,
|
||||
/// `--stdin`, or any call into `extract_with_options` directly.
|
||||
|
|
@ -43,6 +65,7 @@ pub struct Metadata {
|
|||
pub http_status: Option<u16>,
|
||||
}
|
||||
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Content {
|
||||
pub markdown: String,
|
||||
|
|
|
|||
|
|
@ -899,6 +899,10 @@ fn pdf_to_extraction_result(
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
// M12: PDF text is body content end-to-end (no nav/chrome
|
||||
// wrapper extracted around it). Treat all words as article.
|
||||
word_count_article: word_count,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: webclaw_core::Content {
|
||||
|
|
|
|||
|
|
@ -110,6 +110,10 @@ pub fn extract_document(
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
// M12: document extractors (DOCX, CSV, etc.) emit body content
|
||||
// only; no chrome envelope. All words are article.
|
||||
word_count_article: word_count,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: webclaw_core::Content {
|
||||
|
|
|
|||
|
|
@ -216,6 +216,11 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult>
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
// M12: LinkedIn post payload IS the article body; comments
|
||||
// and reactions are appended below but treated as article
|
||||
// for this extractor (no separate chrome envelope).
|
||||
word_count_article: word_count,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
|
|
|
|||
|
|
@ -92,6 +92,10 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResul
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
// M12: Reddit JSON path emits post body + comments; no chrome
|
||||
// wrapper. All words are article-side content.
|
||||
word_count_article: word_count,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
|
|
|
|||
|
|
@ -518,6 +518,8 @@ impl WebclawMcp {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: markdown.split_whitespace().count(),
|
||||
word_count_article: 0,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
domain_data: None,
|
||||
|
|
|
|||
|
|
@ -65,6 +65,8 @@ fn empty_metadata() -> Metadata {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: 0,
|
||||
word_count_article: 0,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue