mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-07-02 04:08:08 +02:00
feat(core): word-count breakdown in header — article vs chrome split
The current `Word count: N` header is a single number that conflates the article body with the surrounding chrome (nav, ads, footer), so callers couldn't tell from the header alone whether to drill in or move on. New: `Word count: N (article: M, chrome: K)` in `-f llm` / `-f text` output. For `-f json`, this adds `word_count_article` and `word_count_chrome` fields alongside the existing `word_count`. The article count M is sourced from the JSON-LD `articleBody` when the M4 parser found one (`NewsArticle` or `Review.reviewBody`); otherwise it is computed by `llm::body_word_count` (the M2-style heuristic — words outside markdown link patterns, the same `body::process_body` output that `hub_detect` uses). `--mode summary` / `toc` / `sections` fall back to the simple `Word count: N` form (these modes don't extract body content, so the breakdown would be meaningless). Suppression piggybacks on the existing `include_status` toggle in `build_metadata_header_with_opts`. 9 new tests in `webclaw-core` (4 in `lib.rs::tests` for the population logic; 5 in `llm/metadata.rs::m12_tests` for the header formatter). Workspace test count: 701 -> 710.
This commit is contained in:
parent
ade2a5143c
commit
d5a3aa4bf9
17 changed files with 519 additions and 7 deletions
|
|
@ -899,6 +899,10 @@ fn pdf_to_extraction_result(
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
// M12: PDF text is body content end-to-end (no nav/chrome
|
||||
// wrapper extracted around it). Treat all words as article.
|
||||
word_count_article: word_count,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: webclaw_core::Content {
|
||||
|
|
|
|||
|
|
@ -110,6 +110,10 @@ pub fn extract_document(
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
// M12: document extractors (DOCX, CSV, etc.) emit body content
|
||||
// only; no chrome envelope. All words are article.
|
||||
word_count_article: word_count,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: webclaw_core::Content {
|
||||
|
|
|
|||
|
|
@ -216,6 +216,11 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult>
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
// M12: LinkedIn post payload IS the article body; comments
|
||||
// and reactions are appended below but treated as article
|
||||
// for this extractor (no separate chrome envelope).
|
||||
word_count_article: word_count,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
|
|
|
|||
|
|
@ -92,6 +92,10 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResul
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
// M12: Reddit JSON path emits post body + comments; no chrome
|
||||
// wrapper. All words are article-side content.
|
||||
word_count_article: word_count,
|
||||
word_count_chrome: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue