From df8bdc96db1592c53fe67b0edeb09e1fe5b83803 Mon Sep 17 00:00:00 2001 From: Nenad Oric Date: Sun, 10 May 2026 14:00:54 +0200 Subject: [PATCH] Improve --format llm output quality on news index pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR fixes three independent issues that surface when running `webclaw --format llm` against modern news index pages. They were all reproducible against bbc.com/news/world and reuters.com/world/middle-east during a real briefing-generation run. ### 1. Framework hydration blobs no longer dump into the output `to_llm_text` was unconditionally appending every parsed structured-data item as a `## Structured Data` JSON fence. On Next.js sites, that means the entire `__NEXT_DATA__` `pageProps` object — ad-targeting flags, build IDs, schedule paths, feature toggles — gets serialized straight into the LLM context. On bbc.com/news/world it was about 140 KB of pure framework noise drowning the actual page content. The fix layers three filters: - Items with a Schema.org `@type` of `WebSite`, `WebPage`, or `SiteNavigationElement` are dropped as chrome. - Items without an `@type` (typical of `pageProps` or SvelteKit data) are kept only if their serialized size stays under 4 KB — small parsed records with real content survive, hydration blobs do not. - The whole section is suppressed if the total serialized size exceeds 16 KB, regardless of type. Past that threshold it is almost never useful to a downstream LLM. JSON-LD records with content-bearing `@type` values (`Article`, `NewsArticle`, `Product`, `Recipe`, `FAQPage`, `Event`, etc.) are preserved. ### 2. Element → Text node smashing `children_to_md` and `inline_text` only ran the `needs_separator` check on `Element → Element` transitions. When an element rendered text with no trailing whitespace and was followed by a sibling text node that started with a non-whitespace character, the two got concatenated with no separator. 
The same check now applies to the `Text` branch in both functions. ### 3. Accessibility link chrome no longer leaks into prose Sites like Reuters wrap external/new-window links with screen-reader-only spans (e.g. `, opens new tab`, `external link`). These have no consistent class hook, so the structural noise filter cannot reliably catch them and they bleed into the rendered text — sometimes dozens of times per page. A targeted regex scrub now runs in two places: in the body cleanup pipeline (`strip_a11y_link_chrome`, called early after `strip_leaked_js`) and in the link-label cleaner (`clean_link_label`) so the deduplicated `## Links` section is also clean. ### Tests All 286 existing unit tests pass. 8 new tests cover: - structured-data filter: chrome-type drop, oversized untyped drop, small untyped keep, `NewsArticle` keep - markdown separator: `Element → Text → Element` no longer smashes - a11y stripper: common phrasings, variant phrasings ("opens in a new window", "external link"), and code-fence preservation --- crates/webclaw-core/src/llm/body.rs | 3 + crates/webclaw-core/src/llm/cleanup.rs | 77 +++++++++++++++ crates/webclaw-core/src/llm/links.rs | 9 ++ crates/webclaw-core/src/llm/mod.rs | 129 ++++++++++++++++++++++++- crates/webclaw-core/src/markdown.rs | 20 ++++ 5 files changed, 234 insertions(+), 4 deletions(-) diff --git a/crates/webclaw-core/src/llm/body.rs b/crates/webclaw-core/src/llm/body.rs index 5311121..db2a011 100644 --- a/crates/webclaw-core/src/llm/body.rs +++ b/crates/webclaw-core/src/llm/body.rs @@ -29,6 +29,9 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody { // 0c. Strip leaked JavaScript (framework hydration, self.__wrap_n, etc.) let text = cleanup::strip_leaked_js(&text); + // 0c2. Strip a11y link chrome ("opens new tab", external link hints) + let text = cleanup::strip_a11y_link_chrome(&text); + // 0d. 
Collapse spaced-out text (CSS animation artifacts like "S t a r t") // Must run before any dedup -- spaced text confuses word-based dedup. let text = cleanup::collapse_spaced_text(&text); diff --git a/crates/webclaw-core/src/llm/cleanup.rs b/crates/webclaw-core/src/llm/cleanup.rs index c8e14ed..1f79361 100644 --- a/crates/webclaw-core/src/llm/cleanup.rs +++ b/crates/webclaw-core/src/llm/cleanup.rs @@ -146,6 +146,45 @@ pub(crate) fn strip_leaked_js(input: &str) -> String { out } +// --------------------------------------------------------------------------- +// Accessibility link chrome ("opens new tab", "external link") +// --------------------------------------------------------------------------- + +/// Strip screen-reader-only link chrome that bleeds into rendered text. +/// +/// Sites like Reuters wrap external/new-window links with hidden spans +/// whose text is e.g. `, opens new tab`. The structural noise +/// filter can't reliably catch these (no consistent class hook across +/// sites), so they end up duplicated all over the body text. This is a +/// targeted text-level scrub of the most common phrasings; fenced code is skipped. 
+pub(crate) fn strip_a11y_link_chrome(input: &str) -> String { + static A11Y_PATTERN: Lazy = Lazy::new(|| { + Regex::new( + r"(?i)\s*,?\s*\b(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?", + ) + .unwrap() + }); + + let mut out = String::with_capacity(input.len()); + let mut in_code_fence = false; + for (i, line) in input.lines().enumerate() { + if i > 0 { + out.push('\n'); + } + if line.trim().starts_with("```") { + in_code_fence = !in_code_fence; + out.push_str(line); + continue; + } + if in_code_fence { + out.push_str(line); + continue; + } + out.push_str(&A11Y_PATTERN.replace_all(line, "")); + } + out +} + // --------------------------------------------------------------------------- // Spaced-out text collapsing (CSS animation artifacts) // --------------------------------------------------------------------------- @@ -1356,4 +1395,42 @@ mod tests { let input = "```\nImage of something in code\n```"; assert_eq!(strip_alt_text_noise(input), input); } + + #[test] + fn a11y_strips_opens_new_tab() { + let input = "Download the App, opens new tab and Subscribe, opens new tab."; + let out = strip_a11y_link_chrome(input); + assert!(!out.to_lowercase().contains("opens new tab"), "leak: {out}"); + assert!(out.contains("Download the App")); + assert!(out.contains("Subscribe")); + } + + #[test] + fn a11y_strips_external_link_variants() { + let cases = [ + ("Visit our docs, opens external link", "Visit our docs"), + ("Click here, opens in a new window.", "Click here"), + ("More info external link", "More info"), + ]; + for (input, expected_prefix) in cases { + let out = strip_a11y_link_chrome(input); + assert!( + out.starts_with(expected_prefix), + "input={input:?} got={out:?}" + ); + assert!(!out.to_lowercase().contains("opens"), "leak: {out}"); + } + } + + #[test] + fn a11y_preserves_code_blocks() { + let input = "```\nopens new tab is a function\n```\nopens new tab here"; + let out = strip_a11y_link_chrome(input); + 
assert!( + out.contains("opens new tab is a function"), + "code stripped: {out}" + ); + // Outside the fence, the chrome is removed. + assert!(!out.ends_with("opens new tab here")); + } } diff --git a/crates/webclaw-core/src/llm/links.rs b/crates/webclaw-core/src/llm/links.rs index 0656aac..9873182 100644 --- a/crates/webclaw-core/src/llm/links.rs +++ b/crates/webclaw-core/src/llm/links.rs @@ -88,10 +88,19 @@ fn is_noise_link(text: &str, href: &str) -> bool { static MD_MARKERS_RE: Lazy = Lazy::new(|| Regex::new(r"#{1,6}\s+|\*{1,2}|_{1,2}|`").unwrap()); +static A11Y_LABEL_RE: Lazy = Lazy::new(|| { + Regex::new( + r"(?i)\s*,?\s*\b(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?", + ) + .unwrap() +}); + /// Clean a link label: strip markdown, dedup repeated phrases, truncate. pub(crate) fn clean_link_label(raw: &str) -> String { // Strip markdown markers let label = MD_MARKERS_RE.replace_all(raw, "").to_string(); + // Strip a11y link chrome ("opens new tab", etc.) + let label = A11Y_LABEL_RE.replace_all(&label, "").to_string(); let label = label.split_whitespace().collect::>().join(" "); // Dedup repeated phrases in label diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs index 126558f..7314cbe 100644 --- a/crates/webclaw-core/src/llm/mod.rs +++ b/crates/webclaw-core/src/llm/mod.rs @@ -46,15 +46,67 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String { } // -- 4. Structured data (NEXT_DATA, SvelteKit, JSON-LD) -- - if !result.structured_data.is_empty() { - out.push_str("\n\n## Structured Data\n\n```json\n"); - out.push_str(&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default()); - out.push_str("\n```"); + // Only emit useful items: Schema.org records with a meaningful @type, + // and only if the total serialized size stays under a budget. 
Framework + // hydration blobs (Next.js pageProps full of ad-targeting flags, build + // IDs, schedule paths) explode to hundreds of KB and drown the LLM in + // noise — drop them rather than ship them. + let useful: Vec<_> = result + .structured_data + .iter() + .filter(|v| is_useful_structured_data(v)) + .cloned() + .collect(); + if !useful.is_empty() { + let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default(); + const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024; + if serialized.len() <= STRUCTURED_DATA_MAX_BYTES { + out.push_str("\n\n## Structured Data\n\n```json\n"); + out.push_str(&serialized); + out.push_str("\n```"); + } } out.trim().to_string() } +/// Decide whether a structured-data value carries content worth emitting. +/// +/// Schema.org records with a recognizable content `@type` (Article, NewsArticle, +/// Product, Recipe, FAQPage, HowTo, Event, Person, Organization, BreadcrumbList, +/// VideoObject, JobPosting, etc.) are kept. Generic `WebSite` / `WebPage` / +/// `SiteNavigationElement` records are dropped as navigation chrome. Next.js +/// `pageProps`-style blobs without an `@type` are kept only when they serialize +/// to at most 4 KB; larger ones are almost always framework hydration state. +fn is_useful_structured_data(v: &serde_json::Value) -> bool { + let Some(obj) = v.as_object() else { + return false; + }; + // JSON-LD: @type drives the decision. + if let Some(t) = obj.get("@type") { + let type_str = match t { + serde_json::Value::String(s) => s.clone(), + serde_json::Value::Array(a) => a + .iter() + .filter_map(|x| x.as_str()) + .collect::>() + .join(","), + _ => String::new(), + }; + let lower = type_str.to_ascii_lowercase(); + // Drop low-info chrome types. + const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"]; + if DROP_TYPES.iter().any(|d| lower == *d) { + return false; + } + return !lower.is_empty(); + } + // Next.js pageProps / SvelteKit data without @type: keep only if compact. + // Anything over ~4KB is almost certainly hydration state, not content. 
+ let serialized = serde_json::to_string(v).unwrap_or_default(); + serialized.len() <= 4 * 1024 +} + // --------------------------------------------------------------------------- // Integration tests that exercise the full pipeline through to_llm_text // --------------------------------------------------------------------------- @@ -700,4 +752,73 @@ mod tests { assert!(out.contains("Some content"), "Content before lost: {out}"); assert!(out.contains("More content"), "Content after lost: {out}"); } + + // -- Structured-data gating tests -- + + fn make_result_with_structured(values: Vec) -> ExtractionResult { + let mut r = make_result("# Body"); + r.structured_data = values; + r + } + + #[test] + fn structured_data_drops_chrome_types() { + // WebSite/WebPage records are framework chrome — should be dropped. + let r = make_result_with_structured(vec![serde_json::json!({ + "@type": "WebSite", + "name": "Example", + "url": "https://example.com" + })]); + let out = to_llm_text(&r, None); + assert!( + !out.contains("## Structured Data"), + "WebSite chrome leaked into output: {out}" + ); + } + + #[test] + fn structured_data_keeps_article_types() { + let r = make_result_with_structured(vec![serde_json::json!({ + "@type": "NewsArticle", + "headline": "Big news", + "datePublished": "2026-05-10" + })]); + let out = to_llm_text(&r, None); + assert!( + out.contains("## Structured Data"), + "NewsArticle dropped: {out}" + ); + assert!(out.contains("Big news")); + } + + #[test] + fn structured_data_drops_oversized_blob() { + // 32KB pageProps-style blob with no @type — should be dropped. 
+ let big = "x".repeat(32 * 1024); + let r = make_result_with_structured(vec![serde_json::json!({ + "buildId": "abc", + "isFallback": false, + "noise": big + })]); + let out = to_llm_text(&r, None); + assert!( + !out.contains("## Structured Data"), + "Oversized untyped blob leaked: len={}", + out.len() + ); + } + + #[test] + fn structured_data_keeps_compact_untyped() { + // Small untyped record (e.g. a parsed pageProps with real content) — keep. + let r = make_result_with_structured(vec![serde_json::json!({ + "title": "Hi", + "body": "small enough to keep" + })]); + let out = to_llm_text(&r, None); + assert!( + out.contains("## Structured Data"), + "Compact untyped dropped: {out}" + ); + } } diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs index d0a2c23..cacadb2 100644 --- a/crates/webclaw-core/src/markdown.rs +++ b/crates/webclaw-core/src/markdown.rs @@ -320,6 +320,9 @@ fn children_to_md( } } Node::Text(text) => { + if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) { + out.push(' '); + } out.push_str(text); } _ => {} @@ -350,6 +353,9 @@ fn inline_text( } } Node::Text(text) => { + if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) { + out.push(' '); + } out.push_str(text); } _ => {} @@ -1606,4 +1612,18 @@ mod tests { "collapse_whitespace stripped 6-space indent: {output}" ); } + + #[test] + fn text_after_inline_element_keeps_separator() { + // Reuters-style markup: agoTanker crosses... + // The "ago" text node sits between two element children. Without a + // separator check on the Text branch, "ago" + "Tanker" would smash + // together as "agoTanker". + let html = r#"
3hagoTanker crosses Strait
"#; + let (md, _, _) = convert_html(html, None); + assert!( + !md.contains("agoTanker"), + "Element->Text->Element smashed together: {md}" + ); + } }