mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-09 22:35:12 +02:00
Improve --format llm output quality on news index pages
This PR fixes three independent issues that surface when running
`webclaw --format llm` against modern news index pages. They were
all reproducible against bbc.com/news/world and reuters.com/world/middle-east
during a real briefing-generation run.
### 1. Framework hydration blobs no longer dump into the output
`to_llm_text` was unconditionally appending every parsed structured-data
item as a `## Structured Data` JSON fence. On Next.js sites, that means
the entire `__NEXT_DATA__` `pageProps` object — ad-targeting flags,
build IDs, schedule paths, feature toggles — gets serialized straight
into the LLM context. On bbc.com/news/world it was about 140 KB of
pure framework noise drowning the actual page content.
The fix layers three filters:
- Items with a Schema.org `@type` of `WebSite`, `WebPage`, or
`SiteNavigationElement` are dropped as chrome.
- Items without an `@type` (typical of `pageProps` or SvelteKit
data) are kept only if their serialized size is at most 4 KB —
small parsed records with real content survive, hydration blobs
do not.
- The whole section is suppressed if the total serialized size
exceeds 16 KB, regardless of type. Past that threshold it is
almost never useful to a downstream LLM.
JSON-LD records with content-bearing `@type` values (`Article`,
`NewsArticle`, `Product`, `Recipe`, `FAQPage`, `Event`, etc.) are
preserved.
### 2. Element → Text node smashing
`children_to_md` and `inline_text` only ran the `needs_separator`
check on `Element → Element` transitions. When an element rendered
text with no trailing whitespace and was followed by a sibling
text node that started with a non-whitespace character, the two
got concatenated with no separator. The same check now applies to
the `Text` branch in both functions.
### 3. Accessibility link chrome no longer leaks into prose
Sites like Reuters wrap external/new-window links with
screen-reader-only spans (e.g. `, opens new tab`, `external link`).
These have no consistent class hook, so the structural noise filter
cannot reliably catch them and they bleed into the rendered text —
sometimes dozens of times per page.
A targeted regex scrub now runs in two places: in the body cleanup
pipeline (`strip_a11y_link_chrome`, called early after `strip_leaked_js`)
and in the link-label cleaner (`clean_link_label`) so the deduplicated
`## Links` section is also clean.
### Tests
All 286 existing unit tests pass. 8 new tests cover:
- structured-data filter: chrome-type drop, oversized untyped drop,
small untyped keep, `NewsArticle` keep
- markdown separator: `Element → Text → Element` no longer smashes
- a11y stripper: common phrasings, variant phrasings ("opens in a
new window", "external link"), and code-fence preservation
This commit is contained in:
parent
7f75143954
commit
df8bdc96db
5 changed files with 234 additions and 4 deletions
|
|
@ -29,6 +29,9 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody {
|
|||
// 0c. Strip leaked JavaScript (framework hydration, self.__wrap_n, etc.)
|
||||
let text = cleanup::strip_leaked_js(&text);
|
||||
|
||||
// 0c2. Strip a11y link chrome ("opens new tab", external link hints)
|
||||
let text = cleanup::strip_a11y_link_chrome(&text);
|
||||
|
||||
// 0d. Collapse spaced-out text (CSS animation artifacts like "S t a r t")
|
||||
// Must run before any dedup -- spaced text confuses word-based dedup.
|
||||
let text = cleanup::collapse_spaced_text(&text);
|
||||
|
|
|
|||
|
|
@ -146,6 +146,45 @@ pub(crate) fn strip_leaked_js(input: &str) -> String {
|
|||
out
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Accessibility link chrome ("opens new tab", "external link")
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Strip screen-reader-only link chrome that bleeds into rendered text.
|
||||
///
|
||||
/// Sites like Reuters wrap external/new-window links with hidden spans
|
||||
/// like `<span class="visually-hidden">, opens new tab</span>`. The noise
|
||||
/// filter can't reliably catch these (no consistent class hook across
|
||||
/// sites), so they end up duplicated all over the body text. This is a
|
||||
/// targeted text-level scrub of the most common phrasings.
|
||||
pub(crate) fn strip_a11y_link_chrome(input: &str) -> String {
|
||||
static A11Y_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(
|
||||
r"(?i)\s*,?\s*\b(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
let mut out = String::with_capacity(input.len());
|
||||
let mut in_code_fence = false;
|
||||
for (i, line) in input.lines().enumerate() {
|
||||
if i > 0 {
|
||||
out.push('\n');
|
||||
}
|
||||
if line.trim().starts_with("```") {
|
||||
in_code_fence = !in_code_fence;
|
||||
out.push_str(line);
|
||||
continue;
|
||||
}
|
||||
if in_code_fence {
|
||||
out.push_str(line);
|
||||
continue;
|
||||
}
|
||||
out.push_str(&A11Y_PATTERN.replace_all(line, ""));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Spaced-out text collapsing (CSS animation artifacts)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -1356,4 +1395,42 @@ mod tests {
|
|||
let input = "```\nImage of something in code\n```";
|
||||
assert_eq!(strip_alt_text_noise(input), input);
|
||||
}
|
||||
|
||||
#[test]
fn a11y_strips_opens_new_tab() {
    // Two occurrences on a single line: both annotations must disappear
    // while the surrounding link labels remain.
    let out =
        strip_a11y_link_chrome("Download the App, opens new tab and Subscribe, opens new tab.");
    let lowered = out.to_lowercase();
    assert!(!lowered.contains("opens new tab"), "leak: {out}");
    for label in ["Download the App", "Subscribe"] {
        assert!(out.contains(label));
    }
}
|
||||
|
||||
#[test]
fn a11y_strips_external_link_variants() {
    // Each case pairs an input with the exact text expected after scrubbing.
    // Comparing the whole output (rather than only a prefix) ensures the test
    // fails if the annotation is left in place — a prefix check alone passes
    // even when nothing is stripped, and the bare "external link" variant
    // contains no "opens" for a substring check to catch.
    let cases = [
        ("Visit our docs, opens external link", "Visit our docs"),
        ("Click here, opens in a new window.", "Click here"),
        ("More info external link", "More info"),
    ];
    for (input, expected) in cases {
        let out = strip_a11y_link_chrome(input);
        assert_eq!(out, expected, "input={input:?} got={out:?}");
    }
}
|
||||
|
||||
#[test]
fn a11y_preserves_code_blocks() {
    let input = "```\nopens new tab is a function\n```\nopens new tab here";
    let out = strip_a11y_link_chrome(input);

    // Inside the fence the phrase is code and must be left alone.
    assert!(
        out.contains("opens new tab is a function"),
        "code stripped: {out}"
    );

    // Outside the fence, the chrome is removed but the remaining text stays.
    // Inspect the final line directly: a whole-string `ends_with` check would
    // also pass for outputs that still carry the phrase in a different shape.
    let last_line = out.lines().last().unwrap_or_default();
    assert!(
        !last_line.to_lowercase().contains("opens new tab"),
        "chrome kept outside fence: {out}"
    );
    assert!(last_line.contains("here"), "prose lost outside fence: {out}");
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -88,10 +88,19 @@ fn is_noise_link(text: &str, href: &str) -> bool {
|
|||
/// Markdown markers removed from link labels by `clean_link_label`: heading
/// prefixes (one to six `#` followed by whitespace), one or two `*` or `_`,
/// and backticks.
static MD_MARKERS_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r"#{1,6}\s+|\*{1,2}|_{1,2}|`").unwrap());
|
||||

||||
/// Case-insensitive match for accessibility link chrome inside a link label
/// ("opens new tab", "opens in a new window", "opens external link/website",
/// "external link"), including an optional leading comma/whitespace and an
/// optional trailing period. `clean_link_label` deletes every match.
static A11Y_LABEL_RE: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(
|
||||
r"(?i)\s*,?\s*\b(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
/// Clean a link label: strip markdown, dedup repeated phrases, truncate.
|
||||
pub(crate) fn clean_link_label(raw: &str) -> String {
|
||||
// Strip markdown markers
|
||||
let label = MD_MARKERS_RE.replace_all(raw, "").to_string();
|
||||
// Strip a11y link chrome ("opens new tab", etc.)
|
||||
let label = A11Y_LABEL_RE.replace_all(&label, "").to_string();
|
||||
let label = label.split_whitespace().collect::<Vec<_>>().join(" ");
|
||||
|
||||
// Dedup repeated phrases in label
|
||||
|
|
|
|||
|
|
@ -46,15 +46,67 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
|||
}
|
||||
|
||||
// -- 4. Structured data (NEXT_DATA, SvelteKit, JSON-LD) --
|
||||
if !result.structured_data.is_empty() {
|
||||
out.push_str("\n\n## Structured Data\n\n```json\n");
|
||||
out.push_str(&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default());
|
||||
out.push_str("\n```");
|
||||
// Only emit useful items: Schema.org records with a meaningful @type,
|
||||
// and only if the total serialized size stays under a budget. Framework
|
||||
// hydration blobs (Next.js pageProps full of ad-targeting flags, build
|
||||
// IDs, schedule paths) explode to hundreds of KB and drown the LLM in
|
||||
// noise — drop them rather than ship them.
|
||||
let useful: Vec<_> = result
|
||||
.structured_data
|
||||
.iter()
|
||||
.filter(|v| is_useful_structured_data(v))
|
||||
.cloned()
|
||||
.collect();
|
||||
if !useful.is_empty() {
|
||||
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
|
||||
const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
|
||||
if serialized.len() <= STRUCTURED_DATA_MAX_BYTES {
|
||||
out.push_str("\n\n## Structured Data\n\n```json\n");
|
||||
out.push_str(&serialized);
|
||||
out.push_str("\n```");
|
||||
}
|
||||
}
|
||||
|
||||
out.trim().to_string()
|
||||
}
|
||||
|
||||
/// Decide whether a structured-data value carries content worth emitting.
|
||||
///
|
||||
/// Schema.org records with a recognizable content `@type` (Article, NewsArticle,
|
||||
/// Product, Recipe, FAQPage, HowTo, Event, Person, Organization, BreadcrumbList,
|
||||
/// VideoObject, JobPosting, etc.) are kept. Generic `WebSite` / `WebPage` /
|
||||
/// `ItemList` records and Next.js `pageProps`-style blobs without a useful
|
||||
/// `@type` are dropped — they're almost always navigation chrome or framework
|
||||
/// hydration state.
|
||||
fn is_useful_structured_data(v: &serde_json::Value) -> bool {
|
||||
let Some(obj) = v.as_object() else {
|
||||
return false;
|
||||
};
|
||||
// JSON-LD: @type drives the decision.
|
||||
if let Some(t) = obj.get("@type") {
|
||||
let type_str = match t {
|
||||
serde_json::Value::String(s) => s.clone(),
|
||||
serde_json::Value::Array(a) => a
|
||||
.iter()
|
||||
.filter_map(|x| x.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(","),
|
||||
_ => String::new(),
|
||||
};
|
||||
let lower = type_str.to_ascii_lowercase();
|
||||
// Drop low-info chrome types.
|
||||
const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"];
|
||||
if DROP_TYPES.iter().any(|d| lower == *d) {
|
||||
return false;
|
||||
}
|
||||
return !lower.is_empty();
|
||||
}
|
||||
// Next.js pageProps / SvelteKit data without @type: keep only if compact.
|
||||
// Anything over ~4KB is almost certainly hydration state, not content.
|
||||
let serialized = serde_json::to_string(v).unwrap_or_default();
|
||||
serialized.len() <= 4 * 1024
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Integration tests that exercise the full pipeline through to_llm_text
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -700,4 +752,73 @@ mod tests {
|
|||
assert!(out.contains("Some content"), "Content before lost: {out}");
|
||||
assert!(out.contains("More content"), "Content after lost: {out}");
|
||||
}
|
||||
|
||||
// -- Structured-data gating tests --
|
||||
|
||||
fn make_result_with_structured(values: Vec<serde_json::Value>) -> ExtractionResult {
|
||||
let mut r = make_result("# Body");
|
||||
r.structured_data = values;
|
||||
r
|
||||
}
|
||||
|
||||
#[test]
fn structured_data_drops_chrome_types() {
    // A lone WebSite record is site chrome, so no section may be emitted.
    let website = serde_json::json!({
        "@type": "WebSite",
        "name": "Example",
        "url": "https://example.com"
    });
    let out = to_llm_text(&make_result_with_structured(vec![website]), None);
    assert!(
        !out.contains("## Structured Data"),
        "WebSite chrome leaked into output: {out}"
    );
}
|
||||
|
||||
#[test]
fn structured_data_keeps_article_types() {
    // Content-bearing JSON-LD must pass the filter and be rendered in full.
    let article = serde_json::json!({
        "@type": "NewsArticle",
        "headline": "Big news",
        "datePublished": "2026-05-10"
    });
    let out = to_llm_text(&make_result_with_structured(vec![article]), None);
    assert!(
        out.contains("## Structured Data"),
        "NewsArticle dropped: {out}"
    );
    assert!(out.contains("Big news"));
}
|
||||
|
||||
#[test]
fn structured_data_drops_oversized_blob() {
    // 8 KB pageProps-style blob with no @type — should be dropped.
    // The size sits above the 4 KB per-item limit but below the 16 KB limit
    // on the whole section, so the test fails if the per-item filter stops
    // working instead of being masked by the section-level cap.
    let big = "x".repeat(8 * 1024);
    let r = make_result_with_structured(vec![serde_json::json!({
        "buildId": "abc",
        "isFallback": false,
        "noise": big
    })]);
    let out = to_llm_text(&r, None);
    assert!(
        !out.contains("## Structured Data"),
        "Oversized untyped blob leaked: len={}",
        out.len()
    );
}
|
||||
|
||||
#[test]
fn structured_data_keeps_compact_untyped() {
    // An untyped record that is small (e.g. parsed pageProps carrying real
    // content) must still be emitted.
    let record = serde_json::json!({
        "title": "Hi",
        "body": "small enough to keep"
    });
    let out = to_llm_text(&make_result_with_structured(vec![record]), None);
    assert!(
        out.contains("## Structured Data"),
        "Compact untyped dropped: {out}"
    );
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -320,6 +320,9 @@ fn children_to_md(
|
|||
}
|
||||
}
|
||||
Node::Text(text) => {
|
||||
if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
|
||||
out.push(' ');
|
||||
}
|
||||
out.push_str(text);
|
||||
}
|
||||
_ => {}
|
||||
|
|
@ -350,6 +353,9 @@ fn inline_text(
|
|||
}
|
||||
}
|
||||
Node::Text(text) => {
|
||||
if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
|
||||
out.push(' ');
|
||||
}
|
||||
out.push_str(text);
|
||||
}
|
||||
_ => {}
|
||||
|
|
@ -1606,4 +1612,18 @@ mod tests {
|
|||
"collapse_whitespace stripped 6-space indent: {output}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn text_after_inline_element_keeps_separator() {
    // Simplified from Reuters-style markup: a bare "ago" text node sits
    // between two element children (<span>3h</span> and
    // <span>Tanker crosses Strait</span>).
    //
    // Two joins are exercised:
    // - Element -> Text: "3h" followed by "ago". This is the transition the
    //   separator check on the Text branch is responsible for.
    // - Text -> Element: "ago" followed by "Tanker".
    let html = r#"<div><span>3h</span>ago<span>Tanker crosses Strait</span></div>"#;
    let (md, _, _) = convert_html(html, None);
    assert!(
        !md.contains("3hago"),
        "Element->Text smashed together: {md}"
    );
    assert!(
        !md.contains("agoTanker"),
        "Element->Text->Element smashed together: {md}"
    );
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue