mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-09 22:35:12 +02:00
Strip more llm output noise: consent walls, bare integers, JSON-LD body duplication
- detect_empty: add a ConsentWall variant for GDPR/cookie redirects (Yahoo, Google, EU news sites). Detect via a final-URL host/path match (consent.*, /consent/, collectConsent) and warn with a proxy/cookie remediation hint.
- init_logging: silence html5ever / markup5ever / selectors WARNs by default (foster-parenting messages from malformed real-world HTML pollute stderr with dozens of lines per fetch; override via WEBCLAW_LOG).
- cleanup: add strip_bare_number_lines for paragraphs that are just a short integer (news-index comment counts, page numbers); make is_ui_control_token case-insensitive and extend UI_CONTROLS with pagination chrome (next, prev, previous, older, newer) plus bare <=4-digit integers so '0 Next'-style glued lines are caught.
- links: drop bare-integer link labels and #comment-stream / #comments / #disqus hrefs from the deduplicated Links section.
- mod: scrub articleBody / body / text / description fields from JSON-LD structured-data emission when they would duplicate the rendered markdown body (always for articleBody; conditional >=500 chars for the others).

All 292 core tests pass.
This commit is contained in:
parent
aa561e976a
commit
920d71f561
5 changed files with 210 additions and 5 deletions
|
|
@ -35,10 +35,38 @@ const ANTIBOT_TITLES: &[&str] = &[
|
|||
"ddos protection",
|
||||
];
|
||||
|
||||
/// Detect why a page returned empty content.
|
||||
/// URL host/path fragments that indicate a GDPR/cookie consent redirect.
|
||||
/// Yahoo, Google, and several EU news sites redirect to a `consent.*` host
|
||||
/// (or `/consent/` path) when the client is detected as being in the EU/EEA.
|
||||
/// The resulting page has near-zero usable content and a locale-specific
|
||||
/// title, so URL-shape detection is the most reliable signal.
///
/// Every entry is written in lowercase: `is_consent_wall` ASCII-lowercases the
/// URL and then does a plain substring search for each fragment, so an entry
/// containing uppercase letters would never match.
|
||||
const CONSENT_URL_FRAGMENTS: &[&str] = &[
|
||||
"://consent.",
|
||||
"/consent?",
|
||||
"/consent/",
|
||||
"collectconsent",
|
||||
"consentcheck",
|
||||
"/cmp/",
|
||||
"guce.advertising.com",
|
||||
];
|
||||
|
||||
/// English consent-wall title prefixes. Many providers localise the page,
|
||||
/// so this is a best-effort secondary signal — primary detection is the URL.
///
/// Entries are lowercase because `is_consent_wall` lowercases the page title
/// and tests each entry with `starts_with`; the title check is only applied
/// when the page has at most 50 words.
|
||||
const CONSENT_TITLES: &[&str] = &[
|
||||
"before you continue",
|
||||
"your privacy choices",
|
||||
"we value your privacy",
|
||||
"we care about your privacy",
|
||||
"cookie consent",
|
||||
"consent required",
|
||||
];
|
||||
|
||||
/// Detect why a page returned empty (or near-empty) content.
|
||||
enum EmptyReason {
|
||||
/// Anti-bot challenge page (Cloudflare, Akamai, etc.)
|
||||
Antibot,
|
||||
/// GDPR/cookie consent redirect — content is gated behind a consent form
|
||||
ConsentWall,
|
||||
/// JS-only SPA that returns an empty shell without a browser
|
||||
JsRequired,
|
||||
/// Page has content — not empty
|
||||
|
|
@ -46,6 +74,13 @@ enum EmptyReason {
|
|||
}
|
||||
|
||||
fn detect_empty(result: &ExtractionResult) -> EmptyReason {
|
||||
// Consent walls come back with a non-empty (but tiny) markdown body and a
|
||||
// post-redirect URL pointing at a consent host. Check before the
|
||||
// word-count short-circuit so the Yahoo / Google / EU case is caught.
|
||||
if is_consent_wall(result) {
|
||||
return EmptyReason::ConsentWall;
|
||||
}
|
||||
|
||||
// Has real content — nothing to warn about
|
||||
if result.metadata.word_count > 50 || !result.content.markdown.is_empty() {
|
||||
return EmptyReason::None;
|
||||
|
|
@ -67,6 +102,27 @@ fn detect_empty(result: &ExtractionResult) -> EmptyReason {
|
|||
EmptyReason::None
|
||||
}
|
||||
|
||||
/// A consent wall is identified by either:
|
||||
/// 1. The final (post-redirect) URL pointing at a known consent host/path, OR
|
||||
/// 2. A consent-wall title prefix, AND the body being very short (<= 50 words)
|
||||
fn is_consent_wall(result: &ExtractionResult) -> bool {
|
||||
if let Some(ref url) = result.metadata.url {
|
||||
let lower = url.to_ascii_lowercase();
|
||||
if CONSENT_URL_FRAGMENTS.iter().any(|f| lower.contains(f)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if result.metadata.word_count <= 50
|
||||
&& let Some(ref title) = result.metadata.title
|
||||
{
|
||||
let lower = title.to_lowercase();
|
||||
if CONSENT_TITLES.iter().any(|t| lower.starts_with(t)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn warn_empty(url: &str, reason: &EmptyReason) {
|
||||
match reason {
|
||||
EmptyReason::Antibot => eprintln!(
|
||||
|
|
@ -74,6 +130,12 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
|
|||
This site requires CAPTCHA solving or browser rendering.\n\
|
||||
Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
|
||||
),
|
||||
EmptyReason::ConsentWall => eprintln!(
|
||||
"\x1b[33mwarning:\x1b[0m GDPR/cookie consent wall detected on {url}\n\
|
||||
The site redirected to a consent page and returned no usable content.\n\
|
||||
Try a non-EU proxy via --proxy, or pass a pre-accepted consent cookie\n\
|
||||
via --cookie / --cookie-file."
|
||||
),
|
||||
EmptyReason::JsRequired => eprintln!(
|
||||
"\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
|
||||
This site requires JavaScript rendering (SPA).\n\
|
||||
|
|
@ -387,10 +449,15 @@ impl From<Browser> for BrowserProfile {
|
|||
}
|
||||
|
||||
fn init_logging(verbose: bool) {
|
||||
// html5ever / markup5ever / selectors emit WARN on every page with real-world
|
||||
// HTML quirks (foster-parenting, malformed tables). They are not actionable
|
||||
// and pollute stderr with dozens of lines per fetch. Silence by default; users
|
||||
// who need them can override via WEBCLAW_LOG.
|
||||
let default = "warn,html5ever=error,markup5ever=error,selectors=error";
|
||||
let filter = if verbose {
|
||||
EnvFilter::new("webclaw=debug")
|
||||
EnvFilter::new("webclaw=debug,html5ever=error,markup5ever=error,selectors=error")
|
||||
} else {
|
||||
EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new("warn"))
|
||||
EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new(default))
|
||||
};
|
||||
|
||||
tracing_subscriber::fmt().with_env_filter(filter).init();
|
||||
|
|
|
|||
|
|
@ -73,6 +73,18 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody {
|
|||
// d. Extract links, replace inline `[text](url)` with just `text`
|
||||
let (text, extracted_links) = links::extract_and_strip_links(&text);
|
||||
|
||||
// d1b. Strip bare-integer paragraphs (news-index comment counts, page
|
||||
// numbers). Must run AFTER link extraction so [0](#comment) collapses
|
||||
// to "0" first, and BEFORE dedup so identical 0 lines don't muddle
|
||||
// fingerprint dedup.
|
||||
let text = cleanup::strip_bare_number_lines(&text);
|
||||
|
||||
// d1c. Second UI-control pass: after link extraction, lines that were
|
||||
// previously `[0](url) Next` now read as `0 Next` -- a pure control
|
||||
// line that the pre-link c3 pass couldn't see through the link
|
||||
// syntax.
|
||||
let text = cleanup::strip_ui_control_text(&text);
|
||||
|
||||
// d2. Collapse repeated adjacent phrases on the same line
|
||||
// (responsive variants: "Read more Read more Read more" -> "Read more")
|
||||
let text = dedup_repeated_phrases(&text);
|
||||
|
|
|
|||
|
|
@ -395,6 +395,9 @@ pub(crate) fn is_ui_control_line(line: &str) -> bool {
|
|||
|
||||
/// Known UI control tokens from Material Icons ligatures, icon fonts, and
|
||||
/// common navigation elements that leak into text extraction.
|
||||
///
|
||||
/// Match is case-insensitive: `Next`, `next`, and `NEXT` are all treated as
|
||||
/// pagination chrome when alone on a line.
|
||||
fn is_ui_control_token(token: &str) -> bool {
|
||||
const UI_CONTROLS: &[&str] = &[
|
||||
// Material Icons ligatures
|
||||
|
|
@ -428,6 +431,12 @@ fn is_ui_control_token(token: &str) -> bool {
|
|||
"search",
|
||||
"menu",
|
||||
"share",
|
||||
// Pagination chrome left over from rendered "Next | Previous" links
|
||||
"next",
|
||||
"previous",
|
||||
"prev",
|
||||
"older",
|
||||
"newer",
|
||||
// Arrow/nav characters
|
||||
"\u{2190}",
|
||||
"\u{2192}",
|
||||
|
|
@ -444,7 +453,64 @@ fn is_ui_control_token(token: &str) -> bool {
|
|||
"\u{00BB}",
|
||||
"\u{00AB}",
|
||||
];
|
||||
UI_CONTROLS.contains(&token)
|
||||
let lowered = token.to_ascii_lowercase();
|
||||
if UI_CONTROLS.contains(&lowered.as_str()) {
|
||||
return true;
|
||||
}
|
||||
// Bare short integers (≤4 digits) on a line of otherwise pure control
|
||||
// tokens are comment counts / page numbers glued to pagination chrome
|
||||
// (e.g. "0 Next" at the bottom of news index pages). Counting these as
|
||||
// controls lets `is_ui_control_line` strip the whole line. A legitimate
|
||||
// line like "5 minutes" stays because "minutes" is not a control.
|
||||
!token.is_empty() && token.len() <= 4 && token.chars().all(|c| c.is_ascii_digit())
|
||||
}
|
||||
|
||||
/// Remove lines that are a bare short integer alone in their paragraph.
|
||||
///
|
||||
/// News index pages often render comment counts (`0`, `42`) and pagination
|
||||
/// page numbers (`1`, `2`) as standalone paragraphs after each article. These
|
||||
/// add zero signal and confuse downstream readers, but they are real numbers
|
||||
/// not control tokens, so [`strip_ui_control_text`] doesn't catch them.
|
||||
///
|
||||
/// To stay safe, we only drop a line if BOTH conditions hold:
|
||||
/// 1. The trimmed line parses cleanly as a non-negative integer ≤ 9999.
|
||||
/// 2. The line is alone in its paragraph (surrounded by blank lines, or at
|
||||
/// document edges). A `1.` or `- 1` list marker has different trim text and
|
||||
/// is not affected.
|
||||
pub(crate) fn strip_bare_number_lines(input: &str) -> String {
|
||||
let lines: Vec<&str> = input.lines().collect();
|
||||
let mut out: Vec<&str> = Vec::with_capacity(lines.len());
|
||||
let mut in_code = false;
|
||||
for (i, line) in lines.iter().enumerate() {
|
||||
let trimmed = line.trim();
|
||||
if trimmed.starts_with("```") {
|
||||
in_code = !in_code;
|
||||
out.push(line);
|
||||
continue;
|
||||
}
|
||||
if in_code {
|
||||
out.push(line);
|
||||
continue;
|
||||
}
|
||||
if is_bare_short_integer(trimmed) && is_isolated_in_paragraph(&lines, i) {
|
||||
continue;
|
||||
}
|
||||
out.push(line);
|
||||
}
|
||||
out.join("\n")
|
||||
}
|
||||
|
||||
/// Returns `true` when `s` is made up of one to four ASCII digits and nothing
/// else (no sign, no whitespace, no non-ASCII digits).
fn is_bare_short_integer(s: &str) -> bool {
    // Any non-ASCII character contributes bytes >= 0x80, none of which is an
    // ASCII digit, so checking bytes is equivalent to checking chars here.
    matches!(s.len(), 1..=4) && s.bytes().all(|b| b.is_ascii_digit())
}
|
||||
|
||||
/// Returns `true` when line `i` of `lines` has nothing but blank space around
/// it: the previous line is blank (or `i` is the first line) and the next line
/// is blank (or `i` is the last line). A line counts as blank when it is empty
/// after trimming whitespace. The content of line `i` itself is not inspected.
fn is_isolated_in_paragraph(lines: &[&str], i: usize) -> bool {
    let blank_at = |idx: usize| lines[idx].trim().is_empty();

    // Both sides are evaluated unconditionally before being combined.
    let clear_above = match i {
        0 => true,
        _ => blank_at(i - 1),
    };
    let clear_below = if i + 1 == lines.len() {
        true
    } else {
        blank_at(i + 1)
    };

    clear_above && clear_below
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -69,6 +69,21 @@ fn is_noise_link(text: &str, href: &str) -> bool {
|
|||
return true;
|
||||
}
|
||||
|
||||
// Bare-integer labels (comment counts, vote counts, page numbers). Always
|
||||
// noise; the number alone tells the LLM nothing. Capped at 4 digits so a
|
||||
// legitimately useful year-style label (`1999`) isn't dropped — but in
|
||||
// practice bare 4-digit labels in a links section are page anchors, not
|
||||
// articles, so this stays safe.
|
||||
if !text.is_empty() && text.len() <= 4 && text.chars().all(|c| c.is_ascii_digit()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// In-page comment/discussion fragments that survived the bare-fragment check
|
||||
// because the href is a full URL with `#comment-stream` (or similar) tail.
|
||||
if href.contains("#comment-stream") || href.contains("#comments") || href.contains("#disqus") {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Internal user profile / action URLs (HN-style)
|
||||
if href.contains("/user?id=")
|
||||
|| href.contains("/hide?id=")
|
||||
|
|
|
|||
|
|
@ -51,12 +51,17 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
|||
// hydration blobs (Next.js pageProps full of ad-targeting flags, build
|
||||
// IDs, schedule paths) explode to hundreds of KB and drown the LLM in
|
||||
// noise — drop them rather than ship them.
|
||||
let useful: Vec<_> = result
|
||||
let mut useful: Vec<_> = result
|
||||
.structured_data
|
||||
.iter()
|
||||
.filter(|v| is_useful_structured_data(v))
|
||||
.cloned()
|
||||
.collect();
|
||||
// Scrub body-duplicating fields so the article body doesn't ship twice
|
||||
// (once as rendered markdown, again inside JSON-LD `articleBody`).
|
||||
for v in &mut useful {
|
||||
scrub_body_fields(v);
|
||||
}
|
||||
if !useful.is_empty() {
|
||||
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
|
||||
const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
|
||||
|
|
@ -113,6 +118,46 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
|
|||
serialized.len() <= 4 * 1024
|
||||
}
|
||||
|
||||
/// Recursively remove fields whose value is a long body string already
|
||||
/// rendered as markdown in the main output.
|
||||
///
|
||||
/// Schema.org `NewsArticle` / `Article` records often embed the full article
|
||||
/// body inside `articleBody`. Emitting that alongside the rendered markdown
|
||||
/// ships the same content twice and wastes the LLM's context budget. We
|
||||
/// always strip `articleBody`, and additionally strip `body` / `text` /
|
||||
/// `description` when their value is ≥ 500 chars (short blurbs are kept
|
||||
/// because they carry signal that may not be in the rendered body).
|
||||
fn scrub_body_fields(v: &mut serde_json::Value) {
|
||||
const BODY_KEYS: &[&str] = &["articleBody"];
|
||||
const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
|
||||
const LONG_THRESHOLD: usize = 500;
|
||||
match v {
|
||||
serde_json::Value::Object(map) => {
|
||||
map.retain(|k, val| {
|
||||
if BODY_KEYS.contains(&k.as_str()) {
|
||||
return false;
|
||||
}
|
||||
if LONG_BODY_KEYS.contains(&k.as_str())
|
||||
&& let Some(s) = val.as_str()
|
||||
&& s.len() >= LONG_THRESHOLD
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
});
|
||||
for child in map.values_mut() {
|
||||
scrub_body_fields(child);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Array(arr) => {
|
||||
for child in arr.iter_mut() {
|
||||
scrub_body_fields(child);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Integration tests that exercise the full pipeline through to_llm_text
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue