mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-24 02:58:05 +02:00
Polish llm output quality fixes
This commit is contained in:
parent
df8bdc96db
commit
aa561e976a
7 changed files with 151 additions and 26 deletions
|
|
@ -3,6 +3,15 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [0.6.0] — 2026-05-10
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Improved `--format llm` output quality on modern news and documentation pages. Framework hydration blobs and low-value page chrome structured-data records are now filtered out before they can flood the LLM context, while content-bearing Schema.org records are preserved. Thanks and congrats to Nenad Oric (`@devnen`) for the contribution in PR #37.
|
||||||
|
- Fixed element-to-text spacing so adjacent inline nodes no longer smash words together, while punctuation stays attached on real pages such as docs, forums, and reference sites.
|
||||||
|
- Removed common screen-reader-only link chrome such as "opens new tab" from LLM body text and link labels without stripping ordinary prose that happens to mention external links.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.5.9] — 2026-05-06
|
## [0.5.9] — 2026-05-06
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
|
||||||
14
Cargo.lock
generated
14
Cargo.lock
generated
|
|
@ -3219,7 +3219,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.5.9"
|
version = "0.6.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3240,7 +3240,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.5.9"
|
version = "0.6.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3258,7 +3258,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.5.9"
|
version = "0.6.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
|
|
@ -3284,7 +3284,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.5.9"
|
version = "0.6.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3297,7 +3297,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.5.9"
|
version = "0.6.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dirs",
|
"dirs",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3317,7 +3317,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.5.9"
|
version = "0.6.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
@ -3326,7 +3326,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-server"
|
name = "webclaw-server"
|
||||||
version = "0.5.9"
|
version = "0.6.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"axum",
|
"axum",
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
||||||
members = ["crates/*"]
|
members = ["crates/*"]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.5.9"
|
version = "0.6.0"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "AGPL-3.0"
|
license = "AGPL-3.0"
|
||||||
repository = "https://github.com/0xMassi/webclaw"
|
repository = "https://github.com/0xMassi/webclaw"
|
||||||
|
|
|
||||||
|
|
@ -160,7 +160,7 @@ pub(crate) fn strip_leaked_js(input: &str) -> String {
|
||||||
pub(crate) fn strip_a11y_link_chrome(input: &str) -> String {
|
pub(crate) fn strip_a11y_link_chrome(input: &str) -> String {
|
||||||
static A11Y_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
static A11Y_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
||||||
Regex::new(
|
Regex::new(
|
||||||
r"(?i)\s*,?\s*\b(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?",
|
r"(?i)(?:\s*,\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?|\s+\((?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\)\.?|\s+external link\b\.?$)",
|
||||||
)
|
)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
});
|
});
|
||||||
|
|
@ -1424,13 +1424,19 @@ mod tests {
|
||||||
|
|
||||||
#[test]
fn a11y_preserves_code_blocks() {
    // A fenced snippet that merely contains the chrome phrase, followed by a
    // genuine comma-introduced chrome suffix outside the fence.
    let fenced_then_chrome = "```\nopens new tab is a function\n```\nDownload, opens new tab";
    let out = strip_a11y_link_chrome(fenced_then_chrome);

    // Inside the fence the phrase is code and must be left untouched.
    let code_kept = out.contains("opens new tab is a function");
    assert!(code_kept, "code stripped: {out}");

    // Outside the fence, the chrome is removed.
    let chrome_remains = out.to_lowercase().contains("download, opens new tab");
    assert!(!chrome_remains);
}
|
||||||
|
|
||||||
|
#[test]
fn a11y_preserves_external_link_prose() {
    // "external link" in the middle of a sentence is ordinary prose, not
    // screen-reader chrome, so the text must come back unchanged.
    let prose = "Researchers found an external link between the two incidents.";
    let cleaned = strip_a11y_link_chrome(prose);
    assert_eq!(cleaned, prose);
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -90,7 +90,7 @@ static MD_MARKERS_RE: Lazy<Regex> =
|
||||||
|
|
||||||
static A11Y_LABEL_RE: Lazy<Regex> = Lazy::new(|| {
|
static A11Y_LABEL_RE: Lazy<Regex> = Lazy::new(|| {
|
||||||
Regex::new(
|
Regex::new(
|
||||||
r"(?i)\s*,?\s*\b(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?",
|
r"(?i)(?:\s*,?\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website))\b\.?|\s*,\s*external link\b\.?|\s+external link\b\.?$)",
|
||||||
)
|
)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
});
|
});
|
||||||
|
|
@ -190,4 +190,20 @@ mod tests {
|
||||||
assert!(is_noise_link("user", "https://hn.com/user?id=foo"));
|
assert!(is_noise_link("user", "https://hn.com/user?id=foo"));
|
||||||
assert!(!is_noise_link("Rust docs", "https://rust-lang.org"));
|
assert!(!is_noise_link("Rust docs", "https://rust-lang.org"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn link_label_preserves_external_link_prose() {
    // Mid-label "external link" is real wording, so the label is returned
    // exactly as given.
    let label = "Research found an external link between incidents";
    let cleaned = clean_link_label(label);
    assert_eq!(cleaned, label);
}
|
||||||
|
|
||||||
|
#[test]
fn link_label_strips_terminal_external_link_chrome() {
    // A trailing "external link" with nothing after it is screen-reader
    // chrome and is dropped from the label.
    let cleaned = clean_link_label("Reuters story external link");
    assert_eq!(cleaned, "Reuters story");
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -80,26 +80,32 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
||||||
/// hydration state.
|
/// hydration state.
|
||||||
fn is_useful_structured_data(v: &serde_json::Value) -> bool {
|
fn is_useful_structured_data(v: &serde_json::Value) -> bool {
|
||||||
let Some(obj) = v.as_object() else {
|
let Some(obj) = v.as_object() else {
|
||||||
|
// SvelteKit can emit compact arrays of page data. Keep those if they
|
||||||
|
// are small enough to be useful, while still dropping giant hydration
|
||||||
|
// arrays under the same budget as untyped objects.
|
||||||
|
if v.is_array() {
|
||||||
|
let serialized = serde_json::to_string(v).unwrap_or_default();
|
||||||
|
return serialized.len() <= 4 * 1024;
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
// JSON-LD: @type drives the decision.
|
// JSON-LD: @type drives the decision.
|
||||||
if let Some(t) = obj.get("@type") {
|
if let Some(t) = obj.get("@type") {
|
||||||
let type_str = match t {
|
let types: Vec<String> = match t {
|
||||||
serde_json::Value::String(s) => s.clone(),
|
serde_json::Value::String(s) => vec![s.to_ascii_lowercase()],
|
||||||
serde_json::Value::Array(a) => a
|
serde_json::Value::Array(a) => a
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|x| x.as_str())
|
.filter_map(|x| x.as_str())
|
||||||
.collect::<Vec<_>>()
|
.map(str::to_ascii_lowercase)
|
||||||
.join(","),
|
.collect(),
|
||||||
_ => String::new(),
|
_ => Vec::new(),
|
||||||
};
|
};
|
||||||
let lower = type_str.to_ascii_lowercase();
|
if types.is_empty() {
|
||||||
// Drop low-info chrome types.
|
|
||||||
const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"];
|
|
||||||
if DROP_TYPES.iter().any(|d| lower == *d) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return !lower.is_empty();
|
// Drop low-info chrome types.
|
||||||
|
const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"];
|
||||||
|
return types.iter().any(|t| !DROP_TYPES.iter().any(|d| t == d));
|
||||||
}
|
}
|
||||||
// Next.js pageProps / SvelteKit data without @type: keep only if compact.
|
// Next.js pageProps / SvelteKit data without @type: keep only if compact.
|
||||||
// Anything over ~4KB is almost certainly hydration state, not content.
|
// Anything over ~4KB is almost certainly hydration state, not content.
|
||||||
|
|
@ -821,4 +827,17 @@ mod tests {
|
||||||
"Compact untyped dropped: {out}"
|
"Compact untyped dropped: {out}"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn structured_data_keeps_compact_untyped_array() {
    // SvelteKit can emit compact arrays rather than objects; a small one
    // must survive the structured-data filter.
    let compact_array = serde_json::json!([
        { "title": "Hi", "body": "small array item" }
    ]);
    let r = make_result_with_structured(vec![compact_array]);

    let out = to_llm_text(&r, None);
    let item_kept = out.contains("small array item");
    assert!(item_kept, "Compact untyped array dropped: {out}");
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -367,11 +367,65 @@ fn inline_text(
|
||||||
|
|
||||||
/// Check whether a space is needed between two adjacent chunks of output.
///
/// Returns `true` only when joining `left` and `right` directly would mash
/// two tokens together. No separator is requested when:
///
/// * either chunk is empty, or the boundary already has whitespace;
/// * `right` starts with punctuation that binds to the preceding token
///   (`,`, `.`, `)`, an apostrophe, a closing quote, ...);
/// * `left` ends with punctuation that binds to the following token
///   (`(`, `[`, an opening quote, ...);
/// * `left` ends with a backtick or `)` and `right` is a short suffix such as
///   a plural `s` (see [`starts_with_inline_code_suffix`]).
///
/// The straight double quote `"` can either open or close a quotation, so it
/// is classified by its neighbour inside the same chunk instead of being
/// treated as both at once: `said` + `"hello"` needs a space, while
/// `hello` + `" loudly` and `He said "` + `hello` do not.
fn needs_separator(left: &str, right: &str) -> bool {
    let mut before = left.chars().rev();
    let mut after = right.chars();

    // An empty chunk behaves like whitespace: there is nothing to separate.
    let l = before.next().unwrap_or(' ');
    let r = after.next().unwrap_or(' ');

    if l.is_whitespace() || r.is_whitespace() {
        return false;
    }

    // Do not create "word ," / "word )" / "word 's" artifacts.
    let right_binds_to_left = match r {
        '"' => straight_quote_closes(after.next()),
        _ => is_closing_punctuation(r),
    };
    if right_binds_to_left {
        return false;
    }

    // Do not create "( word" / "[ 1" artifacts.
    let left_binds_to_right = match l {
        '"' => straight_quote_opens(before.next()),
        _ => is_opening_punctuation(l),
    };
    if left_binds_to_right {
        return false;
    }

    // Common inline-code suffixes: `Option`s, `x`'s. Treat them like a
    // single token rather than separating the text node.
    if matches!(l, '`' | ')') && starts_with_inline_code_suffix(right) {
        return false;
    }

    true
}

/// A straight `"` at the start of a chunk closes a quotation unless text
/// follows it directly (in which case it opens one, as in `"hello"`).
fn straight_quote_closes(following: Option<char>) -> bool {
    match following {
        None => true,
        Some(c) => c.is_whitespace() || is_closing_punctuation(c),
    }
}

/// A straight `"` at the end of a chunk opens a quotation unless text
/// precedes it directly (in which case it closes one, as in `"hello"`).
fn straight_quote_opens(preceding: Option<char>) -> bool {
    match preceding {
        None => true,
        Some(c) => c.is_whitespace() || is_opening_punctuation(c),
    }
}

/// Report whether `s` begins with a short suffix that should stay glued to
/// the chunk before it: an apostrophe form (`'s`, `’s`) or a lone `s` / `S`
/// that is followed by nothing, whitespace, closing punctuation, or an
/// emphasis marker.
///
/// Leading `*` / `_` emphasis markers are skipped first, so `*s* are` counts.
fn starts_with_inline_code_suffix(s: &str) -> bool {
    let mut chars = s
        .trim_start_matches(|c: char| matches!(c, '*' | '_'))
        .chars();

    match chars.next() {
        Some('\'' | '’') => true,
        Some('s' | 'S') => match chars.next() {
            None => true,
            Some(c) => c.is_whitespace() || is_closing_punctuation(c) || matches!(c, '*' | '_'),
        },
        _ => false,
    }
}

/// Punctuation that attaches to the token on its left.
///
/// The straight quotes `'` and `"` are listed here as well; `needs_separator`
/// disambiguates `"` by context before consulting this table.
fn is_closing_punctuation(c: char) -> bool {
    matches!(
        c,
        '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '}' | '%' | '\'' | '’' | '"' | '”'
    )
}

/// Punctuation that attaches to the token on its right.
fn is_opening_punctuation(c: char) -> bool {
    matches!(c, '(' | '[' | '{' | '"' | '“')
}
|
||||||
|
|
||||||
/// Collect raw text content (no markdown formatting).
|
/// Collect raw text content (no markdown formatting).
|
||||||
|
|
@ -1626,4 +1680,25 @@ mod tests {
|
||||||
"Element->Text->Element smashed together: {md}"
|
"Element->Text->Element smashed together: {md}"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn punctuation_after_inline_element_stays_attached() {
    // Punctuation directly after an inline element (<span>, <code>) must not
    // be pushed away from it by an inserted separator.
    let html = r#"<p><span>Hello</span>, world. Use <code>package.json</code>.</p>"#;
    let (md, _, _) = convert_html(html, None);

    let comma_attached = md.contains("Hello, world");
    assert!(comma_attached, "punctuation detached: {md}");

    let period_attached = md.contains("`package.json`.");
    assert!(period_attached, "code punctuation detached: {md}");
}
|
||||||
|
|
||||||
|
#[test]
fn inline_code_suffix_stays_attached() {
    // A plural "s" emphasised right after a linked code span belongs to the
    // same word and must stay glued to the link.
    let html = r#"<p><a href="https://example.com"><code>NullPointerException</code></a><em>s</em> are common.</p>"#;
    let (md, _, _) = convert_html(html, None);

    let suffix_attached =
        md.contains("[`NullPointerException`](https://example.com)*s* are common");
    assert!(suffix_attached, "code suffix detached: {md}");
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue