mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-07 22:15:12 +02:00
Improve --format llm output quality (#37)
Some checks are pending
CI / Test (push) Waiting to run
CI / Lint (push) Waiting to run
CI / Docs (push) Waiting to run
Some checks are pending
CI / Test (push) Waiting to run
CI / Lint (push) Waiting to run
CI / Docs (push) Waiting to run
Improve LLM-format output for modern news and documentation pages. - Filter noisy hydration and low-value page chrome structured data while preserving content-bearing Schema.org records - Fix element/text spacing without detaching punctuation on docs, forums, and reference pages - Remove common accessibility link chrome from LLM text and link labels - Bump workspace version to 0.6.0 and update the changelog Thanks to Nenad Oric (@devnen) for the original PR and contribution.
This commit is contained in:
parent
7f75143954
commit
e8ca1417d6
8 changed files with 371 additions and 16 deletions
|
|
@ -46,15 +46,73 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
|||
}
|
||||
|
||||
// -- 4. Structured data (NEXT_DATA, SvelteKit, JSON-LD) --
|
||||
if !result.structured_data.is_empty() {
|
||||
out.push_str("\n\n## Structured Data\n\n```json\n");
|
||||
out.push_str(&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default());
|
||||
out.push_str("\n```");
|
||||
// Only emit useful items: Schema.org records with a meaningful @type,
|
||||
// and only if the total serialized size stays under a budget. Framework
|
||||
// hydration blobs (Next.js pageProps full of ad-targeting flags, build
|
||||
// IDs, schedule paths) explode to hundreds of KB and drown the LLM in
|
||||
// noise — drop them rather than ship them.
|
||||
let useful: Vec<_> = result
|
||||
.structured_data
|
||||
.iter()
|
||||
.filter(|v| is_useful_structured_data(v))
|
||||
.cloned()
|
||||
.collect();
|
||||
if !useful.is_empty() {
|
||||
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
|
||||
const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
|
||||
if serialized.len() <= STRUCTURED_DATA_MAX_BYTES {
|
||||
out.push_str("\n\n## Structured Data\n\n```json\n");
|
||||
out.push_str(&serialized);
|
||||
out.push_str("\n```");
|
||||
}
|
||||
}
|
||||
|
||||
out.trim().to_string()
|
||||
}
|
||||
|
||||
/// Decide whether a structured-data value carries content worth emitting.
|
||||
///
|
||||
/// Schema.org records with a recognizable content `@type` (Article, NewsArticle,
|
||||
/// Product, Recipe, FAQPage, HowTo, Event, Person, Organization, BreadcrumbList,
|
||||
/// VideoObject, JobPosting, etc.) are kept. Generic `WebSite` / `WebPage` /
|
||||
/// `ItemList` records and Next.js `pageProps`-style blobs without a useful
|
||||
/// `@type` are dropped — they're almost always navigation chrome or framework
|
||||
/// hydration state.
|
||||
fn is_useful_structured_data(v: &serde_json::Value) -> bool {
|
||||
let Some(obj) = v.as_object() else {
|
||||
// SvelteKit can emit compact arrays of page data. Keep those if they
|
||||
// are small enough to be useful, while still dropping giant hydration
|
||||
// arrays under the same budget as untyped objects.
|
||||
if v.is_array() {
|
||||
let serialized = serde_json::to_string(v).unwrap_or_default();
|
||||
return serialized.len() <= 4 * 1024;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
// JSON-LD: @type drives the decision.
|
||||
if let Some(t) = obj.get("@type") {
|
||||
let types: Vec<String> = match t {
|
||||
serde_json::Value::String(s) => vec![s.to_ascii_lowercase()],
|
||||
serde_json::Value::Array(a) => a
|
||||
.iter()
|
||||
.filter_map(|x| x.as_str())
|
||||
.map(str::to_ascii_lowercase)
|
||||
.collect(),
|
||||
_ => Vec::new(),
|
||||
};
|
||||
if types.is_empty() {
|
||||
return false;
|
||||
}
|
||||
// Drop low-info chrome types.
|
||||
const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"];
|
||||
return types.iter().any(|t| !DROP_TYPES.iter().any(|d| t == d));
|
||||
}
|
||||
// Next.js pageProps / SvelteKit data without @type: keep only if compact.
|
||||
// Anything over ~4KB is almost certainly hydration state, not content.
|
||||
let serialized = serde_json::to_string(v).unwrap_or_default();
|
||||
serialized.len() <= 4 * 1024
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Integration tests that exercise the full pipeline through to_llm_text
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -700,4 +758,86 @@ mod tests {
|
|||
assert!(out.contains("Some content"), "Content before lost: {out}");
|
||||
assert!(out.contains("More content"), "Content after lost: {out}");
|
||||
}
|
||||
|
||||
// -- Structured-data gating tests --
|
||||
|
||||
fn make_result_with_structured(values: Vec<serde_json::Value>) -> ExtractionResult {
|
||||
let mut r = make_result("# Body");
|
||||
r.structured_data = values;
|
||||
r
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn structured_data_drops_chrome_types() {
|
||||
// WebSite/WebPage records are framework chrome — should be dropped.
|
||||
let r = make_result_with_structured(vec![serde_json::json!({
|
||||
"@type": "WebSite",
|
||||
"name": "Example",
|
||||
"url": "https://example.com"
|
||||
})]);
|
||||
let out = to_llm_text(&r, None);
|
||||
assert!(
|
||||
!out.contains("## Structured Data"),
|
||||
"WebSite chrome leaked into output: {out}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn structured_data_keeps_article_types() {
|
||||
let r = make_result_with_structured(vec![serde_json::json!({
|
||||
"@type": "NewsArticle",
|
||||
"headline": "Big news",
|
||||
"datePublished": "2026-05-10"
|
||||
})]);
|
||||
let out = to_llm_text(&r, None);
|
||||
assert!(
|
||||
out.contains("## Structured Data"),
|
||||
"NewsArticle dropped: {out}"
|
||||
);
|
||||
assert!(out.contains("Big news"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn structured_data_drops_oversized_blob() {
|
||||
// 32KB pageProps-style blob with no @type — should be dropped.
|
||||
let big = "x".repeat(32 * 1024);
|
||||
let r = make_result_with_structured(vec![serde_json::json!({
|
||||
"buildId": "abc",
|
||||
"isFallback": false,
|
||||
"noise": big
|
||||
})]);
|
||||
let out = to_llm_text(&r, None);
|
||||
assert!(
|
||||
!out.contains("## Structured Data"),
|
||||
"Oversized untyped blob leaked: len={}",
|
||||
out.len()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn structured_data_keeps_compact_untyped() {
|
||||
// Small untyped record (e.g. a parsed pageProps with real content) — keep.
|
||||
let r = make_result_with_structured(vec![serde_json::json!({
|
||||
"title": "Hi",
|
||||
"body": "small enough to keep"
|
||||
})]);
|
||||
let out = to_llm_text(&r, None);
|
||||
assert!(
|
||||
out.contains("## Structured Data"),
|
||||
"Compact untyped dropped: {out}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn structured_data_keeps_compact_untyped_array() {
|
||||
// SvelteKit can emit compact arrays rather than objects.
|
||||
let r = make_result_with_structured(vec![serde_json::json!([
|
||||
{ "title": "Hi", "body": "small array item" }
|
||||
])]);
|
||||
let out = to_llm_text(&r, None);
|
||||
assert!(
|
||||
out.contains("small array item"),
|
||||
"Compact untyped array dropped: {out}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue