) -> ExtractionResult {
+ let mut r = make_result("# Body");
+ r.structured_data = values;
+ r
+ }
+
+    #[test]
+    fn structured_data_drops_chrome_types() {
+        // WebSite (like WebPage) is site chrome rather than content: expect no section at all.
+        let website = serde_json::json!({
+            "@type": "WebSite",
+            "name": "Example",
+            "url": "https://example.com"
+        });
+        let out = to_llm_text(&make_result_with_structured(vec![website]), None);
+        assert!(
+            !out.contains("## Structured Data"),
+            "WebSite chrome leaked into output: {out}"
+        );
+    }
+
+    #[test]
+    fn structured_data_keeps_article_types() {
+        let article = serde_json::json!({
+            "@type": "NewsArticle",
+            "headline": "Big news",
+            "datePublished": "2026-05-10"
+        });
+        let out = to_llm_text(&make_result_with_structured(vec![article]), None);
+        assert!(
+            out.contains("## Structured Data"),
+            "NewsArticle dropped: {out}"
+        );
+        assert!(out.contains("Big news")); // the headline value must appear in the output too
+    }
+
+    #[test]
+    fn structured_data_drops_oversized_blob() {
+        // pageProps-style record with no @type whose "noise" field is a 32 KiB string:
+        // the output must not contain a Structured Data section for it.
+        let blob = serde_json::json!({
+            "buildId": "abc",
+            "isFallback": false,
+            "noise": "x".repeat(32 * 1024)
+        });
+        let out = to_llm_text(&make_result_with_structured(vec![blob]), None);
+        assert!(
+            !out.contains("## Structured Data"),
+            "Oversized untyped blob leaked: len={}",
+            out.len()
+        );
+    }
+
+    #[test]
+    fn structured_data_keeps_compact_untyped() {
+        // A small record without @type (e.g. parsed pageProps with real content) is kept.
+        let compact = serde_json::json!({
+            "title": "Hi",
+            "body": "small enough to keep"
+        });
+        let out = to_llm_text(&make_result_with_structured(vec![compact]), None);
+        assert!(
+            out.contains("## Structured Data"),
+            "Compact untyped dropped: {out}"
+        );
+    }
}
diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs
index d0a2c23..cacadb2 100644
--- a/crates/webclaw-core/src/markdown.rs
+++ b/crates/webclaw-core/src/markdown.rs
@@ -320,6 +320,9 @@ fn children_to_md(
}
}
Node::Text(text) => {
+ if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
+ out.push(' ');
+ }
out.push_str(text);
}
_ => {}
@@ -350,6 +353,9 @@ fn inline_text(
}
}
Node::Text(text) => {
+ if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
+ out.push(' ');
+ }
out.push_str(text);
}
_ => {}
@@ -1606,4 +1612,18 @@ mod tests {
"collapse_whitespace stripped 6-space indent: {output}"
);
}
+
+    #[test]
+    fn text_after_inline_element_keeps_separator() {
+        // Reuters-style markup: <span>3h</span>ago<a>Tanker crosses...</a>
+        // The "ago" text node sits between two element children. Without a
+        // separator check on the Text branch, "ago" + "Tanker" would smash
+        // together as "agoTanker".
+        let html = r#"<p><span>3h</span>ago<a href="/tanker">Tanker crosses Strait</a></p>"#;
+        let (md, _, _) = convert_html(html, None);
+        assert!(
+            !md.contains("agoTanker"),
+            "Element->Text->Element smashed together: {md}"
+        );
+    }
}