) -> ExtractionResult {
+ let mut r = make_result("# Body");
+ r.structured_data = values;
+ r
+ }
+
+    #[test]
+    fn structured_data_drops_chrome_types() {
+        // WebSite/WebPage records are framework chrome — should be dropped.
+        // With this record as the only entry, no section header may appear.
+        let website = serde_json::json!({
+            "@type": "WebSite",
+            "name": "Example",
+            "url": "https://example.com"
+        });
+        let result = make_result_with_structured(vec![website]);
+
+        let out = to_llm_text(&result, None);
+        let has_section = out.contains("## Structured Data");
+        assert!(!has_section, "WebSite chrome leaked into output: {out}");
+    }
+
+    #[test]
+    fn structured_data_keeps_article_types() {
+        // A NewsArticle record must be kept: the section header and the
+        // headline text are both expected in the output.
+        let article = serde_json::json!({
+            "@type": "NewsArticle",
+            "headline": "Big news",
+            "datePublished": "2026-05-10"
+        });
+        let result = make_result_with_structured(vec![article]);
+
+        let out = to_llm_text(&result, None);
+        let has_section = out.contains("## Structured Data");
+        assert!(has_section, "NewsArticle dropped: {out}");
+        assert!(out.contains("Big news"));
+    }
+
+    #[test]
+    fn structured_data_drops_oversized_blob() {
+        // 32KB pageProps-style blob with no @type — should be dropped.
+        const NOISE_LEN: usize = 32 * 1024;
+        let blob = serde_json::json!({
+            "buildId": "abc",
+            "isFallback": false,
+            "noise": "x".repeat(NOISE_LEN)
+        });
+        let result = make_result_with_structured(vec![blob]);
+
+        let out = to_llm_text(&result, None);
+        // The failure message reports only the output length, not the output.
+        assert!(
+            !out.contains("## Structured Data"),
+            "Oversized untyped blob leaked: len={}",
+            out.len()
+        );
+    }
+
+    #[test]
+    fn structured_data_keeps_compact_untyped() {
+        // Small untyped record (e.g. a parsed pageProps with real content) — keep.
+        let record = serde_json::json!({
+            "title": "Hi",
+            "body": "small enough to keep"
+        });
+        let result = make_result_with_structured(vec![record]);
+
+        let out = to_llm_text(&result, None);
+        let has_section = out.contains("## Structured Data");
+        assert!(has_section, "Compact untyped dropped: {out}");
+    }
+
+    #[test]
+    fn structured_data_keeps_compact_untyped_array() {
+        // SvelteKit can emit compact arrays rather than objects.
+        // The top-level value here is a one-element array, not an object.
+        let items = serde_json::json!([
+            { "title": "Hi", "body": "small array item" }
+        ]);
+        let result = make_result_with_structured(vec![items]);
+
+        let out = to_llm_text(&result, None);
+        let kept = out.contains("small array item");
+        assert!(kept, "Compact untyped array dropped: {out}");
+    }
}
diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs
index d0a2c23..2699166 100644
--- a/crates/webclaw-core/src/markdown.rs
+++ b/crates/webclaw-core/src/markdown.rs
@@ -320,6 +320,9 @@ fn children_to_md(
}
}
Node::Text(text) => {
+ if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
+ out.push(' ');
+ }
out.push_str(text);
}
_ => {}
@@ -350,6 +353,9 @@ fn inline_text(
}
}
Node::Text(text) => {
+ if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
+ out.push(' ');
+ }
out.push_str(text);
}
_ => {}
@@ -361,11 +367,65 @@ fn inline_text(
/// Check whether a space is needed between two adjacent chunks of output.
/// Returns true when the left side doesn't end with whitespace and the right
-/// side doesn't start with whitespace — i.e., two words would be mashed together.
+/// side doesn't start with whitespace, except around punctuation that should
+/// bind to the adjacent token.
fn needs_separator(left: &str, right: &str) -> bool {
- let l = left.as_bytes().last().copied().unwrap_or(b' ');
- let r = right.as_bytes().first().copied().unwrap_or(b' ');
- !l.is_ascii_whitespace() && !r.is_ascii_whitespace()
+    // An empty side has no boundary character; treat it like whitespace.
+    let (Some(prev), Some(next)) = (left.chars().next_back(), right.chars().next()) else {
+        return false;
+    };
+    if prev.is_whitespace() || next.is_whitespace() {
+        return false;
+    }
+
+    // Punctuation binds to its neighbour: never emit "word ," / "word )" /
+    // "word 's" on the right edge, nor "( word" / "[ 1" on the left edge.
+    if is_closing_punctuation(next) || is_opening_punctuation(prev) {
+        return false;
+    }
+
+    // Plural/possessive suffixes glued onto inline code (`Option`s, `x`'s)
+    // stay attached. Only a trailing backtick or `)` on the left qualifies.
+    let left_closes_span = matches!(prev, '`' | ')');
+    !(left_closes_span && starts_with_inline_code_suffix(right))
+}
+
+/// Reports whether `s` begins with a plural or possessive suffix of the kind
+/// written directly after inline code, e.g. the `s` in `Option`s or the `'s`
+/// in `x`'s. Leading `*` / `_` characters are skipped before the check.
+fn starts_with_inline_code_suffix(s: &str) -> bool {
+    let mut rest = s.trim_start_matches(['*', '_']).chars();
+
+    match rest.next() {
+        // Possessive: a straight or curly apostrophe is enough on its own.
+        Some('\'' | '’') => true,
+        // Plural: the `s` must stand alone — followed by nothing, whitespace,
+        // closing punctuation, or another `*` / `_`.
+        Some('s' | 'S') => match rest.next() {
+            None => true,
+            Some(after) => {
+                after.is_whitespace()
+                    || is_closing_punctuation(after)
+                    || matches!(after, '*' | '_')
+            }
+        },
+        // Empty input or any other leading character is not a suffix.
+        _ => false,
+    }
+}
+
+/// Returns `true` for characters that attach to the token before them:
+/// sentence punctuation, closing brackets, `%`, apostrophes, and the
+/// straight and right-curly double quotes.
+fn is_closing_punctuation(c: char) -> bool {
+    const CLOSERS: [char; 14] = [
+        '.', ',', ';', ':', '!', '?', ')', ']', '}', '%', '\'', '’', '"', '”',
+    ];
+    CLOSERS.contains(&c)
+}
+
+/// Returns `true` for characters that attach to the token after them:
+/// opening brackets, the straight double quote, and the left curly quotes.
+///
+/// The left single quote `‘` is included so it mirrors `“`; its closing
+/// counterpart `’` is already handled by `is_closing_punctuation`. The
+/// straight apostrophe is deliberately absent, since it usually ends a word.
+fn is_opening_punctuation(c: char) -> bool {
+    matches!(c, '(' | '[' | '{' | '"' | '“' | '‘')
}
/// Collect raw text content (no markdown formatting).
@@ -1606,4 +1666,39 @@ mod tests {
"collapse_whitespace stripped 6-space indent: {output}"
);
}
+
+    #[test]
+    fn text_after_inline_element_keeps_separator() {
+        // Reuters-style markup: <span>...</span>ago<span>Tanker crosses...</span>
+        // The "ago" text node sits between two element children. Without a
+        // separator check on the Text branch, "ago" + "Tanker" would smash
+        // together as "agoTanker".
+        //
+        // NOTE(review): the HTML tags in this fixture had been stripped, leaving
+        // plain text that already contained "agoTanker". They are reconstructed
+        // here; `<p>`/`<span>` are assumptions — confirm against upstream.
+        let html = r#"<p><span>3h</span>ago<span>Tanker crosses Strait</span></p>"#;
+        let (md, _, _) = convert_html(html, None);
+        assert!(
+            !md.contains("agoTanker"),
+            "Element->Text->Element smashed together: {md}"
+        );
+    }
+
+    #[test]
+    fn punctuation_after_inline_element_stays_attached() {
+        // A comma or full stop directly after an inline element must not be
+        // pushed away from it by an inserted space.
+        //
+        // NOTE(review): the HTML tags in this fixture had been stripped, so the
+        // backticked `package.json` could never appear in the output. `<code>`
+        // follows from the assertion below; `<p>`/`<span>` are assumptions —
+        // confirm against upstream.
+        let html = r#"<p><span>Hello</span>, world. Use <code>package.json</code>.</p>"#;
+        let (md, _, _) = convert_html(html, None);
+        assert!(md.contains("Hello, world"), "punctuation detached: {md}");
+        assert!(
+            md.contains("`package.json`."),
+            "code punctuation detached: {md}"
+        );
+    }
+
+    #[test]
+    fn inline_code_suffix_stays_attached() {
+        // A plural "s" rendered right after linked inline code must stay glued
+        // to the link instead of being separated by a space.
+        //
+        // NOTE(review): the HTML tags in this fixture had been stripped. The
+        // link, code and emphasis elements are reconstructed from the expected
+        // markdown asserted below; the `<p>` wrapper is an assumption — confirm
+        // against upstream.
+        let html = r#"<p><a href="https://example.com"><code>NullPointerException</code></a><em>s</em> are common.</p>"#;
+        let (md, _, _) = convert_html(html, None);
+        assert!(
+            md.contains("[`NullPointerException`](https://example.com)*s* are common"),
+            "code suffix detached: {md}"
+        );
+    }
}