fix: handle raw newlines in JSON-LD strings
Some checks are pending
CI / Test (push) Waiting to run
CI / Lint (push) Waiting to run
CI / Docs (push) Waiting to run

Sites like Bluesky emit JSON-LD with literal newline characters inside
string values (technically invalid JSON). Add sanitize_json_newlines()
fallback that escapes control characters inside quoted strings before
retrying the parse. This recovers ProfilePage, Product, and other
structured data that was previously silently dropped.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-04-16 11:40:25 +02:00
parent 78e198a347
commit 6316b1a6e7
5 changed files with 1266 additions and 8 deletions

12
Cargo.lock generated
View file

@ -3102,7 +3102,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-cli" name = "webclaw-cli"
version = "0.3.12" version = "0.3.13"
dependencies = [ dependencies = [
"clap", "clap",
"dotenvy", "dotenvy",
@ -3122,7 +3122,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-core" name = "webclaw-core"
version = "0.3.12" version = "0.3.13"
dependencies = [ dependencies = [
"ego-tree", "ego-tree",
"once_cell", "once_cell",
@ -3140,7 +3140,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-fetch" name = "webclaw-fetch"
version = "0.3.12" version = "0.3.13"
dependencies = [ dependencies = [
"bytes", "bytes",
"calamine", "calamine",
@ -3162,7 +3162,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-llm" name = "webclaw-llm"
version = "0.3.12" version = "0.3.13"
dependencies = [ dependencies = [
"async-trait", "async-trait",
"reqwest", "reqwest",
@ -3175,7 +3175,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-mcp" name = "webclaw-mcp"
version = "0.3.12" version = "0.3.13"
dependencies = [ dependencies = [
"dirs", "dirs",
"dotenvy", "dotenvy",
@ -3196,7 +3196,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-pdf" name = "webclaw-pdf"
version = "0.3.12" version = "0.3.13"
dependencies = [ dependencies = [
"pdf-extract", "pdf-extract",
"thiserror", "thiserror",

View file

@ -52,8 +52,14 @@ pub fn extract_json_ld(html: &str) -> Vec<Value> {
continue; continue;
} }
// Parse — some sites have arrays at top level // Try parsing as-is first, then retry with sanitized newlines.
match serde_json::from_str::<Value>(json_str) { // Many sites (e.g. Bluesky) emit JSON-LD with raw newlines inside
// string values which is technically invalid JSON.
let parsed = serde_json::from_str::<Value>(json_str).or_else(|_| {
let sanitized = sanitize_json_newlines(json_str);
serde_json::from_str::<Value>(&sanitized)
});
match parsed {
Ok(Value::Array(arr)) => results.extend(arr), Ok(Value::Array(arr)) => results.extend(arr),
Ok(val) => results.push(val), Ok(val) => results.push(val),
Err(_) => {} Err(_) => {}
@ -237,6 +243,45 @@ fn js_literal_to_json(input: &str) -> String {
out out
} }
/// Replace raw control characters inside JSON string values with escape
/// sequences. Walks the input tracking whether we're inside a quoted string;
/// any literal control character (U+0000..U+001F) found inside quotes is
/// replaced with its escape (`\n`, `\r`, `\t`, `\b`, `\f`, or generic
/// `\u00XX`). Characters outside strings, and sequences that are already
/// escaped, are left untouched.
fn sanitize_json_newlines(input: &str) -> String {
    use std::fmt::Write;

    let mut out = String::with_capacity(input.len());
    let mut in_string = false;
    let mut escape_next = false;
    for ch in input.chars() {
        // The character right after a backslash is part of an escape
        // sequence (`\n`, `\"`, `\\`, ...) — copy it through verbatim so we
        // neither double-escape it nor mistake `\"` for a closing quote.
        if escape_next {
            out.push(ch);
            escape_next = false;
            continue;
        }
        if ch == '\\' && in_string {
            out.push(ch);
            escape_next = true;
            continue;
        }
        if ch == '"' {
            in_string = !in_string;
            out.push(ch);
            continue;
        }
        if in_string && (ch as u32) < 0x20 {
            match ch {
                '\n' => out.push_str("\\n"),
                '\r' => out.push_str("\\r"),
                '\t' => out.push_str("\\t"),
                '\u{08}' => out.push_str("\\b"),
                '\u{0C}' => out.push_str("\\f"),
                // Remaining control characters have no short escape in JSON;
                // use the generic \u00XX form. write! into a String cannot
                // fail, so the Result is ignored.
                _ => {
                    let _ = write!(out, "\\u{:04x}", ch as u32);
                }
            }
        } else {
            out.push(ch);
        }
    }
    out
}
/// Extract content between balanced brackets, handling string escaping. /// Extract content between balanced brackets, handling string escaping.
fn extract_balanced(text: &str, open: u8, close: u8) -> Option<String> { fn extract_balanced(text: &str, open: u8, close: u8) -> Option<String> {
if text.as_bytes().first()? != &open { if text.as_bytes().first()? != &open {
@ -374,4 +419,25 @@ mod tests {
let results = extract_json_ld(html); let results = extract_json_ld(html);
assert_eq!(results.len(), 1); assert_eq!(results.len(), 1);
} }
#[test]
fn handles_raw_newlines_in_json_ld() {
    // Bluesky-style payload: literal newlines embedded inside a JSON string
    // value, which strict parsing rejects until sanitized.
    let html = "<script type=\"application/ld+json\">{\"@type\":\"ProfilePage\",\"mainEntity\":{\"name\":\"Jay\",\"description\":\"Founder @ Bluesky\n\nWorking on stuff\n🌱\"}}</script>";
    let extracted = extract_json_ld(html);
    assert_eq!(extracted.len(), 1);
    let doc = &extracted[0];
    assert_eq!(doc["@type"], "ProfilePage");
    let description = doc["mainEntity"]["description"]
        .as_str()
        .expect("description should be a string");
    assert!(description.contains("Founder"));
    assert!(description.contains("Working on stuff"));
}
#[test]
fn sanitize_preserves_valid_escapes() {
    // Mixes a pre-escaped "\n" with a raw newline: only the raw one may be
    // rewritten, and the result must round-trip through serde_json.
    let input = r#"{"text":"line1\nline2","raw":"has
newline"}"#;
    let cleaned = sanitize_json_newlines(input);
    let value: Value = serde_json::from_str(&cleaned).unwrap();
    assert_eq!(value["text"], "line1\nline2");
    assert_eq!(value["raw"], "has\nnewline");
}
} }

View file

@ -0,0 +1,168 @@
//! 1000-site benchmark using FetchClient (wreq backend).
//! Run: cargo test -p webclaw-fetch --test bench_1k --release -- --nocapture
use std::sync::Arc;
use std::time::Instant;
use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig};
/// Load benchmark targets from `targets_1000.txt` (or `$TARGETS_FILE`).
///
/// Each non-empty line has the form `name|url|kw1, kw2, ...`; the keyword
/// list is optional. Lines missing at least `name|url` are skipped rather
/// than panicking, so one malformed entry cannot abort the whole run.
///
/// # Panics
/// Panics when no targets file can be located or read — the benchmark is
/// meaningless without one.
fn load_targets() -> Vec<(String, String, Vec<String>)> {
    let candidates = [
        "targets_1000.txt",
        "../../targets_1000.txt",
        "../../../targets_1000.txt",
    ];
    let path = std::env::var("TARGETS_FILE")
        .ok()
        .or_else(|| {
            candidates
                .iter()
                .find(|p| std::path::Path::new(p).exists())
                .map(|s| s.to_string())
        })
        .expect("targets_1000.txt not found — set TARGETS_FILE env var");
    let content = std::fs::read_to_string(&path).expect("failed to read targets file");
    content
        .lines()
        .map(str::trim)
        .filter(|l| !l.is_empty())
        .filter_map(|l| {
            let mut parts = l.splitn(3, '|');
            // Require at least `name|url`; otherwise skip the line instead
            // of panicking on a missing field.
            let name = parts.next()?;
            let url = parts.next()?;
            let kw: Vec<String> = parts
                .next()
                .unwrap_or("")
                .split(',')
                .map(|s| s.trim().to_string())
                // An empty keyword would match every page (contains("") is
                // always true) and skew the pass rate, so drop it.
                .filter(|s| !s.is_empty())
                .collect();
            Some((name.to_string(), url.to_string(), kw))
        })
        .collect()
}
/// Load an optional proxy URL from `proxies.txt` (or `$PROXY_FILE`).
///
/// Accepts either a ready-to-use proxy URL, or the common
/// `host:port:user:pass` format which is rewritten to
/// `http://user:pass@host:port`. Returns `None` when no proxy file exists
/// or the file contains no usable line.
fn load_proxy() -> Option<String> {
    let candidates = ["proxies.txt", "../../proxies.txt", "../../../proxies.txt"];
    let path = std::env::var("PROXY_FILE").ok().or_else(|| {
        candidates
            .iter()
            .find(|p| std::path::Path::new(p).exists())
            .map(|s| s.to_string())
    })?;
    let content = std::fs::read_to_string(&path).ok()?;
    // Use the first non-empty trimmed line so a leading blank line doesn't
    // yield an empty proxy string.
    let line = content.lines().map(str::trim).find(|l| !l.is_empty())?;
    let p: Vec<&str> = line.split(':').collect();
    if p.len() == 4 {
        // host:port:user:pass -> http://user:pass@host:port
        Some(format!("http://{}:{}@{}:{}", p[2], p[3], p[0], p[1]))
    } else {
        Some(line.to_string())
    }
}
/// Classify a fetch outcome from the response body, its byte length, the
/// HTTP status, and the target's expected keywords.
///
/// Returns one of: "OK" (≥2 keyword hits on a substantial, non-challenge
/// page), "CHALLENGE" (bot-protection interstitial detected), "BLOCKED"
/// (403/429), "REDIRECT" (3xx), "EMPTY" (tiny body) or "UNCLEAR".
fn classify(body: &str, len: usize, status: u16, kw: &[String]) -> &'static str {
    let lower = body.to_lowercase();
    let challenge = lower.contains("just a moment")
        || lower.contains("verify you are human")
        || lower.contains("cf-chl-bypass")
        || lower.contains("challenge page")
        || lower.contains("pardon our interruption")
        || lower.contains("are you a robot")
        // Small pages mentioning "captcha" are interstitials; large ones may
        // merely discuss captchas, so gate on length.
        || (lower.contains("captcha") && len < 50000);
    // Keyword matching is case-insensitive on both sides (the body is
    // lowercased, so mixed-case keywords must be lowercased too), and empty
    // keywords are ignored because contains("") is always true and would
    // inflate the hit count.
    let hits = kw
        .iter()
        .filter(|k| !k.is_empty() && lower.contains(k.to_lowercase().as_str()))
        .count();
    if hits >= 2 && len > 5000 && !challenge {
        "OK"
    } else if challenge {
        "CHALLENGE"
    } else if status == 403 || status == 429 {
        "BLOCKED"
    } else if (300..400).contains(&status) {
        "REDIRECT"
    } else if len < 1000 {
        "EMPTY"
    } else {
        "UNCLEAR"
    }
}
/// End-to-end benchmark: fetch every target and bucket the outcome.
///
/// Targets are fetched in batches of 20 concurrent requests via a shared
/// `FetchClient`; each response is classified (OK / CHALLENGE / BLOCKED /
/// REDIRECT / EMPTY / UNCLEAR / ERROR) and a summary is printed at the end.
#[tokio::test]
async fn bench_1k_sites() {
    let targets = load_targets();
    let proxy = load_proxy();
    let config = FetchConfig {
        browser: BrowserProfile::Chrome,
        proxy,
        timeout: std::time::Duration::from_secs(12),
        ..Default::default()
    };
    let client = Arc::new(FetchClient::new(config).expect("build client"));
    println!(
        "\n=== webclaw-fetch + wreq — {} targets ===\n",
        targets.len()
    );
    let start = Instant::now();
    let mut pass = 0usize;
    let mut errors = 0usize;
    let mut challenges = 0usize;
    let mut blocked = 0usize;
    let mut redirects = 0usize;
    let mut unclear = 0usize;
    let total = targets.len();
    // Process in batches of 20 concurrent fetches to bound load on the
    // client and on the proxy.
    for chunk in targets.chunks(20) {
        let mut handles = Vec::new();
        for (name, url, kw) in chunk {
            let c = Arc::clone(&client);
            let url = url.clone();
            let name = name.clone();
            let kw = kw.clone();
            handles.push(tokio::spawn(async move {
                match c.fetch(&url).await {
                    Ok(result) => {
                        let v = classify(&result.html, result.html.len(), result.status, &kw);
                        (name, result.status, result.html.len(), v, String::new())
                    }
                    Err(e) => (name, 0u16, 0usize, "ERROR", format!("{e}")),
                }
            }));
        }
        for h in handles {
            match h.await {
                Ok((name, status, len, verdict, err)) => match verdict {
                    "OK" => pass += 1,
                    "CHALLENGE" => {
                        challenges += 1;
                        println!(" CHALLENGE {:<25} {:>4} {:>8}B", name, status, len);
                    }
                    "BLOCKED" => {
                        blocked += 1;
                        println!(" BLOCKED {:<25} {:>4} {:>8}B", name, status, len);
                    }
                    "REDIRECT" => redirects += 1,
                    "ERROR" => {
                        errors += 1;
                        // Truncate by characters, not bytes: `&err[..50]`
                        // panics when byte 50 falls inside a multi-byte char
                        // (error text can contain non-ASCII URLs).
                        let short: String = err.chars().take(50).collect();
                        println!(" ERROR {:<25} {}", name, short);
                    }
                    _ => unclear += 1,
                },
                // A panicked fetch task still counts as a failure instead of
                // silently vanishing from the totals.
                Err(_) => errors += 1,
            }
        }
    }
    let elapsed = start.elapsed();
    println!("\n{}", "=".repeat(60));
    // Guard the percentage against an empty targets file (avoids NaN%).
    let pct = if total == 0 {
        0.0
    } else {
        (pass as f64 / total as f64) * 100.0
    };
    println!(" PASS: {pass}/{total} ({pct:.0}%)");
    println!(" CHALLENGE: {challenges}");
    println!(" BLOCKED: {blocked}");
    println!(" REDIRECT: {redirects}");
    println!(" UNCLEAR: {unclear}");
    println!(" ERROR: {errors}");
    println!(" TIME: {:.1}s", elapsed.as_secs_f64());
    println!("{}", "=".repeat(60));
}

24
smithery.yaml Normal file
View file

@ -0,0 +1,24 @@
# Smithery configuration — https://smithery.ai/docs/build/project-config
# webclaw MCP server: web extraction for AI agents with bot-protection bypass
startCommand:
type: stdio
configSchema:
type: object
properties:
apiKey:
type: string
description: >
webclaw API key from webclaw.io. Optional — the server works
locally without one. Set this for automatic fallback to the
webclaw cloud API when a site has bot protection or requires
JS rendering.
secret: true
commandFunction: |
(config) => ({
command: 'webclaw-mcp',
args: [],
env: config.apiKey ? { WEBCLAW_API_KEY: config.apiKey } : {}
})
exampleConfig:
apiKey: wc_your_api_key_here

1000
targets_1000.txt Normal file

File diff suppressed because it is too large Load diff