mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
fix: handle raw newlines in JSON-LD strings
Sites like Bluesky emit JSON-LD with literal newline characters inside string values (technically invalid JSON). Add sanitize_json_newlines() fallback that escapes control characters inside quoted strings before retrying the parse. This recovers ProfilePage, Product, and other structured data that was previously silently dropped. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
78e198a347
commit
6316b1a6e7
5 changed files with 1266 additions and 8 deletions
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -3102,7 +3102,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.3.12"
|
version = "0.3.13"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3122,7 +3122,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.3.12"
|
version = "0.3.13"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3140,7 +3140,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.3.12"
|
version = "0.3.13"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"calamine",
|
"calamine",
|
||||||
|
|
@ -3162,7 +3162,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.3.12"
|
version = "0.3.13"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3175,7 +3175,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.3.12"
|
version = "0.3.13"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dirs",
|
"dirs",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3196,7 +3196,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.3.12"
|
version = "0.3.13"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
|
||||||
|
|
@ -52,8 +52,14 @@ pub fn extract_json_ld(html: &str) -> Vec<Value> {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse — some sites have arrays at top level
|
// Try parsing as-is first, then retry with sanitized newlines.
|
||||||
match serde_json::from_str::<Value>(json_str) {
|
// Many sites (e.g. Bluesky) emit JSON-LD with raw newlines inside
|
||||||
|
// string values which is technically invalid JSON.
|
||||||
|
let parsed = serde_json::from_str::<Value>(json_str).or_else(|_| {
|
||||||
|
let sanitized = sanitize_json_newlines(json_str);
|
||||||
|
serde_json::from_str::<Value>(&sanitized)
|
||||||
|
});
|
||||||
|
match parsed {
|
||||||
Ok(Value::Array(arr)) => results.extend(arr),
|
Ok(Value::Array(arr)) => results.extend(arr),
|
||||||
Ok(val) => results.push(val),
|
Ok(val) => results.push(val),
|
||||||
Err(_) => {}
|
Err(_) => {}
|
||||||
|
|
@ -237,6 +243,45 @@ fn js_literal_to_json(input: &str) -> String {
|
||||||
out
|
out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Replace raw control characters inside JSON string values with their
/// JSON escape sequences.
///
/// Some sites (e.g. Bluesky) emit JSON-LD containing literal newlines/tabs —
/// and occasionally other control characters — inside quoted strings, which
/// is invalid JSON (RFC 8259 forbids unescaped U+0000..U+001F in strings).
/// Walks the input tracking whether we're inside a quoted string; any literal
/// control character found inside quotes is replaced with `\n`/`\r`/`\t` or a
/// generic `\u00XX` escape. Already-escaped sequences (`\n`, `\"`, …) and
/// characters outside strings are left untouched.
fn sanitize_json_newlines(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut in_string = false;
    let mut escape_next = false;

    for ch in input.chars() {
        if escape_next {
            // Second character of an escape sequence: copy verbatim so valid
            // escapes like `\n` or `\"` are not double-processed.
            out.push(ch);
            escape_next = false;
            continue;
        }
        if ch == '\\' && in_string {
            out.push(ch);
            escape_next = true;
            continue;
        }
        if ch == '"' {
            // Unescaped quote toggles in/out of a string value.
            in_string = !in_string;
            out.push(ch);
            continue;
        }
        if in_string && (ch as u32) < 0x20 {
            // Raw control character inside a string: emit its JSON escape.
            match ch {
                '\n' => out.push_str("\\n"),
                '\r' => out.push_str("\\r"),
                '\t' => out.push_str("\\t"),
                // Any other control char (form feed, NUL, …) gets \u00XX.
                _ => out.push_str(&format!("\\u{:04x}", ch as u32)),
            }
        } else {
            out.push(ch);
        }
    }
    out
}
|
||||||
|
|
||||||
/// Extract content between balanced brackets, handling string escaping.
|
/// Extract content between balanced brackets, handling string escaping.
|
||||||
fn extract_balanced(text: &str, open: u8, close: u8) -> Option<String> {
|
fn extract_balanced(text: &str, open: u8, close: u8) -> Option<String> {
|
||||||
if text.as_bytes().first()? != &open {
|
if text.as_bytes().first()? != &open {
|
||||||
|
|
@ -374,4 +419,25 @@ mod tests {
|
||||||
let results = extract_json_ld(html);
|
let results = extract_json_ld(html);
|
||||||
assert_eq!(results.len(), 1);
|
assert_eq!(results.len(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn handles_raw_newlines_in_json_ld() {
    // Bluesky-style payload: the description value contains literal newline
    // characters (produced by the \n escapes in this source literal), which
    // is invalid JSON. Extraction should still succeed via the
    // sanitize_json_newlines() retry path.
    let html = "<script type=\"application/ld+json\">{\"@type\":\"ProfilePage\",\"mainEntity\":{\"name\":\"Jay\",\"description\":\"Founder @ Bluesky\n\nWorking on stuff\n🌱\"}}</script>";
    let results = extract_json_ld(html);
    assert_eq!(results.len(), 1);
    assert_eq!(results[0]["@type"], "ProfilePage");
    // The recovered string keeps its newlines as real characters after parse.
    let desc = results[0]["mainEntity"]["description"].as_str().unwrap();
    assert!(desc.contains("Founder"));
    assert!(desc.contains("Working on stuff"));
}
|
||||||
|
|
||||||
|
#[test]
fn sanitize_preserves_valid_escapes() {
    // Mixed input: "text" already carries a proper \n escape (two chars in
    // the raw string), while "raw" contains a literal newline. Sanitizing
    // must escape the raw one without double-escaping the valid one.
    let input = r#"{"text":"line1\nline2","raw":"has
newline"}"#;
    let sanitized = sanitize_json_newlines(input);
    let parsed: Value = serde_json::from_str(&sanitized).unwrap();
    assert_eq!(parsed["text"], "line1\nline2");
    assert_eq!(parsed["raw"], "has\nnewline");
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
168
crates/webclaw-fetch/tests/bench_1k.rs
Normal file
168
crates/webclaw-fetch/tests/bench_1k.rs
Normal file
|
|
@ -0,0 +1,168 @@
|
||||||
|
//! 1000-site benchmark using FetchClient (wreq backend).
|
||||||
|
//! Run: cargo test -p webclaw-fetch --test bench_1k --release -- --nocapture
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Instant;
|
||||||
|
use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig};
|
||||||
|
|
||||||
|
/// Load benchmark targets as `(name, url, keywords)` triples.
///
/// Each line of the targets file has the form `name|url|kw1, kw2, ...`.
/// The file is located via the `TARGETS_FILE` env var, falling back to
/// `targets_1000.txt` in the current directory or up to two parents.
/// Malformed lines (missing the `|`-separated name or URL) are skipped
/// rather than panicking, and empty keyword entries are dropped so they
/// can never inflate keyword-hit counts downstream.
///
/// # Panics
/// Panics if no targets file can be located or read.
fn load_targets() -> Vec<(String, String, Vec<String>)> {
    let candidates = [
        "targets_1000.txt",
        "../../targets_1000.txt",
        "../../../targets_1000.txt",
    ];
    let path = std::env::var("TARGETS_FILE")
        .ok()
        .or_else(|| {
            candidates
                .iter()
                .find(|p| std::path::Path::new(p).exists())
                .map(|s| s.to_string())
        })
        .expect("targets_1000.txt not found — set TARGETS_FILE env var");
    let content = std::fs::read_to_string(&path).expect("failed to read targets file");
    content
        .lines()
        .filter(|l| !l.trim().is_empty())
        .filter_map(|l| {
            let mut parts = l.splitn(3, '|');
            // Skip lines that lack a name or URL instead of panicking.
            let name = parts.next()?.to_string();
            let url = parts.next()?.to_string();
            let kw: Vec<String> = parts
                .next()
                .unwrap_or("")
                .split(',')
                .map(|s| s.trim().to_string())
                // Drop empties: "" matches every body via contains("").
                .filter(|s| !s.is_empty())
                .collect();
            Some((name, url, kw))
        })
        .collect()
}
|
||||||
|
|
||||||
|
/// Locate a proxy definition and return it as a URL, if one is configured.
///
/// Reads the first line of the file named by `PROXY_FILE`, or the first
/// `proxies.txt` found in the current directory or up to two parents.
/// A `host:port:user:pass` line is rewritten to `http://user:pass@host:port`;
/// any other line is returned verbatim. Returns `None` when no proxy file
/// exists or cannot be read.
fn load_proxy() -> Option<String> {
    let path = match std::env::var("PROXY_FILE") {
        Ok(explicit) => explicit,
        Err(_) => ["proxies.txt", "../../proxies.txt", "../../../proxies.txt"]
            .iter()
            .find(|candidate| std::path::Path::new(candidate).exists())
            .map(|s| s.to_string())?,
    };
    let content = std::fs::read_to_string(&path).ok()?;
    let first_line = content.lines().next()?;
    let fields: Vec<&str> = first_line.split(':').collect();
    match fields.as_slice() {
        // host:port:user:pass -> URL with inline credentials.
        [host, port, user, pass] => Some(format!("http://{}:{}@{}:{}", user, pass, host, port)),
        // Anything else is assumed to already be a usable proxy string.
        _ => Some(first_line.to_string()),
    }
}
|
||||||
|
|
||||||
|
/// Bucket a fetched page into a coarse verdict string.
///
/// A page is "OK" only when at least two of the site's keywords appear, the
/// body is non-trivial (> 5000 bytes) and no bot-challenge marker is present.
/// Otherwise the verdicts are checked in priority order: "CHALLENGE"
/// (bot-protection interstitial), "BLOCKED" (403/429), "REDIRECT" (3xx),
/// "EMPTY" (< 1000 bytes), and "UNCLEAR" as the catch-all.
fn classify(body: &str, len: usize, status: u16, kw: &[String]) -> &'static str {
    let lower = body.to_lowercase();
    // Phrases that only appear on bot-protection interstitial pages.
    let markers = [
        "just a moment",
        "verify you are human",
        "cf-chl-bypass",
        "challenge page",
        "pardon our interruption",
        "are you a robot",
    ];
    // "captcha" alone is too common on large real pages, hence the size cap.
    let challenge = markers.iter().any(|m| lower.contains(m))
        || (lower.contains("captcha") && len < 50000);
    let hits = kw.iter().filter(|k| lower.contains(k.as_str())).count();

    if !challenge && hits >= 2 && len > 5000 {
        return "OK";
    }
    if challenge {
        return "CHALLENGE";
    }
    if status == 403 || status == 429 {
        return "BLOCKED";
    }
    if (300..400).contains(&status) {
        return "REDIRECT";
    }
    if len < 1000 {
        return "EMPTY";
    }
    "UNCLEAR"
}
|
||||||
|
|
||||||
|
/// End-to-end benchmark: fetch every target and print a verdict summary.
///
/// NOTE(review): network-bound integration test — requires targets_1000.txt
/// (see load_targets), optionally a proxy file, and internet access. Meant
/// to be run manually with `--release -- --nocapture`, not in CI.
#[tokio::test]
async fn bench_1k_sites() {
    let targets = load_targets();
    let proxy = load_proxy();

    let config = FetchConfig {
        browser: BrowserProfile::Chrome,
        proxy,
        timeout: std::time::Duration::from_secs(12),
        ..Default::default()
    };

    // One client shared (via Arc) by every spawned fetch task.
    let client = Arc::new(FetchClient::new(config).expect("build client"));

    println!(
        "\n=== webclaw-fetch + wreq — {} targets ===\n",
        targets.len()
    );

    let start = Instant::now();
    let mut pass = 0usize;
    let mut errors = 0usize;
    let mut challenges = 0usize;
    let mut blocked = 0usize;
    let mut redirects = 0usize;
    let mut unclear = 0usize;
    let total = targets.len();

    // Process in batches of 20 concurrent
    for chunk in targets.chunks(20) {
        let mut handles = Vec::new();
        for (name, url, kw) in chunk {
            // Clone per-task inputs so each spawned future is 'static.
            let c = Arc::clone(&client);
            let url = url.clone();
            let name = name.clone();
            let kw = kw.clone();
            handles.push(tokio::spawn(async move {
                match c.fetch(&url).await {
                    Ok(result) => {
                        let v = classify(&result.html, result.html.len(), result.status, &kw);
                        (name, result.status, result.html.len(), v, String::new())
                    }
                    // Status 0 / length 0 mark transport-level failures.
                    Err(e) => (name, 0u16, 0usize, "ERROR", format!("{e}")),
                }
            }));
        }

        // Await the whole batch before starting the next one (caps concurrency).
        for h in handles {
            if let Ok((name, status, len, verdict, err)) = h.await {
                match verdict {
                    "OK" => pass += 1,
                    "CHALLENGE" => {
                        challenges += 1;
                        println!("  CHALLENGE {:<25} {:>4} {:>8}B", name, status, len);
                    }
                    "BLOCKED" => {
                        blocked += 1;
                        println!("  BLOCKED   {:<25} {:>4} {:>8}B", name, status, len);
                    }
                    "REDIRECT" => redirects += 1,
                    "ERROR" => {
                        errors += 1;
                        // Truncate long error messages for the table.
                        // NOTE(review): byte-index slicing can panic if byte 50
                        // is not a UTF-8 char boundary — consider a char-aware
                        // truncation.
                        let short = if err.len() > 50 { &err[..50] } else { &err };
                        println!("  ERROR     {:<25} {}", name, short);
                    }
                    _ => unclear += 1,
                }
            }
        }
    }

    let elapsed = start.elapsed();

    // Summary table.
    println!("\n{}", "=".repeat(60));
    println!(
        "  PASS:      {pass}/{total} ({:.0}%)",
        (pass as f64 / total as f64) * 100.0
    );
    println!("  CHALLENGE: {challenges}");
    println!("  BLOCKED:   {blocked}");
    println!("  REDIRECT:  {redirects}");
    println!("  UNCLEAR:   {unclear}");
    println!("  ERROR:     {errors}");
    println!("  TIME:      {:.1}s", elapsed.as_secs_f64());
    println!("{}", "=".repeat(60));
}
|
||||||
24
smithery.yaml
Normal file
24
smithery.yaml
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
# Smithery configuration — https://smithery.ai/docs/build/project-config
|
||||||
|
# webclaw MCP server: web extraction for AI agents with bot-protection bypass
|
||||||
|
|
||||||
|
startCommand:
|
||||||
|
type: stdio
|
||||||
|
configSchema:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
apiKey:
|
||||||
|
type: string
|
||||||
|
description: >
|
||||||
|
webclaw API key from webclaw.io. Optional — the server works
|
||||||
|
locally without one. Set this for automatic fallback to the
|
||||||
|
webclaw cloud API when a site has bot protection or requires
|
||||||
|
JS rendering.
|
||||||
|
secret: true
|
||||||
|
commandFunction: |
|
||||||
|
(config) => ({
|
||||||
|
command: 'webclaw-mcp',
|
||||||
|
args: [],
|
||||||
|
env: config.apiKey ? { WEBCLAW_API_KEY: config.apiKey } : {}
|
||||||
|
})
|
||||||
|
exampleConfig:
|
||||||
|
apiKey: wc_your_api_key_here
|
||||||
1000
targets_1000.txt
Normal file
1000
targets_1000.txt
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue