fix: handle raw newlines in JSON-LD strings
Some checks are pending
CI / Test (push) Waiting to run
CI / Lint (push) Waiting to run
CI / Docs (push) Waiting to run

Sites like Bluesky emit JSON-LD with literal newline characters inside
string values (technically invalid JSON). Add a sanitize_json_newlines()
fallback that escapes control characters inside quoted strings before
retrying the parse. This recovers ProfilePage, Product, and other
structured data that was previously silently dropped.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-04-16 11:40:25 +02:00
parent 78e198a347
commit 6316b1a6e7
5 changed files with 1266 additions and 8 deletions

12
Cargo.lock generated
View file

@@ -3102,7 +3102,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
-version = "0.3.12"
+version = "0.3.13"
dependencies = [
"clap",
"dotenvy",
@@ -3122,7 +3122,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
-version = "0.3.12"
+version = "0.3.13"
dependencies = [
"ego-tree",
"once_cell",
@@ -3140,7 +3140,7 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
-version = "0.3.12"
+version = "0.3.13"
dependencies = [
"bytes",
"calamine",
@@ -3162,7 +3162,7 @@ dependencies = [
[[package]]
name = "webclaw-llm"
-version = "0.3.12"
+version = "0.3.13"
dependencies = [
"async-trait",
"reqwest",
@@ -3175,7 +3175,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
-version = "0.3.12"
+version = "0.3.13"
dependencies = [
"dirs",
"dotenvy",
@@ -3196,7 +3196,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
-version = "0.3.12"
+version = "0.3.13"
dependencies = [
"pdf-extract",
"thiserror",