mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat: structured data in markdown/LLM output + v0.3.6
__NEXT_DATA__, SvelteKit, and JSON-LD now appear as a ## Structured Data section in -f markdown and -f llm output. Works with --only-main-content and all extraction flags. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b219fc3648
commit
344eea74d9
5 changed files with 37 additions and 7 deletions
10
CHANGELOG.md
10
CHANGELOG.md
|
|
@ -3,6 +3,16 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [0.3.6] — 2026-04-02
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **Structured data in markdown/LLM output**: `__NEXT_DATA__`, SvelteKit, and JSON-LD data now appears as a `## Structured Data` section with a JSON code block at the end of `-f markdown` and `-f llm` output. Works with `--only-main-content` and all other flags.
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- **Homebrew CI**: the formula now updates all 4 platform checksums after the Docker build completes, preventing SHA mismatches on Linux installs (#12).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.3.5] — 2026-04-02
|
## [0.3.5] — 2026-04-02
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
|
||||||
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -3055,7 +3055,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.3.5"
|
version = "0.3.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3075,7 +3075,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.3.5"
|
version = "0.3.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3093,7 +3093,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.3.5"
|
version = "0.3.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"calamine",
|
"calamine",
|
||||||
|
|
@ -3115,7 +3115,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.3.5"
|
version = "0.3.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3128,7 +3128,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.3.5"
|
version = "0.3.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3148,7 +3148,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.3.5"
|
version = "0.3.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
||||||
members = ["crates/*"]
|
members = ["crates/*"]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.3.5"
|
version = "0.3.6"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "AGPL-3.0"
|
license = "AGPL-3.0"
|
||||||
repository = "https://github.com/0xMassi/webclaw"
|
repository = "https://github.com/0xMassi/webclaw"
|
||||||
|
|
|
||||||
|
|
@ -535,6 +535,13 @@ fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata
|
||||||
out.push_str(&format_frontmatter(&result.metadata));
|
out.push_str(&format_frontmatter(&result.metadata));
|
||||||
}
|
}
|
||||||
out.push_str(&result.content.markdown);
|
out.push_str(&result.content.markdown);
|
||||||
|
if !result.structured_data.is_empty() {
|
||||||
|
out.push_str("\n\n## Structured Data\n\n```json\n");
|
||||||
|
out.push_str(
|
||||||
|
&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default(),
|
||||||
|
);
|
||||||
|
out.push_str("\n```");
|
||||||
|
}
|
||||||
out
|
out
|
||||||
}
|
}
|
||||||
OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
|
OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
|
||||||
|
|
@ -838,6 +845,12 @@ fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata:
|
||||||
print!("{}", format_frontmatter(&result.metadata));
|
print!("{}", format_frontmatter(&result.metadata));
|
||||||
}
|
}
|
||||||
println!("{}", result.content.markdown);
|
println!("{}", result.content.markdown);
|
||||||
|
if !result.structured_data.is_empty() {
|
||||||
|
println!(
|
||||||
|
"\n## Structured Data\n\n```json\n{}\n```",
|
||||||
|
serde_json::to_string_pretty(&result.structured_data).unwrap_or_default()
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
OutputFormat::Json => {
|
OutputFormat::Json => {
|
||||||
// serde_json::to_string_pretty won't fail on our types
|
// serde_json::to_string_pretty won't fail on our types
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,13 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -- 4. Structured data (NEXT_DATA, SvelteKit, JSON-LD) --
|
||||||
|
if !result.structured_data.is_empty() {
|
||||||
|
out.push_str("\n\n## Structured Data\n\n```json\n");
|
||||||
|
out.push_str(&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default());
|
||||||
|
out.push_str("\n```");
|
||||||
|
}
|
||||||
|
|
||||||
out.trim().to_string()
|
out.trim().to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue