Improve --format llm output quality (#37)
Some checks are pending
CI / Test (push) Waiting to run
CI / Lint (push) Waiting to run
CI / Docs (push) Waiting to run

Improve LLM-format output for modern news and documentation pages.

- Filter noisy hydration data and low-value page-chrome structured data while preserving content-bearing Schema.org records
- Fix element/text spacing without detaching punctuation on docs, forums, and reference pages
- Remove common accessibility link chrome from LLM text and link labels
- Bump workspace version to 0.6.0 and update the changelog

Thanks to Nenad Oric (@devnen) for the original PR and contribution.
This commit is contained in:
devnen 2026-05-10 15:11:12 +02:00 committed by GitHub
parent 7f75143954
commit e8ca1417d6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 371 additions and 16 deletions

14
Cargo.lock generated
View file

@@ -3219,7 +3219,7 @@ dependencies = [

 [[package]]
 name = "webclaw-cli"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "clap",
  "dotenvy",
@@ -3240,7 +3240,7 @@ dependencies = [

 [[package]]
 name = "webclaw-core"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "ego-tree",
  "once_cell",
@@ -3258,7 +3258,7 @@ dependencies = [

 [[package]]
 name = "webclaw-fetch"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "async-trait",
  "bytes",
@@ -3284,7 +3284,7 @@ dependencies = [

 [[package]]
 name = "webclaw-llm"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "async-trait",
  "reqwest",
@@ -3297,7 +3297,7 @@ dependencies = [

 [[package]]
 name = "webclaw-mcp"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "dirs",
  "dotenvy",
@@ -3317,7 +3317,7 @@ dependencies = [

 [[package]]
 name = "webclaw-pdf"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "pdf-extract",
  "thiserror",
@@ -3326,7 +3326,7 @@ dependencies = [

 [[package]]
 name = "webclaw-server"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "anyhow",
  "axum",