mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-10 22:45:13 +02:00
Extraction ~22% faster on the corpus benchmark with byte-identical output:
- hoist recompiled CSS selectors in the markdown noise path
- single-pass shared og() meta parsing across vertical extractors
- output-safe QuickJS gating (skip the JS VM when no candidate data) + reuse the already-parsed document instead of re-parsing
- wreq connect_timeout + connection-pool tuning; dedup the retry loop

Reliability + correctness:
- char-boundary-safe truncation of LLM error bodies (shared helper)
- HTTP connect/read timeouts on all LLM provider clients
- isolate pdf-extract behind catch_unwind + spawn_blocking
- OSS server: crawl inherits the shared fetch profile; ProviderChain built once in AppState; request TimeoutLayer

API / safety / docs:
- #[non_exhaustive] on public enums + result structs (+ builders)
- #![forbid(unsafe_code)] on pure crates, deny on llm
- //! crate docs + doctests; scrub bypass/vendor/target specifics from public crate docs and comments

Tooling: [profile.release] lto/codegen-units/strip, MSRV pin, deny.toml + cargo-deny CI, macOS test matrix.

CLI main.rs split into focused modules.
45 lines
1.5 KiB
TOML
45 lines
1.5 KiB
TOML
# Workspace root: every directory matching `crates/*` is a member crate.
# The feature resolver is pinned explicitly rather than inferred from a
# package edition.
[workspace]
resolver = "2"
members = ["crates/*"]

# Metadata shared by member crates that opt in with `<key>.workspace = true`.
[workspace.package]
version = "0.6.5"
# Edition 2024 was stabilized in Rust 1.85, so `rust-version` below is the
# lowest MSRV this edition allows; keep the two in sync.
edition = "2024"
rust-version = "1.85"
# NOTE(review): `AGPL-3.0` is a deprecated SPDX identifier; the current
# spellings are `AGPL-3.0-only` and `AGPL-3.0-or-later`. Confirm which one is
# intended (and that deny.toml agrees) before changing it.
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"

# Release profile tuned for the shipped binaries:
#   - thin LTO plus a single codegen unit enable cross-crate inlining on the
#     hot extraction path and shrink the binaries;
#   - stripping symbols trims the shipped artifact.
# `panic = "abort"` is deliberately NOT set: webclaw-pdf relies on
# std::panic::catch_unwind to recover from panics inside the pdf-extract
# parser, and abort would turn those recoverable panics into hard process
# kills.
[profile.release]
lto = "thin"
codegen-units = 1
strip = true

# Conservative, high-value hardening lints applied workspace-wide. Crates
# opt in via `[lints] workspace = true`. Kept deliberately narrow so
# `clippy -D warnings` stays green — the goal is hardening, not a cleanup
# sweep that would break the build.
[workspace.lints.rust]
# Already warn-by-default in edition 2024 (the workspace edition); listed
# explicitly so the level stays pinned in this manifest.
unsafe_op_in_unsafe_fn = "warn"

[workspace.lints.clippy]
# Restriction lint (allow-by-default in clippy): flags `std::mem::forget`,
# which skips destructors.
mem_forget = "warn"

# Single source of truth for dependency versions; member crates inherit an
# entry with `<name>.workspace = true`.
[workspace.dependencies]
# Internal crates, referenced by path.
webclaw-core = { path = "crates/webclaw-core" }
webclaw-fetch = { path = "crates/webclaw-fetch" }
webclaw-llm = { path = "crates/webclaw-llm" }
webclaw-pdf = { path = "crates/webclaw-pdf" }
# Third-party crates, sorted alphabetically.
clap = { version = "4", features = ["derive", "env"] }
dotenvy = "0.15"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
thiserror = "2"
tokio = { version = "1", features = ["full"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }