mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-11 22:55:13 +02:00
Extraction ~22% faster on the corpus benchmark with byte-identical output: - hoist recompiled CSS selectors in the markdown noise path - single-pass shared og() meta parsing across vertical extractors - output-safe QuickJS gating (skip the JS VM when no candidate data) + reuse the already-parsed document instead of re-parsing - wreq connect_timeout + connection-pool tuning; dedup the retry loop Reliability + correctness: - char-boundary-safe truncation of LLM error bodies (shared helper) - HTTP connect/read timeouts on all LLM provider clients - isolate pdf-extract behind catch_unwind + spawn_blocking - OSS server: crawl inherits the shared fetch profile; ProviderChain built once in AppState; request TimeoutLayer API / safety / docs: - #[non_exhaustive] on public enums + result structs (+ builders) - #![forbid(unsafe_code)] on pure crates, deny on llm - //! crate docs + doctests; scrub bypass/vendor/target specifics from public crate docs and comments Tooling: [profile.release] lto/codegen-units/strip, MSRV pin, deny.toml + cargo-deny CI, macOS test matrix. CLI main.rs split into focused modules.
39 lines
1.1 KiB
TOML
39 lines
1.1 KiB
TOML
[package]
name = "webclaw-core"
# version, edition, license and rust-version are inherited from the workspace.
version.workspace = true
edition.workspace = true
# Reddit regression fixtures are real old.reddit.com pages read at test time;
# they're large and only needed to run the test suite from the repo, so keep
# them out of the published crate.
exclude = ["testdata/reddit/*.html"]
license.workspace = true
rust-version.workspace = true
description = "Pure HTML content extraction engine for LLMs"

# Use the lint configuration declared once at the workspace level.
[lints]
workspace = true

[features]
# `quickjs` is on by default.
default = ["quickjs"]
# Enables the optional `rquickjs` dependency, which is declared only for
# non-wasm32 targets.
quickjs = ["rquickjs"]

# Sorted alphabetically. `{ workspace = true }` entries take their version
# from the workspace manifest.
[dependencies]
ego-tree = "0.10"
once_cell = "1"
regex = "1"
scraper = "0.22"
serde = { workspace = true }
serde_json = { workspace = true }
similar = "2"
thiserror = { workspace = true }
tracing = { workspace = true }
url = { version = "2", features = ["serde"] }

# rquickjs links a C library and cannot build for wasm32. Gating it per
# target keeps the `quickjs` feature usable on native while leaving the
# crate WASM-safe even with default features enabled.
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }

# Test-only dependencies; tokio's version comes from the workspace manifest.
[dev-dependencies]
tokio = { workspace = true }