perf(core): hot-path extraction speedups + senior-grade hardening

Extraction ~22% faster on the corpus benchmark with byte-identical output:
- hoist recompiled CSS selectors in the markdown noise path
- single-pass shared og() meta parsing across vertical extractors
- output-safe QuickJS gating (skip the JS VM when no candidate data) + reuse the already-parsed document instead of re-parsing
- wreq connect_timeout + connection-pool tuning; dedup the retry loop

Reliability + correctness:
- char-boundary-safe truncation of LLM error bodies (shared helper)
- HTTP connect/read timeouts on all LLM provider clients
- isolate pdf-extract behind catch_unwind + spawn_blocking
- OSS server: crawl inherits the shared fetch profile; ProviderChain built once in AppState; request TimeoutLayer

API / safety / docs:
- #[non_exhaustive] on public enums + result structs (+ builders)
- #![forbid(unsafe_code)] on pure crates, deny on llm
- //! crate docs + doctests; scrub bypass/vendor/target specifics from public crate docs and comments

Tooling: [profile.release] lto/codegen-units/strip, MSRV pin, deny.toml + cargo-deny CI, macOS test matrix.
CLI main.rs split into focused modules.
2026-06-22 02:38:06 +02:00 · 2026-06-04 20:22:00 +02:00 · 2026-06-04 20:22:00 +02:00 · 02302e7a1d
commit 02302e7a1d
parent e499e51e70
62 changed files with 3761 additions and 3130 deletions
--- a/crates/webclaw-cli/examples/perf_corpus.rs
+++ b/crates/webclaw-cli/examples/perf_corpus.rs
@@ -48,7 +48,10 @@ async fn main() {
    match mode.as_str() {
        "capture" => capture().await,
        "bench" => {
-            let iters: usize = std::env::args().nth(2).and_then(|s| s.parse().ok()).unwrap_or(60);
+            let iters: usize = std::env::args()
+                .nth(2)
+                .and_then(|s| s.parse().ok())
+                .unwrap_or(60);
            bench(iters);
        }
        "snapshot" => {
@@ -64,14 +67,21 @@ async fn main() {

 async fn capture() {
    fs::create_dir_all(CORPUS).unwrap();
-    let config = FetchConfig { browser: BrowserProfile::Chrome, ..FetchConfig::default() };
+    let config = FetchConfig {
+        browser: BrowserProfile::Chrome,
+        ..FetchConfig::default()
+    };
    let client = FetchClient::new(config).expect("build client");
    let mut ok = 0;
    for (i, u) in URLS.iter().enumerate() {
        let name = format!(
            "{:02}_{}.html",
            i + 1,
-            u.replace("https://", "").chars().map(|c| if c.is_alphanumeric() { c } else { '_' }).take(40).collect::<String>()
+            u.replace("https://", "")
+                .chars()
+                .map(|c| if c.is_alphanumeric() { c } else { '_' })
+                .take(40)
+                .collect::<String>()
        );
        match client.fetch(u).await {
            Ok(f) if f.html.len() > 1000 => {
@@ -99,7 +109,9 @@ fn snapshot(label: &str) {
    let mut n = 0;
    for path in &files {
        let html = fs::read_to_string(path).unwrap_or_default();
-        if html.is_empty() { continue; }
+        if html.is_empty() {
+            continue;
+        }
        let stem = path.file_stem().unwrap().to_string_lossy().to_string();
        let url = format!("https://corpus/{stem}");
        match extract(&html, Some(&url)) {
@@ -117,7 +129,9 @@ fn snapshot(label: &str) {
 }

 fn percentile(sorted: &[u128], p: f64) -> u128 {
-    if sorted.is_empty() { return 0; }
+    if sorted.is_empty() {
+        return 0;
+    }
    let idx = ((sorted.len() as f64 - 1.0) * p).round() as usize;
    sorted[idx]
 }
@@ -135,7 +149,10 @@ fn bench(iters: usize) {
    }

    println!("# perf_corpus bench  docs={}  iters={}", files.len(), iters);
-    println!("{:<42} {:>10} {:>10} {:>10} {:>10}", "doc(KB)", "extract_us", "llm_us", "p50_us", "p90_us");
+    println!(
+        "{:<42} {:>10} {:>10} {:>10} {:>10}",
+        "doc(KB)", "extract_us", "llm_us", "p50_us", "p90_us"
+    );

    let mut grand_extract = 0u128;
    let mut grand_llm = 0u128;
@@ -143,8 +160,13 @@ fn bench(iters: usize) {

    for path in &files {
        let html = fs::read_to_string(path).unwrap_or_default();
-        if html.is_empty() { continue; }
-        let url = format!("https://corpus/{}", path.file_name().unwrap().to_string_lossy());
+        if html.is_empty() {
+            continue;
+        }
+        let url = format!(
+            "https://corpus/{}",
+            path.file_name().unwrap().to_string_lossy()
+        );

        // warmup
        for _ in 0..5 {
@@ -158,7 +180,10 @@ fn bench(iters: usize) {
        let mut total_times = Vec::with_capacity(iters);
        for _ in 0..iters {
            let t0 = Instant::now();
-            let ex = match extract(&html, Some(&url)) { Ok(e) => e, Err(_) => continue };
+            let ex = match extract(&html, Some(&url)) {
+                Ok(e) => e,
+                Err(_) => continue,
+            };
            let t1 = Instant::now();
            let txt = to_llm_text(&ex, Some(&url));
            let t2 = Instant::now();
@@ -178,11 +203,24 @@ fn bench(iters: usize) {
        grand_llm += llm_p50;
        grand_total_p50 += tot_p50;

-        let label = format!("{} ({}KB)", path.file_stem().unwrap().to_string_lossy(), html.len() / 1024);
-        println!("{:<42} {:>10} {:>10} {:>10} {:>10}", label.chars().take(42).collect::<String>(), ex_p50, llm_p50, tot_p50, tot_p90);
+        let label = format!(
+            "{} ({}KB)",
+            path.file_stem().unwrap().to_string_lossy(),
+            html.len() / 1024
+        );
+        println!(
+            "{:<42} {:>10} {:>10} {:>10} {:>10}",
+            label.chars().take(42).collect::<String>(),
+            ex_p50,
+            llm_p50,
+            tot_p50,
+            tot_p90
+        );
    }

    println!("---");
-    println!("CORPUS_PASS_P50_SUM_US extract={grand_extract} llm={grand_llm} total={grand_total_p50}");
+    println!(
+        "CORPUS_PASS_P50_SUM_US extract={grand_extract} llm={grand_llm} total={grand_total_p50}"
+    );
    println!("(lower is better; total = one full extract+llm pass over the whole corpus at p50)");
 }