perf: hot-path extraction speedups (selector hoist, shared og, QuickJS gating)

Rescued from the stale perf/audit-fixes branch — the *perf-only* subset of that branch's big mixed commit, ported cleanly onto current main with byte-identical extraction output. - markdown: hoist the `img[alt]` / `a[href]` selectors out of the per-node noise path into `Lazy` statics (stop recompiling them per element). - extractors: single shared `og()` / `parse_og()` module replaces the per-field Open Graph re-scan duplicated across 7 vertical extractors (amazon, ebay, ecommerce, etsy, substack, trustpilot, youtube). Each vertical now does one pass. Raw-vs-unescaped behaviour preserved exactly. - core: gate the QuickJS VM on a cheap marker check (skip it entirely when the page has no JS-assigned data) and reuse the already-parsed document instead of re-parsing the HTML. - fetch: connection-pool tuning on the wreq client (connect_timeout, idle pool, max-idle-per-host, tcp keepalive) for connection reuse. Output-equivalence is covered by existing tests (amazon quot-entity, trustpilot title parse, ecommerce/youtube/etsy/substack og fallbacks) — all green. No new dependencies; no public API change. Deliberately EXCLUDED from this slice (separate concerns bundled in the original commit): the `#[non_exhaustive]` API-breaking changes, the LLM/PDF/ server reliability hardening (much already shipped in 0.6.8), the tooling (cargo-deny, release profile, MSRV), and the retry-loop dedup refactor (a code-cleanup with no runtime benefit — not worth churning client.rs for). Original work by the prior author on perf/audit-fixes; this re-applies only the performance subset onto main. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 03:39:37 +02:00 · 2026-06-17 16:41:45 +02:00 · 2026-06-17 16:41:45 +02:00 · 3c54bea300
commit 3c54bea300
parent 51d0c538f1
13 changed files with 200 additions and 157 deletions
--- a/crates/webclaw-core/src/js_eval.rs
+++ b/crates/webclaw-core/src/js_eval.rs
@ -16,6 +16,29 @@ static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").
 static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
 const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);

+/// Markers that, if absent from the HTML, prove the QuickJS scan cannot find
+/// any data blob. The scan only ever surfaces `globalThis.__*` object/array
+/// properties, and the seeded `__next_f` only emits when non-empty. Every
+/// realistic way an inline script populates such a global goes through one of
+/// these substrings (`window.`/`self.__next` assignments, or the
+/// `__NEXT_DATA__`/`__NUXT__`/`application/json` payload conventions). If none
+/// are present, running the VM is guaranteed to return zero blobs, so skipping
+/// it is output-neutral. Conservative by design: any of these may appear in
+/// non-script HTML too, which only makes us skip *less* often, never more.
+const JS_CANDIDATE_MARKERS: [&str; 5] = [
+    "window.",
+    "__NEXT_DATA__",
+    "__NUXT__",
+    "application/json",
+    "self.__next",
+];
+
+/// Returns true if the HTML plausibly contains JS-assigned data the QuickJS
+/// scan could surface. When false, the VM is provably a no-op and is skipped.
+pub fn has_js_candidate_data(html: &str) -> bool {
+    JS_CANDIDATE_MARKERS.iter().any(|m| html.contains(m))
+}
+
 /// A blob of data extracted from JS execution.
 pub struct JsDataBlob {
    pub name: String,
@ -24,9 +47,17 @@ pub struct JsDataBlob {
 }

 /// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data.
+///
+/// Convenience wrapper that parses `html` first. Hot callers that already hold a
+/// parsed [`Html`] should use [`extract_js_data_from_doc`] to avoid a second parse.
 pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
    let doc = Html::parse_document(html);
+    extract_js_data_from_doc(&doc)
+}

+/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data,
+/// reusing an already-parsed [`Html`] document instead of re-parsing the HTML.
+pub fn extract_js_data_from_doc(doc: &Html) -> Vec<JsDataBlob> {
    let scripts: Vec<String> = doc
        .select(&SCRIPT_SELECTOR)
        .filter(|el| {
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@ -222,8 +222,8 @@ fn extract_with_options_inner(
    // (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
    // static JSON data island extraction above with runtime-evaluated data.
    #[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
-    {
-        let blobs = js_eval::extract_js_data(html);
+    if js_eval::has_js_candidate_data(html) {
+        let blobs = js_eval::extract_js_data_from_doc(&doc);
        if !blobs.is_empty() {
            let js_text = js_eval::extract_readable_text(&blobs);
            if !js_text.is_empty() {
--- a/crates/webclaw-core/src/markdown.rs
+++ b/crates/webclaw-core/src/markdown.rs
@ -13,6 +13,8 @@ use crate::noise;
 use crate::types::{CodeBlock, Image, Link};

 static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());
+static IMG_ALT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("img[alt]").unwrap());
+static A_HREF_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("a[href]").unwrap());

 /// Maximum recursion depth for DOM traversal.
 /// Express.co.uk live blogs and similar pages can nest 1000+ levels deep,
@ -853,7 +855,7 @@ fn collect_assets_from_noise(
    assets: &mut ConvertedAssets,
 ) {
    // Collect images with alt text
-    for img in element.select(&Selector::parse("img[alt]").unwrap()) {
+    for img in element.select(&IMG_ALT_SELECTOR) {
        let alt = img.value().attr("alt").unwrap_or("").to_string();
        let src = img
            .value()
@ -866,7 +868,7 @@ fn collect_assets_from_noise(
    }

    // Collect links
-    for link in element.select(&Selector::parse("a[href]").unwrap()) {
+    for link in element.select(&A_HREF_SELECTOR) {
        let href = link
            .value()
            .attr("href")