From 3c54bea300b10e3aad8e97b8607a79201e9755fa Mon Sep 17 00:00:00 2001 From: Valerio Date: Wed, 17 Jun 2026 16:41:45 +0200 Subject: [PATCH] perf: hot-path extraction speedups (selector hoist, shared og, QuickJS gating) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rescued from the stale perf/audit-fixes branch — the *perf-only* subset of that branch's big mixed commit, ported cleanly onto current main with byte-identical extraction output. - markdown: hoist the `img[alt]` / `a[href]` selectors out of the per-node noise path into `Lazy` statics (stop recompiling them per element). - extractors: single shared `og()` / `parse_og()` module replaces the per-field Open Graph re-scan duplicated across 7 vertical extractors (amazon, ebay, ecommerce, etsy, substack, trustpilot, youtube). Each vertical now does one pass. Raw-vs-unescaped behaviour preserved exactly. - core: gate the QuickJS VM on a cheap marker check (skip it entirely when the page has no JS-assigned data) and reuse the already-parsed document instead of re-parsing the HTML. - fetch: connection-pool tuning on the wreq client (connect_timeout, idle pool, max-idle-per-host, tcp keepalive) for connection reuse. Output-equivalence is covered by existing tests (amazon quot-entity, trustpilot title parse, ecommerce/youtube/etsy/substack og fallbacks) — all green. No new dependencies; no public API change. Deliberately EXCLUDED from this slice (separate concerns bundled in the original commit): the `#[non_exhaustive]` API-breaking changes, the LLM/PDF/ server reliability hardening (much already shipped in 0.6.8), the tooling (cargo-deny, release profile, MSRV), and the retry-loop dedup refactor (a code-cleanup with no runtime benefit — not worth churning client.rs for). Original work by the prior author on perf/audit-fixes; this re-applies only the performance subset onto main. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/webclaw-core/src/js_eval.rs | 31 ++++++++ crates/webclaw-core/src/lib.rs | 4 +- crates/webclaw-core/src/markdown.rs | 6 +- .../src/extractors/amazon_product.rs | 36 ++------- .../src/extractors/ebay_listing.rs | 22 ++---- .../src/extractors/ecommerce_product.rs | 23 ++---- .../src/extractors/etsy_listing.rs | 27 +++---- crates/webclaw-fetch/src/extractors/mod.rs | 1 + crates/webclaw-fetch/src/extractors/og.rs | 79 +++++++++++++++++++ .../src/extractors/substack_post.rs | 25 ++---- .../src/extractors/trustpilot_reviews.rs | 75 ++++++++---------- .../src/extractors/youtube_video.rs | 22 ++---- crates/webclaw-fetch/src/tls.rs | 6 +- 13 files changed, 200 insertions(+), 157 deletions(-) create mode 100644 crates/webclaw-fetch/src/extractors/og.rs diff --git a/crates/webclaw-core/src/js_eval.rs b/crates/webclaw-core/src/js_eval.rs index e1fb2de..2f78246 100644 --- a/crates/webclaw-core/src/js_eval.rs +++ b/crates/webclaw-core/src/js_eval.rs @@ -16,6 +16,29 @@ static SCRIPT_SELECTOR: Lazy = Lazy::new(|| Selector::parse("script"). static HTML_TAG_RE: Lazy = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap()); const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250); +/// Markers that, if absent from the HTML, prove the QuickJS scan cannot find +/// any data blob. The scan only ever surfaces `globalThis.__*` object/array +/// properties, and the seeded `__next_f` only emits when non-empty. Every +/// realistic way an inline script populates such a global goes through one of +/// these substrings (`window.`/`self.__next` assignments, or the +/// `__NEXT_DATA__`/`__NUXT__`/`application/json` payload conventions). If none +/// are present, running the VM is guaranteed to return zero blobs, so skipping +/// it is output-neutral. Conservative by design: any of these may appear in +/// non-script HTML too, which only makes us skip *less* often, never more. +const JS_CANDIDATE_MARKERS: [&str; 5] = [ + "window.", + "__NEXT_DATA__", + "__NUXT__", + "application/json", + "self.__next", +]; + +/// Returns true if the HTML plausibly contains JS-assigned data the QuickJS +/// scan could surface. When false, the VM is provably a no-op and is skipped. +pub fn has_js_candidate_data(html: &str) -> bool { + JS_CANDIDATE_MARKERS.iter().any(|m| html.contains(m)) +} + /// A blob of data extracted from JS execution. pub struct JsDataBlob { pub name: String, @@ -24,9 +47,17 @@ pub struct JsDataBlob { } /// Execute inline `