perf: hot-path extraction speedups (selector hoist, shared og, QuickJS gating)

Rescued from the stale perf/audit-fixes branch — the *perf-only* subset of
that branch's big mixed commit, ported cleanly onto current main with
byte-identical extraction output.

- markdown: hoist the `img[alt]` / `a[href]` selectors out of the per-node
  noise path into `Lazy` statics (stop recompiling them per element).
- extractors: single shared `og()` / `parse_og()` module replaces the
  per-field Open Graph re-scan duplicated across 7 vertical extractors
  (amazon, ebay, ecommerce, etsy, substack, trustpilot, youtube). Each
  vertical now does one pass. Raw-vs-unescaped behaviour preserved exactly.
- core: gate the QuickJS VM on a cheap marker check (skip it entirely when
  the page has no JS-assigned data) and reuse the already-parsed document
  instead of re-parsing the HTML.
- fetch: connection-pool tuning on the wreq client (connect_timeout, idle
  pool, max-idle-per-host, tcp keepalive) for connection reuse.

Output-equivalence is covered by existing tests (amazon quot-entity,
trustpilot title parse, ecommerce/youtube/etsy/substack og fallbacks) — all
green. No new dependencies; no public API change.

Deliberately EXCLUDED from this slice (separate concerns bundled in the
original commit): the `#[non_exhaustive]` API-breaking changes, the
LLM/PDF/server reliability hardening (much already shipped in 0.6.8), the
tooling (cargo-deny, release profile, MSRV), and the retry-loop dedup
refactor (a code-cleanup with no runtime benefit — not worth churning
client.rs for).

Original work by the prior author on perf/audit-fixes; this re-applies only
the performance subset onto main.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 23:55:13 +02:00 · 2026-06-17 16:41:45 +02:00 · 2026-06-17 16:41:45 +02:00 · 3c54bea300
commit 3c54bea300
parent 51d0c538f1
13 changed files with 200 additions and 157 deletions
--- a/crates/webclaw-core/src/js_eval.rs
+++ b/crates/webclaw-core/src/js_eval.rs
@ -16,6 +16,29 @@ static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").
 static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
 const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);

+/// Cheap substring pre-filter for the QuickJS scan. The scan only ever
+/// surfaces `globalThis.__*` object/array properties, and the seeded
+/// `__next_f` only emits when non-empty. These markers cover the common ways
+/// an inline script populates such a global (`window.`/`self.__next`
+/// assignments, or the `__NEXT_DATA__`/`__NUXT__`/`application/json` payload
+/// conventions). This is a heuristic, not a proof: a page that only uses
+/// `window["__X"] = ...`, `globalThis.__X = ...` or `self.__X = ...` matches
+/// none of them and is skipped even though the VM might have surfaced it.
+/// Markers in non-script HTML only make us skip *less* often, never more.
+const JS_CANDIDATE_MARKERS: [&str; 5] = [
+    "window.",
+    "__NEXT_DATA__",
+    "__NUXT__",
+    "application/json",
+    "self.__next",
+];
+
+/// Returns true if the HTML contains any substring that suggests JS-assigned
+/// data the QuickJS scan could surface. Heuristic: false means "no marker".
+pub fn has_js_candidate_data(html: &str) -> bool {
+    JS_CANDIDATE_MARKERS.iter().any(|m| html.contains(m))
+}
+
 /// A blob of data extracted from JS execution.
 pub struct JsDataBlob {
    pub name: String,
@ -24,9 +47,17 @@ pub struct JsDataBlob {
 }

 /// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data.
+///
+/// Convenience wrapper that parses `html` first. Hot callers that already hold a
+/// parsed [`Html`] should use [`extract_js_data_from_doc`] to avoid a second parse.
 pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
    let doc = Html::parse_document(html);
+    extract_js_data_from_doc(&doc)
+}

+/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data,
+/// reusing an already-parsed [`Html`] document instead of re-parsing the HTML.
+pub fn extract_js_data_from_doc(doc: &Html) -> Vec<JsDataBlob> {
    let scripts: Vec<String> = doc
        .select(&SCRIPT_SELECTOR)
        .filter(|el| {
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@ -222,8 +222,8 @@ fn extract_with_options_inner(
    // (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
    // static JSON data island extraction above with runtime-evaluated data.
    #[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
-    {
-        let blobs = js_eval::extract_js_data(html);
+    if js_eval::has_js_candidate_data(html) {
+        let blobs = js_eval::extract_js_data_from_doc(&doc);
        if !blobs.is_empty() {
            let js_text = js_eval::extract_readable_text(&blobs);
            if !js_text.is_empty() {
--- a/crates/webclaw-core/src/markdown.rs
+++ b/crates/webclaw-core/src/markdown.rs
@ -13,6 +13,8 @@ use crate::noise;
 use crate::types::{CodeBlock, Image, Link};

 static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());
+static IMG_ALT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("img[alt]").unwrap());
+static A_HREF_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("a[href]").unwrap());

 /// Maximum recursion depth for DOM traversal.
 /// Express.co.uk live blogs and similar pages can nest 1000+ levels deep,
@ -853,7 +855,7 @@ fn collect_assets_from_noise(
    assets: &mut ConvertedAssets,
 ) {
    // Collect images with alt text
-    for img in element.select(&Selector::parse("img[alt]").unwrap()) {
+    for img in element.select(&IMG_ALT_SELECTOR) {
        let alt = img.value().attr("alt").unwrap_or("").to_string();
        let src = img
            .value()
@ -866,7 +868,7 @@ fn collect_assets_from_noise(
    }

    // Collect links
-    for link in element.select(&Selector::parse("a[href]").unwrap()) {
+    for link in element.select(&A_HREF_SELECTOR) {
        let href = link
            .value()
            .attr("href")
--- a/crates/webclaw-fetch/src/extractors/amazon_product.rs
+++ b/crates/webclaw-fetch/src/extractors/amazon_product.rs
@ -33,6 +33,7 @@ use serde_json::{Value, json};
 use url::Url;

 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::cloud::{self, CloudError};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
@ -115,23 +116,25 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro
 /// without carrying webclaw_fetch types.
 pub fn parse(html: &str, url: &str, asin: &str) -> Value {
    let jsonld = find_product_jsonld(html);
+    // Single scan for the og:* fallbacks read below.
+    let og_meta = parse_og(html);
    // Three-tier title: JSON-LD `name` > Amazon's `#productTitle` span
    // (only present on real static HTML) > cloud-synthesized og:title.
    let title = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "name"))
        .or_else(|| dom_title(html))
-        .or_else(|| og(html, "title"));
+        .or_else(|| og_meta.unescaped("title"));
    let image = jsonld
        .as_ref()
        .and_then(get_first_image)
        .or_else(|| dom_image(html))
-        .or_else(|| og(html, "image"));
+        .or_else(|| og_meta.unescaped("image"));
    let brand = jsonld.as_ref().and_then(get_brand);
    let description = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "description"))
-        .or_else(|| og(html, "description"));
+        .or_else(|| og_meta.unescaped("description"));
    let aggregate_rating = jsonld.as_ref().and_then(get_aggregate_rating);
    let offer = jsonld.as_ref().and_then(first_offer);

@ -336,31 +339,6 @@ fn dom_image(html: &str) -> Option<String> {
        .map(|m| m.as_str().to_string())
 }

-/// OG meta tag lookup. Cloud-synthesized HTML ships these even when
-/// JSON-LD and Amazon-DOM-IDs are both absent, so they're the last
-/// line of defence for `title`, `image`, `description`.
-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| html_unescape(m.as_str()));
-        }
-    }
-    None
-}
-
-/// Undo the synthesize_html attribute escaping for the few entities it
-/// emits. Keeps us off a heavier HTML-entity dep.
-fn html_unescape(s: &str) -> String {
-    s.replace("&quot;", "\"")
-        .replace("&amp;", "&")
-        .replace("&lt;", "<")
-        .replace("&gt;", ">")
-}
-
 fn cloud_to_fetch_err(e: CloudError) -> FetchError {
    FetchError::Build(e.to_string())
 }
@ -477,7 +455,7 @@ mod tests {
    fn og_unescape_handles_quot_entity() {
        let html = r#"<meta property="og:title" content="Apple &quot;M2 Pro&quot; Laptop">"#;
        assert_eq!(
-            og(html, "title").as_deref(),
+            parse_og(html).unescaped("title").as_deref(),
            Some(r#"Apple "M2 Pro" Laptop"#)
        );
    }
--- a/crates/webclaw-fetch/src/extractors/ebay_listing.rs
+++ b/crates/webclaw-fetch/src/extractors/ebay_listing.rs
@ -15,6 +15,7 @@ use serde_json::{Value, json};
 use url::Url;

 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::cloud::{self, CloudError};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
@ -65,19 +66,21 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro

 pub fn parse(html: &str, url: &str, item_id: &str) -> Value {
    let jsonld = find_product_jsonld(html);
+    // Single scan for the three og:* fields read as fallbacks below.
+    let og_meta = parse_og(html);
    let title = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "name"))
-        .or_else(|| og(html, "title"));
+        .or_else(|| og_meta.raw("title"));
    let image = jsonld
        .as_ref()
        .and_then(get_first_image)
-        .or_else(|| og(html, "image"));
+        .or_else(|| og_meta.raw("image"));
    let brand = jsonld.as_ref().and_then(get_brand);
    let description = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "description"))
-        .or_else(|| og(html, "description"));
+        .or_else(|| og_meta.raw("description"));
    let offer = jsonld.as_ref().and_then(first_offer);

    // eBay's AggregateOffer uses lowPrice/highPrice. Offer uses price.
@ -268,19 +271,6 @@ fn get_aggregate_rating(v: &Value) -> Option<Value> {
    }))
 }

-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| m.as_str().to_string());
-        }
-    }
-    None
-}
-
 fn cloud_to_fetch_err(e: CloudError) -> FetchError {
    FetchError::Build(e.to_string())
 }
--- a/crates/webclaw-fetch/src/extractors/ecommerce_product.rs
+++ b/crates/webclaw-fetch/src/extractors/ecommerce_product.rs
@ -42,6 +42,7 @@ use regex::Regex;
 use serde_json::{Value, json};

 use super::ExtractorInfo;
+use super::og::{og, parse_og};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;

@ -142,15 +143,17 @@ fn build_jsonld_payload(product: &Value, html: &str, url: &str) -> Value {
 /// Build a minimal payload from OG / product meta tags. Used when a
 /// page has no Product JSON-LD at all.
 fn build_og_payload(html: &str, url: &str) -> Value {
+    // Single scan for the three og:* fields this fallback reads.
+    let og_meta = parse_og(html);
    let offers = build_og_offer(html).map(|o| vec![o]).unwrap_or_default();
-    let image = og(html, "image");
+    let image = og_meta.raw("image");
    let images: Vec<Value> = image.map(|i| vec![Value::String(i)]).unwrap_or_default();

    json!({
        "url":                url,
        "data_source":        "og_fallback",
-        "name":               og(html, "title"),
-        "description":        og(html, "description"),
+        "name":               og_meta.raw("title"),
+        "description":        og_meta.raw("description"),
        "brand":              meta_property(html, "product:brand"),
        "sku":                None::<String>,
        "mpn":                None::<String>,
@ -368,20 +371,6 @@ fn build_og_offer(html: &str) -> Option<Value> {
    }))
 }

-/// Pull the value of `<meta property="og:{prop}" content="...">`.
-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| m.as_str().to_string());
-        }
-    }
-    None
-}
-
 /// Pull the value of any `<meta property="..." content="...">` tag.
 /// Needed for namespaced OG variants like `product:price:amount` that
 /// the simple `og:*` matcher above doesn't cover.
--- a/crates/webclaw-fetch/src/extractors/etsy_listing.rs
+++ b/crates/webclaw-fetch/src/extractors/etsy_listing.rs
@ -26,6 +26,7 @@ use regex::Regex;
 use serde_json::{Value, json};

 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::cloud::{self, CloudError};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
@ -74,19 +75,26 @@ pub fn parse(html: &str, url: &str, listing_id: &str) -> Value {
    let jsonld = find_product_jsonld(html);
    let slug_title = humanise_slug(parse_slug(url).as_deref());

+    // Single scan for the three og:* fields used as fallbacks below.
+    let og_meta = parse_og(html);
+
    let title = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "name"))
-        .or_else(|| og(html, "title").filter(|t| !is_generic_title(t)))
+        .or_else(|| og_meta.raw("title").filter(|t| !is_generic_title(t)))
        .or(slug_title);
    let description = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "description"))
-        .or_else(|| og(html, "description").filter(|d| !is_generic_description(d)));
+        .or_else(|| {
+            og_meta
+                .raw("description")
+                .filter(|d| !is_generic_description(d))
+        });
    let image = jsonld
        .as_ref()
        .and_then(get_first_image)
-        .or_else(|| og(html, "image"));
+        .or_else(|| og_meta.raw("image"));
    let brand = jsonld.as_ref().and_then(get_brand);

    // Etsy listings often ship either a single Offer or an
@ -359,19 +367,6 @@ fn strip_schema_prefix(s: String) -> String {
        .replace("https://schema.org/", "")
 }

-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| m.as_str().to_string());
-        }
-    }
-    None
-}
-
 /// Etsy links the owning shop with a canonical anchor like
 /// `<a href="/shop/ShopName" ...>`. Grab the first one after the
 /// breadcrumb boundary.
--- a/crates/webclaw-fetch/src/extractors/mod.rs
+++ b/crates/webclaw-fetch/src/extractors/mod.rs
@ -33,6 +33,7 @@ pub mod instagram_post;
 pub mod instagram_profile;
 pub mod linkedin_post;
 pub mod npm;
+pub(crate) mod og;
 pub mod pypi;
 pub mod reddit;
 pub mod shopify_collection;
--- a/crates/webclaw-fetch/src/extractors/og.rs
+++ b/crates/webclaw-fetch/src/extractors/og.rs
@ -0,0 +1,79 @@
+//! Shared Open Graph (`og:*`) meta-tag parsing for the HTML vertical
+//! extractors.
+//!
+//! Several site extractors read a handful of `og:*` properties (title,
+//! description, image, ...) from the page `<head>`. Each used to carry a
+//! verbatim copy of the same regex + scan helper. This module centralises
+//! that logic and adds [`parse_og`], which collects every `og:*` pair in a
+//! single `captures_iter` pass so an extractor that needs multiple fields
+//! scans the document once instead of once per field.
+//!
+//! Values are stored raw. Entity decoding is opt-in per call site to preserve
+//! output: read via [`OgMeta::unescaped`] (which applies [`html_unescape`]) to
+//! decode, or [`OgMeta::raw`] where an extractor intentionally keeps raw text.
+
+use std::collections::HashMap;
+use std::sync::OnceLock;
+
+use regex::Regex;
+
+/// Matches `<meta property="og:<name>" content="<value>">`, case-insensitive.
+/// Capture 1 is the property suffix (after `og:`), capture 2 is the content.
+fn og_regex() -> &'static Regex {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    RE.get_or_init(|| {
+        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
+    })
+}
+
+/// Return the raw content of the first `og:<prop>` meta tag, if present.
+///
+/// Single-pass per call. For extractors reading several properties, prefer
+/// [`parse_og`] to scan the document only once.
+pub(crate) fn og(html: &str, prop: &str) -> Option<String> {
+    for c in og_regex().captures_iter(html) {
+        if c.get(1).is_some_and(|m| m.as_str() == prop) {
+            return c.get(2).map(|m| m.as_str().to_string());
+        }
+    }
+    None
+}
+
+/// Parse every `og:*` meta tag in one pass into a `suffix -> content` map.
+///
+/// First occurrence wins, matching the short-circuit-on-first-match
+/// behaviour of [`og`] when called per property. Values are raw (not
+/// entity-decoded); use [`OgMeta::unescaped`] / [`OgMeta::raw`] to read.
+pub(crate) fn parse_og(html: &str) -> OgMeta {
+    let mut map: HashMap<String, String> = HashMap::new();
+    for c in og_regex().captures_iter(html) {
+        if let (Some(name), Some(content)) = (c.get(1), c.get(2)) {
+            map.entry(name.as_str().to_string())
+                .or_insert_with(|| content.as_str().to_string());
+        }
+    }
+    OgMeta(map)
+}
+
+/// Parsed `og:*` properties from a single document scan.
+pub(crate) struct OgMeta(HashMap<String, String>);
+
+impl OgMeta {
+    /// Raw content of `og:<prop>`, exactly as it appeared in the HTML.
+    pub(crate) fn raw(&self, prop: &str) -> Option<String> {
+        self.0.get(prop).cloned()
+    }
+
+    /// Content of `og:<prop>` with the common HTML entities decoded.
+    pub(crate) fn unescaped(&self, prop: &str) -> Option<String> {
+        self.0.get(prop).map(|v| html_unescape(v))
+    }
+}
+
+/// Decode `&quot;`, `&amp;`, `&lt;`, `&gt;` via sequential replaces (so `&amp;lt;` ends up as `<`).
+pub(crate) fn html_unescape(s: &str) -> String {
+    s.replace("&quot;", "\"")
+        .replace("&amp;", "&")
+        .replace("&lt;", "<")
+        .replace("&gt;", ">")
+}
--- a/crates/webclaw-fetch/src/extractors/substack_post.rs
+++ b/crates/webclaw-fetch/src/extractors/substack_post.rs
@ -28,6 +28,7 @@ use serde::Deserialize;
 use serde_json::{Value, json};

 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::cloud::{self, CloudError};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
@ -181,24 +182,27 @@ async fn html_fallback(
 pub fn parse_html(html: &str, url: &str, api_url: &str, slug: &str) -> Value {
    let article = find_article_jsonld(html);

+    // Single scan for the four og:* fields read as fallbacks below.
+    let og_meta = parse_og(html);
+
    let title = article
        .as_ref()
        .and_then(|v| get_text(v, "headline"))
-        .or_else(|| og(html, "title"));
+        .or_else(|| og_meta.raw("title"));
    let description = article
        .as_ref()
        .and_then(|v| get_text(v, "description"))
-        .or_else(|| og(html, "description"));
+        .or_else(|| og_meta.raw("description"));
    let cover_image = article
        .as_ref()
        .and_then(get_first_image)
-        .or_else(|| og(html, "image"));
+        .or_else(|| og_meta.raw("image"));
    let post_date = article
        .as_ref()
        .and_then(|v| get_text(v, "datePublished"))
        .or_else(|| meta_property(html, "article:published_time"));
    let updated_at = article.as_ref().and_then(|v| get_text(v, "dateModified"));
-    let publication_name = og(html, "site_name");
+    let publication_name = og_meta.raw("site_name");
    let authors = article.as_ref().map(extract_authors).unwrap_or_default();

    json!({
@ -302,19 +306,6 @@ fn handle_from_author_url(u: &str) -> Option<String> {
 // HTML tag helpers
 // ---------------------------------------------------------------------------

-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| m.as_str().to_string());
-        }
-    }
-    None
-}
-
 /// Pull `<meta property="article:published_time" content="...">` and
 /// similar structured meta tags.
 fn meta_property(html: &str, prop: &str) -> Option<String> {
--- a/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs
+++ b/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs
@ -32,6 +32,7 @@ use regex::Regex;
 use serde_json::{Value, json};

 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::cloud::{self, CloudError};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
@ -87,11 +88,17 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {
    // The aiSummary block: not typed (no `@type`), detect by key.
    let ai_block = find_ai_summary_block(&blocks);

+    // Single scan of the page's og:* meta tags; title + description feed
+    // the regex fallbacks below.
+    let og_meta = parse_og(html);
+    let og_title = og_meta.unescaped("title");
+    let og_description = og_meta.unescaped("description");
+
    // Business name: Dataset > metadata.title regex > URL domain.
    let business_name = dataset
        .as_ref()
        .and_then(|d| get_string(d, "name"))
-        .or_else(|| parse_name_from_og_title(html))
+        .or_else(|| parse_name_from_og_title(og_title.as_deref()))
        .or_else(|| Some(domain.clone()));

    // Rating distribution from the csvw:Table columns. Each column has
@ -105,8 +112,8 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {

    // Page-title / page-description fallbacks. OG title format:
    // "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
-    let (rating_label, rating_from_og) = parse_rating_from_og_title(html);
-    let total_from_desc = parse_review_count_from_og_description(html);
+    let (rating_label, rating_from_og) = parse_rating_from_og_title(og_title.as_deref());
+    let total_from_desc = parse_review_count_from_og_description(og_description.as_deref());

    // Recent reviews carried by the aiSummary block.
    let recent_reviews: Vec<Value> = ai_block
@ -336,20 +343,21 @@ fn compute_rating_stats(distribution: &Value) -> (Option<String>, Option<i64>) {

 /// Regex out the business name from the standard Trustpilot OG title
 /// shape: `"{name} is rated \"{label}\" with {rating} / 5 on Trustpilot"`.
-fn parse_name_from_og_title(html: &str) -> Option<String> {
-    let title = og(html, "title")?;
+/// `title` is the (entity-decoded) `og:title` content.
+fn parse_name_from_og_title(title: Option<&str>) -> Option<String> {
+    let title = title?;
    // "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| Regex::new(r"^(.+?)\s+is rated\b").unwrap());
-    re.captures(&title)
+    re.captures(title)
        .and_then(|c| c.get(1))
        .map(|m| m.as_str().to_string())
 }

 /// Pull the rating label (e.g. "Bad", "Excellent") and numeric value
-/// from the OG title.
-fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
-    let Some(title) = og(html, "title") else {
+/// from the (entity-decoded) `og:title` content.
+fn parse_rating_from_og_title(title: Option<&str>) -> (Option<String>, Option<String>) {
+    let Some(title) = title else {
        return (None, None);
    };
    static RE: OnceLock<Regex> = OnceLock::new();
@ -357,7 +365,7 @@ fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
    let re = RE.get_or_init(|| {
        Regex::new(r#"is rated\s*[\\"]+([^"\\]+)[\\"]+\s*with\s*([\d.]+)\s*/\s*5"#).unwrap()
    });
-    let Some(caps) = re.captures(&title) else {
+    let Some(caps) = re.captures(title) else {
        return (None, None);
    };
    (
@ -366,13 +374,13 @@ fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
    )
 }

-/// Parse "hear what 226 customers have already said" from the OG
-/// description tag.
-fn parse_review_count_from_og_description(html: &str) -> Option<i64> {
-    let desc = og(html, "description")?;
+/// Parse "hear what 226 customers have already said" from the
+/// (entity-decoded) `og:description` content.
+fn parse_review_count_from_og_description(desc: Option<&str>) -> Option<i64> {
+    let desc = desc?;
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| Regex::new(r"(\d[\d,]*)\s+customers").unwrap());
-    re.captures(&desc)?
+    re.captures(desc)?
        .get(1)?
        .as_str()
        .replace(',', "")
@ -380,29 +388,6 @@ fn parse_review_count_from_og_description(html: &str) -> Option<i64> {
        .ok()
 }

-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            let raw = c.get(2).map(|m| m.as_str())?;
-            return Some(html_unescape(raw));
-        }
-    }
-    None
-}
-
-/// Minimal HTML entity unescaping for the three entities the
-/// synthesize_html escaper might produce. Keeps us off a heavier dep.
-fn html_unescape(s: &str) -> String {
-    s.replace("&quot;", "\"")
-        .replace("&amp;", "&")
-        .replace("&lt;", "<")
-        .replace("&gt;", ">")
-}
-
 fn get_string(v: &Value, key: &str) -> Option<String> {
    v.get(key).and_then(|x| x.as_str().map(String::from))
 }
@ -488,8 +473,12 @@ mod tests {
    #[test]
    fn parse_og_title_extracts_name_and_rating() {
        let html = r#"<meta property="og:title" content="Anthropic is rated &quot;Bad&quot; with 1.5 / 5 on Trustpilot">"#;
-        assert_eq!(parse_name_from_og_title(html), Some("Anthropic".into()));
-        let (label, rating) = parse_rating_from_og_title(html);
+        let title = parse_og(html).unescaped("title");
+        assert_eq!(
+            parse_name_from_og_title(title.as_deref()),
+            Some("Anthropic".into())
+        );
+        let (label, rating) = parse_rating_from_og_title(title.as_deref());
        assert_eq!(label.as_deref(), Some("Bad"));
        assert_eq!(rating.as_deref(), Some("1.5"));
    }
@ -497,7 +486,11 @@ mod tests {
    #[test]
    fn parse_review_count_from_og_description_picks_number() {
        let html = r#"<meta property="og:description" content="Do you agree? Voice your opinion today and hear what 226 customers have already said.">"#;
-        assert_eq!(parse_review_count_from_og_description(html), Some(226));
+        let desc = parse_og(html).unescaped("description");
+        assert_eq!(
+            parse_review_count_from_og_description(desc.as_deref()),
+            Some(226)
+        );
    }

    #[test]
--- a/crates/webclaw-fetch/src/extractors/youtube_video.rs
+++ b/crates/webclaw-fetch/src/extractors/youtube_video.rs
@ -25,6 +25,7 @@ use regex::Regex;
 use serde_json::{Value, json};

 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;

@ -143,9 +144,11 @@ fn build_player_payload(
 // ---------------------------------------------------------------------------

 fn build_og_fallback(html: &str, url: &str, canonical: &str, video_id: &str) -> Value {
-    let title = og(html, "title");
-    let description = og(html, "description");
-    let thumbnail = og(html, "image");
+    // Single scan for the three og:* fields read below.
+    let og_meta = parse_og(html);
+    let title = og_meta.raw("title");
+    let description = og_meta.raw("description");
+    let thumbnail = og_meta.raw("image");
    // YouTube sets `<meta name="channel_name" ...>` on some pages but
    // OG-only pages reliably carry `og:video:tag` and the channel in
    // `<link itemprop="name">`. We keep this lean: just what's stable.
@ -248,19 +251,6 @@ fn extract_player_response(html: &str) -> Option<Value> {
 // Meta-tag helpers (for OG fallback)
 // ---------------------------------------------------------------------------

-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| m.as_str().to_string());
-        }
-    }
-    None
-}
-
 fn meta_name(html: &str, name: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| {
--- a/crates/webclaw-fetch/src/tls.rs
+++ b/crates/webclaw-fetch/src/tls.rs
@ -538,7 +538,11 @@ pub fn build_client(
            max_redirects as usize,
        ))
        .cookie_store(true)
-        .timeout(timeout);
+        .timeout(timeout)
+        .connect_timeout(Duration::from_secs(5))
+        .pool_idle_timeout(Duration::from_secs(90))
+        .pool_max_idle_per_host(8)
+        .tcp_keepalive(Duration::from_secs(60));

    if let Some(proxy_url) = proxy {
        let proxy = wreq::Proxy::all(proxy_url).map_err(|_| {