perf: hot-path extraction speedups (selector hoist, shared og, QuickJS gating)

Rescued from the stale perf/audit-fixes branch — the *perf-only* subset of
that branch's big mixed commit, ported cleanly onto current main with
byte-identical extraction output.

- markdown: hoist the `img[alt]` / `a[href]` selectors out of the per-node
  noise path into `Lazy` statics (stop recompiling them per element).
- extractors: single shared `og()` / `parse_og()` module replaces the
  per-field Open Graph re-scan duplicated across 7 vertical extractors
  (amazon, ebay, ecommerce, etsy, substack, trustpilot, youtube). Each
  vertical now does one pass. Raw-vs-unescaped behaviour preserved exactly.
- core: gate the QuickJS VM on a cheap marker check (skip it entirely when
  the page has no JS-assigned data) and reuse the already-parsed document
  instead of re-parsing the HTML.
- fetch: connection-pool tuning on the wreq client (connect_timeout, idle
  pool, max-idle-per-host, tcp keepalive) for connection reuse.

Output-equivalence is covered by existing tests (amazon quot-entity,
trustpilot title parse, ecommerce/youtube/etsy/substack og fallbacks) — all
green. No new dependencies; no public API change.

Deliberately EXCLUDED from this slice (separate concerns bundled in the
original commit): the `#[non_exhaustive]` API-breaking changes, the LLM/PDF/
server reliability hardening (much already shipped in 0.6.8), the tooling
(cargo-deny, release profile, MSRV), and the retry-loop dedup refactor (a
code-cleanup with no runtime benefit — not worth churning client.rs for).

Original work by the prior author on perf/audit-fixes; this re-applies only
the performance subset onto main.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-06-17 16:41:45 +02:00
parent 51d0c538f1
commit 3c54bea300
13 changed files with 200 additions and 157 deletions

View file

@ -16,6 +16,29 @@ static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);
/// Markers that, if absent from the HTML, prove the QuickJS scan cannot find
/// any data blob. The scan only ever surfaces `globalThis.__*` object/array
/// properties, and the seeded `__next_f` only emits when non-empty. Every
/// realistic way an inline script populates such a global goes through one of
/// these substrings (`window.`/`self.__next` assignments, or the
/// `__NEXT_DATA__`/`__NUXT__`/`application/json` payload conventions). If none
/// are present, running the VM is guaranteed to return zero blobs, so skipping
/// it is output-neutral. Conservative by design: any of these may appear in
/// non-script HTML too, which only makes us skip *less* often, never more.
const JS_CANDIDATE_MARKERS: [&str; 5] = [
"window.",
"__NEXT_DATA__",
"__NUXT__",
"application/json",
"self.__next",
];
/// Returns true if the HTML plausibly contains JS-assigned data the QuickJS
/// scan could surface. When false, the VM is provably a no-op and is skipped.
pub fn has_js_candidate_data(html: &str) -> bool {
JS_CANDIDATE_MARKERS.iter().any(|m| html.contains(m))
}
/// A blob of data extracted from JS execution.
pub struct JsDataBlob {
pub name: String,
@ -24,9 +47,17 @@ pub struct JsDataBlob {
}
/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data.
///
/// Convenience wrapper that parses `html` first. Hot callers that already hold a
/// parsed [`Html`] should use [`extract_js_data_from_doc`] to avoid a second parse.
pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
let doc = Html::parse_document(html);
extract_js_data_from_doc(&doc)
}
/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data,
/// reusing an already-parsed [`Html`] document instead of re-parsing the HTML.
pub fn extract_js_data_from_doc(doc: &Html) -> Vec<JsDataBlob> {
let scripts: Vec<String> = doc
.select(&SCRIPT_SELECTOR)
.filter(|el| {

View file

@ -222,8 +222,8 @@ fn extract_with_options_inner(
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
// static JSON data island extraction above with runtime-evaluated data.
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
{
let blobs = js_eval::extract_js_data(html);
if js_eval::has_js_candidate_data(html) {
let blobs = js_eval::extract_js_data_from_doc(&doc);
if !blobs.is_empty() {
let js_text = js_eval::extract_readable_text(&blobs);
if !js_text.is_empty() {

View file

@ -13,6 +13,8 @@ use crate::noise;
use crate::types::{CodeBlock, Image, Link};
static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());
static IMG_ALT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("img[alt]").unwrap());
static A_HREF_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("a[href]").unwrap());
/// Maximum recursion depth for DOM traversal.
/// Express.co.uk live blogs and similar pages can nest 1000+ levels deep,
@ -853,7 +855,7 @@ fn collect_assets_from_noise(
assets: &mut ConvertedAssets,
) {
// Collect images with alt text
for img in element.select(&Selector::parse("img[alt]").unwrap()) {
for img in element.select(&IMG_ALT_SELECTOR) {
let alt = img.value().attr("alt").unwrap_or("").to_string();
let src = img
.value()
@ -866,7 +868,7 @@ fn collect_assets_from_noise(
}
// Collect links
for link in element.select(&Selector::parse("a[href]").unwrap()) {
for link in element.select(&A_HREF_SELECTOR) {
let href = link
.value()
.attr("href")

View file

@ -33,6 +33,7 @@ use serde_json::{Value, json};
use url::Url;
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -115,23 +116,25 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro
/// without carrying webclaw_fetch types.
pub fn parse(html: &str, url: &str, asin: &str) -> Value {
let jsonld = find_product_jsonld(html);
// Single scan for the og:* fallbacks read below.
let og_meta = parse_og(html);
// Three-tier title: JSON-LD `name` > Amazon's `#productTitle` span
// (only present on real static HTML) > cloud-synthesized og:title.
let title = jsonld
.as_ref()
.and_then(|v| get_text(v, "name"))
.or_else(|| dom_title(html))
.or_else(|| og(html, "title"));
.or_else(|| og_meta.unescaped("title"));
let image = jsonld
.as_ref()
.and_then(get_first_image)
.or_else(|| dom_image(html))
.or_else(|| og(html, "image"));
.or_else(|| og_meta.unescaped("image"));
let brand = jsonld.as_ref().and_then(get_brand);
let description = jsonld
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| og(html, "description"));
.or_else(|| og_meta.unescaped("description"));
let aggregate_rating = jsonld.as_ref().and_then(get_aggregate_rating);
let offer = jsonld.as_ref().and_then(first_offer);
@ -336,31 +339,6 @@ fn dom_image(html: &str) -> Option<String> {
.map(|m| m.as_str().to_string())
}
/// OG meta tag lookup. Cloud-synthesized HTML ships these even when
/// JSON-LD and Amazon-DOM-IDs are both absent, so they're the last
/// line of defence for `title`, `image`, `description`.
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| html_unescape(m.as_str()));
}
}
None
}
/// Undo the synthesize_html attribute escaping for the few entities it
/// emits. Keeps us off a heavier HTML-entity dep.
fn html_unescape(s: &str) -> String {
s.replace("&quot;", "\"")
.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
}
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
FetchError::Build(e.to_string())
}
@ -477,7 +455,7 @@ mod tests {
fn og_unescape_handles_quot_entity() {
let html = r#"<meta property="og:title" content="Apple &quot;M2 Pro&quot; Laptop">"#;
assert_eq!(
og(html, "title").as_deref(),
parse_og(html).unescaped("title").as_deref(),
Some(r#"Apple "M2 Pro" Laptop"#)
);
}

View file

@ -15,6 +15,7 @@ use serde_json::{Value, json};
use url::Url;
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -65,19 +66,21 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro
pub fn parse(html: &str, url: &str, item_id: &str) -> Value {
let jsonld = find_product_jsonld(html);
// Single scan for the three og:* fields read as fallbacks below.
let og_meta = parse_og(html);
let title = jsonld
.as_ref()
.and_then(|v| get_text(v, "name"))
.or_else(|| og(html, "title"));
.or_else(|| og_meta.raw("title"));
let image = jsonld
.as_ref()
.and_then(get_first_image)
.or_else(|| og(html, "image"));
.or_else(|| og_meta.raw("image"));
let brand = jsonld.as_ref().and_then(get_brand);
let description = jsonld
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| og(html, "description"));
.or_else(|| og_meta.raw("description"));
let offer = jsonld.as_ref().and_then(first_offer);
// eBay's AggregateOffer uses lowPrice/highPrice. Offer uses price.
@ -268,19 +271,6 @@ fn get_aggregate_rating(v: &Value) -> Option<Value> {
}))
}
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| m.as_str().to_string());
}
}
None
}
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
FetchError::Build(e.to_string())
}

View file

@ -42,6 +42,7 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::{og, parse_og};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -142,15 +143,17 @@ fn build_jsonld_payload(product: &Value, html: &str, url: &str) -> Value {
/// Build a minimal payload from OG / product meta tags. Used when a
/// page has no Product JSON-LD at all.
fn build_og_payload(html: &str, url: &str) -> Value {
// Single scan for the three og:* fields this fallback reads.
let og_meta = parse_og(html);
let offers = build_og_offer(html).map(|o| vec![o]).unwrap_or_default();
let image = og(html, "image");
let image = og_meta.raw("image");
let images: Vec<Value> = image.map(|i| vec![Value::String(i)]).unwrap_or_default();
json!({
"url": url,
"data_source": "og_fallback",
"name": og(html, "title"),
"description": og(html, "description"),
"name": og_meta.raw("title"),
"description": og_meta.raw("description"),
"brand": meta_property(html, "product:brand"),
"sku": None::<String>,
"mpn": None::<String>,
@ -368,20 +371,6 @@ fn build_og_offer(html: &str) -> Option<Value> {
}))
}
/// Pull the value of `<meta property="og:{prop}" content="...">`.
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| m.as_str().to_string());
}
}
None
}
/// Pull the value of any `<meta property="..." content="...">` tag.
/// Needed for namespaced OG variants like `product:price:amount` that
/// the simple `og:*` matcher above doesn't cover.

View file

@ -26,6 +26,7 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -74,19 +75,26 @@ pub fn parse(html: &str, url: &str, listing_id: &str) -> Value {
let jsonld = find_product_jsonld(html);
let slug_title = humanise_slug(parse_slug(url).as_deref());
// Single scan for the three og:* fields used as fallbacks below.
let og_meta = parse_og(html);
let title = jsonld
.as_ref()
.and_then(|v| get_text(v, "name"))
.or_else(|| og(html, "title").filter(|t| !is_generic_title(t)))
.or_else(|| og_meta.raw("title").filter(|t| !is_generic_title(t)))
.or(slug_title);
let description = jsonld
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| og(html, "description").filter(|d| !is_generic_description(d)));
.or_else(|| {
og_meta
.raw("description")
.filter(|d| !is_generic_description(d))
});
let image = jsonld
.as_ref()
.and_then(get_first_image)
.or_else(|| og(html, "image"));
.or_else(|| og_meta.raw("image"));
let brand = jsonld.as_ref().and_then(get_brand);
// Etsy listings often ship either a single Offer or an
@ -359,19 +367,6 @@ fn strip_schema_prefix(s: String) -> String {
.replace("https://schema.org/", "")
}
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| m.as_str().to_string());
}
}
None
}
/// Etsy links the owning shop with a canonical anchor like
/// `<a href="/shop/ShopName" ...>`. Grab the first one after the
/// breadcrumb boundary.

View file

@ -33,6 +33,7 @@ pub mod instagram_post;
pub mod instagram_profile;
pub mod linkedin_post;
pub mod npm;
pub(crate) mod og;
pub mod pypi;
pub mod reddit;
pub mod shopify_collection;

View file

@ -0,0 +1,79 @@
//! Shared Open Graph (`og:*`) meta-tag parsing for the HTML vertical
//! extractors.
//!
//! Several site extractors read a handful of `og:*` properties (title,
//! description, image, ...) from the page `<head>`. Each used to carry a
//! verbatim copy of the same regex + scan helper. This module centralises
//! that logic and adds [`parse_og`], which collects every `og:*` pair in a
//! single `captures_iter` pass so an extractor that needs multiple fields
//! scans the document once instead of once per field.
//!
//! Values are stored raw. Callers that need HTML entity decoding apply
//! [`html_unescape`] themselves — some extractors intentionally keep the
//! raw value, so decoding is opt-in per call site to preserve output.
use std::collections::HashMap;
use std::sync::OnceLock;
use regex::Regex;
/// Matches `<meta property="og:<name>" content="<value>">`, case-insensitive.
/// Capture 1 is the property suffix (after `og:`), capture 2 is the content.
fn og_regex() -> &'static Regex {
static RE: OnceLock<Regex> = OnceLock::new();
RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
})
}
/// Return the raw content of the first `og:<prop>` meta tag, if present.
///
/// Single-pass per call. For extractors reading several properties, prefer
/// [`parse_og`] to scan the document only once.
pub(crate) fn og(html: &str, prop: &str) -> Option<String> {
for c in og_regex().captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| m.as_str().to_string());
}
}
None
}
/// Parse every `og:*` meta tag in one pass into a `suffix -> content` map.
///
/// First occurrence wins, matching the short-circuit-on-first-match
/// behaviour of [`og`] when called per property. Values are raw (not
/// entity-decoded); use [`OgMeta::unescaped`] / [`OgMeta::raw`] to read.
pub(crate) fn parse_og(html: &str) -> OgMeta {
let mut map: HashMap<String, String> = HashMap::new();
for c in og_regex().captures_iter(html) {
if let (Some(name), Some(content)) = (c.get(1), c.get(2)) {
map.entry(name.as_str().to_string())
.or_insert_with(|| content.as_str().to_string());
}
}
OgMeta(map)
}
/// Parsed `og:*` properties from a single document scan.
pub(crate) struct OgMeta(HashMap<String, String>);
impl OgMeta {
/// Raw content of `og:<prop>`, exactly as it appeared in the HTML.
pub(crate) fn raw(&self, prop: &str) -> Option<String> {
self.0.get(prop).cloned()
}
/// Content of `og:<prop>` with the common HTML entities decoded.
pub(crate) fn unescaped(&self, prop: &str) -> Option<String> {
self.0.get(prop).map(|v| html_unescape(v))
}
}
/// Decode the small set of HTML entities that show up in `og:*` content.
pub(crate) fn html_unescape(s: &str) -> String {
s.replace("&quot;", "\"")
.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
}

View file

@ -28,6 +28,7 @@ use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -181,24 +182,27 @@ async fn html_fallback(
pub fn parse_html(html: &str, url: &str, api_url: &str, slug: &str) -> Value {
let article = find_article_jsonld(html);
// Single scan for the four og:* fields read as fallbacks below.
let og_meta = parse_og(html);
let title = article
.as_ref()
.and_then(|v| get_text(v, "headline"))
.or_else(|| og(html, "title"));
.or_else(|| og_meta.raw("title"));
let description = article
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| og(html, "description"));
.or_else(|| og_meta.raw("description"));
let cover_image = article
.as_ref()
.and_then(get_first_image)
.or_else(|| og(html, "image"));
.or_else(|| og_meta.raw("image"));
let post_date = article
.as_ref()
.and_then(|v| get_text(v, "datePublished"))
.or_else(|| meta_property(html, "article:published_time"));
let updated_at = article.as_ref().and_then(|v| get_text(v, "dateModified"));
let publication_name = og(html, "site_name");
let publication_name = og_meta.raw("site_name");
let authors = article.as_ref().map(extract_authors).unwrap_or_default();
json!({
@ -302,19 +306,6 @@ fn handle_from_author_url(u: &str) -> Option<String> {
// HTML tag helpers
// ---------------------------------------------------------------------------
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| m.as_str().to_string());
}
}
None
}
/// Pull `<meta property="article:published_time" content="...">` and
/// similar structured meta tags.
fn meta_property(html: &str, prop: &str) -> Option<String> {

View file

@ -32,6 +32,7 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -87,11 +88,17 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {
// The aiSummary block: not typed (no `@type`), detect by key.
let ai_block = find_ai_summary_block(&blocks);
// Single scan of the page's og:* meta tags; title + description feed
// the regex fallbacks below.
let og_meta = parse_og(html);
let og_title = og_meta.unescaped("title");
let og_description = og_meta.unescaped("description");
// Business name: Dataset > metadata.title regex > URL domain.
let business_name = dataset
.as_ref()
.and_then(|d| get_string(d, "name"))
.or_else(|| parse_name_from_og_title(html))
.or_else(|| parse_name_from_og_title(og_title.as_deref()))
.or_else(|| Some(domain.clone()));
// Rating distribution from the csvw:Table columns. Each column has
@ -105,8 +112,8 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {
// Page-title / page-description fallbacks. OG title format:
// "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
let (rating_label, rating_from_og) = parse_rating_from_og_title(html);
let total_from_desc = parse_review_count_from_og_description(html);
let (rating_label, rating_from_og) = parse_rating_from_og_title(og_title.as_deref());
let total_from_desc = parse_review_count_from_og_description(og_description.as_deref());
// Recent reviews carried by the aiSummary block.
let recent_reviews: Vec<Value> = ai_block
@ -336,20 +343,21 @@ fn compute_rating_stats(distribution: &Value) -> (Option<String>, Option<i64>) {
/// Regex out the business name from the standard Trustpilot OG title
/// shape: `"{name} is rated \"{label}\" with {rating} / 5 on Trustpilot"`.
fn parse_name_from_og_title(html: &str) -> Option<String> {
let title = og(html, "title")?;
/// `title` is the (entity-decoded) `og:title` content.
fn parse_name_from_og_title(title: Option<&str>) -> Option<String> {
let title = title?;
// "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| Regex::new(r"^(.+?)\s+is rated\b").unwrap());
re.captures(&title)
re.captures(title)
.and_then(|c| c.get(1))
.map(|m| m.as_str().to_string())
}
/// Pull the rating label (e.g. "Bad", "Excellent") and numeric value
/// from the OG title.
fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
let Some(title) = og(html, "title") else {
/// from the (entity-decoded) `og:title` content.
fn parse_rating_from_og_title(title: Option<&str>) -> (Option<String>, Option<String>) {
let Some(title) = title else {
return (None, None);
};
static RE: OnceLock<Regex> = OnceLock::new();
@ -357,7 +365,7 @@ fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
let re = RE.get_or_init(|| {
Regex::new(r#"is rated\s*[\\"]+([^"\\]+)[\\"]+\s*with\s*([\d.]+)\s*/\s*5"#).unwrap()
});
let Some(caps) = re.captures(&title) else {
let Some(caps) = re.captures(title) else {
return (None, None);
};
(
@ -366,13 +374,13 @@ fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
)
}
/// Parse "hear what 226 customers have already said" from the OG
/// description tag.
fn parse_review_count_from_og_description(html: &str) -> Option<i64> {
let desc = og(html, "description")?;
/// Parse "hear what 226 customers have already said" from the
/// (entity-decoded) `og:description` content.
fn parse_review_count_from_og_description(desc: Option<&str>) -> Option<i64> {
let desc = desc?;
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| Regex::new(r"(\d[\d,]*)\s+customers").unwrap());
re.captures(&desc)?
re.captures(desc)?
.get(1)?
.as_str()
.replace(',', "")
@ -380,29 +388,6 @@ fn parse_review_count_from_og_description(html: &str) -> Option<i64> {
.ok()
}
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
let raw = c.get(2).map(|m| m.as_str())?;
return Some(html_unescape(raw));
}
}
None
}
/// Minimal HTML entity unescaping for the three entities the
/// synthesize_html escaper might produce. Keeps us off a heavier dep.
fn html_unescape(s: &str) -> String {
s.replace("&quot;", "\"")
.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
}
fn get_string(v: &Value, key: &str) -> Option<String> {
v.get(key).and_then(|x| x.as_str().map(String::from))
}
@ -488,8 +473,12 @@ mod tests {
#[test]
fn parse_og_title_extracts_name_and_rating() {
let html = r#"<meta property="og:title" content="Anthropic is rated &quot;Bad&quot; with 1.5 / 5 on Trustpilot">"#;
assert_eq!(parse_name_from_og_title(html), Some("Anthropic".into()));
let (label, rating) = parse_rating_from_og_title(html);
let title = parse_og(html).unescaped("title");
assert_eq!(
parse_name_from_og_title(title.as_deref()),
Some("Anthropic".into())
);
let (label, rating) = parse_rating_from_og_title(title.as_deref());
assert_eq!(label.as_deref(), Some("Bad"));
assert_eq!(rating.as_deref(), Some("1.5"));
}
@ -497,7 +486,11 @@ mod tests {
#[test]
fn parse_review_count_from_og_description_picks_number() {
let html = r#"<meta property="og:description" content="Do you agree? Voice your opinion today and hear what 226 customers have already said.">"#;
assert_eq!(parse_review_count_from_og_description(html), Some(226));
let desc = parse_og(html).unescaped("description");
assert_eq!(
parse_review_count_from_og_description(desc.as_deref()),
Some(226)
);
}
#[test]

View file

@ -25,6 +25,7 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -143,9 +144,11 @@ fn build_player_payload(
// ---------------------------------------------------------------------------
fn build_og_fallback(html: &str, url: &str, canonical: &str, video_id: &str) -> Value {
let title = og(html, "title");
let description = og(html, "description");
let thumbnail = og(html, "image");
// Single scan for the three og:* fields read below.
let og_meta = parse_og(html);
let title = og_meta.raw("title");
let description = og_meta.raw("description");
let thumbnail = og_meta.raw("image");
// YouTube sets `<meta name="channel_name" ...>` on some pages but
// OG-only pages reliably carry `og:video:tag` and the channel in
// `<link itemprop="name">`. We keep this lean: just what's stable.
@ -248,19 +251,6 @@ fn extract_player_response(html: &str) -> Option<Value> {
// Meta-tag helpers (for OG fallback)
// ---------------------------------------------------------------------------
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| m.as_str().to_string());
}
}
None
}
fn meta_name(html: &str, name: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {

View file

@ -538,7 +538,11 @@ pub fn build_client(
max_redirects as usize,
))
.cookie_store(true)
.timeout(timeout);
.timeout(timeout)
.connect_timeout(Duration::from_secs(5))
.pool_idle_timeout(Duration::from_secs(90))
.pool_max_idle_per_host(8)
.tcp_keepalive(Duration::from_secs(60));
if let Some(proxy_url) = proxy {
let proxy = wreq::Proxy::all(proxy_url).map_err(|_| {