mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-17 23:55:13 +02:00
perf: hot-path extraction speedups (selector hoist, shared og, QuickJS gating)
Rescued from the stale perf/audit-fixes branch — the *perf-only* subset of that branch's big mixed commit, ported cleanly onto current main with byte-identical extraction output.

- markdown: hoist the `img[alt]` / `a[href]` selectors out of the per-node noise path into `Lazy` statics (stop recompiling them per element).
- extractors: single shared `og()` / `parse_og()` module replaces the per-field Open Graph re-scan duplicated across 7 vertical extractors (amazon, ebay, ecommerce, etsy, substack, trustpilot, youtube). Each vertical now does one pass. Raw-vs-unescaped behaviour preserved exactly.
- core: gate the QuickJS VM on a cheap marker check (skip it entirely when the page has no JS-assigned data) and reuse the already-parsed document instead of re-parsing the HTML.
- fetch: connection-pool tuning on the wreq client (connect_timeout, idle pool, max-idle-per-host, tcp keepalive) for connection reuse.

Output-equivalence is covered by existing tests (amazon quot-entity, trustpilot title parse, ecommerce/youtube/etsy/substack og fallbacks) — all green. No new dependencies; no public API change.

Deliberately EXCLUDED from this slice (separate concerns bundled in the original commit): the `#[non_exhaustive]` API-breaking changes, the LLM/PDF/server reliability hardening (much already shipped in 0.6.8), the tooling (cargo-deny, release profile, MSRV), and the retry-loop dedup refactor (a code cleanup with no runtime benefit — not worth churning client.rs for).

Original work by the prior author on perf/audit-fixes; this re-applies only the performance subset onto main.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
51d0c538f1
commit
3c54bea300
13 changed files with 200 additions and 157 deletions
|
|
@ -16,6 +16,29 @@ static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").
|
|||
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
|
||||
const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);
|
||||
|
||||
/// Substrings that suggest an inline script may populate a `globalThis.__*`
/// data global — the only kind of value the QuickJS scan surfaces (the seeded
/// `__next_f` only emits when non-empty).
///
/// This is a cheap heuristic pre-filter, not a proof. It covers the usual
/// spellings of the global object (`window.`, `window[`, `globalThis`,
/// `self.__`) plus the `__NEXT_DATA__` / `__NUXT__` / `application/json`
/// payload conventions. `self.__` subsumes the `self.__next` assignments.
///
/// Conservative by design: any of these may also appear in non-script HTML,
/// which only makes us skip the VM *less* often, never more.
const JS_CANDIDATE_MARKERS: &[&str] = &[
    "window.",
    // Bracket-style assignment, e.g. `window["__STATE__"] = {...}`.
    "window[",
    // Direct writes to the global object, e.g. `globalThis.__APP__ = {...}`.
    "globalThis",
    // Any `self.__*` assignment, including `self.__next_f.push(...)`.
    "self.__",
    "__NEXT_DATA__",
    "__NUXT__",
    "application/json",
];

/// Returns true if the HTML plausibly contains JS-assigned data the QuickJS
/// scan could surface.
///
/// When this returns false none of [`JS_CANDIDATE_MARKERS`] occurs anywhere in
/// `html`, and callers skip the VM. An empty document always returns false.
pub fn has_js_candidate_data(html: &str) -> bool {
    JS_CANDIDATE_MARKERS
        .iter()
        .any(|marker| html.contains(*marker))
}
|
||||
|
||||
/// A blob of data extracted from JS execution.
|
||||
pub struct JsDataBlob {
|
||||
pub name: String,
|
||||
|
|
@ -24,9 +47,17 @@ pub struct JsDataBlob {
|
|||
}
|
||||
|
||||
/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data.
|
||||
///
|
||||
/// Convenience wrapper that parses `html` first. Hot callers that already hold a
|
||||
/// parsed [`Html`] should use [`extract_js_data_from_doc`] to avoid a second parse.
|
||||
pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
|
||||
let doc = Html::parse_document(html);
|
||||
extract_js_data_from_doc(&doc)
|
||||
}
|
||||
|
||||
/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data,
|
||||
/// reusing an already-parsed [`Html`] document instead of re-parsing the HTML.
|
||||
pub fn extract_js_data_from_doc(doc: &Html) -> Vec<JsDataBlob> {
|
||||
let scripts: Vec<String> = doc
|
||||
.select(&SCRIPT_SELECTOR)
|
||||
.filter(|el| {
|
||||
|
|
|
|||
|
|
@ -222,8 +222,8 @@ fn extract_with_options_inner(
|
|||
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
|
||||
// static JSON data island extraction above with runtime-evaluated data.
|
||||
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||
{
|
||||
let blobs = js_eval::extract_js_data(html);
|
||||
if js_eval::has_js_candidate_data(html) {
|
||||
let blobs = js_eval::extract_js_data_from_doc(&doc);
|
||||
if !blobs.is_empty() {
|
||||
let js_text = js_eval::extract_readable_text(&blobs);
|
||||
if !js_text.is_empty() {
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@ use crate::noise;
|
|||
use crate::types::{CodeBlock, Image, Link};
|
||||
|
||||
static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());
|
||||
static IMG_ALT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("img[alt]").unwrap());
|
||||
static A_HREF_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("a[href]").unwrap());
|
||||
|
||||
/// Maximum recursion depth for DOM traversal.
|
||||
/// Express.co.uk live blogs and similar pages can nest 1000+ levels deep,
|
||||
|
|
@ -853,7 +855,7 @@ fn collect_assets_from_noise(
|
|||
assets: &mut ConvertedAssets,
|
||||
) {
|
||||
// Collect images with alt text
|
||||
for img in element.select(&Selector::parse("img[alt]").unwrap()) {
|
||||
for img in element.select(&IMG_ALT_SELECTOR) {
|
||||
let alt = img.value().attr("alt").unwrap_or("").to_string();
|
||||
let src = img
|
||||
.value()
|
||||
|
|
@ -866,7 +868,7 @@ fn collect_assets_from_noise(
|
|||
}
|
||||
|
||||
// Collect links
|
||||
for link in element.select(&Selector::parse("a[href]").unwrap()) {
|
||||
for link in element.select(&A_HREF_SELECTOR) {
|
||||
let href = link
|
||||
.value()
|
||||
.attr("href")
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ use serde_json::{Value, json};
|
|||
use url::Url;
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use super::og::parse_og;
|
||||
use crate::cloud::{self, CloudError};
|
||||
use crate::error::FetchError;
|
||||
use crate::fetcher::Fetcher;
|
||||
|
|
@ -115,23 +116,25 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro
|
|||
/// without carrying webclaw_fetch types.
|
||||
pub fn parse(html: &str, url: &str, asin: &str) -> Value {
|
||||
let jsonld = find_product_jsonld(html);
|
||||
// Single scan for the og:* fallbacks read below.
|
||||
let og_meta = parse_og(html);
|
||||
// Three-tier title: JSON-LD `name` > Amazon's `#productTitle` span
|
||||
// (only present on real static HTML) > cloud-synthesized og:title.
|
||||
let title = jsonld
|
||||
.as_ref()
|
||||
.and_then(|v| get_text(v, "name"))
|
||||
.or_else(|| dom_title(html))
|
||||
.or_else(|| og(html, "title"));
|
||||
.or_else(|| og_meta.unescaped("title"));
|
||||
let image = jsonld
|
||||
.as_ref()
|
||||
.and_then(get_first_image)
|
||||
.or_else(|| dom_image(html))
|
||||
.or_else(|| og(html, "image"));
|
||||
.or_else(|| og_meta.unescaped("image"));
|
||||
let brand = jsonld.as_ref().and_then(get_brand);
|
||||
let description = jsonld
|
||||
.as_ref()
|
||||
.and_then(|v| get_text(v, "description"))
|
||||
.or_else(|| og(html, "description"));
|
||||
.or_else(|| og_meta.unescaped("description"));
|
||||
let aggregate_rating = jsonld.as_ref().and_then(get_aggregate_rating);
|
||||
let offer = jsonld.as_ref().and_then(first_offer);
|
||||
|
||||
|
|
@ -336,31 +339,6 @@ fn dom_image(html: &str) -> Option<String> {
|
|||
.map(|m| m.as_str().to_string())
|
||||
}
|
||||
|
||||
/// OG meta tag lookup. Cloud-synthesized HTML ships these even when
|
||||
/// JSON-LD and Amazon-DOM-IDs are both absent, so they're the last
|
||||
/// line of defence for `title`, `image`, `description`.
|
||||
fn og(html: &str, prop: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
for c in re.captures_iter(html) {
|
||||
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||
return c.get(2).map(|m| html_unescape(m.as_str()));
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Undo the synthesize_html attribute escaping for the few entities it
|
||||
/// emits. Keeps us off a heavier HTML-entity dep.
|
||||
fn html_unescape(s: &str) -> String {
|
||||
s.replace(""", "\"")
|
||||
.replace("&", "&")
|
||||
.replace("<", "<")
|
||||
.replace(">", ">")
|
||||
}
|
||||
|
||||
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
|
||||
FetchError::Build(e.to_string())
|
||||
}
|
||||
|
|
@ -477,7 +455,7 @@ mod tests {
|
|||
fn og_unescape_handles_quot_entity() {
|
||||
let html = r#"<meta property="og:title" content="Apple "M2 Pro" Laptop">"#;
|
||||
assert_eq!(
|
||||
og(html, "title").as_deref(),
|
||||
parse_og(html).unescaped("title").as_deref(),
|
||||
Some(r#"Apple "M2 Pro" Laptop"#)
|
||||
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ use serde_json::{Value, json};
|
|||
use url::Url;
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use super::og::parse_og;
|
||||
use crate::cloud::{self, CloudError};
|
||||
use crate::error::FetchError;
|
||||
use crate::fetcher::Fetcher;
|
||||
|
|
@ -65,19 +66,21 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro
|
|||
|
||||
pub fn parse(html: &str, url: &str, item_id: &str) -> Value {
|
||||
let jsonld = find_product_jsonld(html);
|
||||
// Single scan for the three og:* fields read as fallbacks below.
|
||||
let og_meta = parse_og(html);
|
||||
let title = jsonld
|
||||
.as_ref()
|
||||
.and_then(|v| get_text(v, "name"))
|
||||
.or_else(|| og(html, "title"));
|
||||
.or_else(|| og_meta.raw("title"));
|
||||
let image = jsonld
|
||||
.as_ref()
|
||||
.and_then(get_first_image)
|
||||
.or_else(|| og(html, "image"));
|
||||
.or_else(|| og_meta.raw("image"));
|
||||
let brand = jsonld.as_ref().and_then(get_brand);
|
||||
let description = jsonld
|
||||
.as_ref()
|
||||
.and_then(|v| get_text(v, "description"))
|
||||
.or_else(|| og(html, "description"));
|
||||
.or_else(|| og_meta.raw("description"));
|
||||
let offer = jsonld.as_ref().and_then(first_offer);
|
||||
|
||||
// eBay's AggregateOffer uses lowPrice/highPrice. Offer uses price.
|
||||
|
|
@ -268,19 +271,6 @@ fn get_aggregate_rating(v: &Value) -> Option<Value> {
|
|||
}))
|
||||
}
|
||||
|
||||
fn og(html: &str, prop: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
for c in re.captures_iter(html) {
|
||||
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||
return c.get(2).map(|m| m.as_str().to_string());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
|
||||
FetchError::Build(e.to_string())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ use regex::Regex;
|
|||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use super::og::{og, parse_og};
|
||||
use crate::error::FetchError;
|
||||
use crate::fetcher::Fetcher;
|
||||
|
||||
|
|
@ -142,15 +143,17 @@ fn build_jsonld_payload(product: &Value, html: &str, url: &str) -> Value {
|
|||
/// Build a minimal payload from OG / product meta tags. Used when a
|
||||
/// page has no Product JSON-LD at all.
|
||||
fn build_og_payload(html: &str, url: &str) -> Value {
|
||||
// Single scan for the three og:* fields this fallback reads.
|
||||
let og_meta = parse_og(html);
|
||||
let offers = build_og_offer(html).map(|o| vec![o]).unwrap_or_default();
|
||||
let image = og(html, "image");
|
||||
let image = og_meta.raw("image");
|
||||
let images: Vec<Value> = image.map(|i| vec![Value::String(i)]).unwrap_or_default();
|
||||
|
||||
json!({
|
||||
"url": url,
|
||||
"data_source": "og_fallback",
|
||||
"name": og(html, "title"),
|
||||
"description": og(html, "description"),
|
||||
"name": og_meta.raw("title"),
|
||||
"description": og_meta.raw("description"),
|
||||
"brand": meta_property(html, "product:brand"),
|
||||
"sku": None::<String>,
|
||||
"mpn": None::<String>,
|
||||
|
|
@ -368,20 +371,6 @@ fn build_og_offer(html: &str) -> Option<Value> {
|
|||
}))
|
||||
}
|
||||
|
||||
/// Pull the value of `<meta property="og:{prop}" content="...">`.
|
||||
fn og(html: &str, prop: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
for c in re.captures_iter(html) {
|
||||
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||
return c.get(2).map(|m| m.as_str().to_string());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Pull the value of any `<meta property="..." content="...">` tag.
|
||||
/// Needed for namespaced OG variants like `product:price:amount` that
|
||||
/// the simple `og:*` matcher above doesn't cover.
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ use regex::Regex;
|
|||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use super::og::parse_og;
|
||||
use crate::cloud::{self, CloudError};
|
||||
use crate::error::FetchError;
|
||||
use crate::fetcher::Fetcher;
|
||||
|
|
@ -74,19 +75,26 @@ pub fn parse(html: &str, url: &str, listing_id: &str) -> Value {
|
|||
let jsonld = find_product_jsonld(html);
|
||||
let slug_title = humanise_slug(parse_slug(url).as_deref());
|
||||
|
||||
// Single scan for the three og:* fields used as fallbacks below.
|
||||
let og_meta = parse_og(html);
|
||||
|
||||
let title = jsonld
|
||||
.as_ref()
|
||||
.and_then(|v| get_text(v, "name"))
|
||||
.or_else(|| og(html, "title").filter(|t| !is_generic_title(t)))
|
||||
.or_else(|| og_meta.raw("title").filter(|t| !is_generic_title(t)))
|
||||
.or(slug_title);
|
||||
let description = jsonld
|
||||
.as_ref()
|
||||
.and_then(|v| get_text(v, "description"))
|
||||
.or_else(|| og(html, "description").filter(|d| !is_generic_description(d)));
|
||||
.or_else(|| {
|
||||
og_meta
|
||||
.raw("description")
|
||||
.filter(|d| !is_generic_description(d))
|
||||
});
|
||||
let image = jsonld
|
||||
.as_ref()
|
||||
.and_then(get_first_image)
|
||||
.or_else(|| og(html, "image"));
|
||||
.or_else(|| og_meta.raw("image"));
|
||||
let brand = jsonld.as_ref().and_then(get_brand);
|
||||
|
||||
// Etsy listings often ship either a single Offer or an
|
||||
|
|
@ -359,19 +367,6 @@ fn strip_schema_prefix(s: String) -> String {
|
|||
.replace("https://schema.org/", "")
|
||||
}
|
||||
|
||||
fn og(html: &str, prop: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
for c in re.captures_iter(html) {
|
||||
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||
return c.get(2).map(|m| m.as_str().to_string());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Etsy links the owning shop with a canonical anchor like
|
||||
/// `<a href="/shop/ShopName" ...>`. Grab the first one after the
|
||||
/// breadcrumb boundary.
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ pub mod instagram_post;
|
|||
pub mod instagram_profile;
|
||||
pub mod linkedin_post;
|
||||
pub mod npm;
|
||||
pub(crate) mod og;
|
||||
pub mod pypi;
|
||||
pub mod reddit;
|
||||
pub mod shopify_collection;
|
||||
|
|
|
|||
79
crates/webclaw-fetch/src/extractors/og.rs
Normal file
79
crates/webclaw-fetch/src/extractors/og.rs
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
//! Shared Open Graph (`og:*`) meta-tag parsing for the HTML vertical
|
||||
//! extractors.
|
||||
//!
|
||||
//! Several site extractors read a handful of `og:*` properties (title,
|
||||
//! description, image, ...) from the page `<head>`. Each used to carry a
|
||||
//! verbatim copy of the same regex + scan helper. This module centralises
|
||||
//! that logic and adds [`parse_og`], which collects every `og:*` pair in a
|
||||
//! single `captures_iter` pass so an extractor that needs multiple fields
|
||||
//! scans the document once instead of once per field.
|
||||
//!
|
||||
//! Values are stored raw. Callers that need HTML entity decoding apply
|
||||
//! [`html_unescape`] themselves — some extractors intentionally keep the
|
||||
//! raw value, so decoding is opt-in per call site to preserve output.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
/// Matches `<meta property="og:<name>" content="<value>">`, case-insensitive.
|
||||
/// Capture 1 is the property suffix (after `og:`), capture 2 is the content.
|
||||
fn og_regex() -> &'static Regex {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
})
|
||||
}
|
||||
|
||||
/// Return the raw content of the first `og:<prop>` meta tag, if present.
|
||||
///
|
||||
/// Single-pass per call. For extractors reading several properties, prefer
|
||||
/// [`parse_og`] to scan the document only once.
|
||||
pub(crate) fn og(html: &str, prop: &str) -> Option<String> {
|
||||
for c in og_regex().captures_iter(html) {
|
||||
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||
return c.get(2).map(|m| m.as_str().to_string());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Parse every `og:*` meta tag in one pass into a `suffix -> content` map.
|
||||
///
|
||||
/// First occurrence wins, matching the short-circuit-on-first-match
|
||||
/// behaviour of [`og`] when called per property. Values are raw (not
|
||||
/// entity-decoded); use [`OgMeta::unescaped`] / [`OgMeta::raw`] to read.
|
||||
pub(crate) fn parse_og(html: &str) -> OgMeta {
|
||||
let mut map: HashMap<String, String> = HashMap::new();
|
||||
for c in og_regex().captures_iter(html) {
|
||||
if let (Some(name), Some(content)) = (c.get(1), c.get(2)) {
|
||||
map.entry(name.as_str().to_string())
|
||||
.or_insert_with(|| content.as_str().to_string());
|
||||
}
|
||||
}
|
||||
OgMeta(map)
|
||||
}
|
||||
|
||||
/// Parsed `og:*` properties from a single document scan.
|
||||
pub(crate) struct OgMeta(HashMap<String, String>);
|
||||
|
||||
impl OgMeta {
|
||||
/// Raw content of `og:<prop>`, exactly as it appeared in the HTML.
|
||||
pub(crate) fn raw(&self, prop: &str) -> Option<String> {
|
||||
self.0.get(prop).cloned()
|
||||
}
|
||||
|
||||
/// Content of `og:<prop>` with the common HTML entities decoded.
|
||||
pub(crate) fn unescaped(&self, prop: &str) -> Option<String> {
|
||||
self.0.get(prop).map(|v| html_unescape(v))
|
||||
}
|
||||
}
|
||||
|
||||
/// Decode the small set of HTML entities that show up in `og:*` content:
/// `&quot;`, `&lt;`, `&gt;` and `&amp;`.
///
/// `&amp;` is decoded last so every entity is unwrapped exactly once. Decoding
/// it earlier would turn an escaped `&amp;lt;` into `&lt;` and then into `<`,
/// i.e. decode the text twice. Any other entity is left untouched.
pub(crate) fn html_unescape(s: &str) -> String {
    s.replace("&quot;", "\"")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&amp;", "&")
}
|
||||
|
|
@ -28,6 +28,7 @@ use serde::Deserialize;
|
|||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use super::og::parse_og;
|
||||
use crate::cloud::{self, CloudError};
|
||||
use crate::error::FetchError;
|
||||
use crate::fetcher::Fetcher;
|
||||
|
|
@ -181,24 +182,27 @@ async fn html_fallback(
|
|||
pub fn parse_html(html: &str, url: &str, api_url: &str, slug: &str) -> Value {
|
||||
let article = find_article_jsonld(html);
|
||||
|
||||
// Single scan for the four og:* fields read as fallbacks below.
|
||||
let og_meta = parse_og(html);
|
||||
|
||||
let title = article
|
||||
.as_ref()
|
||||
.and_then(|v| get_text(v, "headline"))
|
||||
.or_else(|| og(html, "title"));
|
||||
.or_else(|| og_meta.raw("title"));
|
||||
let description = article
|
||||
.as_ref()
|
||||
.and_then(|v| get_text(v, "description"))
|
||||
.or_else(|| og(html, "description"));
|
||||
.or_else(|| og_meta.raw("description"));
|
||||
let cover_image = article
|
||||
.as_ref()
|
||||
.and_then(get_first_image)
|
||||
.or_else(|| og(html, "image"));
|
||||
.or_else(|| og_meta.raw("image"));
|
||||
let post_date = article
|
||||
.as_ref()
|
||||
.and_then(|v| get_text(v, "datePublished"))
|
||||
.or_else(|| meta_property(html, "article:published_time"));
|
||||
let updated_at = article.as_ref().and_then(|v| get_text(v, "dateModified"));
|
||||
let publication_name = og(html, "site_name");
|
||||
let publication_name = og_meta.raw("site_name");
|
||||
let authors = article.as_ref().map(extract_authors).unwrap_or_default();
|
||||
|
||||
json!({
|
||||
|
|
@ -302,19 +306,6 @@ fn handle_from_author_url(u: &str) -> Option<String> {
|
|||
// HTML tag helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn og(html: &str, prop: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
for c in re.captures_iter(html) {
|
||||
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||
return c.get(2).map(|m| m.as_str().to_string());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Pull `<meta property="article:published_time" content="...">` and
|
||||
/// similar structured meta tags.
|
||||
fn meta_property(html: &str, prop: &str) -> Option<String> {
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ use regex::Regex;
|
|||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use super::og::parse_og;
|
||||
use crate::cloud::{self, CloudError};
|
||||
use crate::error::FetchError;
|
||||
use crate::fetcher::Fetcher;
|
||||
|
|
@ -87,11 +88,17 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {
|
|||
// The aiSummary block: not typed (no `@type`), detect by key.
|
||||
let ai_block = find_ai_summary_block(&blocks);
|
||||
|
||||
// Single scan of the page's og:* meta tags; title + description feed
|
||||
// the regex fallbacks below.
|
||||
let og_meta = parse_og(html);
|
||||
let og_title = og_meta.unescaped("title");
|
||||
let og_description = og_meta.unescaped("description");
|
||||
|
||||
// Business name: Dataset > metadata.title regex > URL domain.
|
||||
let business_name = dataset
|
||||
.as_ref()
|
||||
.and_then(|d| get_string(d, "name"))
|
||||
.or_else(|| parse_name_from_og_title(html))
|
||||
.or_else(|| parse_name_from_og_title(og_title.as_deref()))
|
||||
.or_else(|| Some(domain.clone()));
|
||||
|
||||
// Rating distribution from the csvw:Table columns. Each column has
|
||||
|
|
@ -105,8 +112,8 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {
|
|||
|
||||
// Page-title / page-description fallbacks. OG title format:
|
||||
// "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
|
||||
let (rating_label, rating_from_og) = parse_rating_from_og_title(html);
|
||||
let total_from_desc = parse_review_count_from_og_description(html);
|
||||
let (rating_label, rating_from_og) = parse_rating_from_og_title(og_title.as_deref());
|
||||
let total_from_desc = parse_review_count_from_og_description(og_description.as_deref());
|
||||
|
||||
// Recent reviews carried by the aiSummary block.
|
||||
let recent_reviews: Vec<Value> = ai_block
|
||||
|
|
@ -336,20 +343,21 @@ fn compute_rating_stats(distribution: &Value) -> (Option<String>, Option<i64>) {
|
|||
|
||||
/// Regex out the business name from the standard Trustpilot OG title
|
||||
/// shape: `"{name} is rated \"{label}\" with {rating} / 5 on Trustpilot"`.
|
||||
fn parse_name_from_og_title(html: &str) -> Option<String> {
|
||||
let title = og(html, "title")?;
|
||||
/// `title` is the (entity-decoded) `og:title` content.
|
||||
fn parse_name_from_og_title(title: Option<&str>) -> Option<String> {
|
||||
let title = title?;
|
||||
// "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| Regex::new(r"^(.+?)\s+is rated\b").unwrap());
|
||||
re.captures(&title)
|
||||
re.captures(title)
|
||||
.and_then(|c| c.get(1))
|
||||
.map(|m| m.as_str().to_string())
|
||||
}
|
||||
|
||||
/// Pull the rating label (e.g. "Bad", "Excellent") and numeric value
|
||||
/// from the OG title.
|
||||
fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
|
||||
let Some(title) = og(html, "title") else {
|
||||
/// from the (entity-decoded) `og:title` content.
|
||||
fn parse_rating_from_og_title(title: Option<&str>) -> (Option<String>, Option<String>) {
|
||||
let Some(title) = title else {
|
||||
return (None, None);
|
||||
};
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
|
|
@ -357,7 +365,7 @@ fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
|
|||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"is rated\s*[\\"]+([^"\\]+)[\\"]+\s*with\s*([\d.]+)\s*/\s*5"#).unwrap()
|
||||
});
|
||||
let Some(caps) = re.captures(&title) else {
|
||||
let Some(caps) = re.captures(title) else {
|
||||
return (None, None);
|
||||
};
|
||||
(
|
||||
|
|
@ -366,13 +374,13 @@ fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
|
|||
)
|
||||
}
|
||||
|
||||
/// Parse "hear what 226 customers have already said" from the OG
|
||||
/// description tag.
|
||||
fn parse_review_count_from_og_description(html: &str) -> Option<i64> {
|
||||
let desc = og(html, "description")?;
|
||||
/// Parse "hear what 226 customers have already said" from the
|
||||
/// (entity-decoded) `og:description` content.
|
||||
fn parse_review_count_from_og_description(desc: Option<&str>) -> Option<i64> {
|
||||
let desc = desc?;
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| Regex::new(r"(\d[\d,]*)\s+customers").unwrap());
|
||||
re.captures(&desc)?
|
||||
re.captures(desc)?
|
||||
.get(1)?
|
||||
.as_str()
|
||||
.replace(',', "")
|
||||
|
|
@ -380,29 +388,6 @@ fn parse_review_count_from_og_description(html: &str) -> Option<i64> {
|
|||
.ok()
|
||||
}
|
||||
|
||||
fn og(html: &str, prop: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
for c in re.captures_iter(html) {
|
||||
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||
let raw = c.get(2).map(|m| m.as_str())?;
|
||||
return Some(html_unescape(raw));
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Minimal HTML entity unescaping for the three entities the
|
||||
/// synthesize_html escaper might produce. Keeps us off a heavier dep.
|
||||
fn html_unescape(s: &str) -> String {
|
||||
s.replace(""", "\"")
|
||||
.replace("&", "&")
|
||||
.replace("<", "<")
|
||||
.replace(">", ">")
|
||||
}
|
||||
|
||||
fn get_string(v: &Value, key: &str) -> Option<String> {
|
||||
v.get(key).and_then(|x| x.as_str().map(String::from))
|
||||
}
|
||||
|
|
@ -488,8 +473,12 @@ mod tests {
|
|||
#[test]
fn parse_og_title_extracts_name_and_rating() {
    // The rating label is wrapped in `&quot;` entities inside the attribute; a
    // bare `"` would end `content="..."` right after "is rated ".
    let html = r#"<meta property="og:title" content="Anthropic is rated &quot;Bad&quot; with 1.5 / 5 on Trustpilot">"#;
    // The helpers take the already entity-decoded og:title content.
    let title = parse_og(html).unescaped("title");
    assert_eq!(
        parse_name_from_og_title(title.as_deref()),
        Some("Anthropic".into())
    );
    let (label, rating) = parse_rating_from_og_title(title.as_deref());
    assert_eq!(label.as_deref(), Some("Bad"));
    assert_eq!(rating.as_deref(), Some("1.5"));
}
|
||||
|
|
@ -497,7 +486,11 @@ mod tests {
|
|||
#[test]
|
||||
fn parse_review_count_from_og_description_picks_number() {
|
||||
let html = r#"<meta property="og:description" content="Do you agree? Voice your opinion today and hear what 226 customers have already said.">"#;
|
||||
assert_eq!(parse_review_count_from_og_description(html), Some(226));
|
||||
let desc = parse_og(html).unescaped("description");
|
||||
assert_eq!(
|
||||
parse_review_count_from_og_description(desc.as_deref()),
|
||||
Some(226)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ use regex::Regex;
|
|||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use super::og::parse_og;
|
||||
use crate::error::FetchError;
|
||||
use crate::fetcher::Fetcher;
|
||||
|
||||
|
|
@ -143,9 +144,11 @@ fn build_player_payload(
|
|||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn build_og_fallback(html: &str, url: &str, canonical: &str, video_id: &str) -> Value {
|
||||
let title = og(html, "title");
|
||||
let description = og(html, "description");
|
||||
let thumbnail = og(html, "image");
|
||||
// Single scan for the three og:* fields read below.
|
||||
let og_meta = parse_og(html);
|
||||
let title = og_meta.raw("title");
|
||||
let description = og_meta.raw("description");
|
||||
let thumbnail = og_meta.raw("image");
|
||||
// YouTube sets `<meta name="channel_name" ...>` on some pages but
|
||||
// OG-only pages reliably carry `og:video:tag` and the channel in
|
||||
// `<link itemprop="name">`. We keep this lean: just what's stable.
|
||||
|
|
@ -248,19 +251,6 @@ fn extract_player_response(html: &str) -> Option<Value> {
|
|||
// Meta-tag helpers (for OG fallback)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn og(html: &str, prop: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
for c in re.captures_iter(html) {
|
||||
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||
return c.get(2).map(|m| m.as_str().to_string());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn meta_name(html: &str, name: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
|
|
|
|||
|
|
@ -538,7 +538,11 @@ pub fn build_client(
|
|||
max_redirects as usize,
|
||||
))
|
||||
.cookie_store(true)
|
||||
.timeout(timeout);
|
||||
.timeout(timeout)
|
||||
.connect_timeout(Duration::from_secs(5))
|
||||
.pool_idle_timeout(Duration::from_secs(90))
|
||||
.pool_max_idle_per_host(8)
|
||||
.tcp_keepalive(Duration::from_secs(60));
|
||||
|
||||
if let Some(proxy_url) = proxy {
|
||||
let proxy = wreq::Proxy::all(proxy_url).map_err(|_| {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue