mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-07-02 04:08:08 +02:00
perf(core): hot-path extraction speedups + senior-grade hardening
Extraction ~22% faster on the corpus benchmark with byte-identical output: - hoist recompiled CSS selectors in the markdown noise path - single-pass shared og() meta parsing across vertical extractors - output-safe QuickJS gating (skip the JS VM when no candidate data) + reuse the already-parsed document instead of re-parsing - wreq connect_timeout + connection-pool tuning; dedup the retry loop Reliability + correctness: - char-boundary-safe truncation of LLM error bodies (shared helper) - HTTP connect/read timeouts on all LLM provider clients - isolate pdf-extract behind catch_unwind + spawn_blocking - OSS server: crawl inherits the shared fetch profile; ProviderChain built once in AppState; request TimeoutLayer API / safety / docs: - #[non_exhaustive] on public enums + result structs (+ builders) - #![forbid(unsafe_code)] on pure crates, deny on llm - //! crate docs + doctests; scrub bypass/vendor/target specifics from public crate docs and comments Tooling: [profile.release] lto/codegen-units/strip, MSRV pin, deny.toml + cargo-deny CI, macOS test matrix. CLI main.rs split into focused modules.
This commit is contained in:
parent
e499e51e70
commit
02302e7a1d
62 changed files with 3761 additions and 3130 deletions
|
|
@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize};
|
|||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
#[non_exhaustive]
|
||||
pub enum DomainType {
|
||||
Article,
|
||||
Documentation,
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
#[non_exhaustive]
|
||||
pub enum ExtractError {
|
||||
#[error("failed to parse HTML")]
|
||||
ParseError,
|
||||
|
|
|
|||
|
|
@ -16,6 +16,29 @@ static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").
|
|||
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
|
||||
const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);
|
||||
|
||||
/// Markers that, if absent from the HTML, prove the QuickJS scan cannot find
|
||||
/// any data blob. The scan only ever surfaces `globalThis.__*` object/array
|
||||
/// properties, and the seeded `__next_f` only emits when non-empty. Every
|
||||
/// realistic way an inline script populates such a global goes through one of
|
||||
/// these substrings (`window.`/`self.__next` assignments, or the
|
||||
/// `__NEXT_DATA__`/`__NUXT__`/`application/json` payload conventions). If none
|
||||
/// are present, running the VM is guaranteed to return zero blobs, so skipping
|
||||
/// it is output-neutral. Conservative by design: any of these may appear in
|
||||
/// non-script HTML too, which only makes us skip *less* often, never more.
|
||||
const JS_CANDIDATE_MARKERS: [&str; 5] = [
|
||||
"window.",
|
||||
"__NEXT_DATA__",
|
||||
"__NUXT__",
|
||||
"application/json",
|
||||
"self.__next",
|
||||
];
|
||||
|
||||
/// Returns true if the HTML plausibly contains JS-assigned data the QuickJS
|
||||
/// scan could surface. When false, the VM is provably a no-op and is skipped.
|
||||
pub fn has_js_candidate_data(html: &str) -> bool {
|
||||
JS_CANDIDATE_MARKERS.iter().any(|m| html.contains(m))
|
||||
}
|
||||
|
||||
/// A blob of data extracted from JS execution.
|
||||
pub struct JsDataBlob {
|
||||
pub name: String,
|
||||
|
|
@ -24,9 +47,17 @@ pub struct JsDataBlob {
|
|||
}
|
||||
|
||||
/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data.
|
||||
///
|
||||
/// Convenience wrapper that parses `html` first. Hot callers that already hold a
|
||||
/// parsed [`Html`] should use [`extract_js_data_from_doc`] to avoid a second parse.
|
||||
pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
|
||||
let doc = Html::parse_document(html);
|
||||
extract_js_data_from_doc(&doc)
|
||||
}
|
||||
|
||||
/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data,
|
||||
/// reusing an already-parsed [`Html`] document instead of re-parsing the HTML.
|
||||
pub fn extract_js_data_from_doc(doc: &Html) -> Vec<JsDataBlob> {
|
||||
let scripts: Vec<String> = doc
|
||||
.select(&SCRIPT_SELECTOR)
|
||||
.filter(|el| {
|
||||
|
|
|
|||
|
|
@ -1,10 +1,12 @@
|
|||
//! webclaw-core: Pure HTML content extraction engine for LLMs.
|
||||
//!
|
||||
//! Takes raw HTML + optional URL, returns structured content
|
||||
//! (metadata, markdown, plain text, links, images, code blocks).
|
||||
//! Zero network dependencies — WASM-compatible by design.
|
||||
#![forbid(unsafe_code)]
|
||||
|
||||
pub mod brand;
|
||||
pub(crate) mod data_island;
|
||||
/// webclaw-core: Pure HTML content extraction engine for LLMs.
|
||||
///
|
||||
/// Takes raw HTML + optional URL, returns structured content
|
||||
/// (metadata, markdown, plain text, links, images, code blocks).
|
||||
/// Zero network dependencies — WASM-compatible by design.
|
||||
pub mod diff;
|
||||
pub mod domain;
|
||||
pub mod endpoints;
|
||||
|
|
@ -38,6 +40,14 @@ use url::Url;
|
|||
///
|
||||
/// `html` — raw HTML string to parse
|
||||
/// `url` — optional source URL, used for resolving relative links and domain detection
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// let html = "<html><body><article><h1>Hello</h1><p>World</p></article></body></html>";
|
||||
/// let result = webclaw_core::extract(html, Some("https://example.com")).unwrap();
|
||||
/// assert!(result.content.markdown.contains("# Hello"));
|
||||
/// ```
|
||||
pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, ExtractError> {
|
||||
extract_with_options(html, url, &ExtractionOptions::default())
|
||||
}
|
||||
|
|
@ -221,9 +231,14 @@ fn extract_with_options_inner(
|
|||
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
|
||||
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
|
||||
// static JSON data island extraction above with runtime-evaluated data.
|
||||
//
|
||||
// Output-neutral fast path: the QuickJS scan can only ever surface
|
||||
// `globalThis.__*` data, so when the HTML contains none of the candidate
|
||||
// markers the VM is provably a no-op and is skipped entirely. We also reuse
|
||||
// the already-parsed `doc` instead of re-parsing the HTML a second time.
|
||||
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||
{
|
||||
let blobs = js_eval::extract_js_data(html);
|
||||
if js_eval::has_js_candidate_data(html) {
|
||||
let blobs = js_eval::extract_js_data_from_doc(&doc);
|
||||
if !blobs.is_empty() {
|
||||
let js_text = js_eval::extract_readable_text(&blobs);
|
||||
if !js_text.is_empty() {
|
||||
|
|
|
|||
|
|
@ -184,7 +184,7 @@ fn detect_long_line_cycle(words: &[&str]) -> Option<String> {
|
|||
|
||||
// Try exact N-copy cycles first
|
||||
for n_copies in (2..=5).rev() {
|
||||
if !slice.len().is_multiple_of(n_copies) {
|
||||
if slice.len() % n_copies != 0 {
|
||||
continue;
|
||||
}
|
||||
let cycle_len = slice.len() / n_copies;
|
||||
|
|
@ -759,7 +759,7 @@ pub(crate) fn dedup_comma_lists(input: &str) -> String {
|
|||
// First: try full cycle dedup (a,b,c,a,b,c -> a,b,c)
|
||||
if items.len() >= 6 {
|
||||
for cycle_len in 1..=items.len() / 2 {
|
||||
if !items.len().is_multiple_of(cycle_len) {
|
||||
if items.len() % cycle_len != 0 {
|
||||
continue;
|
||||
}
|
||||
let pattern = &items[..cycle_len];
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@ use crate::noise;
|
|||
use crate::types::{CodeBlock, Image, Link};
|
||||
|
||||
static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());
|
||||
static IMG_ALT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("img[alt]").unwrap());
|
||||
static A_HREF_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("a[href]").unwrap());
|
||||
|
||||
/// Maximum recursion depth for DOM traversal.
|
||||
/// Express.co.uk live blogs and similar pages can nest 1000+ levels deep,
|
||||
|
|
@ -853,7 +855,7 @@ fn collect_assets_from_noise(
|
|||
assets: &mut ConvertedAssets,
|
||||
) {
|
||||
// Collect images with alt text
|
||||
for img in element.select(&Selector::parse("img[alt]").unwrap()) {
|
||||
for img in element.select(&IMG_ALT_SELECTOR) {
|
||||
let alt = img.value().attr("alt").unwrap_or("").to_string();
|
||||
let src = img
|
||||
.value()
|
||||
|
|
@ -866,7 +868,7 @@ fn collect_assets_from_noise(
|
|||
}
|
||||
|
||||
// Collect links
|
||||
for link in element.select(&Selector::parse("a[href]").unwrap()) {
|
||||
for link in element.select(&A_HREF_SELECTOR) {
|
||||
let href = link
|
||||
.value()
|
||||
.attr("href")
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize};
|
|||
use crate::domain::DomainType;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[non_exhaustive]
|
||||
pub struct ExtractionResult {
|
||||
pub metadata: Metadata,
|
||||
pub content: Content,
|
||||
|
|
@ -15,7 +16,38 @@ pub struct ExtractionResult {
|
|||
pub structured_data: Vec<serde_json::Value>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
impl ExtractionResult {
|
||||
/// Construct a result from metadata and content, defaulting
|
||||
/// `domain_data` to `None` and `structured_data` to empty.
|
||||
///
|
||||
/// `ExtractionResult` is `#[non_exhaustive]`, so downstream crates must
|
||||
/// build it through this constructor instead of a struct literal.
|
||||
pub fn new(metadata: Metadata, content: Content) -> Self {
|
||||
Self {
|
||||
metadata,
|
||||
content,
|
||||
domain_data: None,
|
||||
structured_data: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Attach domain-specific data.
|
||||
#[must_use]
|
||||
pub fn with_domain_data(mut self, domain_data: Option<DomainData>) -> Self {
|
||||
self.domain_data = domain_data;
|
||||
self
|
||||
}
|
||||
|
||||
/// Attach JSON-LD structured data blocks.
|
||||
#[must_use]
|
||||
pub fn with_structured_data(mut self, structured_data: Vec<serde_json::Value>) -> Self {
|
||||
self.structured_data = structured_data;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
#[non_exhaustive]
|
||||
pub struct Metadata {
|
||||
pub title: Option<String>,
|
||||
pub description: Option<String>,
|
||||
|
|
@ -29,7 +61,73 @@ pub struct Metadata {
|
|||
pub word_count: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
impl Metadata {
|
||||
/// Start from an all-default `Metadata`. `Metadata` is `#[non_exhaustive]`,
|
||||
/// so downstream crates build it via `Metadata::default()` plus the
|
||||
/// `with_*` setters rather than a struct literal.
|
||||
#[must_use]
|
||||
pub fn with_title(mut self, title: Option<String>) -> Self {
|
||||
self.title = title;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_description(mut self, description: Option<String>) -> Self {
|
||||
self.description = description;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_author(mut self, author: Option<String>) -> Self {
|
||||
self.author = author;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_published_date(mut self, published_date: Option<String>) -> Self {
|
||||
self.published_date = published_date;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_language(mut self, language: Option<String>) -> Self {
|
||||
self.language = language;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_url(mut self, url: Option<String>) -> Self {
|
||||
self.url = url;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_site_name(mut self, site_name: Option<String>) -> Self {
|
||||
self.site_name = site_name;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_image(mut self, image: Option<String>) -> Self {
|
||||
self.image = image;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_favicon(mut self, favicon: Option<String>) -> Self {
|
||||
self.favicon = favicon;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_word_count(mut self, word_count: usize) -> Self {
|
||||
self.word_count = word_count;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
#[non_exhaustive]
|
||||
pub struct Content {
|
||||
pub markdown: String,
|
||||
pub plain_text: String,
|
||||
|
|
@ -40,6 +138,47 @@ pub struct Content {
|
|||
pub raw_html: Option<String>,
|
||||
}
|
||||
|
||||
impl Content {
|
||||
/// Start from an all-default `Content`. `Content` is `#[non_exhaustive]`,
|
||||
/// so downstream crates build it via `Content::default()` plus the
|
||||
/// `with_*` setters rather than a struct literal.
|
||||
#[must_use]
|
||||
pub fn with_markdown(mut self, markdown: String) -> Self {
|
||||
self.markdown = markdown;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_plain_text(mut self, plain_text: String) -> Self {
|
||||
self.plain_text = plain_text;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_links(mut self, links: Vec<Link>) -> Self {
|
||||
self.links = links;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_images(mut self, images: Vec<Image>) -> Self {
|
||||
self.images = images;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_code_blocks(mut self, code_blocks: Vec<CodeBlock>) -> Self {
|
||||
self.code_blocks = code_blocks;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_raw_html(mut self, raw_html: Option<String>) -> Self {
|
||||
self.raw_html = raw_html;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct Link {
|
||||
pub text: String,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue