webclaw/crates/webclaw-core/src/lib.rs

//! webclaw-core: Pure HTML content extraction engine for LLMs.
//!
//! Takes raw HTML + optional URL, returns structured content
//! (metadata, markdown, plain text, links, images, code blocks).
//! Zero network dependencies — WASM-compatible by design.

pub mod brand;
pub(crate) mod data_island;
pub mod diff;
pub mod domain;
pub mod error;
pub mod extractor;
// Compiled only when the `quickjs` Cargo feature is enabled.
#[cfg(feature = "quickjs")]
pub mod js_eval;
pub mod llm;
pub mod markdown;
pub mod metadata;
// NOTE(review): dead_code is allowed for the whole module — presumably some
// helpers in `noise` are not referenced yet; confirm and narrow the allow if possible.
#[allow(dead_code)]
pub(crate) mod noise;
pub mod structured_data;
pub mod types;
pub mod youtube;

pub use brand::BrandIdentity;
pub use diff::{ChangeStatus, ContentDiff, MetadataChange};
pub use domain::DomainType;
pub use error::ExtractError;
pub use llm::to_llm_text;
pub use types::{
    CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
};

use scraper::Html;
use url::Url;

/// Extract structured content from raw HTML using the default [`ExtractionOptions`].
///
/// This is a convenience wrapper: it forwards to [`extract_with_options`] with
/// `ExtractionOptions::default()` and returns whatever that call returns.
///
/// `html` — raw HTML string to parse
/// `url`  — optional source URL, used for resolving relative links and domain detection
///
/// # Errors
///
/// Returns any [`ExtractError`] reported by [`extract_with_options`].
pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, ExtractError> {
    let default_options = ExtractionOptions::default();
    extract_with_options(html, url, &default_options)
}

/// Extract structured content from raw HTML with configurable options.
///
/// `html`    — raw HTML string to parse
/// `url`     — optional source URL, used for resolving relative links and domain detection
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
pub fn extract_with_options(
    html: &str,
    url: Option<&str>,
    options: &ExtractionOptions,
) -> Result<ExtractionResult, ExtractError> {
    if html.is_empty() {
        return Err(ExtractError::NoContent);
    }

    // YouTube fast path: if the URL is a YouTube video page, try extracting
    // structured metadata from ytInitialPlayerResponse before DOM scoring.
    // This gives LLMs a clean, structured view of video metadata.
    if let Some(u) = url
        && youtube::is_youtube_url(u)
        && let Some(yt_md) = youtube::try_extract(html)
    {
        let doc = Html::parse_document(html);
        let mut meta = metadata::extract(&doc, url);
        meta.word_count = extractor::word_count(&yt_md);

        let plain_text = yt_md
            .lines()
            .filter(|l| !l.starts_with('#') && !l.starts_with("**"))
            .collect::<Vec<_>>()
            .join("\n")
            .trim()
            .to_string();

        let domain_data = Some(DomainData {
            domain_type: DomainType::Social,
        });

        let structured_data = structured_data::extract_json_ld(html);

        return Ok(ExtractionResult {
            metadata: meta,
            content: Content {
                markdown: yt_md,
                plain_text,
                links: Vec::new(),
                images: Vec::new(),
                code_blocks: Vec::new(),
                raw_html: None,
            },
            domain_data,
            structured_data,
        });
    }

    let doc = Html::parse_document(html);

    let base_url = url
        .map(|u| Url::parse(u).map_err(|_| ExtractError::InvalidUrl(u.to_string())))
        .transpose()?;

    // Metadata from <head>
    let mut meta = metadata::extract(&doc, url);

    // Main content extraction (Readability-style scoring + markdown conversion)
    let mut content = extractor::extract_content(&doc, base_url.as_ref(), options);
    // Use the higher of plain_text and markdown word counts.
    // Some pages (headings + links) have content in markdown but empty plain_text.
    let pt_wc = extractor::word_count(&content.plain_text);
    let md_wc = extractor::word_count(&content.markdown);
    meta.word_count = pt_wc.max(md_wc);

    // Retry fallback: if extraction captured too little of the page's visible content,
    // retry with wider strategies. The scorer sometimes picks a tiny node (e.g., an
    // <article> with 52 words when the body has 1300 words of real content).
    //
    // Strategy 1: retry without only_main_content restriction
    if options.only_main_content && meta.word_count < 30 {
        let relaxed = ExtractionOptions {
            only_main_content: false,
            ..options.clone()
        };
        let retry = extractor::extract_content(&doc, base_url.as_ref(), &relaxed);
        let retry_wc =
            extractor::word_count(&retry.plain_text).max(extractor::word_count(&retry.markdown));
        if retry_wc > meta.word_count {
            content = retry;
            meta.word_count = retry_wc;
        }
    }

    // Strategy 2: if scored extraction is sparse (<200 words) AND the page has
    // significantly more visible text, retry with include_selectors: ["body"].
    // This bypasses the readability scorer entirely — catches blogs, pricing
    // pages, and modern sites where no single element scores well.
    if meta.word_count < 200 && options.include_selectors.is_empty() {
        let body_opts = ExtractionOptions {
            include_selectors: vec!["body".to_string()],
            exclude_selectors: options.exclude_selectors.clone(),
            only_main_content: false,
            include_raw_html: false,
        };
        let body_content = extractor::extract_content(&doc, base_url.as_ref(), &body_opts);
        let body_wc = extractor::word_count(&body_content.plain_text)
            .max(extractor::word_count(&body_content.markdown));
        // Use body extraction if it captures significantly more content (>2x)
        if body_wc > meta.word_count * 2 && body_wc > 50 {
            content = body_content;
            meta.word_count = body_wc;
        }
    }

    // Fallback: if DOM extraction was sparse, try JSON data islands
    // (React SPAs, Next.js, Contentful CMS embed page data in <script> tags)
    if let Some(island_md) = data_island::try_extract(&doc, meta.word_count, &content.markdown) {
        content.markdown.push_str("\n\n");
        content.markdown.push_str(&island_md);
        meta.word_count = extractor::word_count(&content.markdown);
    }

    // QuickJS: execute inline <script> tags to capture JS-assigned data blobs
    // (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
    // static JSON data island extraction above with runtime-evaluated data.
    #[cfg(feature = "quickjs")]
    {
        let blobs = js_eval::extract_js_data(html);
        if !blobs.is_empty() {
            let js_text = js_eval::extract_readable_text(&blobs);
            if !js_text.is_empty() {
                content.markdown.push_str("\n\n");
                content.markdown.push_str(&js_text);
                meta.word_count = extractor::word_count(&content.markdown);
            }
        }
    }

    // Domain detection from URL patterns and DOM heuristics
    let domain_type = domain::detect(url, html);
    let domain_data = Some(DomainData { domain_type });

    // JSON-LD structured data (Schema.org Product, Article, etc.)
    let structured_data = structured_data::extract_json_ld(html);

    Ok(ExtractionResult {
        metadata: meta,
        content,
        domain_data,
        structured_data,
    })
}

/// End-to-end tests for `extract` / `extract_with_options`.
///
/// Each test feeds a small inline HTML fixture through the public entry points
/// and asserts on the resulting metadata, markdown, links, or serialized JSON.
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the whole pipeline on an article-style page and checks metadata,
    /// markdown formatting, links, code blocks, the default absence of
    /// `raw_html`, and the detected domain type.
    #[test]
    fn full_extraction_pipeline() {
        let html = r#"
        <html lang="en">
        <head>
            <title>Rust is Great</title>
            <meta name="description" content="An article about Rust">
            <meta name="author" content="Bob">
        </head>
        <body>
            <nav><a href="/">Home</a> | <a href="/about">About</a></nav>
            <article>
                <h1>Why Rust is Great</h1>
                <p>Rust gives you <strong>memory safety</strong> without a garbage collector.
                This is achieved through its <em>ownership system</em>.</p>
                <p>Here is an example:</p>
                <pre><code class="language-rust">fn main() {
    println!("Hello, world!");
}</code></pre>
                <p>Learn more at <a href="https://rust-lang.org">rust-lang.org</a>.</p>
            </article>
            <footer>Copyright 2025</footer>
        </body>
        </html>"#;

        let result = extract(html, Some("https://blog.example.com/rust")).unwrap();

        // Metadata
        assert_eq!(result.metadata.title.as_deref(), Some("Rust is Great"));
        assert_eq!(
            result.metadata.description.as_deref(),
            Some("An article about Rust")
        );
        assert_eq!(result.metadata.author.as_deref(), Some("Bob"));
        assert_eq!(result.metadata.language.as_deref(), Some("en"));
        assert!(result.metadata.word_count > 0);

        // Content
        assert!(result.content.markdown.contains("# Why Rust is Great"));
        assert!(result.content.markdown.contains("**memory safety**"));
        assert!(result.content.markdown.contains("```rust"));
        assert!(
            result
                .content
                .links
                .iter()
                .any(|l| l.href == "https://rust-lang.org")
        );
        assert!(!result.content.code_blocks.is_empty());

        // raw_html not populated by default
        assert!(result.content.raw_html.is_none());

        // Domain — blog.example.com has <article> tag
        let dd = result.domain_data.unwrap();
        assert_eq!(dd.domain_type, DomainType::Article);
    }

    /// A URL that cannot be parsed must surface as `ExtractError::InvalidUrl`.
    #[test]
    fn invalid_url_returns_error() {
        let result = extract("<html></html>", Some("not a url"));
        assert!(matches!(result, Err(ExtractError::InvalidUrl(_))));
    }

    /// Empty input is rejected up front with `ExtractError::NoContent`.
    #[test]
    fn empty_html_returns_error() {
        let result = extract("", None);
        assert!(matches!(result, Err(ExtractError::NoContent)));
    }

    /// The URL is optional: extraction succeeds when none is supplied.
    #[test]
    fn no_url_is_fine() {
        let result = extract("<html><body><p>Hello</p></body></html>", None);
        assert!(result.is_ok());
    }

    /// The result serializes to JSON with the top-level sections present and
    /// without a `raw_html` key when raw HTML was not requested.
    #[test]
    fn serializes_to_json() {
        let result = extract("<html><body><p>Test</p></body></html>", None).unwrap();
        let json = serde_json::to_string_pretty(&result).unwrap();
        assert!(json.contains("metadata"));
        assert!(json.contains("content"));
        // raw_html should be absent (skip_serializing_if)
        assert!(!json.contains("raw_html"));
    }

    /// A YouTube watch URL whose page embeds `ytInitialPlayerResponse` takes the
    /// fast path: markdown is built from the video details and the domain type
    /// is `Social`.
    #[test]
    fn youtube_extraction_produces_structured_markdown() {
        let html = r#"
        <html><head><title>Rust in 100 Seconds - YouTube</title></head>
        <body>
        <script>
        var ytInitialPlayerResponse = {"videoDetails":{"title":"Rust in 100 Seconds","author":"Fireship","viewCount":"5432100","shortDescription":"Learn Rust in 100 seconds. A mass of web developers are mass adopting Rust.","lengthSeconds":"120"},"microformat":{"playerMicroformatRenderer":{"uploadDate":"2023-01-15"}}};
        </script>
        </body></html>
        "#;

        let result = extract(html, Some("https://www.youtube.com/watch?v=5C_HPTJg5ek")).unwrap();

        assert!(result.content.markdown.contains("# Rust in 100 Seconds"));
        assert!(result.content.markdown.contains("**Channel:** Fireship"));
        // lengthSeconds "120" is expected to be rendered as "2:00"
        assert!(result.content.markdown.contains("2:00"));
        assert!(
            result
                .content
                .markdown
                .contains("Learn Rust in 100 seconds")
        );

        // Should be detected as Social domain
        let dd = result.domain_data.unwrap();
        assert_eq!(dd.domain_type, DomainType::Social);
    }

    /// A YouTube URL without an embedded player response must not fail; the
    /// regular DOM pipeline handles the page instead.
    #[test]
    fn youtube_url_without_player_response_falls_through() {
        // If ytInitialPlayerResponse is missing, fall through to normal extraction
        let html = r#"<html><body><article><h1>Some YouTube Page</h1><p>Content here for testing.</p></article></body></html>"#;
        let result = extract(html, Some("https://www.youtube.com/watch?v=abc123")).unwrap();

        // Should still extract something via normal pipeline
        assert!(result.content.markdown.contains("Some YouTube Page"));
    }

    // --- ExtractionOptions tests ---

    /// Elements matching `exclude_selectors` are removed from the markdown.
    #[test]
    fn test_exclude_selectors() {
        let html = r#"<html><body>
            <nav>Navigation stuff</nav>
            <article><h1>Title</h1><p>Real content here.</p></article>
            <footer>Footer stuff</footer>
        </body></html>"#;

        let options = ExtractionOptions {
            exclude_selectors: vec!["nav".into(), "footer".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(result.content.markdown.contains("Real content"));
        assert!(
            !result.content.markdown.contains("Navigation stuff"),
            "nav should be excluded"
        );
        assert!(
            !result.content.markdown.contains("Footer stuff"),
            "footer should be excluded"
        );
    }

    /// With `include_selectors` set, only the matching elements contribute to
    /// the markdown; siblings outside the selection are left out.
    #[test]
    fn test_include_selectors() {
        let html = r#"<html><body>
            <nav>Navigation stuff</nav>
            <article><h1>Title</h1><p>Real content here.</p></article>
            <div class="sidebar">Sidebar junk</div>
            <footer>Footer stuff</footer>
        </body></html>"#;

        let options = ExtractionOptions {
            include_selectors: vec!["article".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(result.content.markdown.contains("Title"));
        assert!(result.content.markdown.contains("Real content"));
        assert!(
            !result.content.markdown.contains("Navigation stuff"),
            "nav should not be included"
        );
        assert!(
            !result.content.markdown.contains("Sidebar junk"),
            "sidebar should not be included"
        );
        assert!(
            !result.content.markdown.contains("Footer stuff"),
            "footer should not be included"
        );
    }

    /// Include and exclude selectors compose: exclusions also apply to
    /// elements nested inside an included element.
    #[test]
    fn test_include_and_exclude() {
        let html = r#"<html><body>
            <article>
                <h1>Title</h1>
                <p>Real content here.</p>
                <div class="sidebar">Sidebar inside article</div>
            </article>
            <footer>Footer stuff</footer>
        </body></html>"#;

        let options = ExtractionOptions {
            include_selectors: vec!["article".into()],
            exclude_selectors: vec![".sidebar".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(result.content.markdown.contains("Title"));
        assert!(result.content.markdown.contains("Real content"));
        assert!(
            !result.content.markdown.contains("Sidebar inside article"),
            "sidebar inside article should be excluded"
        );
        assert!(
            !result.content.markdown.contains("Footer stuff"),
            "footer should not be included"
        );
    }

    /// `only_main_content: true` keeps the article text and drops the sidebar.
    #[test]
    fn test_only_main_content() {
        let html = r#"<html><body>
            <nav>Navigation</nav>
            <div class="hero"><h1>Big Hero</h1></div>
            <article><h2>Article Title</h2><p>Article content that is long enough to be real.</p></article>
            <div class="sidebar">Sidebar</div>
            <footer>Footer</footer>
        </body></html>"#;

        let options = ExtractionOptions {
            only_main_content: true,
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(
            result.content.markdown.contains("Article Title"),
            "article content should be present"
        );
        assert!(
            result.content.markdown.contains("Article content"),
            "article body should be present"
        );
        // only_main_content picks the article/main element directly, so hero and sidebar
        // should not be in the output.
        // NOTE(review): only the sidebar is asserted below; the hero block
        // ("Big Hero") is not checked — confirm whether that is intentional.
        assert!(
            !result.content.markdown.contains("Sidebar"),
            "sidebar should not be in only_main_content output"
        );
    }

    /// `include_raw_html: true` populates `raw_html` with markup that still
    /// contains the extracted elements.
    #[test]
    fn test_include_raw_html() {
        let html = r#"<html><body>
            <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;

        let options = ExtractionOptions {
            include_raw_html: true,
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(
            result.content.raw_html.is_some(),
            "raw_html should be populated"
        );
        let raw = result.content.raw_html.unwrap();
        assert!(
            raw.contains("<article>"),
            "raw_html should contain article tag"
        );
        assert!(raw.contains("<h1>Title</h1>"), "raw_html should contain h1");
    }

    /// Malformed CSS selectors do not abort extraction; the remaining valid
    /// selectors are still applied.
    #[test]
    fn test_invalid_selectors() {
        let html = r#"<html><body>
            <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;

        // Invalid selectors should be gracefully skipped
        let options = ExtractionOptions {
            include_selectors: vec!["[invalid[[[".into(), "article".into()],
            exclude_selectors: vec![">>>bad".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(
            result.content.markdown.contains("Title"),
            "valid selectors should still work"
        );
        assert!(
            result.content.markdown.contains("Content here"),
            "extraction should proceed despite invalid selectors"
        );
    }

    /// `extract` and `extract_with_options` with default options agree on
    /// markdown, plain text, and the number of links.
    #[test]
    fn test_backward_compat() {
        let html = r#"<html><body>
            <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;

        let result_old = extract(html, None).unwrap();
        let result_new = extract_with_options(html, None, &ExtractionOptions::default()).unwrap();

        assert_eq!(result_old.content.markdown, result_new.content.markdown);
        assert_eq!(result_old.content.plain_text, result_new.content.plain_text);
        assert_eq!(
            result_old.content.links.len(),
            result_new.content.links.len()
        );
    }

    /// Default `ExtractionOptions` yield the same markdown as `extract`.
    // NOTE(review): this overlaps with `test_backward_compat`, which already
    // compares markdown for the same fixture.
    #[test]
    fn test_empty_options() {
        let html = r#"<html><body>
            <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;

        let result_extract = extract(html, None).unwrap();
        let result_options =
            extract_with_options(html, None, &ExtractionOptions::default()).unwrap();

        assert_eq!(
            result_extract.content.markdown, result_options.content.markdown,
            "default ExtractionOptions should produce identical results to extract()"
        );
    }

    /// Compact JSON output omits the `raw_html` key when the field is `None`.
    // NOTE(review): `serializes_to_json` makes the same check on pretty-printed output.
    #[test]
    fn test_raw_html_not_in_json_when_none() {
        let result = extract("<html><body><p>Test</p></body></html>", None).unwrap();
        let json = serde_json::to_string(&result).unwrap();
        assert!(
            !json.contains("raw_html"),
            "raw_html should be absent from JSON when None"
        );
    }
}
Initial release: webclaw v0.1.0 — web content extraction for LLMs CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed \| https://webclaw.io 2026-03-23 18:31:11 +01:00			`pub mod brand;`
			`pub(crate) mod data_island;`
			`/// webclaw-core: Pure HTML content extraction engine for LLMs.`
			`///`
			`/// Takes raw HTML + optional URL, returns structured content`
			`/// (metadata, markdown, plain text, links, images, code blocks).`
			`/// Zero network dependencies — WASM-compatible by design.`
			`pub mod diff;`
			`pub mod domain;`
			`pub mod error;`
			`pub mod extractor;`
feat: v0.1.4 — QuickJS integration for inline JavaScript data extraction Embeds QuickJS (rquickjs) to execute inline <script> tags and extract data hidden in JavaScript variable assignments. Captures window.__* objects like __preloadedData (NYTimes), __PRELOADED_STATE__ (Wired), and self.__next_f (Next.js RSC flight data). Results: - NYTimes: 1,552 → 4,162 words (+168%) - Wired: 1,459 → 9,937 words (+580%) - Zero measurable performance overhead (<15ms per page) - Feature-gated: disable with --no-default-features for WASM Smart text filtering rejects CSS, base64, file paths, code strings. Only readable prose is appended under "## Additional Content". Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-26 10:28:16 +01:00			`#[cfg(feature = "quickjs")]`
			`pub mod js_eval;`
Initial release: webclaw v0.1.0 — web content extraction for LLMs CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed \| https://webclaw.io 2026-03-23 18:31:11 +01:00			`pub mod llm;`
			`pub mod markdown;`
			`pub mod metadata;`
			`#[allow(dead_code)]`
			`pub(crate) mod noise;`
			`pub mod structured_data;`
			`pub mod types;`
			`pub mod youtube;`

			`pub use brand::BrandIdentity;`
			`pub use diff::{ChangeStatus, ContentDiff, MetadataChange};`
			`pub use domain::DomainType;`
			`pub use error::ExtractError;`
			`pub use llm::to_llm_text;`
			`pub use types::{`
			`CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,`
			`};`

			`use scraper::Html;`
			`use url::Url;`

			`/// Extract structured content from raw HTML.`
			`///`
			/// `html` — raw HTML string to parse
			/// `url` — optional source URL, used for resolving relative links and domain detection
			`pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, ExtractError> {`
			`extract_with_options(html, url, &ExtractionOptions::default())`
			`}`

			`/// Extract structured content from raw HTML with configurable options.`
			`///`
			/// `html` — raw HTML string to parse
			/// `url` — optional source URL, used for resolving relative links and domain detection
			/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
			`pub fn extract_with_options(`
			`html: &str,`
			`url: Option<&str>,`
			`options: &ExtractionOptions,`
			`) -> Result<ExtractionResult, ExtractError> {`
			`if html.is_empty() {`
			`return Err(ExtractError::NoContent);`
			`}`

			`// YouTube fast path: if the URL is a YouTube video page, try extracting`
			`// structured metadata from ytInitialPlayerResponse before DOM scoring.`
			`// This gives LLMs a clean, structured view of video metadata.`
			`if let Some(u) = url`
			`&& youtube::is_youtube_url(u)`
			`&& let Some(yt_md) = youtube::try_extract(html)`
			`{`
			`let doc = Html::parse_document(html);`
			`let mut meta = metadata::extract(&doc, url);`
			`meta.word_count = extractor::word_count(&yt_md);`

			`let plain_text = yt_md`
			`.lines()`
			`.filter(\|l\| !l.starts_with('#') && !l.starts_with("**"))`
			`.collect::<Vec<_>>()`
			`.join("\n")`
			`.trim()`
			`.to_string();`

			`let domain_data = Some(DomainData {`
			`domain_type: DomainType::Social,`
			`});`

			`let structured_data = structured_data::extract_json_ld(html);`

			`return Ok(ExtractionResult {`
			`metadata: meta,`
			`content: Content {`
			`markdown: yt_md,`
			`plain_text,`
			`links: Vec::new(),`
			`images: Vec::new(),`
			`code_blocks: Vec::new(),`
			`raw_html: None,`
			`},`
			`domain_data,`
			`structured_data,`
			`});`
			`}`

			`let doc = Html::parse_document(html);`

			`let base_url = url`
			`.map(\|u\| Url::parse(u).map_err(\|_\| ExtractError::InvalidUrl(u.to_string())))`
			`.transpose()?;`

			`// Metadata from <head>`
			`let mut meta = metadata::extract(&doc, url);`

			`// Main content extraction (Readability-style scoring + markdown conversion)`
			`let mut content = extractor::extract_content(&doc, base_url.as_ref(), options);`
			`// Use the higher of plain_text and markdown word counts.`
			`// Some pages (headings + links) have content in markdown but empty plain_text.`
			`let pt_wc = extractor::word_count(&content.plain_text);`
			`let md_wc = extractor::word_count(&content.markdown);`
			`meta.word_count = pt_wc.max(md_wc);`

			`// Retry fallback: if extraction captured too little of the page's visible content,`
			`// retry with wider strategies. The scorer sometimes picks a tiny node (e.g., an`
			`// <article> with 52 words when the body has 1300 words of real content).`
			`//`
			`// Strategy 1: retry without only_main_content restriction`
			`if options.only_main_content && meta.word_count < 30 {`
			`let relaxed = ExtractionOptions {`
			`only_main_content: false,`
			`..options.clone()`
			`};`
			`let retry = extractor::extract_content(&doc, base_url.as_ref(), &relaxed);`
			`let retry_wc =`
			`extractor::word_count(&retry.plain_text).max(extractor::word_count(&retry.markdown));`
			`if retry_wc > meta.word_count {`
			`content = retry;`
			`meta.word_count = retry_wc;`
			`}`
			`}`

			`// Strategy 2: if scored extraction is sparse (<200 words) AND the page has`
			`// significantly more visible text, retry with include_selectors: ["body"].`
			`// This bypasses the readability scorer entirely — catches blogs, pricing`
			`// pages, and modern sites where no single element scores well.`
			`if meta.word_count < 200 && options.include_selectors.is_empty() {`
			`let body_opts = ExtractionOptions {`
			`include_selectors: vec!["body".to_string()],`
			`exclude_selectors: options.exclude_selectors.clone(),`
			`only_main_content: false,`
			`include_raw_html: false,`
			`};`
			`let body_content = extractor::extract_content(&doc, base_url.as_ref(), &body_opts);`
			`let body_wc = extractor::word_count(&body_content.plain_text)`
			`.max(extractor::word_count(&body_content.markdown));`
			`// Use body extraction if it captures significantly more content (>2x)`
			`if body_wc > meta.word_count * 2 && body_wc > 50 {`
			`content = body_content;`
			`meta.word_count = body_wc;`
			`}`
			`}`

			`// Fallback: if DOM extraction was sparse, try JSON data islands`
			`// (React SPAs, Next.js, Contentful CMS embed page data in <script> tags)`
			`if let Some(island_md) = data_island::try_extract(&doc, meta.word_count, &content.markdown) {`
			`content.markdown.push_str("\n\n");`
			`content.markdown.push_str(&island_md);`
			`meta.word_count = extractor::word_count(&content.markdown);`
			`}`

feat: v0.1.4 — QuickJS integration for inline JavaScript data extraction Embeds QuickJS (rquickjs) to execute inline <script> tags and extract data hidden in JavaScript variable assignments. Captures window.__* objects like __preloadedData (NYTimes), __PRELOADED_STATE__ (Wired), and self.__next_f (Next.js RSC flight data). Results: - NYTimes: 1,552 → 4,162 words (+168%) - Wired: 1,459 → 9,937 words (+580%) - Zero measurable performance overhead (<15ms per page) - Feature-gated: disable with --no-default-features for WASM Smart text filtering rejects CSS, base64, file paths, code strings. Only readable prose is appended under "## Additional Content". Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-26 10:28:16 +01:00			`// QuickJS: execute inline <script> tags to capture JS-assigned data blobs`
			`// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the`
			`// static JSON data island extraction above with runtime-evaluated data.`
			`#[cfg(feature = "quickjs")]`
			`{`
			`let blobs = js_eval::extract_js_data(html);`
			`if !blobs.is_empty() {`
			`let js_text = js_eval::extract_readable_text(&blobs);`
			`if !js_text.is_empty() {`
			`content.markdown.push_str("\n\n");`
			`content.markdown.push_str(&js_text);`
			`meta.word_count = extractor::word_count(&content.markdown);`
			`}`
			`}`
			`}`

Initial release: webclaw v0.1.0 — web content extraction for LLMs CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed \| https://webclaw.io 2026-03-23 18:31:11 +01:00			`// Domain detection from URL patterns and DOM heuristics`
			`let domain_type = domain::detect(url, html);`
			`let domain_data = Some(DomainData { domain_type });`

			`// JSON-LD structured data (Schema.org Product, Article, etc.)`
			`let structured_data = structured_data::extract_json_ld(html);`

			`Ok(ExtractionResult {`
			`metadata: meta,`
			`content,`
			`domain_data,`
			`structured_data,`
			`})`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn full_extraction_pipeline() {`
			`let html = r#"`
			`<html lang="en">`
			`<head>`
			`<title>Rust is Great</title>`
			`<meta name="description" content="An article about Rust">`
			`<meta name="author" content="Bob">`
			`</head>`
			`<body>`
			`<nav><a href="/">Home</a> \| <a href="/about">About</a></nav>`
			`<article>`
			`<h1>Why Rust is Great</h1>`
			`<p>Rust gives you <strong>memory safety</strong> without a garbage collector.`
			`This is achieved through its <em>ownership system</em>.</p>`
			`<p>Here is an example:</p>`
			`<pre><code class="language-rust">fn main() {`
			`println!("Hello, world!");`
			`}</code></pre>`
			`<p>Learn more at <a href="https://rust-lang.org">rust-lang.org</a>.</p>`
			`</article>`
			`<footer>Copyright 2025</footer>`
			`</body>`
			`</html>"#;`

			`let result = extract(html, Some("https://blog.example.com/rust")).unwrap();`

			`// Metadata`
			`assert_eq!(result.metadata.title.as_deref(), Some("Rust is Great"));`
			`assert_eq!(`
			`result.metadata.description.as_deref(),`
			`Some("An article about Rust")`
			`);`
			`assert_eq!(result.metadata.author.as_deref(), Some("Bob"));`
			`assert_eq!(result.metadata.language.as_deref(), Some("en"));`
			`assert!(result.metadata.word_count > 0);`

			`// Content`
			`assert!(result.content.markdown.contains("# Why Rust is Great"));`
			`assert!(result.content.markdown.contains("memory safety"));`
			assert!(result.content.markdown.contains("```rust"));
			`assert!(`
			`result`
			`.content`
			`.links`
			`.iter()`
			`.any(\|l\| l.href == "https://rust-lang.org")`
			`);`
			`assert!(!result.content.code_blocks.is_empty());`

			`// raw_html not populated by default`
			`assert!(result.content.raw_html.is_none());`

			`// Domain — blog.example.com has <article> tag`
			`let dd = result.domain_data.unwrap();`
			`assert_eq!(dd.domain_type, DomainType::Article);`
			`}`

			`#[test]`
			`fn invalid_url_returns_error() {`
			`let result = extract("<html></html>", Some("not a url"));`
			`assert!(matches!(result, Err(ExtractError::InvalidUrl(_))));`
			`}`

			`#[test]`
			`fn empty_html_returns_error() {`
			`let result = extract("", None);`
			`assert!(matches!(result, Err(ExtractError::NoContent)));`
			`}`

			`#[test]`
			`fn no_url_is_fine() {`
			`let result = extract("<html><body><p>Hello</p></body></html>", None);`
			`assert!(result.is_ok());`
			`}`

			`#[test]`
			`fn serializes_to_json() {`
			`let result = extract("<html><body><p>Test</p></body></html>", None).unwrap();`
			`let json = serde_json::to_string_pretty(&result).unwrap();`
			`assert!(json.contains("metadata"));`
			`assert!(json.contains("content"));`
			`// raw_html should be absent (skip_serializing_if)`
			`assert!(!json.contains("raw_html"));`
			`}`

			/// A YouTube watch page carrying `ytInitialPlayerResponse` is rendered as
			/// markdown with title, channel, duration and description, and is
			/// classified as `DomainType::Social`.
			#[test]
			fn youtube_extraction_produces_structured_markdown() {
				let page = r#"
			<html><head><title>Rust in 100 Seconds - YouTube</title></head>
			<body>
			<script>
			var ytInitialPlayerResponse = {"videoDetails":{"title":"Rust in 100 Seconds","author":"Fireship","viewCount":"5432100","shortDescription":"Learn Rust in 100 seconds. A mass of web developers are mass adopting Rust.","lengthSeconds":"120"},"microformat":{"playerMicroformatRenderer":{"uploadDate":"2023-01-15"}}};
			</script>
			</body></html>
			"#;

				let extracted = extract(page, Some("https://www.youtube.com/watch?v=5C_HPTJg5ek")).unwrap();

				// Title heading, channel line, formatted duration (120 s), description.
				let md = &extracted.content.markdown;
				for expected in [
					"# Rust in 100 Seconds",
					"Channel: Fireship",
					"2:00",
					"Learn Rust in 100 seconds",
				] {
					assert!(md.contains(expected));
				}

				// The page must be classified as a Social domain.
				let domain = extracted.domain_data.unwrap();
				assert_eq!(domain.domain_type, DomainType::Social);
			}

			/// A YouTube URL whose HTML lacks `ytInitialPlayerResponse` still yields
			/// content extracted from the page itself.
			#[test]
			fn youtube_url_without_player_response_falls_through() {
				// No player response in this document, only ordinary article markup.
				let page = r#"<html><body><article><h1>Some YouTube Page</h1><p>Content here for testing.</p></article></body></html>"#;
				let extracted = extract(page, Some("https://www.youtube.com/watch?v=abc123")).unwrap();

				// The heading from the article must make it into the markdown.
				let md = &extracted.content.markdown;
				assert!(md.contains("Some YouTube Page"));
			}

			`// --- ExtractionOptions tests ---`

			/// Elements matched by `exclude_selectors` are dropped from the markdown
			/// while the article content is kept.
			#[test]
			fn test_exclude_selectors() {
				let page = r#"<html><body>
			<nav>Navigation stuff</nav>
			<article><h1>Title</h1><p>Real content here.</p></article>
			<footer>Footer stuff</footer>
			</body></html>"#;

				let opts = ExtractionOptions {
					exclude_selectors: vec!["nav".into(), "footer".into()],
					..ExtractionOptions::default()
				};
				let extracted = extract_with_options(page, None, &opts).unwrap();
				let md = &extracted.content.markdown;

				assert!(md.contains("Real content"));

				// Each excluded region is paired with its failure message.
				for (unwanted, why) in [
					("Navigation stuff", "nav should be excluded"),
					("Footer stuff", "footer should be excluded"),
				] {
					assert!(!md.contains(unwanted), "{}", why);
				}
			}

			/// With `include_selectors` set to `article`, only the article's content
			/// appears in the markdown; nav, sidebar and footer text do not.
			#[test]
			fn test_include_selectors() {
				let page = r#"<html><body>
			<nav>Navigation stuff</nav>
			<article><h1>Title</h1><p>Real content here.</p></article>
			<div class="sidebar">Sidebar junk</div>
			<footer>Footer stuff</footer>
			</body></html>"#;

				let opts = ExtractionOptions {
					include_selectors: vec!["article".into()],
					..ExtractionOptions::default()
				};
				let extracted = extract_with_options(page, None, &opts).unwrap();
				let md = &extracted.content.markdown;

				for wanted in ["Title", "Real content"] {
					assert!(md.contains(wanted));
				}

				// Text outside the included element, paired with its failure message.
				for (unwanted, why) in [
					("Navigation stuff", "nav should not be included"),
					("Sidebar junk", "sidebar should not be included"),
					("Footer stuff", "footer should not be included"),
				] {
					assert!(!md.contains(unwanted), "{}", why);
				}
			}

			/// Include and exclude selectors combine: the article is selected, and the
			/// `.sidebar` nested inside it is removed again.
			#[test]
			fn test_include_and_exclude() {
				let page = r#"<html><body>
			<article>
			<h1>Title</h1>
			<p>Real content here.</p>
			<div class="sidebar">Sidebar inside article</div>
			</article>
			<footer>Footer stuff</footer>
			</body></html>"#;

				let opts = ExtractionOptions {
					include_selectors: vec!["article".into()],
					exclude_selectors: vec![".sidebar".into()],
					..ExtractionOptions::default()
				};
				let extracted = extract_with_options(page, None, &opts).unwrap();
				let md = &extracted.content.markdown;

				for wanted in ["Title", "Real content"] {
					assert!(md.contains(wanted));
				}

				let has_sidebar = md.contains("Sidebar inside article");
				assert!(!has_sidebar, "sidebar inside article should be excluded");

				let has_footer = md.contains("Footer stuff");
				assert!(!has_footer, "footer should not be included");
			}

			/// With `only_main_content` enabled, the article heading and body are kept
			/// and the sidebar text is absent from the markdown.
			#[test]
			fn test_only_main_content() {
				let page = r#"<html><body>
			<nav>Navigation</nav>
			<div class="hero"><h1>Big Hero</h1></div>
			<article><h2>Article Title</h2><p>Article content that is long enough to be real.</p></article>
			<div class="sidebar">Sidebar</div>
			<footer>Footer</footer>
			</body></html>"#;

				let opts = ExtractionOptions {
					only_main_content: true,
					..ExtractionOptions::default()
				};
				let extracted = extract_with_options(page, None, &opts).unwrap();
				let md = &extracted.content.markdown;

				for (wanted, why) in [
					("Article Title", "article content should be present"),
					("Article content", "article body should be present"),
				] {
					assert!(md.contains(wanted), "{}", why);
				}

				// Only the sidebar's absence is checked here; the hero block is part
				// of the fixture but is not asserted on.
				let has_sidebar = md.contains("Sidebar");
				assert!(
					!has_sidebar,
					"sidebar should not be in only_main_content output"
				);
			}

			/// Setting `include_raw_html` populates `content.raw_html` with markup
			/// that still contains the article and heading tags.
			#[test]
			fn test_include_raw_html() {
				let page = r#"<html><body>
			<article><h1>Title</h1><p>Content here.</p></article>
			</body></html>"#;

				let opts = ExtractionOptions {
					include_raw_html: true,
					..ExtractionOptions::default()
				};
				let extracted = extract_with_options(page, None, &opts).unwrap();

				// Fail with the same message as before when the field is missing.
				let raw = match extracted.content.raw_html {
					Some(markup) => markup,
					None => panic!("raw_html should be populated"),
				};

				for (fragment, why) in [
					("<article>", "raw_html should contain article tag"),
					("<h1>Title</h1>", "raw_html should contain h1"),
				] {
					assert!(raw.contains(fragment), "{}", why);
				}
			}

			/// Malformed CSS selectors in the options do not abort extraction; the
			/// valid `article` selector still produces the expected content.
			#[test]
			fn test_invalid_selectors() {
				let page = r#"<html><body>
			<article><h1>Title</h1><p>Content here.</p></article>
			</body></html>"#;

				// One unparsable include selector next to a valid one, plus an
				// unparsable exclude selector.
				let opts = ExtractionOptions {
					include_selectors: vec!["[invalid[[[".into(), "article".into()],
					exclude_selectors: vec![">>>bad".into()],
					..ExtractionOptions::default()
				};
				let extracted = extract_with_options(page, None, &opts).unwrap();
				let md = &extracted.content.markdown;

				let has_title = md.contains("Title");
				assert!(has_title, "valid selectors should still work");

				let has_body = md.contains("Content here");
				assert!(
					has_body,
					"extraction should proceed despite invalid selectors"
				);
			}

			/// `extract` and `extract_with_options` with default options agree on
			/// markdown, plain text and the number of links.
			#[test]
			fn test_backward_compat() {
				let page = r#"<html><body>
			<article><h1>Title</h1><p>Content here.</p></article>
			</body></html>"#;

				let legacy = extract(page, None).unwrap();
				let defaults = ExtractionOptions::default();
				let current = extract_with_options(page, None, &defaults).unwrap();

				let (old, new) = (&legacy.content, &current.content);
				assert_eq!(old.markdown, new.markdown);
				assert_eq!(old.plain_text, new.plain_text);
				assert_eq!(old.links.len(), new.links.len());
			}

			/// Default `ExtractionOptions` yield the same markdown as plain `extract`.
			#[test]
			fn test_empty_options() {
				let page = r#"<html><body>
			<article><h1>Title</h1><p>Content here.</p></article>
			</body></html>"#;

				let via_extract = extract(page, None).unwrap();
				let defaults = ExtractionOptions::default();
				let via_options = extract_with_options(page, None, &defaults).unwrap();

				assert_eq!(
					via_extract.content.markdown,
					via_options.content.markdown,
					"default ExtractionOptions should produce identical results to extract()"
				);
			}

			/// Compact JSON output carries no `raw_html` key when raw HTML was not
			/// requested.
			#[test]
			fn test_raw_html_not_in_json_when_none() {
				let extraction = extract("<html><body><p>Test</p></body></html>", None).unwrap();
				let compact = serde_json::to_string(&extraction).unwrap();

				let mentions_raw_html = compact.contains("raw_html");
				assert!(
					!mentions_raw_html,
					"raw_html should be absent from JSON when None"
				);
			}
			`}`