mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-21 02:28:27 +02:00
CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed | https://webclaw.io
756 lines
22 KiB
Rust
756 lines
22 KiB
Rust
/// Shared noise detection for web content extraction.
|
|
///
|
|
/// Identifies elements that don't contribute to main content:
|
|
/// navigation, sidebars, footers, ads, cookie banners, modals, etc.
|
|
/// Used by both the extractor (candidate filtering) and the markdown
|
|
/// converter (output-time stripping).
|
|
use scraper::ElementRef;
|
|
|
|
const NOISE_TAGS: &[&str] = &[
|
|
"script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "form",
|
|
"video", "audio",
|
|
"canvas",
|
|
// NOTE: <picture> removed — it's a responsive image container, not noise.
|
|
// <picture> wraps <source> and <img> for responsive images.
|
|
];
|
|
|
|
const NOISE_ROLES: &[&str] = &["navigation", "banner", "complementary", "contentinfo"];
|
|
|
|
const NOISE_CLASS_PATTERNS: &[&str] = &[
|
|
"sidebar",
|
|
"side",
|
|
"nav",
|
|
"navbar",
|
|
"navigation",
|
|
"menu",
|
|
"footer",
|
|
"header",
|
|
"top",
|
|
"bottom",
|
|
"advertisement",
|
|
"advert",
|
|
"social",
|
|
"social-media",
|
|
"social-links",
|
|
"share",
|
|
"comment",
|
|
"cookie",
|
|
"popup",
|
|
"modal",
|
|
"overlay",
|
|
"banner",
|
|
"breadcrumb",
|
|
"breadcrumbs",
|
|
"widget",
|
|
"lang-selector",
|
|
"language",
|
|
"newsletter",
|
|
"subscribe",
|
|
"related-posts",
|
|
"recommended",
|
|
"pagination",
|
|
"pager",
|
|
"signup",
|
|
"login-form",
|
|
"search-form",
|
|
"notification",
|
|
"alert",
|
|
"toast",
|
|
"skip-link",
|
|
"sr-only",
|
|
"visually-hidden",
|
|
];
|
|
|
|
const NOISE_ID_PATTERNS: &[&str] = &[
|
|
"sidebar",
|
|
"nav",
|
|
"menu",
|
|
"footer",
|
|
"header",
|
|
"cookie",
|
|
"popup",
|
|
"modal",
|
|
"breadcrumbs",
|
|
"widget",
|
|
"language-selector",
|
|
"ad",
|
|
"social",
|
|
"share",
|
|
"newsletter",
|
|
"subscribe",
|
|
"comments",
|
|
"related",
|
|
"recommended",
|
|
];
|
|
|
|
/// Exact class tokens that indicate noise.
|
|
/// Unlike substring matching, these only match when the EXACT class token
|
|
/// is present — ".modal" matches `class="modal"` but NOT `class="free-modal-container"`.
|
|
const NOISE_CLASSES: &[&str] = &[
|
|
"header",
|
|
"top",
|
|
"navbar",
|
|
"footer",
|
|
"bottom",
|
|
"sidebar",
|
|
"modal",
|
|
"popup",
|
|
"overlay",
|
|
"ad",
|
|
"ads",
|
|
"advert",
|
|
"lang-selector",
|
|
"language",
|
|
"social",
|
|
"social-media",
|
|
"social-links",
|
|
"menu",
|
|
"navigation",
|
|
"breadcrumbs",
|
|
"breadcrumb",
|
|
"share",
|
|
"widget",
|
|
"cookie",
|
|
"newsletter",
|
|
"subscribe",
|
|
"skip-link",
|
|
"sr-only",
|
|
"visually-hidden",
|
|
"notification",
|
|
"alert",
|
|
"toast",
|
|
"pagination",
|
|
"pager",
|
|
"signup",
|
|
"login-form",
|
|
"search-form",
|
|
"related-posts",
|
|
"recommended",
|
|
];
|
|
|
|
/// Exact IDs that indicate noise.
|
|
const NOISE_IDS: &[&str] = &[
|
|
"header",
|
|
"footer",
|
|
"nav",
|
|
"sidebar",
|
|
"menu",
|
|
"modal",
|
|
"popup",
|
|
"cookie",
|
|
"breadcrumbs",
|
|
"widget",
|
|
"ad",
|
|
"social",
|
|
"share",
|
|
"newsletter",
|
|
"subscribe",
|
|
"comments",
|
|
"related",
|
|
"recommended",
|
|
];
|
|
|
|
/// ID prefixes for cookie consent platforms that should be stripped entirely.
|
|
/// These generate massive DOM overlays that dominate content extraction.
|
|
const COOKIE_CONSENT_ID_PREFIXES: &[&str] = &[
|
|
"onetrust", // OneTrust (Foot Locker, many EU sites)
|
|
"optanon", // OneTrust legacy
|
|
"ot-sdk", // OneTrust SDK
|
|
"cookiebot", // Cookiebot
|
|
"CybotCookiebot", // Cookiebot
|
|
"cc-", // Cookie Consent (Osano)
|
|
"cookie-law", // Cookie Law Info
|
|
"gdpr", // Generic GDPR banners
|
|
"consent-", // Generic consent banners
|
|
"cmp-", // Consent Management Platforms
|
|
"sp_message", // SourcePoint
|
|
"qc-cmp", // Quantcast CMP
|
|
"trustarc", // TrustArc
|
|
"evidon", // Evidon/Crownpeak
|
|
];
|
|
|
|
/// Check if an element is noise by tag, role, class, or id.
|
|
///
|
|
/// Uses EXACT class token matching instead
|
|
/// of substring matching. This prevents false positives like:
|
|
/// - "free-modal-container" ≠ noise (Vice.com's content wrapper)
|
|
/// - "a-bw_aui_cxc_alert_measurement" ≠ noise (Amazon's body class)
|
|
/// - "desktop" ≠ noise (not matching "top")
|
|
pub fn is_noise(el: ElementRef<'_>) -> bool {
|
|
let tag = el.value().name();
|
|
|
|
// Never treat <body> or <html> as noise.
|
|
if tag == "body" || tag == "html" {
|
|
return false;
|
|
}
|
|
|
|
// Tag-based noise (script, style, nav, etc.)
|
|
if NOISE_TAGS.contains(&tag) {
|
|
return true;
|
|
}
|
|
|
|
// ARIA role-based noise
|
|
if let Some(role) = el.value().attr("role")
|
|
&& NOISE_ROLES.contains(&role)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
// Exact class token matching — split class attribute into tokens,
|
|
// check each against the noise list. "free-modal-container" splits into
|
|
// ["free-modal-container"] which does NOT match "modal".
|
|
if let Some(class) = el.value().attr("class") {
|
|
for token in class.split_whitespace() {
|
|
let lower = token.to_lowercase();
|
|
if NOISE_CLASSES.contains(&lower.as_str()) {
|
|
return true;
|
|
}
|
|
// Structural elements use compound names (FooterLinks, Header-nav, etc.)
|
|
// These are always noise regardless of compound form.
|
|
if lower.starts_with("footer")
|
|
|| lower.starts_with("header-")
|
|
|| lower.starts_with("nav-")
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
// Also check for ad-specific patterns (standalone "ad" class)
|
|
if is_ad_class(class) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// Exact ID matching
|
|
if let Some(id) = el.value().attr("id") {
|
|
let id_lower = id.to_lowercase();
|
|
if NOISE_IDS.contains(&id_lower.as_str()) && !is_structural_id(&id_lower) {
|
|
return true;
|
|
}
|
|
// Cookie consent platform IDs (prefix match — these generate huge overlays)
|
|
for prefix in COOKIE_CONSENT_ID_PREFIXES {
|
|
if id_lower.starts_with(prefix) {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Class-based cookie consent detection (prefix match for platform classes)
|
|
if let Some(class) = el.value().attr("class") {
|
|
let class_lower = class.to_lowercase();
|
|
for prefix in COOKIE_CONSENT_ID_PREFIXES {
|
|
if class_lower.contains(prefix) {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
false
|
|
}
|
|
|
|
/// Check if an element is inside a noise container.
|
|
pub fn is_noise_descendant(el: ElementRef<'_>) -> bool {
|
|
let mut node = el.parent();
|
|
while let Some(parent) = node {
|
|
if let Some(parent_el) = ElementRef::wrap(parent)
|
|
&& is_noise(parent_el)
|
|
{
|
|
return true;
|
|
}
|
|
node = parent.parent();
|
|
}
|
|
false
|
|
}
|
|
|
|
fn has_noise_class(class: &str) -> bool {
|
|
// Match noise patterns against individual class tokens, with safeguards
|
|
// against Tailwind CSS utility classes that contain noise keywords as
|
|
// substrings (e.g., "pt-header-h" is padding, not a header class).
|
|
class.split_whitespace().any(is_noise_token) || is_ad_class(class)
|
|
}
|
|
|
|
/// Check if a single class token is a noise indicator.
|
|
/// Requires the noise pattern to be the *semantic core* of the token,
|
|
/// not embedded inside a Tailwind utility prefix or CSS variable.
|
|
fn is_noise_token(token: &str) -> bool {
|
|
let t = token.to_lowercase();
|
|
|
|
// Skip Tailwind arbitrary values and CSS variable references entirely
|
|
if t.contains("[--") || t.contains("var(") {
|
|
return false;
|
|
}
|
|
|
|
// Strip common Tailwind responsive/state prefixes (e.g., "lg:", "hover:", "md:")
|
|
let core = t.rsplit_once(':').map_or(t.as_str(), |(_, c)| c);
|
|
|
|
// The noise pattern should match the semantic name, not be buried inside
|
|
// a utility like "pt-header-h" (padding) or "mt-nav-offset" (margin).
|
|
// Tailwind utilities start with known prefixes; if the token starts with one,
|
|
// it's a utility class, not a semantic class.
|
|
const UTILITY_PREFIXES: &[&str] = &[
|
|
"p-",
|
|
"pt-",
|
|
"pb-",
|
|
"pl-",
|
|
"pr-",
|
|
"px-",
|
|
"py-",
|
|
"m-",
|
|
"mt-",
|
|
"mb-",
|
|
"ml-",
|
|
"mr-",
|
|
"mx-",
|
|
"my-",
|
|
"w-",
|
|
"h-",
|
|
"min-",
|
|
"max-",
|
|
"top-",
|
|
"left-",
|
|
"right-",
|
|
"bottom-",
|
|
"z-",
|
|
"gap-",
|
|
"text-",
|
|
"bg-",
|
|
"border-",
|
|
"rounded-",
|
|
"flex-",
|
|
"grid-",
|
|
"col-",
|
|
"row-",
|
|
"opacity-",
|
|
"transition-",
|
|
"duration-",
|
|
"delay-",
|
|
"ease-",
|
|
"translate-",
|
|
"scale-",
|
|
"rotate-",
|
|
"origin-",
|
|
"overflow-",
|
|
"inset-",
|
|
"space-",
|
|
"divide-",
|
|
"ring-",
|
|
"shadow-",
|
|
"outline-",
|
|
"font-",
|
|
"leading-",
|
|
"tracking-",
|
|
"decoration-",
|
|
];
|
|
if UTILITY_PREFIXES.iter().any(|pfx| core.starts_with(pfx)) {
|
|
return false;
|
|
}
|
|
|
|
// "banner" and "overlay" only match as prefix — they false-positive as
|
|
// suffixes in BEM/Webflow component names (e.g., "package_banner" is a
|
|
// product card, not an ad banner; "planet-overlay" is a visual effect).
|
|
const PREFIX_ONLY: &[&str] = &["banner", "overlay"];
|
|
|
|
// Short patterns (≤6 chars like "nav", "top", "header", "widget") require
|
|
// word-boundary matching to avoid false positives on compound CSS class
|
|
// names (e.g., "desktop" ≠ "top", "celwidget" ≠ "widget",
|
|
// "_categoriesheader_active" ≠ semantic "header").
|
|
// A word boundary is `-`, `_`, or start/end of string.
|
|
// Longer patterns (7+ chars like "sidebar", "breadcrumb") are specific
|
|
// enough that substring matching is safe.
|
|
NOISE_CLASS_PATTERNS.iter().any(|p| {
|
|
if PREFIX_ONLY.contains(p) {
|
|
core == *p || core.starts_with(&format!("{p}-")) || core.starts_with(&format!("{p}_"))
|
|
} else if p.len() <= 6 {
|
|
is_word_boundary_match(core, p)
|
|
} else {
|
|
core.contains(p)
|
|
}
|
|
})
|
|
}
|
|
|
|
/// Check if `pattern` appears in `text` at a word boundary.
|
|
/// Word boundaries are `-`, `_`, or start/end of string.
|
|
/// e.g., "nav" matches "main-nav", "nav-bar", "nav" but NOT "canvas", "navbar".
|
|
fn is_word_boundary_match(text: &str, pattern: &str) -> bool {
|
|
let mut start = 0;
|
|
while let Some(pos) = text[start..].find(pattern) {
|
|
let abs = start + pos;
|
|
let before_ok = abs == 0 || matches!(text.as_bytes()[abs - 1], b'-' | b'_');
|
|
let end = abs + pattern.len();
|
|
let after_ok = end == text.len() || matches!(text.as_bytes()[end], b'-' | b'_');
|
|
if before_ok && after_ok {
|
|
return true;
|
|
}
|
|
start = abs + 1;
|
|
}
|
|
false
|
|
}
|
|
|
|
/// IDs like "modal-portal", "nav-root", "header-container" are structural
|
|
/// wrappers (React portals, app roots), not actual noise elements.
|
|
fn is_structural_id(id: &str) -> bool {
|
|
const STRUCTURAL_SUFFIXES: &[&str] =
|
|
&["portal", "root", "container", "wrapper", "mount", "app"];
|
|
STRUCTURAL_SUFFIXES.iter().any(|s| id.contains(s))
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// CSS class text detection (visible content that looks like class names)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// CSS utility prefixes that indicate a word is a class name, not prose.
|
|
/// Covers Tailwind, Bootstrap-ish, and common utility-first patterns.
|
|
const CSS_CLASS_PREFIXES: &[&str] = &[
|
|
"text-",
|
|
"bg-",
|
|
"px-",
|
|
"py-",
|
|
"pt-",
|
|
"pb-",
|
|
"pl-",
|
|
"pr-",
|
|
"p-",
|
|
"mx-",
|
|
"my-",
|
|
"mt-",
|
|
"mb-",
|
|
"ml-",
|
|
"mr-",
|
|
"m-",
|
|
"w-",
|
|
"h-",
|
|
"min-",
|
|
"max-",
|
|
"flex-",
|
|
"grid-",
|
|
"col-",
|
|
"row-",
|
|
"gap-",
|
|
"space-",
|
|
"rounded-",
|
|
"shadow-",
|
|
"border-",
|
|
"ring-",
|
|
"outline-",
|
|
"font-",
|
|
"tracking-",
|
|
"leading-",
|
|
"decoration-",
|
|
"opacity-",
|
|
"transition-",
|
|
"duration-",
|
|
"delay-",
|
|
"ease-",
|
|
"translate-",
|
|
"scale-",
|
|
"rotate-",
|
|
"origin-",
|
|
"overflow-",
|
|
"inset-",
|
|
"divide-",
|
|
"z-",
|
|
"top-",
|
|
"left-",
|
|
"right-",
|
|
"bottom-",
|
|
"sr-",
|
|
"not-",
|
|
"group-",
|
|
"peer-",
|
|
"placeholder-",
|
|
"focus-",
|
|
"hover-",
|
|
"active-",
|
|
"disabled-",
|
|
"dark-",
|
|
"sm-",
|
|
"md-",
|
|
"lg-",
|
|
"xl-",
|
|
"2xl-",
|
|
];
|
|
|
|
/// Exact single-word CSS utility class names (no prefix needed).
|
|
const CSS_CLASS_EXACT: &[&str] = &[
|
|
"flex",
|
|
"grid",
|
|
"block",
|
|
"inline",
|
|
"hidden",
|
|
"static",
|
|
"fixed",
|
|
"absolute",
|
|
"relative",
|
|
"sticky",
|
|
"isolate",
|
|
"container",
|
|
"prose",
|
|
"antialiased",
|
|
"truncate",
|
|
"uppercase",
|
|
"lowercase",
|
|
"capitalize",
|
|
"italic",
|
|
"underline",
|
|
"overline",
|
|
"invisible",
|
|
"visible",
|
|
"sr-only",
|
|
"not-sr-only",
|
|
];
|
|
|
|
/// Tailwind responsive/state prefixes that can appear before a utility class
|
|
/// (e.g., "sm:text-lg", "hover:bg-blue-500", "dark:text-white").
|
|
fn strip_tw_variant_prefix(word: &str) -> &str {
|
|
// Handle chained variants: "dark:sm:text-lg" → "text-lg"
|
|
word.rsplit_once(':').map_or(word, |(_, core)| core)
|
|
}
|
|
|
|
/// Check if a single whitespace-delimited word looks like a CSS utility class.
|
|
fn is_css_class_word(word: &str) -> bool {
|
|
let core = strip_tw_variant_prefix(word);
|
|
let lower = core.to_lowercase();
|
|
|
|
// Arbitrary value syntax: "[--foo:bar]", "w-[200px]"
|
|
if lower.contains('[') && lower.contains(']') {
|
|
return true;
|
|
}
|
|
|
|
// Exact matches
|
|
if CSS_CLASS_EXACT.iter().any(|&e| lower == e) {
|
|
return true;
|
|
}
|
|
|
|
// Prefix matches
|
|
if CSS_CLASS_PREFIXES.iter().any(|pfx| lower.starts_with(pfx)) {
|
|
return true;
|
|
}
|
|
|
|
// Negative utilities: "-mt-4", "-translate-x-1/2"
|
|
if lower.starts_with('-') && lower.len() > 1 {
|
|
let rest = &lower[1..];
|
|
if CSS_CLASS_PREFIXES.iter().any(|pfx| rest.starts_with(pfx)) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
false
|
|
}
|
|
|
|
/// Public wrapper for single-word CSS class detection (used by LLM pipeline
|
|
/// for stripping trailing CSS classes from mixed-content lines).
|
|
pub fn is_css_class_word_pub(word: &str) -> bool {
|
|
is_css_class_word(word)
|
|
}
|
|
|
|
/// Check if a text block is predominantly CSS class names.
|
|
///
|
|
/// Returns true if >50% of the whitespace-delimited words look like CSS
|
|
/// utility classes. Requires at least 3 words to avoid false positives on
|
|
/// short fragments.
|
|
pub fn is_css_class_text(text: &str) -> bool {
|
|
let words: Vec<&str> = text.split_whitespace().collect();
|
|
if words.len() < 3 {
|
|
return false;
|
|
}
|
|
|
|
let css_count = words.iter().filter(|w| is_css_class_word(w)).count();
|
|
// >50% of words are CSS classes
|
|
css_count * 2 > words.len()
|
|
}
|
|
|
|
/// Detect "ad" as a standalone class token, not a substring of "read" or "loading".
|
|
fn is_ad_class(class: &str) -> bool {
|
|
class.split_whitespace().any(|token| {
|
|
token == "ad"
|
|
|| token.starts_with("ad-")
|
|
|| token.starts_with("ad_")
|
|
|| token.ends_with("-ad")
|
|
|| token.ends_with("_ad")
|
|
})
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn ad_class_standalone_detected() {
|
|
assert!(is_ad_class("ad"));
|
|
assert!(is_ad_class("some ad-banner"));
|
|
assert!(is_ad_class("top-ad widget"));
|
|
assert!(is_ad_class("ad_unit"));
|
|
assert!(is_ad_class("sidebar_ad"));
|
|
}
|
|
|
|
#[test]
|
|
fn ad_class_no_false_positive() {
|
|
assert!(!is_ad_class("reading-time"));
|
|
assert!(!is_ad_class("loading-indicator"));
|
|
assert!(!is_ad_class("download-button"));
|
|
assert!(!is_ad_class("breadcrumb"));
|
|
}
|
|
|
|
#[test]
|
|
fn noise_class_patterns() {
|
|
assert!(has_noise_class("main-sidebar"));
|
|
assert!(has_noise_class("cookie-banner")); // "cookie" substring match
|
|
assert!(has_noise_class("modal-overlay")); // "modal" substring match
|
|
assert!(has_noise_class("banner-top")); // "banner" as prefix
|
|
assert!(has_noise_class("overlay-popup")); // "overlay" as prefix
|
|
assert!(!has_noise_class("article-content"));
|
|
assert!(!has_noise_class("post-body"));
|
|
}
|
|
|
|
#[test]
|
|
fn short_patterns_require_word_boundary() {
|
|
// "nav" (3 chars) — must be a standalone word segment
|
|
assert!(has_noise_class("main-nav"));
|
|
assert!(has_noise_class("nav-bar"));
|
|
assert!(has_noise_class("nav"));
|
|
assert!(!has_noise_class("canvas")); // "nav" is substring, not word
|
|
assert!(has_noise_class("icp-nav-flag")); // "nav" IS between word boundaries
|
|
// "top" (3 chars) — note: "top-bar" starts with Tailwind prefix "top-" → filtered out
|
|
assert!(has_noise_class("page-top")); // "top" at word boundary
|
|
assert!(!has_noise_class("desktop")); // "top" is substring inside word
|
|
assert!(!has_noise_class("stop-motion")); // "top" inside word
|
|
// "side" (4 chars) — "left-side" starts with Tailwind prefix "left-" → filtered
|
|
assert!(has_noise_class("page-side"));
|
|
assert!(!has_noise_class("inside-content"));
|
|
assert!(!has_noise_class("consider"));
|
|
}
|
|
|
|
#[test]
|
|
fn amazon_classes_not_noise() {
|
|
// Amazon CSS module class names that were false-positiving
|
|
assert!(!has_noise_class("desktop")); // contains "top"
|
|
assert!(!has_noise_class("celwidget")); // contains "widget"
|
|
// a-alert-container: "alert" IS a proper word segment → still matches (correct for UI alerts)
|
|
assert!(has_noise_class("a-alert-container"));
|
|
assert!(!has_noise_class(
|
|
"_haul-cx-images-carousel_style_desktop-card__fid8k"
|
|
));
|
|
assert!(!has_noise_class(
|
|
"_haul-cx-infinite-scroll-body_categoriesheader_active__2j-4u"
|
|
));
|
|
// But actual noise classes still work
|
|
assert!(has_noise_class("site-header"));
|
|
assert!(has_noise_class("main-nav"));
|
|
assert!(has_noise_class("footer-links"));
|
|
assert!(has_noise_class("cookie-consent"));
|
|
}
|
|
|
|
#[test]
|
|
fn word_boundary_match_works() {
|
|
assert!(is_word_boundary_match("main-nav", "nav"));
|
|
assert!(is_word_boundary_match("nav-bar", "nav"));
|
|
assert!(is_word_boundary_match("nav", "nav"));
|
|
assert!(is_word_boundary_match("top-nav_bar", "nav"));
|
|
assert!(!is_word_boundary_match("canvas", "nav"));
|
|
assert!(!is_word_boundary_match("navbar", "nav"));
|
|
assert!(!is_word_boundary_match("navigate", "nav"));
|
|
assert!(is_word_boundary_match("top-bar", "top"));
|
|
assert!(!is_word_boundary_match("desktop", "top"));
|
|
assert!(!is_word_boundary_match("stopper", "top"));
|
|
}
|
|
|
|
#[test]
|
|
fn bem_component_names_not_noise() {
|
|
// BEM/Webflow component names where noise keyword is a suffix
|
|
assert!(!has_noise_class("package_banner"));
|
|
assert!(!has_noise_class("mars-cta_planet-overlay"));
|
|
assert!(!has_noise_class("hero_banner_wrap"));
|
|
// But actual noise classes still work
|
|
assert!(has_noise_class("banner-dismiss"));
|
|
assert!(has_noise_class("overlay-backdrop"));
|
|
}
|
|
|
|
#[test]
|
|
fn structural_ids_not_noise() {
|
|
assert!(is_structural_id("modal-portal"));
|
|
assert!(is_structural_id("nav-root"));
|
|
assert!(is_structural_id("header-container"));
|
|
assert!(is_structural_id("sidebar-wrapper"));
|
|
assert!(is_structural_id("menu-mount"));
|
|
assert!(is_structural_id("app"));
|
|
// Actual noise IDs should NOT be structural
|
|
assert!(!is_structural_id("main-sidebar"));
|
|
assert!(!is_structural_id("cookie-consent"));
|
|
assert!(!is_structural_id("popup-overlay"));
|
|
}
|
|
|
|
#[test]
|
|
fn tailwind_animation_utilities_not_noise() {
|
|
// Tailwind transition/animation utilities with noise keywords as values
|
|
assert!(!has_noise_class("ease-curve-sidebar"));
|
|
assert!(!has_noise_class("duration-sidebar"));
|
|
assert!(!has_noise_class("delay-modal-open"));
|
|
// But actual sidebar/modal classes still work
|
|
assert!(has_noise_class("sidebar-panel"));
|
|
assert!(has_noise_class("modal-dialog"));
|
|
}
|
|
|
|
#[test]
|
|
fn tailwind_css_vars_not_noise() {
|
|
// Tailwind arbitrary values and CSS variables should NOT trigger noise
|
|
assert!(!has_noise_class("[--content-top-offset:var(--header-h)]"));
|
|
assert!(!has_noise_class(
|
|
"pt-[var(--content-top-offset)] [--content-top-offset:var(--header-h)]"
|
|
));
|
|
assert!(!has_noise_class("[--nav-width:200px]"));
|
|
// But actual noise classes still work
|
|
assert!(has_noise_class("[--offset:10px] header-bar"));
|
|
assert!(has_noise_class("sidebar [--x:1]"));
|
|
}
|
|
|
|
// -----------------------------------------------------------------------
|
|
// CSS class text detection (decorative text that looks like class names)
|
|
// -----------------------------------------------------------------------
|
|
|
|
#[test]
|
|
fn css_class_text_detected() {
|
|
// Pure Tailwind utility class blocks — the real-world problem
|
|
assert!(is_css_class_text(
|
|
"text-4xl font-bold tracking-tight text-gray-900"
|
|
));
|
|
assert!(is_css_class_text(
|
|
"text-4xl text-5xl text-6xl text-8xl text-gray-950 text-white tracking-tighter text-balance"
|
|
));
|
|
assert!(is_css_class_text(
|
|
"flex grid rounded-lg shadow-md bg-white px-4 py-2"
|
|
));
|
|
assert!(is_css_class_text(
|
|
"sm:text-lg dark:bg-gray-800 hover:bg-blue-500"
|
|
));
|
|
// Negative utilities
|
|
assert!(is_css_class_text("-mt-4 -translate-x-1/2 flex"));
|
|
}
|
|
|
|
#[test]
|
|
fn css_class_text_normal_prose_kept() {
|
|
// Normal English text — must NOT be detected as CSS
|
|
assert!(!is_css_class_text(
|
|
"the text-based approach works well for this use case"
|
|
));
|
|
assert!(!is_css_class_text(
|
|
"Build beautiful websites with modern tools"
|
|
));
|
|
assert!(!is_css_class_text(
|
|
"Tailwind CSS is a utility-first CSS framework"
|
|
));
|
|
// Too short to be confident
|
|
assert!(!is_css_class_text("flex grid"));
|
|
assert!(!is_css_class_text("text-lg"));
|
|
}
|
|
|
|
#[test]
|
|
fn css_class_text_mixed_content() {
|
|
// Majority CSS → detected
|
|
assert!(is_css_class_text(
|
|
"text-4xl font-bold tracking-tight text-gray-900 hero"
|
|
));
|
|
// Majority prose → not detected
|
|
assert!(!is_css_class_text(
|
|
"The quick brown fox jumps over the lazy text-lg dog"
|
|
));
|
|
}
|
|
}
|