/// Registrable domains that are spec/schema/example noise, never real API
/// surface (minified JSON-Schema/`schema.org` refs show up constantly).
const NOISE_HOSTS: &[&str] = &[
    "schema.org",
    "json-schema.org",
    "w3.org",
    "example.com",
    "example.org",
    "example.net",
    "localhost",
];

/// Returns `true` for a host worth reporting: at least two non-empty labels
/// whose last label (the TLD) is two or more ASCII letters.
///
/// Rejects minifier garbage like `http://f` / `http://n` and UUID-ish single
/// labels that the URL regex otherwise picks up. Numeric final labels (for
/// example dotted IPv4 literals) are rejected as well, because the TLD must
/// be purely alphabetic.
///
/// A single trailing root dot (`example.com.`) is tolerated. More than one
/// trailing dot leaves an empty label behind and is rejected.
///
/// NOTE(review): punycode TLDs (`xn--…`) contain digits/hyphens and are
/// therefore rejected — confirm that is intended.
fn is_valid_host(host: &str) -> bool {
    // Strip exactly one root dot; `trim_end_matches` would swallow any number
    // of them and let `example.com..` slip past the empty-label check below.
    let name = host.strip_suffix('.').unwrap_or(host);

    // Single pass: count labels and remember the last one, without allocating.
    let mut label_count = 0usize;
    let mut tld = "";
    for label in name.split('.') {
        if label.is_empty() {
            return false;
        }
        label_count += 1;
        tld = label;
    }

    label_count >= 2 && tld.len() >= 2 && tld.chars().all(|c| c.is_ascii_alphabetic())
}

/// Returns `true` for bare/low-signal relative paths that are just a prefix,
/// not an endpoint (e.g. `/api`, `/api/`, `/`).
///
/// Trailing slashes are ignored. Anything shorter than four bytes after
/// trimming is treated as noise (so `/v1` is dropped too), as are the exact
/// prefixes `/api` and `/rest`. `/graphql`, `/gql`, `/api/x` are kept.
fn is_noise_path(p: &str) -> bool {
    let trimmed = p.trim_end_matches('/');
    trimmed.len() < 4 || matches!(trimmed, "/api" | "/rest")
}