feat(core): filter endpoint-extractor noise (invalid hosts, schema domains, bare paths)

2026-06-13 23:15:13 +02:00 · 2026-05-19 19:04:28 +02:00 · 2026-05-19 19:04:28 +02:00 · db176261e2
commit db176261e2
parent 87d4c14f98
1 changed files with 95 additions and 6 deletions
--- a/crates/webclaw-core/src/endpoints.rs
+++ b/crates/webclaw-core/src/endpoints.rs
@ -94,6 +94,38 @@ fn is_first_party(candidate_host: &str, base_reg: &str) -> bool {
    ch == base_reg || ch.ends_with(&format!(".{base_reg}"))
 }

+/// Registrable domains that are spec/schema/example noise, never real API
+/// surface (minified JSON-Schema/`schema.org` refs show up constantly).
+const NOISE_HOSTS: &[&str] = &[
+    "schema.org",
+    "json-schema.org",
+    "w3.org",
+    "example.com",
+    "example.org",
+    "example.net",
+    "localhost",
+];
+
+/// A host worth reporting: multi-label with an alphabetic TLD (>=2 chars).
+/// Rejects minifier garbage like `http://f` / `http://n` and UUID-ish
+/// single labels that the URL regex otherwise picks up.
+fn is_valid_host(host: &str) -> bool {
+    let h = host.trim_end_matches('.');
+    let labels: Vec<&str> = h.split('.').collect();
+    if labels.len() < 2 || labels.iter().any(|l| l.is_empty()) {
+        return false;
+    }
+    let tld = labels[labels.len() - 1];
+    tld.len() >= 2 && tld.chars().all(|c| c.is_ascii_alphabetic())
+}
+
+/// Bare/low-signal relative paths that are just the prefix, not an endpoint
+/// (e.g. `/api`, `/api/`, `/`). `/graphql`, `/gql`, `/api/x` are kept.
+fn is_noise_path(p: &str) -> bool {
+    let t = p.trim_end_matches('/');
+    t.len() < 4 || matches!(t, "/api" | "/rest")
+}
+
 /// Resolved absolute `<script src>` URLs (http/https only), deduped, capped.
 /// Inline scripts have no `src` and are scanned via [`extract_endpoints`].
 pub fn script_srcs(html: &str, base_url: &str) -> Vec<String> {
@ -152,15 +184,31 @@ pub fn extract_endpoints(
        }
        let first_party = match Url::parse(&value) {
            Ok(u) => {
-                if let Some(h) = u.host_str() {
-                    hosts.insert(h.to_ascii_lowercase());
-                    is_first_party(h, &base_reg)
-                } else {
-                    false
+                let Some(h) = u.host_str() else {
+                    return true;
+                };
+                if !is_valid_host(h) {
+                    return true; // minifier garbage host
                }
+                if NOISE_HOSTS.contains(&registrable_domain(h).as_str()) {
+                    return true; // schema.org / json-schema.org / example.*
+                }
+                // Absolute URL with no real path is an origin/site link,
+                // not an API endpoint (drops the page's own URL too).
+                let path = u.path();
+                if path.is_empty() || path == "/" {
+                    return true;
+                }
+                hosts.insert(h.to_ascii_lowercase());
+                is_first_party(h, &base_reg)
            }
            // Relative path: same origin as the page by definition.
-            Err(_) => true,
+            Err(_) => {
+                if is_noise_path(&value) {
+                    return true; // bare /api, /, ultra-short
+                }
+                true
+            }
        };
        if seen.insert((value.clone(), source.to_string())) {
            endpoints.push(DiscoveredEndpoint {
@ -423,4 +471,45 @@ mod tests {
        assert_eq!(r.bundles_scanned, 0);
        assert!(!r.truncated);
    }
+
+    #[test]
+    fn v1_1_noise_is_filtered() {
+        let bundles = vec![(
+            "b.js".to_string(),
+            r#"
+              "/api/search/events";
+              "/api"; "/api/";
+              "http://f"; "http://n/x";
+              "https://schema.org/Thing";
+              "http://json-schema.org/draft-07/schema";
+              "https://www.ticketmaster.co.uk/";
+              "https://pubapi.ticketmaster.co.uk/discovery/v2/events";
+              "wss://live.ticketmaster.co.uk/socket";
+            "#
+            .to_string(),
+        )];
+        let r = extract_endpoints("<html></html>", "https://www.ticketmaster.co.uk/", &bundles);
+        let vals: std::collections::BTreeSet<&str> =
+            r.endpoints.iter().map(|e| e.value.as_str()).collect();
+        assert!(vals.contains("/api/search/events"));
+        assert!(vals.contains("https://pubapi.ticketmaster.co.uk/discovery/v2/events"));
+        assert!(vals.contains("wss://live.ticketmaster.co.uk/socket"));
+        for junk in [
+            "/api",
+            "/api/",
+            "http://f",
+            "http://n/x",
+            "https://schema.org/Thing",
+            "http://json-schema.org/draft-07/schema",
+            "https://www.ticketmaster.co.uk/",
+        ] {
+            assert!(!vals.contains(junk), "noise leaked: {junk}");
+        }
+        assert!(
+            !r.hosts
+                .iter()
+                .any(|h| h == "f" || h == "n" || h == "schema.org")
+        );
+        assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk"));
+    }
 }