webclaw/crates/webclaw-core/src/endpoints.rs
Valerio 499345046c fix: harden LLM providers, UTF-8 handling, and webhook/batch reliability
- webclaw-llm: add explicit request + connect timeouts to the reqwest
  client in every provider (anthropic, openai, ollama) with a shorter
  timeout on the ollama health check, so a stalled provider fails fast.
- webclaw-llm: fix a panic when truncating a provider error body that
  contains multibyte characters near the 500-char cut (char-safe take).
- webclaw-core: snap the endpoint-scan budget cut to a UTF-8 char
  boundary so oversized scripts with non-ASCII content no longer panic.
- webclaw-core: rewrite js_literal_to_json to copy raw bytes instead of
  `byte as char`, preserving multibyte UTF-8 in SvelteKit string values
  rather than producing Latin-1 mojibake.
- webclaw-cli: have fire_webhook return its JoinHandle and await it at
  the crawl/batch/batch-llm call sites, removing the fixed 500ms sleeps.
- webclaw-mcp: drop the up-front DNS pre-validation loop in batch that
  aborted the whole request on one bad URL; the fetch layer already
  applies the same SSRF guard per URL and reports per-URL errors.
- webclaw-fetch: include the port in the warmup homepage URL so hosts
  on a non-default port are warmed correctly.

Adds regression tests for the UTF-8 endpoint-scan and SvelteKit cases.
2026-06-09 21:10:15 +02:00

533 lines
18 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! API/endpoint surface discovery from HTML + JS bundle text.
//!
//! Pure and zero-network: callers fetch the page and its `<script src>`
//! bundles, then hand the raw text here. We surface API paths, absolute
//! API URLs, GraphQL and WebSocket endpoints that live in inline scripts
//! and bundles — the surface a sitemap/`map` can never see.
//!
//! Heuristic by design: regex over string literals, not JS dataflow.
//! High-signal patterns only; bounded for DoS safety.
use once_cell::sync::Lazy;
use regex::Regex;
use scraper::{Html, Selector};
use std::collections::BTreeSet;
use url::Url;
/// Hard caps so a hostile/huge bundle set can't blow up CPU or memory.
const MAX_SCAN_BYTES: usize = 8 * 1024 * 1024;
const MAX_ENDPOINTS: usize = 2000;
/// Cap on `<script src>` URLs returned for the caller to fetch.
const MAX_SCRIPT_SRCS: usize = 40;
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum EndpointKind {
RelativePath,
AbsoluteUrl,
GraphQl,
WebSocket,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct DiscoveredEndpoint {
pub value: String,
pub kind: EndpointKind,
pub first_party: bool,
/// `"inline"` or the bundle URL the match came from.
pub source: String,
}
#[derive(Debug, Default, serde::Serialize)]
pub struct EndpointReport {
pub endpoints: Vec<DiscoveredEndpoint>,
/// Distinct hosts seen across absolute URLs (first- and third-party).
pub hosts: Vec<String>,
pub bundles_scanned: usize,
/// True if a cap was hit and results may be incomplete.
pub truncated: bool,
}
// Quoted relative path that looks API-ish. Bounded quantifiers; the `regex`
// crate is linear-time (RE2) so this cannot catastrophically backtrack.
static RE_REL_PATH: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r#"["'`](/[A-Za-z0-9_\-./]{0,200}?(?:api|graphql|gql|/v[0-9]|/rest|/gateway|/internal|/discovery)[A-Za-z0-9_\-./]{0,200})["'`]"#,
)
.expect("RE_REL_PATH")
});
static RE_ABS_URL: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"https?://[A-Za-z0-9.\-]{1,253}(?:/[A-Za-z0-9_\-./%]{0,400})?"#)
.expect("RE_ABS_URL")
});
static RE_WS: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"wss?://[A-Za-z0-9.\-]{1,253}(?:/[A-Za-z0-9_\-./%]{0,256})?"#).expect("RE_WS")
});
static SCRIPT_SEL: Lazy<Selector> = Lazy::new(|| Selector::parse("script").expect("script sel"));
/// Common multi-label public suffixes so `ticketmaster.co.uk` resolves to
/// `ticketmaster.co.uk` (not `co.uk`). Not a full PSL — pragmatic v1.
const SUFFIX2: &[&str] = &[
"co.uk", "org.uk", "gov.uk", "ac.uk", "me.uk", "com.au", "net.au", "org.au", "co.jp", "co.nz",
"co.za", "com.br", "com.mx", "com.sg", "co.in", "co.kr", "com.tr", "com.cn",
];
fn registrable_domain(host: &str) -> String {
let host = host.trim_end_matches('.').to_ascii_lowercase();
let labels: Vec<&str> = host.split('.').collect();
if labels.len() < 2 {
return host;
}
let last2 = labels[labels.len() - 2..].join(".");
if SUFFIX2.contains(&last2.as_str()) && labels.len() >= 3 {
labels[labels.len() - 3..].join(".")
} else {
last2
}
}
fn is_first_party(candidate_host: &str, base_reg: &str) -> bool {
let ch = candidate_host.to_ascii_lowercase();
ch == base_reg || ch.ends_with(&format!(".{base_reg}"))
}
/// Registrable domains that are spec/schema/example noise, never real API
/// surface (minified JSON-Schema/`schema.org` refs show up constantly).
const NOISE_HOSTS: &[&str] = &[
"schema.org",
"json-schema.org",
"w3.org",
"example.com",
"example.org",
"example.net",
"localhost",
];
/// A host worth reporting: multi-label with an alphabetic TLD (>=2 chars).
/// Rejects minifier garbage like `http://f` / `http://n` and UUID-ish
/// single labels that the URL regex otherwise picks up.
fn is_valid_host(host: &str) -> bool {
let h = host.trim_end_matches('.');
let labels: Vec<&str> = h.split('.').collect();
if labels.len() < 2 || labels.iter().any(|l| l.is_empty()) {
return false;
}
let tld = labels[labels.len() - 1];
tld.len() >= 2 && tld.chars().all(|c| c.is_ascii_alphabetic())
}
/// Bare/low-signal relative paths that are just the prefix, not an endpoint
/// (e.g. `/api`, `/api/`, `/`). `/graphql`, `/gql`, `/api/x` are kept.
fn is_noise_path(p: &str) -> bool {
let t = p.trim_end_matches('/');
t.len() < 4 || matches!(t, "/api" | "/rest")
}
/// Resolved absolute `<script src>` URLs (http/https only), deduped, capped.
/// Inline scripts have no `src` and are scanned via [`extract_endpoints`].
pub fn script_srcs(html: &str, base_url: &str) -> Vec<String> {
let base = Url::parse(base_url).ok();
let doc = Html::parse_document(html);
let mut seen = BTreeSet::new();
let mut out = Vec::new();
for el in doc.select(&SCRIPT_SEL) {
if out.len() >= MAX_SCRIPT_SRCS {
break;
}
let Some(src) = el.value().attr("src") else {
continue;
};
let resolved = match Url::parse(src) {
Ok(u) => Some(u),
Err(_) => base.as_ref().and_then(|b| b.join(src).ok()),
};
let Some(u) = resolved else {
continue;
};
if (u.scheme() == "http" || u.scheme() == "https") && seen.insert(u.to_string()) {
out.push(u.to_string());
}
}
out
}
/// Extract endpoints from inline HTML scripts plus pre-fetched JS bundles.
/// `bundles` is `(bundle_url, bundle_text)`.
pub fn extract_endpoints(
html: &str,
base_url: &str,
bundles: &[(String, String)],
) -> EndpointReport {
let base_reg = Url::parse(base_url)
.ok()
.and_then(|u| u.host_str().map(registrable_domain))
.unwrap_or_default();
let mut endpoints: Vec<DiscoveredEndpoint> = Vec::new();
let mut seen: BTreeSet<(String, String)> = BTreeSet::new();
let mut hosts: BTreeSet<String> = BTreeSet::new();
let mut budget = MAX_SCAN_BYTES;
let mut truncated = false;
let push = |value: String,
kind: EndpointKind,
source: &str,
endpoints: &mut Vec<DiscoveredEndpoint>,
seen: &mut BTreeSet<(String, String)>,
hosts: &mut BTreeSet<String>|
-> bool {
if endpoints.len() >= MAX_ENDPOINTS {
return false;
}
let first_party = match Url::parse(&value) {
Ok(u) => {
let Some(h) = u.host_str() else {
return true;
};
if !is_valid_host(h) {
return true; // minifier garbage host
}
if NOISE_HOSTS.contains(&registrable_domain(h).as_str()) {
return true; // schema.org / json-schema.org / example.*
}
// Absolute URL with no real path is an origin/site link,
// not an API endpoint (drops the page's own URL too).
let path = u.path();
if path.is_empty() || path == "/" {
return true;
}
hosts.insert(h.to_ascii_lowercase());
is_first_party(h, &base_reg)
}
// Relative path: same origin as the page by definition.
Err(_) => {
if is_noise_path(&value) {
return true; // bare /api, /, ultra-short
}
true
}
};
if seen.insert((value.clone(), source.to_string())) {
endpoints.push(DiscoveredEndpoint {
value,
kind,
first_party,
source: source.to_string(),
});
}
true
};
let scan = |text: &str,
source: &str,
endpoints: &mut Vec<DiscoveredEndpoint>,
seen: &mut BTreeSet<(String, String)>,
hosts: &mut BTreeSet<String>,
budget: &mut usize,
truncated: &mut bool| {
if *budget == 0 {
return;
}
let slice = if text.len() > *budget {
*truncated = true;
// Snap the cut to a UTF-8 char boundary so non-ASCII content
// (multibyte codepoints straddling the budget) can't panic.
let mut cut = (*budget).min(text.len());
while cut > 0 && !text.is_char_boundary(cut) {
cut -= 1;
}
&text[..cut]
} else {
text
};
*budget -= slice.len();
for c in RE_REL_PATH.captures_iter(slice) {
if let Some(m) = c.get(1) {
let v = m.as_str().to_string();
let kind = if v.contains("graphql") || v.contains("/gql") {
EndpointKind::GraphQl
} else {
EndpointKind::RelativePath
};
if !push(v, kind, source, endpoints, seen, hosts) {
*truncated = true;
return;
}
}
}
for m in RE_WS.find_iter(slice) {
if !push(
m.as_str().to_string(),
EndpointKind::WebSocket,
source,
endpoints,
seen,
hosts,
) {
*truncated = true;
return;
}
}
for m in RE_ABS_URL.find_iter(slice) {
let v = m.as_str().to_string();
// Skip obvious static assets — we want API surface, not CDN files.
let lower = v.to_ascii_lowercase();
if lower.ends_with(".js")
|| lower.ends_with(".css")
|| lower.ends_with(".png")
|| lower.ends_with(".jpg")
|| lower.ends_with(".svg")
|| lower.ends_with(".woff2")
{
// still record the host for visibility
if let Some(h) = Url::parse(&v)
.ok()
.and_then(|u| u.host_str().map(str::to_string))
{
hosts.insert(h.to_ascii_lowercase());
}
continue;
}
let kind = if lower.contains("graphql") || lower.contains("/gql") {
EndpointKind::GraphQl
} else {
EndpointKind::AbsoluteUrl
};
if !push(v, kind, source, endpoints, seen, hosts) {
*truncated = true;
return;
}
}
};
// Inline scripts.
let doc = Html::parse_document(html);
let mut inline = String::new();
for el in doc.select(&SCRIPT_SEL) {
if el.value().attr("src").is_none() {
inline.push_str(&el.text().collect::<String>());
inline.push('\n');
}
}
scan(
&inline,
"inline",
&mut endpoints,
&mut seen,
&mut hosts,
&mut budget,
&mut truncated,
);
// Bundles.
let mut bundles_scanned = 0usize;
for (src, text) in bundles {
if budget == 0 {
truncated = true;
break;
}
bundles_scanned += 1;
scan(
text,
src,
&mut endpoints,
&mut seen,
&mut hosts,
&mut budget,
&mut truncated,
);
}
endpoints.sort_by(|a, b| (a.kind, &a.value, &a.source).cmp(&(b.kind, &b.value, &b.source)));
EndpointReport {
endpoints,
hosts: hosts.into_iter().collect(),
bundles_scanned,
truncated,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn registrable_domain_handles_cc_tlds() {
assert_eq!(
registrable_domain("www.ticketmaster.co.uk"),
"ticketmaster.co.uk"
);
assert_eq!(
registrable_domain("api.ticketmaster.com"),
"ticketmaster.com"
);
assert_eq!(
registrable_domain("pubapi.ticketmaster.co.uk"),
"ticketmaster.co.uk"
);
assert_eq!(registrable_domain("localhost"), "localhost");
}
#[test]
fn script_srcs_resolves_and_filters() {
let html = r#"<html><head>
<script src="/_next/static/chunks/main-abc.js"></script>
<script src="https://cdn.example.net/lib.js"></script>
<script>var inline = 1;</script>
<script src="data:text/javascript,1"></script>
</head></html>"#;
let srcs = script_srcs(html, "https://www.ticketmaster.co.uk/");
assert!(srcs.contains(
&"https://www.ticketmaster.co.uk/_next/static/chunks/main-abc.js".to_string()
));
assert!(srcs.contains(&"https://cdn.example.net/lib.js".to_string()));
assert_eq!(srcs.len(), 2, "inline + data: ignored");
}
#[test]
fn extracts_inline_and_bundle_endpoints_with_classification() {
let html = r#"<html><body>
<script>
var cfg = { search: "/api/search/events", suggest: "/api/search/search-suggest" };
fetch("/api/venue/info");
</script>
<script src="/app.js"></script>
</body></html>"#;
let bundles = vec![(
"https://www.ticketmaster.co.uk/app.js".to_string(),
r#"
const GQL = "https://pubapi.ticketmaster.co.uk/graphql";
axios.post("https://services.ticketmaster.co.uk/discovery/v2/events");
new WebSocket("wss://live.ticketmaster.co.uk/socket");
const ga = "https://www.googletagservices.com/tag/js/gpt.js";
const img = "https://cdn.tmol.co/hero.png";
"#
.to_string(),
)];
let r = extract_endpoints(html, "https://www.ticketmaster.co.uk/", &bundles);
let vals: Vec<&str> = r.endpoints.iter().map(|e| e.value.as_str()).collect();
assert!(vals.contains(&"/api/search/events"));
assert!(vals.contains(&"/api/search/search-suggest"));
assert!(vals.contains(&"/api/venue/info"));
assert!(vals.contains(&"https://pubapi.ticketmaster.co.uk/graphql"));
assert!(vals.contains(&"https://services.ticketmaster.co.uk/discovery/v2/events"));
assert!(vals.contains(&"wss://live.ticketmaster.co.uk/socket"));
// static .js asset is not an endpoint, but its host is recorded
assert!(!vals.contains(&"https://www.googletagservices.com/tag/js/gpt.js"));
assert!(r.hosts.iter().any(|h| h == "www.googletagservices.com"));
let gql = r
.endpoints
.iter()
.find(|e| e.value.contains("graphql"))
.unwrap();
assert_eq!(gql.kind, EndpointKind::GraphQl);
assert!(
gql.first_party,
"pubapi.ticketmaster.co.uk is first-party to .co.uk"
);
let third = r
.endpoints
.iter()
.find(|e| e.value.starts_with("/api/venue"));
assert!(third.unwrap().first_party, "relative path is same-origin");
assert_eq!(r.bundles_scanned, 1);
}
#[test]
fn third_party_absolute_is_flagged_not_first_party() {
let bundles = vec![(
"b".to_string(),
r#"x="https://api.stripe.com/v1/charges""#.to_string(),
)];
let r = extract_endpoints("<html></html>", "https://www.ticketmaster.co.uk/", &bundles);
let e = r
.endpoints
.iter()
.find(|e| e.value.contains("stripe"))
.unwrap();
assert!(!e.first_party);
}
#[test]
fn caps_bound_pathological_input() {
// A huge blob of fake endpoints must not exceed MAX_ENDPOINTS and
// must return promptly (regex crate is linear-time).
let mut big = String::new();
for i in 0..50_000 {
big.push_str(&format!("\"/api/v1/item/{i}\" "));
}
let bundles = vec![("big".to_string(), big)];
let r = extract_endpoints("<html></html>", "https://x.com/", &bundles);
assert!(r.endpoints.len() <= MAX_ENDPOINTS);
assert!(r.truncated);
}
#[test]
fn empty_inputs_are_safe() {
let r = extract_endpoints("", "not a url", &[]);
assert!(r.endpoints.is_empty());
assert_eq!(r.bundles_scanned, 0);
assert!(!r.truncated);
}
#[test]
fn v1_1_noise_is_filtered() {
let bundles = vec![(
"b.js".to_string(),
r#"
"/api/search/events";
"/api"; "/api/";
"http://f"; "http://n/x";
"https://schema.org/Thing";
"http://json-schema.org/draft-07/schema";
"https://www.ticketmaster.co.uk/";
"https://pubapi.ticketmaster.co.uk/discovery/v2/events";
"wss://live.ticketmaster.co.uk/socket";
"#
.to_string(),
)];
let r = extract_endpoints("<html></html>", "https://www.ticketmaster.co.uk/", &bundles);
let vals: std::collections::BTreeSet<&str> =
r.endpoints.iter().map(|e| e.value.as_str()).collect();
assert!(vals.contains("/api/search/events"));
assert!(vals.contains("https://pubapi.ticketmaster.co.uk/discovery/v2/events"));
assert!(vals.contains("wss://live.ticketmaster.co.uk/socket"));
for junk in [
"/api",
"/api/",
"http://f",
"http://n/x",
"https://schema.org/Thing",
"http://json-schema.org/draft-07/schema",
"https://www.ticketmaster.co.uk/",
] {
assert!(!vals.contains(junk), "noise leaked: {junk}");
}
assert!(
!r.hosts
.iter()
.any(|h| h == "f" || h == "n" || h == "schema.org")
);
assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk"));
}
#[test]
fn scan_truncation_at_non_ascii_boundary_does_not_panic() {
// A bundle just over the scan budget, padded with a multibyte char
// ('é' is 2 bytes) so the cut lands mid-codepoint. The old
// `&text[..budget]` slice panicked here; the boundary snap must not.
let pad = "é".repeat(MAX_SCAN_BYTES); // ~2× budget in bytes
let bundle = format!("{pad} fetch(\"/api/x\")");
let bundles = vec![("big.js".to_string(), bundle)];
let r = extract_endpoints("<html></html>", "https://example.com/", &bundles);
assert!(r.truncated, "oversized bundle should mark truncated");
}
}