fix: harden resource limits, path safety, and WASM build (#46)

Security audit follow-up across the workspace:

- webclaw-core: keep the crate WASM-safe. quickjs/rquickjs is now a
  cfg(not(wasm32)) target dependency and the extraction entry point uses
  a direct call on wasm instead of spawning a thread, so it builds and
  runs on wasm32 with or without default features.
- webclaw-core: bound the structured-data scrubber recursion (depth cap)
  so deeply nested attacker JSON-LD / __NEXT_DATA__ cannot exhaust the
  stack.
- webclaw-fetch: stream the response body with a running ceiling so a
  small highly compressed payload cannot inflate to gigabytes in memory;
  redact user:pass@ from proxy URLs before they reach error strings.
- webclaw-cli: contain output filenames inside the chosen directory
  (reject .. / absolute, drop traversal path segments), run --webhook
  URLs through the public-URL SSRF guard, clamp --watch-interval to >=1s,
  and make research slug truncation char-safe.
- webclaw-mcp: char-safe slug truncation (no multibyte slice panic).
- setup.sh / deploy/hetzner.sh: replace eval on read input with
  printf -v, and mask auth key / API token in console output.
- CI: enforce the wasm32 build invariant for webclaw-core.

Tests added for every behavioral change. Bump to 0.6.3 + CHANGELOG.
This commit is contained in:
Valerio 2026-05-19 17:03:52 +02:00 committed by GitHub
parent aab51bea91
commit be8bcfebd9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 454 additions and 47 deletions

View file

@ -31,6 +31,21 @@ jobs:
- run: cargo fmt --check --all
- run: cargo clippy --all -- -D warnings
wasm:
name: WASM
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
with:
targets: wasm32-unknown-unknown
- uses: Swatinem/rust-cache@v2
# webclaw-core must stay WASM-safe (zero network deps, no threads).
# Check both with and without default features so the quickjs gate
# can't regress.
- run: cargo check --target wasm32-unknown-unknown -p webclaw-core
- run: cargo check --target wasm32-unknown-unknown -p webclaw-core --no-default-features
docs:
name: Docs
runs-on: ubuntu-latest

View file

@ -3,6 +3,13 @@
All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/).
## [0.6.3] — 2026-05-19
### Fixed
- Hardened resource and path-safety limits across the CLI, MCP server, and self-hosted API: oversized or highly compressed responses are capped while streaming, deeply nested page data can no longer exhaust the stack, output filenames stay inside the chosen directory, webhook URLs are validated like every other fetch, and multibyte search queries no longer crash slug generation.
---
## [0.6.2] — 2026-05-18
### Fixed

14
Cargo.lock generated
View file

@ -3219,7 +3219,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
version = "0.6.2"
version = "0.6.3"
dependencies = [
"clap",
"dotenvy",
@ -3240,7 +3240,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
version = "0.6.2"
version = "0.6.3"
dependencies = [
"ego-tree",
"once_cell",
@ -3258,7 +3258,7 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
version = "0.6.2"
version = "0.6.3"
dependencies = [
"async-trait",
"bytes",
@ -3284,7 +3284,7 @@ dependencies = [
[[package]]
name = "webclaw-llm"
version = "0.6.2"
version = "0.6.3"
dependencies = [
"async-trait",
"reqwest",
@ -3297,7 +3297,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
version = "0.6.2"
version = "0.6.3"
dependencies = [
"dirs",
"dotenvy",
@ -3317,7 +3317,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
version = "0.6.2"
version = "0.6.3"
dependencies = [
"pdf-extract",
"thiserror",
@ -3326,7 +3326,7 @@ dependencies = [
[[package]]
name = "webclaw-server"
version = "0.6.2"
version = "0.6.3"
dependencies = [
"anyhow",
"axum",

View file

@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"]
[workspace.package]
version = "0.6.2"
version = "0.6.3"
edition = "2024"
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"

View file

@ -613,7 +613,15 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
Err(_) => (String::new(), String::new(), None),
};
let mut stem = path.trim_matches('/').to_string();
// Drop empty / "." / ".." path segments so a URL path like
// `/../../etc/passwd` can't climb out of the output directory.
let cleaned_path: String = path
.split('/')
.filter(|seg| !seg.is_empty() && *seg != "." && *seg != "..")
.collect::<Vec<_>>()
.join("/");
let mut stem = cleaned_path;
if stem.is_empty() {
// Use hostname for root URLs to avoid collisions in batch mode
let clean_host = host.strip_prefix("www.").unwrap_or(&host);
@ -640,13 +648,59 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
format!("{sanitized}.{ext}")
}
/// Validate a caller-supplied (CSV `url,filename`) output name so it cannot
/// escape the output directory.
///
/// Refused inputs:
/// - the empty string,
/// - absolute paths, a bare root, or a drive/UNC prefix,
/// - any path containing a `..` component,
/// - paths made up only of `.` components (`.`, `./`): they name the output
///   directory itself, not a file inside it, and would otherwise only fail
///   later with a confusing "is a directory" write error.
///
/// # Errors
/// Returns a human-readable message explaining why the name was refused.
///
/// On success the input is returned unchanged as a relative [`PathBuf`]
/// (a leading `./` is kept as written).
fn safe_relative_filename(filename: &str) -> Result<PathBuf, String> {
    use std::path::Component;

    let candidate = Path::new(filename);
    if candidate.as_os_str().is_empty() {
        return Err("empty output filename".to_string());
    }

    // Track whether anything other than `.` was seen: a path with no normal
    // component cannot designate a file.
    let mut has_file_component = false;
    for comp in candidate.components() {
        match comp {
            Component::Normal(_) => has_file_component = true,
            Component::CurDir => {}
            Component::ParentDir => {
                return Err(format!("refusing path with '..' component: {filename}"));
            }
            Component::RootDir | Component::Prefix(_) => {
                return Err(format!("refusing absolute output path: {filename}"));
            }
        }
    }

    if !has_file_component {
        return Err(format!(
            "refusing output path without a file name: {filename}"
        ));
    }
    Ok(candidate.to_path_buf())
}
/// Write extraction output to a file inside `dir`, creating parent dirs as needed.
///
/// `filename` may originate from an attacker-controlled `--urls-file`
/// (`url,filename` CSV). It is validated for traversal, and the canonical
/// destination directory is asserted to stay under the canonical output
/// directory before any write.
fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
let dest = dir.join(filename);
let rel = safe_relative_filename(filename)?;
let dest = dir.join(&rel);
std::fs::create_dir_all(dir)
.map_err(|e| format!("failed to create directory {}: {e}", dir.display()))?;
let base = dir
.canonicalize()
.map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?;
if let Some(parent) = dest.parent() {
std::fs::create_dir_all(parent)
.map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?;
let canon_parent = parent
.canonicalize()
.map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?;
if !canon_parent.starts_with(&base) {
return Err(format!(
"refusing to write outside output dir: {}",
dest.display()
));
}
}
std::fs::write(&dest, content)
.map_err(|e| format!("failed to write {}: {e}", dest.display()))?;
let word_count = content.split_whitespace().count();
@ -1679,6 +1733,13 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
serde_json::to_string(payload).unwrap_or_default()
};
tokio::spawn(async move {
// SSRF guard: a webhook URL is user-supplied and otherwise bypasses
// the fetch-layer protections, so resolve + reject private/internal
// destinations before sending the payload.
if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await {
eprintln!("[webhook] refusing unsafe URL: {e}");
return;
}
match reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(10))
.build()
@ -1750,7 +1811,9 @@ async fn run_watch_single(
);
loop {
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
// Clamp to >=1s: `--watch-interval 0` would otherwise spin the
// fetch loop with zero delay and hammer the target.
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
if cancelled.load(Ordering::Relaxed) {
eprintln!("[watch] Stopped");
@ -1842,7 +1905,9 @@ async fn run_watch_multi(
let mut check_number = 0u64;
loop {
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
// Clamp to >=1s: `--watch-interval 0` would otherwise spin the
// fetch loop with zero delay and hammer the target.
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
if cancelled.load(Ordering::Relaxed) {
eprintln!("[watch] Stopped");
@ -2321,7 +2386,9 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
.collect::<Vec<_>>()
.join("-")
.to_lowercase();
let slug = if slug.len() > 50 { &slug[..50] } else { &slug };
// char-safe truncation: byte slicing panics if byte index 50
// lands mid-codepoint (multibyte queries).
let slug: String = slug.chars().take(50).collect();
let filename = format!("research-{slug}.json");
let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default();
@ -2773,4 +2840,66 @@ mod tests {
assert_eq!(content, "hello");
let _ = std::fs::remove_dir_all(&dir);
}
#[test]
fn url_to_filename_strips_traversal_segments() {
    // A URL path that tries to climb upward must yield a plain relative
    // name with no `..` left in it.
    let climbing_url = "https://example.com/../../etc/passwd";
    let climbed = url_to_filename(climbing_url, &OutputFormat::Markdown);
    assert!(!climbed.contains(".."), "traversal leaked: {climbed}");
    assert_eq!(climbed, "etc/passwd.md");

    // `.` and empty segments are dropped as well.
    let noisy_url = "https://example.com/a/./b//c";
    let collapsed = url_to_filename(noisy_url, &OutputFormat::Json);
    assert_eq!(collapsed, "a/b/c.json");
}
#[test]
fn safe_relative_filename_rejects_escapes() {
    let rejected = |name: &str| safe_relative_filename(name).is_err();
    let accepted = |name: &str| safe_relative_filename(name).is_ok();

    // Parent-dir climbs, absolute paths and the empty name are all refused.
    assert!(rejected("../escape.md"));
    assert!(rejected("a/../../b.md"));
    assert!(rejected("/etc/passwd"));
    assert!(rejected(""));

    // Ordinary nested relative names (with or without a leading `./`) pass.
    assert!(accepted("nested/deep/file.md"));
    assert!(accepted("./ok.md"));
}
#[test]
fn write_to_file_refuses_traversal_filename() {
    let out_dir = std::env::temp_dir().join("webclaw_test_traversal_dir");
    // Start from a clean slate; the directory may be left over from a
    // previous run.
    let _ = std::fs::remove_dir_all(&out_dir);

    // Simulates a CSV-supplied `url,filename` entry that tries to climb out.
    let outcome = write_to_file(&out_dir, "../../tmp/webclaw_pwned.md", "x");
    let err = outcome.unwrap_err();
    assert!(err.contains("refusing"), "unexpected error: {err}");

    let escaped_target = std::path::Path::new("/tmp/webclaw_pwned.md");
    assert!(
        !escaped_target.exists(),
        "traversal write escaped the output dir"
    );

    let _ = std::fs::remove_dir_all(&out_dir);
}
#[test]
fn research_slug_truncation_is_char_safe() {
    // 120 three-byte characters: slicing the bytes at index 50 would land
    // inside a code point and panic.
    let query = "日本語".repeat(40);

    // Same pipeline as the research slug: non-alphanumerics become spaces,
    // whitespace runs collapse into single dashes, then lowercase.
    let spaced: String = query
        .chars()
        .map(|ch| if ch.is_alphanumeric() || ch == ' ' { ch } else { ' ' })
        .collect();
    let words: Vec<&str> = spaced.split_whitespace().collect();
    let dashed = words.join("-").to_lowercase();

    let slug: String = dashed.chars().take(50).collect();
    assert!(slug.chars().count() <= 50);

    // Formatting the final filename must not panic either.
    let _ = format!("research-{slug}.json");
}
}

View file

@ -20,6 +20,11 @@ url = { version = "2", features = ["serde"] }
regex = "1"
once_cell = "1"
similar = "2"
# rquickjs links a C library and cannot build for wasm32. Gating it per
# target keeps the `quickjs` feature usable on native while leaving the
# crate WASM-safe even with default features enabled.
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }
[dev-dependencies]

View file

@ -9,7 +9,7 @@ pub mod diff;
pub mod domain;
pub mod error;
pub mod extractor;
#[cfg(feature = "quickjs")]
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
pub mod js_eval;
pub mod llm;
pub mod markdown;
@ -46,9 +46,13 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
/// `url` — optional source URL, used for resolving relative links and domain detection
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
///
/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1-2 MB
/// main-thread stack on Windows.
/// On native targets, spawns extraction on a thread with an 8 MB stack to
/// handle deeply nested HTML (e.g., Express.co.uk live blogs) without
/// overflowing the default 1-2 MB main-thread stack on Windows.
///
/// On `wasm32`, threads are unavailable (`std::thread::spawn` panics at
/// runtime), so extraction runs inline on the caller's stack.
#[cfg(not(target_arch = "wasm32"))]
pub fn extract_with_options(
html: &str,
url: Option<&str>,
@ -70,6 +74,16 @@ pub fn extract_with_options(
.unwrap_or(Err(ExtractError::NoContent))
}
/// `wasm32` variant of `extract_with_options`.
///
/// WASM has no threads, so instead of spawning a worker this variant runs
/// extraction directly on the caller's stack by delegating to
/// `extract_with_options_inner` and returning its result unchanged.
///
/// `html` — the page markup to extract from
/// `url` — optional source URL, forwarded as-is
/// `options` — extraction options, forwarded as-is
#[cfg(target_arch = "wasm32")]
pub fn extract_with_options(
html: &str,
url: Option<&str>,
options: &ExtractionOptions,
) -> Result<ExtractionResult, ExtractError> {
extract_with_options_inner(html, url, options)
}
fn extract_with_options_inner(
html: &str,
url: Option<&str>,
@ -187,7 +201,7 @@ fn extract_with_options_inner(
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
// static JSON data island extraction above with runtime-evaluated data.
#[cfg(feature = "quickjs")]
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
{
let blobs = js_eval::extract_js_data(html);
if !blobs.is_empty() {
@ -603,4 +617,36 @@ mod tests {
"Should extract content from deep nesting"
);
}
#[test]
fn wasm_direct_call_path_extracts_content() {
    // The wasm32 build of `extract_with_options` is a direct call to
    // `extract_with_options_inner`. Calling the inner function here keeps
    // that path exercised on native CI, and comparing against the public
    // entry point checks both routes agree.
    let html = r#"
<html lang="en">
<head><title>WASM Path</title></head>
<body><article><h1>Heading</h1><p>WASM-safe extraction body content.</p></article></body>
</html>"#;
    let source_url = Some("https://example.com");
    let options = ExtractionOptions::default();

    let direct = extract_with_options_inner(html, source_url, &options)
        .expect("inner extraction (wasm path) should succeed");
    let found_body = direct
        .content
        .markdown
        .contains("WASM-safe extraction body content");
    assert!(
        found_body,
        "wasm direct-call path should extract body, got: {}",
        direct.content.markdown
    );

    let via_public = extract_with_options(html, source_url, &options)
        .expect("threaded extraction should succeed");
    assert_eq!(
        direct.content.markdown, via_public.content.markdown,
        "wasm path and threaded path must produce identical content"
    );
}
}

View file

@ -58,7 +58,7 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
.cloned()
.collect();
for value in &mut useful {
scrub_body_fields(value);
scrub_body_fields(value, 0);
}
if !useful.is_empty() {
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
@ -117,10 +117,21 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
}
/// Recursively remove long fields that duplicate the rendered markdown body.
fn scrub_body_fields(v: &mut serde_json::Value) {
///
/// `depth` guards against stack exhaustion from attacker-controlled
/// JSON-LD / `__NEXT_DATA__` blobs with pathological nesting: past
/// [`MAX_SCRUB_DEPTH`] levels we stop descending and leave the subtree
/// as-is (it is still size-capped by the `STRUCTURED_DATA_MAX_BYTES`
/// budget in `to_llm_text`).
fn scrub_body_fields(v: &mut serde_json::Value, depth: usize) {
const BODY_KEYS: &[&str] = &["articleBody"];
const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
const LONG_THRESHOLD: usize = 500;
const MAX_SCRUB_DEPTH: usize = 64;
if depth >= MAX_SCRUB_DEPTH {
return;
}
match v {
serde_json::Value::Object(map) => {
@ -136,12 +147,12 @@ fn scrub_body_fields(v: &mut serde_json::Value) {
true
});
for value in map.values_mut() {
scrub_body_fields(value);
scrub_body_fields(value, depth + 1);
}
}
serde_json::Value::Array(values) => {
for value in values {
scrub_body_fields(value);
scrub_body_fields(value, depth + 1);
}
}
_ => {}
@ -908,4 +919,53 @@ mod tests {
"Compact untyped array dropped: {out}"
);
}
/// Follow the single `"n"` child link downward from `value` and report the
/// first level whose object still has an `articleBody` key (i.e. one that
/// was NOT scrubbed).
///
/// Returns `None` when the walk reaches a non-object value or an object
/// without an `"n"` child before finding such a key.
fn first_unscrubbed_article_body_depth(mut value: &serde_json::Value) -> Option<usize> {
    let mut level: usize = 0;
    while let Some(fields) = value.as_object() {
        if fields.contains_key("articleBody") {
            return Some(level);
        }
        // No `"n"` link means the chain ended without a survivor.
        value = fields.get("n")?;
        level += 1;
    }
    None
}
#[test]
fn scrub_body_fields_bounds_recursion_on_deep_nesting() {
    // Hostile JSON-LD / __NEXT_DATA__ can be nested arbitrarily deep, so the
    // scrubber must stop descending at its depth cap. The chain below runs
    // a little past the 64-level cap and carries a scrub-able `articleBody`
    // at every level: levels inside the cap lose it, and the first level
    // beyond the cap keeps it — that survivor marks where recursion stopped.
    // (Deliberately shallow: serde_json drops Values recursively, so a
    // 10k-deep value would overflow the stack just being dropped.)
    const DEPTH: usize = 80;

    let leaf = serde_json::json!({ "articleBody": "x".repeat(600) });
    let mut chain = (0..DEPTH).fold(leaf, |child, _| {
        serde_json::json!({
            "articleBody": "x".repeat(600),
            "n": child,
        })
    });

    scrub_body_fields(&mut chain, 0);

    let stopped_at = first_unscrubbed_article_body_depth(&chain)
        .expect("recursion must stop and leave a deep articleBody intact");
    // Everything above the cap was scrubbed; the survivor sits at the cap.
    assert_eq!(
        stopped_at, 64,
        "recursion should stop at the depth cap, stopped at {stopped_at}"
    );

    let root = chain.as_object().unwrap();
    assert!(
        !root.contains_key("articleBody"),
        "shallow articleBody must still be scrubbed"
    );
}
}

View file

@ -95,12 +95,30 @@ struct Response {
/// per page in collapse_whitespace + strip_markdown).
const MAX_BODY_BYTES: u64 = 50 * 1024 * 1024;
/// Decompression-bomb guard applied to every streamed chunk.
///
/// `buffered` is the number of bytes already accumulated and `next_chunk`
/// the size of the chunk about to be appended. Returns
/// `FetchError::BodyDecode` when their sum would exceed [`MAX_BODY_BYTES`];
/// a sum exactly equal to the cap is still accepted. The sum saturates
/// instead of wrapping, so an enormous chunk length cannot slip under the
/// cap.
fn check_body_ceiling(buffered: usize, next_chunk: usize) -> Result<(), FetchError> {
    let already = buffered as u64;
    let incoming = next_chunk as u64;
    if already.saturating_add(incoming) <= MAX_BODY_BYTES {
        return Ok(());
    }
    Err(FetchError::BodyDecode(format!(
        "response body exceeds cap {MAX_BODY_BYTES} bytes (decompressed)"
    )))
}
impl Response {
/// Buffer a wreq response into an owned Response. Rejects bodies that
/// advertise a Content-Length beyond [`MAX_BODY_BYTES`] before we pay
/// the allocation, and truncates after the fact as a belt-and-braces
/// check against a lying server.
async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
/// Buffer a wreq response into an owned Response.
///
/// Rejects bodies that advertise a Content-Length beyond
/// [`MAX_BODY_BYTES`] before we pay any allocation, then streams the
/// body chunk-by-chunk while enforcing a running ceiling. `chunk()`
/// yields *post-decompression* bytes (gzip/brotli/zstd/deflate are
/// negotiated), so a tiny compressed payload that inflates to
/// gigabytes is aborted as soon as the accumulated size crosses the
/// cap — it never gets fully buffered in memory.
async fn from_wreq(mut resp: wreq::Response) -> Result<Self, FetchError> {
if let Some(len) = resp.content_length()
&& len > MAX_BODY_BYTES
{
@ -111,21 +129,22 @@ impl Response {
let status = resp.status().as_u16();
let url = resp.uri().to_string();
let headers = resp.headers().clone();
let body = resp
.bytes()
let mut buf = bytes::BytesMut::new();
while let Some(chunk) = resp
.chunk()
.await
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
if body.len() as u64 > MAX_BODY_BYTES {
return Err(FetchError::BodyDecode(format!(
"response body {} bytes exceeds cap {MAX_BODY_BYTES}",
body.len()
)));
.map_err(|e| FetchError::BodyDecode(e.to_string()))?
{
check_body_ceiling(buf.len(), chunk.len())?;
buf.extend_from_slice(&chunk);
}
Ok(Self {
status,
url,
headers,
body,
body: buf.freeze(),
})
}
@ -896,6 +915,28 @@ mod tests {
assert!(err.result.is_err());
}
#[test]
fn body_ceiling_allows_under_cap() {
    let cap = MAX_BODY_BYTES as usize;
    // Far below the cap.
    assert!(check_body_ceiling(0, 1024).is_ok());
    // The last byte that brings the total to exactly the cap is accepted.
    assert!(check_body_ceiling(cap - 1, 1).is_ok());
}
#[test]
fn body_ceiling_rejects_at_and_over_cap() {
    let cap = MAX_BODY_BYTES as usize;
    // With the buffer already at the cap, one more byte is rejected.
    assert!(check_body_ceiling(cap, 1).is_err());

    // Decompression-bomb shape: a tiny buffer followed by one huge inflated
    // chunk is refused on that very first oversized chunk.
    let bomb_chunk = 64 * 1024 * 1024;
    let err = check_body_ceiling(16, bomb_chunk).unwrap_err();
    assert!(matches!(err, FetchError::BodyDecode(_)));
}
#[test]
fn body_ceiling_saturates_on_overflow() {
    // Both operands at their maximum must be rejected. This input alone
    // does not prove saturation: on 64-bit targets a wrapping sum of
    // `MAX + MAX` is `u64::MAX - 1`, which is still far above the cap.
    assert!(check_body_ceiling(usize::MAX, usize::MAX).is_err());

    // The discriminating case: on 64-bit targets `usize::MAX + 1` wraps to
    // 0 with wrapping arithmetic and would slip under the cap. Saturating
    // addition must keep it rejected, whichever operand is the large one.
    assert!(check_body_ceiling(usize::MAX, 1).is_err());
    assert!(check_body_ceiling(1, usize::MAX).is_err());
}
#[test]
fn test_batch_extract_result_struct() {
let err = BatchExtractResult {

View file

@ -533,8 +533,9 @@ pub fn build_client(
.timeout(timeout);
if let Some(proxy_url) = proxy {
let proxy =
wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
let proxy = wreq::Proxy::all(proxy_url).map_err(|_| {
FetchError::Build(format!("invalid proxy {}", redact_proxy_url(proxy_url)))
})?;
builder = builder.proxy(proxy);
} else {
builder = builder.dns_resolver(PublicDnsResolver);
@ -545,6 +546,24 @@ pub fn build_client(
.map_err(|e| FetchError::Build(e.to_string()))
}
/// Produce a log-safe rendering of a proxy URL.
///
/// Any `user:pass@` userinfo is removed so rotating-proxy credentials never
/// end up in error strings or tracing output. If the input does not parse,
/// or the parsed URL refuses to have its userinfo cleared, a constant
/// placeholder is returned instead of echoing the raw input.
fn redact_proxy_url(raw: &str) -> String {
    const PLACEHOLDER: &str = "<proxy redacted>";

    let Ok(mut parsed) = url::Url::parse(raw) else {
        return PLACEHOLDER.to_string();
    };

    // Best-effort: some URLs (e.g. opaque ones with no host) reject these
    // setters. Returning the placeholder in that case avoids any chance of
    // handing back a string that still carries credentials.
    let cleared = parsed.set_username("").is_ok() && parsed.set_password(None).is_ok();
    if cleared {
        parsed.to_string()
    } else {
        PLACEHOLDER.to_string()
    }
}
fn ssrf_safe_redirect_policy(
follow_redirects: bool,
max_redirects: usize,
@ -567,3 +586,41 @@ fn ssrf_safe_redirect_policy(
})
})
}
#[cfg(test)]
mod tests {
    use super::redact_proxy_url;

    /// Plain `user:pass@host:port` proxy: credentials go, host and port stay.
    #[test]
    fn redacts_userinfo_from_proxy_url() {
        let red = redact_proxy_url("http://user123:s3cr3tPass@proxy.example.com:8080");
        assert!(!red.contains("user123"), "username leaked: {red}");
        assert!(!red.contains("s3cr3tPass"), "password leaked: {red}");
        assert!(!red.contains('@'), "userinfo separator survived: {red}");
        assert!(red.contains("proxy.example.com"), "host lost: {red}");
        assert!(red.contains("8080"), "port lost: {red}");
    }

    /// Residential-style proxy: long structured credential with embedded
    /// tokens in the username and special chars in the password.
    #[test]
    fn redacts_long_token_residential_proxy() {
        let red =
            redact_proxy_url("http://acct-zone-resi-country-xx:p@ss-word@gw.proxy.example:12321");
        assert!(!red.contains("acct-zone-resi"), "username leaked: {red}");
        assert!(!red.contains("p@ss-word"), "password leaked: {red}");
        // The URL parser percent-encodes the `@` inside the password, so a
        // leaked password would show up as `p%40ss-word` and the literal
        // check above could never fire. Match on a fragment that survives
        // encoding, and on any remaining userinfo separator.
        assert!(!red.contains("ss-word"), "password leaked (encoded): {red}");
        assert!(!red.contains('@'), "userinfo separator survived: {red}");
        assert!(red.contains("gw.proxy.example"));
    }

    /// Input that cannot be redacted must never be echoed back.
    #[test]
    fn unparseable_proxy_does_not_echo_input() {
        let red = redact_proxy_url("user:pass@not a url");
        assert_eq!(red, "<proxy redacted>");
    }

    /// A proxy URL with no credentials keeps its host and port.
    #[test]
    fn proxy_without_credentials_is_preserved() {
        let red = redact_proxy_url("http://proxy.example.com:3128");
        assert!(red.contains("proxy.example.com"));
        assert!(red.contains("3128"));
    }
}

View file

@ -800,7 +800,9 @@ fn slugify(query: &str) -> String {
.collect::<Vec<_>>()
.join("-")
.to_lowercase();
if s.len() > 60 { s[..60].to_string() } else { s }
// char-safe truncation: byte slicing panics if byte index 60 lands
// mid-codepoint (multibyte queries, e.g. CJK / accented input).
s.chars().take(60).collect()
}
/// Check for a cached research result. Returns the compact response if found.
@ -856,3 +858,32 @@ fn save_research(dir: &std::path::Path, slug: &str, data: &serde_json::Value) ->
json_path.to_string_lossy().to_string(),
)
}
#[cfg(test)]
mod tests {
    use super::slugify;

    /// A long query of 3-byte characters: slicing bytes at `s[..60]` would
    /// cut through a code point and panic, char-based truncation must not.
    #[test]
    fn slugify_multibyte_query_does_not_panic() {
        let query = "日本語のクエリ".repeat(20);
        let slug = slugify(&query);
        let char_len = slug.chars().count();
        assert!(char_len <= 60, "slug too long: {}", char_len);
    }

    /// Short ASCII input is only lowercased and dash-joined.
    #[test]
    fn slugify_ascii_unchanged_under_limit() {
        let slug = slugify("Hello World Query");
        assert_eq!(slug, "hello-world-query");
    }

    /// Long ASCII input is cut to at most 60 characters.
    #[test]
    fn slugify_caps_long_ascii_at_60_chars() {
        let long_query = "word ".repeat(40);
        let slug = slugify(&long_query);
        assert!(slug.len() <= 60);
    }
}

View file

@ -44,6 +44,19 @@ warn() { printf "${YELLOW}[!]${RESET} %s\n" "$*"; }
error() { printf "${RED}[x]${RESET} %s\n" "$*" >&2; }
fatal() { error "$*"; exit 1; }
# Print a display-safe form of a secret.
#   empty input        -> "(not set)"
#   1 to 4 characters  -> "****" (nothing revealed)
#   longer             -> "****" followed by the last 4 characters
mask_secret() {
    local secret="$1"
    local len=${#secret}

    if (( len == 0 )); then
        printf '(not set)'
        return
    fi

    if (( len > 4 )); then
        printf '****%s' "${secret:len-4}"
    else
        printf '****'
    fi
}
prompt() {
local var_name="$1" prompt_text="$2" default="${3:-}"
if [[ -n "$default" ]]; then
@ -52,7 +65,7 @@ prompt() {
printf "${CYAN} %s${RESET}: " "$prompt_text"
fi
read -r input
eval "$var_name=\"${input:-$default}\""
printf -v "$var_name" '%s' "${input:-$default}"
}
prompt_secret() {
@ -64,7 +77,7 @@ prompt_secret() {
fi
read -rs input
echo
eval "$var_name=\"${input:-$default}\""
printf -v "$var_name" '%s' "${input:-$default}"
}
generate_key() {
@ -374,7 +387,7 @@ create_server() {
printf " Domain: ${BOLD}%s${RESET}\n" "${DOMAIN:-none}"
printf " OpenAI key: ${BOLD}%s${RESET}\n" "$([ -n "$OPENAI_KEY" ] && echo 'set' || echo 'not set')"
printf " Anthropic key:${BOLD}%s${RESET}\n" "$([ -n "$ANTHROPIC_KEY" ] && echo 'set' || echo 'not set')"
printf " Auth key: ${BOLD}%s${RESET}\n" "$AUTH_KEY"
printf " Auth key: ${BOLD}%s${RESET}\n" "$(mask_secret "$AUTH_KEY")"
printf " Ollama model: ${BOLD}%s${RESET}\n" "$OLLAMA_MODEL"
echo
@ -454,7 +467,9 @@ create_server() {
echo
printf " ${BOLD}Server IP:${RESET} %s\n" "$server_ip"
printf " ${BOLD}SSH:${RESET} ssh root@%s\n" "$server_ip"
printf " ${BOLD}Auth key:${RESET} %s\n" "$AUTH_KEY"
printf " ${BOLD}Auth key:${RESET} %s\n" "$(mask_secret "$AUTH_KEY")"
printf " ${DIM}(full key stored in /opt/webclaw/.env on the server:\n"
printf " ssh root@%s 'grep WEBCLAW_AUTH_KEY /opt/webclaw/.env')${RESET}\n" "$server_ip"
echo
printf " ${BOLD}Monitor build progress:${RESET}\n"
printf " ssh root@%s 'cd /opt/webclaw && docker compose logs -f'\n" "$server_ip"
@ -465,7 +480,7 @@ create_server() {
printf " ${BOLD}Scrape:${RESET}\n"
printf " curl -X POST http://%s:3000/v1/scrape \\\\\n" "$server_ip"
printf " -H 'Content-Type: application/json' \\\\\n"
printf " -H 'Authorization: Bearer %s' \\\\\n" "$AUTH_KEY"
printf " -H 'Authorization: Bearer <YOUR_AUTH_KEY>' \\\\\n"
printf " -d '{\"url\": \"https://example.com\"}'\n"
echo
@ -482,7 +497,8 @@ create_server() {
echo
printf " ${BOLD}Tear down:${RESET}\n"
printf " HETZNER_TOKEN=%s ./deploy/hetzner.sh --destroy\n" "$HETZNER_TOKEN"
printf " HETZNER_TOKEN=\$HETZNER_TOKEN ./deploy/hetzner.sh --destroy\n"
printf " ${DIM}(re-export the same token you used to deploy)${RESET}\n"
echo
}

View file

@ -36,7 +36,7 @@ prompt() {
printf "${CYAN} %s${RESET}: " "$prompt_text"
fi
read -r input
eval "$var_name=\"${input:-$default}\""
printf -v "$var_name" '%s' "${input:-$default}"
}
prompt_secret() {
@ -48,7 +48,7 @@ prompt_secret() {
fi
read -rs input
echo
eval "$var_name=\"${input:-$default}\""
printf -v "$var_name" '%s' "${input:-$default}"
}
prompt_yn() {