mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
fix: harden resource limits, path safety, and WASM build (#46)
Security audit follow-up across the workspace:

- webclaw-core: keep the crate WASM-safe. quickjs/rquickjs is now a `cfg(not(target_arch = "wasm32"))` target dependency, and the extraction entry point uses a direct call on wasm instead of spawning a thread, so it builds and runs on wasm32 with or without default features.
- webclaw-core: bound the structured-data scrubber recursion (depth cap) so deeply nested attacker JSON-LD / __NEXT_DATA__ cannot exhaust the stack.
- webclaw-fetch: stream the response body with a running ceiling so a small, highly compressed payload cannot inflate to gigabytes in memory; redact user:pass@ from proxy URLs before they reach error strings.
- webclaw-cli: contain output filenames inside the chosen directory (reject `..` / absolute paths, drop traversal path segments), run --webhook URLs through the public-URL SSRF guard, clamp --watch-interval to >=1s, and make research slug truncation char-safe.
- webclaw-mcp: char-safe slug truncation (no multibyte slice panic).
- setup.sh / deploy/hetzner.sh: replace eval on read input with printf -v, and mask auth key / API token in console output.
- CI: enforce the wasm32 build invariant for webclaw-core.

Tests added for every behavioral change. Bump to 0.6.3 + CHANGELOG.
This commit is contained in:
parent
aab51bea91
commit
be8bcfebd9
13 changed files with 454 additions and 47 deletions
|
|
@ -613,7 +613,15 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
|
|||
Err(_) => (String::new(), String::new(), None),
|
||||
};
|
||||
|
||||
let mut stem = path.trim_matches('/').to_string();
|
||||
// Drop empty / "." / ".." path segments so a URL path like
|
||||
// `/../../etc/passwd` can't climb out of the output directory.
|
||||
let cleaned_path: String = path
|
||||
.split('/')
|
||||
.filter(|seg| !seg.is_empty() && *seg != "." && *seg != "..")
|
||||
.collect::<Vec<_>>()
|
||||
.join("/");
|
||||
|
||||
let mut stem = cleaned_path;
|
||||
if stem.is_empty() {
|
||||
// Use hostname for root URLs to avoid collisions in batch mode
|
||||
let clean_host = host.strip_prefix("www.").unwrap_or(&host);
|
||||
|
|
@ -640,13 +648,59 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
|
|||
format!("{sanitized}.{ext}")
|
||||
}
|
||||
|
||||
/// Validate a caller-supplied output name (e.g. the `filename` column of a
/// `url,filename` CSV) so that it cannot escape the output directory.
///
/// Accepted names are relative paths built from normal components,
/// optionally mixed with `.` components: `file.md`, `nested/deep/file.md`,
/// `./ok.md`.
///
/// # Errors
///
/// Returns a human-readable message when `filename`
/// - is empty,
/// - contains a `..` component,
/// - is absolute or carries a root / drive prefix, or
/// - names no file at all (only `.` components, such as `.` or `./`), which
///   would otherwise resolve to the output directory itself.
///
/// On success the input is returned unchanged as a [`PathBuf`].
fn safe_relative_filename(filename: &str) -> Result<PathBuf, String> {
    use std::path::Component;

    if filename.is_empty() {
        return Err("empty output filename".to_string());
    }

    let candidate = Path::new(filename);

    // Tracks whether at least one real name was seen; a path made only of
    // `.` components is traversal-free but is not a file name.
    let mut names_a_file = false;
    for comp in candidate.components() {
        match comp {
            Component::Normal(_) => names_a_file = true,
            Component::CurDir => {}
            Component::ParentDir => {
                return Err(format!("refusing path with '..' component: {filename}"));
            }
            Component::RootDir | Component::Prefix(_) => {
                return Err(format!("refusing absolute output path: {filename}"));
            }
        }
    }

    if !names_a_file {
        return Err(format!(
            "refusing output path with no file name: {filename}"
        ));
    }

    Ok(candidate.to_path_buf())
}
|
||||
|
||||
/// Write extraction output to a file inside `dir`, creating parent dirs as needed.
|
||||
///
|
||||
/// `filename` may originate from an attacker-controlled `--urls-file`
|
||||
/// (`url,filename` CSV). It is validated for traversal, and the canonical
|
||||
/// destination directory is asserted to stay under the canonical output
|
||||
/// directory before any write.
|
||||
fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
|
||||
let dest = dir.join(filename);
|
||||
let rel = safe_relative_filename(filename)?;
|
||||
let dest = dir.join(&rel);
|
||||
|
||||
std::fs::create_dir_all(dir)
|
||||
.map_err(|e| format!("failed to create directory {}: {e}", dir.display()))?;
|
||||
let base = dir
|
||||
.canonicalize()
|
||||
.map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?;
|
||||
|
||||
if let Some(parent) = dest.parent() {
|
||||
std::fs::create_dir_all(parent)
|
||||
.map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?;
|
||||
let canon_parent = parent
|
||||
.canonicalize()
|
||||
.map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?;
|
||||
if !canon_parent.starts_with(&base) {
|
||||
return Err(format!(
|
||||
"refusing to write outside output dir: {}",
|
||||
dest.display()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
std::fs::write(&dest, content)
|
||||
.map_err(|e| format!("failed to write {}: {e}", dest.display()))?;
|
||||
let word_count = content.split_whitespace().count();
|
||||
|
|
@ -1679,6 +1733,13 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
|||
serde_json::to_string(payload).unwrap_or_default()
|
||||
};
|
||||
tokio::spawn(async move {
|
||||
// SSRF guard: a webhook URL is user-supplied and otherwise bypasses
|
||||
// the fetch-layer protections, so resolve + reject private/internal
|
||||
// destinations before sending the payload.
|
||||
if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await {
|
||||
eprintln!("[webhook] refusing unsafe URL: {e}");
|
||||
return;
|
||||
}
|
||||
match reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(10))
|
||||
.build()
|
||||
|
|
@ -1750,7 +1811,9 @@ async fn run_watch_single(
|
|||
);
|
||||
|
||||
loop {
|
||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
|
||||
// Clamp to >=1s: `--watch-interval 0` would otherwise spin the
|
||||
// fetch loop with zero delay and hammer the target.
|
||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
|
||||
|
||||
if cancelled.load(Ordering::Relaxed) {
|
||||
eprintln!("[watch] Stopped");
|
||||
|
|
@ -1842,7 +1905,9 @@ async fn run_watch_multi(
|
|||
let mut check_number = 0u64;
|
||||
|
||||
loop {
|
||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
|
||||
// Clamp to >=1s: `--watch-interval 0` would otherwise spin the
|
||||
// fetch loop with zero delay and hammer the target.
|
||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
|
||||
|
||||
if cancelled.load(Ordering::Relaxed) {
|
||||
eprintln!("[watch] Stopped");
|
||||
|
|
@ -2321,7 +2386,9 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
|
|||
.collect::<Vec<_>>()
|
||||
.join("-")
|
||||
.to_lowercase();
|
||||
let slug = if slug.len() > 50 { &slug[..50] } else { &slug };
|
||||
// char-safe truncation: byte slicing panics if char 50
|
||||
// lands mid-codepoint (multibyte queries).
|
||||
let slug: String = slug.chars().take(50).collect();
|
||||
let filename = format!("research-{slug}.json");
|
||||
|
||||
let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default();
|
||||
|
|
@ -2773,4 +2840,66 @@ mod tests {
|
|||
assert_eq!(content, "hello");
|
||||
let _ = std::fs::remove_dir_all(&dir);
|
||||
}
|
||||
|
||||
#[test]
fn url_to_filename_strips_traversal_segments() {
    // Parent-directory segments in the URL path must never reach the
    // generated output name.
    let climbing = url_to_filename(
        "https://example.com/../../etc/passwd",
        &OutputFormat::Markdown,
    );
    assert!(!climbing.contains(".."), "traversal leaked: {climbing}");
    assert_eq!(climbing, "etc/passwd.md");

    // `.` and empty segments are dropped the same way.
    let noisy = url_to_filename("https://example.com/a/./b//c", &OutputFormat::Json);
    assert_eq!(noisy, "a/b/c.json");
}
|
||||
|
||||
#[test]
fn safe_relative_filename_rejects_escapes() {
    // Parent traversal, absolute paths and the empty name are all refused.
    for hostile in ["../escape.md", "a/../../b.md", "/etc/passwd", ""] {
        assert!(
            safe_relative_filename(hostile).is_err(),
            "expected rejection for {hostile:?}"
        );
    }

    // Ordinary nested relative names remain usable.
    for benign in ["nested/deep/file.md", "./ok.md"] {
        assert!(
            safe_relative_filename(benign).is_ok(),
            "expected acceptance for {benign:?}"
        );
    }
}
|
||||
|
||||
#[test]
fn write_to_file_refuses_traversal_filename() {
    let out_dir = std::env::temp_dir().join("webclaw_test_traversal_dir");
    let _ = std::fs::remove_dir_all(&out_dir);

    // Mimics a hostile `url,filename` CSV row that tries to climb out of
    // the output directory.
    let outcome = write_to_file(&out_dir, "../../tmp/webclaw_pwned.md", "x");
    let err = outcome.unwrap_err();
    assert!(err.contains("refusing"), "unexpected error: {err}");

    // Nothing may have been written at the escape target.
    let escape_target = std::path::Path::new("/tmp/webclaw_pwned.md");
    assert!(
        !escape_target.exists(),
        "traversal write escaped the output dir"
    );

    let _ = std::fs::remove_dir_all(&out_dir);
}
|
||||
|
||||
#[test]
fn research_slug_truncation_is_char_safe() {
    // 120 three-byte characters: cutting at byte 50 would land inside a
    // code point and panic, so truncation has to count chars instead.
    let query = "日本語".repeat(40);

    // Step 1: everything that is not alphanumeric or a space becomes a space.
    let spaced: String = query
        .chars()
        .map(|c| if c.is_alphanumeric() || c == ' ' { c } else { ' ' })
        .collect();

    // Step 2: collapse whitespace runs into single dashes and lowercase.
    let words: Vec<&str> = spaced.split_whitespace().collect();
    let dashed = words.join("-").to_lowercase();

    // Step 3: keep at most 50 characters.
    let slug: String = dashed.chars().take(50).collect();
    assert!(slug.chars().count() <= 50);

    // Formatting the final file name must not panic either.
    let _ = format!("research-{slug}.json");
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,6 +20,11 @@ url = { version = "2", features = ["serde"] }
|
|||
regex = "1"
|
||||
once_cell = "1"
|
||||
similar = "2"
|
||||
|
||||
# rquickjs links a C library and cannot build for wasm32. Gating it per
|
||||
# target keeps the `quickjs` feature usable on native while leaving the
|
||||
# crate WASM-safe even with default features enabled.
|
||||
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
|
||||
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ pub mod diff;
|
|||
pub mod domain;
|
||||
pub mod error;
|
||||
pub mod extractor;
|
||||
#[cfg(feature = "quickjs")]
|
||||
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||
pub mod js_eval;
|
||||
pub mod llm;
|
||||
pub mod markdown;
|
||||
|
|
@ -46,9 +46,13 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
|
|||
/// `url` — optional source URL, used for resolving relative links and domain detection
|
||||
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
|
||||
///
|
||||
/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
|
||||
/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1-2 MB
|
||||
/// main-thread stack on Windows.
|
||||
/// On native targets, spawns extraction on a thread with an 8 MB stack to
|
||||
/// handle deeply nested HTML (e.g., Express.co.uk live blogs) without
|
||||
/// overflowing the default 1-2 MB main-thread stack on Windows.
|
||||
///
|
||||
/// On `wasm32`, threads are unavailable (`std::thread::spawn` panics at
|
||||
/// runtime), so extraction runs inline on the caller's stack.
|
||||
#[cfg(not(target_arch = "wasm32"))]
|
||||
pub fn extract_with_options(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
|
|
@ -70,6 +74,16 @@ pub fn extract_with_options(
|
|||
.unwrap_or(Err(ExtractError::NoContent))
|
||||
}
|
||||
|
||||
/// `wasm32` variant of the extraction entry point.
///
/// Threads are not available on this target, so no worker thread is
/// spawned: the call is forwarded straight to `extract_with_options_inner`
/// and runs on the caller's stack.
#[cfg(target_arch = "wasm32")]
pub fn extract_with_options(
    html: &str,
    url: Option<&str>,
    options: &ExtractionOptions,
) -> Result<ExtractionResult, ExtractError> {
    extract_with_options_inner(html, url, options)
}
|
||||
|
||||
fn extract_with_options_inner(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
|
|
@ -187,7 +201,7 @@ fn extract_with_options_inner(
|
|||
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
|
||||
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
|
||||
// static JSON data island extraction above with runtime-evaluated data.
|
||||
#[cfg(feature = "quickjs")]
|
||||
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||
{
|
||||
let blobs = js_eval::extract_js_data(html);
|
||||
if !blobs.is_empty() {
|
||||
|
|
@ -603,4 +617,36 @@ mod tests {
|
|||
"Should extract content from deep nesting"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn wasm_direct_call_path_extracts_content() {
    // The wasm32 build of `extract_with_options` is a plain call into
    // `extract_with_options_inner`. Calling the inner function directly
    // keeps that path covered on native CI, and comparing it with the
    // public entry point checks that both produce the same content.
    let page = r#"
<html lang="en">
<head><title>WASM Path</title></head>
<body><article><h1>Heading</h1><p>WASM-safe extraction body content.</p></article></body>
</html>"#;
    let source_url = Some("https://example.com");
    let options = ExtractionOptions::default();

    let direct = extract_with_options_inner(page, source_url, &options)
        .expect("inner extraction (wasm path) should succeed");
    let body_found = direct
        .content
        .markdown
        .contains("WASM-safe extraction body content");
    assert!(
        body_found,
        "wasm direct-call path should extract body, got: {}",
        direct.content.markdown
    );

    let spawned = extract_with_options(page, source_url, &options)
        .expect("threaded extraction should succeed");
    assert_eq!(
        direct.content.markdown, spawned.content.markdown,
        "wasm path and threaded path must produce identical content"
    );
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
|||
.cloned()
|
||||
.collect();
|
||||
for value in &mut useful {
|
||||
scrub_body_fields(value);
|
||||
scrub_body_fields(value, 0);
|
||||
}
|
||||
if !useful.is_empty() {
|
||||
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
|
||||
|
|
@ -117,10 +117,21 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
|
|||
}
|
||||
|
||||
/// Recursively remove long fields that duplicate the rendered markdown body.
|
||||
fn scrub_body_fields(v: &mut serde_json::Value) {
|
||||
///
|
||||
/// `depth` guards against stack exhaustion from attacker-controlled
|
||||
/// JSON-LD / `__NEXT_DATA__` blobs with pathological nesting: past
|
||||
/// [`MAX_SCRUB_DEPTH`] levels we stop descending and leave the subtree
|
||||
/// as-is (it is still size-capped by the `STRUCTURED_DATA_MAX_BYTES`
|
||||
/// budget in `to_llm_text`).
|
||||
fn scrub_body_fields(v: &mut serde_json::Value, depth: usize) {
|
||||
const BODY_KEYS: &[&str] = &["articleBody"];
|
||||
const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
|
||||
const LONG_THRESHOLD: usize = 500;
|
||||
const MAX_SCRUB_DEPTH: usize = 64;
|
||||
|
||||
if depth >= MAX_SCRUB_DEPTH {
|
||||
return;
|
||||
}
|
||||
|
||||
match v {
|
||||
serde_json::Value::Object(map) => {
|
||||
|
|
@ -136,12 +147,12 @@ fn scrub_body_fields(v: &mut serde_json::Value) {
|
|||
true
|
||||
});
|
||||
for value in map.values_mut() {
|
||||
scrub_body_fields(value);
|
||||
scrub_body_fields(value, depth + 1);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Array(values) => {
|
||||
for value in values {
|
||||
scrub_body_fields(value);
|
||||
scrub_body_fields(value, depth + 1);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
|
|
@ -908,4 +919,53 @@ mod tests {
|
|||
"Compact untyped array dropped: {out}"
|
||||
);
|
||||
}
|
||||
|
||||
/// Walk `value` down its single `"n"` child link and return the depth
|
||||
/// at which an `articleBody` key is still present (i.e. was NOT
|
||||
/// scrubbed). Used to observe exactly where the recursion stopped.
|
||||
fn first_unscrubbed_article_body_depth(mut value: &serde_json::Value) -> Option<usize> {
|
||||
let mut depth = 0;
|
||||
loop {
|
||||
let obj = value.as_object()?;
|
||||
if obj.contains_key("articleBody") {
|
||||
return Some(depth);
|
||||
}
|
||||
value = obj.get("n")?;
|
||||
depth += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn scrub_body_fields_bounds_recursion_on_deep_nesting() {
    // Pathologically nested JSON-LD / __NEXT_DATA__ must not drive the
    // scrubber into unbounded recursion. The fixture is a chain slightly
    // deeper than the 64-level cap in which every level has a scrub-able
    // `articleBody`: levels inside the cap lose it, and the first level
    // beyond the cap keeps it because descent stopped there. The chain is
    // deliberately short — serde_json drops values recursively, so a
    // 10k-deep value would overflow the stack merely by being dropped.
    const DEPTH: usize = 80;
    let long_body = || "x".repeat(600);

    let innermost = serde_json::json!({ "articleBody": long_body() });
    let mut chain = (0..DEPTH).fold(innermost, |inner, _| {
        serde_json::json!({
            "articleBody": long_body(),
            "n": inner,
        })
    });

    scrub_body_fields(&mut chain, 0);

    let stopped_at = first_unscrubbed_article_body_depth(&chain)
        .expect("recursion must stop and leave a deep articleBody intact");
    // Everything above the cap was scrubbed; the survivor sits exactly on it.
    assert_eq!(
        stopped_at, 64,
        "recursion should stop at the depth cap, stopped at {stopped_at}"
    );

    let root = chain.as_object().unwrap();
    assert!(
        root.get("articleBody").is_none(),
        "shallow articleBody must still be scrubbed"
    );
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -95,12 +95,30 @@ struct Response {
|
|||
/// per page in collapse_whitespace + strip_markdown).
|
||||
const MAX_BODY_BYTES: u64 = 50 * 1024 * 1024;
|
||||
|
||||
/// Running decompression-bomb guard: reject as soon as the bytes already
|
||||
/// buffered plus the next decompressed chunk would cross [`MAX_BODY_BYTES`].
|
||||
/// Saturating arithmetic so a huge chunk length can't wrap the sum.
|
||||
fn check_body_ceiling(buffered: usize, next_chunk: usize) -> Result<(), FetchError> {
|
||||
let total = (buffered as u64).saturating_add(next_chunk as u64);
|
||||
if total > MAX_BODY_BYTES {
|
||||
return Err(FetchError::BodyDecode(format!(
|
||||
"response body exceeds cap {MAX_BODY_BYTES} bytes (decompressed)"
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl Response {
|
||||
/// Buffer a wreq response into an owned Response. Rejects bodies that
|
||||
/// advertise a Content-Length beyond [`MAX_BODY_BYTES`] before we pay
|
||||
/// the allocation, and truncates after the fact as a belt-and-braces
|
||||
/// check against a lying server.
|
||||
async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
|
||||
/// Buffer a wreq response into an owned Response.
|
||||
///
|
||||
/// Rejects bodies that advertise a Content-Length beyond
|
||||
/// [`MAX_BODY_BYTES`] before we pay any allocation, then streams the
|
||||
/// body chunk-by-chunk while enforcing a running ceiling. `chunk()`
|
||||
/// yields *post-decompression* bytes (gzip/brotli/zstd/deflate are
|
||||
/// negotiated), so a tiny compressed payload that inflates to
|
||||
/// gigabytes is aborted as soon as the accumulated size crosses the
|
||||
/// cap — it never gets fully buffered in memory.
|
||||
async fn from_wreq(mut resp: wreq::Response) -> Result<Self, FetchError> {
|
||||
if let Some(len) = resp.content_length()
|
||||
&& len > MAX_BODY_BYTES
|
||||
{
|
||||
|
|
@ -111,21 +129,22 @@ impl Response {
|
|||
let status = resp.status().as_u16();
|
||||
let url = resp.uri().to_string();
|
||||
let headers = resp.headers().clone();
|
||||
let body = resp
|
||||
.bytes()
|
||||
|
||||
let mut buf = bytes::BytesMut::new();
|
||||
while let Some(chunk) = resp
|
||||
.chunk()
|
||||
.await
|
||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||
if body.len() as u64 > MAX_BODY_BYTES {
|
||||
return Err(FetchError::BodyDecode(format!(
|
||||
"response body {} bytes exceeds cap {MAX_BODY_BYTES}",
|
||||
body.len()
|
||||
)));
|
||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?
|
||||
{
|
||||
check_body_ceiling(buf.len(), chunk.len())?;
|
||||
buf.extend_from_slice(&chunk);
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
status,
|
||||
url,
|
||||
headers,
|
||||
body,
|
||||
body: buf.freeze(),
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -896,6 +915,28 @@ mod tests {
|
|||
assert!(err.result.is_err());
|
||||
}
|
||||
|
||||
#[test]
fn body_ceiling_allows_under_cap() {
    let cap = MAX_BODY_BYTES as usize;

    // A small first chunk is fine.
    assert!(check_body_ceiling(0, 1024).is_ok());
    // So is the chunk that brings the total exactly up to the cap.
    assert!(check_body_ceiling(cap - 1, 1).is_ok());
}
|
||||
|
||||
#[test]
fn body_ceiling_rejects_at_and_over_cap() {
    let cap = MAX_BODY_BYTES as usize;

    // With the buffer already at the cap, a single extra byte is refused.
    assert!(check_body_ceiling(cap, 1).is_err());

    // Decompression-bomb shape: a tiny buffer followed by one huge inflated
    // chunk is refused on that very first oversized chunk.
    let bomb = check_body_ceiling(16, 64 * 1024 * 1024);
    assert!(matches!(bomb.unwrap_err(), FetchError::BodyDecode(_)));
}
|
||||
|
||||
#[test]
fn body_ceiling_saturates_on_overflow() {
    // Adding two maximal lengths must saturate rather than wrap to a small
    // total that would slip under the cap.
    let verdict = check_body_ceiling(usize::MAX, usize::MAX);
    assert!(verdict.is_err());
}
|
||||
|
||||
#[test]
|
||||
fn test_batch_extract_result_struct() {
|
||||
let err = BatchExtractResult {
|
||||
|
|
|
|||
|
|
@ -533,8 +533,9 @@ pub fn build_client(
|
|||
.timeout(timeout);
|
||||
|
||||
if let Some(proxy_url) = proxy {
|
||||
let proxy =
|
||||
wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
|
||||
let proxy = wreq::Proxy::all(proxy_url).map_err(|_| {
|
||||
FetchError::Build(format!("invalid proxy {}", redact_proxy_url(proxy_url)))
|
||||
})?;
|
||||
builder = builder.proxy(proxy);
|
||||
} else {
|
||||
builder = builder.dns_resolver(PublicDnsResolver);
|
||||
|
|
@ -545,6 +546,24 @@ pub fn build_client(
|
|||
.map_err(|e| FetchError::Build(e.to_string()))
|
||||
}
|
||||
|
||||
/// Render a proxy URL safe to log: drop any `user:pass@` userinfo so
|
||||
/// rotating-proxy credentials never reach error strings or tracing.
|
||||
/// Falls back to a constant placeholder when the input does not parse.
|
||||
fn redact_proxy_url(raw: &str) -> String {
|
||||
match url::Url::parse(raw) {
|
||||
Ok(mut u) => {
|
||||
// Best-effort: opaque URLs (e.g. no host) reject these setters;
|
||||
// in that case fall through to the placeholder rather than risk
|
||||
// returning the raw string with credentials.
|
||||
if u.set_username("").is_err() || u.set_password(None).is_err() {
|
||||
return "<proxy redacted>".to_string();
|
||||
}
|
||||
u.to_string()
|
||||
}
|
||||
Err(_) => "<proxy redacted>".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn ssrf_safe_redirect_policy(
|
||||
follow_redirects: bool,
|
||||
max_redirects: usize,
|
||||
|
|
@ -567,3 +586,41 @@ fn ssrf_safe_redirect_policy(
|
|||
})
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::redact_proxy_url;

    /// Plain `user:pass@host:port` credentials are removed while host and
    /// port survive.
    #[test]
    fn redacts_userinfo_from_proxy_url() {
        let red = redact_proxy_url("http://user123:s3cr3tPass@proxy.example.com:8080");
        assert!(!red.contains("user123"), "username leaked: {red}");
        assert!(!red.contains("s3cr3tPass"), "password leaked: {red}");
        assert!(red.contains("proxy.example.com"), "host lost: {red}");
        assert!(red.contains("8080"), "port lost: {red}");
    }

    /// Residential-style credential: long structured username with embedded
    /// tokens, and a password that itself contains `@`.
    #[test]
    fn redacts_long_token_residential_proxy() {
        let red =
            redact_proxy_url("http://acct-zone-resi-country-xx:p@ss-word@gw.proxy.example:12321");
        assert!(!red.contains("acct-zone-resi"), "username leaked: {red}");
        assert!(!red.contains("p@ss-word"), "password leaked: {red}");
        // WHATWG URL parsing percent-encodes an `@` inside userinfo, so a
        // leaked password would show up as `p%40ss-word` and slip past the
        // literal check above. Also assert on fragments that do not depend
        // on how the `@` is encoded.
        assert!(!red.contains("ss-word"), "password leaked: {red}");
        assert!(!red.contains('@'), "userinfo delimiter survived: {red}");
        assert!(red.contains("gw.proxy.example"), "host lost: {red}");
    }

    /// Input that is not a usable URL must never be echoed back.
    #[test]
    fn unparseable_proxy_does_not_echo_input() {
        let red = redact_proxy_url("user:pass@not a url");
        assert_eq!(red, "<proxy redacted>");
    }

    /// A credential-free proxy URL keeps its host and port.
    #[test]
    fn proxy_without_credentials_is_preserved() {
        let red = redact_proxy_url("http://proxy.example.com:3128");
        assert!(red.contains("proxy.example.com"));
        assert!(red.contains("3128"));
    }
}
|
||||
|
|
|
|||
|
|
@ -800,7 +800,9 @@ fn slugify(query: &str) -> String {
|
|||
.collect::<Vec<_>>()
|
||||
.join("-")
|
||||
.to_lowercase();
|
||||
if s.len() > 60 { s[..60].to_string() } else { s }
|
||||
// char-safe truncation: byte slicing panics if char 60 lands
|
||||
// mid-codepoint (multibyte queries, e.g. CJK / accented input).
|
||||
s.chars().take(60).collect()
|
||||
}
|
||||
|
||||
/// Check for a cached research result. Returns the compact response if found.
|
||||
|
|
@ -856,3 +858,32 @@ fn save_research(dir: &std::path::Path, slug: &str, data: &serde_json::Value) ->
|
|||
json_path.to_string_lossy().to_string(),
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::slugify;

    /// A long query of three-byte characters must be truncated on a char
    /// boundary; byte-slicing `s[..60]` would panic mid-codepoint.
    #[test]
    fn slugify_multibyte_query_does_not_panic() {
        let query = "日本語のクエリ".repeat(20);
        let char_len = slugify(&query).chars().count();
        assert!(char_len <= 60, "slug too long: {}", char_len);
    }

    /// Short ASCII input is only lowercased and dash-joined.
    #[test]
    fn slugify_ascii_unchanged_under_limit() {
        let slug = slugify("Hello World Query");
        assert_eq!(slug, "hello-world-query");
    }

    /// Long ASCII input is capped at 60 characters.
    #[test]
    fn slugify_caps_long_ascii_at_60_chars() {
        let query = "word ".repeat(40);
        assert!(slugify(&query).len() <= 60);
    }
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue