mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
fix: harden resource limits, path safety, and WASM build (#46)
Security audit follow-up across the workspace:
- webclaw-core: keep the crate WASM-safe. quickjs/rquickjs is now a cfg(not(wasm32)) target dependency and the extraction entry point uses a direct call on wasm instead of spawning a thread, so it builds and runs on wasm32 with or without default features.
- webclaw-core: bound the structured-data scrubber recursion (depth cap) so deeply nested attacker JSON-LD / __NEXT_DATA__ cannot exhaust the stack.
- webclaw-fetch: stream the response body with a running ceiling so a small highly compressed payload cannot inflate to gigabytes in memory; redact user:pass@ from proxy URLs before they reach error strings.
- webclaw-cli: contain output filenames inside the chosen directory (reject .. / absolute, drop traversal path segments), run --webhook URLs through the public-URL SSRF guard, clamp --watch-interval to >=1s, and make research slug truncation char-safe.
- webclaw-mcp: char-safe slug truncation (no multibyte slice panic).
- setup.sh / deploy/hetzner.sh: replace eval on read input with printf -v, and mask auth key / API token in console output.
- CI: enforce the wasm32 build invariant for webclaw-core.
Tests added for every behavioral change. Bump to 0.6.3 + CHANGELOG.
This commit is contained in:
parent
aab51bea91
commit
be8bcfebd9
13 changed files with 454 additions and 47 deletions
15
.github/workflows/ci.yml
vendored
15
.github/workflows/ci.yml
vendored
|
|
@ -31,6 +31,21 @@ jobs:
|
||||||
- run: cargo fmt --check --all
|
- run: cargo fmt --check --all
|
||||||
- run: cargo clippy --all -- -D warnings
|
- run: cargo clippy --all -- -D warnings
|
||||||
|
|
||||||
|
wasm:
|
||||||
|
name: WASM
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: dtolnay/rust-toolchain@stable
|
||||||
|
with:
|
||||||
|
targets: wasm32-unknown-unknown
|
||||||
|
- uses: Swatinem/rust-cache@v2
|
||||||
|
# webclaw-core must stay WASM-safe (zero network deps, no threads).
|
||||||
|
# Check both with and without default features so the quickjs gate
|
||||||
|
# can't regress.
|
||||||
|
- run: cargo check --target wasm32-unknown-unknown -p webclaw-core
|
||||||
|
- run: cargo check --target wasm32-unknown-unknown -p webclaw-core --no-default-features
|
||||||
|
|
||||||
docs:
|
docs:
|
||||||
name: Docs
|
name: Docs
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,13 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [0.6.3] — 2026-05-19
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Hardened resource and path-safety limits across the CLI, MCP server, and self-hosted API: oversized or highly compressed responses are capped while streaming, deeply nested page data can no longer exhaust the stack, output filenames stay inside the chosen directory, webhook URLs are validated like every other fetch, and multibyte search queries no longer crash slug generation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.6.2] — 2026-05-18
|
## [0.6.2] — 2026-05-18
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
|
||||||
14
Cargo.lock
generated
14
Cargo.lock
generated
|
|
@ -3219,7 +3219,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.6.2"
|
version = "0.6.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3240,7 +3240,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.6.2"
|
version = "0.6.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3258,7 +3258,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.6.2"
|
version = "0.6.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
|
|
@ -3284,7 +3284,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.6.2"
|
version = "0.6.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3297,7 +3297,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.6.2"
|
version = "0.6.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dirs",
|
"dirs",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3317,7 +3317,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.6.2"
|
version = "0.6.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
@ -3326,7 +3326,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-server"
|
name = "webclaw-server"
|
||||||
version = "0.6.2"
|
version = "0.6.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"axum",
|
"axum",
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
||||||
members = ["crates/*"]
|
members = ["crates/*"]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.6.2"
|
version = "0.6.3"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "AGPL-3.0"
|
license = "AGPL-3.0"
|
||||||
repository = "https://github.com/0xMassi/webclaw"
|
repository = "https://github.com/0xMassi/webclaw"
|
||||||
|
|
|
||||||
|
|
@ -613,7 +613,15 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
|
||||||
Err(_) => (String::new(), String::new(), None),
|
Err(_) => (String::new(), String::new(), None),
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut stem = path.trim_matches('/').to_string();
|
// Drop empty / "." / ".." path segments so a URL path like
|
||||||
|
// `/../../etc/passwd` can't climb out of the output directory.
|
||||||
|
let cleaned_path: String = path
|
||||||
|
.split('/')
|
||||||
|
.filter(|seg| !seg.is_empty() && *seg != "." && *seg != "..")
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join("/");
|
||||||
|
|
||||||
|
let mut stem = cleaned_path;
|
||||||
if stem.is_empty() {
|
if stem.is_empty() {
|
||||||
// Use hostname for root URLs to avoid collisions in batch mode
|
// Use hostname for root URLs to avoid collisions in batch mode
|
||||||
let clean_host = host.strip_prefix("www.").unwrap_or(&host);
|
let clean_host = host.strip_prefix("www.").unwrap_or(&host);
|
||||||
|
|
@ -640,13 +648,59 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
|
||||||
format!("{sanitized}.{ext}")
|
format!("{sanitized}.{ext}")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reject a caller-supplied (CSV `url,filename`) name that could escape the
/// output directory: absolute paths, drive prefixes, root, or any `..`
/// component.
///
/// Names that carry no real file-name component at all (empty, `.`, `./`)
/// are rejected as well: joining them onto the output directory would
/// address the directory itself rather than a file inside it, and the
/// failure would only surface later as a confusing write error.
///
/// Returns the validated relative path on success, or a human-readable
/// reason on failure.
fn safe_relative_filename(filename: &str) -> Result<PathBuf, String> {
    use std::path::Component;

    let candidate = Path::new(filename);
    if candidate.as_os_str().is_empty() {
        return Err("empty output filename".to_string());
    }

    // Tracks whether at least one ordinary (non-`.`) segment was seen.
    let mut has_name = false;
    for comp in candidate.components() {
        match comp {
            Component::Normal(_) => has_name = true,
            Component::CurDir => {}
            Component::ParentDir => {
                return Err(format!("refusing path with '..' component: {filename}"));
            }
            Component::RootDir | Component::Prefix(_) => {
                return Err(format!("refusing absolute output path: {filename}"));
            }
        }
    }
    if !has_name {
        return Err(format!(
            "refusing output path without a file name: {filename}"
        ));
    }
    Ok(candidate.to_path_buf())
}
|
||||||
|
|
||||||
/// Write extraction output to a file inside `dir`, creating parent dirs as needed.
|
/// Write extraction output to a file inside `dir`, creating parent dirs as needed.
|
||||||
|
///
|
||||||
|
/// `filename` may originate from an attacker-controlled `--urls-file`
|
||||||
|
/// (`url,filename` CSV). It is validated for traversal, and the canonical
|
||||||
|
/// destination directory is asserted to stay under the canonical output
|
||||||
|
/// directory before any write.
|
||||||
fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
|
fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
|
||||||
let dest = dir.join(filename);
|
let rel = safe_relative_filename(filename)?;
|
||||||
|
let dest = dir.join(&rel);
|
||||||
|
|
||||||
|
std::fs::create_dir_all(dir)
|
||||||
|
.map_err(|e| format!("failed to create directory {}: {e}", dir.display()))?;
|
||||||
|
let base = dir
|
||||||
|
.canonicalize()
|
||||||
|
.map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?;
|
||||||
|
|
||||||
if let Some(parent) = dest.parent() {
|
if let Some(parent) = dest.parent() {
|
||||||
std::fs::create_dir_all(parent)
|
std::fs::create_dir_all(parent)
|
||||||
.map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?;
|
.map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?;
|
||||||
|
let canon_parent = parent
|
||||||
|
.canonicalize()
|
||||||
|
.map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?;
|
||||||
|
if !canon_parent.starts_with(&base) {
|
||||||
|
return Err(format!(
|
||||||
|
"refusing to write outside output dir: {}",
|
||||||
|
dest.display()
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::fs::write(&dest, content)
|
std::fs::write(&dest, content)
|
||||||
.map_err(|e| format!("failed to write {}: {e}", dest.display()))?;
|
.map_err(|e| format!("failed to write {}: {e}", dest.display()))?;
|
||||||
let word_count = content.split_whitespace().count();
|
let word_count = content.split_whitespace().count();
|
||||||
|
|
@ -1679,6 +1733,13 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
||||||
serde_json::to_string(payload).unwrap_or_default()
|
serde_json::to_string(payload).unwrap_or_default()
|
||||||
};
|
};
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
|
// SSRF guard: a webhook URL is user-supplied and otherwise bypasses
|
||||||
|
// the fetch-layer protections, so resolve + reject private/internal
|
||||||
|
// destinations before sending the payload.
|
||||||
|
if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await {
|
||||||
|
eprintln!("[webhook] refusing unsafe URL: {e}");
|
||||||
|
return;
|
||||||
|
}
|
||||||
match reqwest::Client::builder()
|
match reqwest::Client::builder()
|
||||||
.timeout(std::time::Duration::from_secs(10))
|
.timeout(std::time::Duration::from_secs(10))
|
||||||
.build()
|
.build()
|
||||||
|
|
@ -1750,7 +1811,9 @@ async fn run_watch_single(
|
||||||
);
|
);
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
|
// Clamp to >=1s: `--watch-interval 0` would otherwise spin the
|
||||||
|
// fetch loop with zero delay and hammer the target.
|
||||||
|
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
|
||||||
|
|
||||||
if cancelled.load(Ordering::Relaxed) {
|
if cancelled.load(Ordering::Relaxed) {
|
||||||
eprintln!("[watch] Stopped");
|
eprintln!("[watch] Stopped");
|
||||||
|
|
@ -1842,7 +1905,9 @@ async fn run_watch_multi(
|
||||||
let mut check_number = 0u64;
|
let mut check_number = 0u64;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
|
// Clamp to >=1s: `--watch-interval 0` would otherwise spin the
|
||||||
|
// fetch loop with zero delay and hammer the target.
|
||||||
|
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
|
||||||
|
|
||||||
if cancelled.load(Ordering::Relaxed) {
|
if cancelled.load(Ordering::Relaxed) {
|
||||||
eprintln!("[watch] Stopped");
|
eprintln!("[watch] Stopped");
|
||||||
|
|
@ -2321,7 +2386,9 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
.join("-")
|
.join("-")
|
||||||
.to_lowercase();
|
.to_lowercase();
|
||||||
let slug = if slug.len() > 50 { &slug[..50] } else { &slug };
|
// char-safe truncation: byte slicing panics if byte index 50
|
||||||
|
// lands mid-codepoint (multibyte queries).
|
||||||
|
let slug: String = slug.chars().take(50).collect();
|
||||||
let filename = format!("research-{slug}.json");
|
let filename = format!("research-{slug}.json");
|
||||||
|
|
||||||
let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default();
|
let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default();
|
||||||
|
|
@ -2773,4 +2840,66 @@ mod tests {
|
||||||
assert_eq!(content, "hello");
|
assert_eq!(content, "hello");
|
||||||
let _ = std::fs::remove_dir_all(&dir);
|
let _ = std::fs::remove_dir_all(&dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn url_to_filename_strips_traversal_segments() {
|
||||||
|
// `..` / `.` / empty path segments must not survive into the path.
|
||||||
|
let out = url_to_filename(
|
||||||
|
"https://example.com/../../etc/passwd",
|
||||||
|
&OutputFormat::Markdown,
|
||||||
|
);
|
||||||
|
assert!(!out.contains(".."), "traversal leaked: {out}");
|
||||||
|
assert_eq!(out, "etc/passwd.md");
|
||||||
|
let out2 = url_to_filename("https://example.com/a/./b//c", &OutputFormat::Json);
|
||||||
|
assert_eq!(out2, "a/b/c.json");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn safe_relative_filename_rejects_escapes() {
|
||||||
|
assert!(safe_relative_filename("../escape.md").is_err());
|
||||||
|
assert!(safe_relative_filename("a/../../b.md").is_err());
|
||||||
|
assert!(safe_relative_filename("/etc/passwd").is_err());
|
||||||
|
assert!(safe_relative_filename("").is_err());
|
||||||
|
// Normal nested relative names stay allowed.
|
||||||
|
assert!(safe_relative_filename("nested/deep/file.md").is_ok());
|
||||||
|
assert!(safe_relative_filename("./ok.md").is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
fn write_to_file_refuses_traversal_filename() {
    let dir = std::env::temp_dir().join("webclaw_test_traversal_dir");
    let _ = std::fs::remove_dir_all(&dir);
    // Where `../../tmp/webclaw_pwned.md` would land if the traversal were
    // honoured. Derived from `dir` rather than hard-coding `/tmp/...`, so
    // the check still looks at the right file when the temp dir is not
    // `/tmp` (macOS, Windows, custom TMPDIR).
    let escape_target = dir
        .parent()
        .and_then(std::path::Path::parent)
        .map(|root| root.join("tmp").join("webclaw_pwned.md"))
        .unwrap_or_else(|| std::path::PathBuf::from("/tmp/webclaw_pwned.md"));
    // Clear any leftover from an earlier run so a stale file cannot fail
    // the assertion below for the wrong reason.
    let _ = std::fs::remove_file(&escape_target);
    // CSV-supplied `url,filename` traversal attempt.
    let err = write_to_file(&dir, "../../tmp/webclaw_pwned.md", "x").unwrap_err();
    assert!(err.contains("refusing"), "unexpected error: {err}");
    assert!(
        !escape_target.exists(),
        "traversal write escaped the output dir"
    );
    let _ = std::fs::remove_dir_all(&dir);
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn research_slug_truncation_is_char_safe() {
|
||||||
|
// Multibyte query: byte-slicing at 50 would panic mid-codepoint.
|
||||||
|
let query = "日本語".repeat(40); // 120 chars, 3 bytes each
|
||||||
|
let slug: String = query
|
||||||
|
.chars()
|
||||||
|
.map(|c| {
|
||||||
|
if c.is_alphanumeric() || c == ' ' {
|
||||||
|
c
|
||||||
|
} else {
|
||||||
|
' '
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect::<String>()
|
||||||
|
.split_whitespace()
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join("-")
|
||||||
|
.to_lowercase();
|
||||||
|
let slug: String = slug.chars().take(50).collect();
|
||||||
|
assert!(slug.chars().count() <= 50);
|
||||||
|
// Round-trips through formatting without panicking.
|
||||||
|
let _ = format!("research-{slug}.json");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,11 @@ url = { version = "2", features = ["serde"] }
|
||||||
regex = "1"
|
regex = "1"
|
||||||
once_cell = "1"
|
once_cell = "1"
|
||||||
similar = "2"
|
similar = "2"
|
||||||
|
|
||||||
|
# rquickjs links a C library and cannot build for wasm32. Gating it per
|
||||||
|
# target keeps the `quickjs` feature usable on native while leaving the
|
||||||
|
# crate WASM-safe even with default features enabled.
|
||||||
|
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
|
||||||
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }
|
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ pub mod diff;
|
||||||
pub mod domain;
|
pub mod domain;
|
||||||
pub mod error;
|
pub mod error;
|
||||||
pub mod extractor;
|
pub mod extractor;
|
||||||
#[cfg(feature = "quickjs")]
|
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||||
pub mod js_eval;
|
pub mod js_eval;
|
||||||
pub mod llm;
|
pub mod llm;
|
||||||
pub mod markdown;
|
pub mod markdown;
|
||||||
|
|
@ -46,9 +46,13 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
|
||||||
/// `url` — optional source URL, used for resolving relative links and domain detection
|
/// `url` — optional source URL, used for resolving relative links and domain detection
|
||||||
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
|
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
|
||||||
///
|
///
|
||||||
/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
|
/// On native targets, spawns extraction on a thread with an 8 MB stack to
|
||||||
/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1-2 MB
|
/// handle deeply nested HTML (e.g., Express.co.uk live blogs) without
|
||||||
/// main-thread stack on Windows.
|
/// overflowing the default 1-2 MB main-thread stack on Windows.
|
||||||
|
///
|
||||||
|
/// On `wasm32`, threads are unavailable (`std::thread::spawn` panics at
|
||||||
|
/// runtime), so extraction runs inline on the caller's stack.
|
||||||
|
#[cfg(not(target_arch = "wasm32"))]
|
||||||
pub fn extract_with_options(
|
pub fn extract_with_options(
|
||||||
html: &str,
|
html: &str,
|
||||||
url: Option<&str>,
|
url: Option<&str>,
|
||||||
|
|
@ -70,6 +74,16 @@ pub fn extract_with_options(
|
||||||
.unwrap_or(Err(ExtractError::NoContent))
|
.unwrap_or(Err(ExtractError::NoContent))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// WASM has no threads; run extraction directly on the caller's stack.
///
/// `wasm32` counterpart of the native `extract_with_options`: it has the
/// same signature, is selected by `cfg(target_arch = "wasm32")`, and simply
/// forwards its arguments to `extract_with_options_inner`.
#[cfg(target_arch = "wasm32")]
pub fn extract_with_options(
    html: &str,
    url: Option<&str>,
    options: &ExtractionOptions,
) -> Result<ExtractionResult, ExtractError> {
    // Direct call instead of a spawned thread, so extraction uses whatever
    // stack the caller already has.
    extract_with_options_inner(html, url, options)
}
|
||||||
|
|
||||||
fn extract_with_options_inner(
|
fn extract_with_options_inner(
|
||||||
html: &str,
|
html: &str,
|
||||||
url: Option<&str>,
|
url: Option<&str>,
|
||||||
|
|
@ -187,7 +201,7 @@ fn extract_with_options_inner(
|
||||||
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
|
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
|
||||||
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
|
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
|
||||||
// static JSON data island extraction above with runtime-evaluated data.
|
// static JSON data island extraction above with runtime-evaluated data.
|
||||||
#[cfg(feature = "quickjs")]
|
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||||
{
|
{
|
||||||
let blobs = js_eval::extract_js_data(html);
|
let blobs = js_eval::extract_js_data(html);
|
||||||
if !blobs.is_empty() {
|
if !blobs.is_empty() {
|
||||||
|
|
@ -603,4 +617,36 @@ mod tests {
|
||||||
"Should extract content from deep nesting"
|
"Should extract content from deep nesting"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn wasm_direct_call_path_extracts_content() {
|
||||||
|
// On wasm32 `extract_with_options` runs `extract_with_options_inner`
|
||||||
|
// inline (no thread spawn). Exercise that exact entry point here so
|
||||||
|
// the WASM path stays covered on native CI, and assert it produces
|
||||||
|
// the same content as the public threaded entry point.
|
||||||
|
let html = r#"
|
||||||
|
<html lang="en">
|
||||||
|
<head><title>WASM Path</title></head>
|
||||||
|
<body><article><h1>Heading</h1><p>WASM-safe extraction body content.</p></article></body>
|
||||||
|
</html>"#;
|
||||||
|
let opts = ExtractionOptions::default();
|
||||||
|
|
||||||
|
let inner = extract_with_options_inner(html, Some("https://example.com"), &opts)
|
||||||
|
.expect("inner extraction (wasm path) should succeed");
|
||||||
|
assert!(
|
||||||
|
inner
|
||||||
|
.content
|
||||||
|
.markdown
|
||||||
|
.contains("WASM-safe extraction body content"),
|
||||||
|
"wasm direct-call path should extract body, got: {}",
|
||||||
|
inner.content.markdown
|
||||||
|
);
|
||||||
|
|
||||||
|
let threaded = extract_with_options(html, Some("https://example.com"), &opts)
|
||||||
|
.expect("threaded extraction should succeed");
|
||||||
|
assert_eq!(
|
||||||
|
inner.content.markdown, threaded.content.markdown,
|
||||||
|
"wasm path and threaded path must produce identical content"
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -58,7 +58,7 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
||||||
.cloned()
|
.cloned()
|
||||||
.collect();
|
.collect();
|
||||||
for value in &mut useful {
|
for value in &mut useful {
|
||||||
scrub_body_fields(value);
|
scrub_body_fields(value, 0);
|
||||||
}
|
}
|
||||||
if !useful.is_empty() {
|
if !useful.is_empty() {
|
||||||
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
|
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
|
||||||
|
|
@ -117,10 +117,21 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Recursively remove long fields that duplicate the rendered markdown body.
|
/// Recursively remove long fields that duplicate the rendered markdown body.
|
||||||
fn scrub_body_fields(v: &mut serde_json::Value) {
|
///
|
||||||
|
/// `depth` guards against stack exhaustion from attacker-controlled
|
||||||
|
/// JSON-LD / `__NEXT_DATA__` blobs with pathological nesting: past
|
||||||
|
/// [`MAX_SCRUB_DEPTH`] levels we stop descending and leave the subtree
|
||||||
|
/// as-is (it is still size-capped by the `STRUCTURED_DATA_MAX_BYTES`
|
||||||
|
/// budget in `to_llm_text`).
|
||||||
|
fn scrub_body_fields(v: &mut serde_json::Value, depth: usize) {
|
||||||
const BODY_KEYS: &[&str] = &["articleBody"];
|
const BODY_KEYS: &[&str] = &["articleBody"];
|
||||||
const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
|
const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
|
||||||
const LONG_THRESHOLD: usize = 500;
|
const LONG_THRESHOLD: usize = 500;
|
||||||
|
const MAX_SCRUB_DEPTH: usize = 64;
|
||||||
|
|
||||||
|
if depth >= MAX_SCRUB_DEPTH {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
match v {
|
match v {
|
||||||
serde_json::Value::Object(map) => {
|
serde_json::Value::Object(map) => {
|
||||||
|
|
@ -136,12 +147,12 @@ fn scrub_body_fields(v: &mut serde_json::Value) {
|
||||||
true
|
true
|
||||||
});
|
});
|
||||||
for value in map.values_mut() {
|
for value in map.values_mut() {
|
||||||
scrub_body_fields(value);
|
scrub_body_fields(value, depth + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
serde_json::Value::Array(values) => {
|
serde_json::Value::Array(values) => {
|
||||||
for value in values {
|
for value in values {
|
||||||
scrub_body_fields(value);
|
scrub_body_fields(value, depth + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
|
|
@ -908,4 +919,53 @@ mod tests {
|
||||||
"Compact untyped array dropped: {out}"
|
"Compact untyped array dropped: {out}"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Walk `value` down its single `"n"` child link and return the depth
|
||||||
|
/// at which an `articleBody` key is still present (i.e. was NOT
|
||||||
|
/// scrubbed). Used to observe exactly where the recursion stopped.
|
||||||
|
fn first_unscrubbed_article_body_depth(mut value: &serde_json::Value) -> Option<usize> {
|
||||||
|
let mut depth = 0;
|
||||||
|
loop {
|
||||||
|
let obj = value.as_object()?;
|
||||||
|
if obj.contains_key("articleBody") {
|
||||||
|
return Some(depth);
|
||||||
|
}
|
||||||
|
value = obj.get("n")?;
|
||||||
|
depth += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn scrub_body_fields_bounds_recursion_on_deep_nesting() {
|
||||||
|
// Attacker-controlled JSON-LD / __NEXT_DATA__ with pathological
|
||||||
|
// nesting must not recurse without bound. Build a chain a little
|
||||||
|
// past the 64-level cap where every level carries a scrub-able
|
||||||
|
// `articleBody`. Levels within the cap get scrubbed; the first
|
||||||
|
// level past the cap keeps its `articleBody` because recursion
|
||||||
|
// stopped — that is the bound we assert. (Kept shallow on purpose:
|
||||||
|
// serde_json drops Values recursively, so a 10k-deep value would
|
||||||
|
// overflow the stack just being dropped.)
|
||||||
|
const DEPTH: usize = 80;
|
||||||
|
let mut node = serde_json::json!({ "articleBody": "x".repeat(600) });
|
||||||
|
for _ in 0..DEPTH {
|
||||||
|
node = serde_json::json!({
|
||||||
|
"articleBody": "x".repeat(600),
|
||||||
|
"n": node,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
scrub_body_fields(&mut node, 0);
|
||||||
|
|
||||||
|
let stopped_at = first_unscrubbed_article_body_depth(&node)
|
||||||
|
.expect("recursion must stop and leave a deep articleBody intact");
|
||||||
|
// Top levels were scrubbed; the survivor sits right at the cap.
|
||||||
|
assert_eq!(
|
||||||
|
stopped_at, 64,
|
||||||
|
"recursion should stop at the depth cap, stopped at {stopped_at}"
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
node.as_object().unwrap().get("articleBody").is_none(),
|
||||||
|
"shallow articleBody must still be scrubbed"
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -95,12 +95,30 @@ struct Response {
|
||||||
/// per page in collapse_whitespace + strip_markdown).
|
/// per page in collapse_whitespace + strip_markdown).
|
||||||
const MAX_BODY_BYTES: u64 = 50 * 1024 * 1024;
|
const MAX_BODY_BYTES: u64 = 50 * 1024 * 1024;
|
||||||
|
|
||||||
|
/// Running decompression-bomb guard: reject as soon as the bytes already
|
||||||
|
/// buffered plus the next decompressed chunk would cross [`MAX_BODY_BYTES`].
|
||||||
|
/// Saturating arithmetic so a huge chunk length can't wrap the sum.
|
||||||
|
fn check_body_ceiling(buffered: usize, next_chunk: usize) -> Result<(), FetchError> {
|
||||||
|
let total = (buffered as u64).saturating_add(next_chunk as u64);
|
||||||
|
if total > MAX_BODY_BYTES {
|
||||||
|
return Err(FetchError::BodyDecode(format!(
|
||||||
|
"response body exceeds cap {MAX_BODY_BYTES} bytes (decompressed)"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
impl Response {
|
impl Response {
|
||||||
/// Buffer a wreq response into an owned Response. Rejects bodies that
|
/// Buffer a wreq response into an owned Response.
|
||||||
/// advertise a Content-Length beyond [`MAX_BODY_BYTES`] before we pay
|
///
|
||||||
/// the allocation, and truncates after the fact as a belt-and-braces
|
/// Rejects bodies that advertise a Content-Length beyond
|
||||||
/// check against a lying server.
|
/// [`MAX_BODY_BYTES`] before we pay any allocation, then streams the
|
||||||
async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
|
/// body chunk-by-chunk while enforcing a running ceiling. `chunk()`
|
||||||
|
/// yields *post-decompression* bytes (gzip/brotli/zstd/deflate are
|
||||||
|
/// negotiated), so a tiny compressed payload that inflates to
|
||||||
|
/// gigabytes is aborted as soon as the accumulated size crosses the
|
||||||
|
/// cap — it never gets fully buffered in memory.
|
||||||
|
async fn from_wreq(mut resp: wreq::Response) -> Result<Self, FetchError> {
|
||||||
if let Some(len) = resp.content_length()
|
if let Some(len) = resp.content_length()
|
||||||
&& len > MAX_BODY_BYTES
|
&& len > MAX_BODY_BYTES
|
||||||
{
|
{
|
||||||
|
|
@ -111,21 +129,22 @@ impl Response {
|
||||||
let status = resp.status().as_u16();
|
let status = resp.status().as_u16();
|
||||||
let url = resp.uri().to_string();
|
let url = resp.uri().to_string();
|
||||||
let headers = resp.headers().clone();
|
let headers = resp.headers().clone();
|
||||||
let body = resp
|
|
||||||
.bytes()
|
let mut buf = bytes::BytesMut::new();
|
||||||
|
while let Some(chunk) = resp
|
||||||
|
.chunk()
|
||||||
.await
|
.await
|
||||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
.map_err(|e| FetchError::BodyDecode(e.to_string()))?
|
||||||
if body.len() as u64 > MAX_BODY_BYTES {
|
{
|
||||||
return Err(FetchError::BodyDecode(format!(
|
check_body_ceiling(buf.len(), chunk.len())?;
|
||||||
"response body {} bytes exceeds cap {MAX_BODY_BYTES}",
|
buf.extend_from_slice(&chunk);
|
||||||
body.len()
|
|
||||||
)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
status,
|
status,
|
||||||
url,
|
url,
|
||||||
headers,
|
headers,
|
||||||
body,
|
body: buf.freeze(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -896,6 +915,28 @@ mod tests {
|
||||||
assert!(err.result.is_err());
|
assert!(err.result.is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn body_ceiling_allows_under_cap() {
|
||||||
|
assert!(check_body_ceiling(0, 1024).is_ok());
|
||||||
|
assert!(check_body_ceiling(MAX_BODY_BYTES as usize - 1, 1).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn body_ceiling_rejects_at_and_over_cap() {
|
||||||
|
// Exactly at the cap is allowed; one byte over is rejected.
|
||||||
|
assert!(check_body_ceiling(MAX_BODY_BYTES as usize, 1).is_err());
|
||||||
|
// A small buffer plus a huge inflated chunk (decompression bomb)
|
||||||
|
// is caught on the very first oversized chunk.
|
||||||
|
let err = check_body_ceiling(16, 64 * 1024 * 1024).unwrap_err();
|
||||||
|
assert!(matches!(err, FetchError::BodyDecode(_)));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn body_ceiling_saturates_on_overflow() {
|
||||||
|
// usize::MAX chunk must not wrap the running sum to a small value.
|
||||||
|
assert!(check_body_ceiling(usize::MAX, usize::MAX).is_err());
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_batch_extract_result_struct() {
|
fn test_batch_extract_result_struct() {
|
||||||
let err = BatchExtractResult {
|
let err = BatchExtractResult {
|
||||||
|
|
|
||||||
|
|
@ -533,8 +533,9 @@ pub fn build_client(
|
||||||
.timeout(timeout);
|
.timeout(timeout);
|
||||||
|
|
||||||
if let Some(proxy_url) = proxy {
|
if let Some(proxy_url) = proxy {
|
||||||
let proxy =
|
let proxy = wreq::Proxy::all(proxy_url).map_err(|_| {
|
||||||
wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
|
FetchError::Build(format!("invalid proxy {}", redact_proxy_url(proxy_url)))
|
||||||
|
})?;
|
||||||
builder = builder.proxy(proxy);
|
builder = builder.proxy(proxy);
|
||||||
} else {
|
} else {
|
||||||
builder = builder.dns_resolver(PublicDnsResolver);
|
builder = builder.dns_resolver(PublicDnsResolver);
|
||||||
|
|
@ -545,6 +546,24 @@ pub fn build_client(
|
||||||
.map_err(|e| FetchError::Build(e.to_string()))
|
.map_err(|e| FetchError::Build(e.to_string()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Render a proxy URL safe to log: drop any `user:pass@` userinfo so
|
||||||
|
/// rotating-proxy credentials never reach error strings or tracing.
|
||||||
|
/// Falls back to a constant placeholder when the input does not parse.
|
||||||
|
fn redact_proxy_url(raw: &str) -> String {
|
||||||
|
match url::Url::parse(raw) {
|
||||||
|
Ok(mut u) => {
|
||||||
|
// Best-effort: opaque URLs (e.g. no host) reject these setters;
|
||||||
|
// in that case fall through to the placeholder rather than risk
|
||||||
|
// returning the raw string with credentials.
|
||||||
|
if u.set_username("").is_err() || u.set_password(None).is_err() {
|
||||||
|
return "<proxy redacted>".to_string();
|
||||||
|
}
|
||||||
|
u.to_string()
|
||||||
|
}
|
||||||
|
Err(_) => "<proxy redacted>".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn ssrf_safe_redirect_policy(
|
fn ssrf_safe_redirect_policy(
|
||||||
follow_redirects: bool,
|
follow_redirects: bool,
|
||||||
max_redirects: usize,
|
max_redirects: usize,
|
||||||
|
|
@ -567,3 +586,41 @@ fn ssrf_safe_redirect_policy(
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::redact_proxy_url;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn redacts_userinfo_from_proxy_url() {
|
||||||
|
let red = redact_proxy_url("http://user123:s3cr3tPass@proxy.example.com:8080");
|
||||||
|
assert!(!red.contains("user123"), "username leaked: {red}");
|
||||||
|
assert!(!red.contains("s3cr3tPass"), "password leaked: {red}");
|
||||||
|
assert!(red.contains("proxy.example.com"), "host lost: {red}");
|
||||||
|
assert!(red.contains("8080"), "port lost: {red}");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn redacts_long_token_residential_proxy() {
|
||||||
|
// Residential-style: long structured credential with embedded
|
||||||
|
// tokens in the username and special chars in the password.
|
||||||
|
let red =
|
||||||
|
redact_proxy_url("http://acct-zone-resi-country-xx:p@ss-word@gw.proxy.example:12321");
|
||||||
|
assert!(!red.contains("acct-zone-resi"), "username leaked: {red}");
|
||||||
|
assert!(!red.contains("p@ss-word"), "password leaked: {red}");
|
||||||
|
assert!(red.contains("gw.proxy.example"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn unparseable_proxy_does_not_echo_input() {
|
||||||
|
let red = redact_proxy_url("user:pass@not a url");
|
||||||
|
assert_eq!(red, "<proxy redacted>");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn proxy_without_credentials_is_preserved() {
|
||||||
|
let red = redact_proxy_url("http://proxy.example.com:3128");
|
||||||
|
assert!(red.contains("proxy.example.com"));
|
||||||
|
assert!(red.contains("3128"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -800,7 +800,9 @@ fn slugify(query: &str) -> String {
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
.join("-")
|
.join("-")
|
||||||
.to_lowercase();
|
.to_lowercase();
|
||||||
if s.len() > 60 { s[..60].to_string() } else { s }
|
// char-safe truncation: byte slicing panics if char 60 lands
|
||||||
|
// mid-codepoint (multibyte queries, e.g. CJK / accented input).
|
||||||
|
s.chars().take(60).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check for a cached research result. Returns the compact response if found.
|
/// Check for a cached research result. Returns the compact response if found.
|
||||||
|
|
@ -856,3 +858,32 @@ fn save_research(dir: &std::path::Path, slug: &str, data: &serde_json::Value) ->
|
||||||
json_path.to_string_lossy().to_string(),
|
json_path.to_string_lossy().to_string(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::slugify;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn slugify_multibyte_query_does_not_panic() {
|
||||||
|
// Byte-slicing s[..60] would panic mid-codepoint on multibyte
|
||||||
|
// alphanumerics; char-safe truncation must not.
|
||||||
|
let q = "日本語のクエリ".repeat(20); // long, 3-byte chars
|
||||||
|
let s = slugify(&q);
|
||||||
|
assert!(
|
||||||
|
s.chars().count() <= 60,
|
||||||
|
"slug too long: {}",
|
||||||
|
s.chars().count()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn slugify_ascii_unchanged_under_limit() {
|
||||||
|
assert_eq!(slugify("Hello World Query"), "hello-world-query");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn slugify_caps_long_ascii_at_60_chars() {
|
||||||
|
let s = slugify(&"word ".repeat(40));
|
||||||
|
assert!(s.len() <= 60);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,19 @@ warn() { printf "${YELLOW}[!]${RESET} %s\n" "$*"; }
|
||||||
error() { printf "${RED}[x]${RESET} %s\n" "$*" >&2; }
|
error() { printf "${RED}[x]${RESET} %s\n" "$*" >&2; }
|
||||||
fatal() { error "$*"; exit 1; }
|
fatal() { error "$*"; exit 1; }
|
||||||
|
|
||||||
|
# Mask a secret for display: keep the last 4 chars, redact the rest.
|
||||||
|
# Empty input renders as "(not set)".
|
||||||
|
mask_secret() {
|
||||||
|
local s="$1"
|
||||||
|
if [[ -z "$s" ]]; then
|
||||||
|
printf '(not set)'
|
||||||
|
elif (( ${#s} <= 4 )); then
|
||||||
|
printf '****'
|
||||||
|
else
|
||||||
|
printf '****%s' "${s: -4}"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
prompt() {
|
prompt() {
|
||||||
local var_name="$1" prompt_text="$2" default="${3:-}"
|
local var_name="$1" prompt_text="$2" default="${3:-}"
|
||||||
if [[ -n "$default" ]]; then
|
if [[ -n "$default" ]]; then
|
||||||
|
|
@ -52,7 +65,7 @@ prompt() {
|
||||||
printf "${CYAN} %s${RESET}: " "$prompt_text"
|
printf "${CYAN} %s${RESET}: " "$prompt_text"
|
||||||
fi
|
fi
|
||||||
read -r input
|
read -r input
|
||||||
eval "$var_name=\"${input:-$default}\""
|
printf -v "$var_name" '%s' "${input:-$default}"
|
||||||
}
|
}
|
||||||
|
|
||||||
prompt_secret() {
|
prompt_secret() {
|
||||||
|
|
@ -64,7 +77,7 @@ prompt_secret() {
|
||||||
fi
|
fi
|
||||||
read -rs input
|
read -rs input
|
||||||
echo
|
echo
|
||||||
eval "$var_name=\"${input:-$default}\""
|
printf -v "$var_name" '%s' "${input:-$default}"
|
||||||
}
|
}
|
||||||
|
|
||||||
generate_key() {
|
generate_key() {
|
||||||
|
|
@ -374,7 +387,7 @@ create_server() {
|
||||||
printf " Domain: ${BOLD}%s${RESET}\n" "${DOMAIN:-none}"
|
printf " Domain: ${BOLD}%s${RESET}\n" "${DOMAIN:-none}"
|
||||||
printf " OpenAI key: ${BOLD}%s${RESET}\n" "$([ -n "$OPENAI_KEY" ] && echo 'set' || echo 'not set')"
|
printf " OpenAI key: ${BOLD}%s${RESET}\n" "$([ -n "$OPENAI_KEY" ] && echo 'set' || echo 'not set')"
|
||||||
printf " Anthropic key:${BOLD}%s${RESET}\n" "$([ -n "$ANTHROPIC_KEY" ] && echo 'set' || echo 'not set')"
|
printf " Anthropic key:${BOLD}%s${RESET}\n" "$([ -n "$ANTHROPIC_KEY" ] && echo 'set' || echo 'not set')"
|
||||||
printf " Auth key: ${BOLD}%s${RESET}\n" "$AUTH_KEY"
|
printf " Auth key: ${BOLD}%s${RESET}\n" "$(mask_secret "$AUTH_KEY")"
|
||||||
printf " Ollama model: ${BOLD}%s${RESET}\n" "$OLLAMA_MODEL"
|
printf " Ollama model: ${BOLD}%s${RESET}\n" "$OLLAMA_MODEL"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
|
|
@ -454,7 +467,9 @@ create_server() {
|
||||||
echo
|
echo
|
||||||
printf " ${BOLD}Server IP:${RESET} %s\n" "$server_ip"
|
printf " ${BOLD}Server IP:${RESET} %s\n" "$server_ip"
|
||||||
printf " ${BOLD}SSH:${RESET} ssh root@%s\n" "$server_ip"
|
printf " ${BOLD}SSH:${RESET} ssh root@%s\n" "$server_ip"
|
||||||
printf " ${BOLD}Auth key:${RESET} %s\n" "$AUTH_KEY"
|
printf " ${BOLD}Auth key:${RESET} %s\n" "$(mask_secret "$AUTH_KEY")"
|
||||||
|
printf " ${DIM}(full key stored in /opt/webclaw/.env on the server:\n"
|
||||||
|
printf " ssh root@%s 'grep WEBCLAW_AUTH_KEY /opt/webclaw/.env')${RESET}\n" "$server_ip"
|
||||||
echo
|
echo
|
||||||
printf " ${BOLD}Monitor build progress:${RESET}\n"
|
printf " ${BOLD}Monitor build progress:${RESET}\n"
|
||||||
printf " ssh root@%s 'cd /opt/webclaw && docker compose logs -f'\n" "$server_ip"
|
printf " ssh root@%s 'cd /opt/webclaw && docker compose logs -f'\n" "$server_ip"
|
||||||
|
|
@ -465,7 +480,7 @@ create_server() {
|
||||||
printf " ${BOLD}Scrape:${RESET}\n"
|
printf " ${BOLD}Scrape:${RESET}\n"
|
||||||
printf " curl -X POST http://%s:3000/v1/scrape \\\\\n" "$server_ip"
|
printf " curl -X POST http://%s:3000/v1/scrape \\\\\n" "$server_ip"
|
||||||
printf " -H 'Content-Type: application/json' \\\\\n"
|
printf " -H 'Content-Type: application/json' \\\\\n"
|
||||||
printf " -H 'Authorization: Bearer %s' \\\\\n" "$AUTH_KEY"
|
printf " -H 'Authorization: Bearer <YOUR_AUTH_KEY>' \\\\\n"
|
||||||
printf " -d '{\"url\": \"https://example.com\"}'\n"
|
printf " -d '{\"url\": \"https://example.com\"}'\n"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
|
|
@ -482,7 +497,8 @@ create_server() {
|
||||||
echo
|
echo
|
||||||
|
|
||||||
printf " ${BOLD}Tear down:${RESET}\n"
|
printf " ${BOLD}Tear down:${RESET}\n"
|
||||||
printf " HETZNER_TOKEN=%s ./deploy/hetzner.sh --destroy\n" "$HETZNER_TOKEN"
|
printf " HETZNER_TOKEN=\$HETZNER_TOKEN ./deploy/hetzner.sh --destroy\n"
|
||||||
|
printf " ${DIM}(re-export the same token you used to deploy)${RESET}\n"
|
||||||
echo
|
echo
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
4
setup.sh
4
setup.sh
|
|
@ -36,7 +36,7 @@ prompt() {
|
||||||
printf "${CYAN} %s${RESET}: " "$prompt_text"
|
printf "${CYAN} %s${RESET}: " "$prompt_text"
|
||||||
fi
|
fi
|
||||||
read -r input
|
read -r input
|
||||||
eval "$var_name=\"${input:-$default}\""
|
printf -v "$var_name" '%s' "${input:-$default}"
|
||||||
}
|
}
|
||||||
|
|
||||||
prompt_secret() {
|
prompt_secret() {
|
||||||
|
|
@ -48,7 +48,7 @@ prompt_secret() {
|
||||||
fi
|
fi
|
||||||
read -rs input
|
read -rs input
|
||||||
echo
|
echo
|
||||||
eval "$var_name=\"${input:-$default}\""
|
printf -v "$var_name" '%s' "${input:-$default}"
|
||||||
}
|
}
|
||||||
|
|
||||||
prompt_yn() {
|
prompt_yn() {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue