From cb31c70465fd7a8c2b706db9f174d42ddbe2f1e5 Mon Sep 17 00:00:00 2001 From: karolinnger <104932164+karolinnger@users.noreply.github.com> Date: Sat, 16 May 2026 22:22:18 -0400 Subject: [PATCH] add network capture endpoint replay --- Cargo.lock | 214 +++++++- README.md | 34 ++ SKILL.md | 28 ++ crates/webclaw-capture/Cargo.toml | 21 + crates/webclaw-capture/src/cdp.rs | 404 +++++++++++++++ crates/webclaw-capture/src/classify.rs | 253 ++++++++++ crates/webclaw-capture/src/infer.rs | 386 +++++++++++++++ crates/webclaw-capture/src/lib.rs | 8 + crates/webclaw-capture/src/openapi.rs | 463 ++++++++++++++++++ crates/webclaw-capture/src/redact.rs | 236 +++++++++ crates/webclaw-capture/src/replay.rs | 383 +++++++++++++++ crates/webclaw-capture/src/store.rs | 221 +++++++++ crates/webclaw-capture/src/types.rs | 174 +++++++ crates/webclaw-capture/tests/classify.rs | 216 ++++++++ .../tests/fixtures/sample.har.json | 139 ++++++ crates/webclaw-capture/tests/infer.rs | 261 ++++++++++ .../tests/integration_capture.rs | 245 +++++++++ crates/webclaw-capture/tests/openapi.rs | 358 ++++++++++++++ crates/webclaw-capture/tests/redact.rs | 209 ++++++++ crates/webclaw-capture/tests/replay.rs | 414 ++++++++++++++++ crates/webclaw-capture/tests/store.rs | 312 ++++++++++++ crates/webclaw-cli/Cargo.toml | 1 + crates/webclaw-cli/src/main.rs | 225 +++++++++ crates/webclaw-fetch/src/sitemap.rs | 2 +- crates/webclaw-fetch/src/url_security.rs | 2 +- crates/webclaw-fetch/tests/bench_1k.rs | 2 +- crates/webclaw-mcp/Cargo.toml | 1 + crates/webclaw-mcp/src/main.rs | 43 ++ crates/webclaw-mcp/src/server.rs | 397 ++++++++++++++- crates/webclaw-mcp/src/tools.rs | 57 +++ crates/webclaw-server/Cargo.toml | 1 + crates/webclaw-server/src/main.rs | 10 + crates/webclaw-server/src/routes/capture.rs | 283 +++++++++++ crates/webclaw-server/src/routes/mod.rs | 1 + 34 files changed, 5996 insertions(+), 8 deletions(-) create mode 100644 crates/webclaw-capture/Cargo.toml create mode 100644 crates/webclaw-capture/src/cdp.rs create mode 100644 crates/webclaw-capture/src/classify.rs create mode 100644 crates/webclaw-capture/src/infer.rs create mode 100644 crates/webclaw-capture/src/lib.rs create mode 100644 crates/webclaw-capture/src/openapi.rs create mode 100644 crates/webclaw-capture/src/redact.rs create mode 100644 crates/webclaw-capture/src/replay.rs create mode 100644 crates/webclaw-capture/src/store.rs create mode 100644 crates/webclaw-capture/src/types.rs create mode 100644 crates/webclaw-capture/tests/classify.rs create mode 100644 crates/webclaw-capture/tests/fixtures/sample.har.json create mode 100644 crates/webclaw-capture/tests/infer.rs create mode 100644 crates/webclaw-capture/tests/integration_capture.rs create mode 100644 crates/webclaw-capture/tests/openapi.rs create mode 100644 crates/webclaw-capture/tests/redact.rs create mode 100644 crates/webclaw-capture/tests/replay.rs create mode 100644 crates/webclaw-capture/tests/store.rs create mode 100644 crates/webclaw-server/src/routes/capture.rs diff --git a/Cargo.lock b/Cargo.lock index 5b96a0b..20bb2db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -161,6 +161,23 @@ dependencies = [ "syn", ] +[[package]] +name = "async-tungstenite" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc405d38be14342132609f06f02acaf825ddccfe76c4824a69281e0458ebd4" +dependencies = [ + "atomic-waker", + "futures-core", + "futures-io", + "futures-task", + "futures-util", + "log", + "pin-project-lite", + "tokio", + "tungstenite", +] + [[package]] name = "atoi_simd" version = "0.17.0" @@ -348,6 +365,9 @@ name = "bytes" version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +dependencies = [ + "serde", +] [[package]] name = "bzip2" @@ -418,6 +438,71 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chromiumoxide" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26ed067eb6c1f660bdb87c05efb964421d2ca262bae0296cdfe38cf0cd949a3e" +dependencies = [ + "async-tungstenite", + "base64", + "bytes", + "chromiumoxide_cdp", + "chromiumoxide_types", + "dunce", + "fnv", + "futures", + "futures-timer", + "pin-project-lite", + "reqwest 0.13.3", + "serde", + "serde_json", + "thiserror", + "tokio", + "tracing", + "url", + "which", + "windows-registry", +] + +[[package]] +name = "chromiumoxide_cdp" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68a6a03a7ebac4ea85308f285d6959a3e6b2ce32a0c9465dc7a7b1db0144eec7" +dependencies = [ + "chromiumoxide_pdl", + "chromiumoxide_types", + "serde", + "serde_json", +] + +[[package]] +name = "chromiumoxide_pdl" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c602dea92337bc4d824668d78c5b79c3b4ddb29b40dd7218282bbe8fd3fc2091" +dependencies = [ + "chromiumoxide_types", + "either", + "heck", + "once_cell", + "proc-macro2", + "quote", + "regex", + "serde_json", +] + +[[package]] +name = "chromiumoxide_types" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "678d5146e74f16fc4a41978b275af572cd913de1f10270d2b93b6c276bc57d80" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "chrono" version = "0.4.44" @@ -665,6 +750,12 @@ dependencies = [ "syn", ] +[[package]] +name = "data-encoding" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4ae5f15dda3c708c0ade84bfee31ccab44a3da4f88015ed22f63732abe300c8" + [[package]] name = "debug_unsafe" version = "0.1.4" @@ -772,6 +863,12 @@ dependencies = [ "dtoa", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "dyn-clone" version = "1.0.20" @@ -998,6 +1095,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" +[[package]] +name = "futures-timer" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" + [[package]] name = "futures-util" version = "0.3.32" @@ -1116,6 +1219,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "hmac" version = "0.12.1" @@ -2207,6 +2316,35 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "reqwest" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62e0021ea2c22aed41653bc7e1419abb2c97e038ff2c33d0e1309e49a97deec0" +dependencies = [ + "base64", + "bytes", + "futures-core", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-util", + "js-sys", + "log", + "percent-encoding", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "ring" version = "0.17.14" @@ -2529,6 +2667,17 @@ dependencies = [ "digest", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -2958,6 +3107,23 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "tungstenite" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8628dcc84e5a09eb3d8423d6cb682965dea9133204e8fb3efee74c2a0c259442" +dependencies = [ + "bytes", + "data-encoding", + "http", + "httparse", + "log", + "rand 0.9.2", + "sha1", + "thiserror", + "utf-8", +] + [[package]] name = "type1-encoding-parser" version = "0.1.1" @@ -3217,6 +3383,25 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webclaw-capture" +version = "0.6.1" +dependencies = [ + "chromiumoxide", + "chrono", + "dirs", + "futures-util", + "hex", + "reqwest 0.12.28", + "serde", + "serde_json", + "sha2", + "thiserror", + "tokio", + "tracing", + "url", +] + [[package]] name = "webclaw-cli" version = "0.6.1" @@ -3225,13 +3410,14 @@ dependencies = [ "dotenvy", "rand 0.8.5", "regex", - "reqwest", + "reqwest 0.12.28", "serde_json", "shlex", "tokio", "tracing", "tracing-subscriber", "url", + "webclaw-capture", "webclaw-core", "webclaw-fetch", "webclaw-llm", @@ -3267,7 +3453,7 @@ dependencies = [ "quick-xml 0.37.5", "rand 0.8.5", "regex", - "reqwest", + "reqwest 0.12.28", "serde", "serde_json", "tempfile", @@ -3287,7 +3473,7 @@ name = "webclaw-llm" version = "0.6.1" dependencies = [ "async-trait", - "reqwest", + "reqwest 0.12.28", "serde", "serde_json", "thiserror", @@ -3309,6 +3495,7 @@ dependencies = [ "tracing", "tracing-subscriber", "url", + "webclaw-capture", "webclaw-core", "webclaw-fetch", "webclaw-llm", @@ -3339,6 +3526,7 @@ dependencies = [ "tower-http", "tracing", "tracing-subscriber", + "webclaw-capture", "webclaw-core", "webclaw-fetch", "webclaw-llm", @@ -3369,6 +3557,15 @@ version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" +[[package]] +name = "which" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81995fafaaaf6ae47a7d0cc83c67caf92aeb7e5331650ae6ff856f7c0c60c459" +dependencies = [ + "libc", +] + [[package]] name = "winapi" version = "0.3.9" @@ -3432,6 +3629,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.4.1" diff --git a/README.md b/README.md index fde6e49..f50685f 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,19 @@ webclaw https://example.com/pricing --format json > pricing-old.json webclaw https://example.com/pricing --diff-with pricing-old.json ``` +### Capture and replay learned APIs + +Capture browser network traffic from a public or authorized page, store the learned endpoints locally, and reuse them from the CLI. Captures are written under `%USERPROFILE%\.webclaw\api-captures` by default, or under `WEBCLAW_CAPTURE_DIR` when that environment variable is set. + +```powershell +webclaw capture-network https://example.com --intent "discover product listing API" --wait-ms 3000 +webclaw endpoints example.com/2026-05-16T12-00-00Z +webclaw replay-endpoint "GET https://example.com/api/products" --dry-run +webclaw export-openapi example.com/2026-05-16T12-00-00Z +``` + +Use `webclaw show-endpoint ""` to inspect one learned endpoint before replay. `GET`, `HEAD`, and `OPTIONS` endpoints can be replayed directly; `POST`, `PUT`, `PATCH`, and `DELETE` stay in dry-run preview unless you pass `--confirm-unsafe`. + --- ## MCP Server @@ -186,6 +199,21 @@ Crawl this documentation site and prepare clean context for a RAG index. Extract the brand colors, fonts, and logos from this company website. ``` +### Network capture tools + +MCP clients can use the learned API workflow directly through these tools: + +| Tool | Parameters | What it does | +| --- | --- | --- | +| `capture_network` | `url`, optional `intent`, `wait_ms`, `headed` | Opens a public or authorized HTTP(S) page in Chromium, captures browser network traffic, redacts secrets, infers API endpoints, and saves the capture locally. | +| `discover_endpoints` | `capture_id` | Returns the learned endpoint definitions for a saved capture. | +| `show_endpoint` | `endpoint_id` | Returns one learned endpoint so an agent can inspect method, path, examples, schemas, and safety metadata before reuse. | +| `replay_endpoint` | `endpoint_id`, optional `params_json`, `dry_run`, `confirm_unsafe`, `headers`, `body_json` | Previews or replays a learned endpoint. `GET`, `HEAD`, and `OPTIONS` can execute when `dry_run` is false; `POST`, `PUT`, `PATCH`, and `DELETE` stay as dry-run previews unless `confirm_unsafe` is true. Redacted headers are never sent. | +| `export_openapi` | `capture_id` | Writes `openapi.json` beside a saved capture's `endpoints.json`. | +| `list_captures` | `{}` | Lists saved captures from the configured capture root. | + +Captured artifacts are stored under `%USERPROFILE%\.webclaw\api-captures` by default, or `WEBCLAW_CAPTURE_DIR` when set. Only capture pages and sessions you are authorized to inspect; webclaw does not use these tools to bypass CAPTCHAs, paywalls, login walls, rate limits, or access controls. + --- ## Tools @@ -202,6 +230,12 @@ Extract the brand colors, fonts, and logos from this company website. | `brand` | Extract colors, fonts, logos, and metadata | Yes | | `search` | Search the web and scrape results | Hosted API | | `research` | Multi-source research workflow | Hosted API | +| `capture_network` | Capture browser network traffic and save learned API endpoints | Yes | +| `discover_endpoints` | Return learned endpoints for a saved capture | Yes | +| `show_endpoint` | Inspect one learned endpoint by id | Yes | +| `replay_endpoint` | Preview or safely replay a learned endpoint | Yes | +| `export_openapi` | Export learned endpoints as OpenAPI 3.1 JSON | Yes | +| `list_captures` | List saved network captures | Yes | --- diff --git a/SKILL.md b/SKILL.md index 39fc144..cc5c963 100644 --- a/SKILL.md +++ b/SKILL.md @@ -31,6 +31,34 @@ All requests go to `https://api.webclaw.io/v1/`. Authentication: `Authorization: Bearer $WEBCLAW_API_KEY` +## CLI API capture + +Use the local CLI to capture browser network traffic from a public or authorized page, store learned endpoints locally, replay them safely, or export them as OpenAPI. Captures are written under `%USERPROFILE%\.webclaw\api-captures` by default, or under `WEBCLAW_CAPTURE_DIR` when set. + +```powershell +webclaw capture-network https://example.com --intent "discover product listing API" --wait-ms 3000 +webclaw endpoints example.com/2026-05-16T12-00-00Z +webclaw replay-endpoint "GET https://example.com/api/products" --dry-run +webclaw export-openapi example.com/2026-05-16T12-00-00Z +``` + +Use `webclaw show-endpoint ""` to inspect one learned endpoint before replay. `GET`, `HEAD`, and `OPTIONS` endpoints can be replayed directly; `POST`, `PUT`, `PATCH`, and `DELETE` stay in dry-run preview unless you pass `--confirm-unsafe`. + +## MCP API capture tools + +Use the MCP server tools when an agent needs to discover and reuse API calls made by a public or authorized page: + +| Tool | Parameters | Use | +|------|------------|-----| +| `capture_network` | `url`, optional `intent`, `wait_ms`, `headed` | Open an HTTP(S) page in Chromium, capture network traffic, redact secrets, infer endpoints, and save the capture locally. | +| `discover_endpoints` | `capture_id` | Return all learned endpoint definitions for a saved capture. | +| `show_endpoint` | `endpoint_id` | Inspect one learned endpoint before replay or OpenAPI export. | +| `replay_endpoint` | `endpoint_id`, optional `params_json`, `dry_run`, `confirm_unsafe`, `headers`, `body_json` | Preview or replay a learned endpoint. Read-only methods can execute when `dry_run` is false; `POST`, `PUT`, `PATCH`, and `DELETE` stay dry-run unless `confirm_unsafe` is true. Redacted headers are never sent. | +| `export_openapi` | `capture_id` | Write `openapi.json` beside the saved capture's `endpoints.json`. | +| `list_captures` | `{}` | List saved captures from the configured capture root. | + +Safety defaults: capture only pages and sessions the user is authorized to inspect, redact secrets by default, and do not use the capture tools to bypass CAPTCHAs, paywalls, login walls, rate limits, or access controls. Captures are stored under `%USERPROFILE%\.webclaw\api-captures` by default, or under `WEBCLAW_CAPTURE_DIR` when set. + ## Endpoints ### 1. Scrape — extract content from a single URL diff --git a/crates/webclaw-capture/Cargo.toml b/crates/webclaw-capture/Cargo.toml new file mode 100644 index 0000000..0a70cc9 --- /dev/null +++ b/crates/webclaw-capture/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "webclaw-capture" +description = "Browser network capture, endpoint inference, and safe replay for Webclaw" +version.workspace = true +edition.workspace = true +license.workspace = true + +[dependencies] +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } +chromiumoxide = "0.9.1" +futures-util = "0.3" +reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } +url = "2" +dirs = "6" +chrono = { version = "0.4", features = ["serde"] } +sha2 = "0.10" +hex = "0.4" diff --git a/crates/webclaw-capture/src/cdp.rs b/crates/webclaw-capture/src/cdp.rs new file mode 100644 index 0000000..ed0c6dd --- /dev/null +++ b/crates/webclaw-capture/src/cdp.rs @@ -0,0 +1,404 @@ +use std::collections::HashMap; +use std::time::Duration; + +use chromiumoxide::cdp::browser_protocol::network::{ + EnableParams, EventLoadingFinished, EventRequestWillBeSent, EventResponseReceived, + GetResponseBodyParams, Headers, RequestId, ResourceType, TimeSinceEpoch, +}; +use chromiumoxide::{Browser, BrowserConfig, Page}; +use chrono::{DateTime, Utc}; +use futures_util::StreamExt; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value, json}; +use tokio::sync::oneshot; +use url::Url; + +use crate::infer::infer_endpoints; +use crate::store::{capture_id_for, save_capture}; +use crate::types::{CaptureArtifact, CaptureError, CapturedExchange, HeaderMap, SavedCapture}; + +const BODY_SAMPLE_LIMIT: usize = 64 * 1024; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct CaptureOptions { + pub url: String, + pub intent: Option, + pub wait_ms: u64, + pub headed: bool, +} + +pub async fn capture_network(options: CaptureOptions) -> Result { + let source_url = + Url::parse(&options.url).map_err(|error| CaptureError::InvalidUrl(error.to_string()))?; + let started_at = Utc::now(); + let capture_id = capture_id_for(&source_url, started_at); + + let (mut browser, mut handler) = launch_browser(options.headed).await?; + let handler_task = tokio::spawn(async move { + while let Some(event) = handler.next().await { + if let Err(error) = event { + tracing::debug!(error = %error, "chromiumoxide browser handler stopped"); + break; + } + } + }); + + let capture_result = async { + let page = browser + .new_page("about:blank") + .await + .map_err(|error| CaptureError::Capture(format!("could not create page: {error}")))?; + + enable_network_capture(&page).await?; + let request_events = page + .event_listener::() + .await + .map_err(|error| { + CaptureError::Capture(format!("could not listen for network requests: {error}")) + })?; + let response_events = page + .event_listener::() + .await + .map_err(|error| { + CaptureError::Capture(format!("could not listen for network responses: {error}")) + })?; + let finished_events = page + .event_listener::() + .await + .map_err(|error| { + CaptureError::Capture(format!("could not listen for completed requests: {error}")) + })?; + + let (stop_tx, stop_rx) = oneshot::channel(); + let collector_page = page.clone(); + let collector_task = tokio::spawn(async move { + collect_exchanges( + collector_page, + request_events, + response_events, + finished_events, + stop_rx, + started_at, + ) + .await + }); + + page.goto(options.url.clone()).await.map_err(|error| { + CaptureError::Capture(format!("could not navigate to {}: {error}", options.url)) + })?; + + tokio::time::sleep(Duration::from_millis(options.wait_ms)).await; + let _ = stop_tx.send(()); + + let exchanges = collector_task + .await + .map_err(|error| CaptureError::Capture(format!("capture collector failed: {error}")))? + .map_err(|error| CaptureError::Capture(format!("capture collector failed: {error}")))?; + let completed_at = Utc::now(); + let endpoints = infer_endpoints(&exchanges); + let exchange_count = exchanges.len(); + let endpoint_count = endpoints.len(); + + let mut metadata = Map::new(); + metadata.insert("wait_ms".to_owned(), json!(options.wait_ms)); + metadata.insert("headed".to_owned(), json!(options.headed)); + metadata.insert("exchange_count".to_owned(), json!(exchange_count)); + metadata.insert("endpoint_count".to_owned(), json!(endpoint_count)); + + let artifact = CaptureArtifact { + id: capture_id, + source_url: options.url, + intent: options.intent, + started_at, + completed_at: Some(completed_at), + exchanges, + endpoints, + metadata, + }; + + save_capture(&artifact) + } + .await; + + if let Err(error) = browser.close().await { + tracing::debug!(error = %error, "failed to close browser after capture"); + } + if let Err(error) = handler_task.await { + tracing::debug!(error = %error, "failed to join browser handler after capture"); + } + + capture_result +} + +async fn launch_browser(headed: bool) -> Result<(Browser, chromiumoxide::Handler), CaptureError> { + let mut config = BrowserConfig::builder() + .request_timeout(Duration::from_secs(15)) + .no_sandbox() + .disable_cache() + .disable_https_first(); + + if headed { + config = config.with_head(); + } + + let config = config.build().map_err(|error| { + CaptureError::Capture(format!("could not build browser config: {error}")) + })?; + + Browser::launch(config) + .await + .map_err(|error| CaptureError::Capture(format!("could not launch Chromium: {error}"))) +} + +async fn enable_network_capture(page: &Page) -> Result<(), CaptureError> { + let params = EnableParams::builder() + .max_total_buffer_size(16 * 1024 * 1024) + .max_resource_buffer_size(2 * 1024 * 1024) + .max_post_data_size(BODY_SAMPLE_LIMIT as i64) + .build(); + + page.execute(params).await.map_err(|error| { + CaptureError::Capture(format!("could not enable CDP network capture: {error}")) + })?; + + Ok(()) +} + +async fn collect_exchanges( + page: Page, + mut request_events: chromiumoxide::listeners::EventStream, + mut response_events: chromiumoxide::listeners::EventStream, + mut finished_events: chromiumoxide::listeners::EventStream, + mut stop_rx: oneshot::Receiver<()>, + fallback_started_at: DateTime, +) -> Result, CaptureError> { + let mut pending = HashMap::::new(); + let mut exchanges = Vec::::new(); + + loop { + tokio::select! { + _ = &mut stop_rx => break, + event = request_events.next() => { + if let Some(event) = event { + record_request(&mut pending, &event, fallback_started_at); + } + } + event = response_events.next() => { + if let Some(event) = event { + record_response(&mut pending, &event); + } + } + event = finished_events.next() => { + if let Some(event) = event + && let Some(exchange) = finish_request(&page, &mut pending, &event).await? + { + exchanges.push(exchange); + } + } + } + } + + for (_request_id, pending_exchange) in pending { + if let Some(exchange) = pending_exchange.into_exchange() { + exchanges.push(exchange); + } + } + + exchanges.sort_by(|left, right| { + left.started_at + .cmp(&right.started_at) + .then_with(|| left.url.cmp(&right.url)) + }); + + Ok(exchanges) +} + +fn record_request( + pending: &mut HashMap, + event: &EventRequestWillBeSent, + fallback_started_at: DateTime, +) { + let request_id = event.request_id.clone(); + let mut current = pending.remove(&request_id).unwrap_or_default(); + + if let Some(redirect_response) = &event.redirect_response { + if !current.url.is_empty() { + current.redirect_chain.push(current.url.clone()); + } + current.redirect_chain.push(redirect_response.url.clone()); + } + + current.method = event.request.method.clone(); + current.url = event.request.url.clone(); + current.request_headers = headers_to_map(&event.request.headers); + current.request_body_sample = request_body_sample(event); + current.resource_type = event.r#type.as_ref().map(resource_type_name); + current.started_at = wall_time_to_utc(&event.wall_time, fallback_started_at); + current.started_monotonic = Some(*event.timestamp.inner()); + + pending.insert(request_id, current); +} + +fn record_response( + pending: &mut HashMap, + event: &EventResponseReceived, +) { + let current = pending.entry(event.request_id.clone()).or_default(); + + if current.url.is_empty() { + current.url = event.response.url.clone(); + } + current.status = u16::try_from(event.response.status).unwrap_or_default(); + current.response_headers = headers_to_map(&event.response.headers); + current.response_mime_type = Some(event.response.mime_type.clone()); + current.resource_type = Some(resource_type_name(&event.r#type)); +} + +async fn finish_request( + page: &Page, + pending: &mut HashMap, + event: &EventLoadingFinished, +) -> Result, CaptureError> { + let Some(mut current) = pending.remove(&event.request_id) else { + return Ok(None); + }; + + if let Some(started) = current.started_monotonic { + let elapsed = ((*event.timestamp.inner() - started) * 1_000.0).max(0.0); + current.duration_ms = elapsed.round() as u64; + } + + current.response_body_sample = response_body_sample(page, event.request_id.clone()).await; + + Ok(current.into_exchange()) +} + +async fn response_body_sample(page: &Page, request_id: RequestId) -> Option { + let response = page + .execute(GetResponseBodyParams::new(request_id)) + .await + .ok()?; + Some(truncate_sample(response.result.body)) +} + +fn headers_to_map(headers: &Headers) -> HeaderMap { + match headers.inner() { + Value::Object(headers) => headers.clone(), + _ => HeaderMap::new(), + } +} + +fn request_body_sample(event: &EventRequestWillBeSent) -> Option { + let entries = event.request.post_data_entries.as_ref()?; + let mut body = String::new(); + + for entry in entries { + if let Some(bytes) = &entry.bytes { + body.push_str(bytes.as_ref()); + } + } + + if body.is_empty() { + None + } else { + Some(truncate_sample(body)) + } +} + +fn resource_type_name(resource_type: &ResourceType) -> String { + resource_type.as_ref().to_owned() +} + +fn wall_time_to_utc(wall_time: &TimeSinceEpoch, fallback: DateTime) -> DateTime { + let seconds = *wall_time.inner(); + if !seconds.is_finite() || seconds < 0.0 { + return fallback; + } + + let whole_seconds = seconds.trunc() as i64; + let nanos = ((seconds.fract() * 1_000_000_000.0).round() as u32).min(999_999_999); + + DateTime::::from_timestamp(whole_seconds, nanos).unwrap_or(fallback) +} + +fn truncate_sample(sample: String) -> String { + if sample.len() <= BODY_SAMPLE_LIMIT { + return sample; + } + + let end = sample + .char_indices() + .take_while(|(index, _)| *index <= BODY_SAMPLE_LIMIT) + .map(|(index, character)| index + character.len_utf8()) + .last() + .unwrap_or(0) + .min(sample.len()); + + sample[..end].to_owned() +} + +#[derive(Debug, Clone)] +struct PendingExchange { + method: String, + url: String, + request_headers: HeaderMap, + request_body_sample: Option, + resource_type: Option, + status: u16, + response_headers: HeaderMap, + response_body_sample: Option, + response_mime_type: Option, + started_at: DateTime, + started_monotonic: Option, + duration_ms: u64, + redirect_chain: Vec, +} + +impl Default for PendingExchange { + fn default() -> Self { + Self { + method: String::new(), + url: String::new(), + request_headers: HeaderMap::new(), + request_body_sample: None, + resource_type: None, + status: 0, + response_headers: HeaderMap::new(), + response_body_sample: None, + response_mime_type: None, + started_at: Utc::now(), + started_monotonic: None, + duration_ms: 0, + redirect_chain: Vec::new(), + } + } +} + +impl PendingExchange { + fn into_exchange(mut self) -> Option { + if self.method.is_empty() || self.url.is_empty() { + return None; + } + + if !self.response_headers.contains_key("content-type") + && let Some(mime_type) = self.response_mime_type.take() + { + self.response_headers + .insert("content-type".to_owned(), Value::String(mime_type)); + } + + Some(CapturedExchange { + method: self.method, + url: self.url, + request_headers: self.request_headers, + request_body_sample: self.request_body_sample, + resource_type: self.resource_type, + status: self.status, + response_headers: self.response_headers, + response_body_sample: self.response_body_sample, + started_at: self.started_at, + duration_ms: self.duration_ms, + redirect_chain: self.redirect_chain, + }) + } +} diff --git a/crates/webclaw-capture/src/classify.rs b/crates/webclaw-capture/src/classify.rs new file mode 100644 index 0000000..c4e8e61 --- /dev/null +++ b/crates/webclaw-capture/src/classify.rs @@ -0,0 +1,253 @@ +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use url::Url; + +use crate::types::CapturedExchange; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct ApiClassification { + pub include: bool, + pub confidence: f32, + pub reasons: Vec, +} + +pub fn classify_exchange(exchange: &CapturedExchange) -> ApiClassification { + let url = match Url::parse(&exchange.url) { + Ok(url) => url, + Err(error) => { + return ApiClassification { + include: false, + confidence: 0.0, + reasons: vec![format!("invalid URL: {error}")], + }; + } + }; + + let mut exclusion_reasons = Vec::new(); + + if is_browser_extension_url(&url) { + exclusion_reasons.push("browser extension URL".to_owned()); + } + + if is_tracking_host(url.host_str()) { + exclusion_reasons.push("tracking, ad, or telemetry host".to_owned()); + } + + if has_static_asset_extension(url.path()) { + exclusion_reasons.push("static asset extension".to_owned()); + } + + if is_static_resource_type(exchange.resource_type.as_deref()) { + exclusion_reasons.push("static browser resource type".to_owned()); + } + + if !exclusion_reasons.is_empty() { + return ApiClassification { + include: false, + confidence: 0.0, + reasons: exclusion_reasons, + }; + } + + let mut confidence = 0.0_f32; + let mut reasons = Vec::new(); + + if matches_resource_type(exchange.resource_type.as_deref(), &["fetch", "xhr"]) { + confidence += 0.65; + reasons.push("browser resource type is fetch/xhr".to_owned()); + } + + if response_is_json(exchange) { + confidence += 0.55; + reasons.push("response content type is JSON".to_owned()); + } + + let path = url.path(); + + if has_api_path(path) { + confidence += 0.55; + reasons.push("URL path contains an API prefix".to_owned()); + } + + if has_versioned_path(path) { + confidence += 0.55; + reasons.push("URL path starts with a versioned API prefix".to_owned()); + } + + if has_graphql_path(path) { + confidence += 0.55; + reasons.push("URL path is GraphQL-like".to_owned()); + } + + if has_graphql_body(exchange.request_body_sample.as_deref()) { + confidence += 0.55; + reasons.push("request body is GraphQL-like".to_owned()); + } + + let confidence = confidence.min(1.0); + + if reasons.is_empty() { + reasons.push("no API traffic signals found".to_owned()); + } + + ApiClassification { + include: confidence >= 0.5, + confidence, + reasons, + } +} + +pub fn filter_api_exchanges(exchanges: &[CapturedExchange]) -> Vec { + exchanges + .iter() + .filter(|exchange| classify_exchange(exchange).include) + .cloned() + .collect() +} + +fn is_browser_extension_url(url: &Url) -> bool { + matches!( + url.scheme().to_ascii_lowercase().as_str(), + "chrome-extension" | "moz-extension" | "edge-extension" | "safari-extension" + ) +} + +fn is_tracking_host(host: Option<&str>) -> bool { + let Some(host) = host else { + return false; + }; + let host = host.to_ascii_lowercase(); + + [ + "google-analytics", + "googletagmanager", + "googlesyndication", + "doubleclick", + "adservice", + "ads.", + ".ads.", + "analytics.", + ".analytics.", + "telemetry", + "segment.", + "segment.io", + "amplitude", + "mixpanel", + "hotjar", + "sentry.io", + "datadog", + "newrelic", + ] + .iter() + .any(|needle| host.contains(needle)) +} + +fn has_static_asset_extension(path: &str) -> bool { + let path = path.to_ascii_lowercase(); + + [ + ".png", ".jpg", ".jpeg", ".gif", ".webp", ".avif", ".svg", ".ico", ".css", ".js", ".mjs", + ".woff", ".woff2", ".ttf", ".otf", ".eot", ".map", ".mp4", ".webm", ".mp3", ".wav", + ] + .iter() + .any(|extension| path.ends_with(extension)) +} + +fn is_static_resource_type(resource_type: Option<&str>) -> bool { + matches_resource_type( + resource_type, + &[ + "image", + "stylesheet", + "script", + "font", + "media", + "manifest", + "ping", + "cspviolationreport", + ], + ) +} + +fn matches_resource_type(resource_type: Option<&str>, candidates: &[&str]) -> bool { + let Some(resource_type) = resource_type else { + return false; + }; + candidates + .iter() + .any(|candidate| resource_type.eq_ignore_ascii_case(candidate)) +} + +fn response_is_json(exchange: &CapturedExchange) -> bool { + exchange.response_headers.iter().any(|(name, value)| { + name.eq_ignore_ascii_case("content-type") + && header_value_as_str(value) + .map(|value| value.to_ascii_lowercase().contains("json")) + .unwrap_or(false) + }) +} + +fn header_value_as_str(value: &Value) -> Option<&str> { + match value { + Value::String(value) => Some(value), + _ => None, + } +} + +fn has_api_path(path: &str) -> bool { + path.split('/') + .filter(|segment| !segment.is_empty()) + .any(|segment| segment.eq_ignore_ascii_case("api")) +} + +fn has_versioned_path(path: &str) -> bool { + path.split('/') + .find(|segment| !segment.is_empty()) + .map(|segment| { + let segment = segment.to_ascii_lowercase(); + segment.len() > 1 + && segment.starts_with('v') + && segment[1..] + .chars() + .all(|character| character.is_ascii_digit()) + }) + .unwrap_or(false) +} + +fn has_graphql_path(path: &str) -> bool { + path.split('/') + .filter(|segment| !segment.is_empty()) + .any(|segment| segment.eq_ignore_ascii_case("graphql")) +} + +fn has_graphql_body(body: Option<&str>) -> bool { + let Some(body) = body else { + return false; + }; + + if let Ok(value) = serde_json::from_str::(body) { + return value + .as_object() + .map(|object| { + object.contains_key("operationName") + || object + .get("query") + .and_then(Value::as_str) + .map(is_graphql_query_text) + .unwrap_or(false) + }) + .unwrap_or(false); + } + + is_graphql_query_text(body) +} + +fn is_graphql_query_text(text: &str) -> bool { + let text = text.trim_start(); + text.starts_with("query ") + || text.starts_with("query{") + || text.starts_with("mutation ") + || text.starts_with("mutation{") + || text.starts_with("subscription ") + || text.starts_with("subscription{") +} diff --git a/crates/webclaw-capture/src/infer.rs b/crates/webclaw-capture/src/infer.rs new file mode 100644 index 0000000..560d63f --- /dev/null +++ b/crates/webclaw-capture/src/infer.rs @@ -0,0 +1,386 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use serde_json::{Map, Value, json}; +use url::Url; + +use crate::classify::filter_api_exchanges; +use crate::redact::{redact_headers, redact_url}; +use crate::types::{ + CapturedExchange, EndpointDefinition, EndpointExample, EndpointSafety, HeaderMap, +}; + +pub fn infer_endpoints(exchanges: &[CapturedExchange]) -> Vec { + let mut groups = BTreeMap::::new(); + + for exchange in filter_api_exchanges(exchanges) { + let Ok(url) = Url::parse(&exchange.url) else { + continue; + }; + + let method = exchange.method.to_ascii_uppercase(); + let origin = url.origin().ascii_serialization(); + let path_template = normalize_path_template(url.path()); + let key = EndpointKey { + method: method.clone(), + origin: origin.clone(), + path_template: path_template.clone(), + }; + + groups + .entry(key) + .or_insert_with(|| EndpointBuilder::new(method, origin, path_template)) + .add_exchange(&exchange, &url); + } + + groups + .into_values() + .map(EndpointBuilder::into_endpoint) + .collect() +} + +pub fn normalize_path_template(path: &str) -> String { + let normalized = if path.is_empty() { "/" } else { path }; + let trailing_slash = normalized.len() > 1 && normalized.ends_with('/'); + + let mut segments = normalized + .split('/') + .filter(|segment| !segment.is_empty()) + .map(|segment| { + if is_identifier_segment(segment) { + "{id}".to_owned() + } else { + segment.to_owned() + } + }) + .collect::>(); + + if segments.is_empty() { + return "/".to_owned(); + } + + let mut path_template = format!("/{}", segments.join("/")); + if trailing_slash { + path_template.push('/'); + } + segments.clear(); + path_template +} + +pub fn infer_json_schema(value: &Value) -> Value { + match value { + Value::Null => json!({ "type": "null" }), + Value::Bool(_) => json!({ "type": "boolean" }), + Value::Number(number) if number.is_i64() || number.is_u64() => { + json!({ "type": "integer" }) + } + Value::Number(_) => json!({ "type": "number" }), + Value::String(_) => json!({ "type": "string" }), + Value::Array(items) => { + let item_schema = items + .iter() + .map(infer_json_schema) + .reduce(|left, right| merge_json_schemas(&left, &right)) + .unwrap_or_else(|| json!({})); + + json!({ + "type": "array", + "items": item_schema + }) + } + Value::Object(object) => { + let properties = object + .iter() + .map(|(key, value)| (key.clone(), infer_json_schema(value))) + .collect::>(); + + json!({ + "type": "object", + "properties": properties + }) + } + } +} + +pub fn endpoint_id(method: &str, origin: &str, path_template: &str) -> String { + format!( + "{} {}{}", + method.to_ascii_uppercase(), + origin.trim_end_matches('/'), + ensure_leading_slash(path_template) + ) +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +struct EndpointKey { + method: String, + origin: String, + path_template: String, +} + +#[derive(Debug, Clone)] +struct EndpointBuilder { + method: String, + origin: String, + path_template: String, + query_params: BTreeMap>, + request_schema: Option, + response_schema: Option, + auth_evidence: BTreeSet, + examples: Vec, +} + +impl EndpointBuilder { + fn new(method: String, origin: String, path_template: String) -> Self { + Self { + method, + origin, + path_template, + query_params: BTreeMap::new(), + request_schema: None, + response_schema: None, + auth_evidence: BTreeSet::new(), + examples: Vec::new(), + } + } + + fn add_exchange(&mut self, exchange: &CapturedExchange, url: &Url) { + for (name, value) in url.query_pairs() { + self.query_params + .entry(name.into_owned()) + .or_default() + .insert(value.into_owned()); + } + + self.record_auth_evidence(&exchange.request_headers); + self.record_auth_evidence(&exchange.response_headers); + + if let Some(schema) = infer_body_schema(exchange.request_body_sample.as_deref()) { + self.request_schema = merge_optional_schema(self.request_schema.take(), schema); + } + + if let Some(schema) = infer_body_schema(exchange.response_body_sample.as_deref()) { + self.response_schema = merge_optional_schema(self.response_schema.take(), schema); + } + + self.examples.push(EndpointExample { + url: redact_url(&exchange.url), + request_headers: redact_headers(&exchange.request_headers), + request_body_sample: redact_body_sample(exchange.request_body_sample.as_deref()), + response_status: exchange.status, + response_headers: redact_headers(&exchange.response_headers), + response_body_sample: redact_body_sample(exchange.response_body_sample.as_deref()), + captured_at: exchange.started_at, + }); + } + + fn into_endpoint(self) -> EndpointDefinition { + let safety = endpoint_safety(&self.method); + + EndpointDefinition { + id: endpoint_id(&self.method, &self.origin, &self.path_template), + method: self.method, + origin: self.origin, + path_template: self.path_template, + query_params: self + .query_params + .into_iter() + .map(|(name, values)| (name, values.into_iter().collect())) + .collect(), + request_schema: self.request_schema, + response_schema: self.response_schema, + auth_evidence: self.auth_evidence.into_iter().collect(), + safety, + examples: self.examples, + } + } + + fn record_auth_evidence(&mut self, headers: &HeaderMap) { + for name in headers.keys() { + if is_auth_evidence_header(name) { + self.auth_evidence.insert(format!("{name} header observed")); + } + } + } +} + +fn infer_body_schema(body: Option<&str>) -> Option { + let body = body?.trim(); + if body.is_empty() { + return None; + } + + serde_json::from_str::(body) + .ok() + .map(|value| infer_json_schema(&value)) +} + +fn merge_optional_schema(current: Option, next: Value) -> Option { + Some(match current { + Some(current) => merge_json_schemas(¤t, &next), + None => next, + }) +} + +fn merge_json_schemas(left: &Value, right: &Value) -> Value { + if left == right { + return left.clone(); + } + + let left_type = left.get("type").and_then(Value::as_str); + let right_type = right.get("type").and_then(Value::as_str); + + match (left_type, right_type) { + (Some("object"), Some("object")) => merge_object_schemas(left, right), + (Some("array"), Some("array")) => { + let left_items = left.get("items").cloned().unwrap_or_else(|| json!({})); + let right_items = right.get("items").cloned().unwrap_or_else(|| json!({})); + json!({ + "type": "array", + "items": merge_json_schemas(&left_items, &right_items) + }) + } + (Some(_), Some(_)) => { + let mut variants = Vec::new(); + push_unique_schema(&mut variants, left.clone()); + push_unique_schema(&mut variants, right.clone()); + json!({ "oneOf": variants }) + } + _ => right.clone(), + } +} + +fn merge_object_schemas(left: &Value, right: &Value) -> Value { + let mut properties = Map::new(); + + if let Some(left_properties) = left.get("properties").and_then(Value::as_object) { + for (name, schema) in left_properties { + properties.insert(name.clone(), schema.clone()); + } + } + + if let Some(right_properties) = right.get("properties").and_then(Value::as_object) { + for (name, schema) in right_properties { + let schema = properties + .remove(name) + .map(|existing| merge_json_schemas(&existing, schema)) + .unwrap_or_else(|| schema.clone()); + properties.insert(name.clone(), schema); + } + } + + json!({ + "type": "object", + "properties": properties + }) +} + +fn push_unique_schema(variants: &mut Vec, schema: Value) { + if let Some(nested) = schema.get("oneOf").and_then(Value::as_array) { + for item in nested { + push_unique_schema(variants, item.clone()); + } + return; + } + + if !variants.iter().any(|existing| existing == &schema) { + variants.push(schema); + } +} + +fn endpoint_safety(method: &str) -> EndpointSafety { + if is_safe_method(method) { + EndpointSafety { + safe_to_replay: true, + requires_confirmation: false, + reason: format!( + "{} is a read-oriented HTTP method", + method.to_ascii_uppercase() + ), + } + } else { + EndpointSafety { + safe_to_replay: false, + requires_confirmation: true, + reason: format!( + "{} may mutate server state and requires confirmation", + method.to_ascii_uppercase() + ), + } + } +} + +fn is_safe_method(method: &str) -> bool { + matches!( + method.to_ascii_uppercase().as_str(), + "GET" | "HEAD" | "OPTIONS" + ) +} + +fn redact_body_sample(sample: Option<&str>) -> Option { + sample.map(|body| match serde_json::from_str::(body) { + Ok(value) => crate::redact::redact_json(&value).to_string(), + Err(_) => body.to_owned(), + }) +} + +fn is_auth_evidence_header(name: &str) -> bool { + let lower = name.to_ascii_lowercase(); + let compact: String = lower + .chars() + .filter(|character| character.is_ascii_alphanumeric()) + .collect(); + + [ + "authorization", + "cookie", + "set-cookie", + "api-key", + "csrf", + "token", + "session", + ] + .iter() + .any(|needle| { + let compact_needle: String = needle + .chars() + .filter(|character| character.is_ascii_alphanumeric()) + .collect(); + + lower.contains(needle) || compact.contains(&compact_needle) + }) +} + +fn is_identifier_segment(segment: &str) -> bool { + is_numeric_segment(segment) || is_uuid_like_segment(segment) || is_high_entropy_segment(segment) +} + +fn is_numeric_segment(segment: &str) -> bool { + !segment.is_empty() && segment.chars().all(|character| character.is_ascii_digit()) +} + +fn is_uuid_like_segment(segment: &str) -> bool { + let parts = segment.split('-').map(str::len).collect::>(); + parts == [8, 4, 4, 4, 12] + && segment + .chars() + .all(|character| character == '-' || character.is_ascii_hexdigit()) +} + +fn is_high_entropy_segment(segment: &str) -> bool { + segment.len() >= 16 + && segment.chars().all(|character| { + character.is_ascii_alphanumeric() || matches!(character, '_' | '-' | '~') + }) + && segment.chars().any(|character| character.is_ascii_digit()) + && segment + .chars() + .any(|character| character.is_ascii_alphabetic()) +} + +fn ensure_leading_slash(path: &str) -> String { + if path.starts_with('/') { + path.to_owned() + } else { + format!("/{path}") + } +} diff --git a/crates/webclaw-capture/src/lib.rs b/crates/webclaw-capture/src/lib.rs new file mode 100644 index 0000000..1071762 --- /dev/null +++ b/crates/webclaw-capture/src/lib.rs @@ -0,0 +1,8 @@ +pub mod cdp; +pub mod classify; +pub mod infer; +pub mod openapi; +pub mod redact; +pub mod replay; +pub mod store; +pub mod types; diff --git a/crates/webclaw-capture/src/openapi.rs b/crates/webclaw-capture/src/openapi.rs new file mode 100644 index 0000000..b3e3729 --- /dev/null +++ b/crates/webclaw-capture/src/openapi.rs @@ -0,0 +1,463 @@ +use std::fs; +use std::path::{Component, Path, PathBuf}; + +use serde_json::{Map, Value, json}; +use url::Url; + +use crate::redact::{redact_headers, redact_json}; +use crate::store::{capture_root, load_endpoints}; +use crate::types::{CaptureError, EndpointDefinition, EndpointExample}; + +const OPENAPI_FILE: &str = "openapi.json"; +const REDACTED: &str = "[REDACTED]"; + +pub fn export_openapi(endpoints: &[EndpointDefinition]) -> Value { + let mut paths = Map::new(); + + for endpoint in endpoints { + let path = normalize_openapi_path(&endpoint.path_template); + let method = endpoint.method.to_ascii_lowercase(); + let operation = operation_for(endpoint); + + let path_item = paths + .entry(path) + .or_insert_with(|| Value::Object(Map::new())); + if let Value::Object(path_item) = path_item { + path_item.insert(method, operation); + } + } + + json!({ + "openapi": "3.1.0", + "info": { + "title": "Webclaw Learned API", + "version": "1.0.0" + }, + "paths": paths + }) +} + +pub fn write_openapi(capture_id: &str) -> Result { + let endpoints = load_endpoints(capture_id)?; + let document = export_openapi(&endpoints); + let capture_dir = capture_dir_for_id(&capture_root(), capture_id)?; + fs::create_dir_all(&capture_dir)?; + + let path = capture_dir.join(OPENAPI_FILE); + fs::write(&path, serde_json::to_string_pretty(&document)?)?; + + Ok(path) +} + +fn operation_for(endpoint: &EndpointDefinition) -> Value { + let mut operation = Map::new(); + let method = endpoint.method.to_ascii_uppercase(); + + operation.insert( + "operationId".to_owned(), + Value::String(operation_id(endpoint)), + ); + operation.insert( + "summary".to_owned(), + Value::String(format!("{method} {}", endpoint.path_template)), + ); + operation.insert( + "x-webclaw-endpoint-id".to_owned(), + Value::String(endpoint.id.clone()), + ); + operation.insert( + "x-webclaw-origin".to_owned(), + Value::String(endpoint.origin.clone()), + ); + + if !endpoint.auth_evidence.is_empty() { + operation.insert( + "x-webclaw-auth-evidence".to_owned(), + json!(endpoint.auth_evidence), + ); + } + + if endpoint.safety.requires_confirmation || !endpoint.safety.safe_to_replay { + operation.insert("x-webclaw-requires-confirmation".to_owned(), json!(true)); + } + + let parameters = parameters_for(endpoint); + if !parameters.is_empty() { + operation.insert("parameters".to_owned(), Value::Array(parameters)); + } + + if let Some(request_body) = request_body_for(endpoint) { + operation.insert("requestBody".to_owned(), request_body); + } + + operation.insert("responses".to_owned(), responses_for(endpoint)); + + let examples = examples_for(endpoint); + if !examples.is_empty() { + operation.insert("x-webclaw-examples".to_owned(), Value::Array(examples)); + } + + Value::Object(operation) +} + +fn parameters_for(endpoint: &EndpointDefinition) -> Vec { + let mut parameters = path_parameters(&endpoint.path_template); + + for (name, values) in &endpoint.query_params { + let examples = examples_object( + values + .iter() + .map(|value| Value::String(redacted_parameter_value(name, value))), + ); + let mut parameter = Map::new(); + + parameter.insert("name".to_owned(), Value::String(name.clone())); + parameter.insert("in".to_owned(), Value::String("query".to_owned())); + parameter.insert("required".to_owned(), Value::Bool(false)); + parameter.insert("schema".to_owned(), json!({ "type": "string" })); + + if !examples.is_empty() { + parameter.insert("examples".to_owned(), Value::Object(examples)); + } + + parameters.push(Value::Object(parameter)); + } + + parameters +} + +fn path_parameters(path_template: &str) -> Vec { + let mut parameters = Vec::new(); + let mut cursor = path_template; + + while let Some(start) = cursor.find('{') { + let after_start = &cursor[start + 1..]; + let Some(end) = after_start.find('}') else { + break; + }; + + let name = &after_start[..end]; + if !name.is_empty() + && !parameters + .iter() + .any(|parameter| parameter_name(parameter) == name) + { + parameters.push(json!({ + "name": name, + "in": "path", + "required": true, + "schema": { "type": "string" } + })); + } + + cursor = &after_start[end + 1..]; + } + + parameters +} + +fn request_body_for(endpoint: &EndpointDefinition) -> Option { + let examples = body_examples(endpoint.examples.iter().filter_map(|example| { + example + .request_body_sample + .as_deref() + .map(redacted_body_sample) + })); + + if endpoint.request_schema.is_none() && examples.is_empty() { + return None; + } + + Some(json!({ + "required": false, + "content": { + "application/json": media_type_object(endpoint.request_schema.clone(), examples) + } + })) +} + +fn responses_for(endpoint: &EndpointDefinition) -> Value { + let mut responses = Map::new(); + let mut statuses = endpoint + .examples + .iter() + .map(|example| example.response_status) + .collect::>(); + + statuses.sort_unstable(); + statuses.dedup(); + + if statuses.is_empty() { + statuses.push(200); + } + + for status in statuses { + let examples = body_examples( + endpoint + .examples + .iter() + .filter(move |example| example.response_status == status) + .filter_map(|example| { + example + .response_body_sample + .as_deref() + .map(redacted_body_sample) + }), + ); + + responses.insert( + status.to_string(), + json!({ + "description": format!("Captured HTTP {status} response"), + "content": { + "application/json": media_type_object(endpoint.response_schema.clone(), examples) + } + }), + ); + } + + Value::Object(responses) +} + +fn media_type_object(schema: Option, examples: Map) -> Value { + let mut media_type = Map::new(); + + if let Some(schema) = schema { + media_type.insert("schema".to_owned(), redact_json(&schema)); + } + + if !examples.is_empty() { + media_type.insert("examples".to_owned(), Value::Object(examples)); + } + + Value::Object(media_type) +} + +fn examples_for(endpoint: &EndpointDefinition) -> Vec { + endpoint.examples.iter().map(redacted_example).collect() +} + +fn redacted_example(example: &EndpointExample) -> Value { + json!({ + "url": redacted_example_url(&example.url), + "request_headers": redact_headers(&example.request_headers), + "request_body": example.request_body_sample.as_deref().map(redacted_body_sample), + "response_status": example.response_status, + "response_headers": redact_headers(&example.response_headers), + "response_body": example.response_body_sample.as_deref().map(redacted_body_sample), + "captured_at": example.captured_at + }) +} + +fn redacted_example_url(url: &str) -> String { + let Ok(mut parsed) = Url::parse(url) else { + return url.to_owned(); + }; + + let pairs: Vec<(String, String)> = parsed.query_pairs().into_owned().collect(); + if pairs.is_empty() { + return parsed.to_string(); + } + + parsed.set_query(None); + { + let mut query = parsed.query_pairs_mut(); + for (name, value) in pairs { + query.append_pair(&name, &redacted_parameter_value(&name, &value)); + } + } + + parsed.to_string() +} + +fn body_examples(values: impl Iterator) -> Map { + examples_object(values) +} + +fn examples_object(values: impl Iterator) -> Map { + let mut examples = Map::new(); + + for (index, value) in values.enumerate() { + examples.insert(format!("captured-{}", index + 1), json!({ "value": value })); + } + + examples +} + +fn redacted_body_sample(sample: &str) -> Value { + match serde_json::from_str::(sample) { + Ok(value) => redact_json(&value), + Err(_) if contains_obvious_secret(sample) => Value::String(REDACTED.to_owned()), + Err(_) => Value::String(sample.to_owned()), + } +} + +fn contains_obvious_secret(value: &str) -> bool { + let lower = value.to_ascii_lowercase(); + lower.contains("bearer ") + || lower.contains("authorization") + || lower.contains("api_key") + || lower.contains("api-key") + || lower.contains("csrf") + || lower.contains("token") + || lower.contains("session") + || lower.contains("password") + || lower.contains("cookie") + || contains_email_like_value(value) +} + +fn redacted_parameter_value(name: &str, value: &str) -> String { + if is_sensitive_name(name) || contains_obvious_secret(value) { + REDACTED.to_owned() + } else { + value.to_owned() + } +} + +fn is_sensitive_name(name: &str) -> bool { + let lower = name.to_ascii_lowercase(); + let compact: String = lower + .chars() + .filter(|character| character.is_ascii_alphanumeric()) + .collect(); + + [ + "authorization", + "cookie", + "set-cookie", + "api-key", + "csrf", + "token", + "session", + "password", + "email", + ] + .iter() + .any(|sensitive| { + let sensitive_compact: String = sensitive + .chars() + .filter(|character| character.is_ascii_alphanumeric()) + .collect(); + + lower.contains(sensitive) || compact.contains(&sensitive_compact) + }) +} + +fn contains_email_like_value(value: &str) -> bool { + let Some(at_index) = value.find('@') else { + return false; + }; + + let before = &value[..at_index]; + let after = &value[at_index + 1..]; + + before + .chars() + .rev() + .take_while(|character| { + character.is_ascii_alphanumeric() || matches!(character, '.' | '_' | '%' | '+' | '-') + }) + .count() + > 0 + && after + .chars() + .take_while(|character| { + character.is_ascii_alphanumeric() || matches!(character, '.' | '-') + }) + .any(|character| character == '.') +} + +fn operation_id(endpoint: &EndpointDefinition) -> String { + format!( + "{}_{}", + endpoint.method.to_ascii_lowercase(), + endpoint + .path_template + .trim_matches('/') + .chars() + .map(|character| { + if character.is_ascii_alphanumeric() { + character.to_ascii_lowercase() + } else { + '_' + } + }) + .collect::() + ) + .trim_matches('_') + .to_owned() +} + +fn normalize_openapi_path(path_template: &str) -> String { + if path_template.starts_with('/') { + path_template.to_owned() + } else { + format!("/{path_template}") + } +} + +fn parameter_name(parameter: &Value) -> &str { + parameter + .get("name") + .and_then(Value::as_str) + .unwrap_or_default() +} + +fn capture_dir_for_id(root: &Path, capture_id: &str) -> Result { + let mut capture_dir = root.to_path_buf(); + let parts = capture_id + .split(['/', '\\']) + .filter(|part| !part.is_empty()) + .collect::>(); + + if parts.is_empty() { + return Err(CaptureError::Storage( + "capture id cannot be empty".to_owned(), + )); + } + + for part in parts { + if !is_safe_path_segment(part) { + return Err(CaptureError::Storage(format!( + "capture id contains unsafe path segment: {capture_id}" + ))); + } + capture_dir.push(part); + } + + ensure_within_root(root, &capture_dir)?; + + Ok(capture_dir) +} + +fn ensure_within_root(root: &Path, path: &Path) -> Result<(), CaptureError> { + if relative_components(path).starts_with(&relative_components(root)) { + Ok(()) + } else { + Err(CaptureError::Storage(format!( + "capture path escapes capture root: {}", + path.display() + ))) + } +} + +fn relative_components(path: &Path) -> Vec { + path.components() + .filter_map(|component| match component { + Component::Prefix(prefix) => Some(prefix.as_os_str().to_string_lossy().to_string()), + Component::RootDir => Some(String::from("\\")), + Component::Normal(value) => Some(value.to_string_lossy().to_string()), + Component::CurDir => None, + Component::ParentDir => Some(String::from("..")), + }) + .collect() +} + +fn is_safe_path_segment(segment: &str) -> bool { + !segment.is_empty() + && segment != "." + && segment != ".." + && !segment.contains(':') + && !segment.contains('/') + && !segment.contains('\\') +} diff --git a/crates/webclaw-capture/src/redact.rs b/crates/webclaw-capture/src/redact.rs new file mode 100644 index 0000000..4993feb --- /dev/null +++ b/crates/webclaw-capture/src/redact.rs @@ -0,0 +1,236 @@ +use std::collections::BTreeMap; + +use serde_json::{Map, Value}; +use url::Url; + +use crate::types::{ + CaptureArtifact, CapturedExchange, EndpointDefinition, EndpointExample, HeaderMap, +}; + +const REDACTED: &str = "[REDACTED]"; + +const SENSITIVE_NAMES: &[&str] = &[ + "authorization", + "cookie", + "set-cookie", + "api-key", + "csrf", + "token", + "session", + "password", + "email", +]; + +pub fn redact_headers(headers: &HeaderMap) -> HeaderMap { + headers + .iter() + .map(|(name, value)| { + let value = if is_sensitive_name(name) { + Value::String(REDACTED.to_owned()) + } else { + value.clone() + }; + (name.clone(), value) + }) + .collect() +} + +pub fn redact_url(url: &str) -> String { + let Ok(mut parsed) = Url::parse(url) else { + return url.to_owned(); + }; + + let pairs: Vec<(String, String)> = parsed.query_pairs().into_owned().collect(); + if pairs.is_empty() { + return parsed.to_string(); + } + + parsed.set_query(None); + { + let mut query = parsed.query_pairs_mut(); + for (name, value) in pairs { + let value = if is_sensitive_name(&name) { + REDACTED.to_owned() + } else { + value + }; + query.append_pair(&name, &value); + } + } + + parsed.to_string() +} + +pub fn redact_json(value: &Value) -> Value { + match value { + Value::Object(object) => Value::Object(redact_json_object(object)), + Value::Array(items) => Value::Array(items.iter().map(redact_json).collect()), + _ => value.clone(), + } +} + +pub fn redact_artifact(artifact: &CaptureArtifact) -> CaptureArtifact { + let metadata = match redact_json(&Value::Object(artifact.metadata.clone())) { + Value::Object(metadata) => metadata, + _ => Map::new(), + }; + + CaptureArtifact { + id: artifact.id.clone(), + source_url: redact_url(&artifact.source_url), + intent: artifact.intent.clone(), + started_at: artifact.started_at, + completed_at: artifact.completed_at, + exchanges: artifact.exchanges.iter().map(redact_exchange).collect(), + endpoints: artifact.endpoints.iter().map(redact_endpoint).collect(), + metadata, + } +} + +fn redact_exchange(exchange: &CapturedExchange) -> CapturedExchange { + CapturedExchange { + method: exchange.method.clone(), + url: redact_url(&exchange.url), + request_headers: redact_headers(&exchange.request_headers), + request_body_sample: redact_body_sample(exchange.request_body_sample.as_deref()), + resource_type: exchange.resource_type.clone(), + status: exchange.status, + response_headers: redact_headers(&exchange.response_headers), + response_body_sample: redact_body_sample(exchange.response_body_sample.as_deref()), + started_at: exchange.started_at, + duration_ms: exchange.duration_ms, + redirect_chain: exchange + .redirect_chain + .iter() + .map(|redirect| redact_url(redirect)) + .collect(), + } +} + +fn redact_endpoint(endpoint: &EndpointDefinition) -> EndpointDefinition { + EndpointDefinition { + id: endpoint.id.clone(), + method: endpoint.method.clone(), + origin: endpoint.origin.clone(), + path_template: endpoint.path_template.clone(), + query_params: redact_query_params(&endpoint.query_params), + request_schema: endpoint.request_schema.as_ref().map(redact_json), + response_schema: endpoint.response_schema.as_ref().map(redact_json), + auth_evidence: endpoint.auth_evidence.clone(), + safety: endpoint.safety.clone(), + examples: endpoint + .examples + .iter() + .map(redact_endpoint_example) + .collect(), + } +} + +fn redact_endpoint_example(example: &EndpointExample) -> EndpointExample { + EndpointExample { + url: redact_url(&example.url), + request_headers: redact_headers(&example.request_headers), + request_body_sample: redact_body_sample(example.request_body_sample.as_deref()), + response_status: example.response_status, + response_headers: redact_headers(&example.response_headers), + response_body_sample: redact_body_sample(example.response_body_sample.as_deref()), + captured_at: example.captured_at, + } +} + +fn redact_query_params(params: &BTreeMap>) -> BTreeMap> { + params + .iter() + .map(|(name, values)| { + let values = if is_sensitive_name(name) { + vec![REDACTED.to_owned()] + } else { + values.clone() + }; + (name.clone(), values) + }) + .collect() +} + +fn redact_json_object(object: &Map) -> Map { + object + .iter() + .map(|(key, value)| { + let value = if is_sensitive_name(key) { + Value::String(REDACTED.to_owned()) + } else { + redact_json(value) + }; + (key.clone(), value) + }) + .collect() +} + +fn redact_body_sample(sample: Option<&str>) -> Option { + sample.map(|body| match serde_json::from_str::(body) { + Ok(value) => redact_json(&value).to_string(), + Err(_) => redact_text_body(body), + }) +} + +fn is_sensitive_name(name: &str) -> bool { + let lower = name.to_ascii_lowercase(); + let compact: String = lower + .chars() + .filter(|ch| ch.is_ascii_alphanumeric()) + .collect(); + + SENSITIVE_NAMES.iter().any(|sensitive| { + let sensitive_compact: String = sensitive + .chars() + .filter(|ch| ch.is_ascii_alphanumeric()) + .collect(); + + lower.contains(sensitive) || compact.contains(&sensitive_compact) + }) +} + +fn redact_text_body(body: &str) -> String { + body.lines() + .map(|line| { + if is_sensitive_text_line(line) { + REDACTED.to_owned() + } else { + line.to_owned() + } + }) + .collect::>() + .join("\n") +} + +fn is_sensitive_text_line(line: &str) -> bool { + is_sensitive_name(line) || contains_bearer_token(line) || contains_email_like_value(line) +} + +fn contains_bearer_token(line: &str) -> bool { + line.to_ascii_lowercase().contains("bearer ") +} + +fn contains_email_like_value(line: &str) -> bool { + let Some(at_index) = line.find('@') else { + return false; + }; + + let before = &line[..at_index]; + let after = &line[at_index + 1..]; + + before + .chars() + .rev() + .take_while(|character| { + character.is_ascii_alphanumeric() || matches!(character, '.' | '_' | '%' | '+' | '-') + }) + .count() + > 0 + && after + .chars() + .take_while(|character| { + character.is_ascii_alphanumeric() || matches!(character, '.' | '-') + }) + .any(|character| character == '.') +} diff --git a/crates/webclaw-capture/src/replay.rs b/crates/webclaw-capture/src/replay.rs new file mode 100644 index 0000000..e0deecd --- /dev/null +++ b/crates/webclaw-capture/src/replay.rs @@ -0,0 +1,383 @@ +use std::collections::BTreeSet; + +use reqwest::{ + Client, Method, RequestBuilder, + header::{HeaderName, HeaderValue}, +}; +use serde_json::{Map, Value}; +use url::{Url, form_urlencoded::byte_serialize}; + +use crate::types::{CaptureError, EndpointDefinition, HeaderMap, ReplayOptions, ReplayResult}; + +const MAX_BODY_SAMPLE_BYTES: usize = 64 * 1024; + +pub async fn replay_endpoint( + endpoint: &EndpointDefinition, + options: ReplayOptions, +) -> Result { + if unsafe_replay_requires_confirmation(endpoint, &options) { + return Ok(ReplayResult::Blocked { + reason: format!( + "{} replay requires --confirm-unsafe unless --dry-run is used", + endpoint.method.to_ascii_uppercase() + ), + }); + } + + let spec = replay_spec(endpoint, &options)?; + if options.dry_run { + return Ok(ReplayResult::Preview { + method: spec.method.as_str().to_owned(), + url: spec.url.to_string(), + headers: spec.headers, + body_sample: spec.body_sample, + }); + } + + let response = request_builder_from_spec(spec)?.send().await?; + let status = response.status().as_u16(); + let headers = response_headers_to_json(response.headers()); + let body = response.bytes().await?; + let body_sample = body_sample_from_bytes(&body); + + Ok(ReplayResult::Executed { + status, + headers, + body_sample, + }) +} + +pub fn build_replay_request( + endpoint: &EndpointDefinition, + options: &ReplayOptions, +) -> Result { + if unsafe_replay_requires_confirmation(endpoint, options) { + return Err(CaptureError::Replay(format!( + "{} replay requires confirmation", + endpoint.method.to_ascii_uppercase() + ))); + } + + request_builder_from_spec(replay_spec(endpoint, options)?) +} + +#[derive(Debug, Clone)] +struct ReplaySpec { + method: Method, + url: Url, + headers: HeaderMap, + body_sample: Option, +} + +fn replay_spec( + endpoint: &EndpointDefinition, + options: &ReplayOptions, +) -> Result { + let method = Method::from_bytes(endpoint.method.as_bytes()).map_err(|error| { + CaptureError::Replay(format!( + "invalid replay method {:?}: {error}", + endpoint.method + )) + })?; + + let (path, consumed_params) = interpolate_path_template(&endpoint.path_template, options)?; + let mut url = Url::parse(&format!( + "{}{}", + endpoint.origin.trim_end_matches('/'), + ensure_leading_slash(&path) + )) + .map_err(|error| CaptureError::InvalidUrl(error.to_string()))?; + + apply_query_params(&mut url, endpoint, options, &consumed_params); + + let mut headers = HeaderMap::new(); + if let Some(example) = endpoint.examples.first() { + merge_safe_headers(&mut headers, &example.request_headers); + } + merge_safe_headers(&mut headers, &options.headers); + + let body_sample = replay_body_sample(endpoint, options)?; + + Ok(ReplaySpec { + method, + url, + headers, + body_sample, + }) +} + +fn request_builder_from_spec(spec: ReplaySpec) -> Result { + let client = Client::new(); + let mut builder = client.request(spec.method, spec.url); + + for (name, value) in spec.headers { + let Some(value) = header_value_to_string(&value) else { + continue; + }; + + let Ok(name) = HeaderName::from_bytes(name.as_bytes()) else { + continue; + }; + let Ok(value) = HeaderValue::from_str(&value) else { + continue; + }; + + builder = builder.header(name, value); + } + + if let Some(body_sample) = spec.body_sample + && !contains_redacted_material(&body_sample) + { + builder = builder.body(body_sample); + } + + Ok(builder) +} + +fn unsafe_replay_requires_confirmation( + endpoint: &EndpointDefinition, + options: &ReplayOptions, +) -> bool { + is_unsafe_endpoint(endpoint) && !options.dry_run && !options.confirm_unsafe +} + +fn is_unsafe_endpoint(endpoint: &EndpointDefinition) -> bool { + endpoint.safety.requires_confirmation + || !endpoint.safety.safe_to_replay + || !matches!( + endpoint.method.to_ascii_uppercase().as_str(), + "GET" | "HEAD" | "OPTIONS" + ) +} + +fn interpolate_path_template( + path_template: &str, + options: &ReplayOptions, +) -> Result<(String, BTreeSet), CaptureError> { + let params = params_object(options); + let mut consumed = BTreeSet::new(); + let mut path = String::new(); + let mut rest = path_template; + + while let Some(start) = rest.find('{') { + let (before, after_start) = rest.split_at(start); + path.push_str(before); + + let Some(end) = after_start.find('}') else { + path.push_str(after_start); + return Ok((path, consumed)); + }; + + let name = &after_start[1..end]; + if let Some(value) = params.and_then(|object| object.get(name)) { + let value = scalar_param_to_string(value).ok_or_else(|| { + CaptureError::Replay(format!("path parameter {name:?} must be scalar")) + })?; + path.push_str(&encode_path_segment(&value)); + consumed.insert(name.to_owned()); + } else { + path.push_str(&after_start[..=end]); + } + + rest = &after_start[end + 1..]; + } + + path.push_str(rest); + Ok((path, consumed)) +} + +fn apply_query_params( + url: &mut Url, + endpoint: &EndpointDefinition, + options: &ReplayOptions, + consumed_params: &BTreeSet, +) { + url.set_query(None); + let mut pairs = Vec::<(String, String)>::new(); + + for (name, values) in &endpoint.query_params { + if consumed_params.contains(name) || is_sensitive_name(name) { + continue; + } + + if let Some(value) = values + .iter() + .find(|value| !contains_redacted_material(value)) + .cloned() + { + pairs.push((name.clone(), value)); + } + } + + if let Some(params) = params_object(options) { + for (name, value) in params { + if consumed_params.contains(name) || is_sensitive_name(name) { + continue; + } + + append_query_value(&mut pairs, name, value); + } + } + + if pairs.is_empty() { + return; + } + + let mut query = url.query_pairs_mut(); + for (name, value) in pairs { + query.append_pair(&name, &value); + } +} + +fn append_query_value(pairs: &mut Vec<(String, String)>, name: &str, value: &Value) { + match value { + Value::Array(values) => { + for value in values { + if let Some(value) = scalar_param_to_string(value) + && !contains_redacted_material(&value) + { + pairs.push((name.to_owned(), value)); + } + } + } + _ => { + if let Some(value) = scalar_param_to_string(value) + && !contains_redacted_material(&value) + { + pairs.retain(|(existing, _value)| existing != name); + pairs.push((name.to_owned(), value)); + } + } + } +} + +fn replay_body_sample( + endpoint: &EndpointDefinition, + options: &ReplayOptions, +) -> Result, CaptureError> { + if let Some(body_json) = &options.body_json { + return Ok(Some(serde_json::to_string(body_json)?)); + } + + let Some(example) = endpoint.examples.first() else { + return Ok(None); + }; + + Ok(example + .request_body_sample + .as_ref() + .filter(|sample| !contains_redacted_material(sample)) + .cloned()) +} + +fn merge_safe_headers(target: &mut HeaderMap, headers: &HeaderMap) { + for (name, value) in headers { + if should_skip_header(name, value) { + continue; + } + + target.insert(name.clone(), value.clone()); + } +} + +fn should_skip_header(name: &str, value: &Value) -> bool { + is_hop_by_hop_header(name) + || header_value_to_string(value) + .map(|value| value.trim().is_empty() || contains_redacted_material(&value)) + .unwrap_or(true) +} + +fn is_hop_by_hop_header(name: &str) -> bool { + matches!( + name.to_ascii_lowercase().as_str(), + "host" | "connection" | "content-length" | "transfer-encoding" | "accept-encoding" + ) +} + +fn header_value_to_string(value: &Value) -> Option { + match value { + Value::String(value) => Some(value.clone()), + Value::Number(value) => Some(value.to_string()), + Value::Bool(value) => Some(value.to_string()), + Value::Null | Value::Array(_) | Value::Object(_) => None, + } +} + +fn response_headers_to_json(headers: &reqwest::header::HeaderMap) -> HeaderMap { + headers + .iter() + .filter_map(|(name, value)| { + value + .to_str() + .ok() + .map(|value| (name.as_str().to_owned(), Value::String(value.to_owned()))) + }) + .collect() +} + +fn body_sample_from_bytes(bytes: &[u8]) -> Option { + if bytes.is_empty() { + return None; + } + + let capped = &bytes[..bytes.len().min(MAX_BODY_SAMPLE_BYTES)]; + Some(String::from_utf8_lossy(capped).into_owned()) +} + +fn params_object(options: &ReplayOptions) -> Option<&Map> { + options.params_json.as_ref()?.as_object() +} + +fn scalar_param_to_string(value: &Value) -> Option { + match value { + Value::String(value) => Some(value.clone()), + Value::Number(value) => Some(value.to_string()), + Value::Bool(value) => Some(value.to_string()), + Value::Null | Value::Array(_) | Value::Object(_) => None, + } +} + +fn contains_redacted_material(value: &str) -> bool { + value.to_ascii_lowercase().contains("[redacted]") +} + +fn is_sensitive_name(name: &str) -> bool { + let lower = name.to_ascii_lowercase(); + let compact: String = lower + .chars() + .filter(|character| character.is_ascii_alphanumeric()) + .collect(); + + [ + "authorization", + "cookie", + "set-cookie", + "api-key", + "csrf", + "token", + "session", + "password", + "email", + ] + .iter() + .any(|sensitive| { + let sensitive_compact: String = sensitive + .chars() + .filter(|character| character.is_ascii_alphanumeric()) + .collect(); + + lower.contains(sensitive) || compact.contains(&sensitive_compact) + }) +} + +fn encode_path_segment(value: &str) -> String { + byte_serialize(value.as_bytes()).collect() +} + +fn ensure_leading_slash(path: &str) -> String { + if path.starts_with('/') { + path.to_owned() + } else { + format!("/{path}") + } +} diff --git a/crates/webclaw-capture/src/store.rs b/crates/webclaw-capture/src/store.rs new file mode 100644 index 0000000..5b3a826 --- /dev/null +++ b/crates/webclaw-capture/src/store.rs @@ -0,0 +1,221 @@ +use std::env; +use std::fs; +use std::path::{Component, Path, PathBuf}; + +use chrono::{DateTime, Utc}; +use serde_json::{Map, Value, json}; +use url::Url; + +use crate::redact::redact_artifact; +use crate::types::{CaptureArtifact, CaptureError, EndpointDefinition, SavedCapture}; + +const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR"; +const RAW_CAPTURE_FILE: &str = "raw-capture.json"; +const REDACTED_CAPTURE_FILE: &str = "redacted-capture.json"; +const ENDPOINTS_FILE: &str = "endpoints.json"; +const METADATA_FILE: &str = "metadata.json"; + +pub fn capture_root() -> PathBuf { + env::var_os(CAPTURE_DIR_ENV) + .filter(|value| !value.is_empty()) + .map(PathBuf::from) + .unwrap_or_else(|| home_dir().join(".webclaw").join("api-captures")) +} + +pub fn capture_id_for(url: &Url, started_at: DateTime) -> String { + let host = url.host_str().unwrap_or("unknown-host"); + let host = match url.port() { + Some(port) => format!("{host}-{port}"), + None => host.to_owned(), + }; + let timestamp = started_at.format("%Y-%m-%dT%H-%M-%SZ"); + + format!("{}/{timestamp}", sanitize_id_segment(&host)) +} + +pub fn save_capture(artifact: &CaptureArtifact) -> Result { + let root = capture_root(); + let capture_dir = capture_dir_for_id(&root, &artifact.id)?; + + fs::create_dir_all(&capture_dir)?; + + let raw_capture_path = capture_dir.join(RAW_CAPTURE_FILE); + let redacted_capture_path = capture_dir.join(REDACTED_CAPTURE_FILE); + let endpoints_path = capture_dir.join(ENDPOINTS_FILE); + let metadata_path = capture_dir.join(METADATA_FILE); + let redacted_artifact = redact_artifact(artifact); + + write_json(&raw_capture_path, artifact)?; + write_json(&redacted_capture_path, &redacted_artifact)?; + write_json(&endpoints_path, &redacted_artifact.endpoints)?; + write_json(&metadata_path, &metadata_for(&redacted_artifact))?; + + Ok(SavedCapture { + id: artifact.id.clone(), + root, + capture_dir, + raw_capture_path, + redacted_capture_path, + endpoints_path, + metadata_path, + }) +} + +pub fn load_endpoints(capture_id: &str) -> Result, CaptureError> { + let endpoints_path = capture_dir_for_id(&capture_root(), capture_id)?.join(ENDPOINTS_FILE); + let contents = fs::read_to_string(&endpoints_path).map_err(|error| { + CaptureError::Storage(format!( + "could not read endpoints for capture id {capture_id}: {error}" + )) + })?; + + serde_json::from_str(&contents).map_err(CaptureError::from) +} + +pub fn find_endpoint(endpoint_id: &str) -> Result { + let root = capture_root(); + if !root.exists() { + return Err(CaptureError::EndpointNotFound(endpoint_id.to_owned())); + } + + let mut stack = vec![root]; + while let Some(path) = stack.pop() { + let entries = match fs::read_dir(&path) { + Ok(entries) => entries, + Err(_) => continue, + }; + + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + stack.push(path); + continue; + } + + if path.file_name().and_then(|name| name.to_str()) != Some(ENDPOINTS_FILE) { + continue; + } + + let contents = match fs::read_to_string(&path) { + Ok(contents) => contents, + Err(_) => continue, + }; + let endpoints: Vec = match serde_json::from_str(&contents) { + Ok(endpoints) => endpoints, + Err(_) => continue, + }; + + if let Some(endpoint) = endpoints + .into_iter() + .find(|endpoint| endpoint.id == endpoint_id) + { + return Ok(endpoint); + } + } + } + + Err(CaptureError::EndpointNotFound(endpoint_id.to_owned())) +} + +fn home_dir() -> PathBuf { + env::var_os("USERPROFILE") + .map(PathBuf::from) + .or_else(dirs::home_dir) + .unwrap_or_else(|| PathBuf::from(".")) +} + +fn capture_dir_for_id(root: &Path, capture_id: &str) -> Result { + let mut capture_dir = root.to_path_buf(); + let parts = capture_id + .split(['/', '\\']) + .filter(|part| !part.is_empty()) + .collect::>(); + + if parts.is_empty() { + return Err(CaptureError::Storage( + "capture id cannot be empty".to_owned(), + )); + } + + for part in parts { + if !is_safe_path_segment(part) { + return Err(CaptureError::Storage(format!( + "capture id contains unsafe path segment: {capture_id}" + ))); + } + capture_dir.push(part); + } + + ensure_within_root(root, &capture_dir)?; + + Ok(capture_dir) +} + +fn ensure_within_root(root: &Path, path: &Path) -> Result<(), CaptureError> { + if relative_components(path).starts_with(&relative_components(root)) { + Ok(()) + } else { + Err(CaptureError::Storage(format!( + "capture path escapes capture root: {}", + path.display() + ))) + } +} + +fn relative_components(path: &Path) -> Vec { + path.components() + .filter_map(|component| match component { + Component::Prefix(prefix) => Some(prefix.as_os_str().to_string_lossy().to_string()), + Component::RootDir => Some(String::from("\\")), + Component::Normal(value) => Some(value.to_string_lossy().to_string()), + Component::CurDir => None, + Component::ParentDir => Some(String::from("..")), + }) + .collect() +} + +fn is_safe_path_segment(segment: &str) -> bool { + !segment.is_empty() + && segment != "." + && segment != ".." + && !segment.contains(':') + && !segment.contains('/') + && !segment.contains('\\') +} + +fn sanitize_id_segment(segment: &str) -> String { + let sanitized = segment + .chars() + .map(|character| { + if character.is_ascii_alphanumeric() || matches!(character, '.' | '-' | '_') { + character + } else { + '-' + } + }) + .collect::(); + + if sanitized.is_empty() { + "unknown".to_owned() + } else { + sanitized + } +} + +fn write_json(path: &PathBuf, value: &T) -> Result<(), CaptureError> { + let contents = serde_json::to_string_pretty(value)?; + fs::write(path, contents)?; + Ok(()) +} + +fn metadata_for(artifact: &CaptureArtifact) -> Map { + let mut metadata = artifact.metadata.clone(); + metadata.insert("id".to_owned(), json!(artifact.id)); + metadata.insert("source_url".to_owned(), json!(artifact.source_url)); + metadata.insert("intent".to_owned(), json!(artifact.intent)); + metadata.insert("started_at".to_owned(), json!(artifact.started_at)); + metadata.insert("completed_at".to_owned(), json!(artifact.completed_at)); + metadata.insert("exchange_count".to_owned(), json!(artifact.exchanges.len())); + metadata.insert("endpoint_count".to_owned(), json!(artifact.endpoints.len())); + metadata +} diff --git a/crates/webclaw-capture/src/types.rs b/crates/webclaw-capture/src/types.rs new file mode 100644 index 0000000..4a0bf7f --- /dev/null +++ b/crates/webclaw-capture/src/types.rs @@ -0,0 +1,174 @@ +use std::collections::BTreeMap; +use std::path::PathBuf; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; + +pub type HeaderMap = Map; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct CapturedRequest { + pub method: String, + pub url: String, + pub headers: HeaderMap, + pub body_sample: Option, + pub resource_type: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct CapturedResponse { + pub status: u16, + pub headers: HeaderMap, + pub body_sample: Option, + pub mime_type: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct CapturedExchange { + pub method: String, + pub url: String, + pub request_headers: HeaderMap, + pub request_body_sample: Option, + pub resource_type: Option, + pub status: u16, + pub response_headers: HeaderMap, + pub response_body_sample: Option, + pub started_at: DateTime, + pub duration_ms: u64, + pub redirect_chain: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct CaptureArtifact { + pub id: String, + pub source_url: String, + pub intent: Option, + pub started_at: DateTime, + pub completed_at: Option>, + pub exchanges: Vec, + pub endpoints: Vec, + pub metadata: Map, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct EndpointDefinition { + pub id: String, + pub method: String, + pub origin: String, + pub path_template: String, + pub query_params: BTreeMap>, + pub request_schema: Option, + pub response_schema: Option, + pub auth_evidence: Vec, + pub safety: EndpointSafety, + pub examples: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct EndpointExample { + pub url: String, + pub request_headers: HeaderMap, + pub request_body_sample: Option, + pub response_status: u16, + pub response_headers: HeaderMap, + pub response_body_sample: Option, + pub captured_at: DateTime, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct EndpointSafety { + pub safe_to_replay: bool, + pub requires_confirmation: bool, + pub reason: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct ReplayOptions { + pub dry_run: bool, + pub confirm_unsafe: bool, + pub params_json: Option, + pub headers: HeaderMap, + pub body_json: Option, +} + +impl Default for ReplayOptions { + fn default() -> Self { + Self { + dry_run: true, + confirm_unsafe: false, + params_json: None, + headers: HeaderMap::new(), + body_json: None, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum ReplayResult { + Preview { + method: String, + url: String, + headers: HeaderMap, + body_sample: Option, + }, + Executed { + status: u16, + headers: HeaderMap, + body_sample: Option, + }, + Blocked { + reason: String, + }, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct SavedCapture { + pub id: String, + pub root: PathBuf, + pub capture_dir: PathBuf, + pub raw_capture_path: PathBuf, + pub redacted_capture_path: PathBuf, + pub endpoints_path: PathBuf, + pub metadata_path: PathBuf, +} + +#[derive(Debug, thiserror::Error)] +pub enum CaptureError { + #[error("invalid url: {0}")] + InvalidUrl(String), + + #[error("capture failed: {0}")] + Capture(String), + + #[error("storage failed: {0}")] + Storage(String), + + #[error("replay failed: {0}")] + Replay(String), + + #[error("endpoint not found: {0}")] + EndpointNotFound(String), + + #[error("request failed: {0}")] + Request(#[from] reqwest::Error), + + #[error("I/O failed: {0}")] + Io(String), + + #[error("JSON failed: {0}")] + Json(String), +} + +impl From for CaptureError { + fn from(error: std::io::Error) -> Self { + Self::Io(error.to_string()) + } +} + +impl From for CaptureError { + fn from(error: serde_json::Error) -> Self { + Self::Json(error.to_string()) + } +} diff --git a/crates/webclaw-capture/tests/classify.rs b/crates/webclaw-capture/tests/classify.rs new file mode 100644 index 0000000..721a95a --- /dev/null +++ b/crates/webclaw-capture/tests/classify.rs @@ -0,0 +1,216 @@ +use chrono::{TimeZone, Utc}; +use serde_json::{Map, Value, json}; +use webclaw_capture::classify::{classify_exchange, filter_api_exchanges}; +use webclaw_capture::types::CapturedExchange; + +fn headers(entries: &[(&str, &str)]) -> Map { + entries + .iter() + .map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned()))) + .collect() +} + +fn exchange(url: &str) -> CapturedExchange { + CapturedExchange { + method: "GET".to_owned(), + url: url.to_owned(), + request_headers: Map::new(), + request_body_sample: None, + resource_type: Some("document".to_owned()), + status: 200, + response_headers: Map::new(), + response_body_sample: None, + started_at: Utc.with_ymd_and_hms(2026, 5, 16, 12, 0, 0).unwrap(), + duration_ms: 25, + redirect_chain: Vec::new(), + } +} + +fn with_resource_type(mut exchange: CapturedExchange, resource_type: &str) -> CapturedExchange { + exchange.resource_type = Some(resource_type.to_owned()); + exchange +} + +fn with_response_header( + mut exchange: CapturedExchange, + name: &str, + value: &str, +) -> CapturedExchange { + exchange.response_headers = headers(&[(name, value)]); + exchange +} + +fn with_request_body(mut exchange: CapturedExchange, body: serde_json::Value) -> CapturedExchange { + exchange.method = "POST".to_owned(); + exchange.request_headers = headers(&[("Content-Type", "application/json")]); + exchange.request_body_sample = Some(body.to_string()); + exchange +} + +fn assert_included(exchange: &CapturedExchange, label: &str) { + let classification = classify_exchange(exchange); + + assert!( + classification.include, + "{label} should be included, got {classification:?}" + ); + assert!( + classification.confidence >= 0.5, + "{label} should have useful confidence, got {classification:?}" + ); + assert!( + !classification.reasons.is_empty(), + "{label} should explain why it was classified as API traffic" + ); +} + +fn assert_excluded(exchange: &CapturedExchange, label: &str) { + let classification = classify_exchange(exchange); + + assert!( + !classification.include, + "{label} should be excluded, got {classification:?}" + ); + assert!( + classification.confidence <= 0.5, + "{label} should not look like confident API traffic, got {classification:?}" + ); + assert!( + !classification.reasons.is_empty(), + "{label} should explain why it was excluded" + ); +} + +#[test] +fn includes_fetch_and_xhr_resource_types() { + let cases = [ + with_resource_type(exchange("https://example.test/products"), "fetch"), + with_resource_type(exchange("https://example.test/products"), "xhr"), + ]; + + for case in cases { + assert_included( + &case, + case.resource_type + .as_deref() + .expect("resource type should be set"), + ); + } +} + +#[test] +fn includes_json_responses() { + let case = with_response_header( + exchange("https://example.test/products"), + "Content-Type", + "application/json; charset=utf-8", + ); + + assert_included(&case, "JSON response"); +} + +#[test] +fn includes_common_api_path_prefixes() { + let cases = [ + exchange("https://example.test/api/products"), + exchange("https://example.test/v1/products"), + exchange("https://example.test/v2/products"), + ]; + + for case in cases { + assert_included(&case, &case.url); + } +} + +#[test] +fn includes_graphql_paths() { + let case = exchange("https://example.test/graphql"); + + assert_included(&case, "GraphQL path"); +} + +#[test] +fn includes_graphql_request_bodies() { + let case = with_request_body( + exchange("https://example.test/query"), + json!({ + "operationName": "Products", + "query": "query Products { products { id name } }", + "variables": { + "first": 25 + } + }), + ); + + assert_included(&case, "GraphQL request body"); +} + +#[test] +fn excludes_static_assets_by_extension() { + let cases = [ + exchange("https://example.test/static/logo.png"), + exchange("https://example.test/static/photo.jpg"), + exchange("https://example.test/static/icon.svg"), + exchange("https://example.test/static/site.css"), + exchange("https://example.test/static/app.js"), + exchange("https://example.test/static/font.woff2"), + exchange("https://example.test/static/app.js.map"), + ]; + + for case in cases { + assert_excluded(&case, &case.url); + } +} + +#[test] +fn excludes_tracking_hosts() { + let cases = [ + with_response_header( + exchange("https://www.google-analytics.com/g/collect?v=2"), + "Content-Type", + "application/json", + ), + with_response_header( + exchange("https://ads.doubleclick.net/pagead/id"), + "Content-Type", + "application/json", + ), + with_response_header( + exchange("https://telemetry.example.test/v1/events"), + "Content-Type", + "application/json", + ), + ]; + + for case in cases { + assert_excluded(&case, &case.url); + } +} + +#[test] +fn excludes_browser_extension_urls() { + let cases = [ + with_resource_type(exchange("chrome-extension://abcdef/options.html"), "fetch"), + with_resource_type(exchange("moz-extension://abcdef/options.html"), "xhr"), + ]; + + for case in cases { + assert_excluded(&case, &case.url); + } +} + +#[test] +fn filter_api_exchanges_returns_only_included_traffic() { + let api = exchange("https://example.test/api/products"); + let asset = exchange("https://example.test/static/app.js"); + let tracking = with_response_header( + exchange("https://telemetry.example.test/v1/events"), + "Content-Type", + "application/json", + ); + let exchanges = vec![api.clone(), asset, tracking]; + + let filtered = filter_api_exchanges(&exchanges); + + assert_eq!(filtered, vec![api]); +} diff --git a/crates/webclaw-capture/tests/fixtures/sample.har.json b/crates/webclaw-capture/tests/fixtures/sample.har.json new file mode 100644 index 0000000..42a221a --- /dev/null +++ b/crates/webclaw-capture/tests/fixtures/sample.har.json @@ -0,0 +1,139 @@ +{ + "log": { + "version": "1.2", + "creator": { + "name": "webclaw-capture-test", + "version": "0.1.0" + }, + "entries": [ + { + "startedDateTime": "2026-05-16T12:00:00Z", + "time": 42, + "_resourceType": "fetch", + "request": { + "method": "GET", + "url": "https://example.test/api/products?category=tools&page=2", + "headers": [ + { + "name": "Accept", + "value": "application/json" + }, + { + "name": "Authorization", + "value": "Bearer example-token" + } + ] + }, + "response": { + "status": 200, + "headers": [ + { + "name": "Content-Type", + "value": "application/json; charset=utf-8" + } + ], + "content": { + "mimeType": "application/json", + "text": "{\"items\":[{\"id\":12345,\"name\":\"Hammer\",\"price\":12.5,\"inStock\":true}],\"page\":2,\"hasMore\":false}" + } + } + }, + { + "startedDateTime": "2026-05-16T12:00:01Z", + "time": 31, + "_resourceType": "xhr", + "request": { + "method": "GET", + "url": "https://example.test/api/products/12345", + "headers": [ + { + "name": "Accept", + "value": "application/json" + }, + { + "name": "Cookie", + "value": "session_id=example-session" + } + ] + }, + "response": { + "status": 200, + "headers": [ + { + "name": "Content-Type", + "value": "application/json" + } + ], + "content": { + "mimeType": "application/json", + "text": "{\"id\":12345,\"name\":\"Hammer\",\"category\":\"tools\",\"tags\":[\"hand-tool\",\"steel\"]}" + } + } + }, + { + "startedDateTime": "2026-05-16T12:00:02Z", + "time": 57, + "_resourceType": "fetch", + "request": { + "method": "POST", + "url": "https://example.test/graphql", + "headers": [ + { + "name": "Content-Type", + "value": "application/json" + }, + { + "name": "X-CSRF-Token", + "value": "example-csrf" + } + ], + "postData": { + "mimeType": "application/json", + "text": "{\"operationName\":\"CreateProduct\",\"query\":\"mutation CreateProduct($name: String!) { createProduct(input: { name: $name }) { id name } }\",\"variables\":{\"name\":\"Hammer\"}}" + } + }, + "response": { + "status": 200, + "headers": [ + { + "name": "Content-Type", + "value": "application/json" + } + ], + "content": { + "mimeType": "application/json", + "text": "{\"data\":{\"createProduct\":{\"id\":\"gid://example/Product/12345\",\"name\":\"Hammer\"}}}" + } + } + }, + { + "startedDateTime": "2026-05-16T12:00:03Z", + "time": 8, + "_resourceType": "script", + "request": { + "method": "GET", + "url": "https://example.test/static/app.js", + "headers": [ + { + "name": "Accept", + "value": "application/javascript" + } + ] + }, + "response": { + "status": 200, + "headers": [ + { + "name": "Content-Type", + "value": "application/javascript" + } + ], + "content": { + "mimeType": "application/javascript", + "text": "fetch('/api/products?category=tools')" + } + } + } + ] + } +} diff --git a/crates/webclaw-capture/tests/infer.rs b/crates/webclaw-capture/tests/infer.rs new file mode 100644 index 0000000..bdc785d --- /dev/null +++ b/crates/webclaw-capture/tests/infer.rs @@ -0,0 +1,261 @@ +use chrono::{DateTime, Utc}; +use serde_json::{Map, Value, json}; +use webclaw_capture::infer::{ + endpoint_id, infer_endpoints, infer_json_schema, normalize_path_template, +}; +use webclaw_capture::types::{CapturedExchange, EndpointDefinition}; + +fn fixture_exchanges() -> Vec { + let har: Value = + serde_json::from_str(include_str!("fixtures/sample.har.json")).expect("valid HAR fixture"); + let entries = har + .pointer("/log/entries") + .and_then(Value::as_array) + .expect("HAR fixture entries"); + + entries.iter().map(har_entry_to_exchange).collect() +} + +fn har_entry_to_exchange(entry: &Value) -> CapturedExchange { + let request = entry.get("request").expect("request"); + let response = entry.get("response").expect("response"); + + CapturedExchange { + method: string_at(request, "method"), + url: string_at(request, "url"), + request_headers: har_headers(request), + request_body_sample: request + .pointer("/postData/text") + .and_then(Value::as_str) + .map(str::to_owned), + resource_type: entry + .get("_resourceType") + .and_then(Value::as_str) + .map(str::to_owned), + status: response + .get("status") + .and_then(Value::as_u64) + .expect("response status") as u16, + response_headers: har_headers(response), + response_body_sample: response + .pointer("/content/text") + .and_then(Value::as_str) + .map(str::to_owned), + started_at: DateTime::parse_from_rfc3339(&string_at(entry, "startedDateTime")) + .expect("RFC3339 startedDateTime") + .with_timezone(&Utc), + duration_ms: entry.get("time").and_then(Value::as_u64).expect("duration"), + redirect_chain: Vec::new(), + } +} + +fn har_headers(container: &Value) -> Map { + container + .get("headers") + .and_then(Value::as_array) + .expect("headers") + .iter() + .map(|header| { + ( + string_at(header, "name"), + Value::String(string_at(header, "value")), + ) + }) + .collect() +} + +fn string_at(value: &Value, key: &str) -> String { + value + .get(key) + .and_then(Value::as_str) + .unwrap_or_else(|| panic!("{key} should be a string")) + .to_owned() +} + +fn find_endpoint<'a>( + endpoints: &'a [EndpointDefinition], + method: &str, + path_template: &str, +) -> &'a EndpointDefinition { + endpoints + .iter() + .find(|endpoint| endpoint.method == method && endpoint.path_template == path_template) + .unwrap_or_else(|| panic!("missing endpoint {method} {path_template}; got {endpoints:#?}")) +} + +fn sorted_ids(endpoints: &[EndpointDefinition]) -> Vec { + let mut ids = endpoints + .iter() + .map(|endpoint| endpoint.id.clone()) + .collect::>(); + ids.sort(); + ids +} + +#[test] +fn infers_stable_endpoint_ids_and_path_templates_from_har_fixture() { + let exchanges = fixture_exchanges(); + + let endpoints = infer_endpoints(&exchanges); + let repeated = infer_endpoints(&exchanges); + + assert_eq!(endpoints.len(), 3, "static assets should be ignored"); + assert_eq!( + sorted_ids(&endpoints), + sorted_ids(&repeated), + "endpoint ids should be deterministic across inference runs" + ); + + let products = find_endpoint(&endpoints, "GET", "/api/products"); + assert_eq!( + products.id, + endpoint_id("GET", "https://example.test", "/api/products") + ); + + let product_detail = find_endpoint(&endpoints, "GET", "/api/products/{id}"); + assert_eq!( + product_detail.id, + endpoint_id("GET", "https://example.test", "/api/products/{id}") + ); + + let graphql = find_endpoint(&endpoints, "POST", "/graphql"); + assert_eq!( + graphql.id, + endpoint_id("POST", "https://example.test", "/graphql") + ); +} + +#[test] +fn infers_query_examples_schemas_auth_evidence_and_mutation_safety() { + let endpoints = infer_endpoints(&fixture_exchanges()); + + let products = find_endpoint(&endpoints, "GET", "/api/products"); + assert_eq!( + products.query_params.get("category"), + Some(&vec!["tools".to_owned()]) + ); + assert_eq!( + products.query_params.get("page"), + Some(&vec!["2".to_owned()]) + ); + assert!( + products + .auth_evidence + .iter() + .any(|evidence| evidence.to_ascii_lowercase().contains("authorization")), + "Authorization header should be recorded as auth evidence" + ); + assert!(products.safety.safe_to_replay); + assert!(!products.safety.requires_confirmation); + + let products_schema = products.response_schema.as_ref().expect("response schema"); + assert_eq!( + products_schema.pointer("/properties/items/type"), + Some(&json!("array")) + ); + assert_eq!( + products_schema.pointer("/properties/items/items/properties/id/type"), + Some(&json!("integer")) + ); + assert_eq!( + products_schema.pointer("/properties/hasMore/type"), + Some(&json!("boolean")) + ); + + let graphql = find_endpoint(&endpoints, "POST", "/graphql"); + assert!(!graphql.safety.safe_to_replay); + assert!(graphql.safety.requires_confirmation); + assert!( + graphql + .auth_evidence + .iter() + .any(|evidence| evidence.to_ascii_lowercase().contains("csrf")), + "CSRF header should be recorded as auth evidence" + ); + + let request_schema = graphql.request_schema.as_ref().expect("request schema"); + assert_eq!( + request_schema.pointer("/properties/query/type"), + Some(&json!("string")) + ); + assert_eq!( + request_schema.pointer("/properties/variables/properties/name/type"), + Some(&json!("string")) + ); + + let response_schema = graphql.response_schema.as_ref().expect("response schema"); + assert_eq!( + response_schema.pointer("/properties/data/properties/createProduct/properties/id/type"), + Some(&json!("string")) + ); +} + +#[test] +fn ignores_static_asset_entries_from_the_fixture() { + let endpoints = infer_endpoints(&fixture_exchanges()); + + assert!( + endpoints + .iter() + .all(|endpoint| !endpoint.path_template.contains("/static/")), + "static asset requests should not become learned endpoints: {endpoints:#?}" + ); +} + +#[test] +fn normalizes_numeric_uuid_and_high_entropy_path_segments() { + assert_eq!( + normalize_path_template("/api/products/12345"), + "/api/products/{id}" + ); + assert_eq!( + normalize_path_template("/api/users/550e8400-e29b-41d4-a716-446655440000"), + "/api/users/{id}" + ); + assert_eq!( + normalize_path_template("/api/sessions/a1b2c3d4e5f6a7b8"), + "/api/sessions/{id}" + ); + assert_eq!( + normalize_path_template("/api/categories/tools"), + "/api/categories/tools" + ); +} + +#[test] +fn infers_basic_json_schema_shapes() { + let schema = infer_json_schema(&json!({ + "id": 12345, + "name": "Hammer", + "price": 12.5, + "inStock": true, + "tags": ["hand-tool"], + "metadata": null + })); + + assert_eq!(schema.pointer("/type"), Some(&json!("object"))); + assert_eq!( + schema.pointer("/properties/id/type"), + Some(&json!("integer")) + ); + assert_eq!( + schema.pointer("/properties/price/type"), + Some(&json!("number")) + ); + assert_eq!( + schema.pointer("/properties/inStock/type"), + Some(&json!("boolean")) + ); + assert_eq!( + schema.pointer("/properties/tags/type"), + Some(&json!("array")) + ); + assert_eq!( + schema.pointer("/properties/tags/items/type"), + Some(&json!("string")) + ); + assert_eq!( + schema.pointer("/properties/metadata/type"), + Some(&json!("null")) + ); +} diff --git a/crates/webclaw-capture/tests/integration_capture.rs b/crates/webclaw-capture/tests/integration_capture.rs new file mode 100644 index 0000000..67e0c2d --- /dev/null +++ b/crates/webclaw-capture/tests/integration_capture.rs @@ -0,0 +1,245 @@ +use std::env; +use std::ffi::OsString; +use std::fs; +use std::path::{Path, PathBuf}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::net::{TcpListener, TcpStream}; +use tokio::sync::oneshot; +use webclaw_capture::cdp::{CaptureOptions, capture_network}; +use webclaw_capture::types::{CaptureArtifact, EndpointDefinition}; + +const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR"; + +struct CaptureDirGuard { + original: Option, +} + +impl CaptureDirGuard { + fn set(path: &Path) -> Self { + let original = env::var_os(CAPTURE_DIR_ENV); + + unsafe { + env::set_var(CAPTURE_DIR_ENV, path); + } + + Self { original } + } +} + +impl Drop for CaptureDirGuard { + fn drop(&mut self) { + unsafe { + match &self.original { + Some(value) => env::set_var(CAPTURE_DIR_ENV, value), + None => env::remove_var(CAPTURE_DIR_ENV), + } + } + } +} + +struct LocalServer { + base_url: String, + shutdown: Option>, +} + +impl LocalServer { + async fn start() -> Self { + let listener = TcpListener::bind("127.0.0.1:0") + .await + .expect("bind local test server"); + let address = listener.local_addr().expect("local test server address"); + let (shutdown, mut shutdown_rx) = oneshot::channel::<()>(); + + tokio::spawn(async move { + loop { + tokio::select! { + _ = &mut shutdown_rx => break, + accepted = listener.accept() => { + let Ok((stream, _peer)) = accepted else { + continue; + }; + + tokio::spawn(handle_connection(stream)); + } + } + } + }); + + Self { + base_url: format!("http://{address}"), + shutdown: Some(shutdown), + } + } + + fn url(&self, path: &str) -> String { + format!("{}{}", self.base_url, path) + } +} + +impl Drop for LocalServer { + fn drop(&mut self) { + if let Some(shutdown) = self.shutdown.take() { + let _ = shutdown.send(()); + } + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn capture_network_records_fetches_redacts_secrets_and_learns_api_endpoints() { + let capture_root = unique_temp_root("integration-capture"); + let _capture_dir = CaptureDirGuard::set(&capture_root); + let server = LocalServer::start().await; + + let saved = capture_network(CaptureOptions { + url: server.url("/"), + intent: Some("discover product listing API".to_owned()), + wait_ms: 1_500, + headed: false, + }) + .await + .expect("capture network traffic"); + + let raw_capture: CaptureArtifact = read_json(&saved.raw_capture_path); + assert!( + raw_capture + .exchanges + .iter() + .any(|exchange| exchange.url.contains("/api/products?category=tools")), + "raw capture should include the fetch to /api/products" + ); + + let redacted_capture_text = + fs::read_to_string(&saved.redacted_capture_path).expect("read redacted capture"); + for secret in [ + "browser-authorization-secret", + "browser-api-key-secret", + "browser-csrf-secret", + "page-session-secret", + "api-session-secret", + ] { + assert!( + !redacted_capture_text.contains(secret), + "redacted capture should not contain raw secret value {secret}" + ); + } + + let endpoints: Vec = read_json(&saved.endpoints_path); + let api_endpoints = endpoints + .iter() + .filter(|endpoint| endpoint.method == "GET" && endpoint.path_template == "/api/products") + .collect::>(); + + assert_eq!( + api_endpoints.len(), + 1, + "inferred endpoints should contain one GET /api/products endpoint" + ); + assert!( + endpoints + .iter() + .all(|endpoint| endpoint.path_template != "/static/app.js"), + "static assets should not be included as learned endpoints" + ); + + let _ = fs::remove_dir_all(capture_root); +} + +async fn handle_connection(mut stream: TcpStream) { + let mut buffer = vec![0_u8; 8192]; + let Ok(bytes_read) = stream.read(&mut buffer).await else { + return; + }; + if bytes_read == 0 { + return; + } + + let request = String::from_utf8_lossy(&buffer[..bytes_read]); + let path = request + .lines() + .next() + .and_then(|line| line.split_whitespace().nth(1)) + .unwrap_or("/"); + + let response = match path.split('?').next().unwrap_or(path) { + "/" => http_response( + "200 OK", + &[ + ("Content-Type", "text/html; charset=utf-8"), + ("Set-Cookie", "session=page-session-secret; HttpOnly"), + ], + r#" + + Webclaw capture test + + + +"#, + ), + "/static/app.js" => http_response( + "200 OK", + &[("Content-Type", "application/javascript; charset=utf-8")], + r#"fetch('/api/products?category=tools', { + headers: { + 'Authorization': 'Bearer browser-authorization-secret', + 'X-Api-Key': 'browser-api-key-secret', + 'X-CSRF-Token': 'browser-csrf-secret' + } +}).then(response => response.json()).then(products => { + window.__webclawProducts = products; +});"#, + ), + "/api/products" => http_response( + "200 OK", + &[ + ("Content-Type", "application/json"), + ("Set-Cookie", "session=api-session-secret; HttpOnly"), + ], + r#"{"items":[{"id":12345,"name":"Hammer","category":"tools"}]}"#, + ), + _ => http_response( + "404 Not Found", + &[("Content-Type", "text/plain; charset=utf-8")], + "not found", + ), + }; + + let _ = stream.write_all(response.as_bytes()).await; + let _ = stream.shutdown().await; +} + +fn http_response(status: &str, headers: &[(&str, &str)], body: &str) -> String { + let mut response = format!( + "HTTP/1.1 {status}\r\nContent-Length: {}\r\nConnection: close\r\nCache-Control: no-store\r\n", + body.len() + ); + + for (name, value) in headers { + response.push_str(name); + response.push_str(": "); + response.push_str(value); + response.push_str("\r\n"); + } + + response.push_str("\r\n"); + response.push_str(body); + response +} + +fn unique_temp_root(test_name: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system time after unix epoch") + .as_nanos(); + + env::temp_dir().join(format!( + "webclaw-capture-{test_name}-{}-{nanos}", + std::process::id() + )) +} + +fn read_json(path: &Path) -> T { + let contents = fs::read_to_string(path).expect("read JSON file"); + serde_json::from_str(&contents).expect("valid JSON file") +} diff --git a/crates/webclaw-capture/tests/openapi.rs b/crates/webclaw-capture/tests/openapi.rs new file mode 100644 index 0000000..b7890c3 --- /dev/null +++ b/crates/webclaw-capture/tests/openapi.rs @@ -0,0 +1,358 @@ +use std::collections::BTreeMap; +use std::env; +use std::ffi::OsString; +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::Mutex; +use std::time::{SystemTime, UNIX_EPOCH}; + +use chrono::{DateTime, Utc}; +use serde_json::{Map, Value, json}; +use webclaw_capture::openapi::{export_openapi, write_openapi}; +use webclaw_capture::store::save_capture; +use webclaw_capture::types::{ + CaptureArtifact, EndpointDefinition, EndpointExample, EndpointSafety, +}; + +static ENV_LOCK: Mutex<()> = Mutex::new(()); +const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR"; + +struct EnvVarGuard { + original: Option, +} + +impl EnvVarGuard { + fn set_capture_dir(value: Option<&Path>) -> Self { + let original = env::var_os(CAPTURE_DIR_ENV); + + unsafe { + match value { + Some(path) => env::set_var(CAPTURE_DIR_ENV, path), + None => env::remove_var(CAPTURE_DIR_ENV), + } + } + + Self { original } + } +} + +impl Drop for EnvVarGuard { + fn drop(&mut self) { + unsafe { + match &self.original { + Some(value) => env::set_var(CAPTURE_DIR_ENV, value), + None => env::remove_var(CAPTURE_DIR_ENV), + } + } + } +} + +fn with_capture_dir(value: Option<&Path>, test: impl FnOnce() -> T) -> T { + let _lock = ENV_LOCK.lock().expect("capture env lock"); + let _guard = EnvVarGuard::set_capture_dir(value); + + test() +} + +#[test] +fn exports_openapi_31_and_an_operation_for_every_endpoint() { + let doc = export_openapi(&sample_endpoints()); + + assert_eq!(doc.get("openapi").and_then(Value::as_str), Some("3.1.0")); + + let paths = doc + .get("paths") + .and_then(Value::as_object) + .expect("OpenAPI document should contain paths"); + + assert!( + operation(&doc, "/api/products", "get").is_some(), + "GET product endpoint should become an OpenAPI operation" + ); + assert!( + operation(&doc, "/graphql", "post").is_some(), + "POST GraphQL endpoint should become an OpenAPI operation" + ); + assert_eq!( + operation_count(paths), + 2, + "every learned endpoint should become exactly one operation" + ); +} + +#[test] +fn unsafe_operations_require_confirmation_extension() { + let doc = export_openapi(&sample_endpoints()); + + let get_operation = + operation(&doc, "/api/products", "get").expect("GET product endpoint should be exported"); + let post_operation = + operation(&doc, "/graphql", "post").expect("POST GraphQL endpoint should be exported"); + + assert_ne!( + get_operation.get("x-webclaw-requires-confirmation"), + Some(&json!(true)), + "safe GET operations should not require unsafe replay confirmation" + ); + assert_eq!( + post_operation.get("x-webclaw-requires-confirmation"), + Some(&json!(true)), + "unsafe POST operations should require explicit replay confirmation" + ); +} + +#[test] +fn generated_examples_do_not_leak_secret_values() { + let doc = export_openapi(&sample_endpoints()); + + assert!( + contains_example_node(&doc), + "OpenAPI export should include examples derived from captured endpoint examples" + ); + + let doc_text = serde_json::to_string(&doc).expect("serialize OpenAPI document"); + for forbidden in [ + "Bearer raw-secret", + "raw-api-key", + "raw-csrf-token", + "raw-session-id", + "raw-password", + "user@example.test", + ] { + assert!( + !doc_text.contains(forbidden), + "OpenAPI examples should not leak secret value {forbidden:?}" + ); + } + assert!( + doc_text.contains("[REDACTED]"), + "OpenAPI examples should preserve redaction markers instead of raw secrets" + ); +} + +#[test] +fn write_openapi_writes_openapi_json_next_to_saved_endpoints() { + let root = unique_temp_root("write"); + + with_capture_dir(Some(&root), || { + let artifact = sample_artifact(); + save_capture(&artifact).expect("save capture before OpenAPI export"); + + let openapi_path = write_openapi(&artifact.id).expect("write OpenAPI document"); + + assert_eq!( + openapi_path, + root.join("example.test") + .join("2026-05-16T12-00-00Z") + .join("openapi.json") + ); + assert!(openapi_path.is_file()); + + let doc: Value = read_json(&openapi_path); + assert_eq!(doc.get("openapi").and_then(Value::as_str), Some("3.1.0")); + assert!( + operation(&doc, "/api/products", "get").is_some(), + "written OpenAPI document should contain saved capture endpoints" + ); + }); + + let _ = fs::remove_dir_all(root); +} + +fn sample_artifact() -> CaptureArtifact { + CaptureArtifact { + id: "example.test/2026-05-16T12-00-00Z".to_owned(), + source_url: "https://example.test/products?email=user@example.test".to_owned(), + intent: Some("discover product listing API".to_owned()), + started_at: test_time(), + completed_at: Some(test_time()), + exchanges: Vec::new(), + endpoints: sample_endpoints(), + metadata: Map::new(), + } +} + +fn sample_endpoints() -> Vec { + vec![product_endpoint(), graphql_endpoint()] +} + +fn product_endpoint() -> EndpointDefinition { + let mut query_params = BTreeMap::new(); + query_params.insert("category".to_owned(), vec!["tools".to_owned()]); + query_params.insert("page".to_owned(), vec!["2".to_owned()]); + + EndpointDefinition { + id: "GET https://example.test/api/products".to_owned(), + method: "GET".to_owned(), + origin: "https://example.test".to_owned(), + path_template: "/api/products".to_owned(), + query_params, + request_schema: None, + response_schema: Some(json!({ + "type": "object", + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { "type": "integer" }, + "name": { "type": "string" } + } + } + } + } + })), + auth_evidence: vec!["Authorization header observed".to_owned()], + safety: EndpointSafety { + safe_to_replay: true, + requires_confirmation: false, + reason: "GET is a read-oriented HTTP method".to_owned(), + }, + examples: vec![EndpointExample { + url: "https://example.test/api/products?category=tools&page=2&api_key=raw-api-key" + .to_owned(), + request_headers: headers(&[ + ("Authorization", "Bearer raw-secret"), + ("Accept", "application/json"), + ("X-Api-Key", "raw-api-key"), + ]), + request_body_sample: None, + response_status: 200, + response_headers: headers(&[ + ("Content-Type", "application/json"), + ("Set-Cookie", "session=raw-session-id"), + ]), + response_body_sample: Some( + r#"{"items":[{"id":12345,"name":"Hammer","email":"user@example.test"}]}"# + .to_owned(), + ), + captured_at: test_time(), + }], + } +} + +fn graphql_endpoint() -> EndpointDefinition { + EndpointDefinition { + id: "POST https://example.test/graphql".to_owned(), + method: "POST".to_owned(), + origin: "https://example.test".to_owned(), + path_template: "/graphql".to_owned(), + query_params: BTreeMap::new(), + request_schema: Some(json!({ + "type": "object", + "properties": { + "query": { "type": "string" }, + "variables": { "type": "object" } + } + })), + response_schema: Some(json!({ + "type": "object", + "properties": { + "data": { "type": "object" } + } + })), + auth_evidence: vec!["X-CSRF-Token header observed".to_owned()], + safety: EndpointSafety { + safe_to_replay: false, + requires_confirmation: true, + reason: "POST may mutate server state and requires confirmation".to_owned(), + }, + examples: vec![EndpointExample { + url: concat!( + "https://example.test/graphql?", + "ref=user%40example.test&", + "debug=Bearer%20raw-secret&", + "trace=raw-session-id" + ) + .to_owned(), + request_headers: headers(&[ + ("Content-Type", "application/json"), + ("X-CSRF-Token", "raw-csrf-token"), + ]), + request_body_sample: Some( + json!({ + "query": "mutation CreateProduct($name: String!) { createProduct(name: $name) { id } }", + "variables": { + "name": "Hammer", + "password": "raw-password" + } + }) + .to_string(), + ), + response_status: 200, + response_headers: headers(&[("Content-Type", "application/json")]), + response_body_sample: Some(r#"{"data":{"createProduct":{"id":"12345"}}}"#.to_owned()), + captured_at: test_time(), + }], + } +} + +fn headers(entries: &[(&str, &str)]) -> Map { + entries + .iter() + .map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned()))) + .collect() +} + +fn operation<'a>(doc: &'a Value, path: &str, method: &str) -> Option<&'a Map> { + doc.get("paths") + .and_then(Value::as_object) + .and_then(|paths| paths.get(path)) + .and_then(Value::as_object) + .and_then(|path_item| path_item.get(method)) + .and_then(Value::as_object) +} + +fn operation_count(paths: &Map) -> usize { + const HTTP_METHODS: &[&str] = &[ + "get", "put", "post", "delete", "options", "head", "patch", "trace", + ]; + + paths + .values() + .filter_map(Value::as_object) + .map(|path_item| { + HTTP_METHODS + .iter() + .filter(|method| path_item.contains_key(**method)) + .count() + }) + .sum() +} + +fn contains_example_node(value: &Value) -> bool { + match value { + Value::Object(object) => { + object + .keys() + .any(|key| matches!(key.as_str(), "example" | "examples" | "x-webclaw-examples")) + || object.values().any(contains_example_node) + } + Value::Array(items) => items.iter().any(contains_example_node), + _ => false, + } +} + +fn unique_temp_root(test_name: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system time after unix epoch") + .as_nanos(); + + env::temp_dir().join(format!( + "webclaw-capture-openapi-{test_name}-{}-{nanos}", + std::process::id() + )) +} + +fn read_json(path: &Path) -> T { + let contents = fs::read_to_string(path).expect("read JSON file"); + serde_json::from_str(&contents).expect("valid JSON file") +} + +fn test_time() -> DateTime { + DateTime::parse_from_rfc3339("2026-05-16T12:00:00Z") + .expect("valid test timestamp") + .with_timezone(&Utc) +} diff --git a/crates/webclaw-capture/tests/redact.rs b/crates/webclaw-capture/tests/redact.rs new file mode 100644 index 0000000..9e9d697 --- /dev/null +++ b/crates/webclaw-capture/tests/redact.rs @@ -0,0 +1,209 @@ +use chrono::{TimeZone, Utc}; +use serde_json::{Map, Value, json}; +use url::Url; +use webclaw_capture::redact::{redact_artifact, redact_headers, redact_json, redact_url}; +use webclaw_capture::types::{CaptureArtifact, CapturedExchange}; + +const REDACTED: &str = "[REDACTED]"; + +fn header_map(entries: &[(&str, &str)]) -> Map { + entries + .iter() + .map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned()))) + .collect() +} + +fn query_value(url: &str, name: &str) -> Option { + Url::parse(url) + .unwrap() + .query_pairs() + .find(|(key, _)| key == name) + .map(|(_, value)| value.into_owned()) +} + +#[test] +fn redacts_sensitive_header_and_cookie_values_by_name() { + let headers = header_map(&[ + ("Authorization", "Bearer secret-token"), + ("Cookie", "session=secret-session; theme=dark"), + ("Set-Cookie", "account=secret-cookie; HttpOnly"), + ("X-Api-Key", "secret-api-key"), + ("X-CSRF-Token", "secret-csrf-token"), + ("X-Session-Id", "secret-session-id"), + ("X-Password-Hash", "secret-password"), + ("X-User-Email", "person@example.test"), + ("Content-Type", "application/json"), + ]); + + let redacted = redact_headers(&headers); + + assert_eq!(redacted["Authorization"], REDACTED); + assert_eq!(redacted["Cookie"], REDACTED); + assert_eq!(redacted["Set-Cookie"], REDACTED); + assert_eq!(redacted["X-Api-Key"], REDACTED); + assert_eq!(redacted["X-CSRF-Token"], REDACTED); + assert_eq!(redacted["X-Session-Id"], REDACTED); + assert_eq!(redacted["X-Password-Hash"], REDACTED); + assert_eq!(redacted["X-User-Email"], REDACTED); + assert_eq!(redacted["Content-Type"], "application/json"); +} + +#[test] +fn redacts_sensitive_query_parameter_values_by_name() { + let url = concat!( + "https://example.test/api/products?", + "authorization=Bearer%20secret-token&", + "api-key=secret-api-key&", + "csrf=secret-csrf&", + "access_token=secret-access-token&", + "session_id=secret-session&", + "password=secret-password&", + "email=person%40example.test&", + "cookie=secret-cookie&", + "page=2" + ); + + let redacted = redact_url(url); + + assert_eq!( + query_value(&redacted, "authorization").as_deref(), + Some(REDACTED) + ); + assert_eq!(query_value(&redacted, "api-key").as_deref(), Some(REDACTED)); + assert_eq!(query_value(&redacted, "csrf").as_deref(), Some(REDACTED)); + assert_eq!( + query_value(&redacted, "access_token").as_deref(), + Some(REDACTED) + ); + assert_eq!( + query_value(&redacted, "session_id").as_deref(), + Some(REDACTED) + ); + assert_eq!( + query_value(&redacted, "password").as_deref(), + Some(REDACTED) + ); + assert_eq!(query_value(&redacted, "email").as_deref(), Some(REDACTED)); + assert_eq!(query_value(&redacted, "cookie").as_deref(), Some(REDACTED)); + assert_eq!(query_value(&redacted, "page").as_deref(), Some("2")); + assert!(!redacted.contains("secret")); + assert!(!redacted.contains("person%40example.test")); +} + +#[test] +fn redacts_sensitive_json_body_keys_recursively() { + let body = json!({ + "authorization": "Bearer secret-token", + "cookie": "session=secret-session", + "set-cookie": "session=secret-session", + "api-key": "secret-api-key", + "csrf": "secret-csrf", + "access_token": "secret-access-token", + "session_id": "secret-session", + "password": "secret-password", + "email": "person@example.test", + "profile": { + "backupEmail": "backup@example.test", + "display_name": "Visible Name" + }, + "items": [ + { + "sessionToken": "nested-secret-session-token", + "quantity": 3 + } + ] + }); + + let redacted = redact_json(&body); + + assert_eq!(redacted["authorization"], REDACTED); + assert_eq!(redacted["cookie"], REDACTED); + assert_eq!(redacted["set-cookie"], REDACTED); + assert_eq!(redacted["api-key"], REDACTED); + assert_eq!(redacted["csrf"], REDACTED); + assert_eq!(redacted["access_token"], REDACTED); + assert_eq!(redacted["session_id"], REDACTED); + assert_eq!(redacted["password"], REDACTED); + assert_eq!(redacted["email"], REDACTED); + assert_eq!(redacted["profile"]["backupEmail"], REDACTED); + assert_eq!(redacted["profile"]["display_name"], "Visible Name"); + assert_eq!(redacted["items"][0]["sessionToken"], REDACTED); + assert_eq!(redacted["items"][0]["quantity"], 3); +} + +#[test] +fn redacts_capture_artifact_headers_urls_and_json_body_samples() { + let captured_at = Utc.with_ymd_and_hms(2026, 5, 16, 12, 0, 0).unwrap(); + let artifact = CaptureArtifact { + id: "example.test/2026-05-16T12-00-00Z".to_owned(), + source_url: "https://example.test/app?email=person@example.test".to_owned(), + intent: Some("discover public API".to_owned()), + started_at: captured_at, + completed_at: Some(captured_at), + exchanges: vec![CapturedExchange { + method: "POST".to_owned(), + url: "https://example.test/api/session?token=secret-token&page=2".to_owned(), + request_headers: header_map(&[ + ("Authorization", "Bearer secret-token"), + ("Content-Type", "application/json"), + ]), + request_body_sample: Some( + json!({ + "email": "person@example.test", + "password": "secret-password", + "name": "Visible Name" + }) + .to_string(), + ), + resource_type: Some("fetch".to_owned()), + status: 200, + response_headers: header_map(&[ + ("Set-Cookie", "session=secret-session; HttpOnly"), + ("Content-Type", "application/json"), + ]), + response_body_sample: Some( + json!({ + "sessionToken": "secret-session-token", + "status": "ok" + }) + .to_string(), + ), + started_at: captured_at, + duration_ms: 25, + redirect_chain: vec!["https://example.test/login?csrf=secret-csrf".to_owned()], + }], + endpoints: Vec::new(), + metadata: Map::new(), + }; + + let redacted = redact_artifact(&artifact); + let exchange = &redacted.exchanges[0]; + + assert_eq!( + query_value(&redacted.source_url, "email").as_deref(), + Some(REDACTED) + ); + assert_eq!( + query_value(&exchange.url, "token").as_deref(), + Some(REDACTED) + ); + assert_eq!(query_value(&exchange.url, "page").as_deref(), Some("2")); + assert_eq!(exchange.request_headers["Authorization"], REDACTED); + assert_eq!(exchange.request_headers["Content-Type"], "application/json"); + assert_eq!(exchange.response_headers["Set-Cookie"], REDACTED); + assert_eq!( + query_value(&exchange.redirect_chain[0], "csrf").as_deref(), + Some(REDACTED) + ); + + let request_body = exchange.request_body_sample.as_deref().unwrap(); + assert!(request_body.contains(REDACTED)); + assert!(request_body.contains("Visible Name")); + assert!(!request_body.contains("person@example.test")); + assert!(!request_body.contains("secret-password")); + + let response_body = exchange.response_body_sample.as_deref().unwrap(); + assert!(response_body.contains(REDACTED)); + assert!(response_body.contains("ok")); + assert!(!response_body.contains("secret-session-token")); +} diff --git a/crates/webclaw-capture/tests/replay.rs b/crates/webclaw-capture/tests/replay.rs new file mode 100644 index 0000000..7938f81 --- /dev/null +++ b/crates/webclaw-capture/tests/replay.rs @@ -0,0 +1,414 @@ +use std::collections::BTreeMap; +use std::time::Duration; + +use chrono::{DateTime, Utc}; +use serde_json::{Map, Value, json}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::net::{TcpListener, TcpStream}; +use tokio::sync::{mpsc, oneshot}; +use webclaw_capture::replay::replay_endpoint; +use webclaw_capture::types::{ + EndpointDefinition, EndpointExample, EndpointSafety, ReplayOptions, ReplayResult, +}; + +struct LocalServer { + base_url: String, + requests: mpsc::UnboundedReceiver, + shutdown: Option>, +} + +impl LocalServer { + async fn start() -> Self { + let listener = TcpListener::bind("127.0.0.1:0") + .await + .expect("bind local replay test server"); + let address = listener.local_addr().expect("local replay server address"); + let (shutdown, mut shutdown_rx) = oneshot::channel::<()>(); + let (requests_tx, requests_rx) = mpsc::unbounded_channel::(); + + tokio::spawn(async move { + loop { + tokio::select! { + _ = &mut shutdown_rx => break, + accepted = listener.accept() => { + let Ok((stream, _peer)) = accepted else { + continue; + }; + + tokio::spawn(handle_connection(stream, requests_tx.clone())); + } + } + } + }); + + Self { + base_url: format!("http://{address}"), + requests: requests_rx, + shutdown: Some(shutdown), + } + } + + async fn next_request(&mut self) -> String { + tokio::time::timeout(Duration::from_secs(2), self.requests.recv()) + .await + .expect("local replay server should receive a request") + .expect("local replay server request channel should remain open") + } +} + +impl Drop for LocalServer { + fn drop(&mut self) { + if let Some(shutdown) = self.shutdown.take() { + let _ = shutdown.send(()); + } + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn get_endpoint_executes_when_dry_run_is_false() { + let mut server = LocalServer::start().await; + let endpoint = get_endpoint(&server.base_url, headers(&[("Accept", "application/json")])); + + let result = replay_endpoint( + &endpoint, + ReplayOptions { + dry_run: false, + confirm_unsafe: false, + params_json: Some(json!({ "category": "tools" })), + headers: Map::new(), + body_json: None, + }, + ) + .await + .expect("replay GET endpoint"); + + match result { + ReplayResult::Executed { + status, + body_sample, + .. + } => { + assert_eq!(status, 200); + assert!( + body_sample + .as_deref() + .unwrap_or_default() + .contains(r#""ok":true"#), + "executed replay should return the response body sample" + ); + } + other => panic!("GET replay should execute, got {other:#?}"), + } + + let request = server.next_request().await; + assert!( + request.starts_with("GET /api/products"), + "server should receive the replayed GET request, got {request:?}" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn get_endpoint_with_dry_run_returns_preview_without_network() { + let endpoint = get_endpoint( + "http://127.0.0.1:9", + headers(&[("Accept", "application/json")]), + ); + + let result = replay_endpoint( + &endpoint, + ReplayOptions { + dry_run: true, + confirm_unsafe: false, + params_json: Some(json!({ "category": "tools" })), + headers: headers(&[("X-Replay-Trace", "dry-run")]), + body_json: None, + }, + ) + .await + .expect("preview GET endpoint"); + + match result { + ReplayResult::Preview { + method, + url, + headers, + body_sample, + } => { + assert_eq!(method, "GET"); + assert!(url.starts_with("http://127.0.0.1:9/api/products")); + assert!(url.contains("category=tools")); + assert_eq!(header_string(&headers, "X-Replay-Trace"), Some("dry-run")); + assert_eq!(body_sample, None); + } + other => panic!("dry-run GET replay should return a preview, got {other:#?}"), + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn post_without_confirmation_is_blocked() { + let endpoint = post_endpoint("http://127.0.0.1:9"); + + let result = replay_endpoint( + &endpoint, + ReplayOptions { + dry_run: false, + confirm_unsafe: false, + params_json: None, + headers: Map::new(), + body_json: Some(graphql_body()), + }, + ) + .await + .expect("block unsafe POST replay"); + + match result { + ReplayResult::Blocked { reason } => { + let reason = reason.to_ascii_lowercase(); + assert!( + reason.contains("confirm") || reason.contains("unsafe"), + "blocked replay should explain confirmation is required, got {reason:?}" + ); + } + other => { + panic!("unsafe POST replay without confirmation should be blocked, got {other:#?}") + } + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn post_with_dry_run_returns_preview_only() { + let endpoint = post_endpoint("http://127.0.0.1:9"); + + let result = replay_endpoint( + &endpoint, + ReplayOptions { + dry_run: true, + confirm_unsafe: false, + params_json: None, + headers: headers(&[("Content-Type", "application/json")]), + body_json: Some(graphql_body()), + }, + ) + .await + .expect("preview unsafe POST replay"); + + match result { + ReplayResult::Preview { + method, + url, + body_sample, + .. + } => { + assert_eq!(method, "POST"); + assert_eq!(url, "http://127.0.0.1:9/graphql"); + assert!( + body_sample + .as_deref() + .unwrap_or_default() + .contains("CreateProduct"), + "dry-run POST preview should include the request body sample" + ); + } + other => panic!("dry-run POST replay should return a preview, got {other:#?}"), + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn redacted_headers_are_never_sent() { + let mut server = LocalServer::start().await; + let endpoint = get_endpoint( + &server.base_url, + headers(&[ + ("Authorization", "[REDACTED]"), + ("Cookie", "[REDACTED]"), + ("X-Api-Key", "[REDACTED]"), + ("X-Trace-Id", "captured-trace"), + ]), + ); + + let result = replay_endpoint( + &endpoint, + ReplayOptions { + dry_run: false, + confirm_unsafe: false, + params_json: None, + headers: headers(&[ + ("X-User-Email", "[REDACTED]"), + ("X-Allowed-Override", "override-ok"), + ]), + body_json: None, + }, + ) + .await + .expect("replay GET endpoint without redacted headers"); + + assert!( + matches!(result, ReplayResult::Executed { status: 200, .. }), + "GET replay should execute, got {result:#?}" + ); + + let request = server.next_request().await; + let lower_request = request.to_ascii_lowercase(); + + for forbidden in [ + "authorization:", + "cookie:", + "x-api-key:", + "x-user-email:", + "[redacted]", + ] { + assert!( + !lower_request.contains(forbidden), + "replay request should not send redacted header material {forbidden:?}: {request}" + ); + } + assert!( + lower_request.contains("x-allowed-override: override-ok"), + "non-redacted caller-supplied headers should still be sent: {request}" + ); +} + +async fn handle_connection(mut stream: TcpStream, requests: mpsc::UnboundedSender) { + let mut buffer = vec![0_u8; 8192]; + let Ok(bytes_read) = stream.read(&mut buffer).await else { + return; + }; + if bytes_read == 0 { + return; + } + + let request = String::from_utf8_lossy(&buffer[..bytes_read]).to_string(); + let status = if request.starts_with("GET /api/products") { + "200 OK" + } else { + "404 Not Found" + }; + let body = if status == "200 OK" { + r#"{"ok":true,"items":[{"id":12345,"name":"Hammer"}]}"# + } else { + r#"{"ok":false}"# + }; + let response = http_response(status, &[("Content-Type", "application/json")], body); + + let _ = requests.send(request); + let _ = stream.write_all(response.as_bytes()).await; + let _ = stream.shutdown().await; +} + +fn http_response(status: &str, headers: &[(&str, &str)], body: &str) -> String { + let mut response = format!( + "HTTP/1.1 {status}\r\nContent-Length: {}\r\nConnection: close\r\nCache-Control: no-store\r\n", + body.len() + ); + + for (name, value) in headers { + response.push_str(name); + response.push_str(": "); + response.push_str(value); + response.push_str("\r\n"); + } + + response.push_str("\r\n"); + response.push_str(body); + response +} + +fn get_endpoint(origin: &str, request_headers: Map) -> EndpointDefinition { + let mut query_params = BTreeMap::new(); + query_params.insert("category".to_owned(), vec!["tools".to_owned()]); + + EndpointDefinition { + id: format!("GET {origin}/api/products"), + method: "GET".to_owned(), + origin: origin.to_owned(), + path_template: "/api/products".to_owned(), + query_params, + request_schema: None, + response_schema: Some(json!({ + "type": "object", + "properties": { + "items": { "type": "array" } + } + })), + auth_evidence: Vec::new(), + safety: EndpointSafety { + safe_to_replay: true, + requires_confirmation: false, + reason: "GET is a read-oriented HTTP method".to_owned(), + }, + examples: vec![EndpointExample { + url: format!("{origin}/api/products?category=tools"), + request_headers, + request_body_sample: None, + response_status: 200, + response_headers: headers(&[("Content-Type", "application/json")]), + response_body_sample: Some(r#"{"items":[{"id":12345,"name":"Hammer"}]}"#.to_owned()), + captured_at: test_time(), + }], + } +} + +fn post_endpoint(origin: &str) -> EndpointDefinition { + EndpointDefinition { + id: format!("POST {origin}/graphql"), + method: "POST".to_owned(), + origin: origin.to_owned(), + path_template: "/graphql".to_owned(), + query_params: BTreeMap::new(), + request_schema: Some(json!({ + "type": "object", + "properties": { + "query": { "type": "string" }, + "variables": { "type": "object" } + } + })), + response_schema: Some(json!({ "type": "object" })), + auth_evidence: vec!["X-CSRF-Token header observed".to_owned()], + safety: EndpointSafety { + safe_to_replay: false, + requires_confirmation: true, + reason: "POST may mutate server state and requires confirmation".to_owned(), + }, + examples: vec![EndpointExample { + url: format!("{origin}/graphql"), + request_headers: headers(&[ + ("Content-Type", "application/json"), + ("X-CSRF-Token", "[REDACTED]"), + ]), + request_body_sample: Some(graphql_body().to_string()), + response_status: 200, + response_headers: headers(&[("Content-Type", "application/json")]), + response_body_sample: Some(r#"{"data":{"createProduct":{"id":"12345"}}}"#.to_owned()), + captured_at: test_time(), + }], + } +} + +fn headers(entries: &[(&str, &str)]) -> Map { + entries + .iter() + .map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned()))) + .collect() +} + +fn header_string<'a>(headers: &'a Map, name: &str) -> Option<&'a str> { + headers + .iter() + .find(|(header_name, _value)| header_name.eq_ignore_ascii_case(name)) + .and_then(|(_header_name, value)| value.as_str()) +} + +fn graphql_body() -> Value { + json!({ + "query": "mutation CreateProduct($name: String!) { createProduct(name: $name) { id } }", + "variables": { + "name": "Hammer" + } + }) +} + +fn test_time() -> DateTime { + DateTime::parse_from_rfc3339("2026-05-16T12:00:00Z") + .expect("valid test timestamp") + .with_timezone(&Utc) +} diff --git a/crates/webclaw-capture/tests/store.rs b/crates/webclaw-capture/tests/store.rs new file mode 100644 index 0000000..2c677ba --- /dev/null +++ b/crates/webclaw-capture/tests/store.rs @@ -0,0 +1,312 @@ +use std::collections::BTreeMap; +use std::env; +use std::ffi::OsString; +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::Mutex; +use std::time::{SystemTime, UNIX_EPOCH}; + +use chrono::{DateTime, Utc}; +use serde_json::{Map, Value, json}; +use url::Url; +use webclaw_capture::redact::redact_artifact; +use webclaw_capture::store::{ + capture_id_for, capture_root, find_endpoint, load_endpoints, save_capture, +}; +use webclaw_capture::types::{ + CaptureArtifact, CapturedExchange, EndpointDefinition, EndpointExample, EndpointSafety, +}; + +static ENV_LOCK: Mutex<()> = Mutex::new(()); +const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR"; + +struct EnvVarGuard { + original: Option, +} + +impl EnvVarGuard { + fn set_capture_dir(value: Option<&Path>) -> Self { + let original = env::var_os(CAPTURE_DIR_ENV); + + unsafe { + match value { + Some(path) => env::set_var(CAPTURE_DIR_ENV, path), + None => env::remove_var(CAPTURE_DIR_ENV), + } + } + + Self { original } + } +} + +impl Drop for EnvVarGuard { + fn drop(&mut self) { + unsafe { + match &self.original { + Some(value) => env::set_var(CAPTURE_DIR_ENV, value), + None => env::remove_var(CAPTURE_DIR_ENV), + } + } + } +} + +fn with_capture_dir(value: Option<&Path>, test: impl FnOnce() -> T) -> T { + let _lock = ENV_LOCK.lock().expect("capture env lock"); + let _guard = EnvVarGuard::set_capture_dir(value); + + test() +} + +fn unique_temp_root(test_name: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system time after unix epoch") + .as_nanos(); + + env::temp_dir().join(format!( + "webclaw-capture-store-{test_name}-{}-{nanos}", + std::process::id() + )) +} + +fn test_time() -> DateTime { + DateTime::parse_from_rfc3339("2026-05-16T12:00:00Z") + .expect("valid test timestamp") + .with_timezone(&Utc) +} + +fn headers(entries: &[(&str, &str)]) -> Map { + entries + .iter() + .map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned()))) + .collect() +} + +fn sample_endpoint() -> EndpointDefinition { + let mut query_params = BTreeMap::new(); + query_params.insert("category".to_owned(), vec!["tools".to_owned()]); + + EndpointDefinition { + id: "GET https://example.test/api/products".to_owned(), + method: "GET".to_owned(), + origin: "https://example.test".to_owned(), + path_template: "/api/products".to_owned(), + query_params, + request_schema: None, + response_schema: Some(json!({ + "type": "object", + "properties": { + "items": { + "type": "array", + "items": { "type": "object" } + } + } + })), + auth_evidence: vec!["Authorization header observed".to_owned()], + safety: EndpointSafety { + safe_to_replay: true, + requires_confirmation: false, + reason: "GET is a read-oriented HTTP method".to_owned(), + }, + examples: vec![EndpointExample { + url: "https://example.test/api/products?category=tools".to_owned(), + request_headers: headers(&[ + ("Authorization", "Bearer raw-secret"), + ("Accept", "application/json"), + ]), + request_body_sample: None, + response_status: 200, + response_headers: headers(&[("Content-Type", "application/json")]), + response_body_sample: Some(r#"{"items":[{"id":12345,"name":"Hammer"}]}"#.to_owned()), + captured_at: test_time(), + }], + } +} + +fn sample_exchange() -> CapturedExchange { + CapturedExchange { + method: "GET".to_owned(), + url: "https://example.test/api/products?category=tools&token=raw-secret".to_owned(), + request_headers: headers(&[ + ("Authorization", "Bearer raw-secret"), + ("Accept", "application/json"), + ]), + request_body_sample: None, + resource_type: Some("fetch".to_owned()), + status: 200, + response_headers: headers(&[("Content-Type", "application/json")]), + response_body_sample: Some(r#"{"items":[{"id":12345,"name":"Hammer"}]}"#.to_owned()), + started_at: test_time(), + duration_ms: 42, + redirect_chain: vec!["https://example.test/login?session=raw-secret".to_owned()], + } +} + +fn sample_artifact() -> CaptureArtifact { + let mut metadata = Map::new(); + metadata.insert("runner".to_owned(), json!("store-test")); + + CaptureArtifact { + id: "example.test/2026-05-16T12-00-00Z".to_owned(), + source_url: "https://example.test/products?email=user@example.test".to_owned(), + intent: Some("discover product listing API".to_owned()), + started_at: test_time(), + completed_at: Some(test_time()), + exchanges: vec![sample_exchange()], + endpoints: vec![sample_endpoint()], + metadata, + } +} + +fn read_json(path: &Path) -> T { + let contents = fs::read_to_string(path).expect("read JSON file"); + serde_json::from_str(&contents).expect("valid JSON file") +} + +#[test] +fn default_capture_root_resolves_under_user_profile_webclaw_api_captures() { + with_capture_dir(None, || { + let home = env::var_os("USERPROFILE") + .map(PathBuf::from) + .or_else(dirs::home_dir) + .expect("home directory"); + + assert_eq!(capture_root(), home.join(".webclaw").join("api-captures")); + }); +} + +#[test] +fn capture_root_uses_webclaw_capture_dir_override() { + let root = unique_temp_root("override"); + + with_capture_dir(Some(&root), || { + assert_eq!(capture_root(), root); + }); +} + +#[test] +fn capture_id_for_uses_domain_and_filesystem_safe_utc_timestamp() { + let url = Url::parse("https://example.test/api/products?category=tools").expect("valid URL"); + + assert_eq!( + capture_id_for(&url, test_time()), + "example.test/2026-05-16T12-00-00Z" + ); +} + +#[test] +fn save_capture_writes_raw_redacted_endpoints_and_metadata_files() { + let root = unique_temp_root("save"); + + with_capture_dir(Some(&root), || { + let artifact = sample_artifact(); + let saved = save_capture(&artifact).expect("save capture"); + + assert_eq!(saved.id, artifact.id); + assert_eq!( + saved.capture_dir, + root.join("example.test").join("2026-05-16T12-00-00Z") + ); + assert_eq!( + saved.raw_capture_path, + saved.capture_dir.join("raw-capture.json") + ); + assert_eq!( + saved.redacted_capture_path, + saved.capture_dir.join("redacted-capture.json") + ); + assert_eq!( + saved.endpoints_path, + saved.capture_dir.join("endpoints.json") + ); + assert_eq!(saved.metadata_path, saved.capture_dir.join("metadata.json")); + + assert!(saved.raw_capture_path.is_file()); + assert!(saved.redacted_capture_path.is_file()); + assert!(saved.endpoints_path.is_file()); + assert!(saved.metadata_path.is_file()); + + let raw_capture: CaptureArtifact = read_json(&saved.raw_capture_path); + assert_eq!(raw_capture, artifact); + + let redacted_capture: CaptureArtifact = read_json(&saved.redacted_capture_path); + assert_ne!(redacted_capture, artifact); + assert!( + !serde_json::to_string(&redacted_capture) + .expect("serialize redacted capture") + .contains("raw-secret"), + "redacted capture should not contain raw secrets" + ); + + let endpoints: Vec = read_json(&saved.endpoints_path); + assert_eq!(endpoints, redact_artifact(&artifact).endpoints); + assert!( + !serde_json::to_string(&endpoints) + .expect("serialize endpoints") + .contains("raw-secret"), + "endpoints.json should not contain raw secrets" + ); + + let metadata: Value = read_json(&saved.metadata_path); + assert!( + metadata.is_object(), + "metadata.json should contain a JSON object" + ); + let metadata_text = serde_json::to_string(&metadata).expect("serialize metadata"); + assert!( + !metadata_text.contains("user@example.test"), + "metadata.json should redact PII from source_url" + ); + assert!( + metadata_text.contains("REDACTED"), + "metadata.json should preserve the redaction marker" + ); + }); + + let _ = fs::remove_dir_all(root); +} + +#[test] +fn load_endpoints_by_capture_id_reads_endpoints_json() { + let root = unique_temp_root("load"); + + with_capture_dir(Some(&root), || { + let artifact = sample_artifact(); + save_capture(&artifact).expect("save capture"); + + let loaded = load_endpoints(&artifact.id).expect("load endpoints"); + + assert_eq!(loaded, redact_artifact(&artifact).endpoints); + assert!( + !serde_json::to_string(&loaded) + .expect("serialize loaded endpoints") + .contains("raw-secret"), + "loaded endpoints should not contain raw secrets" + ); + }); + + let _ = fs::remove_dir_all(root); +} + +#[test] +fn find_endpoint_scans_saved_capture_endpoints() { + let root = unique_temp_root("find"); + + with_capture_dir(Some(&root), || { + let artifact = sample_artifact(); + let expected = redact_artifact(&artifact).endpoints[0].clone(); + save_capture(&artifact).expect("save capture"); + + let found = find_endpoint(&expected.id).expect("find endpoint"); + + assert_eq!(found, expected); + assert!( + !serde_json::to_string(&found) + .expect("serialize found endpoint") + .contains("raw-secret"), + "found endpoint should not contain raw secrets" + ); + }); + + let _ = fs::remove_dir_all(root); +} diff --git a/crates/webclaw-cli/Cargo.toml b/crates/webclaw-cli/Cargo.toml index adce50f..b73d0d6 100644 --- a/crates/webclaw-cli/Cargo.toml +++ b/crates/webclaw-cli/Cargo.toml @@ -11,6 +11,7 @@ path = "src/main.rs" [dependencies] webclaw-core = { workspace = true } +webclaw-capture = { path = "../webclaw-capture" } webclaw-fetch = { workspace = true } webclaw-llm = { workspace = true } webclaw-pdf = { workspace = true } diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 03c1490..b2978be 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -10,6 +10,11 @@ use std::sync::atomic::{AtomicBool, Ordering}; use clap::{Parser, Subcommand, ValueEnum}; use tracing_subscriber::EnvFilter; +use webclaw_capture::cdp::{CaptureOptions, capture_network}; +use webclaw_capture::openapi::write_openapi; +use webclaw_capture::replay::replay_endpoint; +use webclaw_capture::store::{find_endpoint, load_endpoints}; +use webclaw_capture::types::{EndpointDefinition, ReplayOptions}; use webclaw_core::{ ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options, to_llm_text, @@ -336,6 +341,61 @@ enum Commands { #[arg(long)] raw: bool, }, + + /// Capture browser network traffic and learn reusable API endpoints. + CaptureNetwork { + /// Page URL to inspect. + url: String, + + /// Capture intent, stored with the capture metadata. + #[arg(long)] + intent: Option, + + /// Milliseconds to wait after page navigation before saving the capture. + #[arg(long, default_value_t = 3000)] + wait_ms: u64, + + /// Run Chromium with a visible window instead of headless mode. + #[arg(long)] + headed: bool, + }, + + /// Print learned endpoints for a saved capture id. + Endpoints { + /// Capture id, for example `example.com/2026-05-16T12-00-00Z`. + capture_id: String, + }, + + /// Print one learned endpoint by endpoint id. + ShowEndpoint { + /// Endpoint id, for example `get_example_test_api_products`. + endpoint_id: String, + }, + + /// Replay or preview a learned endpoint. + ReplayEndpoint { + /// Endpoint id to replay. + endpoint_id: String, + + /// JSON object with path/query parameter overrides. + #[arg(long, default_value = "{}")] + params_json: String, + + /// Preview the replay request without network access. + #[arg(long)] + dry_run: bool, + + /// Allow unsafe methods such as POST, PUT, PATCH, and DELETE to execute. + #[arg(long)] + confirm_unsafe: bool, + }, + + /// Export a saved capture's learned endpoints as OpenAPI 3.1 JSON. + #[command(name = "export-openapi")] + ExportOpenapi { + /// Capture id, for example `example.com/2026-05-16T12-00-00Z`. + capture_id: String, + }, } #[derive(Clone, ValueEnum)] @@ -2169,6 +2229,121 @@ fn has_llm_flags(cli: &Cli) -> bool { cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some() } +async fn run_capture_network_command( + url: &str, + intent: Option, + wait_ms: u64, + headed: bool, +) -> Result<(), String> { + let saved = capture_network(CaptureOptions { + url: normalize_url(url), + intent, + wait_ms, + headed, + }) + .await + .map_err(|e| format!("capture-network failed: {e}"))?; + + println!( + "{}", + serde_json::to_string_pretty(&saved).map_err(|e| format!("JSON encode failed: {e}"))? + ); + + Ok(()) +} + +fn run_endpoints_command(capture_id: &str) -> Result<(), String> { + let endpoints = load_endpoints(capture_id) + .map_err(|e| format!("could not load endpoints for capture id {capture_id}: {e}"))?; + + println!( + "{}", + serde_json::to_string_pretty(&endpoints).map_err(|e| format!("JSON encode failed: {e}"))? + ); + + Ok(()) +} + +fn run_show_endpoint_command(endpoint_id: &str) -> Result<(), String> { + let endpoint = find_endpoint(endpoint_id) + .map_err(|e| format!("could not find endpoint id {endpoint_id}: {e}"))?; + + println!( + "{}", + serde_json::to_string_pretty(&endpoint).map_err(|e| format!("JSON encode failed: {e}"))? + ); + + Ok(()) +} + +async fn run_replay_endpoint_command( + endpoint_id: &str, + params_json: &str, + dry_run: bool, + confirm_unsafe: bool, +) -> Result<(), String> { + let endpoint = find_endpoint(endpoint_id) + .map_err(|e| format!("could not find endpoint id {endpoint_id}: {e}"))?; + let params_json = parse_params_json(params_json)?; + let default_dry_run = endpoint_defaults_to_dry_run(&endpoint) && !confirm_unsafe; + + if default_dry_run && !dry_run { + eprintln!( + "Unsafe endpoint replay defaults to dry-run. Re-run with --confirm-unsafe to execute." + ); + } + + let options = ReplayOptions { + dry_run: dry_run || default_dry_run, + confirm_unsafe, + params_json, + headers: serde_json::Map::new(), + body_json: None, + }; + + let result = replay_endpoint(&endpoint, options) + .await + .map_err(|e| format!("replay-endpoint failed: {e}"))?; + + println!( + "{}", + serde_json::to_string_pretty(&result).map_err(|e| format!("JSON encode failed: {e}"))? + ); + + Ok(()) +} + +fn run_export_openapi_command(capture_id: &str) -> Result<(), String> { + let path = write_openapi(capture_id) + .map_err(|e| format!("could not export OpenAPI for capture id {capture_id}: {e}"))?; + println!("{}", path.display()); + Ok(()) +} + +fn parse_params_json(params_json: &str) -> Result, String> { + let trimmed = params_json.trim(); + if trimmed.is_empty() { + return Ok(None); + } + + let value: serde_json::Value = serde_json::from_str(trimmed) + .map_err(|e| format!("--params-json must be valid JSON: {e}"))?; + if !value.is_object() { + return Err("--params-json must be a JSON object".to_owned()); + } + + Ok(Some(value)) +} + +fn endpoint_defaults_to_dry_run(endpoint: &EndpointDefinition) -> bool { + endpoint.safety.requires_confirmation + || !endpoint.safety.safe_to_replay + || !matches!( + endpoint.method.to_ascii_uppercase().as_str(), + "GET" | "HEAD" | "OPTIONS" + ) +} + async fn run_research(cli: &Cli, query: &str) -> Result<(), String> { let api_key = cli .api_key @@ -2405,6 +2580,56 @@ async fn main() { } return; } + Commands::CaptureNetwork { + url, + intent, + wait_ms, + headed, + } => { + if let Err(e) = + run_capture_network_command(url, intent.clone(), *wait_ms, *headed).await + { + eprintln!("error: {e}"); + process::exit(1); + } + return; + } + Commands::Endpoints { capture_id } => { + if let Err(e) = run_endpoints_command(capture_id) { + eprintln!("error: {e}"); + process::exit(1); + } + return; + } + Commands::ShowEndpoint { endpoint_id } => { + if let Err(e) = run_show_endpoint_command(endpoint_id) { + eprintln!("error: {e}"); + process::exit(1); + } + return; + } + Commands::ReplayEndpoint { + endpoint_id, + params_json, + dry_run, + confirm_unsafe, + } => { + if let Err(e) = + run_replay_endpoint_command(endpoint_id, params_json, *dry_run, *confirm_unsafe) + .await + { + eprintln!("error: {e}"); + process::exit(1); + } + return; + } + Commands::ExportOpenapi { capture_id } => { + if let Err(e) = run_export_openapi_command(capture_id) { + eprintln!("error: {e}"); + process::exit(1); + } + return; + } } } diff --git a/crates/webclaw-fetch/src/sitemap.rs b/crates/webclaw-fetch/src/sitemap.rs index 931db32..374892d 100644 --- a/crates/webclaw-fetch/src/sitemap.rs +++ b/crates/webclaw-fetch/src/sitemap.rs @@ -597,7 +597,7 @@ mod tests { "#; let entries = parse_sitemap_xml(xml); // Should return at least the successfully parsed entry - assert!(entries.len() >= 1); + assert!(!entries.is_empty()); assert_eq!(entries[0].url, "https://example.com/good"); } diff --git a/crates/webclaw-fetch/src/url_security.rs b/crates/webclaw-fetch/src/url_security.rs index 328879e..bf8f24c 100644 --- a/crates/webclaw-fetch/src/url_security.rs +++ b/crates/webclaw-fetch/src/url_security.rs @@ -193,7 +193,7 @@ mod tests { .await .is_ok() ); - assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false); + assert!(!is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8)))); } #[tokio::test] diff --git a/crates/webclaw-fetch/tests/bench_1k.rs b/crates/webclaw-fetch/tests/bench_1k.rs index ffbbf0a..9858ef1 100644 --- a/crates/webclaw-fetch/tests/bench_1k.rs +++ b/crates/webclaw-fetch/tests/bench_1k.rs @@ -71,7 +71,7 @@ fn classify(body: &str, len: usize, status: u16, kw: &[String]) -> &'static str "CHALLENGE" } else if status == 403 || status == 429 { "BLOCKED" - } else if status >= 300 && status < 400 { + } else if (300..400).contains(&status) { "REDIRECT" } else if len < 1000 { "EMPTY" diff --git a/crates/webclaw-mcp/Cargo.toml b/crates/webclaw-mcp/Cargo.toml index ec3b2b4..d447589 100644 --- a/crates/webclaw-mcp/Cargo.toml +++ b/crates/webclaw-mcp/Cargo.toml @@ -14,6 +14,7 @@ webclaw-core = { workspace = true } webclaw-fetch = { workspace = true } webclaw-llm = { workspace = true } webclaw-pdf = { workspace = true } +webclaw-capture = { path = "../webclaw-capture" } rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] } schemars = "1.0" dotenvy = { workspace = true } diff --git a/crates/webclaw-mcp/src/main.rs b/crates/webclaw-mcp/src/main.rs index 89a4755..bf9e6fc 100644 --- a/crates/webclaw-mcp/src/main.rs +++ b/crates/webclaw-mcp/src/main.rs @@ -11,6 +11,10 @@ use server::WebclawMcp; #[tokio::main] async fn main() -> Result<(), Box> { + if print_help_or_version() { + return Ok(()); + } + dotenvy::dotenv().ok(); // Log to stderr -- stdout is the MCP transport channel @@ -25,3 +29,42 @@ async fn main() -> Result<(), Box> { service.waiting().await?; Ok(()) } + +fn print_help_or_version() -> bool { + let mut args = std::env::args().skip(1); + let Some(arg) = args.next() else { + return false; + }; + + match arg.as_str() { + "-h" | "--help" => { + println!("{}", help_text()); + true + } + "-V" | "--version" => { + println!("webclaw-mcp {}", env!("CARGO_PKG_VERSION")); + true + } + _ => false, + } +} + +fn help_text() -> String { + format!( + "\ +webclaw-mcp {version} +MCP server for webclaw web extraction toolkit + +Usage: webclaw-mcp + +Options: + -h, --help Print help + -V, --version Print version + +Tools: + scrape, crawl, map, batch, extract, summarize, diff, brand, research, search, + capture_network, discover_endpoints, show_endpoint, replay_endpoint, + export_openapi, list_captures, list_extractors, vertical_scrape", + version = env!("CARGO_PKG_VERSION") + ) +} diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs index 3b88bab..6458317 100644 --- a/crates/webclaw-mcp/src/server.rs +++ b/crates/webclaw-mcp/src/server.rs @@ -4,6 +4,8 @@ /// Uses a local-first architecture: fetches pages directly, then falls back /// to the webclaw cloud API (api.webclaw.io) when bot protection or /// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback. +use std::fs; +use std::path::Path; use std::sync::{Arc, OnceLock}; use std::time::Duration; @@ -11,9 +13,14 @@ use rmcp::handler::server::router::tool::ToolRouter; use rmcp::handler::server::wrapper::Parameters; use rmcp::model::{Implementation, ServerCapabilities, ServerInfo}; use rmcp::{ServerHandler, tool, tool_handler, tool_router}; -use serde_json::json; +use serde_json::{Map, Value, json}; use tracing::{error, info, warn}; +use webclaw_capture::cdp::{CaptureOptions, capture_network as run_network_capture}; +use webclaw_capture::openapi::write_openapi; +use webclaw_capture::replay::replay_endpoint as run_endpoint_replay; +use webclaw_capture::store::{capture_root, find_endpoint, load_endpoints}; +use webclaw_capture::types::{EndpointDefinition, HeaderMap, ReplayOptions}; use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult}; use crate::tools::*; @@ -709,6 +716,96 @@ impl WebclawMcp { } } + /// Capture browser network traffic from a page and save learned API endpoints for later replay. + #[tool] + async fn capture_network( + &self, + Parameters(params): Parameters, + ) -> Result { + let url = normalize_capture_url(¶ms.url)?; + validate_url(&url).await?; + + let saved = run_network_capture(CaptureOptions { + url, + intent: params.intent, + wait_ms: params.wait_ms.unwrap_or(3000), + headed: params.headed.unwrap_or(false), + }) + .await + .map_err(|e| format!("capture_network failed: {e}"))?; + + to_pretty_json(&saved) + } + + /// Return learned endpoint definitions for a saved capture id. + #[tool] + async fn discover_endpoints( + &self, + Parameters(params): Parameters, + ) -> Result { + let endpoints = load_endpoints(¶ms.capture_id).map_err(|e| { + format!( + "could not load endpoints for capture id {}: {e}", + params.capture_id + ) + })?; + + to_pretty_json(&endpoints) + } + + /// Show one learned endpoint definition by endpoint id. + #[tool] + async fn show_endpoint( + &self, + Parameters(params): Parameters, + ) -> Result { + let endpoint = find_endpoint(¶ms.endpoint_id) + .map_err(|e| format!("could not find endpoint id {}: {e}", params.endpoint_id))?; + + to_pretty_json(&endpoint) + } + + /// Replay or preview a learned endpoint. Mutating methods default to dry-run unless confirmed. + #[tool] + async fn replay_endpoint( + &self, + Parameters(params): Parameters, + ) -> Result { + let endpoint = find_endpoint(¶ms.endpoint_id) + .map_err(|e| format!("could not find endpoint id {}: {e}", params.endpoint_id))?; + let options = replay_options_from_params(&endpoint, ¶ms)?; + let result = run_endpoint_replay(&endpoint, options) + .await + .map_err(|e| format!("replay_endpoint failed: {e}"))?; + + to_pretty_json(&result) + } + + /// Export a saved capture's learned endpoints as OpenAPI 3.1 JSON. + #[tool] + async fn export_openapi( + &self, + Parameters(params): Parameters, + ) -> Result { + let path = write_openapi(¶ms.capture_id).map_err(|e| { + format!( + "could not export OpenAPI for capture id {}: {e}", + params.capture_id + ) + })?; + + to_pretty_json(&json!({ "path": path })) + } + + /// List saved network captures from the configured capture root. + #[tool] + async fn list_captures( + &self, + Parameters(_params): Parameters, + ) -> Result { + to_pretty_json(&list_saved_captures_from_root(&capture_root())?) + } + /// List every vertical extractor the server knows about. Returns a /// JSON array of `{name, label, description, url_patterns}` entries. /// Call this to discover what verticals are available before using @@ -767,11 +864,183 @@ impl ServerHandler for WebclawMcp { .with_instructions(String::from( "Webclaw MCP server -- web content extraction for AI agents. \ Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search, \ - list_extractors, vertical_scrape.", + capture_network, discover_endpoints, show_endpoint, replay_endpoint, export_openapi, \ + list_captures, list_extractors, vertical_scrape.", )) } } +fn normalize_capture_url(url: &str) -> Result { + let trimmed = url.trim(); + if trimmed.is_empty() { + return Err("url must not be empty".to_owned()); + } + + let normalized = if trimmed.contains("://") { + trimmed.to_owned() + } else { + format!("https://{trimmed}") + }; + + let parsed = url::Url::parse(&normalized).map_err(|e| format!("invalid URL: {e}"))?; + match parsed.scheme() { + "http" | "https" => Ok(normalized), + scheme => Err(format!( + "capture_network only supports http and https URLs, got {scheme:?}" + )), + } +} + +fn replay_options_from_params( + endpoint: &EndpointDefinition, + params: &ReplayEndpointParams, +) -> Result { + if let Some(value) = ¶ms.params_json + && !value.is_object() + { + return Err("params_json must be a JSON object".to_owned()); + } + + let confirm_unsafe = params.confirm_unsafe.unwrap_or(false); + let default_dry_run = endpoint_defaults_to_dry_run(endpoint) && !confirm_unsafe; + + Ok(ReplayOptions { + dry_run: params.dry_run.unwrap_or(false) || default_dry_run, + confirm_unsafe, + params_json: params.params_json.clone(), + headers: header_map_from_strings(params.headers.as_ref()), + body_json: params.body_json.clone(), + }) +} + +fn endpoint_defaults_to_dry_run(endpoint: &EndpointDefinition) -> bool { + endpoint.safety.requires_confirmation + || !endpoint.safety.safe_to_replay + || !matches!( + endpoint.method.to_ascii_uppercase().as_str(), + "GET" | "HEAD" | "OPTIONS" + ) +} + +fn header_map_from_strings( + headers: Option<&std::collections::BTreeMap>, +) -> HeaderMap { + headers + .into_iter() + .flat_map(|headers| headers.iter()) + .map(|(name, value)| (name.clone(), Value::String(value.clone()))) + .collect() +} + +fn list_saved_captures_from_root(root: &Path) -> Result, String> { + if !root.exists() { + return Ok(Vec::new()); + } + + let mut captures = Vec::new(); + collect_saved_captures(root, root, &mut captures)?; + captures.sort_by(|left, right| { + left.get("id") + .and_then(Value::as_str) + .unwrap_or_default() + .cmp(right.get("id").and_then(Value::as_str).unwrap_or_default()) + }); + + Ok(captures) +} + +fn collect_saved_captures( + root: &Path, + current: &Path, + captures: &mut Vec, +) -> Result<(), String> { + let entries = fs::read_dir(current).map_err(|e| { + format!( + "could not read capture directory {}: {e}", + current.display() + ) + })?; + + for entry in entries { + let entry = entry.map_err(|e| format!("could not read capture directory entry: {e}"))?; + let path = entry.path(); + + if path.is_dir() { + collect_saved_captures(root, &path, captures)?; + continue; + } + + if path.file_name().and_then(|name| name.to_str()) == Some("metadata.json") { + captures.push(read_capture_metadata(root, &path)?); + } + } + + Ok(()) +} + +fn read_capture_metadata(root: &Path, metadata_path: &Path) -> Result { + let contents = fs::read_to_string(metadata_path).map_err(|e| { + format!( + "could not read capture metadata {}: {e}", + metadata_path.display() + ) + })?; + let mut metadata = match serde_json::from_str::(&contents).map_err(|e| { + format!( + "could not parse capture metadata {}: {e}", + metadata_path.display() + ) + })? { + Value::Object(metadata) => metadata, + _ => Map::new(), + }; + + let capture_dir = metadata_path + .parent() + .ok_or_else(|| format!("metadata path has no parent: {}", metadata_path.display()))?; + let capture_id = capture_id_from_dir(root, capture_dir)?; + + metadata + .entry("id".to_owned()) + .or_insert_with(|| Value::String(capture_id)); + metadata.insert( + "capture_dir".to_owned(), + Value::String(capture_dir.display().to_string()), + ); + + Ok(Value::Object(metadata)) +} + +fn capture_id_from_dir(root: &Path, capture_dir: &Path) -> Result { + let relative = capture_dir.strip_prefix(root).map_err(|e| { + format!( + "capture directory {} is not under root {}: {e}", + capture_dir.display(), + root.display() + ) + })?; + let parts = relative + .components() + .filter_map(|component| match component { + std::path::Component::Normal(value) => Some(value.to_string_lossy().to_string()), + _ => None, + }) + .collect::>(); + + if parts.is_empty() { + Err(format!( + "capture directory {} does not contain a capture id", + capture_dir.display() + )) + } else { + Ok(parts.join("/")) + } +} + +fn to_pretty_json(value: &T) -> Result { + serde_json::to_string_pretty(value).map_err(|e| format!("JSON encode failed: {e}")) +} + // --------------------------------------------------------------------------- // Research file helpers // --------------------------------------------------------------------------- @@ -856,3 +1125,127 @@ fn save_research(dir: &std::path::Path, slug: &str, data: &serde_json::Value) -> json_path.to_string_lossy().to_string(), ) } + +#[cfg(test)] +mod tests { + use std::collections::BTreeMap; + use std::fs; + + use serde_json::json; + use webclaw_capture::types::{EndpointDefinition, EndpointSafety}; + + use super::*; + + fn endpoint( + method: &str, + safe_to_replay: bool, + requires_confirmation: bool, + ) -> EndpointDefinition { + EndpointDefinition { + id: format!("{}_example", method.to_ascii_lowercase()), + method: method.to_owned(), + origin: "https://example.test".to_owned(), + path_template: "/api/items".to_owned(), + query_params: BTreeMap::new(), + request_schema: None, + response_schema: None, + auth_evidence: Vec::new(), + safety: EndpointSafety { + safe_to_replay, + requires_confirmation, + reason: "test".to_owned(), + }, + examples: Vec::new(), + } + } + + #[test] + fn normalize_capture_url_adds_https_and_rejects_non_http_schemes() { + assert_eq!( + normalize_capture_url("example.test/path").unwrap(), + "https://example.test/path" + ); + + assert!(normalize_capture_url("file:///C:/secret.txt").is_err()); + } + + #[test] + fn replay_options_default_unsafe_methods_to_dry_run_unless_confirmed() { + let unsafe_endpoint = endpoint("POST", false, true); + let params = ReplayEndpointParams { + endpoint_id: unsafe_endpoint.id.clone(), + params_json: Some(json!({"id": "123"})), + dry_run: None, + confirm_unsafe: None, + headers: Some(BTreeMap::from([("X-Test".to_owned(), "ok".to_owned())])), + body_json: Some(json!({"name": "tool"})), + }; + + let options = replay_options_from_params(&unsafe_endpoint, ¶ms).unwrap(); + assert!(options.dry_run); + assert!(!options.confirm_unsafe); + assert_eq!(options.params_json, Some(json!({"id": "123"}))); + assert_eq!(options.headers.get("X-Test"), Some(&json!("ok"))); + + let confirmed = ReplayEndpointParams { + confirm_unsafe: Some(true), + ..params + }; + let options = replay_options_from_params(&unsafe_endpoint, &confirmed).unwrap(); + assert!(!options.dry_run); + assert!(options.confirm_unsafe); + } + + #[test] + fn replay_options_leave_safe_gets_executable_by_default() { + let safe_endpoint = endpoint("GET", true, false); + let params = ReplayEndpointParams { + endpoint_id: safe_endpoint.id.clone(), + params_json: None, + dry_run: None, + confirm_unsafe: None, + headers: None, + body_json: None, + }; + + let options = replay_options_from_params(&safe_endpoint, ¶ms).unwrap(); + assert!(!options.dry_run); + assert!(!options.confirm_unsafe); + } + + #[test] + fn list_saved_captures_from_root_returns_metadata_with_capture_id() { + let root = std::env::temp_dir().join(format!( + "webclaw-mcp-list-captures-{}-{}", + std::process::id(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + )); + let capture_dir = root.join("example.test").join("2026-05-16T12-00-00Z"); + fs::create_dir_all(&capture_dir).unwrap(); + fs::write( + capture_dir.join("metadata.json"), + serde_json::to_string(&json!({ + "source_url": "https://example.test", + "endpoint_count": 2 + })) + .unwrap(), + ) + .unwrap(); + + let captures = list_saved_captures_from_root(&root).unwrap(); + fs::remove_dir_all(&root).ok(); + + assert_eq!(captures.len(), 1); + assert_eq!(captures[0]["id"], "example.test/2026-05-16T12-00-00Z"); + assert_eq!(captures[0]["endpoint_count"], 2); + assert!( + captures[0]["capture_dir"] + .as_str() + .unwrap() + .contains("example.test") + ); + } +} diff --git a/crates/webclaw-mcp/src/tools.rs b/crates/webclaw-mcp/src/tools.rs index 02bf534..0b0991f 100644 --- a/crates/webclaw-mcp/src/tools.rs +++ b/crates/webclaw-mcp/src/tools.rs @@ -104,6 +104,63 @@ pub struct SearchParams { pub num_results: Option, } +#[derive(Debug, Deserialize, JsonSchema)] +#[allow(dead_code)] +pub struct CaptureNetworkParams { + /// URL to open in Chromium and capture network traffic from. + pub url: String, + /// Optional natural-language purpose for the capture. + pub intent: Option, + /// Milliseconds to wait after navigation while collecting network events. + pub wait_ms: Option, + /// Run the browser in headed mode for debugging. + pub headed: Option, +} + +#[derive(Debug, Deserialize, JsonSchema)] +#[allow(dead_code)] +pub struct DiscoverEndpointsParams { + /// Saved capture id, for example `example.com/2026-05-16T12-00-00Z`. + pub capture_id: String, +} + +#[derive(Debug, Deserialize, JsonSchema)] +#[allow(dead_code)] +pub struct ShowEndpointParams { + /// Learned endpoint id to load from saved captures. + pub endpoint_id: String, +} + +#[derive(Debug, Deserialize, JsonSchema)] +#[allow(dead_code)] +pub struct ReplayEndpointParams { + /// Learned endpoint id to replay or preview. + pub endpoint_id: String, + /// Path/query parameter values to substitute into the learned endpoint. + pub params_json: Option, + /// Preview the replay request without sending network traffic. + pub dry_run: Option, + /// Allow mutating methods such as POST, PUT, PATCH, and DELETE to execute. + pub confirm_unsafe: Option, + /// Additional non-secret request headers to include in the replay. + pub headers: Option>, + /// JSON request body override for replay. + pub body_json: Option, +} + +#[derive(Debug, Deserialize, JsonSchema)] +#[allow(dead_code)] +pub struct ExportOpenApiParams { + /// Saved capture id whose learned endpoints should be exported. + pub capture_id: String, +} + +/// `list_captures` takes no arguments but uses a struct so rmcp can generate +/// a schema and parse the empty JSON-RPC params. +#[derive(Debug, Deserialize, JsonSchema)] +#[allow(dead_code)] +pub struct ListCapturesParams {} + /// Parameters for `vertical_scrape`: run a site-specific extractor by name. #[derive(Debug, Deserialize, JsonSchema)] pub struct VerticalParams { diff --git a/crates/webclaw-server/Cargo.toml b/crates/webclaw-server/Cargo.toml index 3d4c075..382ef2d 100644 --- a/crates/webclaw-server/Cargo.toml +++ b/crates/webclaw-server/Cargo.toml @@ -15,6 +15,7 @@ webclaw-core = { workspace = true } webclaw-fetch = { workspace = true } webclaw-llm = { workspace = true } webclaw-pdf = { workspace = true } +webclaw-capture = { path = "../webclaw-capture" } axum = { version = "0.8", features = ["macros"] } tokio = { workspace = true } diff --git a/crates/webclaw-server/src/main.rs b/crates/webclaw-server/src/main.rs index 06f2451..79ca46d 100644 --- a/crates/webclaw-server/src/main.rs +++ b/crates/webclaw-server/src/main.rs @@ -95,8 +95,18 @@ async fn main() -> anyhow::Result<()> { .route("/crawl", post(routes::crawl::crawl)) .route("/map", post(routes::map::map)) .route("/batch", post(routes::batch::batch)) + .route("/capture-network", post(routes::capture::capture_network)) + .route( + "/captures/{domain}/{timestamp}/endpoints", + get(routes::capture::endpoints), + ) + .route( + "/captures/{domain}/{timestamp}/openapi", + post(routes::capture::export_openapi), + ) .route("/extract", post(routes::extract::extract)) .route("/extractors", get(routes::structured::list_extractors)) + .route("/replay-endpoint", post(routes::capture::replay_endpoint)) .route("/summarize", post(routes::summarize::summarize_route)) .route("/diff", post(routes::diff::diff_route)) .route("/brand", post(routes::brand::brand)) diff --git a/crates/webclaw-server/src/routes/capture.rs b/crates/webclaw-server/src/routes/capture.rs new file mode 100644 index 0000000..84de434 --- /dev/null +++ b/crates/webclaw-server/src/routes/capture.rs @@ -0,0 +1,283 @@ +use std::collections::BTreeMap; + +use axum::{Json, extract::Path}; +use serde::Deserialize; +use serde_json::{Value, json}; +use webclaw_capture::cdp::{CaptureOptions, capture_network as run_network_capture}; +use webclaw_capture::openapi::write_openapi; +use webclaw_capture::replay::replay_endpoint as run_endpoint_replay; +use webclaw_capture::store::{find_endpoint, load_endpoints}; +use webclaw_capture::types::{ + CaptureError, EndpointDefinition, HeaderMap, ReplayOptions, ReplayResult, +}; + +use crate::error::ApiError; + +#[derive(Debug, Deserialize, Default)] +#[serde(default)] +pub struct CaptureNetworkRequest { + pub url: String, + pub intent: Option, + pub wait_ms: Option, + pub headed: Option, +} + +#[derive(Debug, Deserialize, Default)] +#[serde(default)] +pub struct ReplayEndpointRequest { + pub endpoint_id: String, + pub params_json: Option, + pub dry_run: Option, + pub confirm_unsafe: Option, + pub headers: Option>, + pub body_json: Option, +} + +pub async fn capture_network( + Json(request): Json, +) -> Result, ApiError> { + if request.url.trim().is_empty() { + return Err(ApiError::bad_request("`url` is required")); + } + + let url = normalize_capture_url(&request.url)?; + webclaw_fetch::url_security::validate_public_http_url(&url).await?; + + let saved = run_network_capture(CaptureOptions { + url, + intent: request.intent, + wait_ms: request.wait_ms.unwrap_or(3000), + headed: request.headed.unwrap_or(false), + }) + .await + .map_err(|error| capture_error("capture-network failed", error))?; + + Ok(Json(json!(saved))) +} + +pub async fn endpoints( + Path((domain, timestamp)): Path<(String, String)>, +) -> Result>, ApiError> { + let capture_id = capture_id_from_path(&domain, ×tamp)?; + let endpoints = load_endpoints(&capture_id).map_err(|error| { + capture_error( + format!("could not load endpoints for capture id {capture_id}"), + error, + ) + })?; + + Ok(Json(endpoints)) +} + +pub async fn replay_endpoint( + Json(request): Json, +) -> Result, ApiError> { + if request.endpoint_id.trim().is_empty() { + return Err(ApiError::bad_request("`endpoint_id` is required")); + } + + let endpoint = find_endpoint(&request.endpoint_id).map_err(|error| { + capture_error( + format!("could not find endpoint id {}", request.endpoint_id), + error, + ) + })?; + let options = replay_options_from_request(&endpoint, &request)?; + let result = run_endpoint_replay(&endpoint, options) + .await + .map_err(|error| capture_error("replay-endpoint failed", error))?; + + Ok(Json(result)) +} + +pub async fn export_openapi( + Path((domain, timestamp)): Path<(String, String)>, +) -> Result, ApiError> { + let capture_id = capture_id_from_path(&domain, ×tamp)?; + let path = write_openapi(&capture_id).map_err(|error| { + capture_error( + format!("could not export OpenAPI for capture id {capture_id}"), + error, + ) + })?; + + Ok(Json(json!({ "path": path.display().to_string() }))) +} + +fn normalize_capture_url(url: &str) -> Result { + let trimmed = url.trim(); + if trimmed.is_empty() { + return Err(ApiError::bad_request("`url` is required")); + } + + let normalized = if let Some((scheme, _rest)) = trimmed.split_once("://") { + if !matches!(scheme, "http" | "https") { + return Err(ApiError::bad_request(format!( + "capture-network only supports http and https URLs, got {scheme:?}" + ))); + } + trimmed.to_owned() + } else { + format!("https://{trimmed}") + }; + + Ok(normalized) +} + +fn capture_id_from_path(domain: &str, timestamp: &str) -> Result { + if !is_safe_capture_segment(domain) || !is_safe_capture_segment(timestamp) { + return Err(ApiError::bad_request( + "capture id contains an unsafe path segment", + )); + } + + Ok(format!("{domain}/{timestamp}")) +} + +fn replay_options_from_request( + endpoint: &EndpointDefinition, + request: &ReplayEndpointRequest, +) -> Result { + if let Some(value) = &request.params_json + && !value.is_object() + { + return Err(ApiError::bad_request("`params_json` must be a JSON object")); + } + + let confirm_unsafe = request.confirm_unsafe.unwrap_or(false); + let default_dry_run = endpoint_defaults_to_dry_run(endpoint) && !confirm_unsafe; + + Ok(ReplayOptions { + dry_run: request.dry_run.unwrap_or(false) || default_dry_run, + confirm_unsafe, + params_json: request.params_json.clone(), + headers: header_map_from_strings(request.headers.as_ref()), + body_json: request.body_json.clone(), + }) +} + +fn endpoint_defaults_to_dry_run(endpoint: &EndpointDefinition) -> bool { + endpoint.safety.requires_confirmation + || !endpoint.safety.safe_to_replay + || !matches!( + endpoint.method.to_ascii_uppercase().as_str(), + "GET" | "HEAD" | "OPTIONS" + ) +} + +fn header_map_from_strings(headers: Option<&BTreeMap>) -> HeaderMap { + headers + .into_iter() + .flat_map(|headers| headers.iter()) + .map(|(name, value)| (name.clone(), Value::String(value.clone()))) + .collect() +} + +fn is_safe_capture_segment(segment: &str) -> bool { + !segment.is_empty() + && segment != "." + && segment != ".." + && !segment.contains(':') + && !segment.contains('/') + && !segment.contains('\\') +} + +fn capture_error(context: impl Into, error: CaptureError) -> ApiError { + let context = context.into(); + match error { + CaptureError::InvalidUrl(_) | CaptureError::Replay(_) | CaptureError::Storage(_) => { + ApiError::bad_request(format!("{context}: {error}")) + } + CaptureError::EndpointNotFound(_) => ApiError::NotFound, + CaptureError::Request(_) | CaptureError::Capture(_) => ApiError::Fetch(error.to_string()), + CaptureError::Io(_) | CaptureError::Json(_) => ApiError::Internal(error.to_string()), + } +} + +#[cfg(test)] +mod tests { + use std::collections::BTreeMap; + + use serde_json::json; + use webclaw_capture::types::{EndpointDefinition, EndpointSafety}; + + use super::*; + + fn endpoint( + method: &str, + safe_to_replay: bool, + requires_confirmation: bool, + ) -> EndpointDefinition { + EndpointDefinition { + id: format!("{}_example", method.to_ascii_lowercase()), + method: method.to_owned(), + origin: "https://example.test".to_owned(), + path_template: "/api/items".to_owned(), + query_params: BTreeMap::new(), + request_schema: None, + response_schema: None, + auth_evidence: Vec::new(), + safety: EndpointSafety { + safe_to_replay, + requires_confirmation, + reason: "test".to_owned(), + }, + examples: Vec::new(), + } + } + + #[test] + fn capture_id_from_path_joins_domain_timestamp_and_rejects_unsafe_segments() { + assert_eq!( + capture_id_from_path("example.test", "2026-05-16T12-00-00Z").unwrap(), + "example.test/2026-05-16T12-00-00Z" + ); + + assert!(capture_id_from_path("..", "2026-05-16T12-00-00Z").is_err()); + assert!(capture_id_from_path("example.test", "..").is_err()); + } + + #[test] + fn replay_request_defaults_unsafe_methods_to_dry_run_unless_confirmed() { + let unsafe_endpoint = endpoint("POST", false, true); + let request = ReplayEndpointRequest { + endpoint_id: unsafe_endpoint.id.clone(), + params_json: Some(json!({"id": "123"})), + dry_run: None, + confirm_unsafe: None, + headers: Some(BTreeMap::from([("X-Test".to_owned(), "ok".to_owned())])), + body_json: Some(json!({"name": "tool"})), + }; + + let options = replay_options_from_request(&unsafe_endpoint, &request).unwrap(); + assert!(options.dry_run); + assert!(!options.confirm_unsafe); + assert_eq!(options.params_json, Some(json!({"id": "123"}))); + assert_eq!(options.headers.get("X-Test"), Some(&json!("ok"))); + assert_eq!(options.body_json, Some(json!({"name": "tool"}))); + + let confirmed = ReplayEndpointRequest { + confirm_unsafe: Some(true), + ..request + }; + let options = replay_options_from_request(&unsafe_endpoint, &confirmed).unwrap(); + assert!(!options.dry_run); + assert!(options.confirm_unsafe); + } + + #[test] + fn replay_request_rejects_non_object_params_json() { + let safe_endpoint = endpoint("GET", true, false); + let request = ReplayEndpointRequest { + endpoint_id: safe_endpoint.id.clone(), + params_json: Some(json!(["not", "an", "object"])), + dry_run: None, + confirm_unsafe: None, + headers: None, + body_json: None, + }; + + let error = replay_options_from_request(&safe_endpoint, &request).unwrap_err(); + assert!(error.to_string().contains("params_json")); + } +} diff --git a/crates/webclaw-server/src/routes/mod.rs b/crates/webclaw-server/src/routes/mod.rs index 01f1052..fdfacb7 100644 --- a/crates/webclaw-server/src/routes/mod.rs +++ b/crates/webclaw-server/src/routes/mod.rs @@ -9,6 +9,7 @@ pub mod batch; pub mod brand; +pub mod capture; pub mod crawl; pub mod diff; pub mod extract;