add network capture endpoint replay

This commit is contained in:
karolinnger 2026-05-16 22:22:18 -04:00
parent 72edb61881
commit cb31c70465
34 changed files with 5996 additions and 8 deletions

View file

@ -0,0 +1,21 @@
[package]
name = "webclaw-capture"
description = "Browser network capture, endpoint inference, and safe replay for Webclaw"
version.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
chromiumoxide = "0.9.1"
futures-util = "0.3"
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
url = "2"
dirs = "6"
chrono = { version = "0.4", features = ["serde"] }
sha2 = "0.10"
hex = "0.4"

View file

@ -0,0 +1,404 @@
use std::collections::HashMap;
use std::time::Duration;
use chromiumoxide::cdp::browser_protocol::network::{
EnableParams, EventLoadingFinished, EventRequestWillBeSent, EventResponseReceived,
GetResponseBodyParams, Headers, RequestId, ResourceType, TimeSinceEpoch,
};
use chromiumoxide::{Browser, BrowserConfig, Page};
use chrono::{DateTime, Utc};
use futures_util::StreamExt;
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value, json};
use tokio::sync::oneshot;
use url::Url;
use crate::infer::infer_endpoints;
use crate::store::{capture_id_for, save_capture};
use crate::types::{CaptureArtifact, CaptureError, CapturedExchange, HeaderMap, SavedCapture};
/// Upper bound, in bytes, for stored body samples; also passed to CDP as the
/// maximum post-data size when network capture is enabled.
const BODY_SAMPLE_LIMIT: usize = 64 * 1024;
/// Caller-supplied settings for a single network capture run.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct CaptureOptions {
/// Page URL to navigate to; must parse as an absolute URL.
pub url: String,
/// Optional free-form intent, copied unchanged into the saved artifact.
pub intent: Option<String>,
/// How long to keep recording after navigation returns, in milliseconds.
pub wait_ms: u64,
/// When true the browser is launched with a visible window.
pub headed: bool,
}
/// Launches a browser, records the network traffic produced by loading
/// `options.url`, infers endpoints from it, and persists the capture.
///
/// # Errors
/// Returns `CaptureError::InvalidUrl` when `options.url` does not parse, and
/// `CaptureError::Capture` for browser, page, listener, navigation, or
/// collector failures. Errors from `save_capture` are returned as-is.
pub async fn capture_network(options: CaptureOptions) -> Result<SavedCapture, CaptureError> {
let source_url =
Url::parse(&options.url).map_err(|error| CaptureError::InvalidUrl(error.to_string()))?;
let started_at = Utc::now();
let capture_id = capture_id_for(&source_url, started_at);
let (mut browser, mut handler) = launch_browser(options.headed).await?;
// Drive the browser handler stream in the background until it ends or
// reports an error.
let handler_task = tokio::spawn(async move {
while let Some(event) = handler.next().await {
if let Err(error) = event {
tracing::debug!(error = %error, "chromiumoxide browser handler stopped");
break;
}
}
});
// The capture itself runs in an inner async block so that an early `?`
// return still falls through to the browser teardown below.
let capture_result = async {
// Start on a blank page so listeners are registered before the real
// navigation begins.
let page = browser
.new_page("about:blank")
.await
.map_err(|error| CaptureError::Capture(format!("could not create page: {error}")))?;
enable_network_capture(&page).await?;
let request_events = page
.event_listener::<EventRequestWillBeSent>()
.await
.map_err(|error| {
CaptureError::Capture(format!("could not listen for network requests: {error}"))
})?;
let response_events = page
.event_listener::<EventResponseReceived>()
.await
.map_err(|error| {
CaptureError::Capture(format!("could not listen for network responses: {error}"))
})?;
let finished_events = page
.event_listener::<EventLoadingFinished>()
.await
.map_err(|error| {
CaptureError::Capture(format!("could not listen for completed requests: {error}"))
})?;
// The collector runs concurrently with navigation and stops when
// `stop_tx` fires or is dropped.
let (stop_tx, stop_rx) = oneshot::channel();
let collector_page = page.clone();
let collector_task = tokio::spawn(async move {
collect_exchanges(
collector_page,
request_events,
response_events,
finished_events,
stop_rx,
started_at,
)
.await
});
page.goto(options.url.clone()).await.map_err(|error| {
CaptureError::Capture(format!("could not navigate to {}: {error}", options.url))
})?;
// Keep recording for the requested settle time after navigation returns.
tokio::time::sleep(Duration::from_millis(options.wait_ms)).await;
// A send failure only means the collector already exited.
let _ = stop_tx.send(());
// First `map_err` covers a task join failure, the second an error
// returned by the collector itself.
let exchanges = collector_task
.await
.map_err(|error| CaptureError::Capture(format!("capture collector failed: {error}")))?
.map_err(|error| CaptureError::Capture(format!("capture collector failed: {error}")))?;
let completed_at = Utc::now();
let endpoints = infer_endpoints(&exchanges);
let exchange_count = exchanges.len();
let endpoint_count = endpoints.len();
let mut metadata = Map::new();
metadata.insert("wait_ms".to_owned(), json!(options.wait_ms));
metadata.insert("headed".to_owned(), json!(options.headed));
metadata.insert("exchange_count".to_owned(), json!(exchange_count));
metadata.insert("endpoint_count".to_owned(), json!(endpoint_count));
let artifact = CaptureArtifact {
id: capture_id,
source_url: options.url,
intent: options.intent,
started_at,
completed_at: Some(completed_at),
exchanges,
endpoints,
metadata,
};
save_capture(&artifact)
}
.await;
// Teardown is best-effort: failures are logged at debug level and never
// replace the capture result.
if let Err(error) = browser.close().await {
tracing::debug!(error = %error, "failed to close browser after capture");
}
if let Err(error) = handler_task.await {
tracing::debug!(error = %error, "failed to join browser handler after capture");
}
capture_result
}
async fn launch_browser(headed: bool) -> Result<(Browser, chromiumoxide::Handler), CaptureError> {
let mut config = BrowserConfig::builder()
.request_timeout(Duration::from_secs(15))
.no_sandbox()
.disable_cache()
.disable_https_first();
if headed {
config = config.with_head();
}
let config = config.build().map_err(|error| {
CaptureError::Capture(format!("could not build browser config: {error}"))
})?;
Browser::launch(config)
.await
.map_err(|error| CaptureError::Capture(format!("could not launch Chromium: {error}")))
}
async fn enable_network_capture(page: &Page) -> Result<(), CaptureError> {
let params = EnableParams::builder()
.max_total_buffer_size(16 * 1024 * 1024)
.max_resource_buffer_size(2 * 1024 * 1024)
.max_post_data_size(BODY_SAMPLE_LIMIT as i64)
.build();
page.execute(params).await.map_err(|error| {
CaptureError::Capture(format!("could not enable CDP network capture: {error}"))
})?;
Ok(())
}
async fn collect_exchanges(
page: Page,
mut request_events: chromiumoxide::listeners::EventStream<EventRequestWillBeSent>,
mut response_events: chromiumoxide::listeners::EventStream<EventResponseReceived>,
mut finished_events: chromiumoxide::listeners::EventStream<EventLoadingFinished>,
mut stop_rx: oneshot::Receiver<()>,
fallback_started_at: DateTime<Utc>,
) -> Result<Vec<CapturedExchange>, CaptureError> {
let mut pending = HashMap::<RequestId, PendingExchange>::new();
let mut exchanges = Vec::<CapturedExchange>::new();
loop {
tokio::select! {
_ = &mut stop_rx => break,
event = request_events.next() => {
if let Some(event) = event {
record_request(&mut pending, &event, fallback_started_at);
}
}
event = response_events.next() => {
if let Some(event) = event {
record_response(&mut pending, &event);
}
}
event = finished_events.next() => {
if let Some(event) = event
&& let Some(exchange) = finish_request(&page, &mut pending, &event).await?
{
exchanges.push(exchange);
}
}
}
}
for (_request_id, pending_exchange) in pending {
if let Some(exchange) = pending_exchange.into_exchange() {
exchanges.push(exchange);
}
}
exchanges.sort_by(|left, right| {
left.started_at
.cmp(&right.started_at)
.then_with(|| left.url.cmp(&right.url))
});
Ok(exchanges)
}
/// Records (or updates) the pending exchange for a `requestWillBeSent` event.
///
/// When the event carries a redirect response, the redirected hop is appended
/// to the exchange's redirect chain before the request fields are overwritten
/// with the new hop. The previously recorded URL is pushed only when it
/// differs from the redirect response URL, so one hop is not listed twice.
fn record_request(
    pending: &mut HashMap<RequestId, PendingExchange>,
    event: &EventRequestWillBeSent,
    fallback_started_at: DateTime<Utc>,
) {
    let request_id = event.request_id.clone();
    let mut current = pending.remove(&request_id).unwrap_or_default();
    if let Some(redirect_response) = &event.redirect_response {
        // `current.url` is overwritten below, so it can be moved out here.
        let previous_url = std::mem::take(&mut current.url);
        if !previous_url.is_empty() && previous_url != redirect_response.url {
            current.redirect_chain.push(previous_url);
        }
        current.redirect_chain.push(redirect_response.url.clone());
    }
    current.method = event.request.method.clone();
    current.url = event.request.url.clone();
    current.request_headers = headers_to_map(&event.request.headers);
    current.request_body_sample = request_body_sample(event);
    current.resource_type = event.r#type.as_ref().map(resource_type_name);
    current.started_at = wall_time_to_utc(&event.wall_time, fallback_started_at);
    current.started_monotonic = Some(*event.timestamp.inner());
    pending.insert(request_id, current);
}
/// Copies status, headers, MIME type, and resource type from a
/// `responseReceived` event onto the matching pending exchange, creating the
/// entry if no request was seen for that id.
///
/// A status that does not fit in `u16` is stored as 0.
fn record_response(
    pending: &mut HashMap<RequestId, PendingExchange>,
    event: &EventResponseReceived,
) {
    let response = &event.response;
    let entry = pending.entry(event.request_id.clone()).or_default();
    // Only fill in the URL when the request event did not provide one.
    if entry.url.is_empty() {
        entry.url = response.url.clone();
    }
    entry.status = u16::try_from(response.status).unwrap_or_default();
    entry.response_headers = headers_to_map(&response.headers);
    entry.response_mime_type = Some(response.mime_type.clone());
    entry.resource_type = Some(resource_type_name(&event.r#type));
}
/// Finalizes the pending exchange for a `loadingFinished` event.
///
/// Removes the entry from `pending`, computes its duration in milliseconds
/// from the event timestamps (clamped at zero), and attaches a sample of the
/// response body. Returns `Ok(None)` when the request id is unknown or the
/// entry lacks a method or URL.
async fn finish_request(
    page: &Page,
    pending: &mut HashMap<RequestId, PendingExchange>,
    event: &EventLoadingFinished,
) -> Result<Option<CapturedExchange>, CaptureError> {
    let mut finished = match pending.remove(&event.request_id) {
        Some(entry) => entry,
        None => return Ok(None),
    };
    if let Some(start_seconds) = finished.started_monotonic {
        let end_seconds = *event.timestamp.inner();
        let elapsed_ms = ((end_seconds - start_seconds) * 1_000.0).max(0.0);
        finished.duration_ms = elapsed_ms.round() as u64;
    }
    let body = response_body_sample(page, event.request_id.clone()).await;
    finished.response_body_sample = body;
    Ok(finished.into_exchange())
}
/// Fetches the response body for `request_id` and returns it truncated to the
/// sample limit, or `None` when the body cannot be retrieved.
///
/// NOTE(review): the returned body is stored as-is; whether CDP flagged it as
/// base64-encoded is not checked here — confirm that is intended.
async fn response_body_sample(page: &Page, request_id: RequestId) -> Option<String> {
    match page.execute(GetResponseBodyParams::new(request_id)).await {
        Ok(response) => Some(truncate_sample(response.result.body)),
        Err(_) => None,
    }
}
/// Converts CDP headers into a `HeaderMap`; anything that is not a JSON
/// object yields an empty map.
fn headers_to_map(headers: &Headers) -> HeaderMap {
    if let Value::Object(entries) = headers.inner() {
        entries.clone()
    } else {
        HeaderMap::new()
    }
}
/// Concatenates the request's post-data entries into a single truncated
/// sample, returning `None` when there are no entries or they are all empty.
///
/// NOTE(review): the entry `bytes` are appended exactly as exposed by the CDP
/// binding; if that representation is base64, the sample will not be readable
/// JSON — confirm against the chromiumoxide `Binary` type.
fn request_body_sample(event: &EventRequestWillBeSent) -> Option<String> {
    let entries = event.request.post_data_entries.as_ref()?;
    let mut combined = String::new();
    for bytes in entries.iter().filter_map(|entry| entry.bytes.as_ref()) {
        combined.push_str(bytes.as_ref());
    }
    if combined.is_empty() {
        return None;
    }
    Some(truncate_sample(combined))
}
/// Returns the string form of a CDP resource type as an owned `String`.
fn resource_type_name(resource_type: &ResourceType) -> String {
    let name: &str = resource_type.as_ref();
    name.to_owned()
}
/// Converts a CDP wall-clock time (seconds since the Unix epoch) to UTC.
///
/// Falls back to `fallback` when the value is negative, not finite, or cannot
/// be represented as a timestamp.
fn wall_time_to_utc(wall_time: &TimeSinceEpoch, fallback: DateTime<Utc>) -> DateTime<Utc> {
    let seconds = *wall_time.inner();
    if seconds.is_finite() && seconds >= 0.0 {
        let whole_seconds = seconds.trunc() as i64;
        // Rounding the fraction can reach a full second; clamp to the largest
        // valid nanosecond count.
        let rounded_nanos = (seconds.fract() * 1_000_000_000.0).round() as u32;
        let nanos = rounded_nanos.min(999_999_999);
        DateTime::<Utc>::from_timestamp(whole_seconds, nanos).unwrap_or(fallback)
    } else {
        fallback
    }
}
/// Truncates `sample` to at most `BODY_SAMPLE_LIMIT` bytes without splitting a
/// UTF-8 character.
///
/// Samples already within the limit are returned unchanged. Longer samples are
/// cut at the largest character boundary that does not exceed the limit, so
/// the result may be up to three bytes shorter than the limit but never longer.
fn truncate_sample(mut sample: String) -> String {
    if sample.len() <= BODY_SAMPLE_LIMIT {
        return sample;
    }
    let mut end = BODY_SAMPLE_LIMIT;
    // Index 0 is always a boundary, so this loop terminates.
    while !sample.is_char_boundary(end) {
        end -= 1;
    }
    sample.truncate(end);
    sample
}
/// Mutable accumulator for one network exchange while its CDP events are
/// still arriving; converted with `into_exchange` once complete.
#[derive(Debug, Clone)]
struct PendingExchange {
// Request side, filled from the request-will-be-sent event.
method: String,
url: String,
request_headers: HeaderMap,
request_body_sample: Option<String>,
resource_type: Option<String>,
// Response side; `status` stays 0 until a response event is recorded.
status: u16,
response_headers: HeaderMap,
response_body_sample: Option<String>,
// Used to synthesize a content-type header when the response has none.
response_mime_type: Option<String>,
// Wall-clock start time of the request.
started_at: DateTime<Utc>,
// CDP event timestamp of the request, used only to compute `duration_ms`.
started_monotonic: Option<f64>,
duration_ms: u64,
// URLs of redirect hops recorded before the final request.
redirect_chain: Vec<String>,
}
impl Default for PendingExchange {
    /// Creates an empty exchange whose start time is the moment of creation;
    /// every other field is empty, zero, or `None`.
    fn default() -> Self {
        let created_at = Utc::now();
        Self {
            started_at: created_at,
            started_monotonic: None,
            duration_ms: 0,
            status: 0,
            method: String::new(),
            url: String::new(),
            resource_type: None,
            request_headers: HeaderMap::new(),
            request_body_sample: None,
            response_headers: HeaderMap::new(),
            response_body_sample: None,
            response_mime_type: None,
            redirect_chain: Vec::new(),
        }
    }
}
impl PendingExchange {
    /// Converts the accumulated state into a `CapturedExchange`.
    ///
    /// Returns `None` when either the method or the URL is missing. If the
    /// response headers carry no content type, one is synthesized from the
    /// recorded MIME type. The header lookup is case-insensitive, so a
    /// `Content-Type` header sent in any casing is not duplicated under a
    /// lowercase key.
    fn into_exchange(mut self) -> Option<CapturedExchange> {
        if self.method.is_empty() || self.url.is_empty() {
            return None;
        }
        let has_content_type = self
            .response_headers
            .keys()
            .any(|name| name.eq_ignore_ascii_case("content-type"));
        if !has_content_type {
            if let Some(mime_type) = self.response_mime_type.take() {
                self.response_headers
                    .insert("content-type".to_owned(), Value::String(mime_type));
            }
        }
        Some(CapturedExchange {
            method: self.method,
            url: self.url,
            request_headers: self.request_headers,
            request_body_sample: self.request_body_sample,
            resource_type: self.resource_type,
            status: self.status,
            response_headers: self.response_headers,
            response_body_sample: self.response_body_sample,
            started_at: self.started_at,
            duration_ms: self.duration_ms,
            redirect_chain: self.redirect_chain,
        })
    }
}

View file

@ -0,0 +1,253 @@
use serde::{Deserialize, Serialize};
use serde_json::Value;
use url::Url;
use crate::types::CapturedExchange;
/// Outcome of classifying one captured exchange as API traffic or not.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ApiClassification {
/// Whether the exchange should be treated as API traffic.
pub include: bool,
/// Accumulated signal weight, capped at 1.0; 0.0 for excluded exchanges.
pub confidence: f32,
/// Human-readable explanations for the decision.
pub reasons: Vec<String>,
}
/// Decides whether a captured exchange looks like API traffic.
///
/// Exclusion rules run first (extension URLs, tracking hosts, static asset
/// extensions, static resource types); any hit yields `include: false` with
/// zero confidence. Otherwise positive signals are summed, capped at 1.0, and
/// the exchange is included when the total reaches 0.5. An unparseable URL is
/// always excluded.
pub fn classify_exchange(exchange: &CapturedExchange) -> ApiClassification {
    let parsed = match Url::parse(&exchange.url) {
        Ok(parsed) => parsed,
        Err(error) => {
            return ApiClassification {
                include: false,
                confidence: 0.0,
                reasons: vec![format!("invalid URL: {error}")],
            };
        }
    };
    let resource_type = exchange.resource_type.as_deref();
    let path = parsed.path();

    let exclusion_checks = [
        (is_browser_extension_url(&parsed), "browser extension URL"),
        (
            is_tracking_host(parsed.host_str()),
            "tracking, ad, or telemetry host",
        ),
        (has_static_asset_extension(path), "static asset extension"),
        (
            is_static_resource_type(resource_type),
            "static browser resource type",
        ),
    ];
    let exclusion_reasons: Vec<String> = exclusion_checks
        .iter()
        .filter(|(matched, _)| *matched)
        .map(|(_, reason)| (*reason).to_owned())
        .collect();
    if !exclusion_reasons.is_empty() {
        return ApiClassification {
            include: false,
            confidence: 0.0,
            reasons: exclusion_reasons,
        };
    }

    // (signal present, weight, explanation) in evaluation order.
    let signals = [
        (
            matches_resource_type(resource_type, &["fetch", "xhr"]),
            0.65_f32,
            "browser resource type is fetch/xhr",
        ),
        (
            response_is_json(exchange),
            0.55,
            "response content type is JSON",
        ),
        (has_api_path(path), 0.55, "URL path contains an API prefix"),
        (
            has_versioned_path(path),
            0.55,
            "URL path starts with a versioned API prefix",
        ),
        (has_graphql_path(path), 0.55, "URL path is GraphQL-like"),
        (
            has_graphql_body(exchange.request_body_sample.as_deref()),
            0.55,
            "request body is GraphQL-like",
        ),
    ];
    let mut total = 0.0_f32;
    let mut reasons = Vec::new();
    for (matched, weight, reason) in signals {
        if matched {
            total += weight;
            reasons.push(reason.to_owned());
        }
    }
    let confidence = total.min(1.0);
    if reasons.is_empty() {
        reasons.push("no API traffic signals found".to_owned());
    }
    ApiClassification {
        include: confidence >= 0.5,
        confidence,
        reasons,
    }
}
/// Returns clones of the exchanges that `classify_exchange` marks as included,
/// preserving their input order.
pub fn filter_api_exchanges(exchanges: &[CapturedExchange]) -> Vec<CapturedExchange> {
    let mut kept = Vec::new();
    for exchange in exchanges {
        if classify_exchange(exchange).include {
            kept.push(exchange.clone());
        }
    }
    kept
}
/// Reports whether the URL uses one of the known browser-extension schemes,
/// compared case-insensitively.
fn is_browser_extension_url(url: &Url) -> bool {
    const EXTENSION_SCHEMES: [&str; 4] = [
        "chrome-extension",
        "moz-extension",
        "edge-extension",
        "safari-extension",
    ];
    let scheme = url.scheme();
    EXTENSION_SCHEMES
        .iter()
        .any(|candidate| scheme.eq_ignore_ascii_case(candidate))
}
/// Reports whether `host` looks like a tracking, advertising, or telemetry
/// host. `None` is never a tracking host.
///
/// Vendor names are matched as case-insensitive substrings. The generic words
/// `ads`, `analytics`, and `segment` are matched only as complete DNS labels
/// that are followed by another label (for example `ads.example.com` or
/// `cdn.analytics.example.com`). Matching them as raw substrings would also
/// exclude unrelated hosts such as `uploads.example.com`.
fn is_tracking_host(host: Option<&str>) -> bool {
    const VENDOR_SUBSTRINGS: &[&str] = &[
        "google-analytics",
        "googletagmanager",
        "googlesyndication",
        "doubleclick",
        "adservice",
        "telemetry",
        "segment.io",
        "amplitude",
        "mixpanel",
        "hotjar",
        "sentry.io",
        "datadog",
        "newrelic",
    ];
    const TRACKING_LABELS: &[&str] = &["ads", "analytics", "segment"];

    let Some(host) = host else {
        return false;
    };
    let host = host.to_ascii_lowercase();
    if VENDOR_SUBSTRINGS
        .iter()
        .any(|needle| host.contains(needle))
    {
        return true;
    }
    // Drop the final label so a generic word only counts when it is followed
    // by a dot, mirroring the trailing dot of the original needles.
    let Some((leading_labels, _last_label)) = host.rsplit_once('.') else {
        return false;
    };
    leading_labels
        .split('.')
        .any(|label| TRACKING_LABELS.contains(&label))
}
/// Reports whether `path` ends with a known static-asset file extension,
/// ignoring ASCII case.
fn has_static_asset_extension(path: &str) -> bool {
    const STATIC_EXTENSIONS: &[&str] = &[
        ".png", ".jpg", ".jpeg", ".gif", ".webp", ".avif", ".svg", ".ico", ".css", ".js", ".mjs",
        ".woff", ".woff2", ".ttf", ".otf", ".eot", ".map", ".mp4", ".webm", ".mp3", ".wav",
    ];
    let lowered = path.to_ascii_lowercase();
    for extension in STATIC_EXTENSIONS {
        if lowered.ends_with(extension) {
            return true;
        }
    }
    false
}
/// Reports whether the browser resource type denotes static or non-API
/// content (images, stylesheets, scripts, fonts, media, manifests, pings,
/// CSP violation reports).
fn is_static_resource_type(resource_type: Option<&str>) -> bool {
    const STATIC_TYPES: [&str; 8] = [
        "image",
        "stylesheet",
        "script",
        "font",
        "media",
        "manifest",
        "ping",
        "cspviolationreport",
    ];
    matches_resource_type(resource_type, &STATIC_TYPES)
}
/// Reports whether `resource_type` equals any candidate, ignoring ASCII case.
/// A missing resource type never matches.
fn matches_resource_type(resource_type: Option<&str>, candidates: &[&str]) -> bool {
    resource_type.is_some_and(|actual| {
        candidates
            .iter()
            .any(|candidate| actual.eq_ignore_ascii_case(candidate))
    })
}
/// Reports whether any response `content-type` header (name compared
/// case-insensitively) has a string value containing "json".
fn response_is_json(exchange: &CapturedExchange) -> bool {
    for (name, value) in exchange.response_headers.iter() {
        if !name.eq_ignore_ascii_case("content-type") {
            continue;
        }
        if let Some(text) = header_value_as_str(value) {
            if text.to_ascii_lowercase().contains("json") {
                return true;
            }
        }
    }
    false
}
fn header_value_as_str(value: &Value) -> Option<&str> {
match value {
Value::String(value) => Some(value),
_ => None,
}
}
/// Reports whether any path segment is exactly `api`, ignoring ASCII case.
fn has_api_path(path: &str) -> bool {
    // Empty segments can never equal "api", so they need no separate filter.
    for segment in path.split('/') {
        if segment.eq_ignore_ascii_case("api") {
            return true;
        }
    }
    false
}
/// Reports whether the first non-empty path segment is a version marker:
/// the letter `v` (either case) followed by one or more ASCII digits.
fn has_versioned_path(path: &str) -> bool {
    let Some(first_segment) = path.split('/').find(|segment| !segment.is_empty()) else {
        return false;
    };
    let mut characters = first_segment.chars();
    match characters.next() {
        Some('v') | Some('V') => {
            let digits = characters.as_str();
            !digits.is_empty() && digits.chars().all(|character| character.is_ascii_digit())
        }
        _ => false,
    }
}
/// Reports whether any path segment is exactly `graphql`, ignoring ASCII case.
fn has_graphql_path(path: &str) -> bool {
    // Empty segments cannot match, so no explicit filtering is required.
    let mut segments = path.split('/');
    segments.any(|segment| segment.eq_ignore_ascii_case("graphql"))
}
/// Reports whether a request body looks like a GraphQL operation.
///
/// A JSON object counts when it has an `operationName` key or a string
/// `query` field that looks like GraphQL text. Other valid JSON never counts.
/// A body that is not valid JSON is checked directly as GraphQL text.
fn has_graphql_body(body: Option<&str>) -> bool {
    let Some(body) = body else {
        return false;
    };
    match serde_json::from_str::<Value>(body) {
        Ok(Value::Object(object)) => {
            object.contains_key("operationName")
                || object
                    .get("query")
                    .and_then(Value::as_str)
                    .is_some_and(is_graphql_query_text)
        }
        Ok(_) => false,
        Err(_) => is_graphql_query_text(body),
    }
}
/// Reports whether `text` starts (after leading whitespace) with a GraphQL
/// operation keyword.
///
/// The keyword (`query`, `mutation`, or `subscription`) must be followed by
/// whitespace, `{`, or `(`. This covers named operations, anonymous
/// operations with variable definitions such as `query($id: ID!) { … }`, and
/// keywords followed by a newline or tab. A bare keyword, or a longer word
/// such as `queryable`, does not match.
fn is_graphql_query_text(text: &str) -> bool {
    let text = text.trim_start();
    ["query", "mutation", "subscription"]
        .iter()
        .any(|keyword| {
            text.strip_prefix(keyword)
                .and_then(|rest| rest.chars().next())
                .is_some_and(|next| next.is_whitespace() || next == '{' || next == '(')
        })
}

View file

@ -0,0 +1,386 @@
use std::collections::{BTreeMap, BTreeSet};
use serde_json::{Map, Value, json};
use url::Url;
use crate::classify::filter_api_exchanges;
use crate::redact::{redact_headers, redact_url};
use crate::types::{
CapturedExchange, EndpointDefinition, EndpointExample, EndpointSafety, HeaderMap,
};
/// Groups API-like exchanges by method, origin, and normalized path template
/// and turns each group into an `EndpointDefinition`.
///
/// Exchanges whose URL does not parse are skipped. The result follows the
/// ordering of the grouping key.
pub fn infer_endpoints(exchanges: &[CapturedExchange]) -> Vec<EndpointDefinition> {
    let mut groups: BTreeMap<EndpointKey, EndpointBuilder> = BTreeMap::new();
    for exchange in filter_api_exchanges(exchanges) {
        let url = match Url::parse(&exchange.url) {
            Ok(url) => url,
            Err(_) => continue,
        };
        let key = EndpointKey {
            method: exchange.method.to_ascii_uppercase(),
            origin: url.origin().ascii_serialization(),
            path_template: normalize_path_template(url.path()),
        };
        let builder = groups.entry(key.clone()).or_insert_with(|| {
            EndpointBuilder::new(key.method, key.origin, key.path_template)
        });
        builder.add_exchange(&exchange, &url);
    }

    let mut endpoints = Vec::with_capacity(groups.len());
    for builder in groups.into_values() {
        endpoints.push(builder.into_endpoint());
    }
    endpoints
}
/// Rewrites a URL path into a template by replacing identifier-like segments
/// with `{id}`.
///
/// Empty segments are dropped, a trailing slash is kept when at least one
/// segment remains, and a path with no segments becomes `/`.
pub fn normalize_path_template(path: &str) -> String {
    let mut template = String::with_capacity(path.len() + 1);
    for segment in path.split('/').filter(|segment| !segment.is_empty()) {
        template.push('/');
        if is_identifier_segment(segment) {
            template.push_str("{id}");
        } else {
            template.push_str(segment);
        }
    }
    if template.is_empty() {
        return "/".to_owned();
    }
    // With at least one segment present, a trailing '/' on the input is a
    // genuine trailing slash rather than the root path.
    if path.ends_with('/') {
        template.push('/');
    }
    template
}
pub fn infer_json_schema(value: &Value) -> Value {
match value {
Value::Null => json!({ "type": "null" }),
Value::Bool(_) => json!({ "type": "boolean" }),
Value::Number(number) if number.is_i64() || number.is_u64() => {
json!({ "type": "integer" })
}
Value::Number(_) => json!({ "type": "number" }),
Value::String(_) => json!({ "type": "string" }),
Value::Array(items) => {
let item_schema = items
.iter()
.map(infer_json_schema)
.reduce(|left, right| merge_json_schemas(&left, &right))
.unwrap_or_else(|| json!({}));
json!({
"type": "array",
"items": item_schema
})
}
Value::Object(object) => {
let properties = object
.iter()
.map(|(key, value)| (key.clone(), infer_json_schema(value)))
.collect::<Map<_, _>>();
json!({
"type": "object",
"properties": properties
})
}
}
}
/// Builds the endpoint identifier `"<METHOD> <origin><path>"`.
///
/// The method is upper-cased, trailing slashes are trimmed from the origin,
/// and the path template is given a leading slash if it lacks one.
pub fn endpoint_id(method: &str, origin: &str, path_template: &str) -> String {
    let verb = method.to_ascii_uppercase();
    let base = origin.trim_end_matches('/');
    let path = ensure_leading_slash(path_template);
    format!("{} {}{}", verb, base, path)
}
/// Grouping key for endpoint inference; it is ordered so it can key the
/// `BTreeMap` used in `infer_endpoints`.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
struct EndpointKey {
// Upper-cased HTTP method.
method: String,
// ASCII serialization of the URL origin.
origin: String,
// Path with identifier-like segments replaced by `{id}`.
path_template: String,
}
/// Accumulates everything observed for one endpoint across its exchanges.
#[derive(Debug, Clone)]
struct EndpointBuilder {
method: String,
origin: String,
path_template: String,
// Distinct values seen per query parameter name.
query_params: BTreeMap<String, BTreeSet<String>>,
// Merged schemas inferred from JSON request/response bodies, if any parsed.
request_schema: Option<Value>,
response_schema: Option<Value>,
// De-duplicated notes about auth-related headers that were observed.
auth_evidence: BTreeSet<String>,
// One redacted example per exchange, in insertion order.
examples: Vec<EndpointExample>,
}
impl EndpointBuilder {
    /// Creates an empty builder for the given method, origin, and path template.
    fn new(method: String, origin: String, path_template: String) -> Self {
        Self {
            method,
            origin,
            path_template,
            query_params: BTreeMap::default(),
            request_schema: None,
            response_schema: None,
            auth_evidence: BTreeSet::default(),
            examples: Vec::default(),
        }
    }

    /// Folds one exchange into the builder: query parameter values, auth
    /// header evidence, merged body schemas, and a redacted example.
    ///
    /// NOTE(review): query parameter values are recorded exactly as they
    /// appear in the URL, while the example URL is redacted — confirm that
    /// storing raw values here is intended.
    fn add_exchange(&mut self, exchange: &CapturedExchange, url: &Url) {
        for (name, value) in url.query_pairs() {
            let observed_values = self.query_params.entry(name.into_owned()).or_default();
            observed_values.insert(value.into_owned());
        }

        for headers in [&exchange.request_headers, &exchange.response_headers] {
            self.record_auth_evidence(headers);
        }

        let request_body = exchange.request_body_sample.as_deref();
        let response_body = exchange.response_body_sample.as_deref();
        if let Some(schema) = infer_body_schema(request_body) {
            self.request_schema = merge_optional_schema(self.request_schema.take(), schema);
        }
        if let Some(schema) = infer_body_schema(response_body) {
            self.response_schema = merge_optional_schema(self.response_schema.take(), schema);
        }

        let example = EndpointExample {
            url: redact_url(&exchange.url),
            request_headers: redact_headers(&exchange.request_headers),
            request_body_sample: redact_body_sample(request_body),
            response_status: exchange.status,
            response_headers: redact_headers(&exchange.response_headers),
            response_body_sample: redact_body_sample(response_body),
            captured_at: exchange.started_at,
        };
        self.examples.push(example);
    }

    /// Consumes the builder and produces the final endpoint definition,
    /// including its derived id and replay-safety assessment.
    fn into_endpoint(self) -> EndpointDefinition {
        let Self {
            method,
            origin,
            path_template,
            query_params,
            request_schema,
            response_schema,
            auth_evidence,
            examples,
        } = self;
        let safety = endpoint_safety(&method);
        let id = endpoint_id(&method, &origin, &path_template);
        EndpointDefinition {
            id,
            method,
            origin,
            path_template,
            query_params: query_params
                .into_iter()
                .map(|(name, values)| (name, values.into_iter().collect()))
                .collect(),
            request_schema,
            response_schema,
            auth_evidence: auth_evidence.into_iter().collect(),
            safety,
            examples,
        }
    }

    /// Notes every header whose name suggests authentication or session state.
    fn record_auth_evidence(&mut self, headers: &HeaderMap) {
        let observed = headers
            .keys()
            .filter(|name| is_auth_evidence_header(name))
            .map(|name| format!("{name} header observed"));
        self.auth_evidence.extend(observed);
    }
}
/// Infers a schema from a body sample, returning `None` when the sample is
/// absent, blank, or not valid JSON.
fn infer_body_schema(body: Option<&str>) -> Option<Value> {
    let trimmed = body.map(str::trim).filter(|text| !text.is_empty())?;
    let parsed: Value = serde_json::from_str(trimmed).ok()?;
    Some(infer_json_schema(&parsed))
}
/// Merges `next` into an optional accumulated schema; always returns `Some`.
fn merge_optional_schema(current: Option<Value>, next: Value) -> Option<Value> {
    match current {
        None => Some(next),
        Some(existing) => Some(merge_json_schemas(&existing, &next)),
    }
}
/// Merges two inferred schemas into one that describes both.
///
/// - Identical schemas are returned unchanged.
/// - An empty schema `{}` (produced for empty arrays) carries no information,
///   so the other side wins regardless of argument order.
/// - Two objects merge property by property; two arrays merge their items.
/// - Everything else becomes a flattened, de-duplicated `oneOf`. This includes
///   the case where one side is already a `oneOf` and therefore has no `type`
///   key; previously that case discarded the accumulated variants and kept
///   only `right`.
fn merge_json_schemas(left: &Value, right: &Value) -> Value {
    if left == right {
        return left.clone();
    }
    let is_unconstrained =
        |schema: &Value| schema.as_object().is_some_and(|object| object.is_empty());
    if is_unconstrained(left) {
        return right.clone();
    }
    if is_unconstrained(right) {
        return left.clone();
    }
    let left_type = left.get("type").and_then(Value::as_str);
    let right_type = right.get("type").and_then(Value::as_str);
    match (left_type, right_type) {
        (Some("object"), Some("object")) => merge_object_schemas(left, right),
        (Some("array"), Some("array")) => {
            let left_items = left.get("items").cloned().unwrap_or_else(|| json!({}));
            let right_items = right.get("items").cloned().unwrap_or_else(|| json!({}));
            json!({
                "type": "array",
                "items": merge_json_schemas(&left_items, &right_items)
            })
        }
        _ => {
            // `push_unique_schema` flattens nested `oneOf` lists, so variants
            // accumulated by earlier merges are preserved.
            let mut variants = Vec::new();
            push_unique_schema(&mut variants, left.clone());
            push_unique_schema(&mut variants, right.clone());
            json!({ "oneOf": variants })
        }
    }
}
/// Merges two object schemas: properties present on only one side are kept
/// as-is, and properties present on both sides have their schemas merged.
fn merge_object_schemas(left: &Value, right: &Value) -> Value {
    let properties_of = |schema: &Value| {
        schema
            .get("properties")
            .and_then(Value::as_object)
            .cloned()
            .unwrap_or_default()
    };
    let mut merged = properties_of(left);
    for (name, schema) in properties_of(right) {
        let combined = match merged.remove(&name) {
            Some(existing) => merge_json_schemas(&existing, &schema),
            None => schema,
        };
        merged.insert(name, combined);
    }
    json!({
        "type": "object",
        "properties": merged
    })
}
/// Appends `schema` to `variants` unless an equal schema is already present.
/// A `oneOf` schema is flattened: each nested variant is added individually.
fn push_unique_schema(variants: &mut Vec<Value>, schema: Value) {
    let nested = schema.get("oneOf").and_then(Value::as_array).cloned();
    if let Some(nested_variants) = nested {
        for variant in nested_variants {
            push_unique_schema(variants, variant);
        }
        return;
    }
    if !variants.contains(&schema) {
        variants.push(schema);
    }
}
/// Produces the replay-safety assessment for an HTTP method: read-oriented
/// methods are replayable without confirmation, all others require it.
fn endpoint_safety(method: &str) -> EndpointSafety {
    let read_only = is_safe_method(method);
    let verb = method.to_ascii_uppercase();
    let reason = if read_only {
        format!("{} is a read-oriented HTTP method", verb)
    } else {
        format!("{} may mutate server state and requires confirmation", verb)
    };
    EndpointSafety {
        safe_to_replay: read_only,
        requires_confirmation: !read_only,
        reason,
    }
}
/// Reports whether `method` is GET, HEAD, or OPTIONS, ignoring ASCII case.
fn is_safe_method(method: &str) -> bool {
    const READ_ONLY_METHODS: [&str; 3] = ["GET", "HEAD", "OPTIONS"];
    READ_ONLY_METHODS
        .iter()
        .any(|candidate| method.eq_ignore_ascii_case(candidate))
}
/// Redacts a body sample for storage in an endpoint example.
///
/// JSON bodies are parsed, passed through `redact_json`, and re-serialized.
/// NOTE(review): bodies that fail to parse as JSON are kept verbatim —
/// confirm this is acceptable for non-JSON payloads that may carry secrets.
fn redact_body_sample(sample: Option<&str>) -> Option<String> {
    let body = sample?;
    let redacted = match serde_json::from_str::<Value>(body) {
        Ok(parsed) => crate::redact::redact_json(&parsed).to_string(),
        Err(_) => body.to_owned(),
    };
    Some(redacted)
}
/// Reports whether a header name hints at authentication or session state.
///
/// The name is lower-cased and stripped of every non-alphanumeric character,
/// then searched for each keyword in the same stripped form. Stripping both
/// sides makes separators irrelevant, so `X-API-Key`, `x_api_key`, and
/// `XApiKey` all match. Any name containing a keyword verbatim also contains
/// it in stripped form, so the single comparison covers both spellings.
fn is_auth_evidence_header(name: &str) -> bool {
    const COMPACT_KEYWORDS: [&str; 7] = [
        "authorization",
        "cookie",
        "setcookie",
        "apikey",
        "csrf",
        "token",
        "session",
    ];
    let compact_name: String = name
        .chars()
        .filter(|character| character.is_ascii_alphanumeric())
        .map(|character| character.to_ascii_lowercase())
        .collect();
    COMPACT_KEYWORDS
        .iter()
        .any(|keyword| compact_name.contains(keyword))
}
fn is_identifier_segment(segment: &str) -> bool {
is_numeric_segment(segment) || is_uuid_like_segment(segment) || is_high_entropy_segment(segment)
}
/// Reports whether the segment is non-empty and made only of ASCII digits.
fn is_numeric_segment(segment: &str) -> bool {
    if segment.is_empty() {
        return false;
    }
    // Bytes of non-ASCII characters are never ASCII digits, so a byte-wise
    // check is equivalent to a per-character one.
    segment.bytes().all(|byte| byte.is_ascii_digit())
}
/// Reports whether the segment has the 8-4-4-4-12 hyphenated hexadecimal
/// layout of a UUID.
fn is_uuid_like_segment(segment: &str) -> bool {
    const GROUP_LENGTHS: [usize; 5] = [8, 4, 4, 4, 12];
    let mut groups = segment.split('-');
    for expected_length in GROUP_LENGTHS {
        match groups.next() {
            Some(group)
                if group.len() == expected_length
                    && group.chars().all(|character| character.is_ascii_hexdigit()) => {}
            _ => return false,
        }
    }
    // Any additional hyphen-separated group breaks the layout.
    groups.next().is_none()
}
/// Reports whether the segment looks like an opaque token: at least 16 bytes
/// long, made only of ASCII letters, digits, `_`, `-`, or `~`, and containing
/// at least one letter and one digit.
fn is_high_entropy_segment(segment: &str) -> bool {
    if segment.len() < 16 {
        return false;
    }
    let mut saw_digit = false;
    let mut saw_letter = false;
    for character in segment.chars() {
        if character.is_ascii_digit() {
            saw_digit = true;
        } else if character.is_ascii_alphabetic() {
            saw_letter = true;
        } else if !matches!(character, '_' | '-' | '~') {
            return false;
        }
    }
    saw_digit && saw_letter
}
/// Returns `path` with a single `/` prepended when it does not already start
/// with one.
fn ensure_leading_slash(path: &str) -> String {
    let mut normalized = String::with_capacity(path.len() + 1);
    if !path.starts_with('/') {
        normalized.push('/');
    }
    normalized.push_str(path);
    normalized
}

View file

@ -0,0 +1,8 @@
pub mod cdp;
pub mod classify;
pub mod infer;
pub mod openapi;
pub mod redact;
pub mod replay;
pub mod store;
pub mod types;

View file

@ -0,0 +1,463 @@
use std::fs;
use std::path::{Component, Path, PathBuf};
use serde_json::{Map, Value, json};
use url::Url;
use crate::redact::{redact_headers, redact_json};
use crate::store::{capture_root, load_endpoints};
use crate::types::{CaptureError, EndpointDefinition, EndpointExample};
/// File name of the exported OpenAPI document inside a capture directory.
const OPENAPI_FILE: &str = "openapi.json";
/// Placeholder substituted for values that must not appear in exports.
const REDACTED: &str = "[REDACTED]";
/// Builds an OpenAPI 3.1.0 document describing the given endpoints.
///
/// Operations are grouped under their normalized path and keyed by lower-case
/// method.
/// NOTE(review): paths are keyed without the origin, so two endpoints sharing
/// a path and method on different origins would overwrite each other —
/// confirm whether captures can span multiple origins.
pub fn export_openapi(endpoints: &[EndpointDefinition]) -> Value {
    let mut paths = Map::new();
    for endpoint in endpoints {
        let operation = operation_for(endpoint);
        let slot = paths
            .entry(normalize_openapi_path(&endpoint.path_template))
            .or_insert_with(|| Value::Object(Map::new()));
        if let Some(path_item) = slot.as_object_mut() {
            path_item.insert(endpoint.method.to_ascii_lowercase(), operation);
        }
    }
    json!({
        "openapi": "3.1.0",
        "info": {
            "title": "Webclaw Learned API",
            "version": "1.0.0"
        },
        "paths": paths
    })
}
/// Exports the stored endpoints of a capture as pretty-printed OpenAPI JSON
/// inside that capture's directory and returns the written file path.
///
/// # Errors
/// Propagates failures from loading endpoints, resolving or creating the
/// capture directory, serializing the document, and writing the file.
pub fn write_openapi(capture_id: &str) -> Result<PathBuf, CaptureError> {
    let document = export_openapi(&load_endpoints(capture_id)?);
    let capture_dir = capture_dir_for_id(&capture_root(), capture_id)?;
    fs::create_dir_all(&capture_dir)?;
    let serialized = serde_json::to_string_pretty(&document)?;
    let path = capture_dir.join(OPENAPI_FILE);
    fs::write(&path, serialized)?;
    Ok(path)
}
/// Builds the OpenAPI operation object for one endpoint.
///
/// Always emits the operation id, summary, endpoint id, origin, and
/// responses. Auth evidence, the confirmation flag, parameters, request body,
/// and captured examples are emitted only when present.
fn operation_for(endpoint: &EndpointDefinition) -> Value {
    let method = endpoint.method.to_ascii_uppercase();
    let mut operation = Map::new();

    let text_fields = [
        ("operationId", operation_id(endpoint)),
        ("summary", format!("{method} {}", endpoint.path_template)),
        ("x-webclaw-endpoint-id", endpoint.id.clone()),
        ("x-webclaw-origin", endpoint.origin.clone()),
    ];
    for (key, text) in text_fields {
        operation.insert(key.to_owned(), Value::String(text));
    }

    if !endpoint.auth_evidence.is_empty() {
        operation.insert(
            "x-webclaw-auth-evidence".to_owned(),
            json!(endpoint.auth_evidence),
        );
    }

    // Flag anything that is not plainly safe to replay without asking.
    let plainly_safe = endpoint.safety.safe_to_replay && !endpoint.safety.requires_confirmation;
    if !plainly_safe {
        operation.insert("x-webclaw-requires-confirmation".to_owned(), json!(true));
    }

    let parameters = parameters_for(endpoint);
    if !parameters.is_empty() {
        operation.insert("parameters".to_owned(), Value::Array(parameters));
    }
    if let Some(request_body) = request_body_for(endpoint) {
        operation.insert("requestBody".to_owned(), request_body);
    }
    operation.insert("responses".to_owned(), responses_for(endpoint));

    let examples = examples_for(endpoint);
    if !examples.is_empty() {
        operation.insert("x-webclaw-examples".to_owned(), Value::Array(examples));
    }
    Value::Object(operation)
}
/// Lists the OpenAPI parameters of an endpoint: path parameters from the
/// template first, then one optional string query parameter per observed
/// name, with redacted example values.
fn parameters_for(endpoint: &EndpointDefinition) -> Vec<Value> {
    let mut parameters = path_parameters(&endpoint.path_template);
    for (name, values) in &endpoint.query_params {
        let redacted_values = values
            .iter()
            .map(|value| Value::String(redacted_parameter_value(name, value)));
        let examples = examples_object(redacted_values);

        let mut parameter = Map::new();
        parameter.insert("name".to_owned(), Value::String(name.clone()));
        parameter.insert("in".to_owned(), Value::String("query".to_owned()));
        parameter.insert("required".to_owned(), Value::Bool(false));
        parameter.insert("schema".to_owned(), json!({ "type": "string" }));
        if !examples.is_empty() {
            parameter.insert("examples".to_owned(), Value::Object(examples));
        }
        parameters.push(Value::Object(parameter));
    }
    parameters
}
/// Extracts the `{name}` placeholders of a path template as required string
/// path parameters, skipping empty names and names already emitted.
fn path_parameters(path_template: &str) -> Vec<Value> {
    let mut parameters = Vec::new();
    let mut remaining = path_template;
    while let Some((_, after_open)) = remaining.split_once('{') {
        // An unterminated placeholder ends the scan.
        let Some((name, after_close)) = after_open.split_once('}') else {
            break;
        };
        let already_listed = parameters
            .iter()
            .any(|parameter| parameter_name(parameter) == name);
        if !name.is_empty() && !already_listed {
            parameters.push(json!({
                "name": name,
                "in": "path",
                "required": true,
                "schema": { "type": "string" }
            }));
        }
        remaining = after_close;
    }
    parameters
}
fn request_body_for(endpoint: &EndpointDefinition) -> Option<Value> {
let examples = body_examples(endpoint.examples.iter().filter_map(|example| {
example
.request_body_sample
.as_deref()
.map(redacted_body_sample)
}));
if endpoint.request_schema.is_none() && examples.is_empty() {
return None;
}
Some(json!({
"required": false,
"content": {
"application/json": media_type_object(endpoint.request_schema.clone(), examples)
}
}))
}
fn responses_for(endpoint: &EndpointDefinition) -> Value {
let mut responses = Map::new();
let mut statuses = endpoint
.examples
.iter()
.map(|example| example.response_status)
.collect::<Vec<_>>();
statuses.sort_unstable();
statuses.dedup();
if statuses.is_empty() {
statuses.push(200);
}
for status in statuses {
let examples = body_examples(
endpoint
.examples
.iter()
.filter(move |example| example.response_status == status)
.filter_map(|example| {
example
.response_body_sample
.as_deref()
.map(redacted_body_sample)
}),
);
responses.insert(
status.to_string(),
json!({
"description": format!("Captured HTTP {status} response"),
"content": {
"application/json": media_type_object(endpoint.response_schema.clone(), examples)
}
}),
);
}
Value::Object(responses)
}
/// Assembles an OpenAPI media-type object from an optional schema (passed
/// through `redact_json`) and a possibly empty set of examples.
fn media_type_object(schema: Option<Value>, examples: Map<String, Value>) -> Value {
    let mut media_type = Map::new();
    match schema {
        Some(schema) => {
            media_type.insert("schema".to_owned(), redact_json(&schema));
        }
        None => {}
    }
    if !examples.is_empty() {
        media_type.insert("examples".to_owned(), Value::Object(examples));
    }
    Value::Object(media_type)
}
fn examples_for(endpoint: &EndpointDefinition) -> Vec<Value> {
endpoint.examples.iter().map(redacted_example).collect()
}
fn redacted_example(example: &EndpointExample) -> Value {
json!({
"url": redacted_example_url(&example.url),
"request_headers": redact_headers(&example.request_headers),
"request_body": example.request_body_sample.as_deref().map(redacted_body_sample),
"response_status": example.response_status,
"response_headers": redact_headers(&example.response_headers),
"response_body": example.response_body_sample.as_deref().map(redacted_body_sample),
"captured_at": example.captured_at
})
}
/// Returns `url` with every query value passed through
/// `redacted_parameter_value`.
///
/// Input that does not parse as a URL is returned unchanged; a URL without
/// query pairs is returned in its parsed (normalized) string form.
fn redacted_example_url(url: &str) -> String {
    let mut parsed = match Url::parse(url) {
        Ok(parsed) => parsed,
        Err(_) => return url.to_owned(),
    };

    let redacted_pairs: Vec<(String, String)> = parsed
        .query_pairs()
        .map(|(name, value)| {
            let shown = redacted_parameter_value(&name, &value);
            (name.into_owned(), shown)
        })
        .collect();
    if redacted_pairs.is_empty() {
        return parsed.to_string();
    }

    // Rebuild the query from scratch so no original value survives.
    parsed.set_query(None);
    let mut query = parsed.query_pairs_mut();
    for (name, value) in &redacted_pairs {
        query.append_pair(name, value);
    }
    drop(query);
    parsed.to_string()
}
/// Wraps body sample values as a map of named examples.
///
/// Thin alias for `examples_object`, kept so call sites dealing with request
/// and response bodies read clearly.
fn body_examples(values: impl Iterator<Item = Value>) -> Map<String, Value> {
examples_object(values)
}
fn examples_object(values: impl Iterator<Item = Value>) -> Map<String, Value> {
let mut examples = Map::new();
for (index, value) in values.enumerate() {
examples.insert(format!("captured-{}", index + 1), json!({ "value": value }));
}
examples
}
/// Converts a captured body sample into a redacted JSON value.
///
/// JSON bodies are parsed and passed through `redact_json`. Anything else is
/// kept as a string, unless it contains an obvious secret, in which case the
/// whole sample is replaced by the redaction marker.
fn redacted_body_sample(sample: &str) -> Value {
    if let Ok(parsed) = serde_json::from_str::<Value>(sample) {
        return redact_json(&parsed);
    }
    Value::String(if contains_obvious_secret(sample) {
        REDACTED.to_owned()
    } else {
        sample.to_owned()
    })
}
/// Heuristic check for text that appears to carry credentials or personal
/// data: a case-insensitive search for well-known markers, plus an
/// email-shaped value.
fn contains_obvious_secret(value: &str) -> bool {
    const MARKERS: [&str; 9] = [
        "bearer ",
        "authorization",
        "api_key",
        "api-key",
        "csrf",
        "token",
        "session",
        "password",
        "cookie",
    ];
    let lowered = value.to_ascii_lowercase();
    MARKERS.iter().any(|marker| lowered.contains(*marker)) || contains_email_like_value(value)
}
/// Returns the value to publish for a query parameter: the original value when
/// neither its name nor its content looks sensitive, the redaction marker
/// otherwise.
fn redacted_parameter_value(name: &str, value: &str) -> String {
    if !is_sensitive_name(name) && !contains_obvious_secret(value) {
        return value.to_owned();
    }
    REDACTED.to_owned()
}
/// Reports whether a header/parameter/key name looks sensitive.
///
/// The name is compared case-insensitively against a fixed list of fragments,
/// both as written and with every non-alphanumeric character stripped, so
/// `api_key`, `Api-Key` and `apiKey` are all caught by the `api-key` entry.
fn is_sensitive_name(name: &str) -> bool {
    const FRAGMENTS: [&str; 9] = [
        "authorization",
        "cookie",
        "set-cookie",
        "api-key",
        "csrf",
        "token",
        "session",
        "password",
        "email",
    ];
    let alphanumeric_only = |text: &str| -> String {
        text.chars()
            .filter(|character| character.is_ascii_alphanumeric())
            .collect()
    };

    let lowered = name.to_ascii_lowercase();
    let squashed = alphanumeric_only(&lowered);
    for fragment in FRAGMENTS.iter() {
        if lowered.contains(*fragment) || squashed.contains(&alphanumeric_only(fragment)) {
            return true;
        }
    }
    false
}
/// Returns `true` when `value` contains something shaped like an email
/// address.
///
/// Every `@` in the input is examined, not only the first one, so a stray
/// leading `@` (a mention, a handle, ...) cannot hide a real address later in
/// the same string. An `@` counts as an address when:
///
/// * the character directly before it is a local-part character (ASCII
///   alphanumeric or one of `.`, `_`, `%`, `+`, `-`), and
/// * the run of domain characters directly after it (ASCII alphanumeric, `.`
///   or `-`) contains at least one `.`.
fn contains_email_like_value(value: &str) -> bool {
    value.match_indices('@').any(|(at_index, _)| {
        let has_local_part = value[..at_index].chars().next_back().is_some_and(|character| {
            character.is_ascii_alphanumeric() || matches!(character, '.' | '_' | '%' | '+' | '-')
        });
        has_local_part
            && value[at_index + 1..]
                .chars()
                .take_while(|character| {
                    character.is_ascii_alphanumeric() || matches!(character, '.' | '-')
                })
                .any(|character| character == '.')
    })
}
/// Derives an OpenAPI `operationId` from the endpoint's method and path
/// template.
///
/// The result is `<method>_<path>` in lowercase, where every
/// non-alphanumeric path character becomes `_` and leading/trailing
/// underscores are trimmed.
fn operation_id(endpoint: &EndpointDefinition) -> String {
    let mut identifier = endpoint.method.to_ascii_lowercase();
    identifier.push('_');
    identifier.extend(
        endpoint
            .path_template
            .trim_matches('/')
            .chars()
            .map(|character| match character {
                c if c.is_ascii_alphanumeric() => c.to_ascii_lowercase(),
                _ => '_',
            }),
    );
    identifier.trim_matches('_').to_owned()
}
/// Returns `path_template` with a leading `/`, adding one only when it is
/// missing.
fn normalize_openapi_path(path_template: &str) -> String {
    let mut normalized = String::with_capacity(path_template.len() + 1);
    if !path_template.starts_with('/') {
        normalized.push('/');
    }
    normalized.push_str(path_template);
    normalized
}
/// Reads the `name` field of a parameter object, yielding `""` when the field
/// is missing or is not a string.
fn parameter_name(parameter: &Value) -> &str {
    match parameter.get("name") {
        Some(Value::String(name)) => name.as_str(),
        _ => "",
    }
}
/// Resolves a capture id such as `host/timestamp` to a directory below `root`.
///
/// The id is split on `/` and `\`; empty segments are ignored.
///
/// # Errors
///
/// Returns `CaptureError::Storage` when the id has no segments, when any
/// segment is unsafe (see `is_safe_path_segment`), or when the resulting path
/// would not stay below `root`.
fn capture_dir_for_id(root: &Path, capture_id: &str) -> Result<PathBuf, CaptureError> {
    let mut segments = capture_id
        .split(['/', '\\'])
        .filter(|segment| !segment.is_empty())
        .peekable();
    if segments.peek().is_none() {
        return Err(CaptureError::Storage(
            "capture id cannot be empty".to_owned(),
        ));
    }

    let mut capture_dir = root.to_path_buf();
    for segment in segments {
        if !is_safe_path_segment(segment) {
            return Err(CaptureError::Storage(format!(
                "capture id contains unsafe path segment: {capture_id}"
            )));
        }
        capture_dir.push(segment);
    }

    ensure_within_root(root, &capture_dir)?;
    Ok(capture_dir)
}
/// Verifies, by comparing path components lexically, that `path` starts with
/// `root`.
///
/// # Errors
///
/// Returns `CaptureError::Storage` naming the offending path otherwise.
fn ensure_within_root(root: &Path, path: &Path) -> Result<(), CaptureError> {
    let root_components = relative_components(root);
    let path_components = relative_components(path);
    if !path_components.starts_with(&root_components) {
        return Err(CaptureError::Storage(format!(
            "capture path escapes capture root: {}",
            path.display()
        )));
    }
    Ok(())
}
/// Flattens a path into comparable string components.
///
/// `.` components are dropped, the root directory is rendered as `\`, `..` is
/// kept verbatim, and prefixes and normal components are converted lossily to
/// UTF-8.
fn relative_components(path: &Path) -> Vec<String> {
    let mut parts = Vec::new();
    for component in path.components() {
        let text = match component {
            Component::CurDir => continue,
            Component::RootDir => String::from("\\"),
            Component::ParentDir => String::from(".."),
            Component::Prefix(prefix) => prefix.as_os_str().to_string_lossy().into_owned(),
            Component::Normal(name) => name.to_string_lossy().into_owned(),
        };
        parts.push(text);
    }
    parts
}
/// Accepts a single path segment only when it cannot change directory or
/// drive: it must be non-empty, not `.` or `..`, and free of `:`, `/` and `\`.
fn is_safe_path_segment(segment: &str) -> bool {
    if matches!(segment, "" | "." | "..") {
        return false;
    }
    segment
        .chars()
        .all(|character| !matches!(character, ':' | '/' | '\\'))
}

View file

@ -0,0 +1,236 @@
use std::collections::BTreeMap;
use serde_json::{Map, Value};
use url::Url;
use crate::types::{
CaptureArtifact, CapturedExchange, EndpointDefinition, EndpointExample, HeaderMap,
};
/// Placeholder written wherever a sensitive value has been removed.
const REDACTED: &str = "[REDACTED]";
/// Lowercase name fragments that mark a header, query parameter or JSON key as
/// sensitive. `is_sensitive_name` matches them as substrings, both as written
/// and with non-alphanumeric characters stripped.
const SENSITIVE_NAMES: &[&str] = &[
"authorization",
"cookie",
"set-cookie",
"api-key",
"csrf",
"token",
"session",
"password",
"email",
];
pub fn redact_headers(headers: &HeaderMap) -> HeaderMap {
headers
.iter()
.map(|(name, value)| {
let value = if is_sensitive_name(name) {
Value::String(REDACTED.to_owned())
} else {
value.clone()
};
(name.clone(), value)
})
.collect()
}
/// Returns `url` with the values of sensitively named query parameters
/// replaced by the redaction marker.
///
/// Input that does not parse as a URL is returned unchanged; a URL without
/// query pairs is returned in its parsed (normalized) string form.
pub fn redact_url(url: &str) -> String {
    let mut parsed = match Url::parse(url) {
        Ok(parsed) => parsed,
        Err(_) => return url.to_owned(),
    };

    let redacted_pairs: Vec<(String, String)> = parsed
        .query_pairs()
        .map(|(name, value)| {
            let shown = if is_sensitive_name(&name) {
                REDACTED.to_owned()
            } else {
                value.into_owned()
            };
            (name.into_owned(), shown)
        })
        .collect();
    if redacted_pairs.is_empty() {
        return parsed.to_string();
    }

    // Rebuild the query from scratch so no original value survives.
    parsed.set_query(None);
    let mut query = parsed.query_pairs_mut();
    for (name, value) in &redacted_pairs {
        query.append_pair(name, value);
    }
    drop(query);
    parsed.to_string()
}
/// Recursively redacts a JSON value.
///
/// Objects are redacted key by key, arrays element by element; scalars are
/// returned as-is because only key names decide what is sensitive.
pub fn redact_json(value: &Value) -> Value {
    if let Value::Object(fields) = value {
        return Value::Object(redact_json_object(fields));
    }
    if let Value::Array(items) = value {
        let redacted_items = items.iter().map(redact_json).collect();
        return Value::Array(redacted_items);
    }
    value.clone()
}
pub fn redact_artifact(artifact: &CaptureArtifact) -> CaptureArtifact {
let metadata = match redact_json(&Value::Object(artifact.metadata.clone())) {
Value::Object(metadata) => metadata,
_ => Map::new(),
};
CaptureArtifact {
id: artifact.id.clone(),
source_url: redact_url(&artifact.source_url),
intent: artifact.intent.clone(),
started_at: artifact.started_at,
completed_at: artifact.completed_at,
exchanges: artifact.exchanges.iter().map(redact_exchange).collect(),
endpoints: artifact.endpoints.iter().map(redact_endpoint).collect(),
metadata,
}
}
/// Redacts one captured exchange: its URL, redirect chain, both header maps
/// and both body samples. All other fields are copied unchanged.
fn redact_exchange(exchange: &CapturedExchange) -> CapturedExchange {
    let mut redirect_chain = Vec::with_capacity(exchange.redirect_chain.len());
    for redirect in &exchange.redirect_chain {
        redirect_chain.push(redact_url(redirect));
    }
    let request_body_sample = redact_body_sample(exchange.request_body_sample.as_deref());
    let response_body_sample = redact_body_sample(exchange.response_body_sample.as_deref());

    CapturedExchange {
        method: exchange.method.clone(),
        url: redact_url(&exchange.url),
        request_headers: redact_headers(&exchange.request_headers),
        request_body_sample,
        resource_type: exchange.resource_type.clone(),
        status: exchange.status,
        response_headers: redact_headers(&exchange.response_headers),
        response_body_sample,
        started_at: exchange.started_at,
        duration_ms: exchange.duration_ms,
        redirect_chain,
    }
}
fn redact_endpoint(endpoint: &EndpointDefinition) -> EndpointDefinition {
EndpointDefinition {
id: endpoint.id.clone(),
method: endpoint.method.clone(),
origin: endpoint.origin.clone(),
path_template: endpoint.path_template.clone(),
query_params: redact_query_params(&endpoint.query_params),
request_schema: endpoint.request_schema.as_ref().map(redact_json),
response_schema: endpoint.response_schema.as_ref().map(redact_json),
auth_evidence: endpoint.auth_evidence.clone(),
safety: endpoint.safety.clone(),
examples: endpoint
.examples
.iter()
.map(redact_endpoint_example)
.collect(),
}
}
/// Redacts one endpoint example: its URL, both header maps and both body
/// samples. Status and capture time are copied unchanged.
fn redact_endpoint_example(example: &EndpointExample) -> EndpointExample {
    let request_body_sample = redact_body_sample(example.request_body_sample.as_deref());
    let response_body_sample = redact_body_sample(example.response_body_sample.as_deref());
    let request_headers = redact_headers(&example.request_headers);
    let response_headers = redact_headers(&example.response_headers);

    EndpointExample {
        url: redact_url(&example.url),
        request_headers,
        request_body_sample,
        response_status: example.response_status,
        response_headers,
        response_body_sample,
        captured_at: example.captured_at,
    }
}
/// Returns a copy of the observed query parameters in which every sensitively
/// named parameter keeps its name but has its values collapsed to a single
/// redaction marker.
fn redact_query_params(params: &BTreeMap<String, Vec<String>>) -> BTreeMap<String, Vec<String>> {
    let mut redacted = BTreeMap::new();
    for (name, values) in params {
        let shown = if is_sensitive_name(name) {
            vec![REDACTED.to_owned()]
        } else {
            values.clone()
        };
        redacted.insert(name.clone(), shown);
    }
    redacted
}
fn redact_json_object(object: &Map<String, Value>) -> Map<String, Value> {
object
.iter()
.map(|(key, value)| {
let value = if is_sensitive_name(key) {
Value::String(REDACTED.to_owned())
} else {
redact_json(value)
};
(key.clone(), value)
})
.collect()
}
/// Redacts an optional body sample.
///
/// A JSON body is redacted structurally and re-serialized compactly; any other
/// body is redacted line by line as text. `None` stays `None`.
fn redact_body_sample(sample: Option<&str>) -> Option<String> {
    let body = sample?;
    let redacted = if let Ok(parsed) = serde_json::from_str::<Value>(body) {
        redact_json(&parsed).to_string()
    } else {
        redact_text_body(body)
    };
    Some(redacted)
}
/// Reports whether `name` contains any `SENSITIVE_NAMES` fragment.
///
/// Matching is case-insensitive and is attempted twice per fragment: on the
/// name as written, and with every non-alphanumeric character stripped from
/// both sides so separator variants (`api_key`, `apiKey`) still match.
fn is_sensitive_name(name: &str) -> bool {
    let alphanumeric_only = |text: &str| -> String {
        text.chars().filter(|ch| ch.is_ascii_alphanumeric()).collect()
    };

    let lowered = name.to_ascii_lowercase();
    let squashed = alphanumeric_only(&lowered);
    for fragment in SENSITIVE_NAMES.iter() {
        if lowered.contains(*fragment) || squashed.contains(&alphanumeric_only(fragment)) {
            return true;
        }
    }
    false
}
/// Redacts a non-JSON body line by line: every line judged sensitive is
/// replaced by the redaction marker, and lines are re-joined with `\n`.
fn redact_text_body(body: &str) -> String {
    let mut redacted = String::with_capacity(body.len());
    for (index, line) in body.lines().enumerate() {
        if index > 0 {
            redacted.push('\n');
        }
        redacted.push_str(if is_sensitive_text_line(line) {
            REDACTED
        } else {
            line
        });
    }
    redacted
}
fn is_sensitive_text_line(line: &str) -> bool {
is_sensitive_name(line) || contains_bearer_token(line) || contains_email_like_value(line)
}
/// Case-insensitive search for the `bearer ` scheme prefix (including the
/// trailing space) anywhere in `line`.
fn contains_bearer_token(line: &str) -> bool {
    const SCHEME: &[u8] = b"bearer ";
    // The needle is pure ASCII, so a byte-window comparison that ignores ASCII
    // case is equivalent to lowercasing the line and searching it.
    line.as_bytes()
        .windows(SCHEME.len())
        .any(|window| window.eq_ignore_ascii_case(SCHEME))
}
/// Returns `true` when `line` contains something shaped like an email address.
///
/// Every `@` in the line is examined, not only the first one, so a stray
/// leading `@` (a mention, a handle, ...) cannot hide a real address later in
/// the same line. An `@` counts as an address when:
///
/// * the character directly before it is a local-part character (ASCII
///   alphanumeric or one of `.`, `_`, `%`, `+`, `-`), and
/// * the run of domain characters directly after it (ASCII alphanumeric, `.`
///   or `-`) contains at least one `.`.
fn contains_email_like_value(line: &str) -> bool {
    line.match_indices('@').any(|(at_index, _)| {
        let has_local_part = line[..at_index].chars().next_back().is_some_and(|character| {
            character.is_ascii_alphanumeric() || matches!(character, '.' | '_' | '%' | '+' | '-')
        });
        has_local_part
            && line[at_index + 1..]
                .chars()
                .take_while(|character| {
                    character.is_ascii_alphanumeric() || matches!(character, '.' | '-')
                })
                .any(|character| character == '.')
    })
}

View file

@ -0,0 +1,383 @@
use std::collections::BTreeSet;
use reqwest::{
Client, Method, RequestBuilder,
header::{HeaderName, HeaderValue},
};
use serde_json::{Map, Value};
use url::{Url, form_urlencoded::byte_serialize};
use crate::types::{CaptureError, EndpointDefinition, HeaderMap, ReplayOptions, ReplayResult};
/// Upper bound, in bytes, on how much of a replayed response body is kept as a
/// sample (64 KiB).
const MAX_BODY_SAMPLE_BYTES: usize = 64 * 1024;
/// Replays a captured endpoint, or previews or refuses the replay.
///
/// * Unsafe endpoints that are neither dry-run nor confirmed yield
///   `ReplayResult::Blocked`.
/// * With `options.dry_run`, the request that would be sent is returned as
///   `ReplayResult::Preview` and nothing goes over the network.
/// * Otherwise the request is sent and the status, headers and a capped body
///   sample are returned as `ReplayResult::Executed`.
///
/// # Errors
///
/// Fails when the replay request cannot be built or when sending it or reading
/// the response body fails.
pub async fn replay_endpoint(
    endpoint: &EndpointDefinition,
    options: ReplayOptions,
) -> Result<ReplayResult, CaptureError> {
    if unsafe_replay_requires_confirmation(endpoint, &options) {
        let method = endpoint.method.to_ascii_uppercase();
        let reason = format!("{method} replay requires --confirm-unsafe unless --dry-run is used");
        return Ok(ReplayResult::Blocked { reason });
    }

    let spec = replay_spec(endpoint, &options)?;
    if !options.dry_run {
        let response = request_builder_from_spec(spec)?.send().await?;
        let status = response.status().as_u16();
        let headers = response_headers_to_json(response.headers());
        let body = response.bytes().await?;
        return Ok(ReplayResult::Executed {
            status,
            headers,
            body_sample: body_sample_from_bytes(&body),
        });
    }

    Ok(ReplayResult::Preview {
        method: spec.method.as_str().to_owned(),
        url: spec.url.to_string(),
        headers: spec.headers,
        body_sample: spec.body_sample,
    })
}
/// Builds, without sending, the HTTP request that would replay `endpoint`.
///
/// # Errors
///
/// Returns `CaptureError::Replay` when the endpoint is unsafe and the options
/// neither request a dry run nor confirm the unsafe replay, and propagates any
/// failure to assemble the request.
pub fn build_replay_request(
    endpoint: &EndpointDefinition,
    options: &ReplayOptions,
) -> Result<RequestBuilder, CaptureError> {
    if !unsafe_replay_requires_confirmation(endpoint, options) {
        let spec = replay_spec(endpoint, options)?;
        return request_builder_from_spec(spec);
    }
    let method = endpoint.method.to_ascii_uppercase();
    Err(CaptureError::Replay(format!(
        "{method} replay requires confirmation"
    )))
}
/// Fully resolved description of a replay request, shared by the dry-run
/// preview and the real request builder.
#[derive(Debug, Clone)]
struct ReplaySpec {
/// Parsed HTTP method of the endpoint.
method: Method,
/// Target URL with path placeholders interpolated and query parameters applied.
url: Url,
/// Headers to send, after filtering out skipped and redacted entries.
headers: HeaderMap,
/// Request body text, if any.
body_sample: Option<String>,
}
fn replay_spec(
endpoint: &EndpointDefinition,
options: &ReplayOptions,
) -> Result<ReplaySpec, CaptureError> {
let method = Method::from_bytes(endpoint.method.as_bytes()).map_err(|error| {
CaptureError::Replay(format!(
"invalid replay method {:?}: {error}",
endpoint.method
))
})?;
let (path, consumed_params) = interpolate_path_template(&endpoint.path_template, options)?;
let mut url = Url::parse(&format!(
"{}{}",
endpoint.origin.trim_end_matches('/'),
ensure_leading_slash(&path)
))
.map_err(|error| CaptureError::InvalidUrl(error.to_string()))?;
apply_query_params(&mut url, endpoint, options, &consumed_params);
let mut headers = HeaderMap::new();
if let Some(example) = endpoint.examples.first() {
merge_safe_headers(&mut headers, &example.request_headers);
}
merge_safe_headers(&mut headers, &options.headers);
let body_sample = replay_body_sample(endpoint, options)?;
Ok(ReplaySpec {
method,
url,
headers,
body_sample,
})
}
/// Turns a `ReplaySpec` into a `reqwest` request builder on a fresh client.
///
/// Headers whose value is not a scalar, or whose name or value is not valid
/// for HTTP, are silently skipped. The body is attached only when it contains
/// no redaction marker.
fn request_builder_from_spec(spec: ReplaySpec) -> Result<RequestBuilder, CaptureError> {
    let mut builder = Client::new().request(spec.method, spec.url);

    for (name, value) in spec.headers {
        let parsed = header_value_to_string(&value).and_then(|text| {
            let header_name = HeaderName::from_bytes(name.as_bytes()).ok()?;
            let header_value = HeaderValue::from_str(&text).ok()?;
            Some((header_name, header_value))
        });
        if let Some((header_name, header_value)) = parsed {
            builder = builder.header(header_name, header_value);
        }
    }

    let sendable_body = spec
        .body_sample
        .filter(|body| !contains_redacted_material(body));
    if let Some(body) = sendable_body {
        builder = builder.body(body);
    }
    Ok(builder)
}
/// Returns `true` when replaying `endpoint` must be refused: the endpoint is
/// unsafe and the caller asked for neither a dry run nor an explicit
/// confirmation.
fn unsafe_replay_requires_confirmation(
    endpoint: &EndpointDefinition,
    options: &ReplayOptions,
) -> bool {
    if options.dry_run || options.confirm_unsafe {
        return false;
    }
    is_unsafe_endpoint(endpoint)
}
/// An endpoint is unsafe to replay when its safety record says so (requires
/// confirmation, or not safe to replay) or when its method is anything other
/// than `GET`, `HEAD` or `OPTIONS` (compared case-insensitively).
fn is_unsafe_endpoint(endpoint: &EndpointDefinition) -> bool {
    let safety = &endpoint.safety;
    if safety.requires_confirmation || !safety.safe_to_replay {
        return true;
    }
    let is_read_only_method = ["GET", "HEAD", "OPTIONS"]
        .iter()
        .any(|method| endpoint.method.eq_ignore_ascii_case(method));
    !is_read_only_method
}
/// Substitutes `{name}` placeholders in `path_template` with values from the
/// replay parameters.
///
/// Returns the resulting path together with the set of parameter names that
/// were consumed. Placeholders without a matching parameter are left in place,
/// and an unterminated `{` causes the remainder of the template to be copied
/// verbatim.
///
/// # Errors
///
/// Returns `CaptureError::Replay` when a matching parameter is not a scalar.
fn interpolate_path_template(
    path_template: &str,
    options: &ReplayOptions,
) -> Result<(String, BTreeSet<String>), CaptureError> {
    let params = params_object(options);
    let mut consumed = BTreeSet::new();
    let mut path = String::new();
    let mut remaining = path_template;

    while let Some((literal, after_open)) = remaining.split_once('{') {
        path.push_str(literal);
        let Some((name, tail)) = after_open.split_once('}') else {
            // No closing brace: keep the rest of the template untouched.
            path.push('{');
            path.push_str(after_open);
            return Ok((path, consumed));
        };

        match params.and_then(|object| object.get(name)) {
            Some(value) => {
                let Some(text) = scalar_param_to_string(value) else {
                    return Err(CaptureError::Replay(format!(
                        "path parameter {name:?} must be scalar"
                    )));
                };
                path.push_str(&encode_path_segment(&text));
                consumed.insert(name.to_owned());
            }
            None => {
                path.push('{');
                path.push_str(name);
                path.push('}');
            }
        }
        remaining = tail;
    }

    path.push_str(remaining);
    Ok((path, consumed))
}
fn apply_query_params(
url: &mut Url,
endpoint: &EndpointDefinition,
options: &ReplayOptions,
consumed_params: &BTreeSet<String>,
) {
url.set_query(None);
let mut pairs = Vec::<(String, String)>::new();
for (name, values) in &endpoint.query_params {
if consumed_params.contains(name) || is_sensitive_name(name) {
continue;
}
if let Some(value) = values
.iter()
.find(|value| !contains_redacted_material(value))
.cloned()
{
pairs.push((name.clone(), value));
}
}
if let Some(params) = params_object(options) {
for (name, value) in params {
if consumed_params.contains(name) || is_sensitive_name(name) {
continue;
}
append_query_value(&mut pairs, name, value);
}
}
if pairs.is_empty() {
return;
}
let mut query = url.query_pairs_mut();
for (name, value) in pairs {
query.append_pair(&name, &value);
}
}
/// Adds a replay parameter to the pending query pairs.
///
/// An array appends one pair per usable element. Any other value, when usable,
/// first removes every existing pair with the same name and then appends the
/// new one. A value is usable when it is a scalar whose text contains no
/// redaction marker.
fn append_query_value(pairs: &mut Vec<(String, String)>, name: &str, value: &Value) {
    let usable = |candidate: &Value| -> Option<String> {
        scalar_param_to_string(candidate).filter(|text| !contains_redacted_material(text))
    };

    if let Value::Array(items) = value {
        pairs.extend(
            items
                .iter()
                .filter_map(|item| usable(item))
                .map(|text| (name.to_owned(), text)),
        );
    } else if let Some(text) = usable(value) {
        pairs.retain(|(existing, _value)| existing != name);
        pairs.push((name.to_owned(), text));
    }
}
/// Chooses the request body for a replay.
///
/// `options.body_json`, when present, is serialized and used. Otherwise the
/// request body of the first captured example is used, unless it is absent or
/// contains a redaction marker.
///
/// # Errors
///
/// Fails when `options.body_json` cannot be serialized.
fn replay_body_sample(
    endpoint: &EndpointDefinition,
    options: &ReplayOptions,
) -> Result<Option<String>, CaptureError> {
    if let Some(body_json) = options.body_json.as_ref() {
        let serialized = serde_json::to_string(body_json)?;
        return Ok(Some(serialized));
    }

    let captured = endpoint
        .examples
        .first()
        .and_then(|example| example.request_body_sample.as_ref())
        .filter(|sample| !contains_redacted_material(sample))
        .cloned();
    Ok(captured)
}
/// Copies every header that passes `should_skip_header` from `headers` into
/// `target`, overwriting entries with the same name.
fn merge_safe_headers(target: &mut HeaderMap, headers: &HeaderMap) {
    let kept = headers
        .iter()
        .filter(|&(name, value)| !should_skip_header(name, value));
    for (name, value) in kept {
        target.insert(name.clone(), value.clone());
    }
}
/// Decides whether a header must be left out of a replay: it is on the
/// never-forward list, its value is not a scalar, or its text is blank or
/// contains a redaction marker.
fn should_skip_header(name: &str, value: &Value) -> bool {
    if is_hop_by_hop_header(name) {
        return true;
    }
    match header_value_to_string(value) {
        Some(text) => text.trim().is_empty() || contains_redacted_material(&text),
        None => true,
    }
}
/// Case-insensitive check for headers that are never copied into a replay:
/// `host`, `connection`, `content-length`, `transfer-encoding` and
/// `accept-encoding`.
fn is_hop_by_hop_header(name: &str) -> bool {
    const NEVER_FORWARDED: [&str; 5] = [
        "host",
        "connection",
        "content-length",
        "transfer-encoding",
        "accept-encoding",
    ];
    NEVER_FORWARDED
        .iter()
        .any(|header| name.eq_ignore_ascii_case(header))
}
/// Renders a JSON header value as text. Strings, numbers and booleans are
/// supported; `null`, arrays and objects yield `None`.
fn header_value_to_string(value: &Value) -> Option<String> {
    if let Some(text) = value.as_str() {
        return Some(text.to_owned());
    }
    if let Some(flag) = value.as_bool() {
        return Some(flag.to_string());
    }
    match value {
        Value::Number(number) => Some(number.to_string()),
        _ => None,
    }
}
fn response_headers_to_json(headers: &reqwest::header::HeaderMap) -> HeaderMap {
headers
.iter()
.filter_map(|(name, value)| {
value
.to_str()
.ok()
.map(|value| (name.as_str().to_owned(), Value::String(value.to_owned())))
})
.collect()
}
/// Converts a response body into a text sample of at most
/// `MAX_BODY_SAMPLE_BYTES` bytes, decoding lossily as UTF-8. An empty body
/// yields `None`.
fn body_sample_from_bytes(bytes: &[u8]) -> Option<String> {
    (!bytes.is_empty()).then(|| {
        let sample_len = bytes.len().min(MAX_BODY_SAMPLE_BYTES);
        String::from_utf8_lossy(&bytes[..sample_len]).into_owned()
    })
}
fn params_object(options: &ReplayOptions) -> Option<&Map<String, Value>> {
options.params_json.as_ref()?.as_object()
}
/// Renders a scalar replay parameter as text. Strings, numbers and booleans
/// are supported; `null`, arrays and objects yield `None`.
fn scalar_param_to_string(value: &Value) -> Option<String> {
    let text = match value {
        Value::Null | Value::Array(_) | Value::Object(_) => return None,
        Value::String(text) => text.clone(),
        Value::Number(number) => number.to_string(),
        Value::Bool(flag) => flag.to_string(),
    };
    Some(text)
}
/// Case-insensitive search for the `[redacted]` marker anywhere in `value`.
fn contains_redacted_material(value: &str) -> bool {
    const MARKER: &[u8] = b"[redacted]";
    // The marker is pure ASCII, so comparing byte windows while ignoring ASCII
    // case is equivalent to lowercasing the value and searching it.
    value
        .as_bytes()
        .windows(MARKER.len())
        .any(|window| window.eq_ignore_ascii_case(MARKER))
}
/// Reports whether a parameter name looks sensitive and must therefore not be
/// replayed.
///
/// The name is compared case-insensitively against a fixed list of fragments,
/// both as written and with every non-alphanumeric character stripped, so
/// `api_key`, `Api-Key` and `apiKey` are all caught by the `api-key` entry.
fn is_sensitive_name(name: &str) -> bool {
    const FRAGMENTS: [&str; 9] = [
        "authorization",
        "cookie",
        "set-cookie",
        "api-key",
        "csrf",
        "token",
        "session",
        "password",
        "email",
    ];
    let alphanumeric_only = |text: &str| -> String {
        text.chars()
            .filter(|character| character.is_ascii_alphanumeric())
            .collect()
    };

    let lowered = name.to_ascii_lowercase();
    let squashed = alphanumeric_only(&lowered);
    for fragment in FRAGMENTS.iter() {
        if lowered.contains(*fragment) || squashed.contains(&alphanumeric_only(fragment)) {
            return true;
        }
    }
    false
}
/// Percent-encodes `value` for use as a single URL path segment.
///
/// `byte_serialize` implements `application/x-www-form-urlencoded`, which
/// writes a space as `+`. Inside a URL *path* a `+` is a literal plus sign, so
/// a parameter such as `"hello world"` would be replayed as `hello+world`.
/// Because the serializer escapes a real `+` as `%2B`, every `+` left in its
/// output stands for a space and can safely be rewritten to `%20`.
fn encode_path_segment(value: &str) -> String {
    byte_serialize(value.as_bytes())
        .collect::<String>()
        .replace('+', "%20")
}
/// Returns `path` with a leading `/`, adding one only when it is missing.
fn ensure_leading_slash(path: &str) -> String {
    match path.strip_prefix('/') {
        Some(_) => path.to_owned(),
        None => ["/", path].concat(),
    }
}

View file

@ -0,0 +1,221 @@
use std::env;
use std::fs;
use std::path::{Component, Path, PathBuf};
use chrono::{DateTime, Utc};
use serde_json::{Map, Value, json};
use url::Url;
use crate::redact::redact_artifact;
use crate::types::{CaptureArtifact, CaptureError, EndpointDefinition, SavedCapture};
/// Environment variable that overrides the capture root directory.
const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR";
/// File holding the unredacted artifact inside a capture directory.
const RAW_CAPTURE_FILE: &str = "raw-capture.json";
/// File holding the redacted artifact inside a capture directory.
const REDACTED_CAPTURE_FILE: &str = "redacted-capture.json";
/// File holding the redacted endpoint definitions; this is what
/// `load_endpoints` and `find_endpoint` read.
const ENDPOINTS_FILE: &str = "endpoints.json";
/// File holding the capture's summary metadata.
const METADATA_FILE: &str = "metadata.json";
/// Returns the directory under which all captures are stored.
///
/// A non-empty `WEBCLAW_CAPTURE_DIR` takes precedence; otherwise the root is
/// `.webclaw/api-captures` below the home directory.
pub fn capture_root() -> PathBuf {
    match env::var_os(CAPTURE_DIR_ENV) {
        Some(configured) if !configured.is_empty() => PathBuf::from(configured),
        _ => home_dir().join(".webclaw").join("api-captures"),
    }
}
/// Derives a capture id of the form `<host>[-<port>]/<timestamp>`.
///
/// The host part falls back to `unknown-host` and is sanitized into a single
/// path segment; the timestamp is formatted as `%Y-%m-%dT%H-%M-%SZ`.
pub fn capture_id_for(url: &Url, started_at: DateTime<Utc>) -> String {
    let mut host = url.host_str().unwrap_or("unknown-host").to_owned();
    if let Some(port) = url.port() {
        host.push('-');
        host.push_str(&port.to_string());
    }
    let host_segment = sanitize_id_segment(&host);
    let timestamp = started_at.format("%Y-%m-%dT%H-%M-%SZ");
    format!("{host_segment}/{timestamp}")
}
/// Persists a capture artifact below the capture root.
///
/// Four files are written into the capture's directory: the raw artifact, its
/// redacted form, the redacted endpoints and the summary metadata. Note that
/// the raw file contains the artifact exactly as captured, without redaction.
///
/// # Errors
///
/// Fails when the capture id is not a safe relative path, or when creating the
/// directory, serializing or writing any file fails.
pub fn save_capture(artifact: &CaptureArtifact) -> Result<SavedCapture, CaptureError> {
    let root = capture_root();
    let capture_dir = capture_dir_for_id(&root, &artifact.id)?;
    fs::create_dir_all(&capture_dir)?;

    let [raw_capture_path, redacted_capture_path, endpoints_path, metadata_path] = [
        RAW_CAPTURE_FILE,
        REDACTED_CAPTURE_FILE,
        ENDPOINTS_FILE,
        METADATA_FILE,
    ]
    .map(|file_name| capture_dir.join(file_name));

    let redacted_artifact = redact_artifact(artifact);
    write_json(&raw_capture_path, artifact)?;
    write_json(&redacted_capture_path, &redacted_artifact)?;
    write_json(&endpoints_path, &redacted_artifact.endpoints)?;
    write_json(&metadata_path, &metadata_for(&redacted_artifact))?;

    Ok(SavedCapture {
        id: artifact.id.clone(),
        root,
        capture_dir,
        raw_capture_path,
        redacted_capture_path,
        endpoints_path,
        metadata_path,
    })
}
/// Loads the endpoint definitions stored for `capture_id`.
///
/// # Errors
///
/// Fails when the id is not a safe relative path, when the endpoints file
/// cannot be read (`CaptureError::Storage`), or when it is not valid JSON.
pub fn load_endpoints(capture_id: &str) -> Result<Vec<EndpointDefinition>, CaptureError> {
    let capture_dir = capture_dir_for_id(&capture_root(), capture_id)?;
    let contents = match fs::read_to_string(capture_dir.join(ENDPOINTS_FILE)) {
        Ok(contents) => contents,
        Err(error) => {
            return Err(CaptureError::Storage(format!(
                "could not read endpoints for capture id {capture_id}: {error}"
            )));
        }
    };
    let endpoints: Vec<EndpointDefinition> = serde_json::from_str(&contents)?;
    Ok(endpoints)
}
/// Searches every stored capture for the endpoint with id `endpoint_id`.
///
/// The capture root is walked depth-first; each `endpoints.json` found is
/// parsed and scanned. Directories and files that cannot be read or parsed are
/// skipped rather than reported.
///
/// # Errors
///
/// Returns `CaptureError::EndpointNotFound` when the root does not exist or no
/// stored endpoint has the requested id.
pub fn find_endpoint(endpoint_id: &str) -> Result<EndpointDefinition, CaptureError> {
    let not_found = || CaptureError::EndpointNotFound(endpoint_id.to_owned());

    let root = capture_root();
    if !root.exists() {
        return Err(not_found());
    }

    let mut pending = vec![root];
    while let Some(directory) = pending.pop() {
        let Ok(entries) = fs::read_dir(&directory) else {
            continue;
        };
        for entry in entries.flatten() {
            let path = entry.path();
            if path.is_dir() {
                pending.push(path);
                continue;
            }
            let is_endpoints_file =
                path.file_name().and_then(|name| name.to_str()) == Some(ENDPOINTS_FILE);
            if !is_endpoints_file {
                continue;
            }

            let found = fs::read_to_string(&path)
                .ok()
                .and_then(|contents| {
                    serde_json::from_str::<Vec<EndpointDefinition>>(&contents).ok()
                })
                .and_then(|endpoints| {
                    endpoints
                        .into_iter()
                        .find(|endpoint| endpoint.id == endpoint_id)
                });
            if let Some(endpoint) = found {
                return Ok(endpoint);
            }
        }
    }

    Err(not_found())
}
/// Resolves the user's home directory.
///
/// A non-empty `USERPROFILE` wins, then `dirs::home_dir()`, then the current
/// directory (`.`) as a last resort.
///
/// An empty `USERPROFILE` is treated as unset, matching how `capture_root`
/// handles `WEBCLAW_CAPTURE_DIR`. Otherwise it would become an empty path and
/// captures would silently land in a `.webclaw` directory relative to the
/// current working directory.
fn home_dir() -> PathBuf {
    env::var_os("USERPROFILE")
        .filter(|value| !value.is_empty())
        .map(PathBuf::from)
        .or_else(dirs::home_dir)
        .unwrap_or_else(|| PathBuf::from("."))
}
/// Resolves a capture id such as `host/timestamp` to a directory below `root`.
///
/// The id is split on `/` and `\`; empty segments are ignored.
///
/// # Errors
///
/// Returns `CaptureError::Storage` when the id has no segments, when any
/// segment is unsafe (see `is_safe_path_segment`), or when the resulting path
/// would not stay below `root`.
fn capture_dir_for_id(root: &Path, capture_id: &str) -> Result<PathBuf, CaptureError> {
    let segments: Vec<&str> = capture_id
        .split(['/', '\\'])
        .filter(|segment| !segment.is_empty())
        .collect();

    if segments.is_empty() {
        return Err(CaptureError::Storage(
            "capture id cannot be empty".to_owned(),
        ));
    }
    if segments.iter().any(|segment| !is_safe_path_segment(segment)) {
        return Err(CaptureError::Storage(format!(
            "capture id contains unsafe path segment: {capture_id}"
        )));
    }

    let capture_dir = segments
        .iter()
        .fold(root.to_path_buf(), |mut directory, segment| {
            directory.push(segment);
            directory
        });
    ensure_within_root(root, &capture_dir)?;
    Ok(capture_dir)
}
/// Verifies, by comparing path components lexically, that `path` starts with
/// `root`.
///
/// # Errors
///
/// Returns `CaptureError::Storage` naming the offending path otherwise.
fn ensure_within_root(root: &Path, path: &Path) -> Result<(), CaptureError> {
    let stays_inside = relative_components(path).starts_with(&relative_components(root));
    match stays_inside {
        true => Ok(()),
        false => Err(CaptureError::Storage(format!(
            "capture path escapes capture root: {}",
            path.display()
        ))),
    }
}
/// Flattens a path into comparable string components.
///
/// `.` components are dropped, the root directory is rendered as `\`, `..` is
/// kept verbatim, and prefixes and normal components are converted lossily to
/// UTF-8.
fn relative_components(path: &Path) -> Vec<String> {
    path.components()
        .filter(|component| !matches!(component, Component::CurDir))
        .map(|component| match component {
            Component::RootDir => String::from("\\"),
            Component::ParentDir => String::from(".."),
            // For prefixes and normal components `as_os_str` yields exactly
            // the text of that component.
            other => other.as_os_str().to_string_lossy().into_owned(),
        })
        .collect()
}
/// Accepts a single path segment only when it cannot change directory or
/// drive: it must be non-empty, not `.` or `..`, and free of `:`, `/` and `\`.
fn is_safe_path_segment(segment: &str) -> bool {
    let is_navigation = segment.is_empty() || segment == "." || segment == "..";
    let has_separator = segment
        .chars()
        .any(|character| character == ':' || character == '/' || character == '\\');
    !(is_navigation || has_separator)
}
/// Makes `segment` usable as one component of a capture id.
///
/// Every character other than an ASCII letter, digit, `.`, `-` or `_` is
/// replaced by `-`; an empty input becomes `unknown`.
fn sanitize_id_segment(segment: &str) -> String {
    // Each input character maps to exactly one output character, so the result
    // is empty only when the input is.
    if segment.is_empty() {
        return "unknown".to_owned();
    }
    let mut sanitized = String::with_capacity(segment.len());
    for character in segment.chars() {
        let allowed =
            character.is_ascii_alphanumeric() || matches!(character, '.' | '-' | '_');
        sanitized.push(if allowed { character } else { '-' });
    }
    sanitized
}
fn write_json<T: serde::Serialize>(path: &PathBuf, value: &T) -> Result<(), CaptureError> {
let contents = serde_json::to_string_pretty(value)?;
fs::write(path, contents)?;
Ok(())
}
/// Builds the summary written to `metadata.json`: the artifact's own metadata
/// overlaid with its id, source URL, intent, timestamps and the number of
/// exchanges and endpoints.
fn metadata_for(artifact: &CaptureArtifact) -> Map<String, Value> {
    let derived = [
        ("id", json!(artifact.id)),
        ("source_url", json!(artifact.source_url)),
        ("intent", json!(artifact.intent)),
        ("started_at", json!(artifact.started_at)),
        ("completed_at", json!(artifact.completed_at)),
        ("exchange_count", json!(artifact.exchanges.len())),
        ("endpoint_count", json!(artifact.endpoints.len())),
    ];

    let mut metadata = artifact.metadata.clone();
    for (key, value) in derived {
        metadata.insert(key.to_owned(), value);
    }
    metadata
}

View file

@ -0,0 +1,174 @@
use std::collections::BTreeMap;
use std::path::PathBuf;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
/// Header name to JSON value map used for captured and replayed headers.
pub type HeaderMap = Map<String, Value>;
/// Request half of a captured exchange.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct CapturedRequest {
/// Request method, stored as text.
pub method: String,
/// Request URL, stored as text.
pub url: String,
/// Request headers.
pub headers: HeaderMap,
/// Sample of the request body, if one was recorded.
pub body_sample: Option<String>,
/// Resource type of the request, if known.
pub resource_type: Option<String>,
}
/// Response half of a captured exchange.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct CapturedResponse {
/// Response status code.
pub status: u16,
/// Response headers.
pub headers: HeaderMap,
/// Sample of the response body, if one was recorded.
pub body_sample: Option<String>,
/// MIME type of the response, if known.
pub mime_type: Option<String>,
}
/// One request/response pair observed during a capture.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct CapturedExchange {
/// Request method, stored as text.
pub method: String,
/// Request URL, stored as text.
pub url: String,
/// Request headers.
pub request_headers: HeaderMap,
/// Sample of the request body, if one was recorded.
pub request_body_sample: Option<String>,
/// Resource type of the request, if known (tests use `document`, `fetch`, `xhr`).
pub resource_type: Option<String>,
/// Response status code.
pub status: u16,
/// Response headers.
pub response_headers: HeaderMap,
/// Sample of the response body, if one was recorded.
pub response_body_sample: Option<String>,
/// When the exchange started.
pub started_at: DateTime<Utc>,
/// Duration of the exchange in milliseconds.
pub duration_ms: u64,
/// URLs of the redirects associated with this exchange.
pub redirect_chain: Vec<String>,
}
/// Complete result of one capture session; this is what gets written to disk.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct CaptureArtifact {
/// Capture id; also the capture's relative directory below the capture root.
pub id: String,
/// URL the capture was started from.
pub source_url: String,
/// Optional free-text intent supplied for the capture.
pub intent: Option<String>,
/// When the capture started.
pub started_at: DateTime<Utc>,
/// When the capture completed, if it did.
pub completed_at: Option<DateTime<Utc>>,
/// Every exchange recorded during the capture.
pub exchanges: Vec<CapturedExchange>,
/// Endpoint definitions associated with the capture.
pub endpoints: Vec<EndpointDefinition>,
/// Additional free-form metadata.
pub metadata: Map<String, Value>,
}
/// An API endpoint together with the examples captured for it.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EndpointDefinition {
/// Identifier used to look the endpoint up in stored captures.
pub id: String,
/// HTTP method, stored as text.
pub method: String,
/// Origin the endpoint is served from; replay prepends it to the path.
pub origin: String,
/// Path with `{name}` placeholders for variable segments.
pub path_template: String,
/// Query parameter names mapped to their recorded values.
pub query_params: BTreeMap<String, Vec<String>>,
/// JSON schema of the request body, if any.
pub request_schema: Option<Value>,
/// JSON schema of the response body, if any.
pub response_schema: Option<Value>,
/// Notes on authentication evidence for this endpoint.
pub auth_evidence: Vec<String>,
/// Replay safety assessment.
pub safety: EndpointSafety,
/// Captured examples; replay uses the first one for default headers and body.
pub examples: Vec<EndpointExample>,
}
/// One concrete request/response recorded for an endpoint.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EndpointExample {
/// Full URL of the example request.
pub url: String,
/// Request headers.
pub request_headers: HeaderMap,
/// Sample of the request body, if one was recorded.
pub request_body_sample: Option<String>,
/// Response status code.
pub response_status: u16,
/// Response headers.
pub response_headers: HeaderMap,
/// Sample of the response body, if one was recorded.
pub response_body_sample: Option<String>,
/// When the example was captured.
pub captured_at: DateTime<Utc>,
}
/// Replay safety assessment of an endpoint.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EndpointSafety {
/// When `false`, replay treats the endpoint as unsafe.
pub safe_to_replay: bool,
/// When `true`, replay treats the endpoint as unsafe and requires confirmation.
pub requires_confirmation: bool,
/// Human-readable explanation of the assessment.
pub reason: String,
}
/// Caller-supplied options controlling an endpoint replay.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ReplayOptions {
/// When `true`, the request is only previewed and never sent.
pub dry_run: bool,
/// Explicit permission to replay an endpoint considered unsafe.
pub confirm_unsafe: bool,
/// JSON object of parameters; used for `{name}` path placeholders first,
/// with the remaining entries applied as query parameters.
pub params_json: Option<Value>,
/// Headers layered on top of those from the first captured example.
pub headers: HeaderMap,
/// JSON body that replaces the captured request body when present.
pub body_json: Option<Value>,
}
impl Default for ReplayOptions {
    /// Conservative defaults: a dry run without unsafe confirmation, with no
    /// parameters, no extra headers and no body override.
    fn default() -> Self {
        let headers = HeaderMap::new();
        Self {
            params_json: None,
            body_json: None,
            headers,
            confirm_unsafe: false,
            dry_run: true,
        }
    }
}
/// Outcome of a replay attempt, serialized with a snake_case `kind` tag.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum ReplayResult {
/// Dry run: the request that would have been sent.
Preview {
method: String,
url: String,
headers: HeaderMap,
body_sample: Option<String>,
},
/// The request was sent; holds the response status, headers and body sample.
Executed {
status: u16,
headers: HeaderMap,
body_sample: Option<String>,
},
/// The replay was refused; `reason` explains why.
Blocked {
reason: String,
},
}
/// Locations of the files written for one saved capture.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SavedCapture {
/// Capture id.
pub id: String,
/// Capture root directory in effect when the capture was saved.
pub root: PathBuf,
/// Directory holding this capture's files.
pub capture_dir: PathBuf,
/// Path of the unredacted artifact.
pub raw_capture_path: PathBuf,
/// Path of the redacted artifact.
pub redacted_capture_path: PathBuf,
/// Path of the redacted endpoint definitions.
pub endpoints_path: PathBuf,
/// Path of the summary metadata.
pub metadata_path: PathBuf,
}
/// Errors produced by capture, storage and replay.
#[derive(Debug, thiserror::Error)]
pub enum CaptureError {
/// A URL could not be parsed.
#[error("invalid url: {0}")]
InvalidUrl(String),
/// The capture itself failed.
#[error("capture failed: {0}")]
Capture(String),
/// A capture id or capture file was unusable.
#[error("storage failed: {0}")]
Storage(String),
/// A replay request could not be built or was not permitted.
#[error("replay failed: {0}")]
Replay(String),
/// No stored endpoint has the requested id.
#[error("endpoint not found: {0}")]
EndpointNotFound(String),
/// The HTTP client reported an error.
#[error("request failed: {0}")]
Request(#[from] reqwest::Error),
/// An I/O error, kept as its display text.
#[error("I/O failed: {0}")]
Io(String),
/// A JSON (de)serialization error, kept as its display text.
#[error("JSON failed: {0}")]
Json(String),
}
impl From<std::io::Error> for CaptureError {
fn from(error: std::io::Error) -> Self {
Self::Io(error.to_string())
}
}
impl From<serde_json::Error> for CaptureError {
fn from(error: serde_json::Error) -> Self {
Self::Json(error.to_string())
}
}

View file

@ -0,0 +1,216 @@
use chrono::{TimeZone, Utc};
use serde_json::{Map, Value, json};
use webclaw_capture::classify::{classify_exchange, filter_api_exchanges};
use webclaw_capture::types::CapturedExchange;
fn headers(entries: &[(&str, &str)]) -> Map<String, Value> {
entries
.iter()
.map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned())))
.collect()
}
/// Test helper: a bare `GET` document exchange for `url` with a 200 status, no
/// headers, no bodies and a fixed start time.
fn exchange(url: &str) -> CapturedExchange {
    let started_at = Utc.with_ymd_and_hms(2026, 5, 16, 12, 0, 0).unwrap();
    CapturedExchange {
        method: String::from("GET"),
        url: String::from(url),
        request_headers: Map::new(),
        request_body_sample: None,
        resource_type: Some(String::from("document")),
        status: 200,
        response_headers: Map::new(),
        response_body_sample: None,
        started_at,
        duration_ms: 25,
        redirect_chain: vec![],
    }
}
fn with_resource_type(mut exchange: CapturedExchange, resource_type: &str) -> CapturedExchange {
exchange.resource_type = Some(resource_type.to_owned());
exchange
}
fn with_response_header(
mut exchange: CapturedExchange,
name: &str,
value: &str,
) -> CapturedExchange {
exchange.response_headers = headers(&[(name, value)]);
exchange
}
fn with_request_body(mut exchange: CapturedExchange, body: serde_json::Value) -> CapturedExchange {
exchange.method = "POST".to_owned();
exchange.request_headers = headers(&[("Content-Type", "application/json")]);
exchange.request_body_sample = Some(body.to_string());
exchange
}
/// Asserts that `exchange` is classified as API traffic: included, with
/// confidence of at least 0.5 and at least one reason.
fn assert_included(exchange: &CapturedExchange, label: &str) {
    let verdict = classify_exchange(exchange);
    assert!(
        verdict.include,
        "{label} should be included, got {verdict:?}"
    );
    assert!(
        verdict.confidence >= 0.5,
        "{label} should have useful confidence, got {verdict:?}"
    );
    let has_reasons = !verdict.reasons.is_empty();
    assert!(
        has_reasons,
        "{label} should explain why it was classified as API traffic"
    );
}
/// Asserts that `exchange` is not classified as API traffic: excluded, with
/// confidence of at most 0.5 and at least one reason.
fn assert_excluded(exchange: &CapturedExchange, label: &str) {
    let verdict = classify_exchange(exchange);
    assert!(
        !verdict.include,
        "{label} should be excluded, got {verdict:?}"
    );
    assert!(
        verdict.confidence <= 0.5,
        "{label} should not look like confident API traffic, got {verdict:?}"
    );
    let has_reasons = !verdict.reasons.is_empty();
    assert!(has_reasons, "{label} should explain why it was excluded");
}
/// Exchanges typed `fetch` or `xhr` are included even on a non-API-looking path.
#[test]
fn includes_fetch_and_xhr_resource_types() {
    for resource_type in ["fetch", "xhr"] {
        let case = with_resource_type(exchange("https://example.test/products"), resource_type);
        let label = case
            .resource_type
            .as_deref()
            .expect("resource type should be set");
        assert_included(&case, label);
    }
}
/// A JSON `Content-Type` response header (with a charset parameter) is enough for inclusion.
#[test]
fn includes_json_responses() {
    let base = exchange("https://example.test/products");
    let json_response =
        with_response_header(base, "Content-Type", "application/json; charset=utf-8");
    assert_included(&json_response, "JSON response");
}
/// Paths under `/api`, `/v1` and `/v2` are included; each case is labelled with its URL.
#[test]
fn includes_common_api_path_prefixes() {
    let urls = [
        "https://example.test/api/products",
        "https://example.test/v1/products",
        "https://example.test/v2/products",
    ];
    for url in urls {
        let case = exchange(url);
        assert_included(&case, &case.url);
    }
}
/// A `/graphql` path is included without any other signal.
#[test]
fn includes_graphql_paths() {
    assert_included(&exchange("https://example.test/graphql"), "GraphQL path");
}
/// A GraphQL-shaped JSON request body is included even when the path is not `/graphql`.
#[test]
fn includes_graphql_request_bodies() {
    let body = json!({
        "operationName": "Products",
        "query": "query Products { products { id name } }",
        "variables": {
            "first": 25
        }
    });
    let case = with_request_body(exchange("https://example.test/query"), body);
    assert_included(&case, "GraphQL request body");
}
/// Image, stylesheet, script, font and source-map URLs are excluded by file extension.
#[test]
fn excludes_static_assets_by_extension() {
    let asset_urls = [
        "https://example.test/static/logo.png",
        "https://example.test/static/photo.jpg",
        "https://example.test/static/icon.svg",
        "https://example.test/static/site.css",
        "https://example.test/static/app.js",
        "https://example.test/static/font.woff2",
        "https://example.test/static/app.js.map",
    ];
    for url in asset_urls {
        let case = exchange(url);
        assert_excluded(&case, &case.url);
    }
}
/// Analytics, ad and telemetry hosts are excluded even when they answer with JSON.
#[test]
fn excludes_tracking_hosts() {
    let tracking_urls = [
        "https://www.google-analytics.com/g/collect?v=2",
        "https://ads.doubleclick.net/pagead/id",
        "https://telemetry.example.test/v1/events",
    ];
    for url in tracking_urls {
        let case = with_response_header(exchange(url), "Content-Type", "application/json");
        assert_excluded(&case, &case.url);
    }
}
/// Browser-extension schemes are excluded even when the resource type is `fetch` or `xhr`.
#[test]
fn excludes_browser_extension_urls() {
    let cases = [
        ("chrome-extension://abcdef/options.html", "fetch"),
        ("moz-extension://abcdef/options.html", "xhr"),
    ];
    for (url, resource_type) in cases {
        let case = with_resource_type(exchange(url), resource_type);
        assert_excluded(&case, &case.url);
    }
}
/// Of an API call, a static asset and a telemetry call, only the API call survives filtering.
#[test]
fn filter_api_exchanges_returns_only_included_traffic() {
    let expected = vec![exchange("https://example.test/api/products")];
    let asset = exchange("https://example.test/static/app.js");
    let tracking = with_response_header(
        exchange("https://telemetry.example.test/v1/events"),
        "Content-Type",
        "application/json",
    );

    let exchanges = vec![expected[0].clone(), asset, tracking];

    assert_eq!(filter_api_exchanges(&exchanges), expected);
}

View file

@ -0,0 +1,139 @@
{
"log": {
"version": "1.2",
"creator": {
"name": "webclaw-capture-test",
"version": "0.1.0"
},
"entries": [
{
"startedDateTime": "2026-05-16T12:00:00Z",
"time": 42,
"_resourceType": "fetch",
"request": {
"method": "GET",
"url": "https://example.test/api/products?category=tools&page=2",
"headers": [
{
"name": "Accept",
"value": "application/json"
},
{
"name": "Authorization",
"value": "Bearer example-token"
}
]
},
"response": {
"status": 200,
"headers": [
{
"name": "Content-Type",
"value": "application/json; charset=utf-8"
}
],
"content": {
"mimeType": "application/json",
"text": "{\"items\":[{\"id\":12345,\"name\":\"Hammer\",\"price\":12.5,\"inStock\":true}],\"page\":2,\"hasMore\":false}"
}
}
},
{
"startedDateTime": "2026-05-16T12:00:01Z",
"time": 31,
"_resourceType": "xhr",
"request": {
"method": "GET",
"url": "https://example.test/api/products/12345",
"headers": [
{
"name": "Accept",
"value": "application/json"
},
{
"name": "Cookie",
"value": "session_id=example-session"
}
]
},
"response": {
"status": 200,
"headers": [
{
"name": "Content-Type",
"value": "application/json"
}
],
"content": {
"mimeType": "application/json",
"text": "{\"id\":12345,\"name\":\"Hammer\",\"category\":\"tools\",\"tags\":[\"hand-tool\",\"steel\"]}"
}
}
},
{
"startedDateTime": "2026-05-16T12:00:02Z",
"time": 57,
"_resourceType": "fetch",
"request": {
"method": "POST",
"url": "https://example.test/graphql",
"headers": [
{
"name": "Content-Type",
"value": "application/json"
},
{
"name": "X-CSRF-Token",
"value": "example-csrf"
}
],
"postData": {
"mimeType": "application/json",
"text": "{\"operationName\":\"CreateProduct\",\"query\":\"mutation CreateProduct($name: String!) { createProduct(input: { name: $name }) { id name } }\",\"variables\":{\"name\":\"Hammer\"}}"
}
},
"response": {
"status": 200,
"headers": [
{
"name": "Content-Type",
"value": "application/json"
}
],
"content": {
"mimeType": "application/json",
"text": "{\"data\":{\"createProduct\":{\"id\":\"gid://example/Product/12345\",\"name\":\"Hammer\"}}}"
}
}
},
{
"startedDateTime": "2026-05-16T12:00:03Z",
"time": 8,
"_resourceType": "script",
"request": {
"method": "GET",
"url": "https://example.test/static/app.js",
"headers": [
{
"name": "Accept",
"value": "application/javascript"
}
]
},
"response": {
"status": 200,
"headers": [
{
"name": "Content-Type",
"value": "application/javascript"
}
],
"content": {
"mimeType": "application/javascript",
"text": "fetch('/api/products?category=tools')"
}
}
}
]
}
}

View file

@ -0,0 +1,261 @@
use chrono::{DateTime, Utc};
use serde_json::{Map, Value, json};
use webclaw_capture::infer::{
endpoint_id, infer_endpoints, infer_json_schema, normalize_path_template,
};
use webclaw_capture::types::{CapturedExchange, EndpointDefinition};
fn fixture_exchanges() -> Vec<CapturedExchange> {
let har: Value =
serde_json::from_str(include_str!("fixtures/sample.har.json")).expect("valid HAR fixture");
let entries = har
.pointer("/log/entries")
.and_then(Value::as_array)
.expect("HAR fixture entries");
entries.iter().map(har_entry_to_exchange).collect()
}
/// Converts one HAR `entries[]` element into a `CapturedExchange`.
///
/// Request and response headers come from each side's `headers` array, body samples from
/// `postData.text` and `content.text` when present, and the resource type from the
/// `_resourceType` field. The redirect chain is always empty.
///
/// # Panics
///
/// Panics when a required field is missing or has the wrong type, when `startedDateTime` is not
/// RFC 3339, or when the response status does not fit in a `u16`.
fn har_entry_to_exchange(entry: &Value) -> CapturedExchange {
    let request = entry.get("request").expect("request");
    let response = entry.get("response").expect("response");

    let raw_status = response
        .get("status")
        .and_then(Value::as_u64)
        .expect("response status");
    // A checked conversion: an `as u16` cast would silently wrap an out-of-range status into an
    // unrelated, valid-looking one instead of flagging the broken fixture.
    let status = u16::try_from(raw_status).expect("response status should fit in u16");

    CapturedExchange {
        method: string_at(request, "method"),
        url: string_at(request, "url"),
        request_headers: har_headers(request),
        request_body_sample: request
            .pointer("/postData/text")
            .and_then(Value::as_str)
            .map(str::to_owned),
        resource_type: entry
            .get("_resourceType")
            .and_then(Value::as_str)
            .map(str::to_owned),
        status,
        response_headers: har_headers(response),
        response_body_sample: response
            .pointer("/content/text")
            .and_then(Value::as_str)
            .map(str::to_owned),
        started_at: DateTime::parse_from_rfc3339(&string_at(entry, "startedDateTime"))
            .expect("RFC3339 startedDateTime")
            .with_timezone(&Utc),
        duration_ms: entry.get("time").and_then(Value::as_u64).expect("duration"),
        redirect_chain: Vec::new(),
    }
}
fn har_headers(container: &Value) -> Map<String, Value> {
container
.get("headers")
.and_then(Value::as_array)
.expect("headers")
.iter()
.map(|header| {
(
string_at(header, "name"),
Value::String(string_at(header, "value")),
)
})
.collect()
}
/// Returns an owned copy of the string stored under `key` in `value`.
///
/// Panics with a message naming `key` when the field is absent or not a string.
fn string_at(value: &Value, key: &str) -> String {
    match value.get(key).and_then(Value::as_str) {
        Some(text) => text.to_owned(),
        None => panic!("{key} should be a string"),
    }
}
/// Returns the first endpoint whose method and path template both match.
///
/// Panics, dumping every available endpoint, when there is no match.
fn find_endpoint<'a>(
    endpoints: &'a [EndpointDefinition],
    method: &str,
    path_template: &str,
) -> &'a EndpointDefinition {
    for endpoint in endpoints {
        if endpoint.method == method && endpoint.path_template == path_template {
            return endpoint;
        }
    }
    panic!("missing endpoint {method} {path_template}; got {endpoints:#?}")
}
/// Collects the endpoint ids and returns them sorted, so two inference runs can be compared
/// regardless of output order.
fn sorted_ids(endpoints: &[EndpointDefinition]) -> Vec<String> {
    let mut ids: Vec<String> = Vec::with_capacity(endpoints.len());
    ids.extend(endpoints.iter().map(|endpoint| endpoint.id.clone()));
    ids.sort();
    ids
}
/// Inference over the HAR fixture yields three endpoints with deterministic ids that match
/// `endpoint_id` for the expected method, origin and path template.
#[test]
fn infers_stable_endpoint_ids_and_path_templates_from_har_fixture() {
    let exchanges = fixture_exchanges();
    let endpoints = infer_endpoints(&exchanges);
    let repeated = infer_endpoints(&exchanges);

    assert_eq!(endpoints.len(), 3, "static assets should be ignored");
    assert_eq!(
        sorted_ids(&endpoints),
        sorted_ids(&repeated),
        "endpoint ids should be deterministic across inference runs"
    );

    let expected = [
        ("GET", "/api/products"),
        ("GET", "/api/products/{id}"),
        ("POST", "/graphql"),
    ];
    for (method, path_template) in expected {
        let endpoint = find_endpoint(&endpoints, method, path_template);
        assert_eq!(
            endpoint.id,
            endpoint_id(method, "https://example.test", path_template)
        );
    }
}
/// Checks the inferred detail of two fixture endpoints: query parameter examples, auth evidence,
/// replay safety, and the inferred request and response schemas.
#[test]
fn infers_query_examples_schemas_auth_evidence_and_mutation_safety() {
    // True when any auth-evidence entry mentions `needle`, ignoring ASCII case.
    let mentions = |endpoint: &EndpointDefinition, needle: &str| {
        endpoint
            .auth_evidence
            .iter()
            .any(|evidence| evidence.to_ascii_lowercase().contains(needle))
    };

    let endpoints = infer_endpoints(&fixture_exchanges());

    let products = find_endpoint(&endpoints, "GET", "/api/products");
    for (name, value) in [("category", "tools"), ("page", "2")] {
        assert_eq!(
            products.query_params.get(name),
            Some(&vec![value.to_owned()])
        );
    }
    assert!(
        mentions(products, "authorization"),
        "Authorization header should be recorded as auth evidence"
    );
    assert!(products.safety.safe_to_replay);
    assert!(!products.safety.requires_confirmation);

    let products_schema = products.response_schema.as_ref().expect("response schema");
    let product_types = [
        ("/properties/items/type", "array"),
        ("/properties/items/items/properties/id/type", "integer"),
        ("/properties/hasMore/type", "boolean"),
    ];
    for (pointer, expected) in product_types {
        assert_eq!(products_schema.pointer(pointer), Some(&json!(expected)));
    }

    let graphql = find_endpoint(&endpoints, "POST", "/graphql");
    assert!(!graphql.safety.safe_to_replay);
    assert!(graphql.safety.requires_confirmation);
    assert!(
        mentions(graphql, "csrf"),
        "CSRF header should be recorded as auth evidence"
    );

    let request_schema = graphql.request_schema.as_ref().expect("request schema");
    let request_types = [
        "/properties/query/type",
        "/properties/variables/properties/name/type",
    ];
    for pointer in request_types {
        assert_eq!(request_schema.pointer(pointer), Some(&json!("string")));
    }

    let response_schema = graphql.response_schema.as_ref().expect("response schema");
    assert_eq!(
        response_schema.pointer("/properties/data/properties/createProduct/properties/id/type"),
        Some(&json!("string"))
    );
}
/// No inferred endpoint has a path template containing `/static/`.
#[test]
fn ignores_static_asset_entries_from_the_fixture() {
    let endpoints = infer_endpoints(&fixture_exchanges());
    let learned_static_asset = endpoints
        .iter()
        .any(|endpoint| endpoint.path_template.contains("/static/"));
    assert!(
        !learned_static_asset,
        "static asset requests should not become learned endpoints: {endpoints:#?}"
    );
}
/// Numeric, UUID and hex-like path segments collapse to `{id}`; a plain word is left alone.
#[test]
fn normalizes_numeric_uuid_and_high_entropy_path_segments() {
    let cases = [
        ("/api/products/12345", "/api/products/{id}"),
        (
            "/api/users/550e8400-e29b-41d4-a716-446655440000",
            "/api/users/{id}",
        ),
        ("/api/sessions/a1b2c3d4e5f6a7b8", "/api/sessions/{id}"),
        ("/api/categories/tools", "/api/categories/tools"),
    ];
    for (path, expected) in cases {
        assert_eq!(normalize_path_template(path), expected);
    }
}
/// Schema inference maps a flat sample object to the expected JSON Schema `type` per field,
/// including the item type of an array and `null`.
#[test]
fn infers_basic_json_schema_shapes() {
    let sample = json!({
        "id": 12345,
        "name": "Hammer",
        "price": 12.5,
        "inStock": true,
        "tags": ["hand-tool"],
        "metadata": null
    });
    let schema = infer_json_schema(&sample);

    let expected_types = [
        ("/type", "object"),
        ("/properties/id/type", "integer"),
        ("/properties/price/type", "number"),
        ("/properties/inStock/type", "boolean"),
        ("/properties/tags/type", "array"),
        ("/properties/tags/items/type", "string"),
        ("/properties/metadata/type", "null"),
    ];
    for (pointer, expected) in expected_types {
        assert_eq!(schema.pointer(pointer), Some(&json!(expected)));
    }
}

View file

@ -0,0 +1,245 @@
use std::env;
use std::ffi::OsString;
use std::fs;
use std::path::{Path, PathBuf};
use std::time::{SystemTime, UNIX_EPOCH};
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::{TcpListener, TcpStream};
use tokio::sync::oneshot;
use webclaw_capture::cdp::{CaptureOptions, capture_network};
use webclaw_capture::types::{CaptureArtifact, EndpointDefinition};
/// Name of the environment variable this test overrides.
// NOTE(review): presumably read by the capture store to choose its output root — confirm.
const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR";
/// Guard that overrides `WEBCLAW_CAPTURE_DIR` and restores the previous state when dropped.
struct CaptureDirGuard {
/// Value the variable held before the override; `None` if it was unset.
original: Option<OsString>,
}
impl CaptureDirGuard {
    /// Points `WEBCLAW_CAPTURE_DIR` at `path`, remembering the previous value so that dropping
    /// the guard can restore it.
    fn set(path: &Path) -> Self {
        let guard = Self {
            original: env::var_os(CAPTURE_DIR_ENV),
        };
        // SAFETY: mutating the environment is only sound while no other thread reads or writes
        // it. NOTE(review): the caller runs on a multi-thread runtime — confirm nothing else
        // touches the environment concurrently.
        unsafe { env::set_var(CAPTURE_DIR_ENV, path) };
        guard
    }
}
impl Drop for CaptureDirGuard {
    /// Restores `WEBCLAW_CAPTURE_DIR` to its pre-override value, or removes it if it was unset.
    fn drop(&mut self) {
        let previous = self.original.as_ref();
        // SAFETY: same requirement as when the override was installed — no concurrent
        // environment access. NOTE(review): confirm for the multi-thread runtime.
        unsafe {
            if let Some(value) = previous {
                env::set_var(CAPTURE_DIR_ENV, value);
            } else {
                env::remove_var(CAPTURE_DIR_ENV);
            }
        }
    }
}
/// Minimal HTTP server on a loopback port that serves the pages fetched by the capture test.
struct LocalServer {
/// Origin of the server, formatted as `http://<bound address>`.
base_url: String,
/// Stops the accept loop when sent on; wrapped in `Option` so `Drop` can take it.
shutdown: Option<oneshot::Sender<()>>,
}
impl LocalServer {
    /// Binds an ephemeral loopback port and spawns a task that hands each accepted connection to
    /// `handle_connection` until the shutdown signal resolves.
    async fn start() -> Self {
        let listener = TcpListener::bind("127.0.0.1:0")
            .await
            .expect("bind local test server");
        let base_url = format!(
            "http://{}",
            listener.local_addr().expect("local test server address")
        );
        let (shutdown_tx, mut shutdown_rx) = oneshot::channel::<()>();

        tokio::spawn(async move {
            loop {
                tokio::select! {
                    _ = &mut shutdown_rx => break,
                    accepted = listener.accept() => {
                        // Failed accepts are skipped; the loop keeps serving.
                        if let Ok((stream, _peer)) = accepted {
                            tokio::spawn(handle_connection(stream));
                        }
                    }
                }
            }
        });

        Self {
            base_url,
            shutdown: Some(shutdown_tx),
        }
    }

    /// Returns the absolute URL for `path`, which is appended verbatim to the server origin.
    fn url(&self, path: &str) -> String {
        [self.base_url.as_str(), path].concat()
    }
}
impl Drop for LocalServer {
    /// Signals the accept loop to stop; a send failure (receiver already gone) is ignored.
    fn drop(&mut self) {
        let Some(sender) = self.shutdown.take() else {
            return;
        };
        let _ = sender.send(());
    }
}
/// End-to-end capture against the local server: the raw capture records the products fetch, the
/// redacted capture contains none of the secret values, and exactly one `GET /api/products`
/// endpoint is learned while the static script is not.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn capture_network_records_fetches_redacts_secrets_and_learns_api_endpoints() {
    const SECRETS: [&str; 5] = [
        "browser-authorization-secret",
        "browser-api-key-secret",
        "browser-csrf-secret",
        "page-session-secret",
        "api-session-secret",
    ];

    let capture_root = unique_temp_root("integration-capture");
    let _capture_dir = CaptureDirGuard::set(&capture_root);
    let server = LocalServer::start().await;

    let options = CaptureOptions {
        url: server.url("/"),
        intent: Some(String::from("discover product listing API")),
        wait_ms: 1_500,
        headed: false,
    };
    let saved = capture_network(options)
        .await
        .expect("capture network traffic");

    let raw_capture: CaptureArtifact = read_json(&saved.raw_capture_path);
    let saw_products_fetch = raw_capture
        .exchanges
        .iter()
        .any(|exchange| exchange.url.contains("/api/products?category=tools"));
    assert!(
        saw_products_fetch,
        "raw capture should include the fetch to /api/products"
    );

    let redacted_capture_text =
        fs::read_to_string(&saved.redacted_capture_path).expect("read redacted capture");
    for secret in SECRETS {
        assert!(
            !redacted_capture_text.contains(secret),
            "redacted capture should not contain raw secret value {secret}"
        );
    }

    let endpoints: Vec<EndpointDefinition> = read_json(&saved.endpoints_path);
    let product_endpoint_count = endpoints
        .iter()
        .filter(|endpoint| endpoint.method == "GET" && endpoint.path_template == "/api/products")
        .count();
    assert_eq!(
        product_endpoint_count,
        1,
        "inferred endpoints should contain one GET /api/products endpoint"
    );

    let learned_static_asset = endpoints
        .iter()
        .any(|endpoint| endpoint.path_template == "/static/app.js");
    assert!(
        !learned_static_asset,
        "static assets should not be included as learned endpoints"
    );

    // Best-effort cleanup; skipped if an assertion above panics.
    let _ = fs::remove_dir_all(capture_root);
}
/// Serves exactly one HTTP request on `stream` and then closes the connection.
///
/// Routes (matched on the path with any query string removed):
/// - `/` returns an HTML page that loads `/static/app.js` and sets a session cookie.
/// - `/static/app.js` returns a script that fetches `/api/products?category=tools` with
///   secret-bearing request headers.
/// - `/api/products` returns a JSON product list and sets another session cookie.
/// - anything else returns a plain-text 404.
///
/// Read, write and shutdown errors are ignored.
async fn handle_connection(mut stream: TcpStream) {
// Only a single read of at most 8 KiB is performed; anything beyond that is never consumed.
let mut buffer = vec![0_u8; 8192];
let Ok(bytes_read) = stream.read(&mut buffer).await else {
return;
};
if bytes_read == 0 {
return;
}
let request = String::from_utf8_lossy(&buffer[..bytes_read]);
// The request target is the second whitespace-separated token of the first line; fall back to
// "/" when the request line cannot be split that way.
let path = request
.lines()
.next()
.and_then(|line| line.split_whitespace().nth(1))
.unwrap_or("/");
// Route on the path alone so `/api/products?category=tools` matches `/api/products`.
let response = match path.split('?').next().unwrap_or(path) {
"/" => http_response(
"200 OK",
&[
("Content-Type", "text/html; charset=utf-8"),
("Set-Cookie", "session=page-session-secret; HttpOnly"),
],
r#"<!doctype html>
<html>
<head><title>Webclaw capture test</title></head>
<body>
<script src="/static/app.js"></script>
</body>
</html>"#,
),
"/static/app.js" => http_response(
"200 OK",
&[("Content-Type", "application/javascript; charset=utf-8")],
r#"fetch('/api/products?category=tools', {
headers: {
'Authorization': 'Bearer browser-authorization-secret',
'X-Api-Key': 'browser-api-key-secret',
'X-CSRF-Token': 'browser-csrf-secret'
}
}).then(response => response.json()).then(products => {
window.__webclawProducts = products;
});"#,
),
"/api/products" => http_response(
"200 OK",
&[
("Content-Type", "application/json"),
("Set-Cookie", "session=api-session-secret; HttpOnly"),
],
r#"{"items":[{"id":12345,"name":"Hammer","category":"tools"}]}"#,
),
_ => http_response(
"404 Not Found",
&[("Content-Type", "text/plain; charset=utf-8")],
"not found",
),
};
// Best effort: the peer may already have gone away.
let _ = stream.write_all(response.as_bytes()).await;
let _ = stream.shutdown().await;
}
/// Renders a complete HTTP/1.1 response as a string.
///
/// The status line is followed by `Content-Length` (the byte length of `body`),
/// `Connection: close` and `Cache-Control: no-store`, then the caller's `headers` in order, a
/// blank line, and `body`.
fn http_response(status: &str, headers: &[(&str, &str)], body: &str) -> String {
    let fixed_head = format!(
        "HTTP/1.1 {status}\r\nContent-Length: {}\r\nConnection: close\r\nCache-Control: no-store\r\n",
        body.len()
    );
    let extra_headers: String = headers
        .iter()
        .map(|(name, value)| format!("{name}: {value}\r\n"))
        .collect();

    [fixed_head.as_str(), extra_headers.as_str(), "\r\n", body].concat()
}
/// Returns a path under the system temp directory named from `test_name`, the process id and
/// the current time in nanoseconds since the Unix epoch. The directory is not created.
fn unique_temp_root(test_name: &str) -> PathBuf {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("system time after unix epoch");
    let nanos = since_epoch.as_nanos();
    let dir_name = format!(
        "webclaw-capture-{test_name}-{}-{nanos}",
        std::process::id()
    );

    let mut root = env::temp_dir();
    root.push(dir_name);
    root
}
/// Reads the file at `path` as UTF-8 text and deserializes it from JSON into `T`.
///
/// Panics if the file cannot be read or does not contain valid JSON for `T`.
fn read_json<T>(path: &Path) -> T
where
    T: serde::de::DeserializeOwned,
{
    let text = fs::read_to_string(path).expect("read JSON file");
    let parsed = serde_json::from_str(&text);
    parsed.expect("valid JSON file")
}

View file

@ -0,0 +1,358 @@
use std::collections::BTreeMap;
use std::env;
use std::ffi::OsString;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use std::time::{SystemTime, UNIX_EPOCH};
use chrono::{DateTime, Utc};
use serde_json::{Map, Value, json};
use webclaw_capture::openapi::{export_openapi, write_openapi};
use webclaw_capture::store::save_capture;
use webclaw_capture::types::{
CaptureArtifact, EndpointDefinition, EndpointExample, EndpointSafety,
};
/// Serializes the tests in this file that change `WEBCLAW_CAPTURE_DIR`; taken by
/// `with_capture_dir`.
static ENV_LOCK: Mutex<()> = Mutex::new(());
/// Name of the environment variable these tests override.
// NOTE(review): presumably read by the capture store to choose its output root — confirm.
const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR";
/// Guard that overrides `WEBCLAW_CAPTURE_DIR` and restores the previous state when dropped.
struct EnvVarGuard {
/// Value the variable held before the override; `None` if it was unset.
original: Option<OsString>,
}
impl EnvVarGuard {
    /// Sets `WEBCLAW_CAPTURE_DIR` to `value`, or removes it when `value` is `None`, remembering
    /// the previous state so that dropping the guard can restore it.
    fn set_capture_dir(value: Option<&Path>) -> Self {
        let guard = Self {
            original: env::var_os(CAPTURE_DIR_ENV),
        };
        // SAFETY: mutating the environment is only sound while no other thread reads or writes
        // it. `with_capture_dir` holds `ENV_LOCK` around this call. NOTE(review): that only
        // excludes code which also takes the lock — confirm nothing else touches the environment.
        unsafe {
            if let Some(path) = value {
                env::set_var(CAPTURE_DIR_ENV, path);
            } else {
                env::remove_var(CAPTURE_DIR_ENV);
            }
        }
        guard
    }
}
impl Drop for EnvVarGuard {
    /// Restores `WEBCLAW_CAPTURE_DIR` to its pre-override value, or removes it if it was unset.
    fn drop(&mut self) {
        let previous = self.original.as_ref();
        // SAFETY: same requirement as when the override was installed — no concurrent
        // environment access. NOTE(review): confirm, as for `set_capture_dir`.
        unsafe {
            if let Some(value) = previous {
                env::set_var(CAPTURE_DIR_ENV, value);
            } else {
                env::remove_var(CAPTURE_DIR_ENV);
            }
        }
    }
}
/// Runs `test` with `WEBCLAW_CAPTURE_DIR` set to `value` (or removed when `value` is `None`) and
/// returns the closure's result.
///
/// Access is serialized through `ENV_LOCK`. The previous variable state is restored by the
/// `EnvVarGuard` before the lock is released, whether the closure returns or panics.
fn with_capture_dir<T>(value: Option<&Path>, test: impl FnOnce() -> T) -> T {
    // A test that panics inside the closure poisons the mutex. The guarded data is `()`, so
    // there is no state to corrupt: recover the lock instead of failing every later
    // environment-dependent test with an unrelated poison error.
    let _lock = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    let _guard = EnvVarGuard::set_capture_dir(value);
    test()
}
/// The export declares OpenAPI 3.1.0 and contains exactly one operation per sample endpoint.
#[test]
fn exports_openapi_31_and_an_operation_for_every_endpoint() {
    let doc = export_openapi(&sample_endpoints());
    assert_eq!(doc.get("openapi").and_then(Value::as_str), Some("3.1.0"));

    let paths = doc
        .get("paths")
        .and_then(Value::as_object)
        .expect("OpenAPI document should contain paths");

    let expected_operations = [
        (
            "/api/products",
            "get",
            "GET product endpoint should become an OpenAPI operation",
        ),
        (
            "/graphql",
            "post",
            "POST GraphQL endpoint should become an OpenAPI operation",
        ),
    ];
    for (path, method, message) in expected_operations {
        assert!(operation(&doc, path, method).is_some(), "{message}");
    }

    assert_eq!(
        operation_count(paths),
        2,
        "every learned endpoint should become exactly one operation"
    );
}
/// Only the unsafe `POST` operation carries `x-webclaw-requires-confirmation: true`.
#[test]
fn unsafe_operations_require_confirmation_extension() {
    const EXTENSION: &str = "x-webclaw-requires-confirmation";
    let required = json!(true);

    let doc = export_openapi(&sample_endpoints());
    let get_operation =
        operation(&doc, "/api/products", "get").expect("GET product endpoint should be exported");
    let post_operation =
        operation(&doc, "/graphql", "post").expect("POST GraphQL endpoint should be exported");

    assert_ne!(
        get_operation.get(EXTENSION),
        Some(&required),
        "safe GET operations should not require unsafe replay confirmation"
    );
    assert_eq!(
        post_operation.get(EXTENSION),
        Some(&required),
        "unsafe POST operations should require explicit replay confirmation"
    );
}
/// The exported document contains example nodes, none of the raw secret values planted in the
/// sample endpoints, and at least one `[REDACTED]` marker.
#[test]
fn generated_examples_do_not_leak_secret_values() {
    let doc = export_openapi(&sample_endpoints());
    assert!(
        contains_example_node(&doc),
        "OpenAPI export should include examples derived from captured endpoint examples"
    );

    let doc_text = serde_json::to_string(&doc).expect("serialize OpenAPI document");
    for forbidden in [
        "Bearer raw-secret",
        "raw-api-key",
        "raw-csrf-token",
        "raw-session-id",
        "raw-password",
        "user@example.test",
        // The sample GraphQL URL carries the bearer token and the email percent-encoded
        // (`Bearer%20raw-secret`, `user%40example.test`). The decoded spellings above never
        // match those, so check the bare token and the encoded email as well.
        "raw-secret",
        "user%40example.test",
    ] {
        assert!(
            !doc_text.contains(forbidden),
            "OpenAPI examples should not leak secret value {forbidden:?}"
        );
    }
    assert!(
        doc_text.contains("[REDACTED]"),
        "OpenAPI examples should preserve redaction markers instead of raw secrets"
    );
}
/// After a capture is saved under a temporary root, `write_openapi` writes `openapi.json` into
/// that capture's directory, and the file holds the saved endpoints as an OpenAPI 3.1.0 document.
#[test]
fn write_openapi_writes_openapi_json_next_to_saved_endpoints() {
    let root = unique_temp_root("write");
    let expected_path = root
        .join("example.test")
        .join("2026-05-16T12-00-00Z")
        .join("openapi.json");

    with_capture_dir(Some(root.as_path()), || {
        let artifact = sample_artifact();
        save_capture(&artifact).expect("save capture before OpenAPI export");

        let openapi_path = write_openapi(&artifact.id).expect("write OpenAPI document");
        assert_eq!(openapi_path, expected_path);
        assert!(openapi_path.is_file());

        let doc: Value = read_json(&openapi_path);
        assert_eq!(doc.get("openapi").and_then(Value::as_str), Some("3.1.0"));
        assert!(
            operation(&doc, "/api/products", "get").is_some(),
            "written OpenAPI document should contain saved capture endpoints"
        );
    });

    // Best-effort cleanup; skipped if an assertion above panics.
    let _ = fs::remove_dir_all(root);
}
/// Capture artifact for `example.test` holding the sample endpoints, no exchanges and no
/// metadata. Start and completion times are both the fixed test timestamp.
fn sample_artifact() -> CaptureArtifact {
    let captured_at = test_time();
    CaptureArtifact {
        id: String::from("example.test/2026-05-16T12-00-00Z"),
        source_url: String::from("https://example.test/products?email=user@example.test"),
        intent: Some(String::from("discover product listing API")),
        started_at: captured_at,
        completed_at: Some(captured_at),
        exchanges: vec![],
        endpoints: sample_endpoints(),
        metadata: Map::new(),
    }
}
/// The two endpoints used throughout these tests: the safe product listing `GET` followed by the
/// unsafe GraphQL `POST`.
fn sample_endpoints() -> Vec<EndpointDefinition> {
    Vec::from([product_endpoint(), graphql_endpoint()])
}
/// Safe-to-replay `GET /api/products` endpoint. Its single example deliberately carries raw
/// secrets in the URL, the request headers, the response headers and the response body.
fn product_endpoint() -> EndpointDefinition {
    let query_params = BTreeMap::from([
        ("category".to_owned(), vec!["tools".to_owned()]),
        ("page".to_owned(), vec!["2".to_owned()]),
    ]);

    let response_schema = json!({
        "type": "object",
        "properties": {
            "items": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "id": { "type": "integer" },
                        "name": { "type": "string" }
                    }
                }
            }
        }
    });

    let example = EndpointExample {
        url: String::from(
            "https://example.test/api/products?category=tools&page=2&api_key=raw-api-key",
        ),
        request_headers: headers(&[
            ("Authorization", "Bearer raw-secret"),
            ("Accept", "application/json"),
            ("X-Api-Key", "raw-api-key"),
        ]),
        request_body_sample: None,
        response_status: 200,
        response_headers: headers(&[
            ("Content-Type", "application/json"),
            ("Set-Cookie", "session=raw-session-id"),
        ]),
        response_body_sample: Some(String::from(
            r#"{"items":[{"id":12345,"name":"Hammer","email":"user@example.test"}]}"#,
        )),
        captured_at: test_time(),
    };

    EndpointDefinition {
        id: String::from("GET https://example.test/api/products"),
        method: String::from("GET"),
        origin: String::from("https://example.test"),
        path_template: String::from("/api/products"),
        query_params,
        request_schema: None,
        response_schema: Some(response_schema),
        auth_evidence: vec![String::from("Authorization header observed")],
        safety: EndpointSafety {
            safe_to_replay: true,
            requires_confirmation: false,
            reason: String::from("GET is a read-oriented HTTP method"),
        },
        examples: vec![example],
    }
}
/// Unsafe `POST /graphql` endpoint that requires confirmation. Its single example carries
/// percent-encoded secrets in the URL query, a CSRF token header and a password in the body.
fn graphql_endpoint() -> EndpointDefinition {
    let request_schema = json!({
        "type": "object",
        "properties": {
            "query": { "type": "string" },
            "variables": { "type": "object" }
        }
    });
    let response_schema = json!({
        "type": "object",
        "properties": {
            "data": { "type": "object" }
        }
    });

    let request_body = json!({
        "query": "mutation CreateProduct($name: String!) { createProduct(name: $name) { id } }",
        "variables": {
            "name": "Hammer",
            "password": "raw-password"
        }
    });

    let example = EndpointExample {
        url: String::from(
            "https://example.test/graphql?ref=user%40example.test&debug=Bearer%20raw-secret&trace=raw-session-id",
        ),
        request_headers: headers(&[
            ("Content-Type", "application/json"),
            ("X-CSRF-Token", "raw-csrf-token"),
        ]),
        request_body_sample: Some(request_body.to_string()),
        response_status: 200,
        response_headers: headers(&[("Content-Type", "application/json")]),
        response_body_sample: Some(String::from(r#"{"data":{"createProduct":{"id":"12345"}}}"#)),
        captured_at: test_time(),
    };

    EndpointDefinition {
        id: String::from("POST https://example.test/graphql"),
        method: String::from("POST"),
        origin: String::from("https://example.test"),
        path_template: String::from("/graphql"),
        query_params: BTreeMap::new(),
        request_schema: Some(request_schema),
        response_schema: Some(response_schema),
        auth_evidence: vec![String::from("X-CSRF-Token header observed")],
        safety: EndpointSafety {
            safe_to_replay: false,
            requires_confirmation: true,
            reason: String::from("POST may mutate server state and requires confirmation"),
        },
        examples: vec![example],
    }
}
fn headers(entries: &[(&str, &str)]) -> Map<String, Value> {
entries
.iter()
.map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned())))
.collect()
}
fn operation<'a>(doc: &'a Value, path: &str, method: &str) -> Option<&'a Map<String, Value>> {
doc.get("paths")
.and_then(Value::as_object)
.and_then(|paths| paths.get(path))
.and_then(Value::as_object)
.and_then(|path_item| path_item.get(method))
.and_then(Value::as_object)
}
fn operation_count(paths: &Map<String, Value>) -> usize {
const HTTP_METHODS: &[&str] = &[
"get", "put", "post", "delete", "options", "head", "patch", "trace",
];
paths
.values()
.filter_map(Value::as_object)
.map(|path_item| {
HTTP_METHODS
.iter()
.filter(|method| path_item.contains_key(**method))
.count()
})
.sum()
}
/// Returns `true` when any object in the JSON tree has a key named `example`, `examples` or
/// `x-webclaw-examples`. Objects and arrays are searched recursively.
fn contains_example_node(value: &Value) -> bool {
    const EXAMPLE_KEYS: [&str; 3] = ["example", "examples", "x-webclaw-examples"];

    match value {
        Value::Object(object) => object.iter().any(|(key, child)| {
            EXAMPLE_KEYS.contains(&key.as_str()) || contains_example_node(child)
        }),
        Value::Array(items) => items.iter().any(contains_example_node),
        _ => false,
    }
}
/// Returns a path under the system temp directory named from `test_name`, the process id and
/// the current time in nanoseconds since the Unix epoch. The directory is not created.
fn unique_temp_root(test_name: &str) -> PathBuf {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("system time after unix epoch");
    let nanos = since_epoch.as_nanos();
    let dir_name = format!(
        "webclaw-capture-openapi-{test_name}-{}-{nanos}",
        std::process::id()
    );

    let mut root = env::temp_dir();
    root.push(dir_name);
    root
}
/// Reads the file at `path` as UTF-8 text and deserializes it from JSON into `T`.
///
/// Panics if the file cannot be read or does not contain valid JSON for `T`.
fn read_json<T>(path: &Path) -> T
where
    T: serde::de::DeserializeOwned,
{
    let text = match fs::read_to_string(path) {
        Ok(text) => text,
        Err(error) => panic!("read JSON file: {error:?}"),
    };
    serde_json::from_str(&text).expect("valid JSON file")
}
/// Fixed timestamp used by every fixture in this file: 2026-05-16 12:00:00 UTC.
fn test_time() -> DateTime<Utc> {
    let parsed = DateTime::parse_from_rfc3339("2026-05-16T12:00:00Z");
    parsed
        .map(|timestamp| timestamp.with_timezone(&Utc))
        .expect("valid test timestamp")
}

View file

@ -0,0 +1,209 @@
use chrono::{TimeZone, Utc};
use serde_json::{Map, Value, json};
use url::Url;
use webclaw_capture::redact::{redact_artifact, redact_headers, redact_json, redact_url};
use webclaw_capture::types::{CaptureArtifact, CapturedExchange};
/// Placeholder the redaction tests expect in place of every sensitive value.
const REDACTED: &str = "[REDACTED]";
fn header_map(entries: &[(&str, &str)]) -> Map<String, Value> {
entries
.iter()
.map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned())))
.collect()
}
/// Returns the decoded value of the first query parameter called `name` in `url`, or `None` if
/// there is no such parameter. Panics if `url` does not parse.
fn query_value(url: &str, name: &str) -> Option<String> {
    let parsed = Url::parse(url).unwrap();
    for (key, value) in parsed.query_pairs() {
        if key == name {
            return Some(value.into_owned());
        }
    }
    None
}
/// Headers with sensitive names are replaced by the redaction marker; `Content-Type` is kept.
#[test]
fn redacts_sensitive_header_and_cookie_values_by_name() {
    let sensitive = [
        ("Authorization", "Bearer secret-token"),
        ("Cookie", "session=secret-session; theme=dark"),
        ("Set-Cookie", "account=secret-cookie; HttpOnly"),
        ("X-Api-Key", "secret-api-key"),
        ("X-CSRF-Token", "secret-csrf-token"),
        ("X-Session-Id", "secret-session-id"),
        ("X-Password-Hash", "secret-password"),
        ("X-User-Email", "person@example.test"),
    ];
    let mut entries = sensitive.to_vec();
    entries.push(("Content-Type", "application/json"));

    let redacted = redact_headers(&header_map(&entries));

    for (name, _) in sensitive {
        assert_eq!(redacted[name], REDACTED);
    }
    assert_eq!(redacted["Content-Type"], "application/json");
}
/// Query parameters with sensitive names are redacted, `page` is kept, and no secret text or
/// encoded email remains in the URL.
#[test]
fn redacts_sensitive_query_parameter_values_by_name() {
    let url = concat!(
        "https://example.test/api/products?",
        "authorization=Bearer%20secret-token&",
        "api-key=secret-api-key&",
        "csrf=secret-csrf&",
        "access_token=secret-access-token&",
        "session_id=secret-session&",
        "password=secret-password&",
        "email=person%40example.test&",
        "cookie=secret-cookie&",
        "page=2"
    );

    let redacted = redact_url(url);

    let sensitive_names = [
        "authorization",
        "api-key",
        "csrf",
        "access_token",
        "session_id",
        "password",
        "email",
        "cookie",
    ];
    for name in sensitive_names {
        assert_eq!(query_value(&redacted, name).as_deref(), Some(REDACTED));
    }
    assert_eq!(query_value(&redacted, "page").as_deref(), Some("2"));

    assert!(!redacted.contains("secret"));
    assert!(!redacted.contains("person%40example.test"));
}
/// Sensitive keys are redacted at the top level, inside nested objects and inside array
/// elements; non-sensitive siblings keep their values.
#[test]
fn redacts_sensitive_json_body_keys_recursively() {
    let body = json!({
        "authorization": "Bearer secret-token",
        "cookie": "session=secret-session",
        "set-cookie": "session=secret-session",
        "api-key": "secret-api-key",
        "csrf": "secret-csrf",
        "access_token": "secret-access-token",
        "session_id": "secret-session",
        "password": "secret-password",
        "email": "person@example.test",
        "profile": {
            "backupEmail": "backup@example.test",
            "display_name": "Visible Name"
        },
        "items": [
            {
                "sessionToken": "nested-secret-session-token",
                "quantity": 3
            }
        ]
    });

    let redacted = redact_json(&body);

    let top_level_sensitive = [
        "authorization",
        "cookie",
        "set-cookie",
        "api-key",
        "csrf",
        "access_token",
        "session_id",
        "password",
        "email",
    ];
    for key in top_level_sensitive {
        assert_eq!(redacted[key], REDACTED);
    }

    let profile = &redacted["profile"];
    assert_eq!(profile["backupEmail"], REDACTED);
    assert_eq!(profile["display_name"], "Visible Name");

    let first_item = &redacted["items"][0];
    assert_eq!(first_item["sessionToken"], REDACTED);
    assert_eq!(first_item["quantity"], 3);
}
/// Redacting a whole artifact scrubs the source URL, the exchange URL, the redirect chain, both
/// header maps and both JSON body samples, while leaving non-sensitive values readable.
#[test]
fn redacts_capture_artifact_headers_urls_and_json_body_samples() {
    let captured_at = Utc.with_ymd_and_hms(2026, 5, 16, 12, 0, 0).unwrap();

    let request_body = json!({
        "email": "person@example.test",
        "password": "secret-password",
        "name": "Visible Name"
    });
    let response_body = json!({
        "sessionToken": "secret-session-token",
        "status": "ok"
    });

    let login_exchange = CapturedExchange {
        method: String::from("POST"),
        url: String::from("https://example.test/api/session?token=secret-token&page=2"),
        request_headers: header_map(&[
            ("Authorization", "Bearer secret-token"),
            ("Content-Type", "application/json"),
        ]),
        request_body_sample: Some(request_body.to_string()),
        resource_type: Some(String::from("fetch")),
        status: 200,
        response_headers: header_map(&[
            ("Set-Cookie", "session=secret-session; HttpOnly"),
            ("Content-Type", "application/json"),
        ]),
        response_body_sample: Some(response_body.to_string()),
        started_at: captured_at,
        duration_ms: 25,
        redirect_chain: vec![String::from("https://example.test/login?csrf=secret-csrf")],
    };

    let artifact = CaptureArtifact {
        id: String::from("example.test/2026-05-16T12-00-00Z"),
        source_url: String::from("https://example.test/app?email=person@example.test"),
        intent: Some(String::from("discover public API")),
        started_at: captured_at,
        completed_at: Some(captured_at),
        exchanges: vec![login_exchange],
        endpoints: Vec::new(),
        metadata: Map::new(),
    };

    let redacted = redact_artifact(&artifact);
    let exchange = &redacted.exchanges[0];

    // URLs: source, exchange and redirect chain.
    assert_eq!(
        query_value(&redacted.source_url, "email").as_deref(),
        Some(REDACTED)
    );
    assert_eq!(
        query_value(&exchange.url, "token").as_deref(),
        Some(REDACTED)
    );
    assert_eq!(query_value(&exchange.url, "page").as_deref(), Some("2"));

    // Headers on both sides.
    assert_eq!(exchange.request_headers["Authorization"], REDACTED);
    assert_eq!(exchange.request_headers["Content-Type"], "application/json");
    assert_eq!(exchange.response_headers["Set-Cookie"], REDACTED);
    assert_eq!(
        query_value(&exchange.redirect_chain[0], "csrf").as_deref(),
        Some(REDACTED)
    );

    // Body samples: redaction marker present, visible values kept, secrets gone.
    let request_body = exchange.request_body_sample.as_deref().unwrap();
    for kept in [REDACTED, "Visible Name"] {
        assert!(request_body.contains(kept));
    }
    for removed in ["person@example.test", "secret-password"] {
        assert!(!request_body.contains(removed));
    }

    let response_body = exchange.response_body_sample.as_deref().unwrap();
    for kept in [REDACTED, "ok"] {
        assert!(response_body.contains(kept));
    }
    assert!(!response_body.contains("secret-session-token"));
}

View file

@ -0,0 +1,414 @@
use std::collections::BTreeMap;
use std::time::Duration;
use chrono::{DateTime, Utc};
use serde_json::{Map, Value, json};
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::{TcpListener, TcpStream};
use tokio::sync::{mpsc, oneshot};
use webclaw_capture::replay::replay_endpoint;
use webclaw_capture::types::{
EndpointDefinition, EndpointExample, EndpointSafety, ReplayOptions, ReplayResult,
};
/// Loopback HTTP server used by the replay tests to observe what actually goes on the wire.
struct LocalServer {
    /// `http://<addr>` origin of the bound listener.
    base_url: String,
    /// Raw request text forwarded by each connection handler.
    requests: mpsc::UnboundedReceiver<String>,
    /// Signals the accept loop to stop; taken (and fired) on drop.
    shutdown: Option<oneshot::Sender<()>>,
}

impl LocalServer {
    /// Binds an ephemeral port on 127.0.0.1 and spawns the accept loop.
    async fn start() -> Self {
        let listener = TcpListener::bind("127.0.0.1:0")
            .await
            .expect("bind local replay test server");
        let address = listener.local_addr().expect("local replay server address");
        let (shutdown_tx, mut shutdown_rx) = oneshot::channel::<()>();
        let (requests_tx, requests_rx) = mpsc::unbounded_channel::<String>();

        tokio::spawn(async move {
            loop {
                tokio::select! {
                    // Resolves when the sender fires or is dropped; either way, stop accepting.
                    _ = &mut shutdown_rx => break,
                    accepted = listener.accept() => {
                        // A failed accept is ignored and the loop keeps listening.
                        if let Ok((stream, _peer)) = accepted {
                            tokio::spawn(handle_connection(stream, requests_tx.clone()));
                        }
                    }
                }
            }
        });

        Self {
            base_url: format!("http://{address}"),
            requests: requests_rx,
            shutdown: Some(shutdown_tx),
        }
    }

    /// Waits up to two seconds for the next request the server received.
    ///
    /// # Panics
    /// Panics if no request arrives in time or the request channel has closed.
    async fn next_request(&mut self) -> String {
        let received = tokio::time::timeout(Duration::from_secs(2), self.requests.recv()).await;
        received
            .expect("local replay server should receive a request")
            .expect("local replay server request channel should remain open")
    }
}

impl Drop for LocalServer {
    fn drop(&mut self) {
        let Some(signal) = self.shutdown.take() else {
            return;
        };
        // The accept loop may already be gone; a failed send is harmless.
        let _ = signal.send(());
    }
}
/// A safe GET endpoint replayed with `dry_run: false` must hit the server and return its body.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn get_endpoint_executes_when_dry_run_is_false() {
    let mut server = LocalServer::start().await;
    let endpoint = get_endpoint(&server.base_url, headers(&[("Accept", "application/json")]));
    let options = ReplayOptions {
        dry_run: false,
        confirm_unsafe: false,
        params_json: Some(json!({ "category": "tools" })),
        headers: Map::new(),
        body_json: None,
    };

    let result = replay_endpoint(&endpoint, options)
        .await
        .expect("replay GET endpoint");

    let (status, body_sample) = match result {
        ReplayResult::Executed {
            status,
            body_sample,
            ..
        } => (status, body_sample),
        other => panic!("GET replay should execute, got {other:#?}"),
    };
    assert_eq!(status, 200);
    let body_text = body_sample.as_deref().unwrap_or_default();
    assert!(
        body_text.contains(r#""ok":true"#),
        "executed replay should return the response body sample"
    );

    // The request must really have reached the local server.
    let request = server.next_request().await;
    assert!(
        request.starts_with("GET /api/products"),
        "server should receive the replayed GET request, got {request:?}"
    );
}
/// With `dry_run: true` a GET replay must yield a preview of the request instead of sending it.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn get_endpoint_with_dry_run_returns_preview_without_network() {
    // No server is started for this origin, so the test only passes if nothing is sent.
    let endpoint = get_endpoint(
        "http://127.0.0.1:9",
        headers(&[("Accept", "application/json")]),
    );
    let options = ReplayOptions {
        dry_run: true,
        confirm_unsafe: false,
        params_json: Some(json!({ "category": "tools" })),
        headers: headers(&[("X-Replay-Trace", "dry-run")]),
        body_json: None,
    };

    let result = replay_endpoint(&endpoint, options)
        .await
        .expect("preview GET endpoint");

    let (method, url, preview_headers, body_sample) = match result {
        ReplayResult::Preview {
            method,
            url,
            headers,
            body_sample,
        } => (method, url, headers, body_sample),
        other => panic!("dry-run GET replay should return a preview, got {other:#?}"),
    };
    assert_eq!(method, "GET");
    assert!(url.starts_with("http://127.0.0.1:9/api/products"));
    assert!(url.contains("category=tools"));
    assert_eq!(
        header_string(&preview_headers, "X-Replay-Trace"),
        Some("dry-run")
    );
    assert_eq!(body_sample, None);
}
/// A POST replay that is neither a dry run nor confirmed must be refused with an explanation.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn post_without_confirmation_is_blocked() {
    let endpoint = post_endpoint("http://127.0.0.1:9");
    let options = ReplayOptions {
        dry_run: false,
        confirm_unsafe: false,
        params_json: None,
        headers: Map::new(),
        body_json: Some(graphql_body()),
    };

    let result = replay_endpoint(&endpoint, options)
        .await
        .expect("block unsafe POST replay");

    let reason = match result {
        ReplayResult::Blocked { reason } => reason.to_ascii_lowercase(),
        other => {
            panic!("unsafe POST replay without confirmation should be blocked, got {other:#?}")
        }
    };
    let mentions_confirmation = reason.contains("confirm") || reason.contains("unsafe");
    assert!(
        mentions_confirmation,
        "blocked replay should explain confirmation is required, got {reason:?}"
    );
}
/// A dry-run POST must come back as a preview carrying the would-be request body.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn post_with_dry_run_returns_preview_only() {
    let endpoint = post_endpoint("http://127.0.0.1:9");
    let options = ReplayOptions {
        dry_run: true,
        confirm_unsafe: false,
        params_json: None,
        headers: headers(&[("Content-Type", "application/json")]),
        body_json: Some(graphql_body()),
    };

    let result = replay_endpoint(&endpoint, options)
        .await
        .expect("preview unsafe POST replay");

    let (method, url, body_sample) = match result {
        ReplayResult::Preview {
            method,
            url,
            body_sample,
            ..
        } => (method, url, body_sample),
        other => panic!("dry-run POST replay should return a preview, got {other:#?}"),
    };
    assert_eq!(method, "POST");
    assert_eq!(url, "http://127.0.0.1:9/graphql");
    let body_text = body_sample.as_deref().unwrap_or_default();
    assert!(
        body_text.contains("CreateProduct"),
        "dry-run POST preview should include the request body sample"
    );
}
/// Headers whose value is the redaction marker must be dropped from the replayed request,
/// whether they come from the captured example or from caller overrides.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn redacted_headers_are_never_sent() {
    // Lower-cased fragments that must not appear anywhere in the request text.
    const FORBIDDEN_FRAGMENTS: [&str; 5] = [
        "authorization:",
        "cookie:",
        "x-api-key:",
        "x-user-email:",
        "[redacted]",
    ];

    let mut server = LocalServer::start().await;
    let captured_headers = headers(&[
        ("Authorization", "[REDACTED]"),
        ("Cookie", "[REDACTED]"),
        ("X-Api-Key", "[REDACTED]"),
        ("X-Trace-Id", "captured-trace"),
    ]);
    let endpoint = get_endpoint(&server.base_url, captured_headers);
    let options = ReplayOptions {
        dry_run: false,
        confirm_unsafe: false,
        params_json: None,
        headers: headers(&[
            ("X-User-Email", "[REDACTED]"),
            ("X-Allowed-Override", "override-ok"),
        ]),
        body_json: None,
    };

    let result = replay_endpoint(&endpoint, options)
        .await
        .expect("replay GET endpoint without redacted headers");
    let executed_ok = matches!(result, ReplayResult::Executed { status: 200, .. });
    assert!(executed_ok, "GET replay should execute, got {result:#?}");

    let request = server.next_request().await;
    let lower_request = request.to_ascii_lowercase();
    for forbidden in FORBIDDEN_FRAGMENTS {
        assert!(
            !lower_request.contains(forbidden),
            "replay request should not send redacted header material {forbidden:?}: {request}"
        );
    }
    assert!(
        lower_request.contains("x-allowed-override: override-ok"),
        "non-redacted caller-supplied headers should still be sent: {request}"
    );
}
/// Serves a single connection for the local replay test server.
///
/// Reads the request head, forwards the raw request text on `requests`, and replies with
/// `200 OK` for `GET /api/products` or `404 Not Found` for anything else, then closes the
/// stream.
///
/// A single `read` may return only part of a request, so bytes are accumulated until the
/// blank line that ends the HTTP head is seen, the peer closes the stream, or the size cap is
/// reached. This keeps assertions on forwarded headers from depending on how the request
/// happened to be split across TCP segments.
async fn handle_connection(mut stream: TcpStream, requests: mpsc::UnboundedSender<String>) {
    const HEAD_TERMINATOR: &[u8] = b"\r\n\r\n";
    // Upper bound on buffered bytes so a misbehaving client cannot grow the buffer forever.
    const MAX_REQUEST_BYTES: usize = 64 * 1024;

    let mut received = Vec::with_capacity(8192);
    let mut chunk = vec![0_u8; 8192];
    while received.len() < MAX_REQUEST_BYTES {
        let Ok(bytes_read) = stream.read(&mut chunk).await else {
            return;
        };
        if bytes_read == 0 {
            // Peer closed the connection; work with whatever arrived.
            break;
        }
        received.extend_from_slice(&chunk[..bytes_read]);
        let head_complete = received
            .windows(HEAD_TERMINATOR.len())
            .any(|window| window == HEAD_TERMINATOR);
        if head_complete {
            break;
        }
    }
    if received.is_empty() {
        return;
    }

    let request = String::from_utf8_lossy(&received).to_string();
    let status = if request.starts_with("GET /api/products") {
        "200 OK"
    } else {
        "404 Not Found"
    };
    let body = if status == "200 OK" {
        r#"{"ok":true,"items":[{"id":12345,"name":"Hammer"}]}"#
    } else {
        r#"{"ok":false}"#
    };
    let response = http_response(status, &[("Content-Type", "application/json")], body);
    // Best effort from here on: the test may already have finished and dropped its receiver,
    // and the client may have hung up.
    let _ = requests.send(request);
    let _ = stream.write_all(response.as_bytes()).await;
    let _ = stream.shutdown().await;
}
/// Renders a complete HTTP/1.1 response as text.
///
/// The fixed `Content-Length` (byte length of `body`), `Connection: close`, and
/// `Cache-Control: no-store` headers come first, followed by `headers` in the given order,
/// a blank line, and `body`.
fn http_response(status: &str, headers: &[(&str, &str)], body: &str) -> String {
    let fixed_head = format!(
        "HTTP/1.1 {status}\r\nContent-Length: {}\r\nConnection: close\r\nCache-Control: no-store\r\n",
        body.len()
    );
    let extra_headers: String = headers
        .iter()
        .map(|(name, value)| format!("{name}: {value}\r\n"))
        .collect();
    [fixed_head.as_str(), extra_headers.as_str(), "\r\n", body].concat()
}
/// Builds a replay-safe `GET /api/products` endpoint fixture rooted at `origin`, whose single
/// captured example carries `request_headers`.
fn get_endpoint(origin: &str, request_headers: Map<String, Value>) -> EndpointDefinition {
    let example = EndpointExample {
        url: format!("{origin}/api/products?category=tools"),
        request_headers,
        request_body_sample: None,
        response_status: 200,
        response_headers: headers(&[("Content-Type", "application/json")]),
        response_body_sample: Some(r#"{"items":[{"id":12345,"name":"Hammer"}]}"#.to_owned()),
        captured_at: test_time(),
    };
    let response_schema = json!({
        "type": "object",
        "properties": {
            "items": { "type": "array" }
        }
    });

    EndpointDefinition {
        id: format!("GET {origin}/api/products"),
        method: "GET".to_owned(),
        origin: origin.to_owned(),
        path_template: "/api/products".to_owned(),
        query_params: BTreeMap::from([("category".to_owned(), vec!["tools".to_owned()])]),
        request_schema: None,
        response_schema: Some(response_schema),
        auth_evidence: Vec::new(),
        safety: EndpointSafety {
            safe_to_replay: true,
            requires_confirmation: false,
            reason: "GET is a read-oriented HTTP method".to_owned(),
        },
        examples: vec![example],
    }
}
/// Builds a `POST /graphql` endpoint fixture rooted at `origin` that is marked unsafe to
/// replay and requiring confirmation.
fn post_endpoint(origin: &str) -> EndpointDefinition {
    let request_schema = json!({
        "type": "object",
        "properties": {
            "query": { "type": "string" },
            "variables": { "type": "object" }
        }
    });
    let safety = EndpointSafety {
        safe_to_replay: false,
        requires_confirmation: true,
        reason: "POST may mutate server state and requires confirmation".to_owned(),
    };
    // The captured example already has its CSRF token redacted.
    let example = EndpointExample {
        url: format!("{origin}/graphql"),
        request_headers: headers(&[
            ("Content-Type", "application/json"),
            ("X-CSRF-Token", "[REDACTED]"),
        ]),
        request_body_sample: Some(graphql_body().to_string()),
        response_status: 200,
        response_headers: headers(&[("Content-Type", "application/json")]),
        response_body_sample: Some(r#"{"data":{"createProduct":{"id":"12345"}}}"#.to_owned()),
        captured_at: test_time(),
    };

    EndpointDefinition {
        id: format!("POST {origin}/graphql"),
        method: "POST".to_owned(),
        origin: origin.to_owned(),
        path_template: "/graphql".to_owned(),
        query_params: BTreeMap::new(),
        request_schema: Some(request_schema),
        response_schema: Some(json!({ "type": "object" })),
        auth_evidence: vec!["X-CSRF-Token header observed".to_owned()],
        safety,
        examples: vec![example],
    }
}
fn headers(entries: &[(&str, &str)]) -> Map<String, Value> {
entries
.iter()
.map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned())))
.collect()
}
/// Looks up a header by case-insensitive name and returns its value as a string slice.
///
/// Only the first matching entry is considered; `None` is returned when there is no match or
/// when that entry's value is not a JSON string.
fn header_string<'a>(headers: &'a Map<String, Value>, name: &str) -> Option<&'a str> {
    for (header_name, value) in headers.iter() {
        if header_name.eq_ignore_ascii_case(name) {
            return value.as_str();
        }
    }
    None
}
/// Returns the GraphQL `CreateProduct` mutation payload used as the POST fixture body.
fn graphql_body() -> Value {
    let variables = json!({
        "name": "Hammer"
    });
    json!({
        "query": "mutation CreateProduct($name: String!) { createProduct(name: $name) { id } }",
        "variables": variables
    })
}
/// Fixed capture timestamp (2026-05-16T12:00:00Z) shared by the replay fixtures.
fn test_time() -> DateTime<Utc> {
    let parsed =
        DateTime::parse_from_rfc3339("2026-05-16T12:00:00Z").expect("valid test timestamp");
    parsed.with_timezone(&Utc)
}

View file

@ -0,0 +1,312 @@
use std::collections::BTreeMap;
use std::env;
use std::ffi::OsString;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use std::time::{SystemTime, UNIX_EPOCH};
use chrono::{DateTime, Utc};
use serde_json::{Map, Value, json};
use url::Url;
use webclaw_capture::redact::redact_artifact;
use webclaw_capture::store::{
capture_id_for, capture_root, find_endpoint, load_endpoints, save_capture,
};
use webclaw_capture::types::{
CaptureArtifact, CapturedExchange, EndpointDefinition, EndpointExample, EndpointSafety,
};
/// Serializes tests that change the process-wide capture-directory variable.
static ENV_LOCK: Mutex<()> = Mutex::new(());
/// Environment variable these tests override to redirect the capture root.
const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR";

/// Scope guard that overrides `WEBCLAW_CAPTURE_DIR` and puts the previous value back on drop.
struct EnvVarGuard {
    /// Value the variable had before the override (`None` if it was unset).
    original: Option<OsString>,
}

impl EnvVarGuard {
    /// Sets the variable to `value`, or removes it when `value` is `None`, remembering the
    /// prior state for restoration.
    fn set_capture_dir(value: Option<&Path>) -> Self {
        let original = env::var_os(CAPTURE_DIR_ENV);
        // SAFETY: NOTE(review): soundness relies on no other thread touching the environment
        // concurrently; within this file callers hold `ENV_LOCK` — confirm nothing else does.
        unsafe {
            if let Some(path) = value {
                env::set_var(CAPTURE_DIR_ENV, path);
            } else {
                env::remove_var(CAPTURE_DIR_ENV);
            }
        }
        Self { original }
    }
}

impl Drop for EnvVarGuard {
    fn drop(&mut self) {
        // SAFETY: NOTE(review): same single-writer assumption as in `set_capture_dir`.
        unsafe {
            if let Some(previous) = &self.original {
                env::set_var(CAPTURE_DIR_ENV, previous);
            } else {
                env::remove_var(CAPTURE_DIR_ENV);
            }
        }
    }
}
/// Runs `test` with `WEBCLAW_CAPTURE_DIR` set to `value` (or unset for `None`), holding
/// `ENV_LOCK` so tests that touch the variable do not overlap. Returns whatever `test` returns.
fn with_capture_dir<T>(value: Option<&Path>, test: impl FnOnce() -> T) -> T {
    // A failing assertion inside `test` unwinds while the lock is held and poisons it. The
    // mutex protects no data (`()`), and `_guard` restores the variable during unwinding, so
    // it is safe to keep using the lock; otherwise one failing test would make every later
    // test in this file fail too.
    let _lock = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    // Declared after `_lock`, so it is dropped first: the variable is restored before the
    // next test can acquire the lock.
    let _guard = EnvVarGuard::set_capture_dir(value);
    test()
}
/// Returns a path under the system temp directory whose name embeds `test_name`, the process
/// id, and the current time in nanoseconds. The directory itself is not created.
fn unique_temp_root(test_name: &str) -> PathBuf {
    let nanos = UNIX_EPOCH
        .elapsed()
        .expect("system time after unix epoch")
        .as_nanos();
    let mut root = env::temp_dir();
    root.push(format!(
        "webclaw-capture-store-{test_name}-{}-{nanos}",
        std::process::id()
    ));
    root
}
/// Fixed timestamp (2026-05-16T12:00:00Z) used by every store fixture.
fn test_time() -> DateTime<Utc> {
    let fixed =
        DateTime::parse_from_rfc3339("2026-05-16T12:00:00Z").expect("valid test timestamp");
    fixed.with_timezone(&Utc)
}
fn headers(entries: &[(&str, &str)]) -> Map<String, Value> {
entries
.iter()
.map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned())))
.collect()
}
/// Endpoint fixture for `GET https://example.test/api/products` whose captured example still
/// contains a raw `Authorization` secret, so redaction on save can be verified.
fn sample_endpoint() -> EndpointDefinition {
    let response_schema = json!({
        "type": "object",
        "properties": {
            "items": {
                "type": "array",
                "items": { "type": "object" }
            }
        }
    });
    let example = EndpointExample {
        url: "https://example.test/api/products?category=tools".to_owned(),
        request_headers: headers(&[
            ("Authorization", "Bearer raw-secret"),
            ("Accept", "application/json"),
        ]),
        request_body_sample: None,
        response_status: 200,
        response_headers: headers(&[("Content-Type", "application/json")]),
        response_body_sample: Some(r#"{"items":[{"id":12345,"name":"Hammer"}]}"#.to_owned()),
        captured_at: test_time(),
    };

    EndpointDefinition {
        id: "GET https://example.test/api/products".to_owned(),
        method: "GET".to_owned(),
        origin: "https://example.test".to_owned(),
        path_template: "/api/products".to_owned(),
        query_params: BTreeMap::from([("category".to_owned(), vec!["tools".to_owned()])]),
        request_schema: None,
        response_schema: Some(response_schema),
        auth_evidence: vec!["Authorization header observed".to_owned()],
        safety: EndpointSafety {
            safe_to_replay: true,
            requires_confirmation: false,
            reason: "GET is a read-oriented HTTP method".to_owned(),
        },
        examples: vec![example],
    }
}
/// Captured-exchange fixture carrying raw secrets in its URL, headers, and redirect chain.
fn sample_exchange() -> CapturedExchange {
    let request_headers = headers(&[
        ("Authorization", "Bearer raw-secret"),
        ("Accept", "application/json"),
    ]);
    let response_headers = headers(&[("Content-Type", "application/json")]);
    let response_body = r#"{"items":[{"id":12345,"name":"Hammer"}]}"#.to_owned();

    CapturedExchange {
        method: "GET".to_owned(),
        url: "https://example.test/api/products?category=tools&token=raw-secret".to_owned(),
        request_headers,
        request_body_sample: None,
        resource_type: Some("fetch".to_owned()),
        status: 200,
        response_headers,
        response_body_sample: Some(response_body),
        started_at: test_time(),
        duration_ms: 42,
        redirect_chain: vec!["https://example.test/login?session=raw-secret".to_owned()],
    }
}
fn sample_artifact() -> CaptureArtifact {
let mut metadata = Map::new();
metadata.insert("runner".to_owned(), json!("store-test"));
CaptureArtifact {
id: "example.test/2026-05-16T12-00-00Z".to_owned(),
source_url: "https://example.test/products?email=user@example.test".to_owned(),
intent: Some("discover product listing API".to_owned()),
started_at: test_time(),
completed_at: Some(test_time()),
exchanges: vec![sample_exchange()],
endpoints: vec![sample_endpoint()],
metadata,
}
}
/// Reads the file at `path` and deserializes its JSON contents into `T`.
///
/// # Panics
/// Panics if the file cannot be read as text or does not deserialize into `T`.
fn read_json<T: serde::de::DeserializeOwned>(path: &Path) -> T {
    let text = fs::read_to_string(path).expect("read JSON file");
    let parsed: T = serde_json::from_str(text.as_str()).expect("valid JSON file");
    parsed
}
/// With no override set, the capture root is `<home>/.webclaw/api-captures`, where `<home>`
/// is `USERPROFILE` when present and the `dirs` home directory otherwise.
#[test]
fn default_capture_root_resolves_under_user_profile_webclaw_api_captures() {
    with_capture_dir(None, || {
        let home = match env::var_os("USERPROFILE") {
            Some(profile) => PathBuf::from(profile),
            None => dirs::home_dir().expect("home directory"),
        };
        let expected = home.join(".webclaw").join("api-captures");
        assert_eq!(capture_root(), expected);
    });
}
/// When `WEBCLAW_CAPTURE_DIR` is set, `capture_root` returns exactly that directory.
#[test]
fn capture_root_uses_webclaw_capture_dir_override() {
    let override_dir = unique_temp_root("override");
    with_capture_dir(Some(&override_dir), || {
        let resolved = capture_root();
        assert_eq!(resolved, override_dir);
    });
}
/// Capture ids are `<host>/<UTC timestamp>` with `-` in place of `:` in the time part.
#[test]
fn capture_id_for_uses_domain_and_filesystem_safe_utc_timestamp() {
    let url = Url::parse("https://example.test/api/products?category=tools").expect("valid URL");
    let capture_id = capture_id_for(&url, test_time());
    assert_eq!(capture_id, "example.test/2026-05-16T12-00-00Z");
}
/// `save_capture` must write the four capture files under `<root>/<host>/<timestamp>`, keep
/// the raw capture intact, and keep secrets and PII out of the other three files.
#[test]
fn save_capture_writes_raw_redacted_endpoints_and_metadata_files() {
    let root = unique_temp_root("save");
    with_capture_dir(Some(&root), || {
        let artifact = sample_artifact();
        let saved = save_capture(&artifact).expect("save capture");
        assert_eq!(saved.id, artifact.id);

        // Directory layout and file names.
        let expected_dir = root.join("example.test").join("2026-05-16T12-00-00Z");
        assert_eq!(saved.capture_dir, expected_dir);
        let written_files = [
            (&saved.raw_capture_path, "raw-capture.json"),
            (&saved.redacted_capture_path, "redacted-capture.json"),
            (&saved.endpoints_path, "endpoints.json"),
            (&saved.metadata_path, "metadata.json"),
        ];
        for (path, file_name) in written_files {
            assert_eq!(*path, saved.capture_dir.join(file_name));
        }
        for (path, _file_name) in written_files {
            assert!(path.is_file());
        }

        // The raw capture round-trips unchanged.
        let raw_capture: CaptureArtifact = read_json(&saved.raw_capture_path);
        assert_eq!(raw_capture, artifact);

        // The redacted capture differs from the input and carries no secrets.
        let redacted_capture: CaptureArtifact = read_json(&saved.redacted_capture_path);
        assert_ne!(redacted_capture, artifact);
        let redacted_text =
            serde_json::to_string(&redacted_capture).expect("serialize redacted capture");
        assert!(
            !redacted_text.contains("raw-secret"),
            "redacted capture should not contain raw secrets"
        );

        // endpoints.json holds the redacted endpoint definitions.
        let endpoints: Vec<EndpointDefinition> = read_json(&saved.endpoints_path);
        assert_eq!(endpoints, redact_artifact(&artifact).endpoints);
        let endpoints_text = serde_json::to_string(&endpoints).expect("serialize endpoints");
        assert!(
            !endpoints_text.contains("raw-secret"),
            "endpoints.json should not contain raw secrets"
        );

        // metadata.json is an object with the source URL's PII masked.
        let metadata: Value = read_json(&saved.metadata_path);
        assert!(
            metadata.is_object(),
            "metadata.json should contain a JSON object"
        );
        let metadata_text = serde_json::to_string(&metadata).expect("serialize metadata");
        assert!(
            !metadata_text.contains("user@example.test"),
            "metadata.json should redact PII from source_url"
        );
        assert!(
            metadata_text.contains("REDACTED"),
            "metadata.json should preserve the redaction marker"
        );
    });
    // Best-effort cleanup; a leftover temp directory must not fail the test.
    let _ = fs::remove_dir_all(root);
}
/// `load_endpoints` returns the redacted endpoints that `save_capture` stored for a capture id.
#[test]
fn load_endpoints_by_capture_id_reads_endpoints_json() {
    let root = unique_temp_root("load");
    with_capture_dir(Some(&root), || {
        let artifact = sample_artifact();
        save_capture(&artifact).expect("save capture");

        let loaded = load_endpoints(&artifact.id).expect("load endpoints");
        let expected = redact_artifact(&artifact).endpoints;
        assert_eq!(loaded, expected);

        let loaded_text = serde_json::to_string(&loaded).expect("serialize loaded endpoints");
        assert!(
            !loaded_text.contains("raw-secret"),
            "loaded endpoints should not contain raw secrets"
        );
    });
    // Best-effort cleanup of the temporary capture root.
    let _ = fs::remove_dir_all(root);
}
/// `find_endpoint` locates a saved endpoint by id and returns its redacted definition.
#[test]
fn find_endpoint_scans_saved_capture_endpoints() {
    let root = unique_temp_root("find");
    with_capture_dir(Some(&root), || {
        let artifact = sample_artifact();
        let expected = redact_artifact(&artifact).endpoints[0].clone();
        save_capture(&artifact).expect("save capture");

        let found = find_endpoint(&expected.id).expect("find endpoint");
        assert_eq!(found, expected);

        let found_text = serde_json::to_string(&found).expect("serialize found endpoint");
        assert!(
            !found_text.contains("raw-secret"),
            "found endpoint should not contain raw secrets"
        );
    });
    // Best-effort cleanup of the temporary capture root.
    let _ = fs::remove_dir_all(root);
}

View file

@ -11,6 +11,7 @@ path = "src/main.rs"
[dependencies]
webclaw-core = { workspace = true }
webclaw-capture = { path = "../webclaw-capture" }
webclaw-fetch = { workspace = true }
webclaw-llm = { workspace = true }
webclaw-pdf = { workspace = true }

View file

@ -10,6 +10,11 @@ use std::sync::atomic::{AtomicBool, Ordering};
use clap::{Parser, Subcommand, ValueEnum};
use tracing_subscriber::EnvFilter;
use webclaw_capture::cdp::{CaptureOptions, capture_network};
use webclaw_capture::openapi::write_openapi;
use webclaw_capture::replay::replay_endpoint;
use webclaw_capture::store::{find_endpoint, load_endpoints};
use webclaw_capture::types::{EndpointDefinition, ReplayOptions};
use webclaw_core::{
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
to_llm_text,
@ -336,6 +341,61 @@ enum Commands {
#[arg(long)]
raw: bool,
},
/// Capture browser network traffic and learn reusable API endpoints.
CaptureNetwork {
/// Page URL to inspect.
url: String,
/// Capture intent, stored with the capture metadata.
#[arg(long)]
intent: Option<String>,
/// Milliseconds to wait after page navigation before saving the capture.
#[arg(long, default_value_t = 3000)]
wait_ms: u64,
/// Run Chromium with a visible window instead of headless mode.
#[arg(long)]
headed: bool,
},
/// Print learned endpoints for a saved capture id.
Endpoints {
/// Capture id, for example `example.com/2026-05-16T12-00-00Z`.
capture_id: String,
},
/// Print one learned endpoint by endpoint id.
ShowEndpoint {
/// Endpoint id, for example `get_example_test_api_products`.
endpoint_id: String,
},
/// Replay or preview a learned endpoint.
ReplayEndpoint {
/// Endpoint id to replay.
endpoint_id: String,
/// JSON object with path/query parameter overrides.
#[arg(long, default_value = "{}")]
params_json: String,
/// Preview the replay request without network access.
#[arg(long)]
dry_run: bool,
/// Allow unsafe methods such as POST, PUT, PATCH, and DELETE to execute.
#[arg(long)]
confirm_unsafe: bool,
},
/// Export a saved capture's learned endpoints as OpenAPI 3.1 JSON.
#[command(name = "export-openapi")]
ExportOpenapi {
/// Capture id, for example `example.com/2026-05-16T12-00-00Z`.
capture_id: String,
},
}
#[derive(Clone, ValueEnum)]
@ -2169,6 +2229,121 @@ fn has_llm_flags(cli: &Cli) -> bool {
cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some()
}
/// Handles `capture-network`: captures traffic for `url` and prints the saved-capture summary
/// as pretty JSON on stdout.
///
/// # Errors
/// Returns a message when the capture fails or the summary cannot be encoded.
async fn run_capture_network_command(
    url: &str,
    intent: Option<String>,
    wait_ms: u64,
    headed: bool,
) -> Result<(), String> {
    let options = CaptureOptions {
        url: normalize_url(url),
        intent,
        wait_ms,
        headed,
    };
    let saved = match capture_network(options).await {
        Ok(saved) => saved,
        Err(e) => return Err(format!("capture-network failed: {e}")),
    };
    let rendered =
        serde_json::to_string_pretty(&saved).map_err(|e| format!("JSON encode failed: {e}"))?;
    println!("{}", rendered);
    Ok(())
}
/// Handles `endpoints`: prints the learned endpoints of the capture `capture_id` as pretty JSON.
///
/// # Errors
/// Returns a message when the endpoints cannot be loaded or encoded.
fn run_endpoints_command(capture_id: &str) -> Result<(), String> {
    let endpoints = match load_endpoints(capture_id) {
        Ok(endpoints) => endpoints,
        Err(e) => {
            return Err(format!(
                "could not load endpoints for capture id {capture_id}: {e}"
            ));
        }
    };
    let rendered = serde_json::to_string_pretty(&endpoints)
        .map_err(|e| format!("JSON encode failed: {e}"))?;
    println!("{}", rendered);
    Ok(())
}
/// Handles `show-endpoint`: prints the endpoint with id `endpoint_id` as pretty JSON.
///
/// # Errors
/// Returns a message when the endpoint cannot be found or encoded.
fn run_show_endpoint_command(endpoint_id: &str) -> Result<(), String> {
    let endpoint = match find_endpoint(endpoint_id) {
        Ok(endpoint) => endpoint,
        Err(e) => return Err(format!("could not find endpoint id {endpoint_id}: {e}")),
    };
    let rendered =
        serde_json::to_string_pretty(&endpoint).map_err(|e| format!("JSON encode failed: {e}"))?;
    println!("{}", rendered);
    Ok(())
}
/// Handles `replay-endpoint`: replays or previews the endpoint `endpoint_id` and prints the
/// result as pretty JSON.
///
/// Endpoints that `endpoint_defaults_to_dry_run` flags are only previewed unless
/// `confirm_unsafe` is set; when that downgrade happens without `--dry-run`, a hint is written
/// to stderr.
///
/// # Errors
/// Returns a message when the endpoint is unknown, `params_json` is invalid, the replay fails,
/// or the result cannot be encoded.
async fn run_replay_endpoint_command(
    endpoint_id: &str,
    params_json: &str,
    dry_run: bool,
    confirm_unsafe: bool,
) -> Result<(), String> {
    let endpoint = match find_endpoint(endpoint_id) {
        Ok(endpoint) => endpoint,
        Err(e) => return Err(format!("could not find endpoint id {endpoint_id}: {e}")),
    };
    let overrides = parse_params_json(params_json)?;

    // Without explicit confirmation, a flagged endpoint is forced into preview mode.
    let forced_preview = !confirm_unsafe && endpoint_defaults_to_dry_run(&endpoint);
    if forced_preview && !dry_run {
        eprintln!(
            "Unsafe endpoint replay defaults to dry-run. Re-run with --confirm-unsafe to execute."
        );
    }

    let options = ReplayOptions {
        dry_run: dry_run || forced_preview,
        confirm_unsafe,
        params_json: overrides,
        headers: serde_json::Map::new(),
        body_json: None,
    };
    let outcome = match replay_endpoint(&endpoint, options).await {
        Ok(outcome) => outcome,
        Err(e) => return Err(format!("replay-endpoint failed: {e}")),
    };
    let rendered =
        serde_json::to_string_pretty(&outcome).map_err(|e| format!("JSON encode failed: {e}"))?;
    println!("{}", rendered);
    Ok(())
}
/// Handles `export-openapi`: writes the OpenAPI document for `capture_id` and prints the path
/// that `write_openapi` returned.
///
/// # Errors
/// Returns a message when the export fails.
fn run_export_openapi_command(capture_id: &str) -> Result<(), String> {
    match write_openapi(capture_id) {
        Ok(path) => {
            println!("{}", path.display());
            Ok(())
        }
        Err(e) => Err(format!(
            "could not export OpenAPI for capture id {capture_id}: {e}"
        )),
    }
}
/// Parses the `--params-json` argument.
///
/// Blank input means "no overrides" (`Ok(None)`); otherwise the text must be a JSON object.
///
/// # Errors
/// Returns a message when the text is not valid JSON or is valid JSON but not an object.
fn parse_params_json(params_json: &str) -> Result<Option<serde_json::Value>, String> {
    let raw = params_json.trim();
    if raw.is_empty() {
        return Ok(None);
    }
    match serde_json::from_str::<serde_json::Value>(raw) {
        Ok(value) if value.is_object() => Ok(Some(value)),
        Ok(_) => Err("--params-json must be a JSON object".to_owned()),
        Err(e) => Err(format!("--params-json must be valid JSON: {e}")),
    }
}
/// Reports whether replaying `endpoint` should fall back to a dry run.
///
/// True when the endpoint requires confirmation, is not marked safe to replay, or uses a
/// method other than GET, HEAD, or OPTIONS (compared case-insensitively).
fn endpoint_defaults_to_dry_run(endpoint: &EndpointDefinition) -> bool {
    const READ_ONLY_METHODS: [&str; 3] = ["GET", "HEAD", "OPTIONS"];

    let safety = &endpoint.safety;
    if safety.requires_confirmation || !safety.safe_to_replay {
        return true;
    }
    !READ_ONLY_METHODS
        .iter()
        .any(|method| endpoint.method.eq_ignore_ascii_case(method))
}
async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
let api_key = cli
.api_key
@ -2405,6 +2580,56 @@ async fn main() {
}
return;
}
Commands::CaptureNetwork {
url,
intent,
wait_ms,
headed,
} => {
if let Err(e) =
run_capture_network_command(url, intent.clone(), *wait_ms, *headed).await
{
eprintln!("error: {e}");
process::exit(1);
}
return;
}
Commands::Endpoints { capture_id } => {
if let Err(e) = run_endpoints_command(capture_id) {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
Commands::ShowEndpoint { endpoint_id } => {
if let Err(e) = run_show_endpoint_command(endpoint_id) {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
Commands::ReplayEndpoint {
endpoint_id,
params_json,
dry_run,
confirm_unsafe,
} => {
if let Err(e) =
run_replay_endpoint_command(endpoint_id, params_json, *dry_run, *confirm_unsafe)
.await
{
eprintln!("error: {e}");
process::exit(1);
}
return;
}
Commands::ExportOpenapi { capture_id } => {
if let Err(e) = run_export_openapi_command(capture_id) {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
}
}

View file

@ -597,7 +597,7 @@ mod tests {
"#;
let entries = parse_sitemap_xml(xml);
// Should return at least the successfully parsed entry
assert!(entries.len() >= 1);
assert!(!entries.is_empty());
assert_eq!(entries[0].url, "https://example.com/good");
}

View file

@ -193,7 +193,7 @@ mod tests {
.await
.is_ok()
);
assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false);
assert!(!is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))));
}
#[tokio::test]

View file

@ -71,7 +71,7 @@ fn classify(body: &str, len: usize, status: u16, kw: &[String]) -> &'static str
"CHALLENGE"
} else if status == 403 || status == 429 {
"BLOCKED"
} else if status >= 300 && status < 400 {
} else if (300..400).contains(&status) {
"REDIRECT"
} else if len < 1000 {
"EMPTY"

View file

@ -14,6 +14,7 @@ webclaw-core = { workspace = true }
webclaw-fetch = { workspace = true }
webclaw-llm = { workspace = true }
webclaw-pdf = { workspace = true }
webclaw-capture = { path = "../webclaw-capture" }
rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] }
schemars = "1.0"
dotenvy = { workspace = true }

View file

@ -11,6 +11,10 @@ use server::WebclawMcp;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
if print_help_or_version() {
return Ok(());
}
dotenvy::dotenv().ok();
// Log to stderr -- stdout is the MCP transport channel
@ -25,3 +29,42 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
service.waiting().await?;
Ok(())
}
/// Handles `-h`/`--help` and `-V`/`--version` when given as the first command-line argument.
///
/// Prints the requested text to stdout and returns `true` if one of those flags was handled;
/// returns `false` for no arguments or any other first argument.
fn print_help_or_version() -> bool {
    // Index 0 is the program name, so the first real argument is at index 1.
    let first_arg = std::env::args().nth(1);
    match first_arg.as_deref() {
        Some("-h" | "--help") => {
            println!("{}", help_text());
            true
        }
        Some("-V" | "--version") => {
            println!("webclaw-mcp {}", env!("CARGO_PKG_VERSION"));
            true
        }
        _ => false,
    }
}
/// Builds the text printed for `-h`/`--help`: the crate version (taken from
/// `CARGO_PKG_VERSION` at compile time), a usage line, the supported options, and the names
/// of the MCP tools.
fn help_text() -> String {
format!(
"\
webclaw-mcp {version}
MCP server for webclaw web extraction toolkit
Usage: webclaw-mcp
Options:
-h, --help Print help
-V, --version Print version
Tools:
scrape, crawl, map, batch, extract, summarize, diff, brand, research, search,
capture_network, discover_endpoints, show_endpoint, replay_endpoint,
export_openapi, list_captures, list_extractors, vertical_scrape",
version = env!("CARGO_PKG_VERSION")
)
}

View file

@ -4,6 +4,8 @@
/// Uses a local-first architecture: fetches pages directly, then falls back
/// to the webclaw cloud API (api.webclaw.io) when bot protection or
/// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback.
use std::fs;
use std::path::Path;
use std::sync::{Arc, OnceLock};
use std::time::Duration;
@ -11,9 +13,14 @@ use rmcp::handler::server::router::tool::ToolRouter;
use rmcp::handler::server::wrapper::Parameters;
use rmcp::model::{Implementation, ServerCapabilities, ServerInfo};
use rmcp::{ServerHandler, tool, tool_handler, tool_router};
use serde_json::json;
use serde_json::{Map, Value, json};
use tracing::{error, info, warn};
use webclaw_capture::cdp::{CaptureOptions, capture_network as run_network_capture};
use webclaw_capture::openapi::write_openapi;
use webclaw_capture::replay::replay_endpoint as run_endpoint_replay;
use webclaw_capture::store::{capture_root, find_endpoint, load_endpoints};
use webclaw_capture::types::{EndpointDefinition, HeaderMap, ReplayOptions};
use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult};
use crate::tools::*;
@ -709,6 +716,96 @@ impl WebclawMcp {
}
}
/// Capture browser network traffic from a page and save learned API endpoints for later replay.
#[tool]
async fn capture_network(
    &self,
    Parameters(params): Parameters<CaptureNetworkParams>,
) -> Result<String, String> {
    // Normalize the URL, then run it through `validate_url` before a capture is started.
    let target = normalize_capture_url(&params.url)?;
    validate_url(&target).await?;

    // Omitted options fall back to a 3000 ms wait and a headless browser.
    let options = CaptureOptions {
        url: target,
        intent: params.intent,
        wait_ms: params.wait_ms.unwrap_or(3000),
        headed: params.headed.unwrap_or(false),
    };
    match run_network_capture(options).await {
        Ok(saved) => to_pretty_json(&saved),
        Err(e) => Err(format!("capture_network failed: {e}")),
    }
}
/// Return learned endpoint definitions for a saved capture id.
#[tool]
async fn discover_endpoints(
    &self,
    Parameters(params): Parameters<DiscoverEndpointsParams>,
) -> Result<String, String> {
    match load_endpoints(&params.capture_id) {
        Ok(endpoints) => to_pretty_json(&endpoints),
        // Include the requested id so the caller can see which capture was missing.
        Err(e) => Err(format!(
            "could not load endpoints for capture id {}: {e}",
            params.capture_id
        )),
    }
}
/// Show one learned endpoint definition by endpoint id.
#[tool]
async fn show_endpoint(
    &self,
    Parameters(params): Parameters<ShowEndpointParams>,
) -> Result<String, String> {
    match find_endpoint(&params.endpoint_id) {
        Ok(endpoint) => to_pretty_json(&endpoint),
        Err(e) => Err(format!(
            "could not find endpoint id {}: {e}",
            params.endpoint_id
        )),
    }
}
/// Replay or preview a learned endpoint. Mutating methods default to dry-run unless confirmed.
#[tool]
async fn replay_endpoint(
    &self,
    Parameters(params): Parameters<ReplayEndpointParams>,
) -> Result<String, String> {
    let endpoint = match find_endpoint(&params.endpoint_id) {
        Ok(endpoint) => endpoint,
        Err(e) => {
            return Err(format!(
                "could not find endpoint id {}: {e}",
                params.endpoint_id
            ));
        }
    };
    // The dry-run default for risky endpoints is applied while building the options.
    let options = replay_options_from_params(&endpoint, &params)?;
    match run_endpoint_replay(&endpoint, options).await {
        Ok(result) => to_pretty_json(&result),
        Err(e) => Err(format!("replay_endpoint failed: {e}")),
    }
}
/// Export a saved capture's learned endpoints as OpenAPI 3.1 JSON.
#[tool]
async fn export_openapi(
    &self,
    Parameters(params): Parameters<ExportOpenApiParams>,
) -> Result<String, String> {
    match write_openapi(&params.capture_id) {
        // Report where the document was written rather than its contents.
        Ok(path) => to_pretty_json(&json!({ "path": path })),
        Err(e) => Err(format!(
            "could not export OpenAPI for capture id {}: {e}",
            params.capture_id
        )),
    }
}
/// List saved network captures from the configured capture root.
#[tool]
async fn list_captures(
    &self,
    Parameters(_params): Parameters<ListCapturesParams>,
) -> Result<String, String> {
    let root = capture_root();
    let captures = list_saved_captures_from_root(&root)?;
    to_pretty_json(&captures)
}
/// List every vertical extractor the server knows about. Returns a
/// JSON array of `{name, label, description, url_patterns}` entries.
/// Call this to discover what verticals are available before using
@ -767,11 +864,183 @@ impl ServerHandler for WebclawMcp {
.with_instructions(String::from(
"Webclaw MCP server -- web content extraction for AI agents. \
Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search, \
list_extractors, vertical_scrape.",
capture_network, discover_endpoints, show_endpoint, replay_endpoint, export_openapi, \
list_captures, list_extractors, vertical_scrape.",
))
}
}
/// Normalize a user-supplied capture URL and restrict it to http/https.
///
/// Leading/trailing whitespace is ignored. Input without a scheme gets
/// `https://` prepended. The returned string is the normalized input, not a
/// re-serialization of the parsed URL.
///
/// # Errors
///
/// Returns a message when the input is empty, does not parse as a URL, or
/// uses a scheme other than `http` / `https`.
fn normalize_capture_url(url: &str) -> Result<String, String> {
    let trimmed = url.trim();
    if trimmed.is_empty() {
        return Err("url must not be empty".to_owned());
    }

    // Only treat the text before the first "://" as a scheme when it has scheme
    // syntax (a letter followed by letters, digits, '+', '-' or '.'). Otherwise
    // a schemeless input such as `example.com/?next=https://other` would be
    // mistaken for an absolute URL and rejected as unparsable.
    let has_scheme = trimmed.split_once("://").is_some_and(|(prefix, _rest)| {
        let mut chars = prefix.chars();
        chars.next().is_some_and(|first| first.is_ascii_alphabetic())
            && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
    });

    let normalized = if has_scheme {
        trimmed.to_owned()
    } else {
        format!("https://{trimmed}")
    };

    let parsed = url::Url::parse(&normalized).map_err(|e| format!("invalid URL: {e}"))?;
    match parsed.scheme() {
        "http" | "https" => Ok(normalized),
        scheme => Err(format!(
            "capture_network only supports http and https URLs, got {scheme:?}"
        )),
    }
}
/// Translate MCP `replay_endpoint` arguments into [`ReplayOptions`].
///
/// The result is a dry run when the caller asked for one, or when the
/// endpoint defaults to dry-run and `confirm_unsafe` was not set to `true`.
///
/// # Errors
///
/// Fails when `params_json` is present but is not a JSON object.
fn replay_options_from_params(
    endpoint: &EndpointDefinition,
    params: &ReplayEndpointParams,
) -> Result<ReplayOptions, String> {
    let params_json_malformed = params
        .params_json
        .as_ref()
        .is_some_and(|value| !value.is_object());
    if params_json_malformed {
        return Err("params_json must be a JSON object".to_owned());
    }

    let confirm_unsafe = params.confirm_unsafe == Some(true);
    let requested_dry_run = params.dry_run == Some(true);
    // Without explicit confirmation, endpoints flagged by
    // `endpoint_defaults_to_dry_run` are only previewed.
    let forced_dry_run = !confirm_unsafe && endpoint_defaults_to_dry_run(endpoint);

    let options = ReplayOptions {
        dry_run: requested_dry_run || forced_dry_run,
        confirm_unsafe,
        params_json: params.params_json.clone(),
        headers: header_map_from_strings(params.headers.as_ref()),
        body_json: params.body_json.clone(),
    };
    Ok(options)
}
/// Decide whether replaying `endpoint` should default to a dry run.
///
/// Returns `true` when the safety record requires confirmation, when it is
/// not marked safe to replay, or when the method (compared case-insensitively)
/// is anything other than GET, HEAD, or OPTIONS.
fn endpoint_defaults_to_dry_run(endpoint: &EndpointDefinition) -> bool {
    let safety = &endpoint.safety;
    if safety.requires_confirmation || !safety.safe_to_replay {
        return true;
    }

    let method = endpoint.method.to_ascii_uppercase();
    !["GET", "HEAD", "OPTIONS"].contains(&method.as_str())
}
/// Convert optional string headers into a [`HeaderMap`] of JSON string values.
///
/// `None` yields an empty map.
fn header_map_from_strings(
    headers: Option<&std::collections::BTreeMap<String, String>>,
) -> HeaderMap {
    // `Option::into_iter` + `flatten` walks the map's entries when present
    // and nothing otherwise.
    let pairs = headers.into_iter().flatten();
    pairs
        .map(|(name, value)| (name.to_owned(), Value::String(value.to_owned())))
        .collect()
}
fn list_saved_captures_from_root(root: &Path) -> Result<Vec<Value>, String> {
if !root.exists() {
return Ok(Vec::new());
}
let mut captures = Vec::new();
collect_saved_captures(root, root, &mut captures)?;
captures.sort_by(|left, right| {
left.get("id")
.and_then(Value::as_str)
.unwrap_or_default()
.cmp(right.get("id").and_then(Value::as_str).unwrap_or_default())
});
Ok(captures)
}
/// Recursively walk `current`, appending the metadata of each capture found.
///
/// Every file named exactly `metadata.json` is read via
/// `read_capture_metadata`; directories are descended into. `root` is passed
/// through unchanged so capture ids can be derived relative to it.
///
/// # Errors
///
/// Stops at the first directory, entry, or metadata file that cannot be read.
fn collect_saved_captures(
    root: &Path,
    current: &Path,
    captures: &mut Vec<Value>,
) -> Result<(), String> {
    let listing = match fs::read_dir(current) {
        Ok(listing) => listing,
        Err(e) => {
            return Err(format!(
                "could not read capture directory {}: {e}",
                current.display()
            ));
        }
    };

    for item in listing {
        let path = item
            .map_err(|e| format!("could not read capture directory entry: {e}"))?
            .path();

        if path.is_dir() {
            collect_saved_captures(root, &path, captures)?;
        } else if path
            .file_name()
            .is_some_and(|name| name == "metadata.json")
        {
            let metadata = read_capture_metadata(root, &path)?;
            captures.push(metadata);
        }
    }

    Ok(())
}
/// Load one `metadata.json` and decorate it with `id` and `capture_dir`.
///
/// A document that parses but is not a JSON object is replaced by an empty
/// object. An existing `id` key is kept; otherwise the id is derived from the
/// capture directory's path relative to `root`. `capture_dir` is always
/// overwritten with the directory containing the metadata file.
///
/// # Errors
///
/// Fails when the file cannot be read or parsed, when the path has no
/// parent, or when the capture id cannot be derived.
fn read_capture_metadata(root: &Path, metadata_path: &Path) -> Result<Value, String> {
    let raw = match fs::read_to_string(metadata_path) {
        Ok(raw) => raw,
        Err(e) => {
            return Err(format!(
                "could not read capture metadata {}: {e}",
                metadata_path.display()
            ));
        }
    };

    let parsed: Value = match serde_json::from_str(&raw) {
        Ok(parsed) => parsed,
        Err(e) => {
            return Err(format!(
                "could not parse capture metadata {}: {e}",
                metadata_path.display()
            ));
        }
    };
    let mut fields = if let Value::Object(fields) = parsed {
        fields
    } else {
        Map::new()
    };

    let Some(capture_dir) = metadata_path.parent() else {
        return Err(format!(
            "metadata path has no parent: {}",
            metadata_path.display()
        ));
    };
    let capture_id = capture_id_from_dir(root, capture_dir)?;

    // Only fill in `id` when the stored metadata does not already carry one.
    if !fields.contains_key("id") {
        fields.insert("id".to_owned(), Value::String(capture_id));
    }
    fields.insert(
        "capture_dir".to_owned(),
        Value::String(capture_dir.display().to_string()),
    );

    Ok(Value::Object(fields))
}
/// Derive a capture id from a capture directory's path relative to `root`.
///
/// The id is the relative path's normal components joined with `/`,
/// independent of the platform separator. Non-UTF-8 components are converted
/// lossily.
///
/// # Errors
///
/// Fails when `capture_dir` is not under `root`, or when the relative path
/// has no normal components (for example when both paths are equal).
fn capture_id_from_dir(root: &Path, capture_dir: &Path) -> Result<String, String> {
    let relative = match capture_dir.strip_prefix(root) {
        Ok(relative) => relative,
        Err(e) => {
            return Err(format!(
                "capture directory {} is not under root {}: {e}",
                capture_dir.display(),
                root.display()
            ));
        }
    };

    let mut segments: Vec<String> = Vec::new();
    for component in relative.components() {
        // Skip anything that is not a plain path segment.
        if let std::path::Component::Normal(segment) = component {
            segments.push(segment.to_string_lossy().to_string());
        }
    }

    if segments.is_empty() {
        return Err(format!(
            "capture directory {} does not contain a capture id",
            capture_dir.display()
        ));
    }
    Ok(segments.join("/"))
}
/// Serialize `value` as pretty-printed JSON, mapping failures to a message.
fn to_pretty_json<T: serde::Serialize>(value: &T) -> Result<String, String> {
    match serde_json::to_string_pretty(value) {
        Ok(encoded) => Ok(encoded),
        Err(e) => Err(format!("JSON encode failed: {e}")),
    }
}
// ---------------------------------------------------------------------------
// Research file helpers
// ---------------------------------------------------------------------------
@ -856,3 +1125,127 @@ fn save_research(dir: &std::path::Path, slug: &str, data: &serde_json::Value) ->
json_path.to_string_lossy().to_string(),
)
}
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;
    use std::fs;

    use serde_json::json;
    use webclaw_capture::types::{EndpointDefinition, EndpointSafety};

    use super::*;

    /// Build a minimal endpoint fixture with the given method and safety flags.
    fn endpoint(
        method: &str,
        safe_to_replay: bool,
        requires_confirmation: bool,
    ) -> EndpointDefinition {
        let safety = EndpointSafety {
            safe_to_replay,
            requires_confirmation,
            reason: "test".to_owned(),
        };
        EndpointDefinition {
            id: format!("{}_example", method.to_ascii_lowercase()),
            method: method.to_owned(),
            origin: "https://example.test".to_owned(),
            path_template: "/api/items".to_owned(),
            query_params: BTreeMap::new(),
            request_schema: None,
            response_schema: None,
            auth_evidence: Vec::new(),
            safety,
            examples: Vec::new(),
        }
    }

    #[test]
    fn normalize_capture_url_adds_https_and_rejects_non_http_schemes() {
        let normalized = normalize_capture_url("example.test/path").unwrap();
        assert_eq!(normalized, "https://example.test/path");

        let rejected = normalize_capture_url("file:///C:/secret.txt");
        assert!(rejected.is_err());
    }

    #[test]
    fn replay_options_default_unsafe_methods_to_dry_run_unless_confirmed() {
        let unsafe_endpoint = endpoint("POST", false, true);

        // Without confirmation the replay must be downgraded to a preview.
        let unconfirmed = ReplayEndpointParams {
            endpoint_id: unsafe_endpoint.id.clone(),
            params_json: Some(json!({"id": "123"})),
            dry_run: None,
            confirm_unsafe: None,
            headers: Some(BTreeMap::from([("X-Test".to_owned(), "ok".to_owned())])),
            body_json: Some(json!({"name": "tool"})),
        };
        let preview = replay_options_from_params(&unsafe_endpoint, &unconfirmed).unwrap();
        assert!(preview.dry_run);
        assert!(!preview.confirm_unsafe);
        assert_eq!(preview.params_json, Some(json!({"id": "123"})));
        assert_eq!(preview.headers.get("X-Test"), Some(&json!("ok")));

        // The same arguments with explicit confirmation become executable.
        let confirmed = ReplayEndpointParams {
            confirm_unsafe: Some(true),
            ..unconfirmed
        };
        let live = replay_options_from_params(&unsafe_endpoint, &confirmed).unwrap();
        assert!(!live.dry_run);
        assert!(live.confirm_unsafe);
    }

    #[test]
    fn replay_options_leave_safe_gets_executable_by_default() {
        let safe_endpoint = endpoint("GET", true, false);
        let bare = ReplayEndpointParams {
            endpoint_id: safe_endpoint.id.clone(),
            params_json: None,
            dry_run: None,
            confirm_unsafe: None,
            headers: None,
            body_json: None,
        };

        let options = replay_options_from_params(&safe_endpoint, &bare).unwrap();

        assert!(!options.dry_run);
        assert!(!options.confirm_unsafe);
    }

    #[test]
    fn list_saved_captures_from_root_returns_metadata_with_capture_id() {
        // Process id plus a nanosecond timestamp keeps the scratch directory
        // distinct between runs.
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let root = std::env::temp_dir().join(format!(
            "webclaw-mcp-list-captures-{}-{}",
            std::process::id(),
            nanos
        ));
        let capture_dir = root.join("example.test").join("2026-05-16T12-00-00Z");
        fs::create_dir_all(&capture_dir).unwrap();

        let metadata = json!({
            "source_url": "https://example.test",
            "endpoint_count": 2
        });
        fs::write(
            capture_dir.join("metadata.json"),
            serde_json::to_string(&metadata).unwrap(),
        )
        .unwrap();

        let captures = list_saved_captures_from_root(&root).unwrap();
        // Best-effort cleanup before asserting so a failure leaves no residue.
        fs::remove_dir_all(&root).ok();

        assert_eq!(captures.len(), 1);
        let only = &captures[0];
        assert_eq!(only["id"], "example.test/2026-05-16T12-00-00Z");
        assert_eq!(only["endpoint_count"], 2);
        assert!(
            only["capture_dir"]
                .as_str()
                .unwrap()
                .contains("example.test")
        );
    }
}

View file

@ -104,6 +104,63 @@ pub struct SearchParams {
pub num_results: Option<u32>,
}
// Arguments for the `capture_network` tool.
// NOTE(review): the `///` field comments below are presumably emitted as field
// descriptions by the `JsonSchema` derive, so treat edits to them as edits to
// tool-facing text — confirm against the generated schema.
// NOTE(review): the reason for `#[allow(dead_code)]` is not visible from here.
#[derive(Debug, Deserialize, JsonSchema)]
#[allow(dead_code)]
pub struct CaptureNetworkParams {
    /// URL to open in Chromium and capture network traffic from.
    pub url: String,
    /// Optional natural-language purpose for the capture.
    pub intent: Option<String>,
    /// Milliseconds to wait after navigation while collecting network events.
    pub wait_ms: Option<u64>,
    /// Run the browser in headed mode for debugging.
    pub headed: Option<bool>,
}
// Arguments for the `discover_endpoints` tool: the capture whose endpoints are loaded.
// NOTE(review): the `///` field comment is presumably surfaced through the
// derived JSON schema — confirm before rewording it.
#[derive(Debug, Deserialize, JsonSchema)]
#[allow(dead_code)]
pub struct DiscoverEndpointsParams {
    /// Saved capture id, for example `example.com/2026-05-16T12-00-00Z`.
    pub capture_id: String,
}
// Arguments for the `show_endpoint` tool: the endpoint id to look up.
// NOTE(review): the `///` field comment is presumably surfaced through the
// derived JSON schema — confirm before rewording it.
#[derive(Debug, Deserialize, JsonSchema)]
#[allow(dead_code)]
pub struct ShowEndpointParams {
    /// Learned endpoint id to load from saved captures.
    pub endpoint_id: String,
}
// Arguments for the `replay_endpoint` tool.
// `replay_options_from_params` rejects a `params_json` that is not a JSON
// object and treats an absent `dry_run` / `confirm_unsafe` as `false`.
// NOTE(review): the `///` field comments are presumably surfaced through the
// derived JSON schema — confirm before rewording them.
#[derive(Debug, Deserialize, JsonSchema)]
#[allow(dead_code)]
pub struct ReplayEndpointParams {
    /// Learned endpoint id to replay or preview.
    pub endpoint_id: String,
    /// Path/query parameter values to substitute into the learned endpoint.
    pub params_json: Option<serde_json::Value>,
    /// Preview the replay request without sending network traffic.
    pub dry_run: Option<bool>,
    /// Allow mutating methods such as POST, PUT, PATCH, and DELETE to execute.
    pub confirm_unsafe: Option<bool>,
    /// Additional non-secret request headers to include in the replay.
    pub headers: Option<std::collections::BTreeMap<String, String>>,
    /// JSON request body override for replay.
    pub body_json: Option<serde_json::Value>,
}
// Arguments for the `export_openapi` tool: the capture to export.
// NOTE(review): the `///` field comment is presumably surfaced through the
// derived JSON schema — confirm before rewording it.
#[derive(Debug, Deserialize, JsonSchema)]
#[allow(dead_code)]
pub struct ExportOpenApiParams {
    /// Saved capture id whose learned endpoints should be exported.
    pub capture_id: String,
}
/// `list_captures` takes no arguments but uses a struct so rmcp can generate
/// a schema and parse the empty JSON-RPC params.
// Intentionally field-less; the handler binds it as `_params` and ignores it.
#[derive(Debug, Deserialize, JsonSchema)]
#[allow(dead_code)]
pub struct ListCapturesParams {}
/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct VerticalParams {

View file

@ -15,6 +15,7 @@ webclaw-core = { workspace = true }
webclaw-fetch = { workspace = true }
webclaw-llm = { workspace = true }
webclaw-pdf = { workspace = true }
webclaw-capture = { path = "../webclaw-capture" }
axum = { version = "0.8", features = ["macros"] }
tokio = { workspace = true }

View file

@ -95,8 +95,18 @@ async fn main() -> anyhow::Result<()> {
.route("/crawl", post(routes::crawl::crawl))
.route("/map", post(routes::map::map))
.route("/batch", post(routes::batch::batch))
.route("/capture-network", post(routes::capture::capture_network))
.route(
"/captures/{domain}/{timestamp}/endpoints",
get(routes::capture::endpoints),
)
.route(
"/captures/{domain}/{timestamp}/openapi",
post(routes::capture::export_openapi),
)
.route("/extract", post(routes::extract::extract))
.route("/extractors", get(routes::structured::list_extractors))
.route("/replay-endpoint", post(routes::capture::replay_endpoint))
.route("/summarize", post(routes::summarize::summarize_route))
.route("/diff", post(routes::diff::diff_route))
.route("/brand", post(routes::brand::brand))

View file

@ -0,0 +1,283 @@
use std::collections::BTreeMap;
use axum::{Json, extract::Path};
use serde::Deserialize;
use serde_json::{Value, json};
use webclaw_capture::cdp::{CaptureOptions, capture_network as run_network_capture};
use webclaw_capture::openapi::write_openapi;
use webclaw_capture::replay::replay_endpoint as run_endpoint_replay;
use webclaw_capture::store::{find_endpoint, load_endpoints};
use webclaw_capture::types::{
CaptureError, EndpointDefinition, HeaderMap, ReplayOptions, ReplayResult,
};
use crate::error::ApiError;
/// JSON body accepted by the `capture_network` handler.
///
/// `#[serde(default)]` lets every field be omitted; an omitted `url`
/// deserializes to the empty string, which the handler rejects.
#[derive(Debug, Deserialize, Default)]
#[serde(default)]
pub struct CaptureNetworkRequest {
    /// URL to capture; a missing scheme is filled in with `https://`.
    pub url: String,
    /// Optional free-form intent, forwarded unchanged to the capture options.
    pub intent: Option<String>,
    /// Wait time in milliseconds; the handler falls back to 3000 when absent.
    pub wait_ms: Option<u64>,
    /// Headed-browser flag; the handler falls back to `false` when absent.
    pub headed: Option<bool>,
}
/// JSON body accepted by the `replay_endpoint` handler.
///
/// `#[serde(default)]` lets every field be omitted; an omitted `endpoint_id`
/// deserializes to the empty string, which the handler rejects.
#[derive(Debug, Deserialize, Default)]
#[serde(default)]
pub struct ReplayEndpointRequest {
    /// Id of the learned endpoint to replay or preview.
    pub endpoint_id: String,
    /// Parameter values; must be a JSON object when present.
    pub params_json: Option<Value>,
    /// Explicit dry-run request; absent is treated as `false`.
    pub dry_run: Option<bool>,
    /// Explicit confirmation that lifts the default dry run; absent is `false`.
    pub confirm_unsafe: Option<bool>,
    /// Extra request headers, converted to JSON string values for the replay.
    pub headers: Option<BTreeMap<String, String>>,
    /// JSON request body, forwarded unchanged to the replay options.
    pub body_json: Option<Value>,
}
pub async fn capture_network(
Json(request): Json<CaptureNetworkRequest>,
) -> Result<Json<Value>, ApiError> {
if request.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
let url = normalize_capture_url(&request.url)?;
webclaw_fetch::url_security::validate_public_http_url(&url).await?;
let saved = run_network_capture(CaptureOptions {
url,
intent: request.intent,
wait_ms: request.wait_ms.unwrap_or(3000),
headed: request.headed.unwrap_or(false),
})
.await
.map_err(|error| capture_error("capture-network failed", error))?;
Ok(Json(json!(saved)))
}
pub async fn endpoints(
Path((domain, timestamp)): Path<(String, String)>,
) -> Result<Json<Vec<EndpointDefinition>>, ApiError> {
let capture_id = capture_id_from_path(&domain, &timestamp)?;
let endpoints = load_endpoints(&capture_id).map_err(|error| {
capture_error(
format!("could not load endpoints for capture id {capture_id}"),
error,
)
})?;
Ok(Json(endpoints))
}
pub async fn replay_endpoint(
Json(request): Json<ReplayEndpointRequest>,
) -> Result<Json<ReplayResult>, ApiError> {
if request.endpoint_id.trim().is_empty() {
return Err(ApiError::bad_request("`endpoint_id` is required"));
}
let endpoint = find_endpoint(&request.endpoint_id).map_err(|error| {
capture_error(
format!("could not find endpoint id {}", request.endpoint_id),
error,
)
})?;
let options = replay_options_from_request(&endpoint, &request)?;
let result = run_endpoint_replay(&endpoint, options)
.await
.map_err(|error| capture_error("replay-endpoint failed", error))?;
Ok(Json(result))
}
pub async fn export_openapi(
Path((domain, timestamp)): Path<(String, String)>,
) -> Result<Json<Value>, ApiError> {
let capture_id = capture_id_from_path(&domain, &timestamp)?;
let path = write_openapi(&capture_id).map_err(|error| {
capture_error(
format!("could not export OpenAPI for capture id {capture_id}"),
error,
)
})?;
Ok(Json(json!({ "path": path.display().to_string() })))
}
/// Normalize a user-supplied capture URL and restrict it to http/https.
///
/// Leading/trailing whitespace is ignored. Input without a scheme gets
/// `https://` prepended. Schemes are compared case-insensitively, so
/// `HTTPS://host` is accepted and returned as written.
///
/// This performs no full URL parse; the caller is expected to validate the
/// result further.
///
/// # Errors
///
/// Returns a bad-request error when the input is empty or names a scheme
/// other than `http` / `https`.
fn normalize_capture_url(url: &str) -> Result<String, ApiError> {
    let trimmed = url.trim();
    if trimmed.is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }

    // Only treat the text before the first "://" as a scheme when it has scheme
    // syntax (a letter followed by letters, digits, '+', '-' or '.'). Otherwise
    // a schemeless input such as `example.com/?next=https://other` would be
    // reported as using an unsupported scheme.
    let explicit_scheme = trimmed
        .split_once("://")
        .map(|(prefix, _rest)| prefix)
        .filter(|prefix| {
            let mut chars = prefix.chars();
            chars.next().is_some_and(|first| first.is_ascii_alphabetic())
                && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
        });

    match explicit_scheme {
        None => Ok(format!("https://{trimmed}")),
        // URI schemes are case-insensitive, so do not reject `HTTP://…`.
        Some(scheme)
            if scheme.eq_ignore_ascii_case("http") || scheme.eq_ignore_ascii_case("https") =>
        {
            Ok(trimmed.to_owned())
        }
        Some(scheme) => Err(ApiError::bad_request(format!(
            "capture-network only supports http and https URLs, got {scheme:?}"
        ))),
    }
}
/// Join the `domain` and `timestamp` route segments into a capture id.
///
/// # Errors
///
/// Returns a bad-request error when either segment fails
/// `is_safe_capture_segment`.
fn capture_id_from_path(domain: &str, timestamp: &str) -> Result<String, ApiError> {
    let both_safe = [domain, timestamp]
        .iter()
        .all(|segment| is_safe_capture_segment(segment));

    if both_safe {
        Ok(format!("{domain}/{timestamp}"))
    } else {
        Err(ApiError::bad_request(
            "capture id contains an unsafe path segment",
        ))
    }
}
/// Translate an HTTP replay request into [`ReplayOptions`].
///
/// The result is a dry run when the request asked for one, or when the
/// endpoint defaults to dry-run and `confirm_unsafe` was not set to `true`.
///
/// # Errors
///
/// Returns a bad-request error when `params_json` is present but is not a
/// JSON object.
fn replay_options_from_request(
    endpoint: &EndpointDefinition,
    request: &ReplayEndpointRequest,
) -> Result<ReplayOptions, ApiError> {
    let params_json_malformed = request
        .params_json
        .as_ref()
        .is_some_and(|value| !value.is_object());
    if params_json_malformed {
        return Err(ApiError::bad_request("`params_json` must be a JSON object"));
    }

    let confirm_unsafe = request.confirm_unsafe == Some(true);
    let requested_dry_run = request.dry_run == Some(true);
    // Without explicit confirmation, endpoints flagged by
    // `endpoint_defaults_to_dry_run` are only previewed.
    let forced_dry_run = !confirm_unsafe && endpoint_defaults_to_dry_run(endpoint);

    let options = ReplayOptions {
        dry_run: requested_dry_run || forced_dry_run,
        confirm_unsafe,
        params_json: request.params_json.clone(),
        headers: header_map_from_strings(request.headers.as_ref()),
        body_json: request.body_json.clone(),
    };
    Ok(options)
}
/// Decide whether replaying `endpoint` should default to a dry run.
///
/// Returns `true` when the safety record requires confirmation, when it is
/// not marked safe to replay, or when the method (compared case-insensitively)
/// is anything other than GET, HEAD, or OPTIONS.
fn endpoint_defaults_to_dry_run(endpoint: &EndpointDefinition) -> bool {
    let flagged = endpoint.safety.requires_confirmation || !endpoint.safety.safe_to_replay;
    if flagged {
        return true;
    }

    match endpoint.method.to_ascii_uppercase().as_str() {
        "GET" | "HEAD" | "OPTIONS" => false,
        _ => true,
    }
}
/// Convert optional string headers into a [`HeaderMap`] of JSON string values.
///
/// `None` yields an empty map.
fn header_map_from_strings(headers: Option<&BTreeMap<String, String>>) -> HeaderMap {
    // `Option::into_iter` + `flatten` walks the map's entries when present
    // and nothing otherwise.
    let pairs = headers.into_iter().flatten();
    pairs
        .map(|(name, value)| (name.to_owned(), Value::String(value.to_owned())))
        .collect()
}
/// Check that a route segment is usable as one component of a capture id.
///
/// Rejects the empty string, `.` and `..`, and anything containing `:`,
/// `/`, or `\`.
fn is_safe_capture_segment(segment: &str) -> bool {
    if matches!(segment, "" | "." | "..") {
        return false;
    }
    !segment.chars().any(|c| matches!(c, ':' | '/' | '\\'))
}
/// Map a [`CaptureError`] onto the API's error type.
///
/// Invalid-URL, replay, and storage errors become bad requests prefixed with
/// `context`; a missing endpoint becomes `NotFound`; request/capture failures
/// become `Fetch`; I/O and JSON failures become `Internal`. Only the
/// bad-request arm includes `context` in the message.
fn capture_error(context: impl Into<String>, error: CaptureError) -> ApiError {
    let context = context.into();
    let detail = error.to_string();

    match error {
        CaptureError::EndpointNotFound(_) => ApiError::NotFound,
        CaptureError::Io(_) | CaptureError::Json(_) => ApiError::Internal(detail),
        CaptureError::Request(_) | CaptureError::Capture(_) => ApiError::Fetch(detail),
        CaptureError::InvalidUrl(_) | CaptureError::Replay(_) | CaptureError::Storage(_) => {
            ApiError::bad_request(format!("{context}: {detail}"))
        }
    }
}
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;

    use serde_json::json;
    use webclaw_capture::types::{EndpointDefinition, EndpointSafety};

    use super::*;

    /// Build a minimal endpoint fixture with the given method and safety flags.
    fn endpoint(
        method: &str,
        safe_to_replay: bool,
        requires_confirmation: bool,
    ) -> EndpointDefinition {
        let safety = EndpointSafety {
            safe_to_replay,
            requires_confirmation,
            reason: "test".to_owned(),
        };
        EndpointDefinition {
            id: format!("{}_example", method.to_ascii_lowercase()),
            method: method.to_owned(),
            origin: "https://example.test".to_owned(),
            path_template: "/api/items".to_owned(),
            query_params: BTreeMap::new(),
            request_schema: None,
            response_schema: None,
            auth_evidence: Vec::new(),
            safety,
            examples: Vec::new(),
        }
    }

    #[test]
    fn capture_id_from_path_joins_domain_timestamp_and_rejects_unsafe_segments() {
        let joined = capture_id_from_path("example.test", "2026-05-16T12-00-00Z").unwrap();
        assert_eq!(joined, "example.test/2026-05-16T12-00-00Z");

        // A parent-directory segment must be refused in either position.
        let bad_domain = capture_id_from_path("..", "2026-05-16T12-00-00Z");
        assert!(bad_domain.is_err());
        let bad_timestamp = capture_id_from_path("example.test", "..");
        assert!(bad_timestamp.is_err());
    }

    #[test]
    fn replay_request_defaults_unsafe_methods_to_dry_run_unless_confirmed() {
        let unsafe_endpoint = endpoint("POST", false, true);

        // Without confirmation the replay must be downgraded to a preview.
        let unconfirmed = ReplayEndpointRequest {
            endpoint_id: unsafe_endpoint.id.clone(),
            params_json: Some(json!({"id": "123"})),
            dry_run: None,
            confirm_unsafe: None,
            headers: Some(BTreeMap::from([("X-Test".to_owned(), "ok".to_owned())])),
            body_json: Some(json!({"name": "tool"})),
        };
        let preview = replay_options_from_request(&unsafe_endpoint, &unconfirmed).unwrap();
        assert!(preview.dry_run);
        assert!(!preview.confirm_unsafe);
        assert_eq!(preview.params_json, Some(json!({"id": "123"})));
        assert_eq!(preview.headers.get("X-Test"), Some(&json!("ok")));
        assert_eq!(preview.body_json, Some(json!({"name": "tool"})));

        // The same request with explicit confirmation becomes executable.
        let confirmed = ReplayEndpointRequest {
            confirm_unsafe: Some(true),
            ..unconfirmed
        };
        let live = replay_options_from_request(&unsafe_endpoint, &confirmed).unwrap();
        assert!(!live.dry_run);
        assert!(live.confirm_unsafe);
    }

    #[test]
    fn replay_request_rejects_non_object_params_json() {
        let safe_endpoint = endpoint("GET", true, false);
        let array_params = ReplayEndpointRequest {
            endpoint_id: safe_endpoint.id.clone(),
            params_json: Some(json!(["not", "an", "object"])),
            dry_run: None,
            confirm_unsafe: None,
            headers: None,
            body_json: None,
        };

        let error = replay_options_from_request(&safe_endpoint, &array_params).unwrap_err();

        assert!(error.to_string().contains("params_json"));
    }
}

View file

@ -9,6 +9,7 @@
pub mod batch;
pub mod brand;
pub mod capture;
pub mod crawl;
pub mod diff;
pub mod extract;