mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-25 03:08:06 +02:00
add network capture endpoint replay
This commit is contained in:
parent
72edb61881
commit
cb31c70465
34 changed files with 5996 additions and 8 deletions
21
crates/webclaw-capture/Cargo.toml
Normal file
21
crates/webclaw-capture/Cargo.toml
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
[package]
|
||||
name = "webclaw-capture"
|
||||
description = "Browser network capture, endpoint inference, and safe replay for Webclaw"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
chromiumoxide = "0.9.1"
|
||||
futures-util = "0.3"
|
||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
||||
url = "2"
|
||||
dirs = "6"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
sha2 = "0.10"
|
||||
hex = "0.4"
|
||||
404
crates/webclaw-capture/src/cdp.rs
Normal file
404
crates/webclaw-capture/src/cdp.rs
Normal file
|
|
@ -0,0 +1,404 @@
|
|||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
|
||||
use chromiumoxide::cdp::browser_protocol::network::{
|
||||
EnableParams, EventLoadingFinished, EventRequestWillBeSent, EventResponseReceived,
|
||||
GetResponseBodyParams, Headers, RequestId, ResourceType, TimeSinceEpoch,
|
||||
};
|
||||
use chromiumoxide::{Browser, BrowserConfig, Page};
|
||||
use chrono::{DateTime, Utc};
|
||||
use futures_util::StreamExt;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Map, Value, json};
|
||||
use tokio::sync::oneshot;
|
||||
use url::Url;
|
||||
|
||||
use crate::infer::infer_endpoints;
|
||||
use crate::store::{capture_id_for, save_capture};
|
||||
use crate::types::{CaptureArtifact, CaptureError, CapturedExchange, HeaderMap, SavedCapture};
|
||||
|
||||
const BODY_SAMPLE_LIMIT: usize = 64 * 1024;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct CaptureOptions {
|
||||
pub url: String,
|
||||
pub intent: Option<String>,
|
||||
pub wait_ms: u64,
|
||||
pub headed: bool,
|
||||
}
|
||||
|
||||
pub async fn capture_network(options: CaptureOptions) -> Result<SavedCapture, CaptureError> {
|
||||
let source_url =
|
||||
Url::parse(&options.url).map_err(|error| CaptureError::InvalidUrl(error.to_string()))?;
|
||||
let started_at = Utc::now();
|
||||
let capture_id = capture_id_for(&source_url, started_at);
|
||||
|
||||
let (mut browser, mut handler) = launch_browser(options.headed).await?;
|
||||
let handler_task = tokio::spawn(async move {
|
||||
while let Some(event) = handler.next().await {
|
||||
if let Err(error) = event {
|
||||
tracing::debug!(error = %error, "chromiumoxide browser handler stopped");
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let capture_result = async {
|
||||
let page = browser
|
||||
.new_page("about:blank")
|
||||
.await
|
||||
.map_err(|error| CaptureError::Capture(format!("could not create page: {error}")))?;
|
||||
|
||||
enable_network_capture(&page).await?;
|
||||
let request_events = page
|
||||
.event_listener::<EventRequestWillBeSent>()
|
||||
.await
|
||||
.map_err(|error| {
|
||||
CaptureError::Capture(format!("could not listen for network requests: {error}"))
|
||||
})?;
|
||||
let response_events = page
|
||||
.event_listener::<EventResponseReceived>()
|
||||
.await
|
||||
.map_err(|error| {
|
||||
CaptureError::Capture(format!("could not listen for network responses: {error}"))
|
||||
})?;
|
||||
let finished_events = page
|
||||
.event_listener::<EventLoadingFinished>()
|
||||
.await
|
||||
.map_err(|error| {
|
||||
CaptureError::Capture(format!("could not listen for completed requests: {error}"))
|
||||
})?;
|
||||
|
||||
let (stop_tx, stop_rx) = oneshot::channel();
|
||||
let collector_page = page.clone();
|
||||
let collector_task = tokio::spawn(async move {
|
||||
collect_exchanges(
|
||||
collector_page,
|
||||
request_events,
|
||||
response_events,
|
||||
finished_events,
|
||||
stop_rx,
|
||||
started_at,
|
||||
)
|
||||
.await
|
||||
});
|
||||
|
||||
page.goto(options.url.clone()).await.map_err(|error| {
|
||||
CaptureError::Capture(format!("could not navigate to {}: {error}", options.url))
|
||||
})?;
|
||||
|
||||
tokio::time::sleep(Duration::from_millis(options.wait_ms)).await;
|
||||
let _ = stop_tx.send(());
|
||||
|
||||
let exchanges = collector_task
|
||||
.await
|
||||
.map_err(|error| CaptureError::Capture(format!("capture collector failed: {error}")))?
|
||||
.map_err(|error| CaptureError::Capture(format!("capture collector failed: {error}")))?;
|
||||
let completed_at = Utc::now();
|
||||
let endpoints = infer_endpoints(&exchanges);
|
||||
let exchange_count = exchanges.len();
|
||||
let endpoint_count = endpoints.len();
|
||||
|
||||
let mut metadata = Map::new();
|
||||
metadata.insert("wait_ms".to_owned(), json!(options.wait_ms));
|
||||
metadata.insert("headed".to_owned(), json!(options.headed));
|
||||
metadata.insert("exchange_count".to_owned(), json!(exchange_count));
|
||||
metadata.insert("endpoint_count".to_owned(), json!(endpoint_count));
|
||||
|
||||
let artifact = CaptureArtifact {
|
||||
id: capture_id,
|
||||
source_url: options.url,
|
||||
intent: options.intent,
|
||||
started_at,
|
||||
completed_at: Some(completed_at),
|
||||
exchanges,
|
||||
endpoints,
|
||||
metadata,
|
||||
};
|
||||
|
||||
save_capture(&artifact)
|
||||
}
|
||||
.await;
|
||||
|
||||
if let Err(error) = browser.close().await {
|
||||
tracing::debug!(error = %error, "failed to close browser after capture");
|
||||
}
|
||||
if let Err(error) = handler_task.await {
|
||||
tracing::debug!(error = %error, "failed to join browser handler after capture");
|
||||
}
|
||||
|
||||
capture_result
|
||||
}
|
||||
|
||||
async fn launch_browser(headed: bool) -> Result<(Browser, chromiumoxide::Handler), CaptureError> {
|
||||
let mut config = BrowserConfig::builder()
|
||||
.request_timeout(Duration::from_secs(15))
|
||||
.no_sandbox()
|
||||
.disable_cache()
|
||||
.disable_https_first();
|
||||
|
||||
if headed {
|
||||
config = config.with_head();
|
||||
}
|
||||
|
||||
let config = config.build().map_err(|error| {
|
||||
CaptureError::Capture(format!("could not build browser config: {error}"))
|
||||
})?;
|
||||
|
||||
Browser::launch(config)
|
||||
.await
|
||||
.map_err(|error| CaptureError::Capture(format!("could not launch Chromium: {error}")))
|
||||
}
|
||||
|
||||
async fn enable_network_capture(page: &Page) -> Result<(), CaptureError> {
|
||||
let params = EnableParams::builder()
|
||||
.max_total_buffer_size(16 * 1024 * 1024)
|
||||
.max_resource_buffer_size(2 * 1024 * 1024)
|
||||
.max_post_data_size(BODY_SAMPLE_LIMIT as i64)
|
||||
.build();
|
||||
|
||||
page.execute(params).await.map_err(|error| {
|
||||
CaptureError::Capture(format!("could not enable CDP network capture: {error}"))
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn collect_exchanges(
|
||||
page: Page,
|
||||
mut request_events: chromiumoxide::listeners::EventStream<EventRequestWillBeSent>,
|
||||
mut response_events: chromiumoxide::listeners::EventStream<EventResponseReceived>,
|
||||
mut finished_events: chromiumoxide::listeners::EventStream<EventLoadingFinished>,
|
||||
mut stop_rx: oneshot::Receiver<()>,
|
||||
fallback_started_at: DateTime<Utc>,
|
||||
) -> Result<Vec<CapturedExchange>, CaptureError> {
|
||||
let mut pending = HashMap::<RequestId, PendingExchange>::new();
|
||||
let mut exchanges = Vec::<CapturedExchange>::new();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = &mut stop_rx => break,
|
||||
event = request_events.next() => {
|
||||
if let Some(event) = event {
|
||||
record_request(&mut pending, &event, fallback_started_at);
|
||||
}
|
||||
}
|
||||
event = response_events.next() => {
|
||||
if let Some(event) = event {
|
||||
record_response(&mut pending, &event);
|
||||
}
|
||||
}
|
||||
event = finished_events.next() => {
|
||||
if let Some(event) = event
|
||||
&& let Some(exchange) = finish_request(&page, &mut pending, &event).await?
|
||||
{
|
||||
exchanges.push(exchange);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (_request_id, pending_exchange) in pending {
|
||||
if let Some(exchange) = pending_exchange.into_exchange() {
|
||||
exchanges.push(exchange);
|
||||
}
|
||||
}
|
||||
|
||||
exchanges.sort_by(|left, right| {
|
||||
left.started_at
|
||||
.cmp(&right.started_at)
|
||||
.then_with(|| left.url.cmp(&right.url))
|
||||
});
|
||||
|
||||
Ok(exchanges)
|
||||
}
|
||||
|
||||
fn record_request(
|
||||
pending: &mut HashMap<RequestId, PendingExchange>,
|
||||
event: &EventRequestWillBeSent,
|
||||
fallback_started_at: DateTime<Utc>,
|
||||
) {
|
||||
let request_id = event.request_id.clone();
|
||||
let mut current = pending.remove(&request_id).unwrap_or_default();
|
||||
|
||||
if let Some(redirect_response) = &event.redirect_response {
|
||||
if !current.url.is_empty() {
|
||||
current.redirect_chain.push(current.url.clone());
|
||||
}
|
||||
current.redirect_chain.push(redirect_response.url.clone());
|
||||
}
|
||||
|
||||
current.method = event.request.method.clone();
|
||||
current.url = event.request.url.clone();
|
||||
current.request_headers = headers_to_map(&event.request.headers);
|
||||
current.request_body_sample = request_body_sample(event);
|
||||
current.resource_type = event.r#type.as_ref().map(resource_type_name);
|
||||
current.started_at = wall_time_to_utc(&event.wall_time, fallback_started_at);
|
||||
current.started_monotonic = Some(*event.timestamp.inner());
|
||||
|
||||
pending.insert(request_id, current);
|
||||
}
|
||||
|
||||
fn record_response(
|
||||
pending: &mut HashMap<RequestId, PendingExchange>,
|
||||
event: &EventResponseReceived,
|
||||
) {
|
||||
let current = pending.entry(event.request_id.clone()).or_default();
|
||||
|
||||
if current.url.is_empty() {
|
||||
current.url = event.response.url.clone();
|
||||
}
|
||||
current.status = u16::try_from(event.response.status).unwrap_or_default();
|
||||
current.response_headers = headers_to_map(&event.response.headers);
|
||||
current.response_mime_type = Some(event.response.mime_type.clone());
|
||||
current.resource_type = Some(resource_type_name(&event.r#type));
|
||||
}
|
||||
|
||||
async fn finish_request(
|
||||
page: &Page,
|
||||
pending: &mut HashMap<RequestId, PendingExchange>,
|
||||
event: &EventLoadingFinished,
|
||||
) -> Result<Option<CapturedExchange>, CaptureError> {
|
||||
let Some(mut current) = pending.remove(&event.request_id) else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
if let Some(started) = current.started_monotonic {
|
||||
let elapsed = ((*event.timestamp.inner() - started) * 1_000.0).max(0.0);
|
||||
current.duration_ms = elapsed.round() as u64;
|
||||
}
|
||||
|
||||
current.response_body_sample = response_body_sample(page, event.request_id.clone()).await;
|
||||
|
||||
Ok(current.into_exchange())
|
||||
}
|
||||
|
||||
async fn response_body_sample(page: &Page, request_id: RequestId) -> Option<String> {
|
||||
let response = page
|
||||
.execute(GetResponseBodyParams::new(request_id))
|
||||
.await
|
||||
.ok()?;
|
||||
Some(truncate_sample(response.result.body))
|
||||
}
|
||||
|
||||
fn headers_to_map(headers: &Headers) -> HeaderMap {
|
||||
match headers.inner() {
|
||||
Value::Object(headers) => headers.clone(),
|
||||
_ => HeaderMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn request_body_sample(event: &EventRequestWillBeSent) -> Option<String> {
|
||||
let entries = event.request.post_data_entries.as_ref()?;
|
||||
let mut body = String::new();
|
||||
|
||||
for entry in entries {
|
||||
if let Some(bytes) = &entry.bytes {
|
||||
body.push_str(bytes.as_ref());
|
||||
}
|
||||
}
|
||||
|
||||
if body.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(truncate_sample(body))
|
||||
}
|
||||
}
|
||||
|
||||
fn resource_type_name(resource_type: &ResourceType) -> String {
|
||||
resource_type.as_ref().to_owned()
|
||||
}
|
||||
|
||||
fn wall_time_to_utc(wall_time: &TimeSinceEpoch, fallback: DateTime<Utc>) -> DateTime<Utc> {
|
||||
let seconds = *wall_time.inner();
|
||||
if !seconds.is_finite() || seconds < 0.0 {
|
||||
return fallback;
|
||||
}
|
||||
|
||||
let whole_seconds = seconds.trunc() as i64;
|
||||
let nanos = ((seconds.fract() * 1_000_000_000.0).round() as u32).min(999_999_999);
|
||||
|
||||
DateTime::<Utc>::from_timestamp(whole_seconds, nanos).unwrap_or(fallback)
|
||||
}
|
||||
|
||||
fn truncate_sample(sample: String) -> String {
|
||||
if sample.len() <= BODY_SAMPLE_LIMIT {
|
||||
return sample;
|
||||
}
|
||||
|
||||
let end = sample
|
||||
.char_indices()
|
||||
.take_while(|(index, _)| *index <= BODY_SAMPLE_LIMIT)
|
||||
.map(|(index, character)| index + character.len_utf8())
|
||||
.last()
|
||||
.unwrap_or(0)
|
||||
.min(sample.len());
|
||||
|
||||
sample[..end].to_owned()
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct PendingExchange {
|
||||
method: String,
|
||||
url: String,
|
||||
request_headers: HeaderMap,
|
||||
request_body_sample: Option<String>,
|
||||
resource_type: Option<String>,
|
||||
status: u16,
|
||||
response_headers: HeaderMap,
|
||||
response_body_sample: Option<String>,
|
||||
response_mime_type: Option<String>,
|
||||
started_at: DateTime<Utc>,
|
||||
started_monotonic: Option<f64>,
|
||||
duration_ms: u64,
|
||||
redirect_chain: Vec<String>,
|
||||
}
|
||||
|
||||
impl Default for PendingExchange {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
method: String::new(),
|
||||
url: String::new(),
|
||||
request_headers: HeaderMap::new(),
|
||||
request_body_sample: None,
|
||||
resource_type: None,
|
||||
status: 0,
|
||||
response_headers: HeaderMap::new(),
|
||||
response_body_sample: None,
|
||||
response_mime_type: None,
|
||||
started_at: Utc::now(),
|
||||
started_monotonic: None,
|
||||
duration_ms: 0,
|
||||
redirect_chain: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PendingExchange {
|
||||
fn into_exchange(mut self) -> Option<CapturedExchange> {
|
||||
if self.method.is_empty() || self.url.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
if !self.response_headers.contains_key("content-type")
|
||||
&& let Some(mime_type) = self.response_mime_type.take()
|
||||
{
|
||||
self.response_headers
|
||||
.insert("content-type".to_owned(), Value::String(mime_type));
|
||||
}
|
||||
|
||||
Some(CapturedExchange {
|
||||
method: self.method,
|
||||
url: self.url,
|
||||
request_headers: self.request_headers,
|
||||
request_body_sample: self.request_body_sample,
|
||||
resource_type: self.resource_type,
|
||||
status: self.status,
|
||||
response_headers: self.response_headers,
|
||||
response_body_sample: self.response_body_sample,
|
||||
started_at: self.started_at,
|
||||
duration_ms: self.duration_ms,
|
||||
redirect_chain: self.redirect_chain,
|
||||
})
|
||||
}
|
||||
}
|
||||
253
crates/webclaw-capture/src/classify.rs
Normal file
253
crates/webclaw-capture/src/classify.rs
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use url::Url;
|
||||
|
||||
use crate::types::CapturedExchange;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct ApiClassification {
|
||||
pub include: bool,
|
||||
pub confidence: f32,
|
||||
pub reasons: Vec<String>,
|
||||
}
|
||||
|
||||
pub fn classify_exchange(exchange: &CapturedExchange) -> ApiClassification {
|
||||
let url = match Url::parse(&exchange.url) {
|
||||
Ok(url) => url,
|
||||
Err(error) => {
|
||||
return ApiClassification {
|
||||
include: false,
|
||||
confidence: 0.0,
|
||||
reasons: vec![format!("invalid URL: {error}")],
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
let mut exclusion_reasons = Vec::new();
|
||||
|
||||
if is_browser_extension_url(&url) {
|
||||
exclusion_reasons.push("browser extension URL".to_owned());
|
||||
}
|
||||
|
||||
if is_tracking_host(url.host_str()) {
|
||||
exclusion_reasons.push("tracking, ad, or telemetry host".to_owned());
|
||||
}
|
||||
|
||||
if has_static_asset_extension(url.path()) {
|
||||
exclusion_reasons.push("static asset extension".to_owned());
|
||||
}
|
||||
|
||||
if is_static_resource_type(exchange.resource_type.as_deref()) {
|
||||
exclusion_reasons.push("static browser resource type".to_owned());
|
||||
}
|
||||
|
||||
if !exclusion_reasons.is_empty() {
|
||||
return ApiClassification {
|
||||
include: false,
|
||||
confidence: 0.0,
|
||||
reasons: exclusion_reasons,
|
||||
};
|
||||
}
|
||||
|
||||
let mut confidence = 0.0_f32;
|
||||
let mut reasons = Vec::new();
|
||||
|
||||
if matches_resource_type(exchange.resource_type.as_deref(), &["fetch", "xhr"]) {
|
||||
confidence += 0.65;
|
||||
reasons.push("browser resource type is fetch/xhr".to_owned());
|
||||
}
|
||||
|
||||
if response_is_json(exchange) {
|
||||
confidence += 0.55;
|
||||
reasons.push("response content type is JSON".to_owned());
|
||||
}
|
||||
|
||||
let path = url.path();
|
||||
|
||||
if has_api_path(path) {
|
||||
confidence += 0.55;
|
||||
reasons.push("URL path contains an API prefix".to_owned());
|
||||
}
|
||||
|
||||
if has_versioned_path(path) {
|
||||
confidence += 0.55;
|
||||
reasons.push("URL path starts with a versioned API prefix".to_owned());
|
||||
}
|
||||
|
||||
if has_graphql_path(path) {
|
||||
confidence += 0.55;
|
||||
reasons.push("URL path is GraphQL-like".to_owned());
|
||||
}
|
||||
|
||||
if has_graphql_body(exchange.request_body_sample.as_deref()) {
|
||||
confidence += 0.55;
|
||||
reasons.push("request body is GraphQL-like".to_owned());
|
||||
}
|
||||
|
||||
let confidence = confidence.min(1.0);
|
||||
|
||||
if reasons.is_empty() {
|
||||
reasons.push("no API traffic signals found".to_owned());
|
||||
}
|
||||
|
||||
ApiClassification {
|
||||
include: confidence >= 0.5,
|
||||
confidence,
|
||||
reasons,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn filter_api_exchanges(exchanges: &[CapturedExchange]) -> Vec<CapturedExchange> {
|
||||
exchanges
|
||||
.iter()
|
||||
.filter(|exchange| classify_exchange(exchange).include)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn is_browser_extension_url(url: &Url) -> bool {
|
||||
matches!(
|
||||
url.scheme().to_ascii_lowercase().as_str(),
|
||||
"chrome-extension" | "moz-extension" | "edge-extension" | "safari-extension"
|
||||
)
|
||||
}
|
||||
|
||||
/// Reports whether `host` looks like a tracking, advertising, or telemetry
/// host. Returns `false` for `None`.
///
/// Two kinds of rule are applied to the lowercased host:
///
/// * vendor fragments (for example `doubleclick`) match anywhere in the host;
/// * generic words (`ads`, `analytics`, `segment`) must be a complete DNS
///   label that is followed by at least one more label.
///
/// The generic words are matched as whole labels because a plain substring
/// test for `ads.` also hits hosts such as `uploads.example.com` or
/// `threads.net`.
fn is_tracking_host(host: Option<&str>) -> bool {
    const VENDOR_FRAGMENTS: &[&str] = &[
        "google-analytics",
        "googletagmanager",
        "googlesyndication",
        "doubleclick",
        "adservice",
        "telemetry",
        "amplitude",
        "mixpanel",
        "hotjar",
        "sentry.io",
        "datadog",
        "newrelic",
    ];
    const GENERIC_LABELS: &[&str] = &["ads", "analytics", "segment"];

    let Some(host) = host else {
        return false;
    };
    let host = host.to_ascii_lowercase();

    if VENDOR_FRAGMENTS
        .iter()
        .any(|fragment| host.contains(fragment))
    {
        return true;
    }

    // Everything before the last dot: the final label is never tested, which
    // keeps the "followed by a dot" requirement of the previous needles.
    host.rsplit_once('.').is_some_and(|(leading, _last)| {
        leading
            .split('.')
            .any(|label| GENERIC_LABELS.contains(&label))
    })
}
|
||||
|
||||
/// Reports whether `path` ends in one of the known static-asset file
/// extensions, compared without regard to ASCII case.
fn has_static_asset_extension(path: &str) -> bool {
    const STATIC_EXTENSIONS: &[&str] = &[
        "png", "jpg", "jpeg", "gif", "webp", "avif", "svg", "ico", "css", "js", "mjs", "woff",
        "woff2", "ttf", "otf", "eot", "map", "mp4", "webm", "mp3", "wav",
    ];

    // None of the extensions contains a dot, so "ends with .ext" is the same
    // as "the text after the last dot equals ext".
    match path.rsplit_once('.') {
        Some((_, extension)) => STATIC_EXTENSIONS
            .iter()
            .any(|known| extension.eq_ignore_ascii_case(known)),
        None => false,
    }
}
|
||||
|
||||
fn is_static_resource_type(resource_type: Option<&str>) -> bool {
|
||||
matches_resource_type(
|
||||
resource_type,
|
||||
&[
|
||||
"image",
|
||||
"stylesheet",
|
||||
"script",
|
||||
"font",
|
||||
"media",
|
||||
"manifest",
|
||||
"ping",
|
||||
"cspviolationreport",
|
||||
],
|
||||
)
|
||||
}
|
||||
|
||||
/// Reports whether `resource_type` equals any of `candidates`, ignoring ASCII
/// case. Returns `false` for `None`.
fn matches_resource_type(resource_type: Option<&str>, candidates: &[&str]) -> bool {
    match resource_type {
        None => false,
        Some(actual) => {
            for candidate in candidates {
                if actual.eq_ignore_ascii_case(candidate) {
                    return true;
                }
            }
            false
        }
    }
}
|
||||
|
||||
fn response_is_json(exchange: &CapturedExchange) -> bool {
|
||||
exchange.response_headers.iter().any(|(name, value)| {
|
||||
name.eq_ignore_ascii_case("content-type")
|
||||
&& header_value_as_str(value)
|
||||
.map(|value| value.to_ascii_lowercase().contains("json"))
|
||||
.unwrap_or(false)
|
||||
})
|
||||
}
|
||||
|
||||
fn header_value_as_str(value: &Value) -> Option<&str> {
|
||||
match value {
|
||||
Value::String(value) => Some(value),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports whether any `/`-separated segment of `path` equals `api`, ignoring
/// ASCII case.
fn has_api_path(path: &str) -> bool {
    for segment in path.split('/') {
        // Empty segments can never equal "api", so no separate filter is needed.
        if segment.eq_ignore_ascii_case("api") {
            return true;
        }
    }
    false
}
|
||||
|
||||
/// Reports whether the first non-empty segment of `path` is a version marker:
/// the letter `v` (either case) followed by one or more ASCII digits.
fn has_versioned_path(path: &str) -> bool {
    let Some(first) = path.split('/').find(|segment| !segment.is_empty()) else {
        return false;
    };

    let mut characters = first.chars();
    let starts_with_v = matches!(characters.next(), Some('v' | 'V'));
    let digits = characters.as_str();

    starts_with_v
        && !digits.is_empty()
        && digits.chars().all(|character| character.is_ascii_digit())
}
|
||||
|
||||
/// Reports whether any `/`-separated segment of `path` equals `graphql`,
/// ignoring ASCII case.
fn has_graphql_path(path: &str) -> bool {
    for segment in path.split('/') {
        // Empty segments can never equal "graphql", so no separate filter is needed.
        if segment.eq_ignore_ascii_case("graphql") {
            return true;
        }
    }
    false
}
|
||||
|
||||
fn has_graphql_body(body: Option<&str>) -> bool {
|
||||
let Some(body) = body else {
|
||||
return false;
|
||||
};
|
||||
|
||||
if let Ok(value) = serde_json::from_str::<Value>(body) {
|
||||
return value
|
||||
.as_object()
|
||||
.map(|object| {
|
||||
object.contains_key("operationName")
|
||||
|| object
|
||||
.get("query")
|
||||
.and_then(Value::as_str)
|
||||
.map(is_graphql_query_text)
|
||||
.unwrap_or(false)
|
||||
})
|
||||
.unwrap_or(false);
|
||||
}
|
||||
|
||||
is_graphql_query_text(body)
|
||||
}
|
||||
|
||||
/// Reports whether `text`, ignoring leading whitespace, begins like a GraphQL
/// operation.
///
/// The text must start with `query`, `mutation`, or `subscription`, followed
/// immediately by whitespace, `{`, or `(`. Accepting `(` and any whitespace
/// covers anonymous operations with variables (`query($id: ID!) { … }`) and
/// documents that break the line after the keyword. The keyword on its own, or
/// followed by any other character (for example `queryString=1`), does not
/// match.
fn is_graphql_query_text(text: &str) -> bool {
    const OPERATION_KEYWORDS: &[&str] = &["query", "mutation", "subscription"];

    let text = text.trim_start();

    OPERATION_KEYWORDS.iter().any(|keyword| {
        text.strip_prefix(keyword)
            .and_then(|rest| rest.chars().next())
            .is_some_and(|next| next.is_whitespace() || matches!(next, '{' | '('))
    })
}
|
||||
386
crates/webclaw-capture/src/infer.rs
Normal file
386
crates/webclaw-capture/src/infer.rs
Normal file
|
|
@ -0,0 +1,386 @@
|
|||
use std::collections::{BTreeMap, BTreeSet};
|
||||
|
||||
use serde_json::{Map, Value, json};
|
||||
use url::Url;
|
||||
|
||||
use crate::classify::filter_api_exchanges;
|
||||
use crate::redact::{redact_headers, redact_url};
|
||||
use crate::types::{
|
||||
CapturedExchange, EndpointDefinition, EndpointExample, EndpointSafety, HeaderMap,
|
||||
};
|
||||
|
||||
pub fn infer_endpoints(exchanges: &[CapturedExchange]) -> Vec<EndpointDefinition> {
|
||||
let mut groups = BTreeMap::<EndpointKey, EndpointBuilder>::new();
|
||||
|
||||
for exchange in filter_api_exchanges(exchanges) {
|
||||
let Ok(url) = Url::parse(&exchange.url) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let method = exchange.method.to_ascii_uppercase();
|
||||
let origin = url.origin().ascii_serialization();
|
||||
let path_template = normalize_path_template(url.path());
|
||||
let key = EndpointKey {
|
||||
method: method.clone(),
|
||||
origin: origin.clone(),
|
||||
path_template: path_template.clone(),
|
||||
};
|
||||
|
||||
groups
|
||||
.entry(key)
|
||||
.or_insert_with(|| EndpointBuilder::new(method, origin, path_template))
|
||||
.add_exchange(&exchange, &url);
|
||||
}
|
||||
|
||||
groups
|
||||
.into_values()
|
||||
.map(EndpointBuilder::into_endpoint)
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn normalize_path_template(path: &str) -> String {
|
||||
let normalized = if path.is_empty() { "/" } else { path };
|
||||
let trailing_slash = normalized.len() > 1 && normalized.ends_with('/');
|
||||
|
||||
let mut segments = normalized
|
||||
.split('/')
|
||||
.filter(|segment| !segment.is_empty())
|
||||
.map(|segment| {
|
||||
if is_identifier_segment(segment) {
|
||||
"{id}".to_owned()
|
||||
} else {
|
||||
segment.to_owned()
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if segments.is_empty() {
|
||||
return "/".to_owned();
|
||||
}
|
||||
|
||||
let mut path_template = format!("/{}", segments.join("/"));
|
||||
if trailing_slash {
|
||||
path_template.push('/');
|
||||
}
|
||||
segments.clear();
|
||||
path_template
|
||||
}
|
||||
|
||||
pub fn infer_json_schema(value: &Value) -> Value {
|
||||
match value {
|
||||
Value::Null => json!({ "type": "null" }),
|
||||
Value::Bool(_) => json!({ "type": "boolean" }),
|
||||
Value::Number(number) if number.is_i64() || number.is_u64() => {
|
||||
json!({ "type": "integer" })
|
||||
}
|
||||
Value::Number(_) => json!({ "type": "number" }),
|
||||
Value::String(_) => json!({ "type": "string" }),
|
||||
Value::Array(items) => {
|
||||
let item_schema = items
|
||||
.iter()
|
||||
.map(infer_json_schema)
|
||||
.reduce(|left, right| merge_json_schemas(&left, &right))
|
||||
.unwrap_or_else(|| json!({}));
|
||||
|
||||
json!({
|
||||
"type": "array",
|
||||
"items": item_schema
|
||||
})
|
||||
}
|
||||
Value::Object(object) => {
|
||||
let properties = object
|
||||
.iter()
|
||||
.map(|(key, value)| (key.clone(), infer_json_schema(value)))
|
||||
.collect::<Map<_, _>>();
|
||||
|
||||
json!({
|
||||
"type": "object",
|
||||
"properties": properties
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn endpoint_id(method: &str, origin: &str, path_template: &str) -> String {
|
||||
format!(
|
||||
"{} {}{}",
|
||||
method.to_ascii_uppercase(),
|
||||
origin.trim_end_matches('/'),
|
||||
ensure_leading_slash(path_template)
|
||||
)
|
||||
}
|
||||
|
||||
/// Grouping key for exchanges that belong to the same endpoint. The derived
/// ordering compares method, then origin, then path template.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
struct EndpointKey {
    /// Upper-cased HTTP method.
    method: String,
    /// ASCII serialisation of the URL origin.
    origin: String,
    /// Path with identifier segments replaced by `{id}`.
    path_template: String,
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct EndpointBuilder {
|
||||
method: String,
|
||||
origin: String,
|
||||
path_template: String,
|
||||
query_params: BTreeMap<String, BTreeSet<String>>,
|
||||
request_schema: Option<Value>,
|
||||
response_schema: Option<Value>,
|
||||
auth_evidence: BTreeSet<String>,
|
||||
examples: Vec<EndpointExample>,
|
||||
}
|
||||
|
||||
impl EndpointBuilder {
|
||||
fn new(method: String, origin: String, path_template: String) -> Self {
|
||||
Self {
|
||||
method,
|
||||
origin,
|
||||
path_template,
|
||||
query_params: BTreeMap::new(),
|
||||
request_schema: None,
|
||||
response_schema: None,
|
||||
auth_evidence: BTreeSet::new(),
|
||||
examples: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn add_exchange(&mut self, exchange: &CapturedExchange, url: &Url) {
|
||||
for (name, value) in url.query_pairs() {
|
||||
self.query_params
|
||||
.entry(name.into_owned())
|
||||
.or_default()
|
||||
.insert(value.into_owned());
|
||||
}
|
||||
|
||||
self.record_auth_evidence(&exchange.request_headers);
|
||||
self.record_auth_evidence(&exchange.response_headers);
|
||||
|
||||
if let Some(schema) = infer_body_schema(exchange.request_body_sample.as_deref()) {
|
||||
self.request_schema = merge_optional_schema(self.request_schema.take(), schema);
|
||||
}
|
||||
|
||||
if let Some(schema) = infer_body_schema(exchange.response_body_sample.as_deref()) {
|
||||
self.response_schema = merge_optional_schema(self.response_schema.take(), schema);
|
||||
}
|
||||
|
||||
self.examples.push(EndpointExample {
|
||||
url: redact_url(&exchange.url),
|
||||
request_headers: redact_headers(&exchange.request_headers),
|
||||
request_body_sample: redact_body_sample(exchange.request_body_sample.as_deref()),
|
||||
response_status: exchange.status,
|
||||
response_headers: redact_headers(&exchange.response_headers),
|
||||
response_body_sample: redact_body_sample(exchange.response_body_sample.as_deref()),
|
||||
captured_at: exchange.started_at,
|
||||
});
|
||||
}
|
||||
|
||||
fn into_endpoint(self) -> EndpointDefinition {
|
||||
let safety = endpoint_safety(&self.method);
|
||||
|
||||
EndpointDefinition {
|
||||
id: endpoint_id(&self.method, &self.origin, &self.path_template),
|
||||
method: self.method,
|
||||
origin: self.origin,
|
||||
path_template: self.path_template,
|
||||
query_params: self
|
||||
.query_params
|
||||
.into_iter()
|
||||
.map(|(name, values)| (name, values.into_iter().collect()))
|
||||
.collect(),
|
||||
request_schema: self.request_schema,
|
||||
response_schema: self.response_schema,
|
||||
auth_evidence: self.auth_evidence.into_iter().collect(),
|
||||
safety,
|
||||
examples: self.examples,
|
||||
}
|
||||
}
|
||||
|
||||
fn record_auth_evidence(&mut self, headers: &HeaderMap) {
|
||||
for name in headers.keys() {
|
||||
if is_auth_evidence_header(name) {
|
||||
self.auth_evidence.insert(format!("{name} header observed"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn infer_body_schema(body: Option<&str>) -> Option<Value> {
|
||||
let body = body?.trim();
|
||||
if body.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
serde_json::from_str::<Value>(body)
|
||||
.ok()
|
||||
.map(|value| infer_json_schema(&value))
|
||||
}
|
||||
|
||||
fn merge_optional_schema(current: Option<Value>, next: Value) -> Option<Value> {
|
||||
Some(match current {
|
||||
Some(current) => merge_json_schemas(¤t, &next),
|
||||
None => next,
|
||||
})
|
||||
}
|
||||
|
||||
fn merge_json_schemas(left: &Value, right: &Value) -> Value {
|
||||
if left == right {
|
||||
return left.clone();
|
||||
}
|
||||
|
||||
let left_type = left.get("type").and_then(Value::as_str);
|
||||
let right_type = right.get("type").and_then(Value::as_str);
|
||||
|
||||
match (left_type, right_type) {
|
||||
(Some("object"), Some("object")) => merge_object_schemas(left, right),
|
||||
(Some("array"), Some("array")) => {
|
||||
let left_items = left.get("items").cloned().unwrap_or_else(|| json!({}));
|
||||
let right_items = right.get("items").cloned().unwrap_or_else(|| json!({}));
|
||||
json!({
|
||||
"type": "array",
|
||||
"items": merge_json_schemas(&left_items, &right_items)
|
||||
})
|
||||
}
|
||||
(Some(_), Some(_)) => {
|
||||
let mut variants = Vec::new();
|
||||
push_unique_schema(&mut variants, left.clone());
|
||||
push_unique_schema(&mut variants, right.clone());
|
||||
json!({ "oneOf": variants })
|
||||
}
|
||||
_ => right.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_object_schemas(left: &Value, right: &Value) -> Value {
|
||||
let mut properties = Map::new();
|
||||
|
||||
if let Some(left_properties) = left.get("properties").and_then(Value::as_object) {
|
||||
for (name, schema) in left_properties {
|
||||
properties.insert(name.clone(), schema.clone());
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(right_properties) = right.get("properties").and_then(Value::as_object) {
|
||||
for (name, schema) in right_properties {
|
||||
let schema = properties
|
||||
.remove(name)
|
||||
.map(|existing| merge_json_schemas(&existing, schema))
|
||||
.unwrap_or_else(|| schema.clone());
|
||||
properties.insert(name.clone(), schema);
|
||||
}
|
||||
}
|
||||
|
||||
json!({
|
||||
"type": "object",
|
||||
"properties": properties
|
||||
})
|
||||
}
|
||||
|
||||
fn push_unique_schema(variants: &mut Vec<Value>, schema: Value) {
|
||||
if let Some(nested) = schema.get("oneOf").and_then(Value::as_array) {
|
||||
for item in nested {
|
||||
push_unique_schema(variants, item.clone());
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if !variants.iter().any(|existing| existing == &schema) {
|
||||
variants.push(schema);
|
||||
}
|
||||
}
|
||||
|
||||
fn endpoint_safety(method: &str) -> EndpointSafety {
|
||||
if is_safe_method(method) {
|
||||
EndpointSafety {
|
||||
safe_to_replay: true,
|
||||
requires_confirmation: false,
|
||||
reason: format!(
|
||||
"{} is a read-oriented HTTP method",
|
||||
method.to_ascii_uppercase()
|
||||
),
|
||||
}
|
||||
} else {
|
||||
EndpointSafety {
|
||||
safe_to_replay: false,
|
||||
requires_confirmation: true,
|
||||
reason: format!(
|
||||
"{} may mutate server state and requires confirmation",
|
||||
method.to_ascii_uppercase()
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` when `method` is GET, HEAD or OPTIONS, compared ASCII case-insensitively.
fn is_safe_method(method: &str) -> bool {
    const READ_ONLY_METHODS: [&str; 3] = ["GET", "HEAD", "OPTIONS"];

    READ_ONLY_METHODS
        .iter()
        .any(|candidate| method.eq_ignore_ascii_case(candidate))
}
|
||||
|
||||
fn redact_body_sample(sample: Option<&str>) -> Option<String> {
|
||||
sample.map(|body| match serde_json::from_str::<Value>(body) {
|
||||
Ok(value) => crate::redact::redact_json(&value).to_string(),
|
||||
Err(_) => body.to_owned(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Reports whether a header name looks authentication-related.
///
/// The name is lower-cased and tested against a fixed list of markers, both
/// verbatim and with every non-alphanumeric character stripped from both
/// sides, so `X-Api-Key`, `x_api_key` and `xapikey` all match `api-key`.
fn is_auth_evidence_header(name: &str) -> bool {
    const MARKERS: [&str; 7] = [
        "authorization",
        "cookie",
        "set-cookie",
        "api-key",
        "csrf",
        "token",
        "session",
    ];

    // Drops separators so differently punctuated spellings compare equal.
    let squash = |text: &str| -> String {
        text.chars()
            .filter(char::is_ascii_alphanumeric)
            .collect()
    };

    let lowered = name.to_ascii_lowercase();
    let squashed = squash(&lowered);

    MARKERS
        .iter()
        .any(|marker| lowered.contains(*marker) || squashed.contains(&squash(marker)))
}
|
||||
|
||||
fn is_identifier_segment(segment: &str) -> bool {
|
||||
is_numeric_segment(segment) || is_uuid_like_segment(segment) || is_high_entropy_segment(segment)
|
||||
}
|
||||
|
||||
/// Returns `true` for a non-empty segment made up solely of ASCII digits.
fn is_numeric_segment(segment: &str) -> bool {
    if segment.is_empty() {
        return false;
    }

    // Non-ASCII characters encode to bytes >= 0x80, which are never ASCII digits.
    segment.bytes().all(|byte| byte.is_ascii_digit())
}
|
||||
|
||||
/// Returns `true` when the segment has the 8-4-4-4-12 hyphenated hex layout of a UUID.
fn is_uuid_like_segment(segment: &str) -> bool {
    const GROUP_LENGTHS: [usize; 5] = [8, 4, 4, 4, 12];

    let groups: Vec<&str> = segment.split('-').collect();
    if groups.len() != GROUP_LENGTHS.len() {
        return false;
    }

    groups.iter().zip(GROUP_LENGTHS).all(|(group, expected)| {
        group.len() == expected && group.chars().all(|digit| digit.is_ascii_hexdigit())
    })
}
|
||||
|
||||
/// Heuristic for opaque tokens: at least 16 bytes long, built only from ASCII
/// letters, digits, `_`, `-` or `~`, and containing at least one digit and one letter.
fn is_high_entropy_segment(segment: &str) -> bool {
    if segment.len() < 16 {
        return false;
    }

    let mut saw_digit = false;
    let mut saw_letter = false;

    for character in segment.chars() {
        match character {
            '0'..='9' => saw_digit = true,
            'a'..='z' | 'A'..='Z' => saw_letter = true,
            '_' | '-' | '~' => {}
            // Any other character disqualifies the whole segment.
            _ => return false,
        }
    }

    saw_digit && saw_letter
}
|
||||
|
||||
/// Returns `path` as an owned string that is guaranteed to start with `/`.
fn ensure_leading_slash(path: &str) -> String {
    let mut normalized = String::with_capacity(path.len() + 1);

    if !path.starts_with('/') {
        normalized.push('/');
    }
    normalized.push_str(path);

    normalized
}
|
||||
8
crates/webclaw-capture/src/lib.rs
Normal file
8
crates/webclaw-capture/src/lib.rs
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
pub mod cdp;
|
||||
pub mod classify;
|
||||
pub mod infer;
|
||||
pub mod openapi;
|
||||
pub mod redact;
|
||||
pub mod replay;
|
||||
pub mod store;
|
||||
pub mod types;
|
||||
463
crates/webclaw-capture/src/openapi.rs
Normal file
463
crates/webclaw-capture/src/openapi.rs
Normal file
|
|
@ -0,0 +1,463 @@
|
|||
use std::fs;
|
||||
use std::path::{Component, Path, PathBuf};
|
||||
|
||||
use serde_json::{Map, Value, json};
|
||||
use url::Url;
|
||||
|
||||
use crate::redact::{redact_headers, redact_json};
|
||||
use crate::store::{capture_root, load_endpoints};
|
||||
use crate::types::{CaptureError, EndpointDefinition, EndpointExample};
|
||||
|
||||
/// File name that `write_openapi` joins onto the capture directory for the exported document.
const OPENAPI_FILE: &str = "openapi.json";
|
||||
/// Placeholder substituted for sensitive parameter values and non-JSON bodies in the export.
const REDACTED: &str = "[REDACTED]";
|
||||
|
||||
pub fn export_openapi(endpoints: &[EndpointDefinition]) -> Value {
|
||||
let mut paths = Map::new();
|
||||
|
||||
for endpoint in endpoints {
|
||||
let path = normalize_openapi_path(&endpoint.path_template);
|
||||
let method = endpoint.method.to_ascii_lowercase();
|
||||
let operation = operation_for(endpoint);
|
||||
|
||||
let path_item = paths
|
||||
.entry(path)
|
||||
.or_insert_with(|| Value::Object(Map::new()));
|
||||
if let Value::Object(path_item) = path_item {
|
||||
path_item.insert(method, operation);
|
||||
}
|
||||
}
|
||||
|
||||
json!({
|
||||
"openapi": "3.1.0",
|
||||
"info": {
|
||||
"title": "Webclaw Learned API",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
"paths": paths
|
||||
})
|
||||
}
|
||||
|
||||
pub fn write_openapi(capture_id: &str) -> Result<PathBuf, CaptureError> {
|
||||
let endpoints = load_endpoints(capture_id)?;
|
||||
let document = export_openapi(&endpoints);
|
||||
let capture_dir = capture_dir_for_id(&capture_root(), capture_id)?;
|
||||
fs::create_dir_all(&capture_dir)?;
|
||||
|
||||
let path = capture_dir.join(OPENAPI_FILE);
|
||||
fs::write(&path, serde_json::to_string_pretty(&document)?)?;
|
||||
|
||||
Ok(path)
|
||||
}
|
||||
|
||||
fn operation_for(endpoint: &EndpointDefinition) -> Value {
|
||||
let mut operation = Map::new();
|
||||
let method = endpoint.method.to_ascii_uppercase();
|
||||
|
||||
operation.insert(
|
||||
"operationId".to_owned(),
|
||||
Value::String(operation_id(endpoint)),
|
||||
);
|
||||
operation.insert(
|
||||
"summary".to_owned(),
|
||||
Value::String(format!("{method} {}", endpoint.path_template)),
|
||||
);
|
||||
operation.insert(
|
||||
"x-webclaw-endpoint-id".to_owned(),
|
||||
Value::String(endpoint.id.clone()),
|
||||
);
|
||||
operation.insert(
|
||||
"x-webclaw-origin".to_owned(),
|
||||
Value::String(endpoint.origin.clone()),
|
||||
);
|
||||
|
||||
if !endpoint.auth_evidence.is_empty() {
|
||||
operation.insert(
|
||||
"x-webclaw-auth-evidence".to_owned(),
|
||||
json!(endpoint.auth_evidence),
|
||||
);
|
||||
}
|
||||
|
||||
if endpoint.safety.requires_confirmation || !endpoint.safety.safe_to_replay {
|
||||
operation.insert("x-webclaw-requires-confirmation".to_owned(), json!(true));
|
||||
}
|
||||
|
||||
let parameters = parameters_for(endpoint);
|
||||
if !parameters.is_empty() {
|
||||
operation.insert("parameters".to_owned(), Value::Array(parameters));
|
||||
}
|
||||
|
||||
if let Some(request_body) = request_body_for(endpoint) {
|
||||
operation.insert("requestBody".to_owned(), request_body);
|
||||
}
|
||||
|
||||
operation.insert("responses".to_owned(), responses_for(endpoint));
|
||||
|
||||
let examples = examples_for(endpoint);
|
||||
if !examples.is_empty() {
|
||||
operation.insert("x-webclaw-examples".to_owned(), Value::Array(examples));
|
||||
}
|
||||
|
||||
Value::Object(operation)
|
||||
}
|
||||
|
||||
fn parameters_for(endpoint: &EndpointDefinition) -> Vec<Value> {
|
||||
let mut parameters = path_parameters(&endpoint.path_template);
|
||||
|
||||
for (name, values) in &endpoint.query_params {
|
||||
let examples = examples_object(
|
||||
values
|
||||
.iter()
|
||||
.map(|value| Value::String(redacted_parameter_value(name, value))),
|
||||
);
|
||||
let mut parameter = Map::new();
|
||||
|
||||
parameter.insert("name".to_owned(), Value::String(name.clone()));
|
||||
parameter.insert("in".to_owned(), Value::String("query".to_owned()));
|
||||
parameter.insert("required".to_owned(), Value::Bool(false));
|
||||
parameter.insert("schema".to_owned(), json!({ "type": "string" }));
|
||||
|
||||
if !examples.is_empty() {
|
||||
parameter.insert("examples".to_owned(), Value::Object(examples));
|
||||
}
|
||||
|
||||
parameters.push(Value::Object(parameter));
|
||||
}
|
||||
|
||||
parameters
|
||||
}
|
||||
|
||||
fn path_parameters(path_template: &str) -> Vec<Value> {
|
||||
let mut parameters = Vec::new();
|
||||
let mut cursor = path_template;
|
||||
|
||||
while let Some(start) = cursor.find('{') {
|
||||
let after_start = &cursor[start + 1..];
|
||||
let Some(end) = after_start.find('}') else {
|
||||
break;
|
||||
};
|
||||
|
||||
let name = &after_start[..end];
|
||||
if !name.is_empty()
|
||||
&& !parameters
|
||||
.iter()
|
||||
.any(|parameter| parameter_name(parameter) == name)
|
||||
{
|
||||
parameters.push(json!({
|
||||
"name": name,
|
||||
"in": "path",
|
||||
"required": true,
|
||||
"schema": { "type": "string" }
|
||||
}));
|
||||
}
|
||||
|
||||
cursor = &after_start[end + 1..];
|
||||
}
|
||||
|
||||
parameters
|
||||
}
|
||||
|
||||
fn request_body_for(endpoint: &EndpointDefinition) -> Option<Value> {
|
||||
let examples = body_examples(endpoint.examples.iter().filter_map(|example| {
|
||||
example
|
||||
.request_body_sample
|
||||
.as_deref()
|
||||
.map(redacted_body_sample)
|
||||
}));
|
||||
|
||||
if endpoint.request_schema.is_none() && examples.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(json!({
|
||||
"required": false,
|
||||
"content": {
|
||||
"application/json": media_type_object(endpoint.request_schema.clone(), examples)
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
fn responses_for(endpoint: &EndpointDefinition) -> Value {
|
||||
let mut responses = Map::new();
|
||||
let mut statuses = endpoint
|
||||
.examples
|
||||
.iter()
|
||||
.map(|example| example.response_status)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
statuses.sort_unstable();
|
||||
statuses.dedup();
|
||||
|
||||
if statuses.is_empty() {
|
||||
statuses.push(200);
|
||||
}
|
||||
|
||||
for status in statuses {
|
||||
let examples = body_examples(
|
||||
endpoint
|
||||
.examples
|
||||
.iter()
|
||||
.filter(move |example| example.response_status == status)
|
||||
.filter_map(|example| {
|
||||
example
|
||||
.response_body_sample
|
||||
.as_deref()
|
||||
.map(redacted_body_sample)
|
||||
}),
|
||||
);
|
||||
|
||||
responses.insert(
|
||||
status.to_string(),
|
||||
json!({
|
||||
"description": format!("Captured HTTP {status} response"),
|
||||
"content": {
|
||||
"application/json": media_type_object(endpoint.response_schema.clone(), examples)
|
||||
}
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
Value::Object(responses)
|
||||
}
|
||||
|
||||
fn media_type_object(schema: Option<Value>, examples: Map<String, Value>) -> Value {
|
||||
let mut media_type = Map::new();
|
||||
|
||||
if let Some(schema) = schema {
|
||||
media_type.insert("schema".to_owned(), redact_json(&schema));
|
||||
}
|
||||
|
||||
if !examples.is_empty() {
|
||||
media_type.insert("examples".to_owned(), Value::Object(examples));
|
||||
}
|
||||
|
||||
Value::Object(media_type)
|
||||
}
|
||||
|
||||
fn examples_for(endpoint: &EndpointDefinition) -> Vec<Value> {
|
||||
endpoint.examples.iter().map(redacted_example).collect()
|
||||
}
|
||||
|
||||
fn redacted_example(example: &EndpointExample) -> Value {
|
||||
json!({
|
||||
"url": redacted_example_url(&example.url),
|
||||
"request_headers": redact_headers(&example.request_headers),
|
||||
"request_body": example.request_body_sample.as_deref().map(redacted_body_sample),
|
||||
"response_status": example.response_status,
|
||||
"response_headers": redact_headers(&example.response_headers),
|
||||
"response_body": example.response_body_sample.as_deref().map(redacted_body_sample),
|
||||
"captured_at": example.captured_at
|
||||
})
|
||||
}
|
||||
|
||||
fn redacted_example_url(url: &str) -> String {
|
||||
let Ok(mut parsed) = Url::parse(url) else {
|
||||
return url.to_owned();
|
||||
};
|
||||
|
||||
let pairs: Vec<(String, String)> = parsed.query_pairs().into_owned().collect();
|
||||
if pairs.is_empty() {
|
||||
return parsed.to_string();
|
||||
}
|
||||
|
||||
parsed.set_query(None);
|
||||
{
|
||||
let mut query = parsed.query_pairs_mut();
|
||||
for (name, value) in pairs {
|
||||
query.append_pair(&name, &redacted_parameter_value(&name, &value));
|
||||
}
|
||||
}
|
||||
|
||||
parsed.to_string()
|
||||
}
|
||||
|
||||
fn body_examples(values: impl Iterator<Item = Value>) -> Map<String, Value> {
|
||||
examples_object(values)
|
||||
}
|
||||
|
||||
fn examples_object(values: impl Iterator<Item = Value>) -> Map<String, Value> {
|
||||
let mut examples = Map::new();
|
||||
|
||||
for (index, value) in values.enumerate() {
|
||||
examples.insert(format!("captured-{}", index + 1), json!({ "value": value }));
|
||||
}
|
||||
|
||||
examples
|
||||
}
|
||||
|
||||
fn redacted_body_sample(sample: &str) -> Value {
|
||||
match serde_json::from_str::<Value>(sample) {
|
||||
Ok(value) => redact_json(&value),
|
||||
Err(_) if contains_obvious_secret(sample) => Value::String(REDACTED.to_owned()),
|
||||
Err(_) => Value::String(sample.to_owned()),
|
||||
}
|
||||
}
|
||||
|
||||
fn contains_obvious_secret(value: &str) -> bool {
|
||||
let lower = value.to_ascii_lowercase();
|
||||
lower.contains("bearer ")
|
||||
|| lower.contains("authorization")
|
||||
|| lower.contains("api_key")
|
||||
|| lower.contains("api-key")
|
||||
|| lower.contains("csrf")
|
||||
|| lower.contains("token")
|
||||
|| lower.contains("session")
|
||||
|| lower.contains("password")
|
||||
|| lower.contains("cookie")
|
||||
|| contains_email_like_value(value)
|
||||
}
|
||||
|
||||
fn redacted_parameter_value(name: &str, value: &str) -> String {
|
||||
if is_sensitive_name(name) || contains_obvious_secret(value) {
|
||||
REDACTED.to_owned()
|
||||
} else {
|
||||
value.to_owned()
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports whether a parameter or header name refers to sensitive data.
///
/// The name is lower-cased and tested against a fixed marker list, both
/// verbatim and with all non-alphanumeric characters stripped from both sides,
/// so `api_key`, `Api-Key` and `apikey` are all caught by `api-key`.
fn is_sensitive_name(name: &str) -> bool {
    const MARKERS: [&str; 9] = [
        "authorization",
        "cookie",
        "set-cookie",
        "api-key",
        "csrf",
        "token",
        "session",
        "password",
        "email",
    ];

    // Drops separators so differently punctuated spellings compare equal.
    let squash = |text: &str| -> String {
        text.chars()
            .filter(char::is_ascii_alphanumeric)
            .collect()
    };

    let lowered = name.to_ascii_lowercase();
    let squashed = squash(&lowered);

    MARKERS
        .iter()
        .any(|marker| lowered.contains(*marker) || squashed.contains(&squash(marker)))
}
|
||||
|
||||
/// Reports whether `value` contains something shaped like an email address.
///
/// Every `@` in the text is examined, not just the first one. An occurrence
/// counts when it is directly preceded by a local-part character (ASCII
/// alphanumeric or one of `. _ % + -`) and directly followed by a run of
/// domain characters (ASCII alphanumeric, `.` or `-`) that includes a `.`.
///
/// Checking only the first `@` let an address slip through whenever an
/// earlier stray `@` was present (for example `cc @team and bob@example.org`),
/// which left that text unredacted.
fn contains_email_like_value(value: &str) -> bool {
    value.match_indices('@').any(|(at_index, _)| {
        // `@` is one byte, so both slice boundaries fall on character boundaries.
        let has_local_part = value[..at_index]
            .chars()
            .next_back()
            .is_some_and(|character| {
                character.is_ascii_alphanumeric()
                    || matches!(character, '.' | '_' | '%' | '+' | '-')
            });

        let has_dotted_domain = value[at_index + 1..]
            .chars()
            .take_while(|character| {
                character.is_ascii_alphanumeric() || matches!(character, '.' | '-')
            })
            .any(|character| character == '.');

        has_local_part && has_dotted_domain
    })
}
|
||||
|
||||
fn operation_id(endpoint: &EndpointDefinition) -> String {
|
||||
format!(
|
||||
"{}_{}",
|
||||
endpoint.method.to_ascii_lowercase(),
|
||||
endpoint
|
||||
.path_template
|
||||
.trim_matches('/')
|
||||
.chars()
|
||||
.map(|character| {
|
||||
if character.is_ascii_alphanumeric() {
|
||||
character.to_ascii_lowercase()
|
||||
} else {
|
||||
'_'
|
||||
}
|
||||
})
|
||||
.collect::<String>()
|
||||
)
|
||||
.trim_matches('_')
|
||||
.to_owned()
|
||||
}
|
||||
|
||||
/// Returns the path template as an owned string that is guaranteed to start with `/`.
fn normalize_openapi_path(path_template: &str) -> String {
    let mut normalized = String::with_capacity(path_template.len() + 1);

    if !path_template.starts_with('/') {
        normalized.push('/');
    }
    normalized.push_str(path_template);

    normalized
}
|
||||
|
||||
fn parameter_name(parameter: &Value) -> &str {
|
||||
parameter
|
||||
.get("name")
|
||||
.and_then(Value::as_str)
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
fn capture_dir_for_id(root: &Path, capture_id: &str) -> Result<PathBuf, CaptureError> {
|
||||
let mut capture_dir = root.to_path_buf();
|
||||
let parts = capture_id
|
||||
.split(['/', '\\'])
|
||||
.filter(|part| !part.is_empty())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if parts.is_empty() {
|
||||
return Err(CaptureError::Storage(
|
||||
"capture id cannot be empty".to_owned(),
|
||||
));
|
||||
}
|
||||
|
||||
for part in parts {
|
||||
if !is_safe_path_segment(part) {
|
||||
return Err(CaptureError::Storage(format!(
|
||||
"capture id contains unsafe path segment: {capture_id}"
|
||||
)));
|
||||
}
|
||||
capture_dir.push(part);
|
||||
}
|
||||
|
||||
ensure_within_root(root, &capture_dir)?;
|
||||
|
||||
Ok(capture_dir)
|
||||
}
|
||||
|
||||
fn ensure_within_root(root: &Path, path: &Path) -> Result<(), CaptureError> {
|
||||
if relative_components(path).starts_with(&relative_components(root)) {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(CaptureError::Storage(format!(
|
||||
"capture path escapes capture root: {}",
|
||||
path.display()
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
/// Flattens a path into comparable string components.
///
/// `.` components are dropped, the root directory is represented as `\`,
/// `..` is kept literally, and prefix and normal components are converted lossily.
fn relative_components(path: &Path) -> Vec<String> {
    let mut parts = Vec::new();

    for component in path.components() {
        let text = match component {
            Component::CurDir => continue,
            Component::RootDir => String::from("\\"),
            Component::ParentDir => String::from(".."),
            Component::Prefix(prefix) => prefix.as_os_str().to_string_lossy().into_owned(),
            Component::Normal(value) => value.to_string_lossy().into_owned(),
        };
        parts.push(text);
    }

    parts
}
|
||||
|
||||
/// Returns `true` when a capture-id segment can be used as a single directory name:
/// non-empty, not `.` or `..`, and free of `:`, `/` and `\`.
fn is_safe_path_segment(segment: &str) -> bool {
    if segment.is_empty() || matches!(segment, "." | "..") {
        return false;
    }

    !segment.contains([':', '/', '\\'])
}
|
||||
236
crates/webclaw-capture/src/redact.rs
Normal file
236
crates/webclaw-capture/src/redact.rs
Normal file
|
|
@ -0,0 +1,236 @@
|
|||
use std::collections::BTreeMap;
|
||||
|
||||
use serde_json::{Map, Value};
|
||||
use url::Url;
|
||||
|
||||
use crate::types::{
|
||||
CaptureArtifact, CapturedExchange, EndpointDefinition, EndpointExample, HeaderMap,
|
||||
};
|
||||
|
||||
/// Placeholder written in place of any header, query value, JSON field or text line that is redacted.
const REDACTED: &str = "[REDACTED]";
|
||||
|
||||
/// Lower-case markers that `is_sensitive_name` searches for inside header, query and JSON key names.
const SENSITIVE_NAMES: &[&str] = &[
|
||||
"authorization",
|
||||
"cookie",
|
||||
"set-cookie",
|
||||
"api-key",
|
||||
"csrf",
|
||||
"token",
|
||||
"session",
|
||||
"password",
|
||||
"email",
|
||||
];
|
||||
|
||||
pub fn redact_headers(headers: &HeaderMap) -> HeaderMap {
|
||||
headers
|
||||
.iter()
|
||||
.map(|(name, value)| {
|
||||
let value = if is_sensitive_name(name) {
|
||||
Value::String(REDACTED.to_owned())
|
||||
} else {
|
||||
value.clone()
|
||||
};
|
||||
(name.clone(), value)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn redact_url(url: &str) -> String {
|
||||
let Ok(mut parsed) = Url::parse(url) else {
|
||||
return url.to_owned();
|
||||
};
|
||||
|
||||
let pairs: Vec<(String, String)> = parsed.query_pairs().into_owned().collect();
|
||||
if pairs.is_empty() {
|
||||
return parsed.to_string();
|
||||
}
|
||||
|
||||
parsed.set_query(None);
|
||||
{
|
||||
let mut query = parsed.query_pairs_mut();
|
||||
for (name, value) in pairs {
|
||||
let value = if is_sensitive_name(&name) {
|
||||
REDACTED.to_owned()
|
||||
} else {
|
||||
value
|
||||
};
|
||||
query.append_pair(&name, &value);
|
||||
}
|
||||
}
|
||||
|
||||
parsed.to_string()
|
||||
}
|
||||
|
||||
pub fn redact_json(value: &Value) -> Value {
|
||||
match value {
|
||||
Value::Object(object) => Value::Object(redact_json_object(object)),
|
||||
Value::Array(items) => Value::Array(items.iter().map(redact_json).collect()),
|
||||
_ => value.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn redact_artifact(artifact: &CaptureArtifact) -> CaptureArtifact {
|
||||
let metadata = match redact_json(&Value::Object(artifact.metadata.clone())) {
|
||||
Value::Object(metadata) => metadata,
|
||||
_ => Map::new(),
|
||||
};
|
||||
|
||||
CaptureArtifact {
|
||||
id: artifact.id.clone(),
|
||||
source_url: redact_url(&artifact.source_url),
|
||||
intent: artifact.intent.clone(),
|
||||
started_at: artifact.started_at,
|
||||
completed_at: artifact.completed_at,
|
||||
exchanges: artifact.exchanges.iter().map(redact_exchange).collect(),
|
||||
endpoints: artifact.endpoints.iter().map(redact_endpoint).collect(),
|
||||
metadata,
|
||||
}
|
||||
}
|
||||
|
||||
fn redact_exchange(exchange: &CapturedExchange) -> CapturedExchange {
|
||||
CapturedExchange {
|
||||
method: exchange.method.clone(),
|
||||
url: redact_url(&exchange.url),
|
||||
request_headers: redact_headers(&exchange.request_headers),
|
||||
request_body_sample: redact_body_sample(exchange.request_body_sample.as_deref()),
|
||||
resource_type: exchange.resource_type.clone(),
|
||||
status: exchange.status,
|
||||
response_headers: redact_headers(&exchange.response_headers),
|
||||
response_body_sample: redact_body_sample(exchange.response_body_sample.as_deref()),
|
||||
started_at: exchange.started_at,
|
||||
duration_ms: exchange.duration_ms,
|
||||
redirect_chain: exchange
|
||||
.redirect_chain
|
||||
.iter()
|
||||
.map(|redirect| redact_url(redirect))
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
fn redact_endpoint(endpoint: &EndpointDefinition) -> EndpointDefinition {
|
||||
EndpointDefinition {
|
||||
id: endpoint.id.clone(),
|
||||
method: endpoint.method.clone(),
|
||||
origin: endpoint.origin.clone(),
|
||||
path_template: endpoint.path_template.clone(),
|
||||
query_params: redact_query_params(&endpoint.query_params),
|
||||
request_schema: endpoint.request_schema.as_ref().map(redact_json),
|
||||
response_schema: endpoint.response_schema.as_ref().map(redact_json),
|
||||
auth_evidence: endpoint.auth_evidence.clone(),
|
||||
safety: endpoint.safety.clone(),
|
||||
examples: endpoint
|
||||
.examples
|
||||
.iter()
|
||||
.map(redact_endpoint_example)
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
fn redact_endpoint_example(example: &EndpointExample) -> EndpointExample {
|
||||
EndpointExample {
|
||||
url: redact_url(&example.url),
|
||||
request_headers: redact_headers(&example.request_headers),
|
||||
request_body_sample: redact_body_sample(example.request_body_sample.as_deref()),
|
||||
response_status: example.response_status,
|
||||
response_headers: redact_headers(&example.response_headers),
|
||||
response_body_sample: redact_body_sample(example.response_body_sample.as_deref()),
|
||||
captured_at: example.captured_at,
|
||||
}
|
||||
}
|
||||
|
||||
fn redact_query_params(params: &BTreeMap<String, Vec<String>>) -> BTreeMap<String, Vec<String>> {
|
||||
params
|
||||
.iter()
|
||||
.map(|(name, values)| {
|
||||
let values = if is_sensitive_name(name) {
|
||||
vec![REDACTED.to_owned()]
|
||||
} else {
|
||||
values.clone()
|
||||
};
|
||||
(name.clone(), values)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn redact_json_object(object: &Map<String, Value>) -> Map<String, Value> {
|
||||
object
|
||||
.iter()
|
||||
.map(|(key, value)| {
|
||||
let value = if is_sensitive_name(key) {
|
||||
Value::String(REDACTED.to_owned())
|
||||
} else {
|
||||
redact_json(value)
|
||||
};
|
||||
(key.clone(), value)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn redact_body_sample(sample: Option<&str>) -> Option<String> {
|
||||
sample.map(|body| match serde_json::from_str::<Value>(body) {
|
||||
Ok(value) => redact_json(&value).to_string(),
|
||||
Err(_) => redact_text_body(body),
|
||||
})
|
||||
}
|
||||
|
||||
fn is_sensitive_name(name: &str) -> bool {
|
||||
let lower = name.to_ascii_lowercase();
|
||||
let compact: String = lower
|
||||
.chars()
|
||||
.filter(|ch| ch.is_ascii_alphanumeric())
|
||||
.collect();
|
||||
|
||||
SENSITIVE_NAMES.iter().any(|sensitive| {
|
||||
let sensitive_compact: String = sensitive
|
||||
.chars()
|
||||
.filter(|ch| ch.is_ascii_alphanumeric())
|
||||
.collect();
|
||||
|
||||
lower.contains(sensitive) || compact.contains(&sensitive_compact)
|
||||
})
|
||||
}
|
||||
|
||||
fn redact_text_body(body: &str) -> String {
|
||||
body.lines()
|
||||
.map(|line| {
|
||||
if is_sensitive_text_line(line) {
|
||||
REDACTED.to_owned()
|
||||
} else {
|
||||
line.to_owned()
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n")
|
||||
}
|
||||
|
||||
fn is_sensitive_text_line(line: &str) -> bool {
|
||||
is_sensitive_name(line) || contains_bearer_token(line) || contains_email_like_value(line)
|
||||
}
|
||||
|
||||
/// Reports whether the line contains `bearer ` (with the trailing space), ignoring ASCII case.
fn contains_bearer_token(line: &str) -> bool {
    const MARKER: &[u8] = b"bearer ";

    // Sliding byte-window comparison; equivalent to lower-casing and searching.
    line.as_bytes()
        .windows(MARKER.len())
        .any(|window| window.eq_ignore_ascii_case(MARKER))
}
|
||||
|
||||
/// Reports whether `line` contains something shaped like an email address.
///
/// Every `@` in the line is examined, not just the first one. An occurrence
/// counts when it is directly preceded by a local-part character (ASCII
/// alphanumeric or one of `. _ % + -`) and directly followed by a run of
/// domain characters (ASCII alphanumeric, `.` or `-`) that includes a `.`.
///
/// Checking only the first `@` let an address slip through whenever an
/// earlier stray `@` was present (for example `cc @team and bob@example.org`),
/// so such a line was stored unredacted.
fn contains_email_like_value(line: &str) -> bool {
    line.match_indices('@').any(|(at_index, _)| {
        // `@` is one byte, so both slice boundaries fall on character boundaries.
        let has_local_part = line[..at_index]
            .chars()
            .next_back()
            .is_some_and(|character| {
                character.is_ascii_alphanumeric()
                    || matches!(character, '.' | '_' | '%' | '+' | '-')
            });

        let has_dotted_domain = line[at_index + 1..]
            .chars()
            .take_while(|character| {
                character.is_ascii_alphanumeric() || matches!(character, '.' | '-')
            })
            .any(|character| character == '.');

        has_local_part && has_dotted_domain
    })
}
|
||||
383
crates/webclaw-capture/src/replay.rs
Normal file
383
crates/webclaw-capture/src/replay.rs
Normal file
|
|
@ -0,0 +1,383 @@
|
|||
use std::collections::BTreeSet;
|
||||
|
||||
use reqwest::{
|
||||
Client, Method, RequestBuilder,
|
||||
header::{HeaderName, HeaderValue},
|
||||
};
|
||||
use serde_json::{Map, Value};
|
||||
use url::{Url, form_urlencoded::byte_serialize};
|
||||
|
||||
use crate::types::{CaptureError, EndpointDefinition, HeaderMap, ReplayOptions, ReplayResult};
|
||||
|
||||
/// 64 KiB. NOTE(review): presumably the cap applied when sampling replayed response
/// bodies; the use site is outside this view — confirm.
const MAX_BODY_SAMPLE_BYTES: usize = 64 * 1024;
|
||||
|
||||
pub async fn replay_endpoint(
|
||||
endpoint: &EndpointDefinition,
|
||||
options: ReplayOptions,
|
||||
) -> Result<ReplayResult, CaptureError> {
|
||||
if unsafe_replay_requires_confirmation(endpoint, &options) {
|
||||
return Ok(ReplayResult::Blocked {
|
||||
reason: format!(
|
||||
"{} replay requires --confirm-unsafe unless --dry-run is used",
|
||||
endpoint.method.to_ascii_uppercase()
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
let spec = replay_spec(endpoint, &options)?;
|
||||
if options.dry_run {
|
||||
return Ok(ReplayResult::Preview {
|
||||
method: spec.method.as_str().to_owned(),
|
||||
url: spec.url.to_string(),
|
||||
headers: spec.headers,
|
||||
body_sample: spec.body_sample,
|
||||
});
|
||||
}
|
||||
|
||||
let response = request_builder_from_spec(spec)?.send().await?;
|
||||
let status = response.status().as_u16();
|
||||
let headers = response_headers_to_json(response.headers());
|
||||
let body = response.bytes().await?;
|
||||
let body_sample = body_sample_from_bytes(&body);
|
||||
|
||||
Ok(ReplayResult::Executed {
|
||||
status,
|
||||
headers,
|
||||
body_sample,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn build_replay_request(
|
||||
endpoint: &EndpointDefinition,
|
||||
options: &ReplayOptions,
|
||||
) -> Result<RequestBuilder, CaptureError> {
|
||||
if unsafe_replay_requires_confirmation(endpoint, options) {
|
||||
return Err(CaptureError::Replay(format!(
|
||||
"{} replay requires confirmation",
|
||||
endpoint.method.to_ascii_uppercase()
|
||||
)));
|
||||
}
|
||||
|
||||
request_builder_from_spec(replay_spec(endpoint, options)?)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct ReplaySpec {
|
||||
method: Method,
|
||||
url: Url,
|
||||
headers: HeaderMap,
|
||||
body_sample: Option<String>,
|
||||
}
|
||||
|
||||
fn replay_spec(
|
||||
endpoint: &EndpointDefinition,
|
||||
options: &ReplayOptions,
|
||||
) -> Result<ReplaySpec, CaptureError> {
|
||||
let method = Method::from_bytes(endpoint.method.as_bytes()).map_err(|error| {
|
||||
CaptureError::Replay(format!(
|
||||
"invalid replay method {:?}: {error}",
|
||||
endpoint.method
|
||||
))
|
||||
})?;
|
||||
|
||||
let (path, consumed_params) = interpolate_path_template(&endpoint.path_template, options)?;
|
||||
let mut url = Url::parse(&format!(
|
||||
"{}{}",
|
||||
endpoint.origin.trim_end_matches('/'),
|
||||
ensure_leading_slash(&path)
|
||||
))
|
||||
.map_err(|error| CaptureError::InvalidUrl(error.to_string()))?;
|
||||
|
||||
apply_query_params(&mut url, endpoint, options, &consumed_params);
|
||||
|
||||
let mut headers = HeaderMap::new();
|
||||
if let Some(example) = endpoint.examples.first() {
|
||||
merge_safe_headers(&mut headers, &example.request_headers);
|
||||
}
|
||||
merge_safe_headers(&mut headers, &options.headers);
|
||||
|
||||
let body_sample = replay_body_sample(endpoint, options)?;
|
||||
|
||||
Ok(ReplaySpec {
|
||||
method,
|
||||
url,
|
||||
headers,
|
||||
body_sample,
|
||||
})
|
||||
}
|
||||
|
||||
fn request_builder_from_spec(spec: ReplaySpec) -> Result<RequestBuilder, CaptureError> {
|
||||
let client = Client::new();
|
||||
let mut builder = client.request(spec.method, spec.url);
|
||||
|
||||
for (name, value) in spec.headers {
|
||||
let Some(value) = header_value_to_string(&value) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let Ok(name) = HeaderName::from_bytes(name.as_bytes()) else {
|
||||
continue;
|
||||
};
|
||||
let Ok(value) = HeaderValue::from_str(&value) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
builder = builder.header(name, value);
|
||||
}
|
||||
|
||||
if let Some(body_sample) = spec.body_sample
|
||||
&& !contains_redacted_material(&body_sample)
|
||||
{
|
||||
builder = builder.body(body_sample);
|
||||
}
|
||||
|
||||
Ok(builder)
|
||||
}
|
||||
|
||||
fn unsafe_replay_requires_confirmation(
|
||||
endpoint: &EndpointDefinition,
|
||||
options: &ReplayOptions,
|
||||
) -> bool {
|
||||
is_unsafe_endpoint(endpoint) && !options.dry_run && !options.confirm_unsafe
|
||||
}
|
||||
|
||||
fn is_unsafe_endpoint(endpoint: &EndpointDefinition) -> bool {
|
||||
endpoint.safety.requires_confirmation
|
||||
|| !endpoint.safety.safe_to_replay
|
||||
|| !matches!(
|
||||
endpoint.method.to_ascii_uppercase().as_str(),
|
||||
"GET" | "HEAD" | "OPTIONS"
|
||||
)
|
||||
}
|
||||
|
||||
fn interpolate_path_template(
|
||||
path_template: &str,
|
||||
options: &ReplayOptions,
|
||||
) -> Result<(String, BTreeSet<String>), CaptureError> {
|
||||
let params = params_object(options);
|
||||
let mut consumed = BTreeSet::new();
|
||||
let mut path = String::new();
|
||||
let mut rest = path_template;
|
||||
|
||||
while let Some(start) = rest.find('{') {
|
||||
let (before, after_start) = rest.split_at(start);
|
||||
path.push_str(before);
|
||||
|
||||
let Some(end) = after_start.find('}') else {
|
||||
path.push_str(after_start);
|
||||
return Ok((path, consumed));
|
||||
};
|
||||
|
||||
let name = &after_start[1..end];
|
||||
if let Some(value) = params.and_then(|object| object.get(name)) {
|
||||
let value = scalar_param_to_string(value).ok_or_else(|| {
|
||||
CaptureError::Replay(format!("path parameter {name:?} must be scalar"))
|
||||
})?;
|
||||
path.push_str(&encode_path_segment(&value));
|
||||
consumed.insert(name.to_owned());
|
||||
} else {
|
||||
path.push_str(&after_start[..=end]);
|
||||
}
|
||||
|
||||
rest = &after_start[end + 1..];
|
||||
}
|
||||
|
||||
path.push_str(rest);
|
||||
Ok((path, consumed))
|
||||
}
|
||||
|
||||
fn apply_query_params(
|
||||
url: &mut Url,
|
||||
endpoint: &EndpointDefinition,
|
||||
options: &ReplayOptions,
|
||||
consumed_params: &BTreeSet<String>,
|
||||
) {
|
||||
url.set_query(None);
|
||||
let mut pairs = Vec::<(String, String)>::new();
|
||||
|
||||
for (name, values) in &endpoint.query_params {
|
||||
if consumed_params.contains(name) || is_sensitive_name(name) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(value) = values
|
||||
.iter()
|
||||
.find(|value| !contains_redacted_material(value))
|
||||
.cloned()
|
||||
{
|
||||
pairs.push((name.clone(), value));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(params) = params_object(options) {
|
||||
for (name, value) in params {
|
||||
if consumed_params.contains(name) || is_sensitive_name(name) {
|
||||
continue;
|
||||
}
|
||||
|
||||
append_query_value(&mut pairs, name, value);
|
||||
}
|
||||
}
|
||||
|
||||
if pairs.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut query = url.query_pairs_mut();
|
||||
for (name, value) in pairs {
|
||||
query.append_pair(&name, &value);
|
||||
}
|
||||
}
|
||||
|
||||
fn append_query_value(pairs: &mut Vec<(String, String)>, name: &str, value: &Value) {
|
||||
match value {
|
||||
Value::Array(values) => {
|
||||
for value in values {
|
||||
if let Some(value) = scalar_param_to_string(value)
|
||||
&& !contains_redacted_material(&value)
|
||||
{
|
||||
pairs.push((name.to_owned(), value));
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
if let Some(value) = scalar_param_to_string(value)
|
||||
&& !contains_redacted_material(&value)
|
||||
{
|
||||
pairs.retain(|(existing, _value)| existing != name);
|
||||
pairs.push((name.to_owned(), value));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn replay_body_sample(
|
||||
endpoint: &EndpointDefinition,
|
||||
options: &ReplayOptions,
|
||||
) -> Result<Option<String>, CaptureError> {
|
||||
if let Some(body_json) = &options.body_json {
|
||||
return Ok(Some(serde_json::to_string(body_json)?));
|
||||
}
|
||||
|
||||
let Some(example) = endpoint.examples.first() else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
Ok(example
|
||||
.request_body_sample
|
||||
.as_ref()
|
||||
.filter(|sample| !contains_redacted_material(sample))
|
||||
.cloned())
|
||||
}
|
||||
|
||||
fn merge_safe_headers(target: &mut HeaderMap, headers: &HeaderMap) {
|
||||
for (name, value) in headers {
|
||||
if should_skip_header(name, value) {
|
||||
continue;
|
||||
}
|
||||
|
||||
target.insert(name.clone(), value.clone());
|
||||
}
|
||||
}
|
||||
|
||||
fn should_skip_header(name: &str, value: &Value) -> bool {
|
||||
is_hop_by_hop_header(name)
|
||||
|| header_value_to_string(value)
|
||||
.map(|value| value.trim().is_empty() || contains_redacted_material(&value))
|
||||
.unwrap_or(true)
|
||||
}
|
||||
|
||||
/// Returns `true` for header names that are never copied into a replayed
/// request: `host`, `connection`, `content-length`, `transfer-encoding` and
/// `accept-encoding`. The comparison ignores ASCII case.
fn is_hop_by_hop_header(name: &str) -> bool {
    const SKIPPED: [&str; 5] = [
        "host",
        "connection",
        "content-length",
        "transfer-encoding",
        "accept-encoding",
    ];

    SKIPPED
        .iter()
        .any(|candidate| name.eq_ignore_ascii_case(candidate))
}
|
||||
|
||||
fn header_value_to_string(value: &Value) -> Option<String> {
|
||||
match value {
|
||||
Value::String(value) => Some(value.clone()),
|
||||
Value::Number(value) => Some(value.to_string()),
|
||||
Value::Bool(value) => Some(value.to_string()),
|
||||
Value::Null | Value::Array(_) | Value::Object(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn response_headers_to_json(headers: &reqwest::header::HeaderMap) -> HeaderMap {
|
||||
headers
|
||||
.iter()
|
||||
.filter_map(|(name, value)| {
|
||||
value
|
||||
.to_str()
|
||||
.ok()
|
||||
.map(|value| (name.as_str().to_owned(), Value::String(value.to_owned())))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn body_sample_from_bytes(bytes: &[u8]) -> Option<String> {
|
||||
if bytes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let capped = &bytes[..bytes.len().min(MAX_BODY_SAMPLE_BYTES)];
|
||||
Some(String::from_utf8_lossy(capped).into_owned())
|
||||
}
|
||||
|
||||
fn params_object(options: &ReplayOptions) -> Option<&Map<String, Value>> {
|
||||
options.params_json.as_ref()?.as_object()
|
||||
}
|
||||
|
||||
fn scalar_param_to_string(value: &Value) -> Option<String> {
|
||||
match value {
|
||||
Value::String(value) => Some(value.clone()),
|
||||
Value::Number(value) => Some(value.to_string()),
|
||||
Value::Bool(value) => Some(value.to_string()),
|
||||
Value::Null | Value::Array(_) | Value::Object(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports whether `value` contains the marker `[redacted]`, compared
/// without regard to ASCII case.
fn contains_redacted_material(value: &str) -> bool {
    const MARKER: &[u8] = b"[redacted]";

    // Slide a marker-sized window over the bytes; the marker is pure ASCII,
    // so a byte-wise case-insensitive comparison is exact.
    value
        .as_bytes()
        .windows(MARKER.len())
        .any(|window| window.eq_ignore_ascii_case(MARKER))
}
|
||||
|
||||
/// Heuristically decides whether a parameter or header name refers to
/// credentials or personal data.
///
/// The name is lower-cased and checked for any of a fixed list of markers
/// (`authorization`, `cookie`, `token`, `email`, ...). The check is repeated
/// on a copy of the name with every non-alphanumeric character removed, so
/// that separators such as `_`, `-` or spaces do not hide a marker
/// (`api_key` matches `api-key`).
fn is_sensitive_name(name: &str) -> bool {
    const MARKERS: [&str; 9] = [
        "authorization",
        "cookie",
        "set-cookie",
        "api-key",
        "csrf",
        "token",
        "session",
        "password",
        "email",
    ];

    // Keeps only ASCII letters and digits.
    fn squeeze(text: &str) -> String {
        text.chars().filter(char::is_ascii_alphanumeric).collect()
    }

    let lowered = name.to_ascii_lowercase();
    let squeezed = squeeze(&lowered);

    for marker in MARKERS {
        if lowered.contains(marker) || squeezed.contains(&squeeze(marker)) {
            return true;
        }
    }
    false
}
|
||||
|
||||
fn encode_path_segment(value: &str) -> String {
|
||||
byte_serialize(value.as_bytes()).collect()
|
||||
}
|
||||
|
||||
/// Returns `path` as an owned string that is guaranteed to start with `/`;
/// a slash is prepended only when it is missing.
fn ensure_leading_slash(path: &str) -> String {
    let mut normalized = String::with_capacity(path.len() + 1);
    if !path.starts_with('/') {
        normalized.push('/');
    }
    normalized.push_str(path);
    normalized
}
|
||||
221
crates/webclaw-capture/src/store.rs
Normal file
221
crates/webclaw-capture/src/store.rs
Normal file
|
|
@ -0,0 +1,221 @@
|
|||
use std::env;
|
||||
use std::fs;
|
||||
use std::path::{Component, Path, PathBuf};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde_json::{Map, Value, json};
|
||||
use url::Url;
|
||||
|
||||
use crate::redact::redact_artifact;
|
||||
use crate::types::{CaptureArtifact, CaptureError, EndpointDefinition, SavedCapture};
|
||||
|
||||
/// Environment variable that overrides the capture root directory.
const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR";
/// File holding the artifact exactly as passed to `save_capture`.
const RAW_CAPTURE_FILE: &str = "raw-capture.json";
/// File holding the artifact after `redact_artifact` has been applied.
const REDACTED_CAPTURE_FILE: &str = "redacted-capture.json";
/// File holding the endpoint list of the redacted artifact.
const ENDPOINTS_FILE: &str = "endpoints.json";
/// File holding the summary produced by `metadata_for`.
const METADATA_FILE: &str = "metadata.json";
|
||||
|
||||
pub fn capture_root() -> PathBuf {
|
||||
env::var_os(CAPTURE_DIR_ENV)
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(PathBuf::from)
|
||||
.unwrap_or_else(|| home_dir().join(".webclaw").join("api-captures"))
|
||||
}
|
||||
|
||||
pub fn capture_id_for(url: &Url, started_at: DateTime<Utc>) -> String {
|
||||
let host = url.host_str().unwrap_or("unknown-host");
|
||||
let host = match url.port() {
|
||||
Some(port) => format!("{host}-{port}"),
|
||||
None => host.to_owned(),
|
||||
};
|
||||
let timestamp = started_at.format("%Y-%m-%dT%H-%M-%SZ");
|
||||
|
||||
format!("{}/{timestamp}", sanitize_id_segment(&host))
|
||||
}
|
||||
|
||||
pub fn save_capture(artifact: &CaptureArtifact) -> Result<SavedCapture, CaptureError> {
|
||||
let root = capture_root();
|
||||
let capture_dir = capture_dir_for_id(&root, &artifact.id)?;
|
||||
|
||||
fs::create_dir_all(&capture_dir)?;
|
||||
|
||||
let raw_capture_path = capture_dir.join(RAW_CAPTURE_FILE);
|
||||
let redacted_capture_path = capture_dir.join(REDACTED_CAPTURE_FILE);
|
||||
let endpoints_path = capture_dir.join(ENDPOINTS_FILE);
|
||||
let metadata_path = capture_dir.join(METADATA_FILE);
|
||||
let redacted_artifact = redact_artifact(artifact);
|
||||
|
||||
write_json(&raw_capture_path, artifact)?;
|
||||
write_json(&redacted_capture_path, &redacted_artifact)?;
|
||||
write_json(&endpoints_path, &redacted_artifact.endpoints)?;
|
||||
write_json(&metadata_path, &metadata_for(&redacted_artifact))?;
|
||||
|
||||
Ok(SavedCapture {
|
||||
id: artifact.id.clone(),
|
||||
root,
|
||||
capture_dir,
|
||||
raw_capture_path,
|
||||
redacted_capture_path,
|
||||
endpoints_path,
|
||||
metadata_path,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn load_endpoints(capture_id: &str) -> Result<Vec<EndpointDefinition>, CaptureError> {
|
||||
let endpoints_path = capture_dir_for_id(&capture_root(), capture_id)?.join(ENDPOINTS_FILE);
|
||||
let contents = fs::read_to_string(&endpoints_path).map_err(|error| {
|
||||
CaptureError::Storage(format!(
|
||||
"could not read endpoints for capture id {capture_id}: {error}"
|
||||
))
|
||||
})?;
|
||||
|
||||
serde_json::from_str(&contents).map_err(CaptureError::from)
|
||||
}
|
||||
|
||||
pub fn find_endpoint(endpoint_id: &str) -> Result<EndpointDefinition, CaptureError> {
|
||||
let root = capture_root();
|
||||
if !root.exists() {
|
||||
return Err(CaptureError::EndpointNotFound(endpoint_id.to_owned()));
|
||||
}
|
||||
|
||||
let mut stack = vec![root];
|
||||
while let Some(path) = stack.pop() {
|
||||
let entries = match fs::read_dir(&path) {
|
||||
Ok(entries) => entries,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
stack.push(path);
|
||||
continue;
|
||||
}
|
||||
|
||||
if path.file_name().and_then(|name| name.to_str()) != Some(ENDPOINTS_FILE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let contents = match fs::read_to_string(&path) {
|
||||
Ok(contents) => contents,
|
||||
Err(_) => continue,
|
||||
};
|
||||
let endpoints: Vec<EndpointDefinition> = match serde_json::from_str(&contents) {
|
||||
Ok(endpoints) => endpoints,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
if let Some(endpoint) = endpoints
|
||||
.into_iter()
|
||||
.find(|endpoint| endpoint.id == endpoint_id)
|
||||
{
|
||||
return Ok(endpoint);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(CaptureError::EndpointNotFound(endpoint_id.to_owned()))
|
||||
}
|
||||
|
||||
fn home_dir() -> PathBuf {
|
||||
env::var_os("USERPROFILE")
|
||||
.map(PathBuf::from)
|
||||
.or_else(dirs::home_dir)
|
||||
.unwrap_or_else(|| PathBuf::from("."))
|
||||
}
|
||||
|
||||
fn capture_dir_for_id(root: &Path, capture_id: &str) -> Result<PathBuf, CaptureError> {
|
||||
let mut capture_dir = root.to_path_buf();
|
||||
let parts = capture_id
|
||||
.split(['/', '\\'])
|
||||
.filter(|part| !part.is_empty())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if parts.is_empty() {
|
||||
return Err(CaptureError::Storage(
|
||||
"capture id cannot be empty".to_owned(),
|
||||
));
|
||||
}
|
||||
|
||||
for part in parts {
|
||||
if !is_safe_path_segment(part) {
|
||||
return Err(CaptureError::Storage(format!(
|
||||
"capture id contains unsafe path segment: {capture_id}"
|
||||
)));
|
||||
}
|
||||
capture_dir.push(part);
|
||||
}
|
||||
|
||||
ensure_within_root(root, &capture_dir)?;
|
||||
|
||||
Ok(capture_dir)
|
||||
}
|
||||
|
||||
fn ensure_within_root(root: &Path, path: &Path) -> Result<(), CaptureError> {
|
||||
if relative_components(path).starts_with(&relative_components(root)) {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(CaptureError::Storage(format!(
|
||||
"capture path escapes capture root: {}",
|
||||
path.display()
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
/// Flattens `path` into a list of comparable component strings.
///
/// `.` components are dropped, `..` is kept as `".."`, the root directory is
/// represented by `"\\"` regardless of platform, and prefixes and normal
/// components are converted lossily to UTF-8.
fn relative_components(path: &Path) -> Vec<String> {
    let mut parts = Vec::new();
    for component in path.components() {
        let part = match component {
            Component::CurDir => continue,
            Component::Prefix(prefix) => prefix.as_os_str().to_string_lossy().into_owned(),
            Component::RootDir => "\\".to_owned(),
            Component::Normal(name) => name.to_string_lossy().into_owned(),
            Component::ParentDir => "..".to_owned(),
        };
        parts.push(part);
    }
    parts
}
|
||||
|
||||
/// Accepts a capture-id segment only if it can be used as a single directory
/// name: it must be non-empty, must not be `.` or `..`, and must not contain
/// `:`, `/` or `\`.
fn is_safe_path_segment(segment: &str) -> bool {
    if segment.is_empty() || matches!(segment, "." | "..") {
        return false;
    }
    !segment.contains([':', '/', '\\'])
}
|
||||
|
||||
/// Makes `segment` usable inside a capture id: ASCII letters, digits, `.`,
/// `-` and `_` are kept and every other character is replaced by `-`.
/// An empty input yields `"unknown"`.
fn sanitize_id_segment(segment: &str) -> String {
    if segment.is_empty() {
        // The mapping below is one-to-one, so only an empty input can
        // produce an empty result.
        return "unknown".to_owned();
    }

    segment
        .chars()
        .map(|character| match character {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '.' | '-' | '_' => character,
            _ => '-',
        })
        .collect()
}
|
||||
|
||||
fn write_json<T: serde::Serialize>(path: &PathBuf, value: &T) -> Result<(), CaptureError> {
|
||||
let contents = serde_json::to_string_pretty(value)?;
|
||||
fs::write(path, contents)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn metadata_for(artifact: &CaptureArtifact) -> Map<String, Value> {
|
||||
let mut metadata = artifact.metadata.clone();
|
||||
metadata.insert("id".to_owned(), json!(artifact.id));
|
||||
metadata.insert("source_url".to_owned(), json!(artifact.source_url));
|
||||
metadata.insert("intent".to_owned(), json!(artifact.intent));
|
||||
metadata.insert("started_at".to_owned(), json!(artifact.started_at));
|
||||
metadata.insert("completed_at".to_owned(), json!(artifact.completed_at));
|
||||
metadata.insert("exchange_count".to_owned(), json!(artifact.exchanges.len()));
|
||||
metadata.insert("endpoint_count".to_owned(), json!(artifact.endpoints.len()));
|
||||
metadata
|
||||
}
|
||||
174
crates/webclaw-capture/src/types.rs
Normal file
174
crates/webclaw-capture/src/types.rs
Normal file
|
|
@ -0,0 +1,174 @@
|
|||
use std::collections::BTreeMap;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
/// Header collection used throughout the crate: a JSON object mapping a
/// header name to a JSON value.
pub type HeaderMap = Map<String, Value>;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct CapturedRequest {
|
||||
pub method: String,
|
||||
pub url: String,
|
||||
pub headers: HeaderMap,
|
||||
pub body_sample: Option<String>,
|
||||
pub resource_type: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct CapturedResponse {
|
||||
pub status: u16,
|
||||
pub headers: HeaderMap,
|
||||
pub body_sample: Option<String>,
|
||||
pub mime_type: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct CapturedExchange {
|
||||
pub method: String,
|
||||
pub url: String,
|
||||
pub request_headers: HeaderMap,
|
||||
pub request_body_sample: Option<String>,
|
||||
pub resource_type: Option<String>,
|
||||
pub status: u16,
|
||||
pub response_headers: HeaderMap,
|
||||
pub response_body_sample: Option<String>,
|
||||
pub started_at: DateTime<Utc>,
|
||||
pub duration_ms: u64,
|
||||
pub redirect_chain: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct CaptureArtifact {
|
||||
pub id: String,
|
||||
pub source_url: String,
|
||||
pub intent: Option<String>,
|
||||
pub started_at: DateTime<Utc>,
|
||||
pub completed_at: Option<DateTime<Utc>>,
|
||||
pub exchanges: Vec<CapturedExchange>,
|
||||
pub endpoints: Vec<EndpointDefinition>,
|
||||
pub metadata: Map<String, Value>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct EndpointDefinition {
|
||||
pub id: String,
|
||||
pub method: String,
|
||||
pub origin: String,
|
||||
pub path_template: String,
|
||||
pub query_params: BTreeMap<String, Vec<String>>,
|
||||
pub request_schema: Option<Value>,
|
||||
pub response_schema: Option<Value>,
|
||||
pub auth_evidence: Vec<String>,
|
||||
pub safety: EndpointSafety,
|
||||
pub examples: Vec<EndpointExample>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct EndpointExample {
|
||||
pub url: String,
|
||||
pub request_headers: HeaderMap,
|
||||
pub request_body_sample: Option<String>,
|
||||
pub response_status: u16,
|
||||
pub response_headers: HeaderMap,
|
||||
pub response_body_sample: Option<String>,
|
||||
pub captured_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct EndpointSafety {
|
||||
pub safe_to_replay: bool,
|
||||
pub requires_confirmation: bool,
|
||||
pub reason: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct ReplayOptions {
|
||||
pub dry_run: bool,
|
||||
pub confirm_unsafe: bool,
|
||||
pub params_json: Option<Value>,
|
||||
pub headers: HeaderMap,
|
||||
pub body_json: Option<Value>,
|
||||
}
|
||||
|
||||
impl Default for ReplayOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
dry_run: true,
|
||||
confirm_unsafe: false,
|
||||
params_json: None,
|
||||
headers: HeaderMap::new(),
|
||||
body_json: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[serde(tag = "kind", rename_all = "snake_case")]
|
||||
pub enum ReplayResult {
|
||||
Preview {
|
||||
method: String,
|
||||
url: String,
|
||||
headers: HeaderMap,
|
||||
body_sample: Option<String>,
|
||||
},
|
||||
Executed {
|
||||
status: u16,
|
||||
headers: HeaderMap,
|
||||
body_sample: Option<String>,
|
||||
},
|
||||
Blocked {
|
||||
reason: String,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct SavedCapture {
|
||||
pub id: String,
|
||||
pub root: PathBuf,
|
||||
pub capture_dir: PathBuf,
|
||||
pub raw_capture_path: PathBuf,
|
||||
pub redacted_capture_path: PathBuf,
|
||||
pub endpoints_path: PathBuf,
|
||||
pub metadata_path: PathBuf,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum CaptureError {
|
||||
#[error("invalid url: {0}")]
|
||||
InvalidUrl(String),
|
||||
|
||||
#[error("capture failed: {0}")]
|
||||
Capture(String),
|
||||
|
||||
#[error("storage failed: {0}")]
|
||||
Storage(String),
|
||||
|
||||
#[error("replay failed: {0}")]
|
||||
Replay(String),
|
||||
|
||||
#[error("endpoint not found: {0}")]
|
||||
EndpointNotFound(String),
|
||||
|
||||
#[error("request failed: {0}")]
|
||||
Request(#[from] reqwest::Error),
|
||||
|
||||
#[error("I/O failed: {0}")]
|
||||
Io(String),
|
||||
|
||||
#[error("JSON failed: {0}")]
|
||||
Json(String),
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for CaptureError {
|
||||
fn from(error: std::io::Error) -> Self {
|
||||
Self::Io(error.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<serde_json::Error> for CaptureError {
|
||||
fn from(error: serde_json::Error) -> Self {
|
||||
Self::Json(error.to_string())
|
||||
}
|
||||
}
|
||||
216
crates/webclaw-capture/tests/classify.rs
Normal file
216
crates/webclaw-capture/tests/classify.rs
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
use chrono::{TimeZone, Utc};
|
||||
use serde_json::{Map, Value, json};
|
||||
use webclaw_capture::classify::{classify_exchange, filter_api_exchanges};
|
||||
use webclaw_capture::types::CapturedExchange;
|
||||
|
||||
fn headers(entries: &[(&str, &str)]) -> Map<String, Value> {
|
||||
entries
|
||||
.iter()
|
||||
.map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned())))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exchange(url: &str) -> CapturedExchange {
|
||||
CapturedExchange {
|
||||
method: "GET".to_owned(),
|
||||
url: url.to_owned(),
|
||||
request_headers: Map::new(),
|
||||
request_body_sample: None,
|
||||
resource_type: Some("document".to_owned()),
|
||||
status: 200,
|
||||
response_headers: Map::new(),
|
||||
response_body_sample: None,
|
||||
started_at: Utc.with_ymd_and_hms(2026, 5, 16, 12, 0, 0).unwrap(),
|
||||
duration_ms: 25,
|
||||
redirect_chain: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn with_resource_type(mut exchange: CapturedExchange, resource_type: &str) -> CapturedExchange {
|
||||
exchange.resource_type = Some(resource_type.to_owned());
|
||||
exchange
|
||||
}
|
||||
|
||||
fn with_response_header(
|
||||
mut exchange: CapturedExchange,
|
||||
name: &str,
|
||||
value: &str,
|
||||
) -> CapturedExchange {
|
||||
exchange.response_headers = headers(&[(name, value)]);
|
||||
exchange
|
||||
}
|
||||
|
||||
fn with_request_body(mut exchange: CapturedExchange, body: serde_json::Value) -> CapturedExchange {
|
||||
exchange.method = "POST".to_owned();
|
||||
exchange.request_headers = headers(&[("Content-Type", "application/json")]);
|
||||
exchange.request_body_sample = Some(body.to_string());
|
||||
exchange
|
||||
}
|
||||
|
||||
fn assert_included(exchange: &CapturedExchange, label: &str) {
|
||||
let classification = classify_exchange(exchange);
|
||||
|
||||
assert!(
|
||||
classification.include,
|
||||
"{label} should be included, got {classification:?}"
|
||||
);
|
||||
assert!(
|
||||
classification.confidence >= 0.5,
|
||||
"{label} should have useful confidence, got {classification:?}"
|
||||
);
|
||||
assert!(
|
||||
!classification.reasons.is_empty(),
|
||||
"{label} should explain why it was classified as API traffic"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_excluded(exchange: &CapturedExchange, label: &str) {
|
||||
let classification = classify_exchange(exchange);
|
||||
|
||||
assert!(
|
||||
!classification.include,
|
||||
"{label} should be excluded, got {classification:?}"
|
||||
);
|
||||
assert!(
|
||||
classification.confidence <= 0.5,
|
||||
"{label} should not look like confident API traffic, got {classification:?}"
|
||||
);
|
||||
assert!(
|
||||
!classification.reasons.is_empty(),
|
||||
"{label} should explain why it was excluded"
|
||||
);
|
||||
}
|
||||
|
||||
/// `fetch` and `xhr` resource types are enough to include an exchange.
#[test]
fn includes_fetch_and_xhr_resource_types() {
    for resource_type in ["fetch", "xhr"] {
        let case = with_resource_type(exchange("https://example.test/products"), resource_type);
        let label = case
            .resource_type
            .as_deref()
            .expect("resource type should be set");

        assert_included(&case, label);
    }
}
|
||||
|
||||
/// A JSON `Content-Type` on the response is enough to include an exchange.
#[test]
fn includes_json_responses() {
    let base = exchange("https://example.test/products");
    let case = with_response_header(base, "Content-Type", "application/json; charset=utf-8");

    assert_included(&case, "JSON response");
}
|
||||
|
||||
/// Paths starting with `/api`, `/v1` or `/v2` are included.
#[test]
fn includes_common_api_path_prefixes() {
    let urls = [
        "https://example.test/api/products",
        "https://example.test/v1/products",
        "https://example.test/v2/products",
    ];

    for url in urls {
        let case = exchange(url);
        assert_included(&case, &case.url);
    }
}
|
||||
|
||||
/// A `/graphql` path is included.
#[test]
fn includes_graphql_paths() {
    assert_included(&exchange("https://example.test/graphql"), "GraphQL path");
}
|
||||
|
||||
/// A request body shaped like a GraphQL operation is included even when the
/// path does not mention GraphQL.
#[test]
fn includes_graphql_request_bodies() {
    let body = json!({
        "operationName": "Products",
        "query": "query Products { products { id name } }",
        "variables": {
            "first": 25
        }
    });
    let case = with_request_body(exchange("https://example.test/query"), body);

    assert_included(&case, "GraphQL request body");
}
|
||||
|
||||
/// Static asset URLs (images, styles, scripts, fonts, source maps) are
/// excluded.
#[test]
fn excludes_static_assets_by_extension() {
    let urls = [
        "https://example.test/static/logo.png",
        "https://example.test/static/photo.jpg",
        "https://example.test/static/icon.svg",
        "https://example.test/static/site.css",
        "https://example.test/static/app.js",
        "https://example.test/static/font.woff2",
        "https://example.test/static/app.js.map",
    ];

    for url in urls {
        let case = exchange(url);
        assert_excluded(&case, &case.url);
    }
}
|
||||
|
||||
/// Tracking and telemetry hosts are excluded even when they answer with
/// JSON.
#[test]
fn excludes_tracking_hosts() {
    let urls = [
        "https://www.google-analytics.com/g/collect?v=2",
        "https://ads.doubleclick.net/pagead/id",
        "https://telemetry.example.test/v1/events",
    ];

    for url in urls {
        let case = with_response_header(exchange(url), "Content-Type", "application/json");
        assert_excluded(&case, &case.url);
    }
}
|
||||
|
||||
/// Browser-extension URLs are excluded even with `fetch`/`xhr` resource
/// types.
#[test]
fn excludes_browser_extension_urls() {
    let cases = [
        ("chrome-extension://abcdef/options.html", "fetch"),
        ("moz-extension://abcdef/options.html", "xhr"),
    ];

    for (url, resource_type) in cases {
        let case = with_resource_type(exchange(url), resource_type);
        assert_excluded(&case, &case.url);
    }
}
|
||||
|
||||
/// `filter_api_exchanges` keeps the API exchange and drops the static asset
/// and the tracking request.
#[test]
fn filter_api_exchanges_returns_only_included_traffic() {
    let api = exchange("https://example.test/api/products");
    let asset = exchange("https://example.test/static/app.js");
    let tracking = with_response_header(
        exchange("https://telemetry.example.test/v1/events"),
        "Content-Type",
        "application/json",
    );

    let expected = vec![api.clone()];
    let exchanges = vec![api, asset, tracking];

    assert_eq!(filter_api_exchanges(&exchanges), expected);
}
|
||||
139
crates/webclaw-capture/tests/fixtures/sample.har.json
vendored
Normal file
139
crates/webclaw-capture/tests/fixtures/sample.har.json
vendored
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
{
|
||||
"log": {
|
||||
"version": "1.2",
|
||||
"creator": {
|
||||
"name": "webclaw-capture-test",
|
||||
"version": "0.1.0"
|
||||
},
|
||||
"entries": [
|
||||
{
|
||||
"startedDateTime": "2026-05-16T12:00:00Z",
|
||||
"time": 42,
|
||||
"_resourceType": "fetch",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "https://example.test/api/products?category=tools&page=2",
|
||||
"headers": [
|
||||
{
|
||||
"name": "Accept",
|
||||
"value": "application/json"
|
||||
},
|
||||
{
|
||||
"name": "Authorization",
|
||||
"value": "Bearer example-token"
|
||||
}
|
||||
]
|
||||
},
|
||||
"response": {
|
||||
"status": 200,
|
||||
"headers": [
|
||||
{
|
||||
"name": "Content-Type",
|
||||
"value": "application/json; charset=utf-8"
|
||||
}
|
||||
],
|
||||
"content": {
|
||||
"mimeType": "application/json",
|
||||
"text": "{\"items\":[{\"id\":12345,\"name\":\"Hammer\",\"price\":12.5,\"inStock\":true}],\"page\":2,\"hasMore\":false}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"startedDateTime": "2026-05-16T12:00:01Z",
|
||||
"time": 31,
|
||||
"_resourceType": "xhr",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "https://example.test/api/products/12345",
|
||||
"headers": [
|
||||
{
|
||||
"name": "Accept",
|
||||
"value": "application/json"
|
||||
},
|
||||
{
|
||||
"name": "Cookie",
|
||||
"value": "session_id=example-session"
|
||||
}
|
||||
]
|
||||
},
|
||||
"response": {
|
||||
"status": 200,
|
||||
"headers": [
|
||||
{
|
||||
"name": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"content": {
|
||||
"mimeType": "application/json",
|
||||
"text": "{\"id\":12345,\"name\":\"Hammer\",\"category\":\"tools\",\"tags\":[\"hand-tool\",\"steel\"]}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"startedDateTime": "2026-05-16T12:00:02Z",
|
||||
"time": 57,
|
||||
"_resourceType": "fetch",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"url": "https://example.test/graphql",
|
||||
"headers": [
|
||||
{
|
||||
"name": "Content-Type",
|
||||
"value": "application/json"
|
||||
},
|
||||
{
|
||||
"name": "X-CSRF-Token",
|
||||
"value": "example-csrf"
|
||||
}
|
||||
],
|
||||
"postData": {
|
||||
"mimeType": "application/json",
|
||||
"text": "{\"operationName\":\"CreateProduct\",\"query\":\"mutation CreateProduct($name: String!) { createProduct(input: { name: $name }) { id name } }\",\"variables\":{\"name\":\"Hammer\"}}"
|
||||
}
|
||||
},
|
||||
"response": {
|
||||
"status": 200,
|
||||
"headers": [
|
||||
{
|
||||
"name": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"content": {
|
||||
"mimeType": "application/json",
|
||||
"text": "{\"data\":{\"createProduct\":{\"id\":\"gid://example/Product/12345\",\"name\":\"Hammer\"}}}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"startedDateTime": "2026-05-16T12:00:03Z",
|
||||
"time": 8,
|
||||
"_resourceType": "script",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "https://example.test/static/app.js",
|
||||
"headers": [
|
||||
{
|
||||
"name": "Accept",
|
||||
"value": "application/javascript"
|
||||
}
|
||||
]
|
||||
},
|
||||
"response": {
|
||||
"status": 200,
|
||||
"headers": [
|
||||
{
|
||||
"name": "Content-Type",
|
||||
"value": "application/javascript"
|
||||
}
|
||||
],
|
||||
"content": {
|
||||
"mimeType": "application/javascript",
|
||||
"text": "fetch('/api/products?category=tools')"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
261
crates/webclaw-capture/tests/infer.rs
Normal file
261
crates/webclaw-capture/tests/infer.rs
Normal file
|
|
@ -0,0 +1,261 @@
|
|||
use chrono::{DateTime, Utc};
|
||||
use serde_json::{Map, Value, json};
|
||||
use webclaw_capture::infer::{
|
||||
endpoint_id, infer_endpoints, infer_json_schema, normalize_path_template,
|
||||
};
|
||||
use webclaw_capture::types::{CapturedExchange, EndpointDefinition};
|
||||
|
||||
fn fixture_exchanges() -> Vec<CapturedExchange> {
|
||||
let har: Value =
|
||||
serde_json::from_str(include_str!("fixtures/sample.har.json")).expect("valid HAR fixture");
|
||||
let entries = har
|
||||
.pointer("/log/entries")
|
||||
.and_then(Value::as_array)
|
||||
.expect("HAR fixture entries");
|
||||
|
||||
entries.iter().map(har_entry_to_exchange).collect()
|
||||
}
|
||||
|
||||
fn har_entry_to_exchange(entry: &Value) -> CapturedExchange {
|
||||
let request = entry.get("request").expect("request");
|
||||
let response = entry.get("response").expect("response");
|
||||
|
||||
CapturedExchange {
|
||||
method: string_at(request, "method"),
|
||||
url: string_at(request, "url"),
|
||||
request_headers: har_headers(request),
|
||||
request_body_sample: request
|
||||
.pointer("/postData/text")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_owned),
|
||||
resource_type: entry
|
||||
.get("_resourceType")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_owned),
|
||||
status: response
|
||||
.get("status")
|
||||
.and_then(Value::as_u64)
|
||||
.expect("response status") as u16,
|
||||
response_headers: har_headers(response),
|
||||
response_body_sample: response
|
||||
.pointer("/content/text")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_owned),
|
||||
started_at: DateTime::parse_from_rfc3339(&string_at(entry, "startedDateTime"))
|
||||
.expect("RFC3339 startedDateTime")
|
||||
.with_timezone(&Utc),
|
||||
duration_ms: entry.get("time").and_then(Value::as_u64).expect("duration"),
|
||||
redirect_chain: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn har_headers(container: &Value) -> Map<String, Value> {
|
||||
container
|
||||
.get("headers")
|
||||
.and_then(Value::as_array)
|
||||
.expect("headers")
|
||||
.iter()
|
||||
.map(|header| {
|
||||
(
|
||||
string_at(header, "name"),
|
||||
Value::String(string_at(header, "value")),
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn string_at(value: &Value, key: &str) -> String {
|
||||
value
|
||||
.get(key)
|
||||
.and_then(Value::as_str)
|
||||
.unwrap_or_else(|| panic!("{key} should be a string"))
|
||||
.to_owned()
|
||||
}
|
||||
|
||||
/// Returns the first endpoint with the given method and path template;
/// panics with the full endpoint list when there is none.
fn find_endpoint<'a>(
    endpoints: &'a [EndpointDefinition],
    method: &str,
    path_template: &str,
) -> &'a EndpointDefinition {
    for endpoint in endpoints {
        if endpoint.method == method && endpoint.path_template == path_template {
            return endpoint;
        }
    }
    panic!("missing endpoint {method} {path_template}; got {endpoints:#?}")
}
|
||||
|
||||
fn sorted_ids(endpoints: &[EndpointDefinition]) -> Vec<String> {
|
||||
let mut ids = endpoints
|
||||
.iter()
|
||||
.map(|endpoint| endpoint.id.clone())
|
||||
.collect::<Vec<_>>();
|
||||
ids.sort();
|
||||
ids
|
||||
}
|
||||
|
||||
/// Inference over the HAR fixture yields exactly three endpoints, is
/// deterministic across runs, and gives each endpoint the id that
/// `endpoint_id` computes from its method, origin and path template.
#[test]
fn infers_stable_endpoint_ids_and_path_templates_from_har_fixture() {
    let exchanges = fixture_exchanges();

    let endpoints = infer_endpoints(&exchanges);
    let repeated = infer_endpoints(&exchanges);

    assert_eq!(endpoints.len(), 3, "static assets should be ignored");
    assert_eq!(
        sorted_ids(&endpoints),
        sorted_ids(&repeated),
        "endpoint ids should be deterministic across inference runs"
    );

    let expected = [
        ("GET", "/api/products"),
        ("GET", "/api/products/{id}"),
        ("POST", "/graphql"),
    ];
    for (method, path_template) in expected {
        let endpoint = find_endpoint(&endpoints, method, path_template);
        assert_eq!(
            endpoint.id,
            endpoint_id(method, "https://example.test", path_template)
        );
    }
}
|
||||
|
||||
/// Checks the query examples, inferred schemas, auth evidence and replay
/// safety flags of the fixture's GET listing and POST GraphQL endpoints.
#[test]
fn infers_query_examples_schemas_auth_evidence_and_mutation_safety() {
    let endpoints = infer_endpoints(&fixture_exchanges());

    // GET /api/products: replayable without confirmation.
    let products = find_endpoint(&endpoints, "GET", "/api/products");
    for (param, observed) in [("category", "tools"), ("page", "2")] {
        assert_eq!(
            products.query_params.get(param),
            Some(&vec![observed.to_owned()])
        );
    }
    let saw_authorization = products
        .auth_evidence
        .iter()
        .any(|evidence| evidence.to_ascii_lowercase().contains("authorization"));
    assert!(
        saw_authorization,
        "Authorization header should be recorded as auth evidence"
    );
    assert!(products.safety.safe_to_replay);
    assert!(!products.safety.requires_confirmation);

    let products_schema = products.response_schema.as_ref().expect("response schema");
    for (pointer, json_type) in [
        ("/properties/items/type", "array"),
        ("/properties/items/items/properties/id/type", "integer"),
        ("/properties/hasMore/type", "boolean"),
    ] {
        assert_eq!(products_schema.pointer(pointer), Some(&json!(json_type)));
    }

    // POST /graphql: must not be replayed without confirmation.
    let graphql = find_endpoint(&endpoints, "POST", "/graphql");
    assert!(!graphql.safety.safe_to_replay);
    assert!(graphql.safety.requires_confirmation);
    let saw_csrf = graphql
        .auth_evidence
        .iter()
        .any(|evidence| evidence.to_ascii_lowercase().contains("csrf"));
    assert!(saw_csrf, "CSRF header should be recorded as auth evidence");

    let request_schema = graphql.request_schema.as_ref().expect("request schema");
    for pointer in [
        "/properties/query/type",
        "/properties/variables/properties/name/type",
    ] {
        assert_eq!(request_schema.pointer(pointer), Some(&json!("string")));
    }

    let response_schema = graphql.response_schema.as_ref().expect("response schema");
    assert_eq!(
        response_schema.pointer("/properties/data/properties/createProduct/properties/id/type"),
        Some(&json!("string"))
    );
}
|
||||
|
||||
/// No inferred endpoint may have a path template containing `/static/`.
#[test]
fn ignores_static_asset_entries_from_the_fixture() {
    let endpoints = infer_endpoints(&fixture_exchanges());

    let learned_static_asset = endpoints
        .iter()
        .any(|endpoint| endpoint.path_template.contains("/static/"));

    assert!(
        !learned_static_asset,
        "static asset requests should not become learned endpoints: {endpoints:#?}"
    );
}
|
||||
|
||||
/// Numeric, UUID and hex-token segments collapse to `{id}`; an ordinary word
/// segment is left as it is.
#[test]
fn normalizes_numeric_uuid_and_high_entropy_path_segments() {
    let cases = [
        ("/api/products/12345", "/api/products/{id}"),
        (
            "/api/users/550e8400-e29b-41d4-a716-446655440000",
            "/api/users/{id}",
        ),
        ("/api/sessions/a1b2c3d4e5f6a7b8", "/api/sessions/{id}"),
        ("/api/categories/tools", "/api/categories/tools"),
    ];

    for (raw_path, template) in cases {
        assert_eq!(normalize_path_template(raw_path), template);
    }
}
|
||||
|
||||
/// Each JSON value kind in a sample object maps to the expected schema `type`.
#[test]
fn infers_basic_json_schema_shapes() {
    let sample = json!({
        "id": 12345,
        "name": "Hammer",
        "price": 12.5,
        "inStock": true,
        "tags": ["hand-tool"],
        "metadata": null
    });

    let schema = infer_json_schema(&sample);

    let expectations = [
        ("/type", "object"),
        ("/properties/id/type", "integer"),
        ("/properties/price/type", "number"),
        ("/properties/inStock/type", "boolean"),
        ("/properties/tags/type", "array"),
        ("/properties/tags/items/type", "string"),
        ("/properties/metadata/type", "null"),
    ];
    for (pointer, json_type) in expectations {
        assert_eq!(schema.pointer(pointer), Some(&json!(json_type)));
    }
}
|
||||
245
crates/webclaw-capture/tests/integration_capture.rs
Normal file
245
crates/webclaw-capture/tests/integration_capture.rs
Normal file
|
|
@ -0,0 +1,245 @@
|
|||
use std::env;
|
||||
use std::ffi::OsString;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
use tokio::sync::oneshot;
|
||||
use webclaw_capture::cdp::{CaptureOptions, capture_network};
|
||||
use webclaw_capture::types::{CaptureArtifact, EndpointDefinition};
|
||||
|
||||
const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR";
|
||||
|
||||
struct CaptureDirGuard {
|
||||
original: Option<OsString>,
|
||||
}
|
||||
|
||||
impl CaptureDirGuard {
|
||||
fn set(path: &Path) -> Self {
|
||||
let original = env::var_os(CAPTURE_DIR_ENV);
|
||||
|
||||
unsafe {
|
||||
env::set_var(CAPTURE_DIR_ENV, path);
|
||||
}
|
||||
|
||||
Self { original }
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for CaptureDirGuard {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
match &self.original {
|
||||
Some(value) => env::set_var(CAPTURE_DIR_ENV, value),
|
||||
None => env::remove_var(CAPTURE_DIR_ENV),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct LocalServer {
|
||||
base_url: String,
|
||||
shutdown: Option<oneshot::Sender<()>>,
|
||||
}
|
||||
|
||||
impl LocalServer {
|
||||
async fn start() -> Self {
|
||||
let listener = TcpListener::bind("127.0.0.1:0")
|
||||
.await
|
||||
.expect("bind local test server");
|
||||
let address = listener.local_addr().expect("local test server address");
|
||||
let (shutdown, mut shutdown_rx) = oneshot::channel::<()>();
|
||||
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = &mut shutdown_rx => break,
|
||||
accepted = listener.accept() => {
|
||||
let Ok((stream, _peer)) = accepted else {
|
||||
continue;
|
||||
};
|
||||
|
||||
tokio::spawn(handle_connection(stream));
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Self {
|
||||
base_url: format!("http://{address}"),
|
||||
shutdown: Some(shutdown),
|
||||
}
|
||||
}
|
||||
|
||||
fn url(&self, path: &str) -> String {
|
||||
format!("{}{}", self.base_url, path)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for LocalServer {
|
||||
fn drop(&mut self) {
|
||||
if let Some(shutdown) = self.shutdown.take() {
|
||||
let _ = shutdown.send(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn capture_network_records_fetches_redacts_secrets_and_learns_api_endpoints() {
|
||||
let capture_root = unique_temp_root("integration-capture");
|
||||
let _capture_dir = CaptureDirGuard::set(&capture_root);
|
||||
let server = LocalServer::start().await;
|
||||
|
||||
let saved = capture_network(CaptureOptions {
|
||||
url: server.url("/"),
|
||||
intent: Some("discover product listing API".to_owned()),
|
||||
wait_ms: 1_500,
|
||||
headed: false,
|
||||
})
|
||||
.await
|
||||
.expect("capture network traffic");
|
||||
|
||||
let raw_capture: CaptureArtifact = read_json(&saved.raw_capture_path);
|
||||
assert!(
|
||||
raw_capture
|
||||
.exchanges
|
||||
.iter()
|
||||
.any(|exchange| exchange.url.contains("/api/products?category=tools")),
|
||||
"raw capture should include the fetch to /api/products"
|
||||
);
|
||||
|
||||
let redacted_capture_text =
|
||||
fs::read_to_string(&saved.redacted_capture_path).expect("read redacted capture");
|
||||
for secret in [
|
||||
"browser-authorization-secret",
|
||||
"browser-api-key-secret",
|
||||
"browser-csrf-secret",
|
||||
"page-session-secret",
|
||||
"api-session-secret",
|
||||
] {
|
||||
assert!(
|
||||
!redacted_capture_text.contains(secret),
|
||||
"redacted capture should not contain raw secret value {secret}"
|
||||
);
|
||||
}
|
||||
|
||||
let endpoints: Vec<EndpointDefinition> = read_json(&saved.endpoints_path);
|
||||
let api_endpoints = endpoints
|
||||
.iter()
|
||||
.filter(|endpoint| endpoint.method == "GET" && endpoint.path_template == "/api/products")
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(
|
||||
api_endpoints.len(),
|
||||
1,
|
||||
"inferred endpoints should contain one GET /api/products endpoint"
|
||||
);
|
||||
assert!(
|
||||
endpoints
|
||||
.iter()
|
||||
.all(|endpoint| endpoint.path_template != "/static/app.js"),
|
||||
"static assets should not be included as learned endpoints"
|
||||
);
|
||||
|
||||
let _ = fs::remove_dir_all(capture_root);
|
||||
}
|
||||
|
||||
async fn handle_connection(mut stream: TcpStream) {
|
||||
let mut buffer = vec![0_u8; 8192];
|
||||
let Ok(bytes_read) = stream.read(&mut buffer).await else {
|
||||
return;
|
||||
};
|
||||
if bytes_read == 0 {
|
||||
return;
|
||||
}
|
||||
|
||||
let request = String::from_utf8_lossy(&buffer[..bytes_read]);
|
||||
let path = request
|
||||
.lines()
|
||||
.next()
|
||||
.and_then(|line| line.split_whitespace().nth(1))
|
||||
.unwrap_or("/");
|
||||
|
||||
let response = match path.split('?').next().unwrap_or(path) {
|
||||
"/" => http_response(
|
||||
"200 OK",
|
||||
&[
|
||||
("Content-Type", "text/html; charset=utf-8"),
|
||||
("Set-Cookie", "session=page-session-secret; HttpOnly"),
|
||||
],
|
||||
r#"<!doctype html>
|
||||
<html>
|
||||
<head><title>Webclaw capture test</title></head>
|
||||
<body>
|
||||
<script src="/static/app.js"></script>
|
||||
</body>
|
||||
</html>"#,
|
||||
),
|
||||
"/static/app.js" => http_response(
|
||||
"200 OK",
|
||||
&[("Content-Type", "application/javascript; charset=utf-8")],
|
||||
r#"fetch('/api/products?category=tools', {
|
||||
headers: {
|
||||
'Authorization': 'Bearer browser-authorization-secret',
|
||||
'X-Api-Key': 'browser-api-key-secret',
|
||||
'X-CSRF-Token': 'browser-csrf-secret'
|
||||
}
|
||||
}).then(response => response.json()).then(products => {
|
||||
window.__webclawProducts = products;
|
||||
});"#,
|
||||
),
|
||||
"/api/products" => http_response(
|
||||
"200 OK",
|
||||
&[
|
||||
("Content-Type", "application/json"),
|
||||
("Set-Cookie", "session=api-session-secret; HttpOnly"),
|
||||
],
|
||||
r#"{"items":[{"id":12345,"name":"Hammer","category":"tools"}]}"#,
|
||||
),
|
||||
_ => http_response(
|
||||
"404 Not Found",
|
||||
&[("Content-Type", "text/plain; charset=utf-8")],
|
||||
"not found",
|
||||
),
|
||||
};
|
||||
|
||||
let _ = stream.write_all(response.as_bytes()).await;
|
||||
let _ = stream.shutdown().await;
|
||||
}
|
||||
|
||||
/// Renders a complete HTTP/1.1 response as text.
///
/// The fixed `Content-Length` (byte length of `body`), `Connection: close`
/// and `Cache-Control: no-store` headers come first, followed by `headers`
/// in the order given, a blank line, and `body`.
fn http_response(status: &str, headers: &[(&str, &str)], body: &str) -> String {
    let head = format!(
        "HTTP/1.1 {status}\r\nContent-Length: {}\r\nConnection: close\r\nCache-Control: no-store\r\n",
        body.len()
    );
    let header_lines: String = headers
        .iter()
        .map(|(name, value)| format!("{name}: {value}\r\n"))
        .collect();

    [head.as_str(), header_lines.as_str(), "\r\n", body].concat()
}
|
||||
|
||||
/// Builds a path under the system temp directory named after the test, the
/// current process id, and the current time in nanoseconds since the Unix
/// epoch. The directory itself is not created here.
///
/// # Panics
///
/// Panics if the system clock reports a time before the Unix epoch.
fn unique_temp_root(test_name: &str) -> PathBuf {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("system time after unix epoch");
    let dir_name = format!(
        "webclaw-capture-{test_name}-{pid}-{nanos}",
        pid = std::process::id(),
        nanos = since_epoch.as_nanos()
    );

    let mut root = env::temp_dir();
    root.push(dir_name);
    root
}
|
||||
|
||||
fn read_json<T: serde::de::DeserializeOwned>(path: &Path) -> T {
|
||||
let contents = fs::read_to_string(path).expect("read JSON file");
|
||||
serde_json::from_str(&contents).expect("valid JSON file")
|
||||
}
|
||||
358
crates/webclaw-capture/tests/openapi.rs
Normal file
358
crates/webclaw-capture/tests/openapi.rs
Normal file
|
|
@ -0,0 +1,358 @@
|
|||
use std::collections::BTreeMap;
|
||||
use std::env;
|
||||
use std::ffi::OsString;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Mutex;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde_json::{Map, Value, json};
|
||||
use webclaw_capture::openapi::{export_openapi, write_openapi};
|
||||
use webclaw_capture::store::save_capture;
|
||||
use webclaw_capture::types::{
|
||||
CaptureArtifact, EndpointDefinition, EndpointExample, EndpointSafety,
|
||||
};
|
||||
|
||||
static ENV_LOCK: Mutex<()> = Mutex::new(());
|
||||
const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR";
|
||||
|
||||
struct EnvVarGuard {
|
||||
original: Option<OsString>,
|
||||
}
|
||||
|
||||
impl EnvVarGuard {
|
||||
fn set_capture_dir(value: Option<&Path>) -> Self {
|
||||
let original = env::var_os(CAPTURE_DIR_ENV);
|
||||
|
||||
unsafe {
|
||||
match value {
|
||||
Some(path) => env::set_var(CAPTURE_DIR_ENV, path),
|
||||
None => env::remove_var(CAPTURE_DIR_ENV),
|
||||
}
|
||||
}
|
||||
|
||||
Self { original }
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EnvVarGuard {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
match &self.original {
|
||||
Some(value) => env::set_var(CAPTURE_DIR_ENV, value),
|
||||
None => env::remove_var(CAPTURE_DIR_ENV),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn with_capture_dir<T>(value: Option<&Path>, test: impl FnOnce() -> T) -> T {
|
||||
let _lock = ENV_LOCK.lock().expect("capture env lock");
|
||||
let _guard = EnvVarGuard::set_capture_dir(value);
|
||||
|
||||
test()
|
||||
}
|
||||
|
||||
/// The export declares OpenAPI 3.1.0 and holds exactly one operation for each
/// of the two sample endpoints.
#[test]
fn exports_openapi_31_and_an_operation_for_every_endpoint() {
    let doc = export_openapi(&sample_endpoints());

    assert_eq!(doc.get("openapi").and_then(Value::as_str), Some("3.1.0"));

    let paths = doc
        .get("paths")
        .and_then(Value::as_object)
        .expect("OpenAPI document should contain paths");

    let expected_operations = [
        (
            "/api/products",
            "get",
            "GET product endpoint should become an OpenAPI operation",
        ),
        (
            "/graphql",
            "post",
            "POST GraphQL endpoint should become an OpenAPI operation",
        ),
    ];
    for (path, method, why) in expected_operations {
        assert!(operation(&doc, path, method).is_some(), "{why}");
    }

    assert_eq!(
        operation_count(paths),
        2,
        "every learned endpoint should become exactly one operation"
    );
}
|
||||
|
||||
/// Only the unsafe POST operation carries
/// `x-webclaw-requires-confirmation: true`; the safe GET does not.
#[test]
fn unsafe_operations_require_confirmation_extension() {
    const CONFIRMATION_KEY: &str = "x-webclaw-requires-confirmation";

    let doc = export_openapi(&sample_endpoints());
    let required = json!(true);

    let get_operation =
        operation(&doc, "/api/products", "get").expect("GET product endpoint should be exported");
    let post_operation =
        operation(&doc, "/graphql", "post").expect("POST GraphQL endpoint should be exported");

    assert_ne!(
        get_operation.get(CONFIRMATION_KEY),
        Some(&required),
        "safe GET operations should not require unsafe replay confirmation"
    );
    assert_eq!(
        post_operation.get(CONFIRMATION_KEY),
        Some(&required),
        "unsafe POST operations should require explicit replay confirmation"
    );
}
|
||||
|
||||
/// The exported document includes examples, contains none of the raw secret
/// values from the sample endpoints, and does contain the redaction marker.
#[test]
fn generated_examples_do_not_leak_secret_values() {
    // Raw values planted in the sample endpoints' URLs, headers and bodies.
    const FORBIDDEN: [&str; 6] = [
        "Bearer raw-secret",
        "raw-api-key",
        "raw-csrf-token",
        "raw-session-id",
        "raw-password",
        "user@example.test",
    ];

    let doc = export_openapi(&sample_endpoints());

    assert!(
        contains_example_node(&doc),
        "OpenAPI export should include examples derived from captured endpoint examples"
    );

    let doc_text = serde_json::to_string(&doc).expect("serialize OpenAPI document");
    for forbidden in FORBIDDEN {
        assert!(
            !doc_text.contains(forbidden),
            "OpenAPI examples should not leak secret value {forbidden:?}"
        );
    }

    assert!(
        doc_text.contains("[REDACTED]"),
        "OpenAPI examples should preserve redaction markers instead of raw secrets"
    );
}
|
||||
|
||||
/// After a capture is saved, `write_openapi` writes `openapi.json` inside that
/// capture's directory and the file holds the saved endpoints.
#[test]
fn write_openapi_writes_openapi_json_next_to_saved_endpoints() {
    let root = unique_temp_root("write");

    with_capture_dir(Some(&root), || {
        let artifact = sample_artifact();
        save_capture(&artifact).expect("save capture before OpenAPI export");

        let openapi_path = write_openapi(&artifact.id).expect("write OpenAPI document");

        let expected_path = root
            .join("example.test")
            .join("2026-05-16T12-00-00Z")
            .join("openapi.json");
        assert_eq!(openapi_path, expected_path);
        assert!(openapi_path.is_file());

        let doc: Value = read_json(&openapi_path);
        assert_eq!(doc.get("openapi").and_then(Value::as_str), Some("3.1.0"));

        let has_products_operation = operation(&doc, "/api/products", "get").is_some();
        assert!(
            has_products_operation,
            "written OpenAPI document should contain saved capture endpoints"
        );
    });

    // Best-effort cleanup; a failure to remove the directory is ignored.
    let _ = fs::remove_dir_all(root);
}
|
||||
|
||||
fn sample_artifact() -> CaptureArtifact {
|
||||
CaptureArtifact {
|
||||
id: "example.test/2026-05-16T12-00-00Z".to_owned(),
|
||||
source_url: "https://example.test/products?email=user@example.test".to_owned(),
|
||||
intent: Some("discover product listing API".to_owned()),
|
||||
started_at: test_time(),
|
||||
completed_at: Some(test_time()),
|
||||
exchanges: Vec::new(),
|
||||
endpoints: sample_endpoints(),
|
||||
metadata: Map::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn sample_endpoints() -> Vec<EndpointDefinition> {
|
||||
vec![product_endpoint(), graphql_endpoint()]
|
||||
}
|
||||
|
||||
fn product_endpoint() -> EndpointDefinition {
|
||||
let mut query_params = BTreeMap::new();
|
||||
query_params.insert("category".to_owned(), vec!["tools".to_owned()]);
|
||||
query_params.insert("page".to_owned(), vec!["2".to_owned()]);
|
||||
|
||||
EndpointDefinition {
|
||||
id: "GET https://example.test/api/products".to_owned(),
|
||||
method: "GET".to_owned(),
|
||||
origin: "https://example.test".to_owned(),
|
||||
path_template: "/api/products".to_owned(),
|
||||
query_params,
|
||||
request_schema: None,
|
||||
response_schema: Some(json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": { "type": "integer" },
|
||||
"name": { "type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})),
|
||||
auth_evidence: vec!["Authorization header observed".to_owned()],
|
||||
safety: EndpointSafety {
|
||||
safe_to_replay: true,
|
||||
requires_confirmation: false,
|
||||
reason: "GET is a read-oriented HTTP method".to_owned(),
|
||||
},
|
||||
examples: vec![EndpointExample {
|
||||
url: "https://example.test/api/products?category=tools&page=2&api_key=raw-api-key"
|
||||
.to_owned(),
|
||||
request_headers: headers(&[
|
||||
("Authorization", "Bearer raw-secret"),
|
||||
("Accept", "application/json"),
|
||||
("X-Api-Key", "raw-api-key"),
|
||||
]),
|
||||
request_body_sample: None,
|
||||
response_status: 200,
|
||||
response_headers: headers(&[
|
||||
("Content-Type", "application/json"),
|
||||
("Set-Cookie", "session=raw-session-id"),
|
||||
]),
|
||||
response_body_sample: Some(
|
||||
r#"{"items":[{"id":12345,"name":"Hammer","email":"user@example.test"}]}"#
|
||||
.to_owned(),
|
||||
),
|
||||
captured_at: test_time(),
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
fn graphql_endpoint() -> EndpointDefinition {
|
||||
EndpointDefinition {
|
||||
id: "POST https://example.test/graphql".to_owned(),
|
||||
method: "POST".to_owned(),
|
||||
origin: "https://example.test".to_owned(),
|
||||
path_template: "/graphql".to_owned(),
|
||||
query_params: BTreeMap::new(),
|
||||
request_schema: Some(json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": { "type": "string" },
|
||||
"variables": { "type": "object" }
|
||||
}
|
||||
})),
|
||||
response_schema: Some(json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"data": { "type": "object" }
|
||||
}
|
||||
})),
|
||||
auth_evidence: vec!["X-CSRF-Token header observed".to_owned()],
|
||||
safety: EndpointSafety {
|
||||
safe_to_replay: false,
|
||||
requires_confirmation: true,
|
||||
reason: "POST may mutate server state and requires confirmation".to_owned(),
|
||||
},
|
||||
examples: vec![EndpointExample {
|
||||
url: concat!(
|
||||
"https://example.test/graphql?",
|
||||
"ref=user%40example.test&",
|
||||
"debug=Bearer%20raw-secret&",
|
||||
"trace=raw-session-id"
|
||||
)
|
||||
.to_owned(),
|
||||
request_headers: headers(&[
|
||||
("Content-Type", "application/json"),
|
||||
("X-CSRF-Token", "raw-csrf-token"),
|
||||
]),
|
||||
request_body_sample: Some(
|
||||
json!({
|
||||
"query": "mutation CreateProduct($name: String!) { createProduct(name: $name) { id } }",
|
||||
"variables": {
|
||||
"name": "Hammer",
|
||||
"password": "raw-password"
|
||||
}
|
||||
})
|
||||
.to_string(),
|
||||
),
|
||||
response_status: 200,
|
||||
response_headers: headers(&[("Content-Type", "application/json")]),
|
||||
response_body_sample: Some(r#"{"data":{"createProduct":{"id":"12345"}}}"#.to_owned()),
|
||||
captured_at: test_time(),
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
fn headers(entries: &[(&str, &str)]) -> Map<String, Value> {
|
||||
entries
|
||||
.iter()
|
||||
.map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned())))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Returns the operation object at `paths.<path>.<method>` in `doc`, or
/// `None` when any level is missing or is not a JSON object.
fn operation<'a>(doc: &'a Value, path: &str, method: &str) -> Option<&'a Map<String, Value>> {
    let paths = doc.get("paths")?.as_object()?;
    let path_item = paths.get(path)?.as_object()?;

    path_item.get(method)?.as_object()
}
|
||||
|
||||
/// Counts the HTTP-method keys across every path item in `paths`.
///
/// Path items that are not JSON objects are skipped.
fn operation_count(paths: &Map<String, Value>) -> usize {
    const HTTP_METHODS: &[&str] = &[
        "get", "put", "post", "delete", "options", "head", "patch", "trace",
    ];

    let mut total = 0;
    for entry in paths.values() {
        let Some(path_item) = entry.as_object() else {
            continue;
        };
        for method in HTTP_METHODS {
            if path_item.contains_key(*method) {
                total += 1;
            }
        }
    }
    total
}
|
||||
|
||||
fn contains_example_node(value: &Value) -> bool {
|
||||
match value {
|
||||
Value::Object(object) => {
|
||||
object
|
||||
.keys()
|
||||
.any(|key| matches!(key.as_str(), "example" | "examples" | "x-webclaw-examples"))
|
||||
|| object.values().any(contains_example_node)
|
||||
}
|
||||
Value::Array(items) => items.iter().any(contains_example_node),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a path under the system temp directory named after the test, the
/// current process id, and the current time in nanoseconds since the Unix
/// epoch. The directory itself is not created here.
///
/// # Panics
///
/// Panics if the system clock reports a time before the Unix epoch.
fn unique_temp_root(test_name: &str) -> PathBuf {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("system time after unix epoch");
    let dir_name = format!(
        "webclaw-capture-openapi-{test_name}-{pid}-{nanos}",
        pid = std::process::id(),
        nanos = since_epoch.as_nanos()
    );

    let mut root = env::temp_dir();
    root.push(dir_name);
    root
}
|
||||
|
||||
fn read_json<T: serde::de::DeserializeOwned>(path: &Path) -> T {
|
||||
let contents = fs::read_to_string(path).expect("read JSON file");
|
||||
serde_json::from_str(&contents).expect("valid JSON file")
|
||||
}
|
||||
|
||||
fn test_time() -> DateTime<Utc> {
|
||||
DateTime::parse_from_rfc3339("2026-05-16T12:00:00Z")
|
||||
.expect("valid test timestamp")
|
||||
.with_timezone(&Utc)
|
||||
}
|
||||
209
crates/webclaw-capture/tests/redact.rs
Normal file
209
crates/webclaw-capture/tests/redact.rs
Normal file
|
|
@ -0,0 +1,209 @@
|
|||
use chrono::{TimeZone, Utc};
|
||||
use serde_json::{Map, Value, json};
|
||||
use url::Url;
|
||||
use webclaw_capture::redact::{redact_artifact, redact_headers, redact_json, redact_url};
|
||||
use webclaw_capture::types::{CaptureArtifact, CapturedExchange};
|
||||
|
||||
const REDACTED: &str = "[REDACTED]";
|
||||
|
||||
fn header_map(entries: &[(&str, &str)]) -> Map<String, Value> {
|
||||
entries
|
||||
.iter()
|
||||
.map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned())))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn query_value(url: &str, name: &str) -> Option<String> {
|
||||
Url::parse(url)
|
||||
.unwrap()
|
||||
.query_pairs()
|
||||
.find(|(key, _)| key == name)
|
||||
.map(|(_, value)| value.into_owned())
|
||||
}
|
||||
|
||||
/// Headers with sensitive names are replaced by the redaction marker, while
/// `Content-Type` keeps its value.
#[test]
fn redacts_sensitive_header_and_cookie_values_by_name() {
    let headers = header_map(&[
        ("Authorization", "Bearer secret-token"),
        ("Cookie", "session=secret-session; theme=dark"),
        ("Set-Cookie", "account=secret-cookie; HttpOnly"),
        ("X-Api-Key", "secret-api-key"),
        ("X-CSRF-Token", "secret-csrf-token"),
        ("X-Session-Id", "secret-session-id"),
        ("X-Password-Hash", "secret-password"),
        ("X-User-Email", "person@example.test"),
        ("Content-Type", "application/json"),
    ]);

    let redacted = redact_headers(&headers);

    let sensitive_names = [
        "Authorization",
        "Cookie",
        "Set-Cookie",
        "X-Api-Key",
        "X-CSRF-Token",
        "X-Session-Id",
        "X-Password-Hash",
        "X-User-Email",
    ];
    for name in sensitive_names {
        assert_eq!(redacted[name], REDACTED, "header {name}");
    }
    assert_eq!(redacted["Content-Type"], "application/json");
}
|
||||
|
||||
/// Query parameters with sensitive names are redacted, `page` is preserved,
/// and no raw secret text remains anywhere in the URL.
#[test]
fn redacts_sensitive_query_parameter_values_by_name() {
    let url = concat!(
        "https://example.test/api/products?",
        "authorization=Bearer%20secret-token&",
        "api-key=secret-api-key&",
        "csrf=secret-csrf&",
        "access_token=secret-access-token&",
        "session_id=secret-session&",
        "password=secret-password&",
        "email=person%40example.test&",
        "cookie=secret-cookie&",
        "page=2"
    );

    let redacted = redact_url(url);

    let sensitive_params = [
        "authorization",
        "api-key",
        "csrf",
        "access_token",
        "session_id",
        "password",
        "email",
        "cookie",
    ];
    for param in sensitive_params {
        assert_eq!(
            query_value(&redacted, param).as_deref(),
            Some(REDACTED),
            "query parameter {param}"
        );
    }

    assert_eq!(query_value(&redacted, "page").as_deref(), Some("2"));
    assert!(!redacted.contains("secret"));
    assert!(!redacted.contains("person%40example.test"));
}
|
||||
|
||||
/// Sensitive keys are redacted at the top level, inside nested objects and
/// inside arrays, while non-sensitive values are left as they are.
#[test]
fn redacts_sensitive_json_body_keys_recursively() {
    let body = json!({
        "authorization": "Bearer secret-token",
        "cookie": "session=secret-session",
        "set-cookie": "session=secret-session",
        "api-key": "secret-api-key",
        "csrf": "secret-csrf",
        "access_token": "secret-access-token",
        "session_id": "secret-session",
        "password": "secret-password",
        "email": "person@example.test",
        "profile": {
            "backupEmail": "backup@example.test",
            "display_name": "Visible Name"
        },
        "items": [
            {
                "sessionToken": "nested-secret-session-token",
                "quantity": 3
            }
        ]
    });

    let redacted = redact_json(&body);

    let sensitive_keys = [
        "authorization",
        "cookie",
        "set-cookie",
        "api-key",
        "csrf",
        "access_token",
        "session_id",
        "password",
        "email",
    ];
    for key in sensitive_keys {
        assert_eq!(redacted[key], REDACTED, "key {key}");
    }

    let profile = &redacted["profile"];
    assert_eq!(profile["backupEmail"], REDACTED);
    assert_eq!(profile["display_name"], "Visible Name");

    let first_item = &redacted["items"][0];
    assert_eq!(first_item["sessionToken"], REDACTED);
    assert_eq!(first_item["quantity"], 3);
}
|
||||
|
||||
/// Redacting a whole artifact scrubs the source URL, the exchange URL,
/// headers, redirect chain and JSON body samples, and keeps non-sensitive
/// values.
#[test]
fn redacts_capture_artifact_headers_urls_and_json_body_samples() {
    let captured_at = Utc.with_ymd_and_hms(2026, 5, 16, 12, 0, 0).unwrap();

    let request_body = json!({
        "email": "person@example.test",
        "password": "secret-password",
        "name": "Visible Name"
    });
    let response_body = json!({
        "sessionToken": "secret-session-token",
        "status": "ok"
    });

    let exchange = CapturedExchange {
        method: String::from("POST"),
        url: String::from("https://example.test/api/session?token=secret-token&page=2"),
        request_headers: header_map(&[
            ("Authorization", "Bearer secret-token"),
            ("Content-Type", "application/json"),
        ]),
        request_body_sample: Some(request_body.to_string()),
        resource_type: Some(String::from("fetch")),
        status: 200,
        response_headers: header_map(&[
            ("Set-Cookie", "session=secret-session; HttpOnly"),
            ("Content-Type", "application/json"),
        ]),
        response_body_sample: Some(response_body.to_string()),
        started_at: captured_at,
        duration_ms: 25,
        redirect_chain: vec![String::from("https://example.test/login?csrf=secret-csrf")],
    };

    let artifact = CaptureArtifact {
        id: String::from("example.test/2026-05-16T12-00-00Z"),
        source_url: String::from("https://example.test/app?email=person@example.test"),
        intent: Some(String::from("discover public API")),
        started_at: captured_at,
        completed_at: Some(captured_at),
        exchanges: vec![exchange],
        endpoints: Vec::new(),
        metadata: Map::new(),
    };

    let redacted = redact_artifact(&artifact);
    let exchange = &redacted.exchanges[0];

    // URLs: sensitive query values are replaced, `page` is kept.
    assert_eq!(
        query_value(&redacted.source_url, "email").as_deref(),
        Some(REDACTED)
    );
    assert_eq!(
        query_value(&exchange.url, "token").as_deref(),
        Some(REDACTED)
    );
    assert_eq!(query_value(&exchange.url, "page").as_deref(), Some("2"));

    // Headers and redirect chain.
    assert_eq!(exchange.request_headers["Authorization"], REDACTED);
    assert_eq!(exchange.request_headers["Content-Type"], "application/json");
    assert_eq!(exchange.response_headers["Set-Cookie"], REDACTED);
    assert_eq!(
        query_value(&exchange.redirect_chain[0], "csrf").as_deref(),
        Some(REDACTED)
    );

    // Body samples.
    let request_body = exchange.request_body_sample.as_deref().unwrap();
    assert!(request_body.contains(REDACTED));
    assert!(request_body.contains("Visible Name"));
    assert!(!request_body.contains("person@example.test"));
    assert!(!request_body.contains("secret-password"));

    let response_body = exchange.response_body_sample.as_deref().unwrap();
    assert!(response_body.contains(REDACTED));
    assert!(response_body.contains("ok"));
    assert!(!response_body.contains("secret-session-token"));
}
|
||||
414
crates/webclaw-capture/tests/replay.rs
Normal file
414
crates/webclaw-capture/tests/replay.rs
Normal file
|
|
@ -0,0 +1,414 @@
|
|||
use std::collections::BTreeMap;
|
||||
use std::time::Duration;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde_json::{Map, Value, json};
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
use tokio::sync::{mpsc, oneshot};
|
||||
use webclaw_capture::replay::replay_endpoint;
|
||||
use webclaw_capture::types::{
|
||||
EndpointDefinition, EndpointExample, EndpointSafety, ReplayOptions, ReplayResult,
|
||||
};
|
||||
|
||||
struct LocalServer {
|
||||
base_url: String,
|
||||
requests: mpsc::UnboundedReceiver<String>,
|
||||
shutdown: Option<oneshot::Sender<()>>,
|
||||
}
|
||||
|
||||
impl LocalServer {
|
||||
async fn start() -> Self {
|
||||
let listener = TcpListener::bind("127.0.0.1:0")
|
||||
.await
|
||||
.expect("bind local replay test server");
|
||||
let address = listener.local_addr().expect("local replay server address");
|
||||
let (shutdown, mut shutdown_rx) = oneshot::channel::<()>();
|
||||
let (requests_tx, requests_rx) = mpsc::unbounded_channel::<String>();
|
||||
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = &mut shutdown_rx => break,
|
||||
accepted = listener.accept() => {
|
||||
let Ok((stream, _peer)) = accepted else {
|
||||
continue;
|
||||
};
|
||||
|
||||
tokio::spawn(handle_connection(stream, requests_tx.clone()));
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Self {
|
||||
base_url: format!("http://{address}"),
|
||||
requests: requests_rx,
|
||||
shutdown: Some(shutdown),
|
||||
}
|
||||
}
|
||||
|
||||
async fn next_request(&mut self) -> String {
|
||||
tokio::time::timeout(Duration::from_secs(2), self.requests.recv())
|
||||
.await
|
||||
.expect("local replay server should receive a request")
|
||||
.expect("local replay server request channel should remain open")
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for LocalServer {
|
||||
fn drop(&mut self) {
|
||||
if let Some(shutdown) = self.shutdown.take() {
|
||||
let _ = shutdown.send(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn get_endpoint_executes_when_dry_run_is_false() {
|
||||
let mut server = LocalServer::start().await;
|
||||
let endpoint = get_endpoint(&server.base_url, headers(&[("Accept", "application/json")]));
|
||||
|
||||
let result = replay_endpoint(
|
||||
&endpoint,
|
||||
ReplayOptions {
|
||||
dry_run: false,
|
||||
confirm_unsafe: false,
|
||||
params_json: Some(json!({ "category": "tools" })),
|
||||
headers: Map::new(),
|
||||
body_json: None,
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("replay GET endpoint");
|
||||
|
||||
match result {
|
||||
ReplayResult::Executed {
|
||||
status,
|
||||
body_sample,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(status, 200);
|
||||
assert!(
|
||||
body_sample
|
||||
.as_deref()
|
||||
.unwrap_or_default()
|
||||
.contains(r#""ok":true"#),
|
||||
"executed replay should return the response body sample"
|
||||
);
|
||||
}
|
||||
other => panic!("GET replay should execute, got {other:#?}"),
|
||||
}
|
||||
|
||||
let request = server.next_request().await;
|
||||
assert!(
|
||||
request.starts_with("GET /api/products"),
|
||||
"server should receive the replayed GET request, got {request:?}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn get_endpoint_with_dry_run_returns_preview_without_network() {
|
||||
let endpoint = get_endpoint(
|
||||
"http://127.0.0.1:9",
|
||||
headers(&[("Accept", "application/json")]),
|
||||
);
|
||||
|
||||
let result = replay_endpoint(
|
||||
&endpoint,
|
||||
ReplayOptions {
|
||||
dry_run: true,
|
||||
confirm_unsafe: false,
|
||||
params_json: Some(json!({ "category": "tools" })),
|
||||
headers: headers(&[("X-Replay-Trace", "dry-run")]),
|
||||
body_json: None,
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("preview GET endpoint");
|
||||
|
||||
match result {
|
||||
ReplayResult::Preview {
|
||||
method,
|
||||
url,
|
||||
headers,
|
||||
body_sample,
|
||||
} => {
|
||||
assert_eq!(method, "GET");
|
||||
assert!(url.starts_with("http://127.0.0.1:9/api/products"));
|
||||
assert!(url.contains("category=tools"));
|
||||
assert_eq!(header_string(&headers, "X-Replay-Trace"), Some("dry-run"));
|
||||
assert_eq!(body_sample, None);
|
||||
}
|
||||
other => panic!("dry-run GET replay should return a preview, got {other:#?}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn post_without_confirmation_is_blocked() {
|
||||
let endpoint = post_endpoint("http://127.0.0.1:9");
|
||||
|
||||
let result = replay_endpoint(
|
||||
&endpoint,
|
||||
ReplayOptions {
|
||||
dry_run: false,
|
||||
confirm_unsafe: false,
|
||||
params_json: None,
|
||||
headers: Map::new(),
|
||||
body_json: Some(graphql_body()),
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("block unsafe POST replay");
|
||||
|
||||
match result {
|
||||
ReplayResult::Blocked { reason } => {
|
||||
let reason = reason.to_ascii_lowercase();
|
||||
assert!(
|
||||
reason.contains("confirm") || reason.contains("unsafe"),
|
||||
"blocked replay should explain confirmation is required, got {reason:?}"
|
||||
);
|
||||
}
|
||||
other => {
|
||||
panic!("unsafe POST replay without confirmation should be blocked, got {other:#?}")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn post_with_dry_run_returns_preview_only() {
|
||||
let endpoint = post_endpoint("http://127.0.0.1:9");
|
||||
|
||||
let result = replay_endpoint(
|
||||
&endpoint,
|
||||
ReplayOptions {
|
||||
dry_run: true,
|
||||
confirm_unsafe: false,
|
||||
params_json: None,
|
||||
headers: headers(&[("Content-Type", "application/json")]),
|
||||
body_json: Some(graphql_body()),
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("preview unsafe POST replay");
|
||||
|
||||
match result {
|
||||
ReplayResult::Preview {
|
||||
method,
|
||||
url,
|
||||
body_sample,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(method, "POST");
|
||||
assert_eq!(url, "http://127.0.0.1:9/graphql");
|
||||
assert!(
|
||||
body_sample
|
||||
.as_deref()
|
||||
.unwrap_or_default()
|
||||
.contains("CreateProduct"),
|
||||
"dry-run POST preview should include the request body sample"
|
||||
);
|
||||
}
|
||||
other => panic!("dry-run POST replay should return a preview, got {other:#?}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn redacted_headers_are_never_sent() {
|
||||
let mut server = LocalServer::start().await;
|
||||
let endpoint = get_endpoint(
|
||||
&server.base_url,
|
||||
headers(&[
|
||||
("Authorization", "[REDACTED]"),
|
||||
("Cookie", "[REDACTED]"),
|
||||
("X-Api-Key", "[REDACTED]"),
|
||||
("X-Trace-Id", "captured-trace"),
|
||||
]),
|
||||
);
|
||||
|
||||
let result = replay_endpoint(
|
||||
&endpoint,
|
||||
ReplayOptions {
|
||||
dry_run: false,
|
||||
confirm_unsafe: false,
|
||||
params_json: None,
|
||||
headers: headers(&[
|
||||
("X-User-Email", "[REDACTED]"),
|
||||
("X-Allowed-Override", "override-ok"),
|
||||
]),
|
||||
body_json: None,
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("replay GET endpoint without redacted headers");
|
||||
|
||||
assert!(
|
||||
matches!(result, ReplayResult::Executed { status: 200, .. }),
|
||||
"GET replay should execute, got {result:#?}"
|
||||
);
|
||||
|
||||
let request = server.next_request().await;
|
||||
let lower_request = request.to_ascii_lowercase();
|
||||
|
||||
for forbidden in [
|
||||
"authorization:",
|
||||
"cookie:",
|
||||
"x-api-key:",
|
||||
"x-user-email:",
|
||||
"[redacted]",
|
||||
] {
|
||||
assert!(
|
||||
!lower_request.contains(forbidden),
|
||||
"replay request should not send redacted header material {forbidden:?}: {request}"
|
||||
);
|
||||
}
|
||||
assert!(
|
||||
lower_request.contains("x-allowed-override: override-ok"),
|
||||
"non-redacted caller-supplied headers should still be sent: {request}"
|
||||
);
|
||||
}
|
||||
|
||||
async fn handle_connection(mut stream: TcpStream, requests: mpsc::UnboundedSender<String>) {
|
||||
let mut buffer = vec![0_u8; 8192];
|
||||
let Ok(bytes_read) = stream.read(&mut buffer).await else {
|
||||
return;
|
||||
};
|
||||
if bytes_read == 0 {
|
||||
return;
|
||||
}
|
||||
|
||||
let request = String::from_utf8_lossy(&buffer[..bytes_read]).to_string();
|
||||
let status = if request.starts_with("GET /api/products") {
|
||||
"200 OK"
|
||||
} else {
|
||||
"404 Not Found"
|
||||
};
|
||||
let body = if status == "200 OK" {
|
||||
r#"{"ok":true,"items":[{"id":12345,"name":"Hammer"}]}"#
|
||||
} else {
|
||||
r#"{"ok":false}"#
|
||||
};
|
||||
let response = http_response(status, &[("Content-Type", "application/json")], body);
|
||||
|
||||
let _ = requests.send(request);
|
||||
let _ = stream.write_all(response.as_bytes()).await;
|
||||
let _ = stream.shutdown().await;
|
||||
}
|
||||
|
||||
/// Renders a complete HTTP/1.1 response as text.
///
/// The status line is followed by `Content-Length` (byte length of `body`),
/// `Connection: close` and `Cache-Control: no-store`, then the caller's
/// `headers` in order, a blank line, and `body`.
fn http_response(status: &str, headers: &[(&str, &str)], body: &str) -> String {
    let fixed_head = format!(
        "HTTP/1.1 {status}\r\nContent-Length: {}\r\nConnection: close\r\nCache-Control: no-store\r\n",
        body.len()
    );
    let extra_headers: String = headers
        .iter()
        .flat_map(|(name, value)| [*name, ": ", *value, "\r\n"])
        .collect();

    [fixed_head.as_str(), extra_headers.as_str(), "\r\n", body].concat()
}
|
||||
|
||||
fn get_endpoint(origin: &str, request_headers: Map<String, Value>) -> EndpointDefinition {
|
||||
let mut query_params = BTreeMap::new();
|
||||
query_params.insert("category".to_owned(), vec!["tools".to_owned()]);
|
||||
|
||||
EndpointDefinition {
|
||||
id: format!("GET {origin}/api/products"),
|
||||
method: "GET".to_owned(),
|
||||
origin: origin.to_owned(),
|
||||
path_template: "/api/products".to_owned(),
|
||||
query_params,
|
||||
request_schema: None,
|
||||
response_schema: Some(json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"items": { "type": "array" }
|
||||
}
|
||||
})),
|
||||
auth_evidence: Vec::new(),
|
||||
safety: EndpointSafety {
|
||||
safe_to_replay: true,
|
||||
requires_confirmation: false,
|
||||
reason: "GET is a read-oriented HTTP method".to_owned(),
|
||||
},
|
||||
examples: vec![EndpointExample {
|
||||
url: format!("{origin}/api/products?category=tools"),
|
||||
request_headers,
|
||||
request_body_sample: None,
|
||||
response_status: 200,
|
||||
response_headers: headers(&[("Content-Type", "application/json")]),
|
||||
response_body_sample: Some(r#"{"items":[{"id":12345,"name":"Hammer"}]}"#.to_owned()),
|
||||
captured_at: test_time(),
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
fn post_endpoint(origin: &str) -> EndpointDefinition {
|
||||
EndpointDefinition {
|
||||
id: format!("POST {origin}/graphql"),
|
||||
method: "POST".to_owned(),
|
||||
origin: origin.to_owned(),
|
||||
path_template: "/graphql".to_owned(),
|
||||
query_params: BTreeMap::new(),
|
||||
request_schema: Some(json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": { "type": "string" },
|
||||
"variables": { "type": "object" }
|
||||
}
|
||||
})),
|
||||
response_schema: Some(json!({ "type": "object" })),
|
||||
auth_evidence: vec!["X-CSRF-Token header observed".to_owned()],
|
||||
safety: EndpointSafety {
|
||||
safe_to_replay: false,
|
||||
requires_confirmation: true,
|
||||
reason: "POST may mutate server state and requires confirmation".to_owned(),
|
||||
},
|
||||
examples: vec![EndpointExample {
|
||||
url: format!("{origin}/graphql"),
|
||||
request_headers: headers(&[
|
||||
("Content-Type", "application/json"),
|
||||
("X-CSRF-Token", "[REDACTED]"),
|
||||
]),
|
||||
request_body_sample: Some(graphql_body().to_string()),
|
||||
response_status: 200,
|
||||
response_headers: headers(&[("Content-Type", "application/json")]),
|
||||
response_body_sample: Some(r#"{"data":{"createProduct":{"id":"12345"}}}"#.to_owned()),
|
||||
captured_at: test_time(),
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
fn headers(entries: &[(&str, &str)]) -> Map<String, Value> {
|
||||
entries
|
||||
.iter()
|
||||
.map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned())))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Looks up a header by case-insensitive name and returns its string value.
///
/// Only the first matching entry is considered; if that value is not a JSON
/// string the result is `None`.
fn header_string<'a>(headers: &'a Map<String, Value>, name: &str) -> Option<&'a str> {
    for (header_name, value) in headers {
        if header_name.eq_ignore_ascii_case(name) {
            return value.as_str();
        }
    }
    None
}
|
||||
|
||||
fn graphql_body() -> Value {
|
||||
json!({
|
||||
"query": "mutation CreateProduct($name: String!) { createProduct(name: $name) { id } }",
|
||||
"variables": {
|
||||
"name": "Hammer"
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn test_time() -> DateTime<Utc> {
|
||||
DateTime::parse_from_rfc3339("2026-05-16T12:00:00Z")
|
||||
.expect("valid test timestamp")
|
||||
.with_timezone(&Utc)
|
||||
}
|
||||
312
crates/webclaw-capture/tests/store.rs
Normal file
312
crates/webclaw-capture/tests/store.rs
Normal file
|
|
@ -0,0 +1,312 @@
|
|||
use std::collections::BTreeMap;
|
||||
use std::env;
|
||||
use std::ffi::OsString;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Mutex;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde_json::{Map, Value, json};
|
||||
use url::Url;
|
||||
use webclaw_capture::redact::redact_artifact;
|
||||
use webclaw_capture::store::{
|
||||
capture_id_for, capture_root, find_endpoint, load_endpoints, save_capture,
|
||||
};
|
||||
use webclaw_capture::types::{
|
||||
CaptureArtifact, CapturedExchange, EndpointDefinition, EndpointExample, EndpointSafety,
|
||||
};
|
||||
|
||||
static ENV_LOCK: Mutex<()> = Mutex::new(());
|
||||
const CAPTURE_DIR_ENV: &str = "WEBCLAW_CAPTURE_DIR";
|
||||
|
||||
/// Scope guard for the `WEBCLAW_CAPTURE_DIR` environment variable.
struct EnvVarGuard {
    /// Value the variable held before the guard changed it (`None` if unset).
    original: Option<OsString>,
}
|
||||
|
||||
impl EnvVarGuard {
|
||||
fn set_capture_dir(value: Option<&Path>) -> Self {
|
||||
let original = env::var_os(CAPTURE_DIR_ENV);
|
||||
|
||||
unsafe {
|
||||
match value {
|
||||
Some(path) => env::set_var(CAPTURE_DIR_ENV, path),
|
||||
None => env::remove_var(CAPTURE_DIR_ENV),
|
||||
}
|
||||
}
|
||||
|
||||
Self { original }
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EnvVarGuard {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
match &self.original {
|
||||
Some(value) => env::set_var(CAPTURE_DIR_ENV, value),
|
||||
None => env::remove_var(CAPTURE_DIR_ENV),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn with_capture_dir<T>(value: Option<&Path>, test: impl FnOnce() -> T) -> T {
|
||||
let _lock = ENV_LOCK.lock().expect("capture env lock");
|
||||
let _guard = EnvVarGuard::set_capture_dir(value);
|
||||
|
||||
test()
|
||||
}
|
||||
|
||||
/// Returns a per-test directory path under the system temp dir.
///
/// The name combines `test_name`, the process id and the current time in
/// nanoseconds since the Unix epoch. The directory itself is not created.
fn unique_temp_root(test_name: &str) -> PathBuf {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("system time after unix epoch");
    let nanos = since_epoch.as_nanos();

    let dir_name = format!(
        "webclaw-capture-store-{test_name}-{}-{nanos}",
        std::process::id()
    );

    let mut root = env::temp_dir();
    root.push(dir_name);
    root
}
|
||||
|
||||
fn test_time() -> DateTime<Utc> {
|
||||
DateTime::parse_from_rfc3339("2026-05-16T12:00:00Z")
|
||||
.expect("valid test timestamp")
|
||||
.with_timezone(&Utc)
|
||||
}
|
||||
|
||||
fn headers(entries: &[(&str, &str)]) -> Map<String, Value> {
|
||||
entries
|
||||
.iter()
|
||||
.map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned())))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn sample_endpoint() -> EndpointDefinition {
|
||||
let mut query_params = BTreeMap::new();
|
||||
query_params.insert("category".to_owned(), vec!["tools".to_owned()]);
|
||||
|
||||
EndpointDefinition {
|
||||
id: "GET https://example.test/api/products".to_owned(),
|
||||
method: "GET".to_owned(),
|
||||
origin: "https://example.test".to_owned(),
|
||||
path_template: "/api/products".to_owned(),
|
||||
query_params,
|
||||
request_schema: None,
|
||||
response_schema: Some(json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": { "type": "object" }
|
||||
}
|
||||
}
|
||||
})),
|
||||
auth_evidence: vec!["Authorization header observed".to_owned()],
|
||||
safety: EndpointSafety {
|
||||
safe_to_replay: true,
|
||||
requires_confirmation: false,
|
||||
reason: "GET is a read-oriented HTTP method".to_owned(),
|
||||
},
|
||||
examples: vec![EndpointExample {
|
||||
url: "https://example.test/api/products?category=tools".to_owned(),
|
||||
request_headers: headers(&[
|
||||
("Authorization", "Bearer raw-secret"),
|
||||
("Accept", "application/json"),
|
||||
]),
|
||||
request_body_sample: None,
|
||||
response_status: 200,
|
||||
response_headers: headers(&[("Content-Type", "application/json")]),
|
||||
response_body_sample: Some(r#"{"items":[{"id":12345,"name":"Hammer"}]}"#.to_owned()),
|
||||
captured_at: test_time(),
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
fn sample_exchange() -> CapturedExchange {
|
||||
CapturedExchange {
|
||||
method: "GET".to_owned(),
|
||||
url: "https://example.test/api/products?category=tools&token=raw-secret".to_owned(),
|
||||
request_headers: headers(&[
|
||||
("Authorization", "Bearer raw-secret"),
|
||||
("Accept", "application/json"),
|
||||
]),
|
||||
request_body_sample: None,
|
||||
resource_type: Some("fetch".to_owned()),
|
||||
status: 200,
|
||||
response_headers: headers(&[("Content-Type", "application/json")]),
|
||||
response_body_sample: Some(r#"{"items":[{"id":12345,"name":"Hammer"}]}"#.to_owned()),
|
||||
started_at: test_time(),
|
||||
duration_ms: 42,
|
||||
redirect_chain: vec!["https://example.test/login?session=raw-secret".to_owned()],
|
||||
}
|
||||
}
|
||||
|
||||
fn sample_artifact() -> CaptureArtifact {
|
||||
let mut metadata = Map::new();
|
||||
metadata.insert("runner".to_owned(), json!("store-test"));
|
||||
|
||||
CaptureArtifact {
|
||||
id: "example.test/2026-05-16T12-00-00Z".to_owned(),
|
||||
source_url: "https://example.test/products?email=user@example.test".to_owned(),
|
||||
intent: Some("discover product listing API".to_owned()),
|
||||
started_at: test_time(),
|
||||
completed_at: Some(test_time()),
|
||||
exchanges: vec![sample_exchange()],
|
||||
endpoints: vec![sample_endpoint()],
|
||||
metadata,
|
||||
}
|
||||
}
|
||||
|
||||
fn read_json<T: serde::de::DeserializeOwned>(path: &Path) -> T {
|
||||
let contents = fs::read_to_string(path).expect("read JSON file");
|
||||
serde_json::from_str(&contents).expect("valid JSON file")
|
||||
}
|
||||
|
||||
/// Without the override variable, the capture root is `<home>/.webclaw/api-captures`,
/// where `<home>` is `USERPROFILE` when set and `dirs::home_dir()` otherwise.
#[test]
fn default_capture_root_resolves_under_user_profile_webclaw_api_captures() {
    with_capture_dir(None, || {
        let home = match env::var_os("USERPROFILE") {
            Some(profile) => PathBuf::from(profile),
            None => dirs::home_dir().expect("home directory"),
        };
        let expected = home.join(".webclaw").join("api-captures");

        assert_eq!(capture_root(), expected);
    });
}
|
||||
|
||||
/// `WEBCLAW_CAPTURE_DIR` replaces the default capture root verbatim.
#[test]
fn capture_root_uses_webclaw_capture_dir_override() {
    let override_root = unique_temp_root("override");

    with_capture_dir(Some(&override_root), || {
        let resolved = capture_root();
        assert_eq!(resolved, override_root);
    });
}
|
||||
|
||||
/// Capture ids have the form `<host>/<timestamp>` with `-` in place of `:`.
#[test]
fn capture_id_for_uses_domain_and_filesystem_safe_utc_timestamp() {
    let url = Url::parse("https://example.test/api/products?category=tools").expect("valid URL");

    let capture_id = capture_id_for(&url, test_time());

    assert_eq!(capture_id, "example.test/2026-05-16T12-00-00Z");
}
|
||||
|
||||
/// `save_capture` writes four files under `<root>/<host>/<timestamp>/`:
/// the raw capture unchanged, plus redacted capture, endpoints and metadata
/// that contain none of the planted secrets.
#[test]
fn save_capture_writes_raw_redacted_endpoints_and_metadata_files() {
    let root = unique_temp_root("save");

    with_capture_dir(Some(&root), || {
        let artifact = sample_artifact();
        let saved = save_capture(&artifact).expect("save capture");
        let dir = &saved.capture_dir;

        // Layout: every reported path sits directly inside the capture directory.
        assert_eq!(saved.id, artifact.id);
        assert_eq!(
            *dir,
            root.join("example.test").join("2026-05-16T12-00-00Z")
        );
        let expected_files = [
            (&saved.raw_capture_path, "raw-capture.json"),
            (&saved.redacted_capture_path, "redacted-capture.json"),
            (&saved.endpoints_path, "endpoints.json"),
            (&saved.metadata_path, "metadata.json"),
        ];
        for (path, file_name) in expected_files {
            assert_eq!(*path, dir.join(file_name));
        }
        for (path, _file_name) in expected_files {
            assert!(path.is_file());
        }

        // The raw file round-trips the artifact unchanged.
        let raw_capture: CaptureArtifact = read_json(&saved.raw_capture_path);
        assert_eq!(raw_capture, artifact);

        // The redacted file differs from the input and carries no secret.
        let redacted_capture: CaptureArtifact = read_json(&saved.redacted_capture_path);
        assert_ne!(redacted_capture, artifact);
        let redacted_text =
            serde_json::to_string(&redacted_capture).expect("serialize redacted capture");
        assert!(
            !redacted_text.contains("raw-secret"),
            "redacted capture should not contain raw secrets"
        );

        // Stored endpoints equal the redacted endpoints.
        let endpoints: Vec<EndpointDefinition> = read_json(&saved.endpoints_path);
        assert_eq!(endpoints, redact_artifact(&artifact).endpoints);
        let endpoints_text = serde_json::to_string(&endpoints).expect("serialize endpoints");
        assert!(
            !endpoints_text.contains("raw-secret"),
            "endpoints.json should not contain raw secrets"
        );

        // Metadata is a JSON object with the source URL's email redacted.
        let metadata: Value = read_json(&saved.metadata_path);
        assert!(
            metadata.is_object(),
            "metadata.json should contain a JSON object"
        );
        let metadata_text = serde_json::to_string(&metadata).expect("serialize metadata");
        assert!(
            !metadata_text.contains("user@example.test"),
            "metadata.json should redact PII from source_url"
        );
        assert!(
            metadata_text.contains("REDACTED"),
            "metadata.json should preserve the redaction marker"
        );
    });

    // Best-effort cleanup; a leftover temp directory is harmless.
    let _ = fs::remove_dir_all(root);
}
|
||||
|
||||
/// `load_endpoints` returns the redacted endpoints stored for a capture id.
#[test]
fn load_endpoints_by_capture_id_reads_endpoints_json() {
    let root = unique_temp_root("load");

    with_capture_dir(Some(&root), || {
        let artifact = sample_artifact();
        save_capture(&artifact).expect("save capture");

        let loaded = load_endpoints(&artifact.id).expect("load endpoints");
        assert_eq!(loaded, redact_artifact(&artifact).endpoints);

        let loaded_text = serde_json::to_string(&loaded).expect("serialize loaded endpoints");
        assert!(
            !loaded_text.contains("raw-secret"),
            "loaded endpoints should not contain raw secrets"
        );
    });

    // Best-effort cleanup; a leftover temp directory is harmless.
    let _ = fs::remove_dir_all(root);
}
|
||||
|
||||
/// `find_endpoint` locates a saved endpoint by id and returns its redacted form.
#[test]
fn find_endpoint_scans_saved_capture_endpoints() {
    let root = unique_temp_root("find");

    with_capture_dir(Some(&root), || {
        let artifact = sample_artifact();
        let expected = redact_artifact(&artifact).endpoints[0].clone();
        save_capture(&artifact).expect("save capture");

        let found = find_endpoint(&expected.id).expect("find endpoint");
        assert_eq!(found, expected);

        let found_text = serde_json::to_string(&found).expect("serialize found endpoint");
        assert!(
            !found_text.contains("raw-secret"),
            "found endpoint should not contain raw secrets"
        );
    });

    // Best-effort cleanup; a leftover temp directory is harmless.
    let _ = fs::remove_dir_all(root);
}
|
||||
|
|
@ -11,6 +11,7 @@ path = "src/main.rs"
|
|||
|
||||
[dependencies]
|
||||
webclaw-core = { workspace = true }
|
||||
webclaw-capture = { path = "../webclaw-capture" }
|
||||
webclaw-fetch = { workspace = true }
|
||||
webclaw-llm = { workspace = true }
|
||||
webclaw-pdf = { workspace = true }
|
||||
|
|
|
|||
|
|
@ -10,6 +10,11 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
|||
|
||||
use clap::{Parser, Subcommand, ValueEnum};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
use webclaw_capture::cdp::{CaptureOptions, capture_network};
|
||||
use webclaw_capture::openapi::write_openapi;
|
||||
use webclaw_capture::replay::replay_endpoint;
|
||||
use webclaw_capture::store::{find_endpoint, load_endpoints};
|
||||
use webclaw_capture::types::{EndpointDefinition, ReplayOptions};
|
||||
use webclaw_core::{
|
||||
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
|
||||
to_llm_text,
|
||||
|
|
@ -336,6 +341,61 @@ enum Commands {
|
|||
#[arg(long)]
|
||||
raw: bool,
|
||||
},
|
||||
|
||||
/// Capture browser network traffic and learn reusable API endpoints.
|
||||
CaptureNetwork {
|
||||
/// Page URL to inspect.
|
||||
url: String,
|
||||
|
||||
/// Capture intent, stored with the capture metadata.
|
||||
#[arg(long)]
|
||||
intent: Option<String>,
|
||||
|
||||
/// Milliseconds to wait after page navigation before saving the capture.
|
||||
#[arg(long, default_value_t = 3000)]
|
||||
wait_ms: u64,
|
||||
|
||||
/// Run Chromium with a visible window instead of headless mode.
|
||||
#[arg(long)]
|
||||
headed: bool,
|
||||
},
|
||||
|
||||
/// Print learned endpoints for a saved capture id.
|
||||
Endpoints {
|
||||
/// Capture id, for example `example.com/2026-05-16T12-00-00Z`.
|
||||
capture_id: String,
|
||||
},
|
||||
|
||||
/// Print one learned endpoint by endpoint id.
|
||||
ShowEndpoint {
|
||||
/// Endpoint id, for example `get_example_test_api_products`.
|
||||
endpoint_id: String,
|
||||
},
|
||||
|
||||
/// Replay or preview a learned endpoint.
|
||||
ReplayEndpoint {
|
||||
/// Endpoint id to replay.
|
||||
endpoint_id: String,
|
||||
|
||||
/// JSON object with path/query parameter overrides.
|
||||
#[arg(long, default_value = "{}")]
|
||||
params_json: String,
|
||||
|
||||
/// Preview the replay request without network access.
|
||||
#[arg(long)]
|
||||
dry_run: bool,
|
||||
|
||||
/// Allow unsafe methods such as POST, PUT, PATCH, and DELETE to execute.
|
||||
#[arg(long)]
|
||||
confirm_unsafe: bool,
|
||||
},
|
||||
|
||||
/// Export a saved capture's learned endpoints as OpenAPI 3.1 JSON.
|
||||
#[command(name = "export-openapi")]
|
||||
ExportOpenapi {
|
||||
/// Capture id, for example `example.com/2026-05-16T12-00-00Z`.
|
||||
capture_id: String,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone, ValueEnum)]
|
||||
|
|
@ -2169,6 +2229,121 @@ fn has_llm_flags(cli: &Cli) -> bool {
|
|||
cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some()
|
||||
}
|
||||
|
||||
async fn run_capture_network_command(
|
||||
url: &str,
|
||||
intent: Option<String>,
|
||||
wait_ms: u64,
|
||||
headed: bool,
|
||||
) -> Result<(), String> {
|
||||
let saved = capture_network(CaptureOptions {
|
||||
url: normalize_url(url),
|
||||
intent,
|
||||
wait_ms,
|
||||
headed,
|
||||
})
|
||||
.await
|
||||
.map_err(|e| format!("capture-network failed: {e}"))?;
|
||||
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&saved).map_err(|e| format!("JSON encode failed: {e}"))?
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn run_endpoints_command(capture_id: &str) -> Result<(), String> {
|
||||
let endpoints = load_endpoints(capture_id)
|
||||
.map_err(|e| format!("could not load endpoints for capture id {capture_id}: {e}"))?;
|
||||
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&endpoints).map_err(|e| format!("JSON encode failed: {e}"))?
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn run_show_endpoint_command(endpoint_id: &str) -> Result<(), String> {
|
||||
let endpoint = find_endpoint(endpoint_id)
|
||||
.map_err(|e| format!("could not find endpoint id {endpoint_id}: {e}"))?;
|
||||
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&endpoint).map_err(|e| format!("JSON encode failed: {e}"))?
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_replay_endpoint_command(
|
||||
endpoint_id: &str,
|
||||
params_json: &str,
|
||||
dry_run: bool,
|
||||
confirm_unsafe: bool,
|
||||
) -> Result<(), String> {
|
||||
let endpoint = find_endpoint(endpoint_id)
|
||||
.map_err(|e| format!("could not find endpoint id {endpoint_id}: {e}"))?;
|
||||
let params_json = parse_params_json(params_json)?;
|
||||
let default_dry_run = endpoint_defaults_to_dry_run(&endpoint) && !confirm_unsafe;
|
||||
|
||||
if default_dry_run && !dry_run {
|
||||
eprintln!(
|
||||
"Unsafe endpoint replay defaults to dry-run. Re-run with --confirm-unsafe to execute."
|
||||
);
|
||||
}
|
||||
|
||||
let options = ReplayOptions {
|
||||
dry_run: dry_run || default_dry_run,
|
||||
confirm_unsafe,
|
||||
params_json,
|
||||
headers: serde_json::Map::new(),
|
||||
body_json: None,
|
||||
};
|
||||
|
||||
let result = replay_endpoint(&endpoint, options)
|
||||
.await
|
||||
.map_err(|e| format!("replay-endpoint failed: {e}"))?;
|
||||
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&result).map_err(|e| format!("JSON encode failed: {e}"))?
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn run_export_openapi_command(capture_id: &str) -> Result<(), String> {
|
||||
let path = write_openapi(capture_id)
|
||||
.map_err(|e| format!("could not export OpenAPI for capture id {capture_id}: {e}"))?;
|
||||
println!("{}", path.display());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Parses the `--params-json` argument.
///
/// Blank input yields `Ok(None)`; a JSON object yields `Ok(Some(value))`.
///
/// # Errors
/// Returns a message when the text is not valid JSON or is not a JSON object.
fn parse_params_json(params_json: &str) -> Result<Option<serde_json::Value>, String> {
    let raw = params_json.trim();
    if raw.is_empty() {
        return Ok(None);
    }

    match serde_json::from_str::<serde_json::Value>(raw) {
        Ok(value) if value.is_object() => Ok(Some(value)),
        Ok(_) => Err("--params-json must be a JSON object".to_owned()),
        Err(e) => Err(format!("--params-json must be valid JSON: {e}")),
    }
}
|
||||
|
||||
fn endpoint_defaults_to_dry_run(endpoint: &EndpointDefinition) -> bool {
|
||||
endpoint.safety.requires_confirmation
|
||||
|| !endpoint.safety.safe_to_replay
|
||||
|| !matches!(
|
||||
endpoint.method.to_ascii_uppercase().as_str(),
|
||||
"GET" | "HEAD" | "OPTIONS"
|
||||
)
|
||||
}
|
||||
|
||||
async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
|
||||
let api_key = cli
|
||||
.api_key
|
||||
|
|
@ -2405,6 +2580,56 @@ async fn main() {
|
|||
}
|
||||
return;
|
||||
}
|
||||
Commands::CaptureNetwork {
|
||||
url,
|
||||
intent,
|
||||
wait_ms,
|
||||
headed,
|
||||
} => {
|
||||
if let Err(e) =
|
||||
run_capture_network_command(url, intent.clone(), *wait_ms, *headed).await
|
||||
{
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
Commands::Endpoints { capture_id } => {
|
||||
if let Err(e) = run_endpoints_command(capture_id) {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
Commands::ShowEndpoint { endpoint_id } => {
|
||||
if let Err(e) = run_show_endpoint_command(endpoint_id) {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
Commands::ReplayEndpoint {
|
||||
endpoint_id,
|
||||
params_json,
|
||||
dry_run,
|
||||
confirm_unsafe,
|
||||
} => {
|
||||
if let Err(e) =
|
||||
run_replay_endpoint_command(endpoint_id, params_json, *dry_run, *confirm_unsafe)
|
||||
.await
|
||||
{
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
Commands::ExportOpenapi { capture_id } => {
|
||||
if let Err(e) = run_export_openapi_command(capture_id) {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -597,7 +597,7 @@ mod tests {
|
|||
"#;
|
||||
let entries = parse_sitemap_xml(xml);
|
||||
// Should return at least the successfully parsed entry
|
||||
assert!(entries.len() >= 1);
|
||||
assert!(!entries.is_empty());
|
||||
assert_eq!(entries[0].url, "https://example.com/good");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -193,7 +193,7 @@ mod tests {
|
|||
.await
|
||||
.is_ok()
|
||||
);
|
||||
assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false);
|
||||
assert!(!is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ fn classify(body: &str, len: usize, status: u16, kw: &[String]) -> &'static str
|
|||
"CHALLENGE"
|
||||
} else if status == 403 || status == 429 {
|
||||
"BLOCKED"
|
||||
} else if status >= 300 && status < 400 {
|
||||
} else if (300..400).contains(&status) {
|
||||
"REDIRECT"
|
||||
} else if len < 1000 {
|
||||
"EMPTY"
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ webclaw-core = { workspace = true }
|
|||
webclaw-fetch = { workspace = true }
|
||||
webclaw-llm = { workspace = true }
|
||||
webclaw-pdf = { workspace = true }
|
||||
webclaw-capture = { path = "../webclaw-capture" }
|
||||
rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] }
|
||||
schemars = "1.0"
|
||||
dotenvy = { workspace = true }
|
||||
|
|
|
|||
|
|
@ -11,6 +11,10 @@ use server::WebclawMcp;
|
|||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
if print_help_or_version() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
dotenvy::dotenv().ok();
|
||||
|
||||
// Log to stderr -- stdout is the MCP transport channel
|
||||
|
|
@ -25,3 +29,42 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
service.waiting().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn print_help_or_version() -> bool {
|
||||
let mut args = std::env::args().skip(1);
|
||||
let Some(arg) = args.next() else {
|
||||
return false;
|
||||
};
|
||||
|
||||
match arg.as_str() {
|
||||
"-h" | "--help" => {
|
||||
println!("{}", help_text());
|
||||
true
|
||||
}
|
||||
"-V" | "--version" => {
|
||||
println!("webclaw-mcp {}", env!("CARGO_PKG_VERSION"));
|
||||
true
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the `--help` text for the `webclaw-mcp` binary.
///
/// The crate version is taken from `CARGO_PKG_VERSION` at compile time and
/// substituted for `{version}`; the rest is a fixed usage/options/tools listing.
fn help_text() -> String {
|
||||
format!(
|
||||
"\
|
||||
webclaw-mcp {version}
|
||||
MCP server for webclaw web extraction toolkit
|
||||
|
||||
Usage: webclaw-mcp
|
||||
|
||||
Options:
|
||||
-h, --help Print help
|
||||
-V, --version Print version
|
||||
|
||||
Tools:
|
||||
scrape, crawl, map, batch, extract, summarize, diff, brand, research, search,
|
||||
capture_network, discover_endpoints, show_endpoint, replay_endpoint,
|
||||
export_openapi, list_captures, list_extractors, vertical_scrape",
|
||||
version = env!("CARGO_PKG_VERSION")
|
||||
)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@
|
|||
/// Uses a local-first architecture: fetches pages directly, then falls back
|
||||
/// to the webclaw cloud API (api.webclaw.io) when bot protection or
|
||||
/// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback.
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Duration;
|
||||
|
||||
|
|
@ -11,9 +13,14 @@ use rmcp::handler::server::router::tool::ToolRouter;
|
|||
use rmcp::handler::server::wrapper::Parameters;
|
||||
use rmcp::model::{Implementation, ServerCapabilities, ServerInfo};
|
||||
use rmcp::{ServerHandler, tool, tool_handler, tool_router};
|
||||
use serde_json::json;
|
||||
use serde_json::{Map, Value, json};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use webclaw_capture::cdp::{CaptureOptions, capture_network as run_network_capture};
|
||||
use webclaw_capture::openapi::write_openapi;
|
||||
use webclaw_capture::replay::replay_endpoint as run_endpoint_replay;
|
||||
use webclaw_capture::store::{capture_root, find_endpoint, load_endpoints};
|
||||
use webclaw_capture::types::{EndpointDefinition, HeaderMap, ReplayOptions};
|
||||
use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult};
|
||||
|
||||
use crate::tools::*;
|
||||
|
|
@ -709,6 +716,96 @@ impl WebclawMcp {
|
|||
}
|
||||
}
|
||||
|
||||
/// Capture browser network traffic from a page and save learned API endpoints for later replay.
|
||||
#[tool]
|
||||
async fn capture_network(
|
||||
&self,
|
||||
Parameters(params): Parameters<CaptureNetworkParams>,
|
||||
) -> Result<String, String> {
|
||||
let url = normalize_capture_url(¶ms.url)?;
|
||||
validate_url(&url).await?;
|
||||
|
||||
let saved = run_network_capture(CaptureOptions {
|
||||
url,
|
||||
intent: params.intent,
|
||||
wait_ms: params.wait_ms.unwrap_or(3000),
|
||||
headed: params.headed.unwrap_or(false),
|
||||
})
|
||||
.await
|
||||
.map_err(|e| format!("capture_network failed: {e}"))?;
|
||||
|
||||
to_pretty_json(&saved)
|
||||
}
|
||||
|
||||
/// Return learned endpoint definitions for a saved capture id.
|
||||
#[tool]
|
||||
async fn discover_endpoints(
|
||||
&self,
|
||||
Parameters(params): Parameters<DiscoverEndpointsParams>,
|
||||
) -> Result<String, String> {
|
||||
let endpoints = load_endpoints(¶ms.capture_id).map_err(|e| {
|
||||
format!(
|
||||
"could not load endpoints for capture id {}: {e}",
|
||||
params.capture_id
|
||||
)
|
||||
})?;
|
||||
|
||||
to_pretty_json(&endpoints)
|
||||
}
|
||||
|
||||
/// Show one learned endpoint definition by endpoint id.
|
||||
#[tool]
|
||||
async fn show_endpoint(
|
||||
&self,
|
||||
Parameters(params): Parameters<ShowEndpointParams>,
|
||||
) -> Result<String, String> {
|
||||
let endpoint = find_endpoint(¶ms.endpoint_id)
|
||||
.map_err(|e| format!("could not find endpoint id {}: {e}", params.endpoint_id))?;
|
||||
|
||||
to_pretty_json(&endpoint)
|
||||
}
|
||||
|
||||
/// Replay or preview a learned endpoint. Mutating methods default to dry-run unless confirmed.
|
||||
#[tool]
|
||||
async fn replay_endpoint(
|
||||
&self,
|
||||
Parameters(params): Parameters<ReplayEndpointParams>,
|
||||
) -> Result<String, String> {
|
||||
let endpoint = find_endpoint(¶ms.endpoint_id)
|
||||
.map_err(|e| format!("could not find endpoint id {}: {e}", params.endpoint_id))?;
|
||||
let options = replay_options_from_params(&endpoint, ¶ms)?;
|
||||
let result = run_endpoint_replay(&endpoint, options)
|
||||
.await
|
||||
.map_err(|e| format!("replay_endpoint failed: {e}"))?;
|
||||
|
||||
to_pretty_json(&result)
|
||||
}
|
||||
|
||||
/// Export a saved capture's learned endpoints as OpenAPI 3.1 JSON.
|
||||
#[tool]
|
||||
async fn export_openapi(
|
||||
&self,
|
||||
Parameters(params): Parameters<ExportOpenApiParams>,
|
||||
) -> Result<String, String> {
|
||||
let path = write_openapi(¶ms.capture_id).map_err(|e| {
|
||||
format!(
|
||||
"could not export OpenAPI for capture id {}: {e}",
|
||||
params.capture_id
|
||||
)
|
||||
})?;
|
||||
|
||||
to_pretty_json(&json!({ "path": path }))
|
||||
}
|
||||
|
||||
/// List saved network captures from the configured capture root.
|
||||
#[tool]
|
||||
async fn list_captures(
|
||||
&self,
|
||||
Parameters(_params): Parameters<ListCapturesParams>,
|
||||
) -> Result<String, String> {
|
||||
to_pretty_json(&list_saved_captures_from_root(&capture_root())?)
|
||||
}
|
||||
|
||||
/// List every vertical extractor the server knows about. Returns a
|
||||
/// JSON array of `{name, label, description, url_patterns}` entries.
|
||||
/// Call this to discover what verticals are available before using
|
||||
|
|
@ -767,11 +864,183 @@ impl ServerHandler for WebclawMcp {
|
|||
.with_instructions(String::from(
|
||||
"Webclaw MCP server -- web content extraction for AI agents. \
|
||||
Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search, \
|
||||
list_extractors, vertical_scrape.",
|
||||
capture_network, discover_endpoints, show_endpoint, replay_endpoint, export_openapi, \
|
||||
list_captures, list_extractors, vertical_scrape.",
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
fn normalize_capture_url(url: &str) -> Result<String, String> {
|
||||
let trimmed = url.trim();
|
||||
if trimmed.is_empty() {
|
||||
return Err("url must not be empty".to_owned());
|
||||
}
|
||||
|
||||
let normalized = if trimmed.contains("://") {
|
||||
trimmed.to_owned()
|
||||
} else {
|
||||
format!("https://{trimmed}")
|
||||
};
|
||||
|
||||
let parsed = url::Url::parse(&normalized).map_err(|e| format!("invalid URL: {e}"))?;
|
||||
match parsed.scheme() {
|
||||
"http" | "https" => Ok(normalized),
|
||||
scheme => Err(format!(
|
||||
"capture_network only supports http and https URLs, got {scheme:?}"
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn replay_options_from_params(
|
||||
endpoint: &EndpointDefinition,
|
||||
params: &ReplayEndpointParams,
|
||||
) -> Result<ReplayOptions, String> {
|
||||
if let Some(value) = ¶ms.params_json
|
||||
&& !value.is_object()
|
||||
{
|
||||
return Err("params_json must be a JSON object".to_owned());
|
||||
}
|
||||
|
||||
let confirm_unsafe = params.confirm_unsafe.unwrap_or(false);
|
||||
let default_dry_run = endpoint_defaults_to_dry_run(endpoint) && !confirm_unsafe;
|
||||
|
||||
Ok(ReplayOptions {
|
||||
dry_run: params.dry_run.unwrap_or(false) || default_dry_run,
|
||||
confirm_unsafe,
|
||||
params_json: params.params_json.clone(),
|
||||
headers: header_map_from_strings(params.headers.as_ref()),
|
||||
body_json: params.body_json.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
fn endpoint_defaults_to_dry_run(endpoint: &EndpointDefinition) -> bool {
|
||||
endpoint.safety.requires_confirmation
|
||||
|| !endpoint.safety.safe_to_replay
|
||||
|| !matches!(
|
||||
endpoint.method.to_ascii_uppercase().as_str(),
|
||||
"GET" | "HEAD" | "OPTIONS"
|
||||
)
|
||||
}
|
||||
|
||||
fn header_map_from_strings(
|
||||
headers: Option<&std::collections::BTreeMap<String, String>>,
|
||||
) -> HeaderMap {
|
||||
headers
|
||||
.into_iter()
|
||||
.flat_map(|headers| headers.iter())
|
||||
.map(|(name, value)| (name.clone(), Value::String(value.clone())))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn list_saved_captures_from_root(root: &Path) -> Result<Vec<Value>, String> {
|
||||
if !root.exists() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let mut captures = Vec::new();
|
||||
collect_saved_captures(root, root, &mut captures)?;
|
||||
captures.sort_by(|left, right| {
|
||||
left.get("id")
|
||||
.and_then(Value::as_str)
|
||||
.unwrap_or_default()
|
||||
.cmp(right.get("id").and_then(Value::as_str).unwrap_or_default())
|
||||
});
|
||||
|
||||
Ok(captures)
|
||||
}
|
||||
|
||||
fn collect_saved_captures(
|
||||
root: &Path,
|
||||
current: &Path,
|
||||
captures: &mut Vec<Value>,
|
||||
) -> Result<(), String> {
|
||||
let entries = fs::read_dir(current).map_err(|e| {
|
||||
format!(
|
||||
"could not read capture directory {}: {e}",
|
||||
current.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
for entry in entries {
|
||||
let entry = entry.map_err(|e| format!("could not read capture directory entry: {e}"))?;
|
||||
let path = entry.path();
|
||||
|
||||
if path.is_dir() {
|
||||
collect_saved_captures(root, &path, captures)?;
|
||||
continue;
|
||||
}
|
||||
|
||||
if path.file_name().and_then(|name| name.to_str()) == Some("metadata.json") {
|
||||
captures.push(read_capture_metadata(root, &path)?);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_capture_metadata(root: &Path, metadata_path: &Path) -> Result<Value, String> {
|
||||
let contents = fs::read_to_string(metadata_path).map_err(|e| {
|
||||
format!(
|
||||
"could not read capture metadata {}: {e}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut metadata = match serde_json::from_str::<Value>(&contents).map_err(|e| {
|
||||
format!(
|
||||
"could not parse capture metadata {}: {e}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})? {
|
||||
Value::Object(metadata) => metadata,
|
||||
_ => Map::new(),
|
||||
};
|
||||
|
||||
let capture_dir = metadata_path
|
||||
.parent()
|
||||
.ok_or_else(|| format!("metadata path has no parent: {}", metadata_path.display()))?;
|
||||
let capture_id = capture_id_from_dir(root, capture_dir)?;
|
||||
|
||||
metadata
|
||||
.entry("id".to_owned())
|
||||
.or_insert_with(|| Value::String(capture_id));
|
||||
metadata.insert(
|
||||
"capture_dir".to_owned(),
|
||||
Value::String(capture_dir.display().to_string()),
|
||||
);
|
||||
|
||||
Ok(Value::Object(metadata))
|
||||
}
|
||||
|
||||
/// Derives a capture id from a capture directory's location below `root`.
///
/// The id is the relative path from `root` to `capture_dir`, with its normal
/// components joined by `/` (non-UTF-8 parts are converted lossily).
///
/// # Errors
///
/// Returns a message when `capture_dir` is not under `root`, or when the
/// relative path has no normal component (for example when both are equal).
fn capture_id_from_dir(root: &Path, capture_dir: &Path) -> Result<String, String> {
    let relative = match capture_dir.strip_prefix(root) {
        Ok(relative) => relative,
        Err(e) => {
            return Err(format!(
                "capture directory {} is not under root {}: {e}",
                capture_dir.display(),
                root.display()
            ));
        }
    };

    // Only plain name components contribute to the id.
    let mut segments = Vec::new();
    for component in relative.components() {
        if let std::path::Component::Normal(part) = component {
            segments.push(part.to_string_lossy());
        }
    }

    if segments.is_empty() {
        return Err(format!(
            "capture directory {} does not contain a capture id",
            capture_dir.display()
        ));
    }

    Ok(segments.join("/"))
}
|
||||
|
||||
fn to_pretty_json<T: serde::Serialize>(value: &T) -> Result<String, String> {
|
||||
serde_json::to_string_pretty(value).map_err(|e| format!("JSON encode failed: {e}"))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Research file helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -856,3 +1125,127 @@ fn save_research(dir: &std::path::Path, slug: &str, data: &serde_json::Value) ->
|
|||
json_path.to_string_lossy().to_string(),
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs;
|
||||
|
||||
use serde_json::json;
|
||||
use webclaw_capture::types::{EndpointDefinition, EndpointSafety};
|
||||
|
||||
use super::*;
|
||||
|
||||
fn endpoint(
|
||||
method: &str,
|
||||
safe_to_replay: bool,
|
||||
requires_confirmation: bool,
|
||||
) -> EndpointDefinition {
|
||||
EndpointDefinition {
|
||||
id: format!("{}_example", method.to_ascii_lowercase()),
|
||||
method: method.to_owned(),
|
||||
origin: "https://example.test".to_owned(),
|
||||
path_template: "/api/items".to_owned(),
|
||||
query_params: BTreeMap::new(),
|
||||
request_schema: None,
|
||||
response_schema: None,
|
||||
auth_evidence: Vec::new(),
|
||||
safety: EndpointSafety {
|
||||
safe_to_replay,
|
||||
requires_confirmation,
|
||||
reason: "test".to_owned(),
|
||||
},
|
||||
examples: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalize_capture_url_adds_https_and_rejects_non_http_schemes() {
|
||||
assert_eq!(
|
||||
normalize_capture_url("example.test/path").unwrap(),
|
||||
"https://example.test/path"
|
||||
);
|
||||
|
||||
assert!(normalize_capture_url("file:///C:/secret.txt").is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn replay_options_default_unsafe_methods_to_dry_run_unless_confirmed() {
|
||||
let unsafe_endpoint = endpoint("POST", false, true);
|
||||
let params = ReplayEndpointParams {
|
||||
endpoint_id: unsafe_endpoint.id.clone(),
|
||||
params_json: Some(json!({"id": "123"})),
|
||||
dry_run: None,
|
||||
confirm_unsafe: None,
|
||||
headers: Some(BTreeMap::from([("X-Test".to_owned(), "ok".to_owned())])),
|
||||
body_json: Some(json!({"name": "tool"})),
|
||||
};
|
||||
|
||||
let options = replay_options_from_params(&unsafe_endpoint, ¶ms).unwrap();
|
||||
assert!(options.dry_run);
|
||||
assert!(!options.confirm_unsafe);
|
||||
assert_eq!(options.params_json, Some(json!({"id": "123"})));
|
||||
assert_eq!(options.headers.get("X-Test"), Some(&json!("ok")));
|
||||
|
||||
let confirmed = ReplayEndpointParams {
|
||||
confirm_unsafe: Some(true),
|
||||
..params
|
||||
};
|
||||
let options = replay_options_from_params(&unsafe_endpoint, &confirmed).unwrap();
|
||||
assert!(!options.dry_run);
|
||||
assert!(options.confirm_unsafe);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn replay_options_leave_safe_gets_executable_by_default() {
|
||||
let safe_endpoint = endpoint("GET", true, false);
|
||||
let params = ReplayEndpointParams {
|
||||
endpoint_id: safe_endpoint.id.clone(),
|
||||
params_json: None,
|
||||
dry_run: None,
|
||||
confirm_unsafe: None,
|
||||
headers: None,
|
||||
body_json: None,
|
||||
};
|
||||
|
||||
let options = replay_options_from_params(&safe_endpoint, ¶ms).unwrap();
|
||||
assert!(!options.dry_run);
|
||||
assert!(!options.confirm_unsafe);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn list_saved_captures_from_root_returns_metadata_with_capture_id() {
|
||||
let root = std::env::temp_dir().join(format!(
|
||||
"webclaw-mcp-list-captures-{}-{}",
|
||||
std::process::id(),
|
||||
std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_nanos()
|
||||
));
|
||||
let capture_dir = root.join("example.test").join("2026-05-16T12-00-00Z");
|
||||
fs::create_dir_all(&capture_dir).unwrap();
|
||||
fs::write(
|
||||
capture_dir.join("metadata.json"),
|
||||
serde_json::to_string(&json!({
|
||||
"source_url": "https://example.test",
|
||||
"endpoint_count": 2
|
||||
}))
|
||||
.unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let captures = list_saved_captures_from_root(&root).unwrap();
|
||||
fs::remove_dir_all(&root).ok();
|
||||
|
||||
assert_eq!(captures.len(), 1);
|
||||
assert_eq!(captures[0]["id"], "example.test/2026-05-16T12-00-00Z");
|
||||
assert_eq!(captures[0]["endpoint_count"], 2);
|
||||
assert!(
|
||||
captures[0]["capture_dir"]
|
||||
.as_str()
|
||||
.unwrap()
|
||||
.contains("example.test")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -104,6 +104,63 @@ pub struct SearchParams {
|
|||
pub num_results: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, JsonSchema)]
|
||||
#[allow(dead_code)]
|
||||
pub struct CaptureNetworkParams {
|
||||
/// URL to open in Chromium and capture network traffic from.
|
||||
pub url: String,
|
||||
/// Optional natural-language purpose for the capture.
|
||||
pub intent: Option<String>,
|
||||
/// Milliseconds to wait after navigation while collecting network events.
|
||||
pub wait_ms: Option<u64>,
|
||||
/// Run the browser in headed mode for debugging.
|
||||
pub headed: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, JsonSchema)]
|
||||
#[allow(dead_code)]
|
||||
pub struct DiscoverEndpointsParams {
|
||||
/// Saved capture id, for example `example.com/2026-05-16T12-00-00Z`.
|
||||
pub capture_id: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, JsonSchema)]
|
||||
#[allow(dead_code)]
|
||||
pub struct ShowEndpointParams {
|
||||
/// Learned endpoint id to load from saved captures.
|
||||
pub endpoint_id: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, JsonSchema)]
|
||||
#[allow(dead_code)]
|
||||
pub struct ReplayEndpointParams {
|
||||
/// Learned endpoint id to replay or preview.
|
||||
pub endpoint_id: String,
|
||||
/// Path/query parameter values to substitute into the learned endpoint.
|
||||
pub params_json: Option<serde_json::Value>,
|
||||
/// Preview the replay request without sending network traffic.
|
||||
pub dry_run: Option<bool>,
|
||||
/// Allow mutating methods such as POST, PUT, PATCH, and DELETE to execute.
|
||||
pub confirm_unsafe: Option<bool>,
|
||||
/// Additional non-secret request headers to include in the replay.
|
||||
pub headers: Option<std::collections::BTreeMap<String, String>>,
|
||||
/// JSON request body override for replay.
|
||||
pub body_json: Option<serde_json::Value>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, JsonSchema)]
|
||||
#[allow(dead_code)]
|
||||
pub struct ExportOpenApiParams {
|
||||
/// Saved capture id whose learned endpoints should be exported.
|
||||
pub capture_id: String,
|
||||
}
|
||||
|
||||
/// `list_captures` takes no arguments but uses a struct so rmcp can generate
|
||||
/// a schema and parse the empty JSON-RPC params.
|
||||
#[derive(Debug, Deserialize, JsonSchema)]
|
||||
#[allow(dead_code)]
|
||||
pub struct ListCapturesParams {}
|
||||
|
||||
/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
|
||||
#[derive(Debug, Deserialize, JsonSchema)]
|
||||
pub struct VerticalParams {
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ webclaw-core = { workspace = true }
|
|||
webclaw-fetch = { workspace = true }
|
||||
webclaw-llm = { workspace = true }
|
||||
webclaw-pdf = { workspace = true }
|
||||
webclaw-capture = { path = "../webclaw-capture" }
|
||||
|
||||
axum = { version = "0.8", features = ["macros"] }
|
||||
tokio = { workspace = true }
|
||||
|
|
|
|||
|
|
@ -95,8 +95,18 @@ async fn main() -> anyhow::Result<()> {
|
|||
.route("/crawl", post(routes::crawl::crawl))
|
||||
.route("/map", post(routes::map::map))
|
||||
.route("/batch", post(routes::batch::batch))
|
||||
.route("/capture-network", post(routes::capture::capture_network))
|
||||
.route(
|
||||
"/captures/{domain}/{timestamp}/endpoints",
|
||||
get(routes::capture::endpoints),
|
||||
)
|
||||
.route(
|
||||
"/captures/{domain}/{timestamp}/openapi",
|
||||
post(routes::capture::export_openapi),
|
||||
)
|
||||
.route("/extract", post(routes::extract::extract))
|
||||
.route("/extractors", get(routes::structured::list_extractors))
|
||||
.route("/replay-endpoint", post(routes::capture::replay_endpoint))
|
||||
.route("/summarize", post(routes::summarize::summarize_route))
|
||||
.route("/diff", post(routes::diff::diff_route))
|
||||
.route("/brand", post(routes::brand::brand))
|
||||
|
|
|
|||
283
crates/webclaw-server/src/routes/capture.rs
Normal file
283
crates/webclaw-server/src/routes/capture.rs
Normal file
|
|
@ -0,0 +1,283 @@
|
|||
use std::collections::BTreeMap;
|
||||
|
||||
use axum::{Json, extract::Path};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use webclaw_capture::cdp::{CaptureOptions, capture_network as run_network_capture};
|
||||
use webclaw_capture::openapi::write_openapi;
|
||||
use webclaw_capture::replay::replay_endpoint as run_endpoint_replay;
|
||||
use webclaw_capture::store::{find_endpoint, load_endpoints};
|
||||
use webclaw_capture::types::{
|
||||
CaptureError, EndpointDefinition, HeaderMap, ReplayOptions, ReplayResult,
|
||||
};
|
||||
|
||||
use crate::error::ApiError;
|
||||
|
||||
#[derive(Debug, Deserialize, Default)]
|
||||
#[serde(default)]
|
||||
pub struct CaptureNetworkRequest {
|
||||
pub url: String,
|
||||
pub intent: Option<String>,
|
||||
pub wait_ms: Option<u64>,
|
||||
pub headed: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Default)]
|
||||
#[serde(default)]
|
||||
pub struct ReplayEndpointRequest {
|
||||
pub endpoint_id: String,
|
||||
pub params_json: Option<Value>,
|
||||
pub dry_run: Option<bool>,
|
||||
pub confirm_unsafe: Option<bool>,
|
||||
pub headers: Option<BTreeMap<String, String>>,
|
||||
pub body_json: Option<Value>,
|
||||
}
|
||||
|
||||
pub async fn capture_network(
|
||||
Json(request): Json<CaptureNetworkRequest>,
|
||||
) -> Result<Json<Value>, ApiError> {
|
||||
if request.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
|
||||
let url = normalize_capture_url(&request.url)?;
|
||||
webclaw_fetch::url_security::validate_public_http_url(&url).await?;
|
||||
|
||||
let saved = run_network_capture(CaptureOptions {
|
||||
url,
|
||||
intent: request.intent,
|
||||
wait_ms: request.wait_ms.unwrap_or(3000),
|
||||
headed: request.headed.unwrap_or(false),
|
||||
})
|
||||
.await
|
||||
.map_err(|error| capture_error("capture-network failed", error))?;
|
||||
|
||||
Ok(Json(json!(saved)))
|
||||
}
|
||||
|
||||
pub async fn endpoints(
|
||||
Path((domain, timestamp)): Path<(String, String)>,
|
||||
) -> Result<Json<Vec<EndpointDefinition>>, ApiError> {
|
||||
let capture_id = capture_id_from_path(&domain, ×tamp)?;
|
||||
let endpoints = load_endpoints(&capture_id).map_err(|error| {
|
||||
capture_error(
|
||||
format!("could not load endpoints for capture id {capture_id}"),
|
||||
error,
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(Json(endpoints))
|
||||
}
|
||||
|
||||
pub async fn replay_endpoint(
|
||||
Json(request): Json<ReplayEndpointRequest>,
|
||||
) -> Result<Json<ReplayResult>, ApiError> {
|
||||
if request.endpoint_id.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`endpoint_id` is required"));
|
||||
}
|
||||
|
||||
let endpoint = find_endpoint(&request.endpoint_id).map_err(|error| {
|
||||
capture_error(
|
||||
format!("could not find endpoint id {}", request.endpoint_id),
|
||||
error,
|
||||
)
|
||||
})?;
|
||||
let options = replay_options_from_request(&endpoint, &request)?;
|
||||
let result = run_endpoint_replay(&endpoint, options)
|
||||
.await
|
||||
.map_err(|error| capture_error("replay-endpoint failed", error))?;
|
||||
|
||||
Ok(Json(result))
|
||||
}
|
||||
|
||||
pub async fn export_openapi(
|
||||
Path((domain, timestamp)): Path<(String, String)>,
|
||||
) -> Result<Json<Value>, ApiError> {
|
||||
let capture_id = capture_id_from_path(&domain, ×tamp)?;
|
||||
let path = write_openapi(&capture_id).map_err(|error| {
|
||||
capture_error(
|
||||
format!("could not export OpenAPI for capture id {capture_id}"),
|
||||
error,
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(Json(json!({ "path": path.display().to_string() })))
|
||||
}
|
||||
|
||||
fn normalize_capture_url(url: &str) -> Result<String, ApiError> {
|
||||
let trimmed = url.trim();
|
||||
if trimmed.is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
|
||||
let normalized = if let Some((scheme, _rest)) = trimmed.split_once("://") {
|
||||
if !matches!(scheme, "http" | "https") {
|
||||
return Err(ApiError::bad_request(format!(
|
||||
"capture-network only supports http and https URLs, got {scheme:?}"
|
||||
)));
|
||||
}
|
||||
trimmed.to_owned()
|
||||
} else {
|
||||
format!("https://{trimmed}")
|
||||
};
|
||||
|
||||
Ok(normalized)
|
||||
}
|
||||
|
||||
fn capture_id_from_path(domain: &str, timestamp: &str) -> Result<String, ApiError> {
|
||||
if !is_safe_capture_segment(domain) || !is_safe_capture_segment(timestamp) {
|
||||
return Err(ApiError::bad_request(
|
||||
"capture id contains an unsafe path segment",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(format!("{domain}/{timestamp}"))
|
||||
}
|
||||
|
||||
fn replay_options_from_request(
|
||||
endpoint: &EndpointDefinition,
|
||||
request: &ReplayEndpointRequest,
|
||||
) -> Result<ReplayOptions, ApiError> {
|
||||
if let Some(value) = &request.params_json
|
||||
&& !value.is_object()
|
||||
{
|
||||
return Err(ApiError::bad_request("`params_json` must be a JSON object"));
|
||||
}
|
||||
|
||||
let confirm_unsafe = request.confirm_unsafe.unwrap_or(false);
|
||||
let default_dry_run = endpoint_defaults_to_dry_run(endpoint) && !confirm_unsafe;
|
||||
|
||||
Ok(ReplayOptions {
|
||||
dry_run: request.dry_run.unwrap_or(false) || default_dry_run,
|
||||
confirm_unsafe,
|
||||
params_json: request.params_json.clone(),
|
||||
headers: header_map_from_strings(request.headers.as_ref()),
|
||||
body_json: request.body_json.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
fn endpoint_defaults_to_dry_run(endpoint: &EndpointDefinition) -> bool {
|
||||
endpoint.safety.requires_confirmation
|
||||
|| !endpoint.safety.safe_to_replay
|
||||
|| !matches!(
|
||||
endpoint.method.to_ascii_uppercase().as_str(),
|
||||
"GET" | "HEAD" | "OPTIONS"
|
||||
)
|
||||
}
|
||||
|
||||
fn header_map_from_strings(headers: Option<&BTreeMap<String, String>>) -> HeaderMap {
|
||||
headers
|
||||
.into_iter()
|
||||
.flat_map(|headers| headers.iter())
|
||||
.map(|(name, value)| (name.clone(), Value::String(value.clone())))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Checks that one capture-id path segment is safe to join into a capture id.
///
/// Rejected: the empty string, `.`, `..`, and anything containing `:`, `/`
/// or `\`. Everything else is accepted.
fn is_safe_capture_segment(segment: &str) -> bool {
    match segment {
        "" | "." | ".." => false,
        other => !other.chars().any(|c| matches!(c, ':' | '/' | '\\')),
    }
}
|
||||
|
||||
fn capture_error(context: impl Into<String>, error: CaptureError) -> ApiError {
|
||||
let context = context.into();
|
||||
match error {
|
||||
CaptureError::InvalidUrl(_) | CaptureError::Replay(_) | CaptureError::Storage(_) => {
|
||||
ApiError::bad_request(format!("{context}: {error}"))
|
||||
}
|
||||
CaptureError::EndpointNotFound(_) => ApiError::NotFound,
|
||||
CaptureError::Request(_) | CaptureError::Capture(_) => ApiError::Fetch(error.to_string()),
|
||||
CaptureError::Io(_) | CaptureError::Json(_) => ApiError::Internal(error.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use serde_json::json;
|
||||
use webclaw_capture::types::{EndpointDefinition, EndpointSafety};
|
||||
|
||||
use super::*;
|
||||
|
||||
fn endpoint(
|
||||
method: &str,
|
||||
safe_to_replay: bool,
|
||||
requires_confirmation: bool,
|
||||
) -> EndpointDefinition {
|
||||
EndpointDefinition {
|
||||
id: format!("{}_example", method.to_ascii_lowercase()),
|
||||
method: method.to_owned(),
|
||||
origin: "https://example.test".to_owned(),
|
||||
path_template: "/api/items".to_owned(),
|
||||
query_params: BTreeMap::new(),
|
||||
request_schema: None,
|
||||
response_schema: None,
|
||||
auth_evidence: Vec::new(),
|
||||
safety: EndpointSafety {
|
||||
safe_to_replay,
|
||||
requires_confirmation,
|
||||
reason: "test".to_owned(),
|
||||
},
|
||||
examples: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn capture_id_from_path_joins_domain_timestamp_and_rejects_unsafe_segments() {
|
||||
assert_eq!(
|
||||
capture_id_from_path("example.test", "2026-05-16T12-00-00Z").unwrap(),
|
||||
"example.test/2026-05-16T12-00-00Z"
|
||||
);
|
||||
|
||||
assert!(capture_id_from_path("..", "2026-05-16T12-00-00Z").is_err());
|
||||
assert!(capture_id_from_path("example.test", "..").is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn replay_request_defaults_unsafe_methods_to_dry_run_unless_confirmed() {
|
||||
let unsafe_endpoint = endpoint("POST", false, true);
|
||||
let request = ReplayEndpointRequest {
|
||||
endpoint_id: unsafe_endpoint.id.clone(),
|
||||
params_json: Some(json!({"id": "123"})),
|
||||
dry_run: None,
|
||||
confirm_unsafe: None,
|
||||
headers: Some(BTreeMap::from([("X-Test".to_owned(), "ok".to_owned())])),
|
||||
body_json: Some(json!({"name": "tool"})),
|
||||
};
|
||||
|
||||
let options = replay_options_from_request(&unsafe_endpoint, &request).unwrap();
|
||||
assert!(options.dry_run);
|
||||
assert!(!options.confirm_unsafe);
|
||||
assert_eq!(options.params_json, Some(json!({"id": "123"})));
|
||||
assert_eq!(options.headers.get("X-Test"), Some(&json!("ok")));
|
||||
assert_eq!(options.body_json, Some(json!({"name": "tool"})));
|
||||
|
||||
let confirmed = ReplayEndpointRequest {
|
||||
confirm_unsafe: Some(true),
|
||||
..request
|
||||
};
|
||||
let options = replay_options_from_request(&unsafe_endpoint, &confirmed).unwrap();
|
||||
assert!(!options.dry_run);
|
||||
assert!(options.confirm_unsafe);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn replay_request_rejects_non_object_params_json() {
|
||||
let safe_endpoint = endpoint("GET", true, false);
|
||||
let request = ReplayEndpointRequest {
|
||||
endpoint_id: safe_endpoint.id.clone(),
|
||||
params_json: Some(json!(["not", "an", "object"])),
|
||||
dry_run: None,
|
||||
confirm_unsafe: None,
|
||||
headers: None,
|
||||
body_json: None,
|
||||
};
|
||||
|
||||
let error = replay_options_from_request(&safe_endpoint, &request).unwrap_err();
|
||||
assert!(error.to_string().contains("params_json"));
|
||||
}
|
||||
}
|
||||
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
pub mod batch;
|
||||
pub mod brand;
|
||||
pub mod capture;
|
||||
pub mod crawl;
|
||||
pub mod diff;
|
||||
pub mod extract;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue