mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-15 23:35:14 +02:00
216 lines
6 KiB
Rust
216 lines
6 KiB
Rust
use chrono::{TimeZone, Utc};
|
|
use serde_json::{Map, Value, json};
|
|
use webclaw_capture::classify::{classify_exchange, filter_api_exchanges};
|
|
use webclaw_capture::types::CapturedExchange;
|
|
|
|
fn headers(entries: &[(&str, &str)]) -> Map<String, Value> {
|
|
entries
|
|
.iter()
|
|
.map(|(name, value)| ((*name).to_owned(), Value::String((*value).to_owned())))
|
|
.collect()
|
|
}
|
|
|
|
fn exchange(url: &str) -> CapturedExchange {
|
|
CapturedExchange {
|
|
method: "GET".to_owned(),
|
|
url: url.to_owned(),
|
|
request_headers: Map::new(),
|
|
request_body_sample: None,
|
|
resource_type: Some("document".to_owned()),
|
|
status: 200,
|
|
response_headers: Map::new(),
|
|
response_body_sample: None,
|
|
started_at: Utc.with_ymd_and_hms(2026, 5, 16, 12, 0, 0).unwrap(),
|
|
duration_ms: 25,
|
|
redirect_chain: Vec::new(),
|
|
}
|
|
}
|
|
|
|
fn with_resource_type(mut exchange: CapturedExchange, resource_type: &str) -> CapturedExchange {
|
|
exchange.resource_type = Some(resource_type.to_owned());
|
|
exchange
|
|
}
|
|
|
|
fn with_response_header(
|
|
mut exchange: CapturedExchange,
|
|
name: &str,
|
|
value: &str,
|
|
) -> CapturedExchange {
|
|
exchange.response_headers = headers(&[(name, value)]);
|
|
exchange
|
|
}
|
|
|
|
fn with_request_body(mut exchange: CapturedExchange, body: serde_json::Value) -> CapturedExchange {
|
|
exchange.method = "POST".to_owned();
|
|
exchange.request_headers = headers(&[("Content-Type", "application/json")]);
|
|
exchange.request_body_sample = Some(body.to_string());
|
|
exchange
|
|
}
|
|
|
|
fn assert_included(exchange: &CapturedExchange, label: &str) {
|
|
let classification = classify_exchange(exchange);
|
|
|
|
assert!(
|
|
classification.include,
|
|
"{label} should be included, got {classification:?}"
|
|
);
|
|
assert!(
|
|
classification.confidence >= 0.5,
|
|
"{label} should have useful confidence, got {classification:?}"
|
|
);
|
|
assert!(
|
|
!classification.reasons.is_empty(),
|
|
"{label} should explain why it was classified as API traffic"
|
|
);
|
|
}
|
|
|
|
fn assert_excluded(exchange: &CapturedExchange, label: &str) {
|
|
let classification = classify_exchange(exchange);
|
|
|
|
assert!(
|
|
!classification.include,
|
|
"{label} should be excluded, got {classification:?}"
|
|
);
|
|
assert!(
|
|
classification.confidence <= 0.5,
|
|
"{label} should not look like confident API traffic, got {classification:?}"
|
|
);
|
|
assert!(
|
|
!classification.reasons.is_empty(),
|
|
"{label} should explain why it was excluded"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn includes_fetch_and_xhr_resource_types() {
|
|
let cases = [
|
|
with_resource_type(exchange("https://example.test/products"), "fetch"),
|
|
with_resource_type(exchange("https://example.test/products"), "xhr"),
|
|
];
|
|
|
|
for case in cases {
|
|
assert_included(
|
|
&case,
|
|
case.resource_type
|
|
.as_deref()
|
|
.expect("resource type should be set"),
|
|
);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn includes_json_responses() {
|
|
let case = with_response_header(
|
|
exchange("https://example.test/products"),
|
|
"Content-Type",
|
|
"application/json; charset=utf-8",
|
|
);
|
|
|
|
assert_included(&case, "JSON response");
|
|
}
|
|
|
|
#[test]
|
|
fn includes_common_api_path_prefixes() {
|
|
let cases = [
|
|
exchange("https://example.test/api/products"),
|
|
exchange("https://example.test/v1/products"),
|
|
exchange("https://example.test/v2/products"),
|
|
];
|
|
|
|
for case in cases {
|
|
assert_included(&case, &case.url);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn includes_graphql_paths() {
|
|
let case = exchange("https://example.test/graphql");
|
|
|
|
assert_included(&case, "GraphQL path");
|
|
}
|
|
|
|
#[test]
|
|
fn includes_graphql_request_bodies() {
|
|
let case = with_request_body(
|
|
exchange("https://example.test/query"),
|
|
json!({
|
|
"operationName": "Products",
|
|
"query": "query Products { products { id name } }",
|
|
"variables": {
|
|
"first": 25
|
|
}
|
|
}),
|
|
);
|
|
|
|
assert_included(&case, "GraphQL request body");
|
|
}
|
|
|
|
#[test]
|
|
fn excludes_static_assets_by_extension() {
|
|
let cases = [
|
|
exchange("https://example.test/static/logo.png"),
|
|
exchange("https://example.test/static/photo.jpg"),
|
|
exchange("https://example.test/static/icon.svg"),
|
|
exchange("https://example.test/static/site.css"),
|
|
exchange("https://example.test/static/app.js"),
|
|
exchange("https://example.test/static/font.woff2"),
|
|
exchange("https://example.test/static/app.js.map"),
|
|
];
|
|
|
|
for case in cases {
|
|
assert_excluded(&case, &case.url);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn excludes_tracking_hosts() {
|
|
let cases = [
|
|
with_response_header(
|
|
exchange("https://www.google-analytics.com/g/collect?v=2"),
|
|
"Content-Type",
|
|
"application/json",
|
|
),
|
|
with_response_header(
|
|
exchange("https://ads.doubleclick.net/pagead/id"),
|
|
"Content-Type",
|
|
"application/json",
|
|
),
|
|
with_response_header(
|
|
exchange("https://telemetry.example.test/v1/events"),
|
|
"Content-Type",
|
|
"application/json",
|
|
),
|
|
];
|
|
|
|
for case in cases {
|
|
assert_excluded(&case, &case.url);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn excludes_browser_extension_urls() {
|
|
let cases = [
|
|
with_resource_type(exchange("chrome-extension://abcdef/options.html"), "fetch"),
|
|
with_resource_type(exchange("moz-extension://abcdef/options.html"), "xhr"),
|
|
];
|
|
|
|
for case in cases {
|
|
assert_excluded(&case, &case.url);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn filter_api_exchanges_returns_only_included_traffic() {
|
|
let api = exchange("https://example.test/api/products");
|
|
let asset = exchange("https://example.test/static/app.js");
|
|
let tracking = with_response_header(
|
|
exchange("https://telemetry.example.test/v1/events"),
|
|
"Content-Type",
|
|
"application/json",
|
|
);
|
|
let exchanges = vec![api.clone(), asset, tracking];
|
|
|
|
let filtered = filter_api_exchanges(&exchanges);
|
|
|
|
assert_eq!(filtered, vec![api]);
|
|
}
|