diff --git a/Cargo.lock b/Cargo.lock index d202937..c5eb6f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3072,7 +3072,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.3.0" +version = "0.3.1" dependencies = [ "clap", "dotenvy", @@ -3092,7 +3092,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.3.0" +version = "0.3.1" dependencies = [ "ego-tree", "once_cell", @@ -3110,7 +3110,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.3.0" +version = "0.3.1" dependencies = [ "calamine", "http", @@ -3148,7 +3148,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.3.0" +version = "0.3.1" dependencies = [ "async-trait", "reqwest 0.12.28", @@ -3161,7 +3161,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.3.0" +version = "0.3.1" dependencies = [ "dotenvy", "reqwest 0.12.28", @@ -3181,7 +3181,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.3.0" +version = "0.3.1" dependencies = [ "pdf-extract", "thiserror", diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index f58c68b..24609e3 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -151,6 +151,10 @@ struct Cli { #[arg(long)] cookie: Option<String>, + /// JSON cookie file (Chrome extension format: [{name, value, domain, ...}]) + #[arg(long)] + cookie_file: Option<String>, + /// Enable verbose logging #[arg(short, long)] verbose: bool, @@ -371,6 +375,24 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig { headers.insert("Cookie".to_string(), cookie.clone()); } + // --cookie-file: parse JSON array of {name, value, domain, ...} + if let Some(ref path) = cli.cookie_file { + match parse_cookie_file(path) { + Ok(cookie_str) => { + // Merge with existing cookies if --cookie was also provided + if let Some(existing) = headers.get("Cookie") { + headers.insert("Cookie".to_string(), format!("{existing}; {cookie_str}")); + } else { + headers.insert("Cookie".to_string(), cookie_str); + } +
} + Err(e) => { + eprintln!("error: failed to parse cookie file: {e}"); + process::exit(1); + } + } + } + FetchConfig { browser: cli.browser.clone().into(), proxy, @@ -382,6 +404,29 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig { } } +/// Parse a JSON cookie file (Chrome extension format) into a Cookie header string. +/// Supports: [{name, value, domain, path, secure, httpOnly, expirationDate, ...}] +fn parse_cookie_file(path: &str) -> Result<String, String> { + let content = std::fs::read_to_string(path).map_err(|e| format!("cannot read {path}: {e}"))?; + let cookies: Vec<serde_json::Value> = + serde_json::from_str(&content).map_err(|e| format!("invalid JSON: {e}"))?; + + let pairs: Vec<String> = cookies + .iter() + .filter_map(|c| { + let name = c.get("name")?.as_str()?; + let value = c.get("value")?.as_str()?; + Some(format!("{name}={value}")) + }) + .collect(); + + if pairs.is_empty() { + return Err("no cookies found in file".to_string()); + } + + Ok(pairs.join("; ")) +} + fn build_extraction_options(cli: &Cli) -> ExtractionOptions { ExtractionOptions { include_selectors: cli diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs index 1290f44..607bf07 100644 --- a/crates/webclaw-mcp/src/server.rs +++ b/crates/webclaw-mcp/src/server.rs @@ -139,19 +139,33 @@ impl WebclawMcp { let exclude = params.exclude_selectors.unwrap_or_default(); let main_only = params.only_main_content.unwrap_or(false); - // Use a custom client if a non-default browser is requested + // Build cookie header from params + let cookie_header = params + .cookies + .as_ref() + .filter(|c| !c.is_empty()) + .map(|c| c.join("; ")); + + // Use a custom client if non-default browser or cookies are provided let is_default_browser = matches!(browser, webclaw_fetch::BrowserProfile::Chrome); + let needs_custom = !is_default_browser || cookie_header.is_some(); let custom_client; - let client: &webclaw_fetch::FetchClient = if is_default_browser { - &self.fetch_client - } else { + let client:
&webclaw_fetch::FetchClient = if needs_custom { + let mut headers = std::collections::HashMap::new(); + headers.insert("Accept-Language".to_string(), "en-US,en;q=0.9".to_string()); + if let Some(ref cookies) = cookie_header { + headers.insert("Cookie".to_string(), cookies.clone()); + } let config = webclaw_fetch::FetchConfig { browser, + headers, ..Default::default() }; custom_client = webclaw_fetch::FetchClient::new(config) .map_err(|e| format!("Failed to build client: {e}"))?; &custom_client + } else { + &self.fetch_client }; let formats = [format]; diff --git a/crates/webclaw-mcp/src/tools.rs b/crates/webclaw-mcp/src/tools.rs index b3ae1e2..e0195f1 100644 --- a/crates/webclaw-mcp/src/tools.rs +++ b/crates/webclaw-mcp/src/tools.rs @@ -18,6 +18,8 @@ pub struct ScrapeParams { pub only_main_content: Option<bool>, /// Browser profile: "chrome" (default), "firefox", or "random" pub browser: Option<String>, + /// Cookies to send with the request (e.g. ["name=value", "session=abc123"]) + pub cookies: Option<Vec<String>>, } #[derive(Debug, Deserialize, JsonSchema)]