mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-29 03:39:37 +02:00
feat: add --cookie-file support for JSON cookie files
- --cookie-file reads Chrome extension format ([{name, value, domain, ...}])
- Works with EditThisCookie, Cookie-Editor, and similar browser extensions
- Merges with --cookie when both are provided
- MCP scrape tool now accepts a `cookies` parameter
- Closes #7
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
44f23332cc
commit
da1d76c97a
4 changed files with 71 additions and 10 deletions
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -3072,7 +3072,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.3.0"
|
version = "0.3.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3092,7 +3092,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.3.0"
|
version = "0.3.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3110,7 +3110,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.3.0"
|
version = "0.3.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"calamine",
|
"calamine",
|
||||||
"http",
|
"http",
|
||||||
|
|
@ -3148,7 +3148,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.3.0"
|
version = "0.3.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest 0.12.28",
|
"reqwest 0.12.28",
|
||||||
|
|
@ -3161,7 +3161,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.3.0"
|
version = "0.3.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
"reqwest 0.12.28",
|
"reqwest 0.12.28",
|
||||||
|
|
@ -3181,7 +3181,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.3.0"
|
version = "0.3.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
|
||||||
|
|
@ -151,6 +151,10 @@ struct Cli {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
cookie: Option<String>,
|
cookie: Option<String>,
|
||||||
|
|
||||||
|
/// JSON cookie file (Chrome extension format: [{name, value, domain, ...}])
|
||||||
|
#[arg(long)]
|
||||||
|
cookie_file: Option<String>,
|
||||||
|
|
||||||
/// Enable verbose logging
|
/// Enable verbose logging
|
||||||
#[arg(short, long)]
|
#[arg(short, long)]
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
|
|
@ -371,6 +375,24 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
|
||||||
headers.insert("Cookie".to_string(), cookie.clone());
|
headers.insert("Cookie".to_string(), cookie.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --cookie-file: parse JSON array of {name, value, domain, ...}
|
||||||
|
if let Some(ref path) = cli.cookie_file {
|
||||||
|
match parse_cookie_file(path) {
|
||||||
|
Ok(cookie_str) => {
|
||||||
|
// Merge with existing cookies if --cookie was also provided
|
||||||
|
if let Some(existing) = headers.get("Cookie") {
|
||||||
|
headers.insert("Cookie".to_string(), format!("{existing}; {cookie_str}"));
|
||||||
|
} else {
|
||||||
|
headers.insert("Cookie".to_string(), cookie_str);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("error: failed to parse cookie file: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
FetchConfig {
|
FetchConfig {
|
||||||
browser: cli.browser.clone().into(),
|
browser: cli.browser.clone().into(),
|
||||||
proxy,
|
proxy,
|
||||||
|
|
@ -382,6 +404,29 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Parse a JSON cookie file (Chrome extension format) into a Cookie header string.
|
||||||
|
/// Supports: [{name, value, domain, path, secure, httpOnly, expirationDate, ...}]
|
||||||
|
fn parse_cookie_file(path: &str) -> Result<String, String> {
|
||||||
|
let content = std::fs::read_to_string(path).map_err(|e| format!("cannot read {path}: {e}"))?;
|
||||||
|
let cookies: Vec<serde_json::Value> =
|
||||||
|
serde_json::from_str(&content).map_err(|e| format!("invalid JSON: {e}"))?;
|
||||||
|
|
||||||
|
let pairs: Vec<String> = cookies
|
||||||
|
.iter()
|
||||||
|
.filter_map(|c| {
|
||||||
|
let name = c.get("name")?.as_str()?;
|
||||||
|
let value = c.get("value")?.as_str()?;
|
||||||
|
Some(format!("{name}={value}"))
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if pairs.is_empty() {
|
||||||
|
return Err("no cookies found in file".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(pairs.join("; "))
|
||||||
|
}
|
||||||
|
|
||||||
fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
|
fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
|
||||||
ExtractionOptions {
|
ExtractionOptions {
|
||||||
include_selectors: cli
|
include_selectors: cli
|
||||||
|
|
|
||||||
|
|
@ -139,19 +139,33 @@ impl WebclawMcp {
|
||||||
let exclude = params.exclude_selectors.unwrap_or_default();
|
let exclude = params.exclude_selectors.unwrap_or_default();
|
||||||
let main_only = params.only_main_content.unwrap_or(false);
|
let main_only = params.only_main_content.unwrap_or(false);
|
||||||
|
|
||||||
// Use a custom client if a non-default browser is requested
|
// Build cookie header from params
|
||||||
|
let cookie_header = params
|
||||||
|
.cookies
|
||||||
|
.as_ref()
|
||||||
|
.filter(|c| !c.is_empty())
|
||||||
|
.map(|c| c.join("; "));
|
||||||
|
|
||||||
|
// Use a custom client if non-default browser or cookies are provided
|
||||||
let is_default_browser = matches!(browser, webclaw_fetch::BrowserProfile::Chrome);
|
let is_default_browser = matches!(browser, webclaw_fetch::BrowserProfile::Chrome);
|
||||||
|
let needs_custom = !is_default_browser || cookie_header.is_some();
|
||||||
let custom_client;
|
let custom_client;
|
||||||
let client: &webclaw_fetch::FetchClient = if is_default_browser {
|
let client: &webclaw_fetch::FetchClient = if needs_custom {
|
||||||
&self.fetch_client
|
let mut headers = std::collections::HashMap::new();
|
||||||
} else {
|
headers.insert("Accept-Language".to_string(), "en-US,en;q=0.9".to_string());
|
||||||
|
if let Some(ref cookies) = cookie_header {
|
||||||
|
headers.insert("Cookie".to_string(), cookies.clone());
|
||||||
|
}
|
||||||
let config = webclaw_fetch::FetchConfig {
|
let config = webclaw_fetch::FetchConfig {
|
||||||
browser,
|
browser,
|
||||||
|
headers,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
custom_client = webclaw_fetch::FetchClient::new(config)
|
custom_client = webclaw_fetch::FetchClient::new(config)
|
||||||
.map_err(|e| format!("Failed to build client: {e}"))?;
|
.map_err(|e| format!("Failed to build client: {e}"))?;
|
||||||
&custom_client
|
&custom_client
|
||||||
|
} else {
|
||||||
|
&self.fetch_client
|
||||||
};
|
};
|
||||||
|
|
||||||
let formats = [format];
|
let formats = [format];
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,8 @@ pub struct ScrapeParams {
|
||||||
pub only_main_content: Option<bool>,
|
pub only_main_content: Option<bool>,
|
||||||
/// Browser profile: "chrome" (default), "firefox", or "random"
|
/// Browser profile: "chrome" (default), "firefox", or "random"
|
||||||
pub browser: Option<String>,
|
pub browser: Option<String>,
|
||||||
|
/// Cookies to send with the request (e.g. ["name=value", "session=abc123"])
|
||||||
|
pub cookies: Option<Vec<String>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, JsonSchema)]
|
#[derive(Debug, Deserialize, JsonSchema)]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue