From 0e4128782a937960adac3219cc0fc78f6a6d10b0 Mon Sep 17 00:00:00 2001 From: Valerio Date: Thu, 26 Mar 2026 13:30:20 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20v0.1.7=20=E2=80=94=20extraction=20option?= =?UTF-8?q?s=20now=20work=20in=20batch=20mode=20(#3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --only-main-content, --include, and --exclude were ignored in batch mode because run_batch used default ExtractionOptions. Added fetch_and_extract_batch_with_options to pass CLI options through. Closes #3 Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 7 +++++++ Cargo.lock | 12 ++++++------ Cargo.toml | 2 +- crates/webclaw-cli/src/main.rs | 5 ++++- crates/webclaw-fetch/src/client.rs | 21 ++++++++++++++++++++- 5 files changed, 38 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8073ad3..e263948 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). 
+## [0.1.7] — 2026-03-26 + +### Fixed +- `--only-main-content`, `--include`, and `--exclude` now work in batch mode (#3) + +--- + ## [0.1.6] — 2026-03-26 ### Added diff --git a/Cargo.lock b/Cargo.lock index 4d1f40c..0c8c50d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2881,7 +2881,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.1.6" +version = "0.1.7" dependencies = [ "clap", "dotenvy", @@ -2901,7 +2901,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.1.6" +version = "0.1.7" dependencies = [ "ego-tree", "once_cell", @@ -2919,7 +2919,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.1.6" +version = "0.1.7" dependencies = [ "primp", "quick-xml", @@ -2937,7 +2937,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.1.6" +version = "0.1.7" dependencies = [ "async-trait", "reqwest", @@ -2950,7 +2950,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.1.6" +version = "0.1.7" dependencies = [ "dotenvy", "reqwest", @@ -2970,7 +2970,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.1.6" +version = "0.1.7" dependencies = [ "pdf-extract", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index 84f149a..40eada1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.1.6" +version = "0.1.7" edition = "2024" license = "MIT" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index d4c92a7..4aa8a7f 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -1248,7 +1248,10 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option)]) -> Result<() ); let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect(); - let results = client.fetch_and_extract_batch(&urls, cli.concurrency).await; + let options = build_extraction_options(cli); + let results = client + 
.fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options) + .await; let ok = results.iter().filter(|r| r.result.is_ok()).count(); let errors = results.len() - ok; diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index ef6c249..4af675e 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -465,6 +465,24 @@ impl FetchClient { self: &Arc<Self>, urls: &[&str], concurrency: usize, + ) -> Vec<BatchExtractResult> { + self.fetch_and_extract_batch_with_options( + urls, + concurrency, + &webclaw_core::ExtractionOptions::default(), + ) + .await + } + + /// Fetch and extract multiple URLs concurrently with custom extraction options. + /// + /// Same as [`fetch_and_extract_batch`] but applies the given options + /// (include/exclude selectors, only-main-content, etc.) to each extraction. + pub async fn fetch_and_extract_batch_with_options( + self: &Arc<Self>, + urls: &[&str], + concurrency: usize, + options: &webclaw_core::ExtractionOptions, ) -> Vec<BatchExtractResult> { let semaphore = Arc::new(Semaphore::new(concurrency)); let mut handles = Vec::with_capacity(urls.len()); @@ -473,10 +491,11 @@ impl FetchClient { let permit = Arc::clone(&semaphore); let client = Arc::clone(self); let url = url.to_string(); + let opts = options.clone(); handles.push(tokio::spawn(async move { let _permit = permit.acquire().await.expect("semaphore closed"); - let result = client.fetch_and_extract(&url).await; + let result = client.fetch_and_extract_with_options(&url, &opts).await; (idx, BatchExtractResult { url, result }) })); }