mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
fix: v0.1.7 — extraction options now work in batch mode (#3)
`--only-main-content`, `--include`, and `--exclude` were ignored in batch mode because `run_batch` used the default `ExtractionOptions`. Added `fetch_and_extract_batch_with_options` to pass the CLI options through. Closes #3. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1b8dfb77a6
commit
0e4128782a
5 changed files with 38 additions and 9 deletions
|
|
@ -3,6 +3,13 @@
|
|||
All notable changes to webclaw are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.1.7] — 2026-03-26
|
||||
|
||||
### Fixed
|
||||
- `--only-main-content`, `--include`, and `--exclude` now work in batch mode (#3)
|
||||
|
||||
---
|
||||
|
||||
## [0.1.6] — 2026-03-26
|
||||
|
||||
### Added
|
||||
|
|
|
|||
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -2881,7 +2881,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.1.6"
|
||||
version = "0.1.7"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -2901,7 +2901,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.1.6"
|
||||
version = "0.1.7"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -2919,7 +2919,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.1.6"
|
||||
version = "0.1.7"
|
||||
dependencies = [
|
||||
"primp",
|
||||
"quick-xml",
|
||||
|
|
@ -2937,7 +2937,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.1.6"
|
||||
version = "0.1.7"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -2950,7 +2950,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.1.6"
|
||||
version = "0.1.7"
|
||||
dependencies = [
|
||||
"dotenvy",
|
||||
"reqwest",
|
||||
|
|
@ -2970,7 +2970,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.1.6"
|
||||
version = "0.1.7"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.1.6"
|
||||
version = "0.1.7"
|
||||
edition = "2024"
|
||||
license = "MIT"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -1248,7 +1248,10 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
|
|||
);
|
||||
|
||||
let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect();
|
||||
let results = client.fetch_and_extract_batch(&urls, cli.concurrency).await;
|
||||
let options = build_extraction_options(cli);
|
||||
let results = client
|
||||
.fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options)
|
||||
.await;
|
||||
|
||||
let ok = results.iter().filter(|r| r.result.is_ok()).count();
|
||||
let errors = results.len() - ok;
|
||||
|
|
|
|||
|
|
@ -465,6 +465,24 @@ impl FetchClient {
|
|||
self: &Arc<Self>,
|
||||
urls: &[&str],
|
||||
concurrency: usize,
|
||||
) -> Vec<BatchExtractResult> {
|
||||
self.fetch_and_extract_batch_with_options(
|
||||
urls,
|
||||
concurrency,
|
||||
&webclaw_core::ExtractionOptions::default(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Fetch and extract multiple URLs concurrently with custom extraction options.
|
||||
///
|
||||
/// Same as [`Self::fetch_and_extract_batch`] but applies the given options
|
||||
/// (include/exclude selectors, only-main-content, etc.) to each extraction.
|
||||
pub async fn fetch_and_extract_batch_with_options(
|
||||
self: &Arc<Self>,
|
||||
urls: &[&str],
|
||||
concurrency: usize,
|
||||
options: &webclaw_core::ExtractionOptions,
|
||||
) -> Vec<BatchExtractResult> {
|
||||
let semaphore = Arc::new(Semaphore::new(concurrency));
|
||||
let mut handles = Vec::with_capacity(urls.len());
|
||||
|
|
@ -473,10 +491,11 @@ impl FetchClient {
|
|||
let permit = Arc::clone(&semaphore);
|
||||
let client = Arc::clone(self);
|
||||
let url = url.to_string();
|
||||
let opts = options.clone();
|
||||
|
||||
handles.push(tokio::spawn(async move {
|
||||
let _permit = permit.acquire().await.expect("semaphore closed");
|
||||
let result = client.fetch_and_extract(&url).await;
|
||||
let result = client.fetch_and_extract_with_options(&url, &opts).await;
|
||||
(idx, BatchExtractResult { url, result })
|
||||
}));
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue