feat: v0.1.6 — watch mode, webhooks (Discord/Slack auto-format)

Watch mode:
- --watch polls a URL at --watch-interval (default 5min)
- Reports diffs to stdout when content changes
- --on-change runs a command with diff JSON on stdin
- Ctrl+C stops cleanly

Webhooks:
- --webhook POSTs JSON on crawl/batch complete and watch changes
- Auto-detects Discord and Slack URLs, formats as embeds/blocks
- Also available via WEBCLAW_WEBHOOK_URL env var
- Non-blocking, errors logged to stderr

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-03-26 12:30:08 +01:00
parent e5649e1824
commit 1b8dfb77a6
4 changed files with 244 additions and 8 deletions

View file

@ -12,7 +12,8 @@ use std::sync::atomic::{AtomicBool, Ordering};
use clap::{Parser, ValueEnum};
use tracing_subscriber::EnvFilter;
use webclaw_core::{
ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options, to_llm_text,
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
to_llm_text,
};
use webclaw_fetch::{
BatchExtractResult, BrowserProfile, CrawlConfig, CrawlResult, Crawler, FetchClient,
@ -158,6 +159,23 @@ struct Cli {
#[arg(long)]
diff_with: Option<String>,
/// Watch a URL for changes. Checks at the specified interval and reports diffs.
#[arg(long)]
watch: bool,
/// Watch interval in seconds [default: 300]
#[arg(long, default_value = "300")]
watch_interval: u64,
/// Command to run when changes are detected (receives diff JSON on stdin)
#[arg(long)]
on_change: Option<String>,
/// Webhook URL: POST a JSON payload when an operation completes.
/// Works with crawl, batch, watch (on change), and single URL modes.
#[arg(long, env = "WEBCLAW_WEBHOOK_URL")]
webhook: Option<String>,
/// Extract brand identity (colors, fonts, logo)
#[arg(long)]
brand: bool,
@ -1171,6 +1189,24 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
result.total, result.ok, result.errors, result.elapsed_secs,
);
// Fire webhook on crawl complete
if let Some(ref webhook_url) = cli.webhook {
let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect();
fire_webhook(
webhook_url,
&serde_json::json!({
"event": "crawl_complete",
"total": result.total,
"ok": result.ok,
"errors": result.errors,
"elapsed_secs": result.elapsed_secs,
"urls": urls,
}),
);
// Brief pause so the async webhook has time to fire
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
if result.errors > 0 {
Err(format!(
"{} of {} pages failed",
@ -1260,6 +1296,22 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
errors
);
// Fire webhook on batch complete
if let Some(ref webhook_url) = cli.webhook {
let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect();
fire_webhook(
webhook_url,
&serde_json::json!({
"event": "batch_complete",
"total": results.len(),
"ok": ok,
"errors": errors,
"urls": urls,
}),
);
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
if errors > 0 {
Err(format!("{errors} of {} URLs failed", results.len()))
} else {
@ -1267,6 +1319,171 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
}
}
fn timestamp() -> String {
let now = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
let hours = (now % 86400) / 3600;
let minutes = (now % 3600) / 60;
let seconds = now % 60;
format!("{hours:02}:{minutes:02}:{seconds:02}")
}
/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr.
/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly.
fn fire_webhook(url: &str, payload: &serde_json::Value) {
let url = url.to_string();
let is_discord = url.contains("discord.com/api/webhooks");
let is_slack = url.contains("hooks.slack.com");
let body = if is_discord {
let event = payload
.get("event")
.and_then(|v| v.as_str())
.unwrap_or("notification");
let details = serde_json::to_string_pretty(payload).unwrap_or_default();
serde_json::json!({
"embeds": [{
"title": format!("webclaw: {event}"),
"description": format!("```json\n{details}\n```"),
"color": 5814783
}]
})
.to_string()
} else if is_slack {
let event = payload
.get("event")
.and_then(|v| v.as_str())
.unwrap_or("notification");
let details = serde_json::to_string_pretty(payload).unwrap_or_default();
serde_json::json!({
"text": format!("*webclaw: {event}*\n```{details}```")
})
.to_string()
} else {
serde_json::to_string(payload).unwrap_or_default()
};
tokio::spawn(async move {
match reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(10))
.build()
{
Ok(c) => match c
.post(&url)
.header("Content-Type", "application/json")
.body(body)
.send()
.await
{
Ok(resp) => {
eprintln!(
"[webhook] POST {} -> {}",
&url[..url.len().min(60)],
resp.status()
);
}
Err(e) => eprintln!("[webhook] POST failed: {e}"),
},
Err(e) => eprintln!("[webhook] client error: {e}"),
}
});
}
async fn run_watch(cli: &Cli) -> Result<(), String> {
let raw_url = cli.urls.first().ok_or("--watch requires a URL argument")?;
let url = normalize_url(raw_url);
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli);
// Initial snapshot
let mut previous = client
.fetch_and_extract_with_options(&url, &options)
.await
.map_err(|e| format!("initial fetch failed: {e}"))?;
eprintln!(
"[watch] Initial snapshot: {url} ({} words)",
previous.metadata.word_count
);
// Ctrl+C handler
let cancelled = Arc::new(AtomicBool::new(false));
let flag = Arc::clone(&cancelled);
tokio::spawn(async move {
tokio::signal::ctrl_c().await.ok();
flag.store(true, Ordering::Relaxed);
});
loop {
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
if cancelled.load(Ordering::Relaxed) {
eprintln!("[watch] Stopped");
break;
}
let current = match client.fetch_and_extract_with_options(&url, &options).await {
Ok(result) => result,
Err(e) => {
eprintln!("[watch] Fetch error ({}): {e}", timestamp());
continue;
}
};
let diff = webclaw_core::diff::diff(&previous, &current);
if diff.status == ChangeStatus::Same {
eprintln!("[watch] No changes ({})", timestamp());
} else {
print_diff_output(&diff, &cli.format);
eprintln!("[watch] Changes detected! ({})", timestamp());
if let Some(ref cmd) = cli.on_change {
let diff_json = serde_json::to_string(&diff).unwrap_or_default();
eprintln!("[watch] Running: {cmd}");
match tokio::process::Command::new("sh")
.arg("-c")
.arg(cmd)
.stdin(std::process::Stdio::piped())
.spawn()
{
Ok(mut child) => {
// Pipe diff JSON to stdin, then detach
if let Some(mut stdin) = child.stdin.take() {
use tokio::io::AsyncWriteExt;
let _ = stdin.write_all(diff_json.as_bytes()).await;
}
}
Err(e) => eprintln!("[watch] Failed to run command: {e}"),
}
}
// Fire webhook on change
if let Some(ref webhook_url) = cli.webhook {
fire_webhook(
webhook_url,
&serde_json::json!({
"event": "watch_change",
"url": url,
"status": format!("{:?}", diff.status),
"word_count_delta": diff.word_count_delta,
"metadata_changes": diff.metadata_changes.len(),
"links_added": diff.links_added.len(),
"links_removed": diff.links_removed.len(),
}),
);
}
previous = current;
}
}
Ok(())
}
async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
// Load previous snapshot
let snapshot_json = std::fs::read_to_string(snapshot_path)
@ -1436,6 +1653,15 @@ async fn main() {
return;
}
// --watch: poll a URL for changes
if cli.watch {
if let Err(e) = run_watch(&cli).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
// --diff-with: change tracking mode
if let Some(ref snapshot_path) = cli.diff_with {
if let Err(e) = run_diff(&cli, snapshot_path).await {