From bac13fc1b56a1b332885c04165e5db75c245f42f Mon Sep 17 00:00:00 2001
From: Jacob Magar
Date: Sat, 11 Apr 2026 12:24:44 -0400
Subject: [PATCH] feat: wire ResolvedConfig into main.rs via clap ValueSource

---
 crates/noxa-cli/src/config.rs | 109 +++++++++++++++
 crates/noxa-cli/src/main.rs   | 242 ++++++++++++++++++----------------
 2 files changed, 241 insertions(+), 110 deletions(-)

diff --git a/crates/noxa-cli/src/config.rs b/crates/noxa-cli/src/config.rs
index d1f80d6..57467e8 100644
--- a/crates/noxa-cli/src/config.rs
+++ b/crates/noxa-cli/src/config.rs
@@ -134,6 +134,115 @@ pub struct ResolvedConfig {
     pub llm_model: Option<String>,
 }
 
+use clap::parser::ValueSource;
+
+/// Merge CLI flags (detected via ValueSource), config file, and hard defaults
+/// into a single ResolvedConfig. CLI explicit values always win.
+pub fn resolve(
+    cli: &crate::Cli,
+    matches: &clap::ArgMatches,
+    cfg: &NoxaConfig,
+) -> ResolvedConfig {
+    let explicit = |name: &str| {
+        matches.value_source(name) == Some(ValueSource::CommandLine)
+    };
+
+    ResolvedConfig {
+        format: if explicit("format") {
+            cli.format.clone()
+        } else {
+            cfg.format.clone().unwrap_or(crate::OutputFormat::Markdown)
+        },
+        browser: if explicit("browser") {
+            cli.browser.clone()
+        } else {
+            cfg.browser.clone().unwrap_or(crate::Browser::Chrome)
+        },
+        pdf_mode: if explicit("pdf_mode") {
+            cli.pdf_mode.clone()
+        } else {
+            cfg.pdf_mode.clone().unwrap_or(crate::PdfModeArg::Auto)
+        },
+        timeout: if explicit("timeout") {
+            cli.timeout
+        } else {
+            cfg.timeout.unwrap_or(30)
+        },
+        depth: if explicit("depth") {
+            cli.depth
+        } else {
+            cfg.depth.unwrap_or(1)
+        },
+        max_pages: if explicit("max_pages") {
+            cli.max_pages
+        } else {
+            cfg.max_pages.unwrap_or(20)
+        },
+        concurrency: if explicit("concurrency") {
+            cli.concurrency
+        } else {
+            cfg.concurrency.unwrap_or(5)
+        },
+        delay: if explicit("delay") {
+            cli.delay
+        } else {
+            cfg.delay.unwrap_or(100)
+        },
+        path_prefix: if explicit("path_prefix") {
+            cli.path_prefix.clone()
+        } else {
+            cfg.path_prefix.clone().or(cli.path_prefix.clone())
+        },
+        include_paths: if explicit("include_paths") {
+            cli.include_paths
+                .as_deref()
+                .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
+                .unwrap_or_default()
+        } else {
+            cfg.include_paths.clone().unwrap_or_default()
+        },
+        exclude_paths: if explicit("exclude_paths") {
+            cli.exclude_paths
+                .as_deref()
+                .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
+                .unwrap_or_default()
+        } else {
+            cfg.exclude_paths.clone().unwrap_or_default()
+        },
+        include_selectors: if explicit("include") {
+            cli.include
+                .as_deref()
+                .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
+                .unwrap_or_default()
+        } else {
+            cfg.include_selectors.clone().unwrap_or_default()
+        },
+        exclude_selectors: if explicit("exclude") {
+            cli.exclude
+                .as_deref()
+                .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
+                .unwrap_or_default()
+        } else {
+            cfg.exclude_selectors.clone().unwrap_or_default()
+        },
+        only_main_content: cli.only_main_content || cfg.only_main_content.unwrap_or(false),
+        metadata: cli.metadata || cfg.metadata.unwrap_or(false),
+        verbose: cli.verbose || cfg.verbose.unwrap_or(false),
+        use_sitemap: cli.sitemap || cfg.use_sitemap.unwrap_or(false),
+        raw_html: cli.raw_html,
+        llm_provider: if cli.llm_provider.is_some() {
+            cli.llm_provider.clone()
+        } else {
+            cfg.llm_provider.clone()
+        },
+        llm_model: if cli.llm_model.is_some() {
+            cli.llm_model.clone()
+        } else {
+            cfg.llm_model.clone()
+        },
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
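@@ -139,0 +249,24 @@ mod tests {
+
+    // Sketch: exercises the CLI-beats-config precedence of resolve(). It
+    // assumes NoxaConfig implements Default and that --timeout is a long
+    // flag with a clap default (cli.timeout is a plain u64 above).
+    #[test]
+    fn cli_flag_beats_config_file() {
+        use clap::{CommandFactory, FromArgMatches};
+
+        let cfg = NoxaConfig {
+            timeout: Some(99),
+            ..Default::default()
+        };
+
+        // Explicit --timeout on the command line wins over the config file.
+        let matches = crate::Cli::command()
+            .get_matches_from(["noxa", "--timeout", "5", "https://example.com"]);
+        let cli = crate::Cli::from_arg_matches(&matches).unwrap();
+        assert_eq!(resolve(&cli, &matches, &cfg).timeout, 5);
+
+        // An omitted flag surfaces as ValueSource::DefaultValue, so config wins.
+        let matches = crate::Cli::command().get_matches_from(["noxa"]);
+        let cli = crate::Cli::from_arg_matches(&matches).unwrap();
+        assert_eq!(resolve(&cli, &matches, &cfg).timeout, 99);
+    }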
diff --git a/crates/noxa-cli/src/main.rs b/crates/noxa-cli/src/main.rs
index 60608c0..75c9486 100644
--- a/crates/noxa-cli/src/main.rs
+++ b/crates/noxa-cli/src/main.rs
@@ -10,7 +10,7 @@ use std::process;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 
-use clap::{Parser, ValueEnum};
+use clap::{CommandFactory, FromArgMatches, Parser, ValueEnum};
 use serde::Deserialize;
 use tracing_subscriber::EnvFilter;
 use noxa_core::{
@@ -89,6 +89,10 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
 #[derive(Parser)]
 #[command(name = "noxa", about = "Extract web content for LLMs", version)]
 struct Cli {
+    /// Path to config.json (default: ./config.json, override with NOXA_CONFIG env var)
+    #[arg(long, global = true)]
+    config: Option<String>,
+
     /// URLs to fetch (multiple allowed)
     #[arg()]
     urls: Vec<String>,
@@ -348,7 +352,7 @@ fn init_logging(verbose: bool) {
 /// `--proxy` sets a single static proxy (no rotation).
 /// `--proxy-file` loads a pool of proxies and rotates per-request.
 /// `--proxy` takes priority: if both are set, only the single proxy is used.
-fn build_fetch_config(cli: &Cli) -> FetchConfig {
+fn build_fetch_config(cli: &Cli, resolved: &config::ResolvedConfig) -> FetchConfig {
     let (proxy, proxy_pool) = if cli.proxy.is_some() {
         (cli.proxy.clone(), Vec::new())
     } else if let Some(ref path) = cli.proxy_file {
@@ -408,11 +412,11 @@
     }
     FetchConfig {
-        browser: cli.browser.clone().into(),
+        browser: resolved.browser.clone().into(),
         proxy,
         proxy_pool,
-        timeout: std::time::Duration::from_secs(cli.timeout),
-        pdf_mode: cli.pdf_mode.clone().into(),
+        timeout: std::time::Duration::from_secs(resolved.timeout),
+        pdf_mode: resolved.pdf_mode.clone().into(),
         headers,
         ..Default::default()
     }
 }
@@ -441,20 +445,12 @@ fn parse_cookie_file(path: &str) -> Result<String, String> {
     Ok(pairs.join("; "))
 }
 
-fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
+fn build_extraction_options(resolved: &config::ResolvedConfig) -> ExtractionOptions {
     ExtractionOptions {
-        include_selectors: cli
-            .include
-            .as_deref()
-            .map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
-            .unwrap_or_default(),
-        exclude_selectors: cli
-            .exclude
-            .as_deref()
-            .map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
-            .unwrap_or_default(),
-        only_main_content: cli.only_main_content,
-        include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html),
+        include_selectors: resolved.include_selectors.clone(),
+        exclude_selectors: resolved.exclude_selectors.clone(),
+        only_main_content: resolved.only_main_content,
+        include_raw_html: resolved.raw_html || matches!(resolved.format, OutputFormat::Html),
     }
 }
 
@@ -623,14 +619,17 @@ impl FetchOutput {
 
 /// Fetch a URL and extract content, handling PDF detection automatically.
 /// Falls back to cloud API when bot protection or JS rendering is detected.
-async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
+async fn fetch_and_extract(
+    cli: &Cli,
+    resolved: &config::ResolvedConfig,
+) -> Result<FetchOutput, String> {
     // Local sources: read and extract as HTML
     if cli.stdin {
         let mut buf = String::new();
         io::stdin()
             .read_to_string(&mut buf)
             .map_err(|e| format!("failed to read stdin: {e}"))?;
-        let options = build_extraction_options(cli);
+        let options = build_extraction_options(resolved);
         return extract_with_options(&buf, None, &options)
             .map(|r| FetchOutput::Local(Box::new(r)))
             .map_err(|e| format!("extraction error: {e}"));
@@ -639,7 +638,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
     if let Some(ref path) = cli.file {
         let html =
             std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
-        let options = build_extraction_options(cli);
+        let options = build_extraction_options(resolved);
         return extract_with_options(&html, None, &options)
             .map(|r| FetchOutput::Local(Box::new(r)))
             .map_err(|e| format!("extraction error: {e}"));
@@ -658,8 +657,8 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
     if cli.cloud {
         let c = cloud_client
             .ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?;
-        let options = build_extraction_options(cli);
-        let format_str = match cli.format {
+        let options = build_extraction_options(resolved);
+        let format_str = match resolved.format {
             OutputFormat::Markdown => "markdown",
             OutputFormat::Json => "json",
             OutputFormat::Text => "text",
@@ -679,9 +678,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
     }
 
     // Normal path: try local first
-    let client =
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
-    let options = build_extraction_options(cli);
+    let client = FetchClient::new(build_fetch_config(cli, resolved))
+        .map_err(|e| format!("client error: {e}"))?;
+    let options = build_extraction_options(resolved);
     let result = client
         .fetch_and_extract_with_options(url, &options)
         .await
@@ -692,7 +691,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
         if !matches!(reason, EmptyReason::None) {
             if let Some(ref c) = cloud_client {
                 eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API...");
-                let format_str = match cli.format {
+                let format_str = match resolved.format {
                     OutputFormat::Markdown => "markdown",
                     OutputFormat::Json => "json",
                     OutputFormat::Text => "text",
@@ -723,7 +722,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
 }
 
 /// Fetch raw HTML from a URL (no extraction). Used for --raw-html and brand extraction.
-async fn fetch_html(cli: &Cli) -> Result {
+async fn fetch_html(cli: &Cli, resolved: &config::ResolvedConfig) -> Result {
     if cli.stdin {
         let mut buf = String::new();
         io::stdin()
@@ -756,8 +755,8 @@ async fn fetch_html(cli: &Cli) -> Result {
         .ok_or("no input provided -- pass a URL, --file, or --stdin")?;
     let url = normalize_url(raw_url);
 
-    let client =
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
+    let client = FetchClient::new(build_fetch_config(cli, resolved))
+        .map_err(|e| format!("client error: {e}"))?;
     client
         .fetch(&url)
         .await
@@ -1171,7 +1170,7 @@ fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String
     )
 }
 
-async fn run_crawl(cli: &Cli) -> Result<(), String> {
+async fn run_crawl(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
     let url = cli
         .urls
         .first()
@@ -1183,16 +1182,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
         return Err("--crawl cannot be used with --file or --stdin".into());
     }
 
-    let include_patterns: Vec<String> = cli
-        .include_paths
-        .as_deref()
-        .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
-        .unwrap_or_default();
-    let exclude_patterns: Vec<String> = cli
-        .exclude_paths
-        .as_deref()
-        .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
-        .unwrap_or_default();
+    let include_patterns = resolved.include_paths.clone();
+    let exclude_patterns = resolved.exclude_paths.clone();
 
     // Set up streaming progress channel
     let (progress_tx, mut progress_rx) = tokio::sync::broadcast::channel::<PageResult>(100);
@@ -1212,13 +1203,13 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
     }
 
     let config = CrawlConfig {
-        fetch: build_fetch_config(cli),
-        max_depth: cli.depth,
-        max_pages: cli.max_pages,
-        concurrency: cli.concurrency,
-        delay: std::time::Duration::from_millis(cli.delay),
-        path_prefix: cli.path_prefix.clone(),
-        use_sitemap: cli.sitemap,
+        fetch: build_fetch_config(cli, resolved),
+        max_depth: resolved.depth,
+        max_pages: resolved.max_pages,
+        concurrency: resolved.concurrency,
+        delay: std::time::Duration::from_millis(resolved.delay),
+        path_prefix: resolved.path_prefix.clone(),
+        use_sitemap: resolved.use_sitemap,
         include_patterns,
         exclude_patterns,
         progress_tx: Some(progress_tx),
@@ -1237,7 +1228,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
         );
     });
 
-    let max_pages = cli.max_pages;
+    let max_pages = resolved.max_pages;
     let completed_offset = resume_state.as_ref().map_or(0, |s| s.completed_pages);
 
     // Spawn background task to print streaming progress to stderr
@@ -1266,8 +1257,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
                 &result.visited,
                 &result.remaining_frontier,
                 completed_offset + result.pages.len(),
-                cli.max_pages,
-                cli.depth,
+                resolved.max_pages,
+                resolved.depth,
             )?;
             eprintln!(
                 "Crawl state saved to {} ({} pages completed). Resume with --crawl-state {}",
Resume with --crawl-state {}", @@ -1299,15 +1290,15 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { let mut saved = 0usize; for page in &result.pages { if let Some(ref extraction) = page.extraction { - let filename = url_to_filename(&page.url, &cli.format); - let content = format_output(extraction, &cli.format, cli.metadata); + let filename = url_to_filename(&page.url, &resolved.format); + let content = format_output(extraction, &resolved.format, resolved.metadata); write_to_file(dir, &filename, &content)?; saved += 1; } } eprintln!("Saved {saved} files to {}", dir.display()); } else { - print_crawl_output(&result, &cli.format, cli.metadata); + print_crawl_output(&result, &resolved.format, resolved.metadata); } eprintln!( @@ -1343,7 +1334,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { } } -async fn run_map(cli: &Cli) -> Result<(), String> { +async fn run_map(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> { let url = cli .urls .first() @@ -1351,8 +1342,8 @@ async fn run_map(cli: &Cli) -> Result<(), String> { .map(|u| normalize_url(u))?; let url = url.as_str(); - let client = - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; + let client = FetchClient::new(build_fetch_config(cli, resolved)) + .map_err(|e| format!("client error: {e}"))?; let entries = noxa_fetch::sitemap::discover(&client, url) .await @@ -1364,19 +1355,24 @@ async fn run_map(cli: &Cli) -> Result<(), String> { eprintln!("discovered {} URLs", entries.len()); } - print_map_output(&entries, &cli.format); + print_map_output(&entries, &resolved.format); Ok(()) } -async fn run_batch(cli: &Cli, entries: &[(String, Option)]) -> Result<(), String> { +async fn run_batch( + cli: &Cli, + resolved: &config::ResolvedConfig, + entries: &[(String, Option)], +) -> Result<(), String> { let client = Arc::new( - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?, + FetchClient::new(build_fetch_config(cli, resolved)) + .map_err(|e| format!("client error: {e}"))?, ); let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect(); - let options = build_extraction_options(cli); + let options = build_extraction_options(resolved); let results = client - .fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options) + .fetch_and_extract_batch_with_options(&urls, resolved.concurrency, &options) .await; let ok = results.iter().filter(|r| r.result.is_ok()).count(); @@ -1407,15 +1403,15 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option)]) -> Result<() let filename = custom_names .get(r.url.as_str()) .map(|s| s.to_string()) - .unwrap_or_else(|| url_to_filename(&r.url, &cli.format)); - let content = format_output(extraction, &cli.format, cli.metadata); + .unwrap_or_else(|| url_to_filename(&r.url, &resolved.format)); + let content = format_output(extraction, &resolved.format, resolved.metadata); write_to_file(dir, &filename, &content)?; saved += 1; } } eprintln!("Saved {saved} files to {}", dir.display()); } else { - print_batch_output(&results, &cli.format, cli.metadata); + print_batch_output(&results, &resolved.format, resolved.metadata); } eprintln!( @@ -1519,15 +1515,20 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) { }); } -async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> { +async fn run_watch( + cli: &Cli, + resolved: &config::ResolvedConfig, + urls: &[String], +) -> Result<(), String> { if urls.is_empty() { return Err("--watch requires at least one URL".into()); } let client = 
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
+        FetchClient::new(build_fetch_config(cli, resolved))
+            .map_err(|e| format!("client error: {e}"))?,
     );
-    let options = build_extraction_options(cli);
+    let options = build_extraction_options(resolved);
 
     // Ctrl+C handler
     let cancelled = Arc::new(AtomicBool::new(false));
@@ -1539,16 +1540,17 @@ async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
 
     // Single-URL mode: preserve original behavior exactly
     if urls.len() == 1 {
-        return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await;
+        return run_watch_single(cli, resolved, &client, &options, &urls[0], &cancelled).await;
     }
 
     // Multi-URL mode: batch fetch, diff each, report aggregate
-    run_watch_multi(cli, &client, &options, urls, &cancelled).await
+    run_watch_multi(cli, resolved, &client, &options, urls, &cancelled).await
 }
 
 /// Original single-URL watch loop -- backward compatible.
 async fn run_watch_single(
     cli: &Cli,
+    resolved: &config::ResolvedConfig,
     client: &Arc<FetchClient>,
     options: &ExtractionOptions,
     url: &str,
@@ -1585,7 +1587,7 @@ async fn run_watch_single(
     if diff.status == ChangeStatus::Same {
         eprintln!("[watch] No changes ({})", timestamp());
     } else {
-        print_diff_output(&diff, &cli.format);
+        print_diff_output(&diff, &resolved.format);
         eprintln!("[watch] Changes detected! ({})", timestamp());
 
         if let Some(ref cmd) = cli.on_change {
@@ -1632,6 +1634,7 @@ async fn run_watch_single(
 /// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
 async fn run_watch_multi(
     cli: &Cli,
+    resolved: &config::ResolvedConfig,
     client: &Arc<FetchClient>,
     options: &ExtractionOptions,
     urls: &[String],
@@ -1641,7 +1644,7 @@ async fn run_watch_multi(
 
     // Initial pass: fetch all URLs in parallel
     let initial_results = client
-        .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
+        .fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options)
         .await;
 
     let mut snapshots = std::collections::HashMap::new();
@@ -1681,7 +1684,7 @@ async fn run_watch_multi(
         check_number += 1;
 
         let current_results = client
-            .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
+            .fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options)
             .await;
 
         let mut changed: Vec<String> = Vec::new();
@@ -1785,7 +1788,11 @@ async fn run_watch_multi(
     Ok(())
 }
 
-async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
+async fn run_diff(
+    cli: &Cli,
+    resolved: &config::ResolvedConfig,
+    snapshot_path: &str,
+) -> Result<(), String> {
     // Load previous snapshot
     let snapshot_json = std::fs::read_to_string(snapshot_path)
         .map_err(|e| format!("failed to read snapshot {snapshot_path}: {e}"))?;
@@ -1793,16 +1800,16 @@ async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
         .map_err(|e| format!("failed to parse snapshot JSON: {e}"))?;
 
     // Extract current version (handles PDF detection for URLs)
-    let new_result = fetch_and_extract(cli).await?.into_extraction()?;
+    let new_result = fetch_and_extract(cli, resolved).await?.into_extraction()?;
 
     let diff = noxa_core::diff::diff(&old, &new_result);
-    print_diff_output(&diff, &cli.format);
+    print_diff_output(&diff, &resolved.format);
 
     Ok(())
 }
 
-async fn run_brand(cli: &Cli) -> Result<(), String> {
-    let result = fetch_html(cli).await?;
+async fn run_brand(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
+    let result = fetch_html(cli, resolved).await?;
     let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await;
&result.url).await; let brand = noxa_core::brand::extract_brand( &enriched, @@ -1816,12 +1823,15 @@ async fn run_brand(cli: &Cli) -> Result<(), String> { } /// Build an LLM provider based on CLI flags, or fall back to the default chain. -async fn build_llm_provider(cli: &Cli) -> Result, String> { - if let Some(ref name) = cli.llm_provider { +async fn build_llm_provider( + cli: &Cli, + resolved: &config::ResolvedConfig, +) -> Result, String> { + if let Some(ref name) = resolved.llm_provider { match name.as_str() { "gemini" => { let provider = noxa_llm::providers::gemini_cli::GeminiCliProvider::new( - cli.llm_model.clone(), + resolved.llm_model.clone(), ); if !provider.is_available().await { return Err( @@ -1833,7 +1843,7 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> { "ollama" => { let provider = noxa_llm::providers::ollama::OllamaProvider::new( cli.llm_base_url.clone(), - cli.llm_model.clone(), + resolved.llm_model.clone(), ); if !provider.is_available().await { return Err("ollama is not running or unreachable".into()); @@ -1844,7 +1854,7 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> { let provider = noxa_llm::providers::openai::OpenAiProvider::new( None, cli.llm_base_url.clone(), - cli.llm_model.clone(), + resolved.llm_model.clone(), ) .ok_or("OPENAI_API_KEY not set")?; Ok(Box::new(provider)) @@ -1852,7 +1862,7 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> { "anthropic" => { let provider = noxa_llm::providers::anthropic::AnthropicProvider::new( None, - cli.llm_model.clone(), + resolved.llm_model.clone(), ) .ok_or("ANTHROPIC_API_KEY not set")?; Ok(Box::new(provider)) @@ -1873,12 +1883,12 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> { } } -async fn run_llm(cli: &Cli) -> Result<(), String> { +async fn run_llm(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> { // Extract content from source first (handles PDF detection for URLs) - let result = fetch_and_extract(cli).await?.into_extraction()?; + let result = fetch_and_extract(cli, resolved).await?.into_extraction()?; - let provider = build_llm_provider(cli).await?; - let model = cli.llm_model.as_deref(); + let provider = build_llm_provider(cli, resolved).await?; + let model = resolved.llm_model.as_deref(); if let Some(ref schema_input) = cli.extract_json { // Support @file syntax for loading schema from file @@ -1937,12 +1947,16 @@ async fn run_llm(cli: &Cli) -> Result<(), String> { /// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results. /// URLs are processed sequentially to respect LLM provider rate limits. 
-async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
-    let client =
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
-    let options = build_extraction_options(cli);
-    let provider = build_llm_provider(cli).await?;
-    let model = cli.llm_model.as_deref();
+async fn run_batch_llm(
+    cli: &Cli,
+    resolved: &config::ResolvedConfig,
+    entries: &[(String, Option<String>)],
+) -> Result<(), String> {
+    let client = FetchClient::new(build_fetch_config(cli, resolved))
+        .map_err(|e| format!("client error: {e}"))?;
+    let options = build_extraction_options(resolved);
+    let provider = build_llm_provider(cli, resolved).await?;
+    let model = resolved.llm_model.as_deref();
 
     // Pre-parse schema once if --extract-json is used
     let schema = if let Some(ref schema_input) = cli.extract_json {
@@ -2231,12 +2245,19 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
 async fn main() {
     dotenvy::dotenv().ok();
 
-    let cli = Cli::parse();
-    init_logging(cli.verbose);
+    // Use low-level API to get both typed Cli and ArgMatches for ValueSource detection.
+    let matches = Cli::command().get_matches();
+    let cli = Cli::from_arg_matches(&matches).unwrap_or_else(|e| e.exit());
+
+    // Load config BEFORE init_logging so verbose from config takes effect.
+    let cfg = config::NoxaConfig::load(cli.config.as_deref());
+    let resolved = config::resolve(&cli, &matches, &cfg);
+
+    init_logging(resolved.verbose);
 
     // --map: sitemap discovery mode
     if cli.map {
-        if let Err(e) = run_map(&cli).await {
+        if let Err(e) = run_map(&cli, &resolved).await {
             eprintln!("error: {e}");
             process::exit(1);
         }
@@ -2245,7 +2266,7 @@ async fn main() {
 
     // --crawl: recursive crawl mode
     if cli.crawl {
-        if let Err(e) = run_crawl(&cli).await {
+        if let Err(e) = run_crawl(&cli, &resolved).await {
             eprintln!("error: {e}");
             process::exit(1);
         }
@@ -2261,7 +2282,7 @@ async fn main() {
             process::exit(1);
         }
     };
-    if let Err(e) = run_watch(&cli, &watch_urls).await {
+    if let Err(e) = run_watch(&cli, &resolved, &watch_urls).await {
         eprintln!("error: {e}");
         process::exit(1);
     }
@@ -2270,7 +2291,7 @@ async fn main() {
 
     // --diff-with: change tracking mode
     if let Some(ref snapshot_path) = cli.diff_with {
-        if let Err(e) = run_diff(&cli, snapshot_path).await {
+        if let Err(e) = run_diff(&cli, &resolved, snapshot_path).await {
             eprintln!("error: {e}");
             process::exit(1);
         }
@@ -2279,7 +2300,7 @@ async fn main() {
 
     // --brand: brand identity extraction mode
     if cli.brand {
-        if let Err(e) = run_brand(&cli).await {
+        if let Err(e) = run_brand(&cli, &resolved).await {
             eprintln!("error: {e}");
             process::exit(1);
         }
@@ -2308,11 +2329,11 @@ async fn main() {
 
     // When multiple URLs are provided, run batch LLM extraction over all of them.
     if has_llm_flags(&cli) {
         if entries.len() > 1 {
-            if let Err(e) = run_batch_llm(&cli, &entries).await {
+            if let Err(e) = run_batch_llm(&cli, &resolved, &entries).await {
                 eprintln!("error: {e}");
                 process::exit(1);
             }
-        } else if let Err(e) = run_llm(&cli).await {
+        } else if let Err(e) = run_llm(&cli, &resolved).await {
             eprintln!("error: {e}");
             process::exit(1);
         }
@@ -2321,7 +2342,7 @@ async fn main() {
 
     // Multi-URL batch mode
     if entries.len() > 1 {
-        if let Err(e) = run_batch(&cli, &entries).await {
+        if let Err(e) = run_batch(&cli, &resolved, &entries).await {
             eprintln!("error: {e}");
             process::exit(1);
         }
@@ -2330,7 +2351,7 @@ async fn main() {
 
     // --raw-html: skip extraction, dump the fetched HTML
     if cli.raw_html && cli.include.is_none() && cli.exclude.is_none() {
-        match fetch_html(&cli).await {
+        match fetch_html(&cli, &resolved).await {
             Ok(r) => println!("{}", r.html),
             Err(e) => {
                 eprintln!("error: {e}");
@@ -2341,7 +2362,7 @@ async fn main() {
     }
 
     // Single-page extraction (handles both HTML and PDF via content-type detection)
-    match fetch_and_extract(&cli).await {
+    match fetch_and_extract(&cli, &resolved).await {
         Ok(FetchOutput::Local(result)) => {
             if let Some(ref dir) = cli.output_dir {
                 let url = cli
@@ -2350,18 +2371,19 @@ async fn main() {
                     .map(|u| normalize_url(u))
                     .unwrap_or_default();
                 let custom_name = entries.first().and_then(|(_, name)| name.clone());
-                let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format));
-                let content = format_output(&result, &cli.format, cli.metadata);
+                let filename =
+                    custom_name.unwrap_or_else(|| url_to_filename(&url, &resolved.format));
+                let content = format_output(&result, &resolved.format, resolved.metadata);
                 if let Err(e) = write_to_file(dir, &filename, &content) {
                     eprintln!("error: {e}");
                     process::exit(1);
                 }
             } else {
-                print_output(&result, &cli.format, cli.metadata);
+                print_output(&result, &resolved.format, resolved.metadata);
             }
         }
         Ok(FetchOutput::Cloud(resp)) => {
-            print_cloud_output(&resp, &cli.format);
+            print_cloud_output(&resp, &resolved.format);
         }
         Err(e) => {
             eprintln!("{e}");
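---
For reference, a config.json exercising the new --config flag might look like
the sketch below. The keys follow the fields read from NoxaConfig in resolve();
the exact serde casing, enum spellings, and values are illustrative assumptions,
since the config struct's derive attributes are not part of this patch.

{
  "format": "markdown",
  "browser": "chrome",
  "timeout": 60,
  "depth": 2,
  "max_pages": 50,
  "concurrency": 8,
  "only_main_content": true,
  "metadata": true,
  "llm_provider": "ollama",
  "llm_model": "llama3"
}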