feat: wire ResolvedConfig into main.rs via clap ValueSource

This commit is contained in:
Jacob Magar 2026-04-11 12:24:44 -04:00
parent e7583a5c51
commit bac13fc1b5
2 changed files with 241 additions and 110 deletions

View file

@ -134,6 +134,115 @@ pub struct ResolvedConfig {
pub llm_model: Option<String>, pub llm_model: Option<String>,
} }
use clap::parser::ValueSource;

/// Split a comma-separated CLI value (e.g. `"a, b,c"`) into trimmed strings.
/// Absent input yields an empty Vec.
fn split_csv(input: Option<&str>) -> Vec<String> {
    input
        .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
        .unwrap_or_default()
}

/// Merge CLI flags (detected via ValueSource), config file, and hard defaults
/// into a single ResolvedConfig. CLI explicit values always win.
///
/// Precedence per field: explicitly-typed CLI flag > config-file value > hard
/// default. Boolean flags are OR-ed instead, because a CLI boolean cannot
/// distinguish "absent" from "false" — a flag enabled in either place wins.
pub fn resolve(
    cli: &crate::Cli,
    matches: &clap::ArgMatches,
    cfg: &NoxaConfig,
) -> ResolvedConfig {
    // True only when the user actually typed the flag on the command line,
    // as opposed to clap filling in a declared default value.
    let explicit = |name: &str| {
        matches.value_source(name) == Some(ValueSource::CommandLine)
    };
    ResolvedConfig {
        format: if explicit("format") {
            cli.format.clone()
        } else {
            cfg.format.clone().unwrap_or(crate::OutputFormat::Markdown)
        },
        browser: if explicit("browser") {
            cli.browser.clone()
        } else {
            cfg.browser.clone().unwrap_or(crate::Browser::Chrome)
        },
        pdf_mode: if explicit("pdf_mode") {
            cli.pdf_mode.clone()
        } else {
            cfg.pdf_mode.clone().unwrap_or(crate::PdfModeArg::Auto)
        },
        timeout: if explicit("timeout") {
            cli.timeout
        } else {
            cfg.timeout.unwrap_or(30)
        },
        depth: if explicit("depth") {
            cli.depth
        } else {
            cfg.depth.unwrap_or(1)
        },
        max_pages: if explicit("max_pages") {
            cli.max_pages
        } else {
            cfg.max_pages.unwrap_or(20)
        },
        concurrency: if explicit("concurrency") {
            cli.concurrency
        } else {
            cfg.concurrency.unwrap_or(5)
        },
        delay: if explicit("delay") {
            cli.delay
        } else {
            cfg.delay.unwrap_or(100)
        },
        path_prefix: if explicit("path_prefix") {
            cli.path_prefix.clone()
        } else {
            // Config wins when present; otherwise fall back to whatever clap
            // put in cli.path_prefix (e.g. an env-derived value). or_else
            // avoids cloning the CLI value when the config already has one.
            cfg.path_prefix
                .clone()
                .or_else(|| cli.path_prefix.clone())
        },
        include_paths: if explicit("include_paths") {
            split_csv(cli.include_paths.as_deref())
        } else {
            cfg.include_paths.clone().unwrap_or_default()
        },
        exclude_paths: if explicit("exclude_paths") {
            split_csv(cli.exclude_paths.as_deref())
        } else {
            cfg.exclude_paths.clone().unwrap_or_default()
        },
        include_selectors: if explicit("include") {
            split_csv(cli.include.as_deref())
        } else {
            cfg.include_selectors.clone().unwrap_or_default()
        },
        exclude_selectors: if explicit("exclude") {
            split_csv(cli.exclude.as_deref())
        } else {
            cfg.exclude_selectors.clone().unwrap_or_default()
        },
        // Boolean flags: OR semantics (see doc comment above).
        only_main_content: cli.only_main_content || cfg.only_main_content.unwrap_or(false),
        metadata: cli.metadata || cfg.metadata.unwrap_or(false),
        verbose: cli.verbose || cfg.verbose.unwrap_or(false),
        use_sitemap: cli.sitemap || cfg.use_sitemap.unwrap_or(false),
        // raw_html is CLI-only; there is no config-file counterpart here.
        raw_html: cli.raw_html,
        // NOTE(review): these two use presence (is_some) rather than the
        // explicit() ValueSource check used above — presumably so values
        // populated outside the command line also override the config file.
        // Confirm that is intentional before unifying with explicit().
        llm_provider: cli
            .llm_provider
            .clone()
            .or_else(|| cfg.llm_provider.clone()),
        llm_model: cli.llm_model.clone().or_else(|| cfg.llm_model.clone()),
    }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;

View file

@ -10,7 +10,7 @@ use std::process;
use std::sync::Arc; use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::atomic::{AtomicBool, Ordering};
use clap::{Parser, ValueEnum}; use clap::{CommandFactory, FromArgMatches, Parser, ValueEnum};
use serde::Deserialize; use serde::Deserialize;
use tracing_subscriber::EnvFilter; use tracing_subscriber::EnvFilter;
use noxa_core::{ use noxa_core::{
@ -89,6 +89,10 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
#[derive(Parser)] #[derive(Parser)]
#[command(name = "noxa", about = "Extract web content for LLMs", version)] #[command(name = "noxa", about = "Extract web content for LLMs", version)]
struct Cli { struct Cli {
/// Path to config.json (default: ./config.json, override with NOXA_CONFIG env var)
#[arg(long, global = true)]
config: Option<String>,
/// URLs to fetch (multiple allowed) /// URLs to fetch (multiple allowed)
#[arg()] #[arg()]
urls: Vec<String>, urls: Vec<String>,
@ -348,7 +352,7 @@ fn init_logging(verbose: bool) {
/// `--proxy` sets a single static proxy (no rotation). /// `--proxy` sets a single static proxy (no rotation).
/// `--proxy-file` loads a pool of proxies and rotates per-request. /// `--proxy-file` loads a pool of proxies and rotates per-request.
/// `--proxy` takes priority: if both are set, only the single proxy is used. /// `--proxy` takes priority: if both are set, only the single proxy is used.
fn build_fetch_config(cli: &Cli) -> FetchConfig { fn build_fetch_config(cli: &Cli, resolved: &config::ResolvedConfig) -> FetchConfig {
let (proxy, proxy_pool) = if cli.proxy.is_some() { let (proxy, proxy_pool) = if cli.proxy.is_some() {
(cli.proxy.clone(), Vec::new()) (cli.proxy.clone(), Vec::new())
} else if let Some(ref path) = cli.proxy_file { } else if let Some(ref path) = cli.proxy_file {
@ -408,11 +412,11 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
} }
FetchConfig { FetchConfig {
browser: cli.browser.clone().into(), browser: resolved.browser.clone().into(),
proxy, proxy,
proxy_pool, proxy_pool,
timeout: std::time::Duration::from_secs(cli.timeout), timeout: std::time::Duration::from_secs(resolved.timeout),
pdf_mode: cli.pdf_mode.clone().into(), pdf_mode: resolved.pdf_mode.clone().into(),
headers, headers,
..Default::default() ..Default::default()
} }
@ -441,20 +445,12 @@ fn parse_cookie_file(path: &str) -> Result<String, String> {
Ok(pairs.join("; ")) Ok(pairs.join("; "))
} }
fn build_extraction_options(cli: &Cli) -> ExtractionOptions { fn build_extraction_options(resolved: &config::ResolvedConfig) -> ExtractionOptions {
ExtractionOptions { ExtractionOptions {
include_selectors: cli include_selectors: resolved.include_selectors.clone(),
.include exclude_selectors: resolved.exclude_selectors.clone(),
.as_deref() only_main_content: resolved.only_main_content,
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect()) include_raw_html: resolved.raw_html || matches!(resolved.format, OutputFormat::Html),
.unwrap_or_default(),
exclude_selectors: cli
.exclude
.as_deref()
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
.unwrap_or_default(),
only_main_content: cli.only_main_content,
include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html),
} }
} }
@ -623,14 +619,17 @@ impl FetchOutput {
/// Fetch a URL and extract content, handling PDF detection automatically. /// Fetch a URL and extract content, handling PDF detection automatically.
/// Falls back to cloud API when bot protection or JS rendering is detected. /// Falls back to cloud API when bot protection or JS rendering is detected.
async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> { async fn fetch_and_extract(
cli: &Cli,
resolved: &config::ResolvedConfig,
) -> Result<FetchOutput, String> {
// Local sources: read and extract as HTML // Local sources: read and extract as HTML
if cli.stdin { if cli.stdin {
let mut buf = String::new(); let mut buf = String::new();
io::stdin() io::stdin()
.read_to_string(&mut buf) .read_to_string(&mut buf)
.map_err(|e| format!("failed to read stdin: {e}"))?; .map_err(|e| format!("failed to read stdin: {e}"))?;
let options = build_extraction_options(cli); let options = build_extraction_options(resolved);
return extract_with_options(&buf, None, &options) return extract_with_options(&buf, None, &options)
.map(|r| FetchOutput::Local(Box::new(r))) .map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}")); .map_err(|e| format!("extraction error: {e}"));
@ -639,7 +638,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
if let Some(ref path) = cli.file { if let Some(ref path) = cli.file {
let html = let html =
std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?; std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
let options = build_extraction_options(cli); let options = build_extraction_options(resolved);
return extract_with_options(&html, None, &options) return extract_with_options(&html, None, &options)
.map(|r| FetchOutput::Local(Box::new(r))) .map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}")); .map_err(|e| format!("extraction error: {e}"));
@ -658,8 +657,8 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
if cli.cloud { if cli.cloud {
let c = let c =
cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?; cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?;
let options = build_extraction_options(cli); let options = build_extraction_options(resolved);
let format_str = match cli.format { let format_str = match resolved.format {
OutputFormat::Markdown => "markdown", OutputFormat::Markdown => "markdown",
OutputFormat::Json => "json", OutputFormat::Json => "json",
OutputFormat::Text => "text", OutputFormat::Text => "text",
@ -679,9 +678,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
} }
// Normal path: try local first // Normal path: try local first
let client = let client = FetchClient::new(build_fetch_config(cli, resolved))
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; .map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli); let options = build_extraction_options(resolved);
let result = client let result = client
.fetch_and_extract_with_options(url, &options) .fetch_and_extract_with_options(url, &options)
.await .await
@ -692,7 +691,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
if !matches!(reason, EmptyReason::None) { if !matches!(reason, EmptyReason::None) {
if let Some(ref c) = cloud_client { if let Some(ref c) = cloud_client {
eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API..."); eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API...");
let format_str = match cli.format { let format_str = match resolved.format {
OutputFormat::Markdown => "markdown", OutputFormat::Markdown => "markdown",
OutputFormat::Json => "json", OutputFormat::Json => "json",
OutputFormat::Text => "text", OutputFormat::Text => "text",
@ -723,7 +722,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
} }
/// Fetch raw HTML from a URL (no extraction). Used for --raw-html and brand extraction. /// Fetch raw HTML from a URL (no extraction). Used for --raw-html and brand extraction.
async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> { async fn fetch_html(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<FetchResult, String> {
if cli.stdin { if cli.stdin {
let mut buf = String::new(); let mut buf = String::new();
io::stdin() io::stdin()
@ -756,8 +755,8 @@ async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> {
.ok_or("no input provided -- pass a URL, --file, or --stdin")?; .ok_or("no input provided -- pass a URL, --file, or --stdin")?;
let url = normalize_url(raw_url); let url = normalize_url(raw_url);
let client = let client = FetchClient::new(build_fetch_config(cli, resolved))
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; .map_err(|e| format!("client error: {e}"))?;
client client
.fetch(&url) .fetch(&url)
.await .await
@ -1171,7 +1170,7 @@ fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String
) )
} }
async fn run_crawl(cli: &Cli) -> Result<(), String> { async fn run_crawl(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
let url = cli let url = cli
.urls .urls
.first() .first()
@ -1183,16 +1182,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
return Err("--crawl cannot be used with --file or --stdin".into()); return Err("--crawl cannot be used with --file or --stdin".into());
} }
let include_patterns: Vec<String> = cli let include_patterns = resolved.include_paths.clone();
.include_paths let exclude_patterns = resolved.exclude_paths.clone();
.as_deref()
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
.unwrap_or_default();
let exclude_patterns: Vec<String> = cli
.exclude_paths
.as_deref()
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
.unwrap_or_default();
// Set up streaming progress channel // Set up streaming progress channel
let (progress_tx, mut progress_rx) = tokio::sync::broadcast::channel::<PageResult>(100); let (progress_tx, mut progress_rx) = tokio::sync::broadcast::channel::<PageResult>(100);
@ -1212,13 +1203,13 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
} }
let config = CrawlConfig { let config = CrawlConfig {
fetch: build_fetch_config(cli), fetch: build_fetch_config(cli, resolved),
max_depth: cli.depth, max_depth: resolved.depth,
max_pages: cli.max_pages, max_pages: resolved.max_pages,
concurrency: cli.concurrency, concurrency: resolved.concurrency,
delay: std::time::Duration::from_millis(cli.delay), delay: std::time::Duration::from_millis(resolved.delay),
path_prefix: cli.path_prefix.clone(), path_prefix: resolved.path_prefix.clone(),
use_sitemap: cli.sitemap, use_sitemap: resolved.use_sitemap,
include_patterns, include_patterns,
exclude_patterns, exclude_patterns,
progress_tx: Some(progress_tx), progress_tx: Some(progress_tx),
@ -1237,7 +1228,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
); );
}); });
let max_pages = cli.max_pages; let max_pages = resolved.max_pages;
let completed_offset = resume_state.as_ref().map_or(0, |s| s.completed_pages); let completed_offset = resume_state.as_ref().map_or(0, |s| s.completed_pages);
// Spawn background task to print streaming progress to stderr // Spawn background task to print streaming progress to stderr
@ -1266,8 +1257,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
&result.visited, &result.visited,
&result.remaining_frontier, &result.remaining_frontier,
completed_offset + result.pages.len(), completed_offset + result.pages.len(),
cli.max_pages, resolved.max_pages,
cli.depth, resolved.depth,
)?; )?;
eprintln!( eprintln!(
"Crawl state saved to {} ({} pages completed). Resume with --crawl-state {}", "Crawl state saved to {} ({} pages completed). Resume with --crawl-state {}",
@ -1299,15 +1290,15 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
let mut saved = 0usize; let mut saved = 0usize;
for page in &result.pages { for page in &result.pages {
if let Some(ref extraction) = page.extraction { if let Some(ref extraction) = page.extraction {
let filename = url_to_filename(&page.url, &cli.format); let filename = url_to_filename(&page.url, &resolved.format);
let content = format_output(extraction, &cli.format, cli.metadata); let content = format_output(extraction, &resolved.format, resolved.metadata);
write_to_file(dir, &filename, &content)?; write_to_file(dir, &filename, &content)?;
saved += 1; saved += 1;
} }
} }
eprintln!("Saved {saved} files to {}", dir.display()); eprintln!("Saved {saved} files to {}", dir.display());
} else { } else {
print_crawl_output(&result, &cli.format, cli.metadata); print_crawl_output(&result, &resolved.format, resolved.metadata);
} }
eprintln!( eprintln!(
@ -1343,7 +1334,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
} }
} }
async fn run_map(cli: &Cli) -> Result<(), String> { async fn run_map(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
let url = cli let url = cli
.urls .urls
.first() .first()
@ -1351,8 +1342,8 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
.map(|u| normalize_url(u))?; .map(|u| normalize_url(u))?;
let url = url.as_str(); let url = url.as_str();
let client = let client = FetchClient::new(build_fetch_config(cli, resolved))
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; .map_err(|e| format!("client error: {e}"))?;
let entries = noxa_fetch::sitemap::discover(&client, url) let entries = noxa_fetch::sitemap::discover(&client, url)
.await .await
@ -1364,19 +1355,24 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
eprintln!("discovered {} URLs", entries.len()); eprintln!("discovered {} URLs", entries.len());
} }
print_map_output(&entries, &cli.format); print_map_output(&entries, &resolved.format);
Ok(()) Ok(())
} }
async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> { async fn run_batch(
cli: &Cli,
resolved: &config::ResolvedConfig,
entries: &[(String, Option<String>)],
) -> Result<(), String> {
let client = Arc::new( let client = Arc::new(
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?, FetchClient::new(build_fetch_config(cli, resolved))
.map_err(|e| format!("client error: {e}"))?,
); );
let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect(); let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect();
let options = build_extraction_options(cli); let options = build_extraction_options(resolved);
let results = client let results = client
.fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options) .fetch_and_extract_batch_with_options(&urls, resolved.concurrency, &options)
.await; .await;
let ok = results.iter().filter(|r| r.result.is_ok()).count(); let ok = results.iter().filter(|r| r.result.is_ok()).count();
@ -1407,15 +1403,15 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
let filename = custom_names let filename = custom_names
.get(r.url.as_str()) .get(r.url.as_str())
.map(|s| s.to_string()) .map(|s| s.to_string())
.unwrap_or_else(|| url_to_filename(&r.url, &cli.format)); .unwrap_or_else(|| url_to_filename(&r.url, &resolved.format));
let content = format_output(extraction, &cli.format, cli.metadata); let content = format_output(extraction, &resolved.format, resolved.metadata);
write_to_file(dir, &filename, &content)?; write_to_file(dir, &filename, &content)?;
saved += 1; saved += 1;
} }
} }
eprintln!("Saved {saved} files to {}", dir.display()); eprintln!("Saved {saved} files to {}", dir.display());
} else { } else {
print_batch_output(&results, &cli.format, cli.metadata); print_batch_output(&results, &resolved.format, resolved.metadata);
} }
eprintln!( eprintln!(
@ -1519,15 +1515,20 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
}); });
} }
async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> { async fn run_watch(
cli: &Cli,
resolved: &config::ResolvedConfig,
urls: &[String],
) -> Result<(), String> {
if urls.is_empty() { if urls.is_empty() {
return Err("--watch requires at least one URL".into()); return Err("--watch requires at least one URL".into());
} }
let client = Arc::new( let client = Arc::new(
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?, FetchClient::new(build_fetch_config(cli, resolved))
.map_err(|e| format!("client error: {e}"))?,
); );
let options = build_extraction_options(cli); let options = build_extraction_options(resolved);
// Ctrl+C handler // Ctrl+C handler
let cancelled = Arc::new(AtomicBool::new(false)); let cancelled = Arc::new(AtomicBool::new(false));
@ -1539,16 +1540,17 @@ async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
// Single-URL mode: preserve original behavior exactly // Single-URL mode: preserve original behavior exactly
if urls.len() == 1 { if urls.len() == 1 {
return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await; return run_watch_single(cli, resolved, &client, &options, &urls[0], &cancelled).await;
} }
// Multi-URL mode: batch fetch, diff each, report aggregate // Multi-URL mode: batch fetch, diff each, report aggregate
run_watch_multi(cli, &client, &options, urls, &cancelled).await run_watch_multi(cli, resolved, &client, &options, urls, &cancelled).await
} }
/// Original single-URL watch loop -- backward compatible. /// Original single-URL watch loop -- backward compatible.
async fn run_watch_single( async fn run_watch_single(
cli: &Cli, cli: &Cli,
resolved: &config::ResolvedConfig,
client: &Arc<FetchClient>, client: &Arc<FetchClient>,
options: &ExtractionOptions, options: &ExtractionOptions,
url: &str, url: &str,
@ -1585,7 +1587,7 @@ async fn run_watch_single(
if diff.status == ChangeStatus::Same { if diff.status == ChangeStatus::Same {
eprintln!("[watch] No changes ({})", timestamp()); eprintln!("[watch] No changes ({})", timestamp());
} else { } else {
print_diff_output(&diff, &cli.format); print_diff_output(&diff, &resolved.format);
eprintln!("[watch] Changes detected! ({})", timestamp()); eprintln!("[watch] Changes detected! ({})", timestamp());
if let Some(ref cmd) = cli.on_change { if let Some(ref cmd) = cli.on_change {
@ -1632,6 +1634,7 @@ async fn run_watch_single(
/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate. /// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
async fn run_watch_multi( async fn run_watch_multi(
cli: &Cli, cli: &Cli,
resolved: &config::ResolvedConfig,
client: &Arc<FetchClient>, client: &Arc<FetchClient>,
options: &ExtractionOptions, options: &ExtractionOptions,
urls: &[String], urls: &[String],
@ -1641,7 +1644,7 @@ async fn run_watch_multi(
// Initial pass: fetch all URLs in parallel // Initial pass: fetch all URLs in parallel
let initial_results = client let initial_results = client
.fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options) .fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options)
.await; .await;
let mut snapshots = std::collections::HashMap::new(); let mut snapshots = std::collections::HashMap::new();
@ -1681,7 +1684,7 @@ async fn run_watch_multi(
check_number += 1; check_number += 1;
let current_results = client let current_results = client
.fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options) .fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options)
.await; .await;
let mut changed: Vec<serde_json::Value> = Vec::new(); let mut changed: Vec<serde_json::Value> = Vec::new();
@ -1785,7 +1788,11 @@ async fn run_watch_multi(
Ok(()) Ok(())
} }
async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> { async fn run_diff(
cli: &Cli,
resolved: &config::ResolvedConfig,
snapshot_path: &str,
) -> Result<(), String> {
// Load previous snapshot // Load previous snapshot
let snapshot_json = std::fs::read_to_string(snapshot_path) let snapshot_json = std::fs::read_to_string(snapshot_path)
.map_err(|e| format!("failed to read snapshot {snapshot_path}: {e}"))?; .map_err(|e| format!("failed to read snapshot {snapshot_path}: {e}"))?;
@ -1793,16 +1800,16 @@ async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
.map_err(|e| format!("failed to parse snapshot JSON: {e}"))?; .map_err(|e| format!("failed to parse snapshot JSON: {e}"))?;
// Extract current version (handles PDF detection for URLs) // Extract current version (handles PDF detection for URLs)
let new_result = fetch_and_extract(cli).await?.into_extraction()?; let new_result = fetch_and_extract(cli, resolved).await?.into_extraction()?;
let diff = noxa_core::diff::diff(&old, &new_result); let diff = noxa_core::diff::diff(&old, &new_result);
print_diff_output(&diff, &cli.format); print_diff_output(&diff, &resolved.format);
Ok(()) Ok(())
} }
async fn run_brand(cli: &Cli) -> Result<(), String> { async fn run_brand(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
let result = fetch_html(cli).await?; let result = fetch_html(cli, resolved).await?;
let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await; let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await;
let brand = noxa_core::brand::extract_brand( let brand = noxa_core::brand::extract_brand(
&enriched, &enriched,
@ -1816,12 +1823,15 @@ async fn run_brand(cli: &Cli) -> Result<(), String> {
} }
/// Build an LLM provider based on CLI flags, or fall back to the default chain. /// Build an LLM provider based on CLI flags, or fall back to the default chain.
async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> { async fn build_llm_provider(
if let Some(ref name) = cli.llm_provider { cli: &Cli,
resolved: &config::ResolvedConfig,
) -> Result<Box<dyn LlmProvider>, String> {
if let Some(ref name) = resolved.llm_provider {
match name.as_str() { match name.as_str() {
"gemini" => { "gemini" => {
let provider = noxa_llm::providers::gemini_cli::GeminiCliProvider::new( let provider = noxa_llm::providers::gemini_cli::GeminiCliProvider::new(
cli.llm_model.clone(), resolved.llm_model.clone(),
); );
if !provider.is_available().await { if !provider.is_available().await {
return Err( return Err(
@ -1833,7 +1843,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
"ollama" => { "ollama" => {
let provider = noxa_llm::providers::ollama::OllamaProvider::new( let provider = noxa_llm::providers::ollama::OllamaProvider::new(
cli.llm_base_url.clone(), cli.llm_base_url.clone(),
cli.llm_model.clone(), resolved.llm_model.clone(),
); );
if !provider.is_available().await { if !provider.is_available().await {
return Err("ollama is not running or unreachable".into()); return Err("ollama is not running or unreachable".into());
@ -1844,7 +1854,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
let provider = noxa_llm::providers::openai::OpenAiProvider::new( let provider = noxa_llm::providers::openai::OpenAiProvider::new(
None, None,
cli.llm_base_url.clone(), cli.llm_base_url.clone(),
cli.llm_model.clone(), resolved.llm_model.clone(),
) )
.ok_or("OPENAI_API_KEY not set")?; .ok_or("OPENAI_API_KEY not set")?;
Ok(Box::new(provider)) Ok(Box::new(provider))
@ -1852,7 +1862,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
"anthropic" => { "anthropic" => {
let provider = noxa_llm::providers::anthropic::AnthropicProvider::new( let provider = noxa_llm::providers::anthropic::AnthropicProvider::new(
None, None,
cli.llm_model.clone(), resolved.llm_model.clone(),
) )
.ok_or("ANTHROPIC_API_KEY not set")?; .ok_or("ANTHROPIC_API_KEY not set")?;
Ok(Box::new(provider)) Ok(Box::new(provider))
@ -1873,12 +1883,12 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
} }
} }
async fn run_llm(cli: &Cli) -> Result<(), String> { async fn run_llm(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
// Extract content from source first (handles PDF detection for URLs) // Extract content from source first (handles PDF detection for URLs)
let result = fetch_and_extract(cli).await?.into_extraction()?; let result = fetch_and_extract(cli, resolved).await?.into_extraction()?;
let provider = build_llm_provider(cli).await?; let provider = build_llm_provider(cli, resolved).await?;
let model = cli.llm_model.as_deref(); let model = resolved.llm_model.as_deref();
if let Some(ref schema_input) = cli.extract_json { if let Some(ref schema_input) = cli.extract_json {
// Support @file syntax for loading schema from file // Support @file syntax for loading schema from file
@ -1937,12 +1947,16 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
/// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results. /// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results.
/// URLs are processed sequentially to respect LLM provider rate limits. /// URLs are processed sequentially to respect LLM provider rate limits.
async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> { async fn run_batch_llm(
let client = cli: &Cli,
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; resolved: &config::ResolvedConfig,
let options = build_extraction_options(cli); entries: &[(String, Option<String>)],
let provider = build_llm_provider(cli).await?; ) -> Result<(), String> {
let model = cli.llm_model.as_deref(); let client = FetchClient::new(build_fetch_config(cli, resolved))
.map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(resolved);
let provider = build_llm_provider(cli, resolved).await?;
let model = resolved.llm_model.as_deref();
// Pre-parse schema once if --extract-json is used // Pre-parse schema once if --extract-json is used
let schema = if let Some(ref schema_input) = cli.extract_json { let schema = if let Some(ref schema_input) = cli.extract_json {
@ -2231,12 +2245,19 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
async fn main() { async fn main() {
dotenvy::dotenv().ok(); dotenvy::dotenv().ok();
let cli = Cli::parse(); // Use low-level API to get both typed Cli and ArgMatches for ValueSource detection.
init_logging(cli.verbose); let matches = Cli::command().get_matches();
let cli = Cli::from_arg_matches(&matches).unwrap_or_else(|e| e.exit());
// Load config BEFORE init_logging so verbose from config takes effect.
let cfg = config::NoxaConfig::load(cli.config.as_deref());
let resolved = config::resolve(&cli, &matches, &cfg);
init_logging(resolved.verbose);
// --map: sitemap discovery mode // --map: sitemap discovery mode
if cli.map { if cli.map {
if let Err(e) = run_map(&cli).await { if let Err(e) = run_map(&cli, &resolved).await {
eprintln!("error: {e}"); eprintln!("error: {e}");
process::exit(1); process::exit(1);
} }
@ -2245,7 +2266,7 @@ async fn main() {
// --crawl: recursive crawl mode // --crawl: recursive crawl mode
if cli.crawl { if cli.crawl {
if let Err(e) = run_crawl(&cli).await { if let Err(e) = run_crawl(&cli, &resolved).await {
eprintln!("error: {e}"); eprintln!("error: {e}");
process::exit(1); process::exit(1);
} }
@ -2261,7 +2282,7 @@ async fn main() {
process::exit(1); process::exit(1);
} }
}; };
if let Err(e) = run_watch(&cli, &watch_urls).await { if let Err(e) = run_watch(&cli, &resolved, &watch_urls).await {
eprintln!("error: {e}"); eprintln!("error: {e}");
process::exit(1); process::exit(1);
} }
@ -2270,7 +2291,7 @@ async fn main() {
// --diff-with: change tracking mode // --diff-with: change tracking mode
if let Some(ref snapshot_path) = cli.diff_with { if let Some(ref snapshot_path) = cli.diff_with {
if let Err(e) = run_diff(&cli, snapshot_path).await { if let Err(e) = run_diff(&cli, &resolved, snapshot_path).await {
eprintln!("error: {e}"); eprintln!("error: {e}");
process::exit(1); process::exit(1);
} }
@ -2279,7 +2300,7 @@ async fn main() {
// --brand: brand identity extraction mode // --brand: brand identity extraction mode
if cli.brand { if cli.brand {
if let Err(e) = run_brand(&cli).await { if let Err(e) = run_brand(&cli, &resolved).await {
eprintln!("error: {e}"); eprintln!("error: {e}");
process::exit(1); process::exit(1);
} }
@ -2308,11 +2329,11 @@ async fn main() {
// When multiple URLs are provided, run batch LLM extraction over all of them. // When multiple URLs are provided, run batch LLM extraction over all of them.
if has_llm_flags(&cli) { if has_llm_flags(&cli) {
if entries.len() > 1 { if entries.len() > 1 {
if let Err(e) = run_batch_llm(&cli, &entries).await { if let Err(e) = run_batch_llm(&cli, &resolved, &entries).await {
eprintln!("error: {e}"); eprintln!("error: {e}");
process::exit(1); process::exit(1);
} }
} else if let Err(e) = run_llm(&cli).await { } else if let Err(e) = run_llm(&cli, &resolved).await {
eprintln!("error: {e}"); eprintln!("error: {e}");
process::exit(1); process::exit(1);
} }
@ -2321,7 +2342,7 @@ async fn main() {
// Multi-URL batch mode // Multi-URL batch mode
if entries.len() > 1 { if entries.len() > 1 {
if let Err(e) = run_batch(&cli, &entries).await { if let Err(e) = run_batch(&cli, &resolved, &entries).await {
eprintln!("error: {e}"); eprintln!("error: {e}");
process::exit(1); process::exit(1);
} }
@ -2330,7 +2351,7 @@ async fn main() {
// --raw-html: skip extraction, dump the fetched HTML // --raw-html: skip extraction, dump the fetched HTML
if cli.raw_html && cli.include.is_none() && cli.exclude.is_none() { if cli.raw_html && cli.include.is_none() && cli.exclude.is_none() {
match fetch_html(&cli).await { match fetch_html(&cli, &resolved).await {
Ok(r) => println!("{}", r.html), Ok(r) => println!("{}", r.html),
Err(e) => { Err(e) => {
eprintln!("error: {e}"); eprintln!("error: {e}");
@ -2341,7 +2362,7 @@ async fn main() {
} }
// Single-page extraction (handles both HTML and PDF via content-type detection) // Single-page extraction (handles both HTML and PDF via content-type detection)
match fetch_and_extract(&cli).await { match fetch_and_extract(&cli, &resolved).await {
Ok(FetchOutput::Local(result)) => { Ok(FetchOutput::Local(result)) => {
if let Some(ref dir) = cli.output_dir { if let Some(ref dir) = cli.output_dir {
let url = cli let url = cli
@ -2350,18 +2371,19 @@ async fn main() {
.map(|u| normalize_url(u)) .map(|u| normalize_url(u))
.unwrap_or_default(); .unwrap_or_default();
let custom_name = entries.first().and_then(|(_, name)| name.clone()); let custom_name = entries.first().and_then(|(_, name)| name.clone());
let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format)); let filename =
let content = format_output(&result, &cli.format, cli.metadata); custom_name.unwrap_or_else(|| url_to_filename(&url, &resolved.format));
let content = format_output(&result, &resolved.format, resolved.metadata);
if let Err(e) = write_to_file(dir, &filename, &content) { if let Err(e) = write_to_file(dir, &filename, &content) {
eprintln!("error: {e}"); eprintln!("error: {e}");
process::exit(1); process::exit(1);
} }
} else { } else {
print_output(&result, &cli.format, cli.metadata); print_output(&result, &resolved.format, resolved.metadata);
} }
} }
Ok(FetchOutput::Cloud(resp)) => { Ok(FetchOutput::Cloud(resp)) => {
print_cloud_output(&resp, &cli.format); print_cloud_output(&resp, &resolved.format);
} }
Err(e) => { Err(e) => {
eprintln!("{e}"); eprintln!("{e}");