NOTE(review): this patch was recovered from a whitespace-collapsed copy in which
generic type arguments had been stripped. Types in config.rs were reconstructed
from ResolvedConfig and the unit tests. `output_dir: Option<PathBuf>` in the
main.rs hunk context is an assumption -- confirm against the real file. The
config.rs blob id on the `index` line predates the edits to `NoxaConfig::load`.

diff --git a/crates/noxa-cli/src/config.rs b/crates/noxa-cli/src/config.rs
new file mode 100644
index 0000000..30ab6c8
--- /dev/null
+++ b/crates/noxa-cli/src/config.rs
@@ -0,0 +1,205 @@
+use serde::Deserialize;
+use std::path::Path;
+
+use crate::{Browser, OutputFormat, PdfModeArg};
+
+/// Non-secret, non-URL configuration defaults loaded from config.json.
+/// All fields optional — absent means "use the hard default".
+/// Unknown fields are silently ignored (serde default) so config files
+/// written for a newer version of noxa work on older binaries.
+///
+/// DELIBERATELY EXCLUDED:
+/// - on_change: passes content to sh -c; must remain CLI-only to prevent
+///   shell injection via config file writes.
+/// - Secrets/URLs (api_key, proxy, webhook, llm_base_url): stay in .env.
+///
+/// BOOL FLAG LIMITATION:
+/// only_main_content, metadata, verbose, use_sitemap set to true here
+/// cannot be overridden to false from the CLI for a single run (no --no-flag
+/// variant in clap). Edit config.json or use NOXA_CONFIG=/dev/null to bypass.
+#[derive(Debug, Default, Deserialize)]
+pub struct NoxaConfig {
+    // Output
+    pub format: Option<OutputFormat>,
+    pub metadata: Option<bool>,
+    pub verbose: Option<bool>,
+
+    // Fetch
+    pub browser: Option<Browser>,
+    pub timeout: Option<u64>,
+    pub pdf_mode: Option<PdfModeArg>,
+    pub only_main_content: Option<bool>,
+
+    // CSS selectors
+    pub include_selectors: Option<Vec<String>>,
+    pub exclude_selectors: Option<Vec<String>>,
+
+    // Crawl
+    pub depth: Option<usize>,
+    pub max_pages: Option<usize>,
+    pub concurrency: Option<usize>,
+    pub delay: Option<u64>,
+    pub path_prefix: Option<String>,
+    pub include_paths: Option<Vec<String>>,
+    pub exclude_paths: Option<Vec<String>>,
+    pub use_sitemap: Option<bool>,
+
+    // LLM (non-secret: provider name and model only; base URL stays in .env)
+    pub llm_provider: Option<String>,
+    pub llm_model: Option<String>,
+}
+
+impl NoxaConfig {
+    /// Load config from an explicit path, NOXA_CONFIG env var, or ./config.json.
+    /// Returns an empty (all-None) config if the file doesn't exist or is empty
+    /// (so `NOXA_CONFIG=/dev/null` bypasses the config file).
+    /// Prints an error and exits if the file exists but is unreadable or invalid JSON.
+    pub fn load(explicit_path: Option<&str>) -> Self {
+        let path_str = explicit_path
+            .map(String::from)
+            .or_else(|| std::env::var("NOXA_CONFIG").ok())
+            .unwrap_or_else(|| "config.json".to_string());
+
+        let path = Path::new(&path_str);
+        if !path.exists() {
+            return Self::default();
+        }
+
+        let display_name = path.file_name()
+            .and_then(|n| n.to_str())
+            .unwrap_or(&path_str);
+
+        let content = match std::fs::read_to_string(path) {
+            Ok(s) => s,
+            Err(e) => {
+                eprintln!("error: cannot read config file {display_name}: {e}");
+                std::process::exit(1);
+            }
+        };
+
+        // An empty file (e.g. NOXA_CONFIG=/dev/null, the documented bypass) is not
+        // valid JSON; treat it as "no config" instead of exiting with a parse error.
+        if content.trim().is_empty() {
+            return Self::default();
+        }
+
+        let cfg: Self = match serde_json::from_str(&content) {
+            Ok(cfg) => cfg,
+            Err(e) => {
+                eprintln!("error: invalid JSON in config file {display_name}: {e}");
+                std::process::exit(1);
+            }
+        };
+
+        // Announce only after the file has actually been read and parsed.
+        eprintln!(
+            "noxa: config loaded from {display_name} \
+             (API keys and secrets belong in .env, not config.json)"
+        );
+        tracing::debug!("config path: {}", path.display());
+        cfg
+    }
+}
+
+/// Fully resolved configuration after merging CLI flags > config file > hard defaults.
+/// All fields are concrete — no Option. This is what the rest of main.rs reads.
+///
+/// The merge uses clap's ValueSource to detect which fields were explicitly set on
+/// the command line. CLI-explicit values always win. Config fills in the rest.
+/// Hard defaults are the fallback of last resort.
+pub struct ResolvedConfig {
+    // Output
+    pub format: OutputFormat,
+    pub metadata: bool,
+    pub verbose: bool,
+
+    // Fetch
+    pub browser: Browser,
+    pub timeout: u64,
+    pub pdf_mode: PdfModeArg,
+    pub only_main_content: bool,
+    pub raw_html: bool,
+
+    // CSS selectors
+    pub include_selectors: Vec<String>,
+    pub exclude_selectors: Vec<String>,
+
+    // Crawl
+    pub depth: usize,
+    pub max_pages: usize,
+    pub concurrency: usize,
+    pub delay: u64,
+    pub path_prefix: Option<String>,
+    /// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
+    pub include_paths: Vec<String>,
+    /// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
+    pub exclude_paths: Vec<String>,
+    pub use_sitemap: bool,
+
+    // LLM
+    pub llm_provider: Option<String>,
+    pub llm_model: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_noxa_config_deserialize_full() {
+        let json = r#"{
+            "format": "llm",
+            "depth": 3,
+            "max_pages": 100,
+            "concurrency": 10,
+            "delay": 200,
+            "browser": "firefox",
+            "timeout": 60,
+            "only_main_content": true,
+            "use_sitemap": true,
+            "path_prefix": "/docs/",
+            "include_paths": ["/docs/*", "/api/*"],
+            "exclude_paths": ["/changelog/*", "/blog/*"],
+            "include_selectors": ["article", ".content"],
+            "exclude_selectors": ["nav", "footer"],
+            "llm_provider": "gemini",
+            "llm_model": "gemini-2.5-pro",
+            "pdf_mode": "fast",
+            "metadata": true,
+            "verbose": false
+        }"#;
+        let cfg: NoxaConfig = serde_json::from_str(json).unwrap();
+        assert!(matches!(cfg.format, Some(crate::OutputFormat::Llm)));
+        assert_eq!(cfg.depth, Some(3));
+        assert_eq!(cfg.exclude_paths, Some(vec!["/changelog/*".to_string(), "/blog/*".to_string()]));
+        assert!(matches!(cfg.pdf_mode, Some(crate::PdfModeArg::Fast)));
+    }
+
+    #[test]
+    fn test_noxa_config_empty() {
+        let cfg: NoxaConfig = serde_json::from_str("{}").unwrap();
+        assert!(cfg.format.is_none());
+        assert!(cfg.depth.is_none());
+    }
+
+    #[test]
+    fn test_noxa_config_unknown_fields_ignored() {
+        // Unknown fields must NOT cause a parse failure
+        let cfg: NoxaConfig = serde_json::from_str(r#"{"depth": 2, "future_field": true}"#).unwrap();
+        assert_eq!(cfg.depth, Some(2));
+    }
+
+    #[test]
+    fn test_load_missing_file_returns_default() {
+        let cfg = NoxaConfig::load(Some("/nonexistent/path/config.json"));
+        assert!(cfg.format.is_none());
+    }
+
+    #[cfg(unix)]
+    #[test]
+    fn test_load_empty_file_returns_default() {
+        // /dev/null reads as empty; the documented bypass must not exit the process.
+        let cfg = NoxaConfig::load(Some("/dev/null"));
+        assert!(cfg.format.is_none());
+    }
+}
diff --git a/crates/noxa-cli/src/main.rs b/crates/noxa-cli/src/main.rs
index 80d20b4..60608c0 100644
--- a/crates/noxa-cli/src/main.rs
+++ b/crates/noxa-cli/src/main.rs
@@ -2,6 +2,7 @@
 /// CLI entry point -- wires noxa-core and noxa-fetch into a single command.
 /// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
 mod cloud;
+mod config;
 
 use std::io::{self, Read as _};
 use std::path::{Path, PathBuf};
@@ -285,7 +286,7 @@ struct Cli {
     output_dir: Option<PathBuf>,
 }
 
-#[derive(Clone, ValueEnum, Deserialize)]
+#[derive(Clone, Debug, ValueEnum, Deserialize)]
 #[serde(rename_all = "lowercase")]
 enum OutputFormat {
     Markdown,
@@ -295,7 +296,7 @@ enum OutputFormat {
     Html,
 }
 
-#[derive(Clone, ValueEnum, Deserialize)]
+#[derive(Clone, Debug, ValueEnum, Deserialize)]
 #[serde(rename_all = "lowercase")]
 enum Browser {
     Chrome,
@@ -303,7 +304,7 @@ enum Browser {
     Random,
 }
 
-#[derive(Clone, ValueEnum, Default, Deserialize)]
+#[derive(Clone, Debug, ValueEnum, Default, Deserialize)]
 #[serde(rename_all = "lowercase")]
 enum PdfModeArg {
     /// Error if PDF has no extractable text (catches scanned PDFs)