feat: add NoxaConfig and ResolvedConfig with load()

Introduces config.rs with NoxaConfig (serde Deserialize, all-optional fields,
unknown-field-tolerant), ResolvedConfig (concrete post-merge struct), and
NoxaConfig::load() (explicit path > NOXA_CONFIG env > ./config.json, missing
file returns default). Also adds Debug derives to OutputFormat, Browser, and
PdfModeArg required by NoxaConfig. 4 tests pass.
This commit is contained in:
Jacob Magar 2026-04-11 12:16:56 -04:00
parent cc1617a3a9
commit 3bc6a9920b
2 changed files with 191 additions and 3 deletions

View file

@ -0,0 +1,187 @@
use serde::Deserialize;
use std::path::Path;
use crate::{Browser, OutputFormat, PdfModeArg};
/// Non-secret, non-URL configuration defaults loaded from config.json.
/// All fields optional — absent means "use the hard default".
/// Unknown fields are silently ignored (serde default) so config files
/// written for a newer version of noxa work on older binaries.
///
/// DELIBERATELY EXCLUDED:
/// - on_change: passes content to sh -c; must remain CLI-only to prevent
/// shell injection via config file writes.
/// - Secrets/URLs (api_key, proxy, webhook, llm_base_url): stay in .env.
///
/// BOOL FLAG LIMITATION:
/// only_main_content, metadata, verbose, use_sitemap set to true here
/// cannot be overridden to false from the CLI for a single run (no --no-flag
/// variant in clap). Edit config.json or use NOXA_CONFIG=/dev/null to bypass.
#[derive(Debug, Default, Deserialize)]
pub struct NoxaConfig {
// Output
pub format: Option<OutputFormat>,
pub metadata: Option<bool>,
pub verbose: Option<bool>,
// Fetch
pub browser: Option<Browser>,
pub timeout: Option<u64>,
pub pdf_mode: Option<PdfModeArg>,
pub only_main_content: Option<bool>,
// CSS selectors
pub include_selectors: Option<Vec<String>>,
pub exclude_selectors: Option<Vec<String>>,
// Crawl
pub depth: Option<usize>,
pub max_pages: Option<usize>,
pub concurrency: Option<usize>,
pub delay: Option<u64>,
pub path_prefix: Option<String>,
pub include_paths: Option<Vec<String>>,
pub exclude_paths: Option<Vec<String>>,
pub use_sitemap: Option<bool>,
// LLM (non-secret: provider name and model only; base URL stays in .env)
pub llm_provider: Option<String>,
pub llm_model: Option<String>,
}
impl NoxaConfig {
/// Load config from an explicit path, NOXA_CONFIG env var, or ./config.json.
/// Returns an empty (all-None) config if the file doesn't exist.
/// Prints an error and exits if the file exists but is invalid JSON.
pub fn load(explicit_path: Option<&str>) -> Self {
let path_str = explicit_path
.map(String::from)
.or_else(|| std::env::var("NOXA_CONFIG").ok())
.unwrap_or_else(|| "config.json".to_string());
let path = Path::new(&path_str);
if !path.exists() {
return Self::default();
}
let display_name = path.file_name()
.and_then(|n| n.to_str())
.unwrap_or(&path_str);
eprintln!(
"noxa: config loaded from {display_name} \
(API keys and secrets belong in .env, not config.json)"
);
tracing::debug!("config path: {}", path.display());
let content = match std::fs::read_to_string(path) {
Ok(s) => s,
Err(e) => {
eprintln!("error: cannot read config file {display_name}: {e}");
std::process::exit(1);
}
};
match serde_json::from_str(&content) {
Ok(cfg) => cfg,
Err(e) => {
eprintln!("error: invalid JSON in config file {display_name}: {e}");
std::process::exit(1);
}
}
}
}
/// Fully resolved configuration after merging CLI flags > config file > hard defaults.
/// All fields are concrete — no Option<T>. This is what the rest of main.rs reads.
///
/// The merge uses clap's ValueSource to detect which fields were explicitly set on
/// the command line. CLI-explicit values always win. Config fills in the rest.
/// Hard defaults are the fallback of last resort.
pub struct ResolvedConfig {
// Output
pub format: OutputFormat,
pub metadata: bool,
pub verbose: bool,
// Fetch
pub browser: Browser,
pub timeout: u64,
pub pdf_mode: PdfModeArg,
pub only_main_content: bool,
pub raw_html: bool,
// CSS selectors
pub include_selectors: Vec<String>,
pub exclude_selectors: Vec<String>,
// Crawl
pub depth: usize,
pub max_pages: usize,
pub concurrency: usize,
pub delay: u64,
pub path_prefix: Option<String>,
/// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
pub include_paths: Vec<String>,
/// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
pub exclude_paths: Vec<String>,
pub use_sitemap: bool,
// LLM
pub llm_provider: Option<String>,
pub llm_model: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_noxa_config_deserialize_full() {
let json = r#"{
"format": "llm",
"depth": 3,
"max_pages": 100,
"concurrency": 10,
"delay": 200,
"browser": "firefox",
"timeout": 60,
"only_main_content": true,
"use_sitemap": true,
"path_prefix": "/docs/",
"include_paths": ["/docs/*", "/api/*"],
"exclude_paths": ["/changelog/*", "/blog/*"],
"include_selectors": ["article", ".content"],
"exclude_selectors": ["nav", "footer"],
"llm_provider": "gemini",
"llm_model": "gemini-2.5-pro",
"pdf_mode": "fast",
"metadata": true,
"verbose": false
}"#;
let cfg: NoxaConfig = serde_json::from_str(json).unwrap();
assert!(matches!(cfg.format, Some(crate::OutputFormat::Llm)));
assert_eq!(cfg.depth, Some(3));
assert_eq!(cfg.exclude_paths, Some(vec!["/changelog/*".to_string(), "/blog/*".to_string()]));
assert!(matches!(cfg.pdf_mode, Some(crate::PdfModeArg::Fast)));
}
#[test]
fn test_noxa_config_empty() {
let cfg: NoxaConfig = serde_json::from_str("{}").unwrap();
assert!(cfg.format.is_none());
assert!(cfg.depth.is_none());
}
#[test]
fn test_noxa_config_unknown_fields_ignored() {
// Unknown fields must NOT cause a parse failure
let cfg: NoxaConfig = serde_json::from_str(r#"{"depth": 2, "future_field": true}"#).unwrap();
assert_eq!(cfg.depth, Some(2));
}
#[test]
fn test_load_missing_file_returns_default() {
let cfg = NoxaConfig::load(Some("/nonexistent/path/config.json"));
assert!(cfg.format.is_none());
}
}

View file

@ -2,6 +2,7 @@
/// CLI entry point -- wires noxa-core and noxa-fetch into a single command.
/// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
mod cloud;
mod config;
use std::io::{self, Read as _};
use std::path::{Path, PathBuf};
@ -285,7 +286,7 @@ struct Cli {
output_dir: Option<PathBuf>,
}
#[derive(Clone, ValueEnum, Deserialize)]
#[derive(Clone, Debug, ValueEnum, Deserialize)]
#[serde(rename_all = "lowercase")]
enum OutputFormat {
Markdown,
@ -295,7 +296,7 @@ enum OutputFormat {
Html,
}
#[derive(Clone, ValueEnum, Deserialize)]
#[derive(Clone, Debug, ValueEnum, Deserialize)]
#[serde(rename_all = "lowercase")]
enum Browser {
Chrome,
@ -303,7 +304,7 @@ enum Browser {
Random,
}
#[derive(Clone, ValueEnum, Default, Deserialize)]
#[derive(Clone, Debug, ValueEnum, Default, Deserialize)]
#[serde(rename_all = "lowercase")]
enum PdfModeArg {
/// Error if PDF has no extractable text (catches scanned PDFs)