mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat: add NoxaConfig and ResolvedConfig with load()
Introduces config.rs with NoxaConfig (serde Deserialize, all-optional fields, unknown-field-tolerant), ResolvedConfig (concrete post-merge struct), and NoxaConfig::load() (explicit path > NOXA_CONFIG env > ./config.json, missing file returns default). Also adds Debug derives to OutputFormat, Browser, and PdfModeArg required by NoxaConfig. 4 tests pass.
This commit is contained in:
parent
cc1617a3a9
commit
3bc6a9920b
2 changed files with 191 additions and 3 deletions
187
crates/noxa-cli/src/config.rs
Normal file
187
crates/noxa-cli/src/config.rs
Normal file
|
|
@ -0,0 +1,187 @@
|
|||
use serde::Deserialize;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::{Browser, OutputFormat, PdfModeArg};
|
||||
|
||||
/// Non-secret, non-URL configuration defaults loaded from config.json.
|
||||
/// All fields optional — absent means "use the hard default".
|
||||
/// Unknown fields are silently ignored (serde default) so config files
|
||||
/// written for a newer version of noxa work on older binaries.
|
||||
///
|
||||
/// DELIBERATELY EXCLUDED:
|
||||
/// - on_change: passes content to sh -c; must remain CLI-only to prevent
|
||||
/// shell injection via config file writes.
|
||||
/// - Secrets/URLs (api_key, proxy, webhook, llm_base_url): stay in .env.
|
||||
///
|
||||
/// BOOL FLAG LIMITATION:
|
||||
/// only_main_content, metadata, verbose, use_sitemap set to true here
|
||||
/// cannot be overridden to false from the CLI for a single run (no --no-flag
|
||||
/// variant in clap). Edit config.json or use NOXA_CONFIG=/dev/null to bypass.
|
||||
#[derive(Debug, Default, Deserialize)]
|
||||
pub struct NoxaConfig {
|
||||
// Output
|
||||
pub format: Option<OutputFormat>,
|
||||
pub metadata: Option<bool>,
|
||||
pub verbose: Option<bool>,
|
||||
|
||||
// Fetch
|
||||
pub browser: Option<Browser>,
|
||||
pub timeout: Option<u64>,
|
||||
pub pdf_mode: Option<PdfModeArg>,
|
||||
pub only_main_content: Option<bool>,
|
||||
|
||||
// CSS selectors
|
||||
pub include_selectors: Option<Vec<String>>,
|
||||
pub exclude_selectors: Option<Vec<String>>,
|
||||
|
||||
// Crawl
|
||||
pub depth: Option<usize>,
|
||||
pub max_pages: Option<usize>,
|
||||
pub concurrency: Option<usize>,
|
||||
pub delay: Option<u64>,
|
||||
pub path_prefix: Option<String>,
|
||||
pub include_paths: Option<Vec<String>>,
|
||||
pub exclude_paths: Option<Vec<String>>,
|
||||
pub use_sitemap: Option<bool>,
|
||||
|
||||
// LLM (non-secret: provider name and model only; base URL stays in .env)
|
||||
pub llm_provider: Option<String>,
|
||||
pub llm_model: Option<String>,
|
||||
}
|
||||
|
||||
impl NoxaConfig {
|
||||
/// Load config from an explicit path, NOXA_CONFIG env var, or ./config.json.
|
||||
/// Returns an empty (all-None) config if the file doesn't exist.
|
||||
/// Prints an error and exits if the file exists but is invalid JSON.
|
||||
pub fn load(explicit_path: Option<&str>) -> Self {
|
||||
let path_str = explicit_path
|
||||
.map(String::from)
|
||||
.or_else(|| std::env::var("NOXA_CONFIG").ok())
|
||||
.unwrap_or_else(|| "config.json".to_string());
|
||||
|
||||
let path = Path::new(&path_str);
|
||||
if !path.exists() {
|
||||
return Self::default();
|
||||
}
|
||||
|
||||
let display_name = path.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or(&path_str);
|
||||
eprintln!(
|
||||
"noxa: config loaded from {display_name} \
|
||||
(API keys and secrets belong in .env, not config.json)"
|
||||
);
|
||||
tracing::debug!("config path: {}", path.display());
|
||||
|
||||
let content = match std::fs::read_to_string(path) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
eprintln!("error: cannot read config file {display_name}: {e}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
match serde_json::from_str(&content) {
|
||||
Ok(cfg) => cfg,
|
||||
Err(e) => {
|
||||
eprintln!("error: invalid JSON in config file {display_name}: {e}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Fully resolved configuration after merging CLI flags > config file > hard defaults.
|
||||
/// All fields are concrete — no Option<T>. This is what the rest of main.rs reads.
|
||||
///
|
||||
/// The merge uses clap's ValueSource to detect which fields were explicitly set on
|
||||
/// the command line. CLI-explicit values always win. Config fills in the rest.
|
||||
/// Hard defaults are the fallback of last resort.
|
||||
pub struct ResolvedConfig {
|
||||
// Output
|
||||
pub format: OutputFormat,
|
||||
pub metadata: bool,
|
||||
pub verbose: bool,
|
||||
|
||||
// Fetch
|
||||
pub browser: Browser,
|
||||
pub timeout: u64,
|
||||
pub pdf_mode: PdfModeArg,
|
||||
pub only_main_content: bool,
|
||||
pub raw_html: bool,
|
||||
|
||||
// CSS selectors
|
||||
pub include_selectors: Vec<String>,
|
||||
pub exclude_selectors: Vec<String>,
|
||||
|
||||
// Crawl
|
||||
pub depth: usize,
|
||||
pub max_pages: usize,
|
||||
pub concurrency: usize,
|
||||
pub delay: u64,
|
||||
pub path_prefix: Option<String>,
|
||||
/// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
|
||||
pub include_paths: Vec<String>,
|
||||
/// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
|
||||
pub exclude_paths: Vec<String>,
|
||||
pub use_sitemap: bool,
|
||||
|
||||
// LLM
|
||||
pub llm_provider: Option<String>,
|
||||
pub llm_model: Option<String>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an owned `Vec<String>` from string literals for comparisons.
    fn strings(items: &[&str]) -> Vec<String> {
        items.iter().map(|s| s.to_string()).collect()
    }

    /// Every supported key is present; each parsed value must round-trip.
    #[test]
    fn test_noxa_config_deserialize_full() {
        let json = r#"{
            "format": "llm",
            "depth": 3,
            "max_pages": 100,
            "concurrency": 10,
            "delay": 200,
            "browser": "firefox",
            "timeout": 60,
            "only_main_content": true,
            "use_sitemap": true,
            "path_prefix": "/docs/",
            "include_paths": ["/docs/*", "/api/*"],
            "exclude_paths": ["/changelog/*", "/blog/*"],
            "include_selectors": ["article", ".content"],
            "exclude_selectors": ["nav", "footer"],
            "llm_provider": "gemini",
            "llm_model": "gemini-2.5-pro",
            "pdf_mode": "fast",
            "metadata": true,
            "verbose": false
        }"#;
        let cfg: NoxaConfig = serde_json::from_str(json).unwrap();

        // Enum-typed fields.
        assert!(matches!(cfg.format, Some(crate::OutputFormat::Llm)));
        assert!(matches!(cfg.pdf_mode, Some(crate::PdfModeArg::Fast)));
        assert!(cfg.browser.is_some());

        // Numeric fields.
        assert_eq!(cfg.depth, Some(3));
        assert_eq!(cfg.max_pages, Some(100));
        assert_eq!(cfg.concurrency, Some(10));
        assert_eq!(cfg.delay, Some(200));
        assert_eq!(cfg.timeout, Some(60));

        // Boolean fields: an explicit `false` must stay distinguishable from absent.
        assert_eq!(cfg.only_main_content, Some(true));
        assert_eq!(cfg.use_sitemap, Some(true));
        assert_eq!(cfg.metadata, Some(true));
        assert_eq!(cfg.verbose, Some(false));

        // String and list fields.
        assert_eq!(cfg.path_prefix.as_deref(), Some("/docs/"));
        assert_eq!(cfg.include_paths, Some(strings(&["/docs/*", "/api/*"])));
        assert_eq!(cfg.exclude_paths, Some(strings(&["/changelog/*", "/blog/*"])));
        assert_eq!(cfg.include_selectors, Some(strings(&["article", ".content"])));
        assert_eq!(cfg.exclude_selectors, Some(strings(&["nav", "footer"])));
        assert_eq!(cfg.llm_provider.as_deref(), Some("gemini"));
        assert_eq!(cfg.llm_model.as_deref(), Some("gemini-2.5-pro"));
    }

    /// An empty JSON object yields an all-`None` config.
    #[test]
    fn test_noxa_config_empty() {
        let cfg: NoxaConfig = serde_json::from_str("{}").unwrap();
        assert!(cfg.format.is_none());
        assert!(cfg.depth.is_none());
    }

    #[test]
    fn test_noxa_config_unknown_fields_ignored() {
        // Unknown fields must NOT cause a parse failure
        let cfg: NoxaConfig =
            serde_json::from_str(r#"{"depth": 2, "future_field": true}"#).unwrap();
        assert_eq!(cfg.depth, Some(2));
    }

    /// An explicit path that does not exist falls back to the default config.
    #[test]
    fn test_load_missing_file_returns_default() {
        let cfg = NoxaConfig::load(Some("/nonexistent/path/config.json"));
        assert!(cfg.format.is_none());
    }
}
|
||||
|
|
@ -2,6 +2,7 @@
|
|||
/// CLI entry point -- wires noxa-core and noxa-fetch into a single command.
|
||||
/// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
|
||||
mod cloud;
|
||||
mod config;
|
||||
|
||||
use std::io::{self, Read as _};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
|
@ -285,7 +286,7 @@ struct Cli {
|
|||
output_dir: Option<PathBuf>,
|
||||
}
|
||||
|
||||
#[derive(Clone, ValueEnum, Deserialize)]
|
||||
#[derive(Clone, Debug, ValueEnum, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
enum OutputFormat {
|
||||
Markdown,
|
||||
|
|
@ -295,7 +296,7 @@ enum OutputFormat {
|
|||
Html,
|
||||
}
|
||||
|
||||
#[derive(Clone, ValueEnum, Deserialize)]
|
||||
#[derive(Clone, Debug, ValueEnum, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
enum Browser {
|
||||
Chrome,
|
||||
|
|
@ -303,7 +304,7 @@ enum Browser {
|
|||
Random,
|
||||
}
|
||||
|
||||
#[derive(Clone, ValueEnum, Default, Deserialize)]
|
||||
#[derive(Clone, Debug, ValueEnum, Default, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
enum PdfModeArg {
|
||||
/// Error if PDF has no extractable text (catches scanned PDFs)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue