fix: error on explicit missing config path; update env.example; add README config docs

- config.rs: NoxaConfig::load() now exits with an error when an explicit path
  (via --config arg or NOXA_CONFIG env var) does not exist; implicit ./config.json
  missing still silently returns default
- Updated test: test_load_missing_file_returns_default replaced with
  test_load_implicit_missing_file_returns_default (tests None path, not explicit)
- env.example: replaced fat legacy content with slim secrets-only template
  matching .env.example; deleted redundant .env.example
- README.md: replaced bare env-var table with full Configuration section
  covering config.json workflow, precedence, NOXA_CONFIG override, and bool
  flag limitation
This commit is contained in:
Jacob Magar 2026-04-11 12:35:21 -04:00
parent 10364416c1
commit 1c112459bc
4 changed files with 322 additions and 62 deletions

View file

@ -1,17 +0,0 @@
# Secrets and URLs only — everything else goes in config.json
# See config.example.json for the full list of configurable defaults.
# Cloud API key (required for --cloud / --research)
NOXA_API_KEY=
# Single proxy URL (or use NOXA_PROXY_FILE for pool rotation)
NOXA_PROXY=
# Webhook URL for completion notifications
NOXA_WEBHOOK_URL=
# LLM base URL (Ollama or OpenAI-compatible endpoint)
NOXA_LLM_BASE_URL=
# Optional: path to a non-default config file (default: ./config.json)
# NOXA_CONFIG=/path/to/my-config.json

292
README.md
View file

@ -77,7 +77,7 @@ Download from [GitHub Releases](https://github.com/jmagar/noxa/releases) for mac
### Cargo (from source) ### Cargo (from source)
```bash ```bash
cargo install --git https://github.com/jmagar/noxa.git noxa cargo install --git https://github.com/jmagar/noxa.git noxa-cli --bin noxa
cargo install --git https://github.com/jmagar/noxa.git noxa-mcp cargo install --git https://github.com/jmagar/noxa.git noxa-mcp
``` ```
@ -159,6 +159,271 @@ Crawling... 50/50 pages extracted
--- ---
## Examples
### Basic Extraction
```bash
# Extract as markdown (default)
noxa https://example.com
# Multiple output formats
noxa https://example.com -f markdown # Clean markdown
noxa https://example.com -f json # Full structured JSON
noxa https://example.com -f text # Plain text (no formatting)
noxa https://example.com -f llm # Token-optimized for LLMs (67% fewer tokens)
# Bare domains work (auto-prepends https://)
noxa example.com
```
### Content Filtering
```bash
# Only extract main content (skip nav, sidebar, footer)
noxa https://docs.rs/tokio --only-main-content
# Include specific CSS selectors
noxa https://news.ycombinator.com --include ".titleline,.score"
# Exclude specific elements
noxa https://example.com --exclude "nav,footer,.ads,.sidebar"
# Combine both
noxa https://docs.rs/reqwest --only-main-content --exclude ".sidebar"
```
### Brand Identity Extraction
```bash
# Extract colors, fonts, logos from any website
noxa --brand https://stripe.com
# Output: { "name": "Stripe", "colors": [...], "fonts": ["Sohne"], "logos": [...] }
noxa --brand https://github.com
# Output: { "name": "GitHub", "colors": [{"hex": "#1F2328", ...}], "fonts": ["Mona Sans"], ... }
noxa --brand wikipedia.org
# Output: 10 colors, 5 fonts, favicon, logo URL
```
### Sitemap Discovery
```bash
# Discover all URLs from a site's sitemaps
noxa --map https://sitemaps.org
# Output: one URL per line (84 URLs found)
# JSON output with metadata
noxa --map https://sitemaps.org -f json
# Output: [{ "url": "...", "last_modified": "...", "priority": 0.8 }]
```
### Recursive Crawling
```bash
# Crawl a site (default: depth 1, max 20 pages)
noxa --crawl https://example.com
# Control depth and page limit
noxa --crawl --depth 2 --max-pages 50 https://docs.rs/tokio
# Crawl with sitemap seeding (finds more pages)
noxa --crawl --sitemap --depth 2 https://docs.rs/tokio
# Filter crawl paths
noxa --crawl --include-paths "/api/*,/guide/*" https://docs.example.com
noxa --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com
# Control concurrency and delay
noxa --crawl --concurrency 10 --delay 200 https://example.com
```
### Change Detection (Diff)
```bash
# Step 1: Save a snapshot
noxa https://example.com -f json > snapshot.json
# Step 2: Later, compare against the snapshot
noxa --diff-with snapshot.json https://example.com
# Output:
# Status: Same
# Word count delta: +0
# If the page changed:
# Status: Changed
# Word count delta: +42
# --- old
# +++ new
# @@ -1,3 +1,3 @@
# -Old content here
# +New content here
```
### PDF Extraction
```bash
# PDF URLs are auto-detected via Content-Type
noxa https://example.com/report.pdf
# Control PDF mode
noxa --pdf-mode auto https://example.com/report.pdf # Error on empty (catches scanned PDFs)
noxa --pdf-mode fast https://example.com/report.pdf # Return whatever text is found
```
### Batch Processing
```bash
# Multiple URLs in one command
noxa https://example.com https://httpbin.org/html https://rust-lang.org
# URLs from a file (one per line, # comments supported)
noxa --urls-file urls.txt
# Batch with JSON output
noxa --urls-file urls.txt -f json
# Proxy rotation for large batches
noxa --urls-file urls.txt --proxy-file proxies.txt --concurrency 10
```
### Local Files & Stdin
```bash
# Extract from a local HTML file
noxa --file page.html
# Pipe HTML from another command
curl -s https://example.com | noxa --stdin
# Chain with other tools
noxa https://example.com -f text | wc -w # Word count
noxa https://example.com -f json | jq '.metadata.title' # Extract title with jq
```
### Browser Impersonation
```bash
# Chrome (default) — latest Chrome TLS fingerprint
noxa https://example.com
# Firefox fingerprint
noxa --browser firefox https://example.com
# Random browser per request (good for batch)
noxa --browser random --urls-file urls.txt
```
### Custom Headers & Cookies
```bash
# Custom headers
noxa -H "Authorization: Bearer token123" https://api.example.com
noxa -H "Accept-Language: de-DE" https://example.com
# Cookies
noxa --cookie "session=abc123; theme=dark" https://example.com
# Multiple headers
noxa -H "X-Custom: value" -H "Authorization: Bearer token" https://example.com
```
### LLM-Powered Features
These require an LLM provider (Ollama local, or OpenAI/Anthropic API key).
```bash
# Summarize a page (default: 3 sentences)
noxa --summarize https://example.com
# Control summary length
noxa --summarize 5 https://example.com
# Extract structured JSON with a schema
noxa --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product
# Extract with a schema from file
noxa --extract-json @schema.json https://example.com/product
# Extract with natural language prompt
noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing
# Use a specific LLM provider
noxa --llm-provider ollama --summarize https://example.com
noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..." https://example.com
noxa --llm-provider anthropic --summarize https://example.com
```
### Raw HTML Output
```bash
# Get the raw fetched HTML (no extraction)
noxa --raw-html https://example.com
# Useful for debugging extraction issues
noxa --raw-html https://example.com > raw.html
noxa --file raw.html # Then extract locally
```
### Metadata & Verbose Mode
```bash
# Include YAML frontmatter with metadata
noxa --metadata https://example.com
# Output:
# ---
# title: "Example Domain"
# source: "https://example.com"
# word_count: 20
# ---
# # Example Domain
# ...
# Verbose logging (debug extraction pipeline)
noxa -v https://example.com
```
### Proxy Usage
```bash
# Single proxy
noxa --proxy http://user:pass@proxy.example.com:8080 https://example.com
# SOCKS5 proxy
noxa --proxy socks5://proxy.example.com:1080 https://example.com
# Proxy rotation from file (one per line: host:port:user:pass)
noxa --proxy-file proxies.txt https://example.com
# Auto-load proxies.txt from current directory
echo "proxy1.com:8080:user:pass" > proxies.txt
noxa https://example.com # Automatically detects and uses proxies.txt
```
### Real-World Recipes
```bash
# Monitor competitor pricing — save today's pricing
noxa --extract-json '{"type":"array","items":{"type":"object","properties":{"plan":{"type":"string"},"price":{"type":"string"}}}}' \
https://competitor.com/pricing -f json > pricing-$(date +%Y%m%d).json
# Build a documentation search index
noxa --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt
# Extract all images from a page
noxa https://example.com -f json | jq -r '.content.images[].src'
# Get all external links
noxa https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href'
# Compare two pages
noxa https://site-a.com -f json > a.json
noxa https://site-b.com --diff-with a.json
```
---
## MCP Server — 10 tools for AI agents ## MCP Server — 10 tools for AI agents
<a href="https://glama.ai/mcp/servers/jmagar/noxa"><img src="https://glama.ai/mcp/servers/jmagar/noxa/badge" alt="noxa MCP server" /></a> <a href="https://glama.ai/mcp/servers/jmagar/noxa"><img src="https://glama.ai/mcp/servers/jmagar/noxa/badge" alt="noxa MCP server" /></a>
@ -327,6 +592,31 @@ noxa/
## Configuration ## Configuration
Non-secret defaults live in `config.json` in your working directory. Copy the example:
```bash
cp config.example.json config.json
```
**Precedence:** CLI flags > `config.json` > built-in defaults
**Secrets and URLs** (API keys, proxy, webhook, LLM base URL) always go in `.env`, not `config.json`:
```bash
cp env.example .env
```
**Override config path** for a single run:
```bash
NOXA_CONFIG=/path/to/other-config.json noxa https://example.com
NOXA_CONFIG=/dev/null noxa https://example.com # bypass config entirely (NOTE(review): /dev/null exists but is empty, not valid JSON — confirm the loader treats an empty file as "no config" rather than exiting on a parse error)
```
**Bool flag limitation:** flags like `--metadata`, `--only-main-content`, `--verbose` set to `true` in `config.json` cannot be overridden to `false` from the CLI for a single run (clap has no `--no-flag` variant). Use `NOXA_CONFIG=/dev/null` to bypass (verify first that the loader accepts an empty config file — an empty file is not valid JSON and may be rejected as a parse error).
### Environment variables
| Variable | Description | | Variable | Description |
|----------|-------------| |----------|-------------|
| `NOXA_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) | | `NOXA_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) |

View file

@ -54,13 +54,23 @@ impl NoxaConfig {
/// Returns an empty (all-None) config if the file doesn't exist. /// Returns an empty (all-None) config if the file doesn't exist.
/// Prints an error and exits if the file exists but is invalid JSON. /// Prints an error and exits if the file exists but is invalid JSON.
pub fn load(explicit_path: Option<&str>) -> Self { pub fn load(explicit_path: Option<&str>) -> Self {
let noxa_config_env = std::env::var("NOXA_CONFIG").ok();
let was_explicit = explicit_path.is_some() || noxa_config_env.is_some();
let path_str = explicit_path let path_str = explicit_path
.map(String::from) .map(String::from)
.or_else(|| std::env::var("NOXA_CONFIG").ok()) .or(noxa_config_env)
.unwrap_or_else(|| "config.json".to_string()); .unwrap_or_else(|| "config.json".to_string());
let path = Path::new(&path_str); let path = Path::new(&path_str);
if !path.exists() { if !path.exists() {
if was_explicit {
let display_name = path.file_name()
.and_then(|n| n.to_str())
.unwrap_or(&path_str);
eprintln!("error: config file not found: {display_name}");
std::process::exit(1);
}
return Self::default(); return Self::default();
} }
@ -292,8 +302,14 @@ mod tests {
} }
#[test] #[test]
fn test_load_missing_file_returns_default() { fn test_load_implicit_missing_file_returns_default() {
let cfg = NoxaConfig::load(Some("/nonexistent/path/config.json")); // When no explicit path and ./config.json doesn't exist, silently return default.
// The simplest test: call with None and rely on ./config.json not existing in test env.
// If CWD has config.json this test is skipped to avoid flakiness.
if std::path::Path::new("config.json").exists() {
return; // skip: CWD has config.json
}
let cfg = NoxaConfig::load(None);
assert!(cfg.format.is_none()); assert!(cfg.format.is_none());
} }
} }

View file

@ -1,46 +1,17 @@
# ============================================ # Secrets and URLs only — everything else goes in config.json
# Noxa Configuration # See config.example.json for the full list of configurable defaults.
# Copy to .env and fill in your values
# ============================================
# --- LLM Providers --- # Cloud API key (required for --cloud / --research)
NOXA_API_KEY=
# Gemini CLI (primary provider — requires `gemini` binary on PATH) # Single proxy URL (or use NOXA_PROXY_FILE for pool rotation)
# GEMINI_MODEL=gemini-2.5-pro # defaults to gemini-2.5-pro NOXA_PROXY=
# Ollama (fallback; local inference) # Webhook URL for completion notifications
OLLAMA_HOST=http://localhost:11434 NOXA_WEBHOOK_URL=
OLLAMA_MODEL=qwen3:8b
# OpenAI (optional cloud fallback) # LLM base URL (Ollama or OpenAI-compatible endpoint)
# OPENAI_API_KEY — set your OpenAI key NOXA_LLM_BASE_URL=
# OPENAI_BASE_URL — defaults to https://api.openai.com/v1
# OPENAI_MODEL — defaults to gpt-4o-mini
# Anthropic (optional cloud fallback) # Optional: path to a non-default config file (default: ./config.json)
# ANTHROPIC_API_KEY — set your Anthropic key # NOXA_CONFIG=/path/to/my-config.json
# ANTHROPIC_MODEL — defaults to claude-sonnet-4-20250514
# --- Proxy ---
# Single proxy
# NOXA_PROXY=http://user:pass@host:port
# Proxy file (one per line: host:port:user:pass)
# NOXA_PROXY_FILE=/path/to/proxies.txt
# --- Server (noxa-server only) ---
# NOXA_PORT=3000
# NOXA_HOST=0.0.0.0
# NOXA_AUTH_KEY=your-auth-key
# NOXA_MAX_CONCURRENCY=50
# NOXA_JOB_TTL_SECS=3600
# NOXA_MAX_JOBS=100
# --- CLI LLM overrides ---
# NOXA_LLM_PROVIDER=ollama
# NOXA_LLM_MODEL=qwen3:8b
# NOXA_LLM_BASE_URL=http://localhost:11434
# --- Logging ---
# NOXA_LOG=info