diff --git a/.env.example b/.env.example deleted file mode 100644 index aa69cd2..0000000 --- a/.env.example +++ /dev/null @@ -1,17 +0,0 @@ -# Secrets and URLs only — everything else goes in config.json -# See config.example.json for the full list of configurable defaults. - -# Cloud API key (required for --cloud / --research) -NOXA_API_KEY= - -# Single proxy URL (or use NOXA_PROXY_FILE for pool rotation) -NOXA_PROXY= - -# Webhook URL for completion notifications -NOXA_WEBHOOK_URL= - -# LLM base URL (Ollama or OpenAI-compatible endpoint) -NOXA_LLM_BASE_URL= - -# Optional: path to a non-default config file (default: ./config.json) -# NOXA_CONFIG=/path/to/my-config.json diff --git a/README.md b/README.md index cd3cba4..fea03dc 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Download from [GitHub Releases](https://github.com/jmagar/noxa/releases) for mac ### Cargo (from source) ```bash -cargo install --git https://github.com/jmagar/noxa.git noxa +cargo install --git https://github.com/jmagar/noxa.git noxa-cli --bin noxa cargo install --git https://github.com/jmagar/noxa.git noxa-mcp ``` @@ -159,6 +159,271 @@ Crawling... 
50/50 pages extracted --- +## Examples + +### Basic Extraction + +```bash +# Extract as markdown (default) +noxa https://example.com + +# Multiple output formats +noxa https://example.com -f markdown # Clean markdown +noxa https://example.com -f json # Full structured JSON +noxa https://example.com -f text # Plain text (no formatting) +noxa https://example.com -f llm # Token-optimized for LLMs (67% fewer tokens) + +# Bare domains work (auto-prepends https://) +noxa example.com +``` + +### Content Filtering + +```bash +# Only extract main content (skip nav, sidebar, footer) +noxa https://docs.rs/tokio --only-main-content + +# Include specific CSS selectors +noxa https://news.ycombinator.com --include ".titleline,.score" + +# Exclude specific elements +noxa https://example.com --exclude "nav,footer,.ads,.sidebar" + +# Combine both +noxa https://docs.rs/reqwest --only-main-content --exclude ".sidebar" +``` + +### Brand Identity Extraction + +```bash +# Extract colors, fonts, logos from any website +noxa --brand https://stripe.com +# Output: { "name": "Stripe", "colors": [...], "fonts": ["Sohne"], "logos": [...] } + +noxa --brand https://github.com +# Output: { "name": "GitHub", "colors": [{"hex": "#1F2328", ...}], "fonts": ["Mona Sans"], ... 
} + +noxa --brand wikipedia.org +# Output: 10 colors, 5 fonts, favicon, logo URL +``` + +### Sitemap Discovery + +```bash +# Discover all URLs from a site's sitemaps +noxa --map https://sitemaps.org +# Output: one URL per line (84 URLs found) + +# JSON output with metadata +noxa --map https://sitemaps.org -f json +# Output: [{ "url": "...", "last_modified": "...", "priority": 0.8 }] +``` + +### Recursive Crawling + +```bash +# Crawl a site (default: depth 1, max 20 pages) +noxa --crawl https://example.com + +# Control depth and page limit +noxa --crawl --depth 2 --max-pages 50 https://docs.rs/tokio + +# Crawl with sitemap seeding (finds more pages) +noxa --crawl --sitemap --depth 2 https://docs.rs/tokio + +# Filter crawl paths +noxa --crawl --include-paths "/api/*,/guide/*" https://docs.example.com +noxa --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com + +# Control concurrency and delay +noxa --crawl --concurrency 10 --delay 200 https://example.com +``` + +### Change Detection (Diff) + +```bash +# Step 1: Save a snapshot +noxa https://example.com -f json > snapshot.json + +# Step 2: Later, compare against the snapshot +noxa --diff-with snapshot.json https://example.com +# Output: +# Status: Same +# Word count delta: +0 + +# If the page changed: +# Status: Changed +# Word count delta: +42 +# --- old +# +++ new +# @@ -1,3 +1,3 @@ +# -Old content here +# +New content here +``` + +### PDF Extraction + +```bash +# PDF URLs are auto-detected via Content-Type +noxa https://example.com/report.pdf + +# Control PDF mode +noxa --pdf-mode auto https://example.com/report.pdf # Error on empty (catches scanned PDFs) +noxa --pdf-mode fast https://example.com/report.pdf # Return whatever text is found +``` + +### Batch Processing + +```bash +# Multiple URLs in one command +noxa https://example.com https://httpbin.org/html https://rust-lang.org + +# URLs from a file (one per line, # comments supported) +noxa --urls-file urls.txt + +# Batch with JSON output 
+noxa --urls-file urls.txt -f json + +# Proxy rotation for large batches +noxa --urls-file urls.txt --proxy-file proxies.txt --concurrency 10 +``` + +### Local Files & Stdin + +```bash +# Extract from a local HTML file +noxa --file page.html + +# Pipe HTML from another command +curl -s https://example.com | noxa --stdin + +# Chain with other tools +noxa https://example.com -f text | wc -w # Word count +noxa https://example.com -f json | jq '.metadata.title' # Extract title with jq +``` + +### Browser Impersonation + +```bash +# Chrome (default) — latest Chrome TLS fingerprint +noxa https://example.com + +# Firefox fingerprint +noxa --browser firefox https://example.com + +# Random browser per request (good for batch) +noxa --browser random --urls-file urls.txt +``` + +### Custom Headers & Cookies + +```bash +# Custom headers +noxa -H "Authorization: Bearer token123" https://api.example.com +noxa -H "Accept-Language: de-DE" https://example.com + +# Cookies +noxa --cookie "session=abc123; theme=dark" https://example.com + +# Multiple headers +noxa -H "X-Custom: value" -H "Authorization: Bearer token" https://example.com +``` + +### LLM-Powered Features + +These require an LLM provider (Ollama local, or OpenAI/Anthropic API key). + +```bash +# Summarize a page (default: 3 sentences) +noxa --summarize https://example.com + +# Control summary length +noxa --summarize 5 https://example.com + +# Extract structured JSON with a schema +noxa --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product + +# Extract with a schema from file +noxa --extract-json @schema.json https://example.com/product + +# Extract with natural language prompt +noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing + +# Use a specific LLM provider +noxa --llm-provider ollama --summarize https://example.com +noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..." 
https://example.com +noxa --llm-provider anthropic --summarize https://example.com +``` + +### Raw HTML Output + +```bash +# Get the raw fetched HTML (no extraction) +noxa --raw-html https://example.com + +# Useful for debugging extraction issues +noxa --raw-html https://example.com > raw.html +noxa --file raw.html # Then extract locally +``` + +### Metadata & Verbose Mode + +```bash +# Include YAML frontmatter with metadata +noxa --metadata https://example.com +# Output: +# --- +# title: "Example Domain" +# source: "https://example.com" +# word_count: 20 +# --- +# # Example Domain +# ... + +# Verbose logging (debug extraction pipeline) +noxa -v https://example.com +``` + +### Proxy Usage + +```bash +# Single proxy +noxa --proxy http://user:pass@proxy.example.com:8080 https://example.com + +# SOCKS5 proxy +noxa --proxy socks5://proxy.example.com:1080 https://example.com + +# Proxy rotation from file (one per line: host:port:user:pass) +noxa --proxy-file proxies.txt https://example.com + +# Auto-load proxies.txt from current directory +echo "proxy1.com:8080:user:pass" > proxies.txt +noxa https://example.com # Automatically detects and uses proxies.txt +``` + +### Real-World Recipes + +```bash +# Monitor competitor pricing — save today's pricing +noxa --extract-json '{"type":"array","items":{"type":"object","properties":{"plan":{"type":"string"},"price":{"type":"string"}}}}' \ + https://competitor.com/pricing -f json > pricing-$(date +%Y%m%d).json + +# Build a documentation search index +noxa --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt + +# Extract all images from a page +noxa https://example.com -f json | jq -r '.content.images[].src' + +# Get all external links +noxa https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href' + +# Compare two pages +noxa https://site-a.com -f json > a.json +noxa https://site-b.com --diff-with a.json +``` + +--- + ## MCP Server — 10 tools for AI 
agents noxa MCP server @@ -327,6 +592,31 @@ noxa/ ## Configuration +Non-secret defaults live in `config.json` in your working directory. Copy the example: + +```bash +cp config.example.json config.json +``` + +**Precedence:** CLI flags > `config.json` > built-in defaults + +**Secrets and URLs** (API keys, proxy, webhook, LLM base URL) always go in `.env`, not `config.json`: + +```bash +cp env.example .env +``` + +**Override config path** for a single run: + +```bash +NOXA_CONFIG=/path/to/other-config.json noxa https://example.com +NOXA_CONFIG=/dev/null noxa https://example.com # bypass config entirely +``` + +**Bool flag limitation:** flags like `--metadata`, `--only-main-content`, `--verbose` set to `true` in `config.json` cannot be overridden to `false` from the CLI for a single run (clap has no `--no-flag` variant). Use `NOXA_CONFIG=/dev/null` to bypass. + +### Environment variables + | Variable | Description | |----------|-------------| | `NOXA_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) | diff --git a/crates/noxa-cli/src/config.rs b/crates/noxa-cli/src/config.rs index 46e9e52..894716f 100644 --- a/crates/noxa-cli/src/config.rs +++ b/crates/noxa-cli/src/config.rs @@ -54,13 +54,23 @@ impl NoxaConfig { -/// Returns an empty (all-None) config if the file doesn't exist. -/// Prints an error and exits if the file exists but is invalid JSON. +/// Returns an empty (all-None) config if no config file was explicitly requested (CLI flag or `NOXA_CONFIG`) and the default `./config.json` doesn't exist. +/// Prints an error and exits if an explicitly requested file is missing, or if the file exists but is invalid JSON.
pub fn load(explicit_path: Option<&str>) -> Self { + let noxa_config_env = std::env::var("NOXA_CONFIG").ok(); + let was_explicit = explicit_path.is_some() || noxa_config_env.is_some(); + let path_str = explicit_path .map(String::from) - .or_else(|| std::env::var("NOXA_CONFIG").ok()) + .or(noxa_config_env) .unwrap_or_else(|| "config.json".to_string()); let path = Path::new(&path_str); if !path.exists() { + if was_explicit { + let display_name = path.file_name() + .and_then(|n| n.to_str()) + .unwrap_or(&path_str); + eprintln!("error: config file not found: {display_name}"); + std::process::exit(1); + } return Self::default(); } @@ -292,8 +302,14 @@ mod tests { } #[test] - fn test_load_missing_file_returns_default() { - let cfg = NoxaConfig::load(Some("/nonexistent/path/config.json")); + fn test_load_implicit_missing_file_returns_default() { + // When no explicit path and ./config.json doesn't exist, silently return default. + // The simplest test: call with None and rely on ./config.json not existing in test env. + // If CWD has config.json this test is skipped to avoid flakiness. + if std::path::Path::new("config.json").exists() { + return; // skip: CWD has config.json + } + let cfg = NoxaConfig::load(None); assert!(cfg.format.is_none()); } } diff --git a/env.example b/env.example index e81f4e4..aa69cd2 100644 --- a/env.example +++ b/env.example @@ -1,46 +1,17 @@ -# ============================================ -# Noxa Configuration -# Copy to .env and fill in your values -# ============================================ +# Secrets and URLs only — everything else goes in config.json +# See config.example.json for the full list of configurable defaults. 
-# --- LLM Providers --- +# Cloud API key (required for --cloud / --research) +NOXA_API_KEY= -# Gemini CLI (primary provider — requires `gemini` binary on PATH) -# GEMINI_MODEL=gemini-2.5-pro # defaults to gemini-2.5-pro +# Single proxy URL (or use NOXA_PROXY_FILE for pool rotation) +NOXA_PROXY= -# Ollama (fallback; local inference) -OLLAMA_HOST=http://localhost:11434 -OLLAMA_MODEL=qwen3:8b +# Webhook URL for completion notifications +NOXA_WEBHOOK_URL= -# OpenAI (optional cloud fallback) -# OPENAI_API_KEY — set your OpenAI key -# OPENAI_BASE_URL — defaults to https://api.openai.com/v1 -# OPENAI_MODEL — defaults to gpt-4o-mini +# LLM base URL (Ollama or OpenAI-compatible endpoint) +NOXA_LLM_BASE_URL= -# Anthropic (optional cloud fallback) -# ANTHROPIC_API_KEY — set your Anthropic key -# ANTHROPIC_MODEL — defaults to claude-sonnet-4-20250514 - -# --- Proxy --- - -# Single proxy -# NOXA_PROXY=http://user:pass@host:port - -# Proxy file (one per line: host:port:user:pass) -# NOXA_PROXY_FILE=/path/to/proxies.txt - -# --- Server (noxa-server only) --- -# NOXA_PORT=3000 -# NOXA_HOST=0.0.0.0 -# NOXA_AUTH_KEY=your-auth-key -# NOXA_MAX_CONCURRENCY=50 -# NOXA_JOB_TTL_SECS=3600 -# NOXA_MAX_JOBS=100 - -# --- CLI LLM overrides --- -# NOXA_LLM_PROVIDER=ollama -# NOXA_LLM_MODEL=qwen3:8b -# NOXA_LLM_BASE_URL=http://localhost:11434 - -# --- Logging --- -# NOXA_LOG=info +# Optional: path to a non-default config file (default: ./config.json) +# NOXA_CONFIG=/path/to/my-config.json