Merge pull request #2 from jmagar/feature/noxa-mcp-subcommand

refactor: add noxa mcp subcommand
This commit is contained in:
jmagar 2026-04-11 21:38:16 -04:00 committed by GitHub
commit 464eb1baec
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 1874 additions and 199 deletions

16
.gitignore vendored
View file

@ -1,5 +1,21 @@
target/
.DS_Store
.env
config.json
proxies.txt
.claude/skills/
.omc
.lavra
.beads
.cache
docs/plans
docs/superpowers
docs/reports
docs/sessions
benchmarks
docs
# Beads / Dolt files (added by bd init)
.dolt/
*.db
.beads-credential-key

View file

@ -15,8 +15,8 @@ noxa/
# + proxy pool rotation (per-request)
# + PDF content-type detection
# + document parsing (DOCX, XLSX, CSV)
noxa-llm/ # LLM provider chain (Ollama -> OpenAI -> Anthropic)
# + JSON schema extraction, prompt extraction, summarization
noxa-llm/ # LLM provider chain (Gemini CLI -> OpenAI -> Ollama -> Anthropic)
# + JSON schema extraction (validated + retry), prompt extraction, summarization
noxa-pdf/ # PDF text extraction via pdf-extract
noxa-mcp/ # MCP server (Model Context Protocol) for AI agents
noxa/ # CLI binary
@ -48,8 +48,10 @@ Two binaries: `noxa` (CLI), `noxa-mcp` (MCP server).
- `search.rs` — Web search via Serper.dev with parallel result scraping
### LLM Modules (`noxa-llm`)
- Provider chain: Ollama (local-first) -> OpenAI -> Anthropic
- JSON schema extraction, prompt-based extraction, summarization
- Provider chain: Gemini CLI (primary) -> OpenAI -> Ollama -> Anthropic
- Gemini CLI requires the `gemini` binary on PATH; `GEMINI_MODEL` env var controls model (default: `gemini-2.5-pro`)
- JSON schema extraction with jsonschema validation; parse failures retry once; schema mismatches fail immediately
- Prompt-based extraction, summarization
### PDF Modules (`noxa-pdf`)
- PDF text extraction via pdf-extract crate
@ -105,11 +107,15 @@ noxa https://example.com --diff-with snap.json
# Brand extraction
noxa https://example.com --brand
# LLM features (Ollama local-first)
# LLM features (Gemini CLI primary; requires `gemini` on PATH)
noxa https://example.com --summarize
noxa https://example.com --extract-prompt "Get all pricing tiers"
noxa https://example.com --extract-json '{"type":"object","properties":{"title":{"type":"string"}}}'
# Force a specific LLM provider
noxa https://example.com --llm-provider gemini --summarize
noxa https://example.com --llm-provider openai --summarize
# PDF (auto-detected via Content-Type)
noxa https://example.com/report.pdf

246
Cargo.lock generated
View file

@ -35,7 +35,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [
"cfg-if",
"getrandom 0.3.4",
"once_cell",
"serde",
"version_check",
"zerocopy",
]
@ -64,6 +66,12 @@ dependencies = [
"alloc-no-stdlib",
]
[[package]]
name = "allocator-api2"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
[[package]]
name = "android_system_properties"
version = "0.1.5"
@ -206,6 +214,21 @@ dependencies = [
"syn",
]
[[package]]
name = "bit-set"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
dependencies = [
"bit-vec",
]
[[package]]
name = "bit-vec"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
[[package]]
name = "bitflags"
version = "2.11.0"
@ -246,6 +269,12 @@ dependencies = [
"openssl-macros",
]
[[package]]
name = "borrow-or-share"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc0b364ead1874514c8c2855ab558056ebfeb775653e7ae45ff72f28f8f3166c"
[[package]]
name = "brotli"
version = "8.0.2"
@ -273,6 +302,12 @@ version = "3.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
[[package]]
name = "bytecount"
version = "0.6.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e"
[[package]]
name = "byteorder"
version = "1.5.0"
@ -601,6 +636,12 @@ dependencies = [
"syn",
]
[[package]]
name = "data-encoding"
version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea"
[[package]]
name = "debug_unsafe"
version = "0.1.4"
@ -726,6 +767,15 @@ version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]]
name = "email_address"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e079f19b08ca6239f47f8ba8509c11cf3ea30095831f7fed61441475edd8c449"
dependencies = [
"serde",
]
[[package]]
name = "encoding_rs"
version = "0.8.35"
@ -760,6 +810,17 @@ dependencies = [
"num-traits",
]
[[package]]
name = "fancy-regex"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
dependencies = [
"bit-set",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "fast-float2"
version = "0.2.3"
@ -789,6 +850,17 @@ dependencies = [
"zlib-rs",
]
[[package]]
name = "fluent-uri"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc74ac4d8359ae70623506d512209619e5cf8f347124910440dbc221714b328e"
dependencies = [
"borrow-or-share",
"ref-cast",
"serde",
]
[[package]]
name = "fnv"
version = "1.0.7"
@ -801,6 +873,12 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "foldhash"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
[[package]]
name = "foreign-types"
version = "0.5.0"
@ -837,6 +915,16 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "fraction"
version = "0.15.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f158e3ff0a1b334408dc9fb811cd99b446986f4d8b741bb08f9df1604085ae7"
dependencies = [
"lazy_static",
"num",
]
[[package]]
name = "fs_extra"
version = "1.3.0"
@ -1037,7 +1125,7 @@ version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
"foldhash",
"foldhash 0.1.5",
]
[[package]]
@ -1045,6 +1133,11 @@ name = "hashbrown"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash 0.2.0",
]
[[package]]
name = "heck"
@ -1410,6 +1503,33 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "jsonschema"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84695c6689b01384700a3d93acecbd07231ee6fff1bf22ae980b4c307e6ddfd5"
dependencies = [
"ahash",
"bytecount",
"data-encoding",
"email_address",
"fancy-regex",
"fraction",
"getrandom 0.3.4",
"idna",
"itoa",
"num-cmp",
"num-traits",
"percent-encoding",
"referencing",
"regex",
"regex-syntax",
"serde",
"serde_json",
"unicode-general-category",
"uuid-simd",
]
[[package]]
name = "lazy_static"
version = "1.5.0"
@ -1575,6 +1695,12 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
[[package]]
name = "micromap"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a86d3146ed3995b5913c414f6664344b9617457320782e64f0bb44afd49d74"
[[package]]
name = "minimal-lexical"
version = "0.2.1"
@ -1627,10 +1753,12 @@ dependencies = [
"noxa-core",
"noxa-fetch",
"noxa-llm",
"noxa-mcp",
"noxa-pdf",
"rand 0.8.5",
"regex",
"reqwest",
"serde",
"serde_json",
"tokio",
"tracing",
@ -1683,6 +1811,7 @@ name = "noxa-llm"
version = "0.3.11"
dependencies = [
"async-trait",
"jsonschema",
"reqwest",
"serde",
"serde_json",
@ -1730,12 +1859,82 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "num"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
dependencies = [
"num-bigint",
"num-complex",
"num-integer",
"num-iter",
"num-rational",
"num-traits",
]
[[package]]
name = "num-bigint"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
dependencies = [
"num-integer",
"num-traits",
]
[[package]]
name = "num-cmp"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63335b2e2c34fae2fb0aa2cecfd9f0832a1e24b3b32ecec612c3426d46dc8aaa"
[[package]]
name = "num-complex"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
"num-traits",
]
[[package]]
name = "num-conv"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967"
[[package]]
name = "num-integer"
version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
"num-traits",
]
[[package]]
name = "num-iter"
version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
"num-bigint",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.19"
@ -1774,6 +1973,12 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
[[package]]
name = "outref"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e"
[[package]]
name = "parking_lot"
version = "0.12.5"
@ -2160,6 +2365,23 @@ dependencies = [
"syn",
]
[[package]]
name = "referencing"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2d5554bf79f4acf770dc3193b44b2d63b348f5f7b7448a0ea1191b37b620728"
dependencies = [
"ahash",
"fluent-uri",
"getrandom 0.3.4",
"hashbrown 0.16.1",
"itoa",
"micromap",
"parking_lot",
"percent-encoding",
"serde_json",
]
[[package]]
name = "regex"
version = "1.12.3"
@ -2985,6 +3207,12 @@ version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
[[package]]
name = "unicode-general-category"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b993bddc193ae5bd0d623b49ec06ac3e9312875fdae725a975c51db1cc1677f"
[[package]]
name = "unicode-ident"
version = "1.0.24"
@ -3049,6 +3277,16 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "uuid-simd"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23b082222b4f6619906941c17eb2297fff4c2fb96cb60164170522942a200bd8"
dependencies = [
"outref",
"vsimd",
]
[[package]]
name = "valuable"
version = "0.1.1"
@ -3061,6 +3299,12 @@ version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "vsimd"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64"
[[package]]
name = "want"
version = "0.3.1"

View file

@ -13,6 +13,7 @@ noxa-core = { path = "crates/noxa-core" }
noxa-fetch = { path = "crates/noxa-fetch" }
noxa-llm = { path = "crates/noxa-llm" }
noxa-pdf = { path = "crates/noxa-pdf" }
noxa-mcp = { path = "crates/noxa-mcp" }
tokio = { version = "1", features = ["full"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
@ -21,3 +22,6 @@ tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
clap = { version = "4", features = ["derive", "env"] }
dotenvy = "0.15"
rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] }
schemars = "1.0"
dirs = "6.0.0"

292
README.md
View file

@ -77,7 +77,7 @@ Download from [GitHub Releases](https://github.com/jmagar/noxa/releases) for mac
### Cargo (from source)
```bash
cargo install --git https://github.com/jmagar/noxa.git noxa
cargo install --git https://github.com/jmagar/noxa.git noxa-cli --bin noxa
cargo install --git https://github.com/jmagar/noxa.git noxa-mcp
```
@ -159,6 +159,271 @@ Crawling... 50/50 pages extracted
---
## Examples
### Basic Extraction
```bash
# Extract as markdown (default)
noxa https://example.com
# Multiple output formats
noxa https://example.com -f markdown # Clean markdown
noxa https://example.com -f json # Full structured JSON
noxa https://example.com -f text # Plain text (no formatting)
noxa https://example.com -f llm # Token-optimized for LLMs (67% fewer tokens)
# Bare domains work (auto-prepends https://)
noxa example.com
```
### Content Filtering
```bash
# Only extract main content (skip nav, sidebar, footer)
noxa https://docs.rs/tokio --only-main-content
# Include specific CSS selectors
noxa https://news.ycombinator.com --include ".titleline,.score"
# Exclude specific elements
noxa https://example.com --exclude "nav,footer,.ads,.sidebar"
# Combine both
noxa https://docs.rs/reqwest --only-main-content --exclude ".sidebar"
```
### Brand Identity Extraction
```bash
# Extract colors, fonts, logos from any website
noxa --brand https://stripe.com
# Output: { "name": "Stripe", "colors": [...], "fonts": ["Sohne"], "logos": [...] }
noxa --brand https://github.com
# Output: { "name": "GitHub", "colors": [{"hex": "#1F2328", ...}], "fonts": ["Mona Sans"], ... }
noxa --brand wikipedia.org
# Output: 10 colors, 5 fonts, favicon, logo URL
```
### Sitemap Discovery
```bash
# Discover all URLs from a site's sitemaps
noxa --map https://sitemaps.org
# Output: one URL per line (84 URLs found)
# JSON output with metadata
noxa --map https://sitemaps.org -f json
# Output: [{ "url": "...", "last_modified": "...", "priority": 0.8 }]
```
### Recursive Crawling
```bash
# Crawl a site (default: depth 1, max 20 pages)
noxa --crawl https://example.com
# Control depth and page limit
noxa --crawl --depth 2 --max-pages 50 https://docs.rs/tokio
# Crawl with sitemap seeding (finds more pages)
noxa --crawl --sitemap --depth 2 https://docs.rs/tokio
# Filter crawl paths
noxa --crawl --include-paths "/api/*,/guide/*" https://docs.example.com
noxa --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com
# Control concurrency and delay
noxa --crawl --concurrency 10 --delay 200 https://example.com
```
### Change Detection (Diff)
```bash
# Step 1: Save a snapshot
noxa https://example.com -f json > snapshot.json
# Step 2: Later, compare against the snapshot
noxa --diff-with snapshot.json https://example.com
# Output:
# Status: Same
# Word count delta: +0
# If the page changed:
# Status: Changed
# Word count delta: +42
# --- old
# +++ new
# @@ -1,3 +1,3 @@
# -Old content here
# +New content here
```
### PDF Extraction
```bash
# PDF URLs are auto-detected via Content-Type
noxa https://example.com/report.pdf
# Control PDF mode
noxa --pdf-mode auto https://example.com/report.pdf # Error on empty (catches scanned PDFs)
noxa --pdf-mode fast https://example.com/report.pdf # Return whatever text is found
```
### Batch Processing
```bash
# Multiple URLs in one command
noxa https://example.com https://httpbin.org/html https://rust-lang.org
# URLs from a file (one per line, # comments supported)
noxa --urls-file urls.txt
# Batch with JSON output
noxa --urls-file urls.txt -f json
# Proxy rotation for large batches
noxa --urls-file urls.txt --proxy-file proxies.txt --concurrency 10
```
### Local Files & Stdin
```bash
# Extract from a local HTML file
noxa --file page.html
# Pipe HTML from another command
curl -s https://example.com | noxa --stdin
# Chain with other tools
noxa https://example.com -f text | wc -w # Word count
noxa https://example.com -f json | jq '.metadata.title' # Extract title with jq
```
### Browser Impersonation
```bash
# Chrome (default) — latest Chrome TLS fingerprint
noxa https://example.com
# Firefox fingerprint
noxa --browser firefox https://example.com
# Random browser per request (good for batch)
noxa --browser random --urls-file urls.txt
```
### Custom Headers & Cookies
```bash
# Custom headers
noxa -H "Authorization: Bearer token123" https://api.example.com
noxa -H "Accept-Language: de-DE" https://example.com
# Cookies
noxa --cookie "session=abc123; theme=dark" https://example.com
# Multiple headers
noxa -H "X-Custom: value" -H "Authorization: Bearer token" https://example.com
```
### LLM-Powered Features
These require an LLM provider (Gemini CLI on PATH, local Ollama, or an OpenAI/Anthropic API key).
```bash
# Summarize a page (default: 3 sentences)
noxa --summarize https://example.com
# Control summary length
noxa --summarize 5 https://example.com
# Extract structured JSON with a schema
noxa --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product
# Extract with a schema from file
noxa --extract-json @schema.json https://example.com/product
# Extract with natural language prompt
noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing
# Use a specific LLM provider
noxa --llm-provider ollama --summarize https://example.com
noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..." https://example.com
noxa --llm-provider anthropic --summarize https://example.com
```
### Raw HTML Output
```bash
# Get the raw fetched HTML (no extraction)
noxa --raw-html https://example.com
# Useful for debugging extraction issues
noxa --raw-html https://example.com > raw.html
noxa --file raw.html # Then extract locally
```
### Metadata & Verbose Mode
```bash
# Include YAML frontmatter with metadata
noxa --metadata https://example.com
# Output:
# ---
# title: "Example Domain"
# source: "https://example.com"
# word_count: 20
# ---
# # Example Domain
# ...
# Verbose logging (debug extraction pipeline)
noxa -v https://example.com
```
### Proxy Usage
```bash
# Single proxy
noxa --proxy http://user:pass@proxy.example.com:8080 https://example.com
# SOCKS5 proxy
noxa --proxy socks5://proxy.example.com:1080 https://example.com
# Proxy rotation from file (one per line: host:port:user:pass)
noxa --proxy-file proxies.txt https://example.com
# Auto-load proxies.txt from current directory
echo "proxy1.com:8080:user:pass" > proxies.txt
noxa https://example.com # Automatically detects and uses proxies.txt
```
### Real-World Recipes
```bash
# Monitor competitor pricing — save today's pricing
noxa --extract-json '{"type":"array","items":{"type":"object","properties":{"plan":{"type":"string"},"price":{"type":"string"}}}}' \
https://competitor.com/pricing -f json > pricing-$(date +%Y%m%d).json
# Build a documentation search index
noxa --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt
# Extract all images from a page
noxa https://example.com -f json | jq -r '.content.images[].src'
# Get all external links
noxa https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href'
# Compare two pages
noxa https://site-a.com -f json > a.json
noxa https://site-b.com --diff-with a.json
```
---
## MCP Server — 10 tools for AI agents
<a href="https://glama.ai/mcp/servers/jmagar/noxa"><img src="https://glama.ai/mcp/servers/jmagar/noxa/badge" alt="noxa MCP server" /></a>
@ -327,6 +592,31 @@ noxa/
## Configuration
Non-secret defaults live in `config.json` in your working directory. Copy the example:
```bash
cp config.example.json config.json
```
**Precedence:** CLI flags > `config.json` > built-in defaults
**Secrets and URLs** (API keys, proxy, webhook, LLM base URL) always go in `.env`, not `config.json`:
```bash
cp env.example .env
```
**Override config path** for a single run:
```bash
NOXA_CONFIG=/path/to/other-config.json noxa https://example.com
NOXA_CONFIG=/dev/null noxa https://example.com # bypass config entirely
```
**Bool flag limitation:** flags like `--metadata`, `--only-main-content`, `--verbose` set to `true` in `config.json` cannot be overridden to `false` from the CLI for a single run (clap has no `--no-flag` variant). Use `NOXA_CONFIG=/dev/null` to bypass.
### Environment variables
| Variable | Description |
|----------|-------------|
| `NOXA_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) |

34
config.example.json Normal file
View file

@ -0,0 +1,34 @@
{
"_doc": [
"Copy to config.json and remove fields you don't need.",
"Secrets (api_key, proxy, webhook, llm_base_url) go in .env — NOT here.",
"BOOL FLAG LIMITATION: once set to true here, cannot be overridden to false",
"from the CLI for a single run (no --no-flag support). Use NOXA_CONFIG=/dev/null",
"on the command line to bypass this config entirely.",
"on_change is intentionally absent — it must remain a CLI-only flag.",
"Unknown fields are silently ignored, so this file works across noxa versions."
],
"format": "markdown",
"browser": "chrome",
"timeout": 30,
"pdf_mode": "auto",
"metadata": false,
"verbose": false,
"only_main_content": false,
"include_selectors": [],
"exclude_selectors": ["nav", "footer", ".sidebar", ".cookie-banner"],
"depth": 1,
"max_pages": 20,
"concurrency": 5,
"delay": 100,
"path_prefix": null,
"include_paths": [],
"exclude_paths": ["/changelog/*", "/blog/*", "/releases/*"],
"use_sitemap": false,
"llm_provider": "gemini",
"llm_model": "gemini-2.5-pro"
}

View file

@ -14,9 +14,11 @@ noxa-core = { workspace = true }
noxa-fetch = { workspace = true }
noxa-llm = { workspace = true }
noxa-pdf = { workspace = true }
noxa-mcp = { workspace = true }
dotenvy = { workspace = true }
rand = "0.8"
serde_json = { workspace = true }
serde = { workspace = true }
tokio = { workspace = true }
clap = { workspace = true }
tracing = { workspace = true }

View file

@ -0,0 +1,315 @@
use serde::Deserialize;
use std::path::Path;
use crate::{Browser, OutputFormat, PdfModeArg};
/// Non-secret, non-URL configuration defaults loaded from config.json.
/// All fields optional — absent means "use the hard default".
/// Unknown fields are silently ignored (serde default) so config files
/// written for a newer version of noxa work on older binaries.
///
/// DELIBERATELY EXCLUDED:
/// - on_change: passes content to sh -c; must remain CLI-only to prevent
///   shell injection via config file writes.
/// - Secrets/URLs (api_key, proxy, webhook, llm_base_url): stay in .env.
///
/// BOOL FLAG LIMITATION:
/// only_main_content, metadata, verbose, use_sitemap set to true here
/// cannot be overridden to false from the CLI for a single run (no --no-flag
/// variant in clap). Edit config.json or use NOXA_CONFIG=/dev/null to bypass.
#[derive(Debug, Default, Deserialize)]
pub struct NoxaConfig {
    // Output
    pub format: Option<OutputFormat>,   // default output format (markdown/json/text/llm/html)
    pub metadata: Option<bool>,         // emit YAML frontmatter with page metadata
    pub verbose: Option<bool>,          // debug logging for the extraction pipeline
    // Fetch
    pub browser: Option<Browser>,       // TLS fingerprint to impersonate
    pub timeout: Option<u64>,           // request timeout in seconds
    pub pdf_mode: Option<PdfModeArg>,   // auto (error on empty text) vs fast
    pub only_main_content: Option<bool>, // drop nav/sidebar/footer during extraction
    // CSS selectors
    pub include_selectors: Option<Vec<String>>, // keep only these selectors
    pub exclude_selectors: Option<Vec<String>>, // strip these selectors
    // Crawl
    pub depth: Option<usize>,           // recursion depth for --crawl
    pub max_pages: Option<usize>,       // hard cap on pages fetched per crawl
    pub concurrency: Option<usize>,     // parallel in-flight requests
    pub delay: Option<u64>,             // per-request delay in milliseconds
    pub path_prefix: Option<String>,    // restrict crawl to URLs under this path
    pub include_paths: Option<Vec<String>>, // glob-style path allowlist
    pub exclude_paths: Option<Vec<String>>, // glob-style path denylist
    pub use_sitemap: Option<bool>,      // seed the crawl from sitemap.xml
    // LLM (non-secret: provider name and model only; base URL stays in .env)
    pub llm_provider: Option<String>,
    pub llm_model: Option<String>,
}
impl NoxaConfig {
    /// Load config from an explicit path, the NOXA_CONFIG env var, or ./config.json.
    ///
    /// Path precedence: `explicit_path` argument > `NOXA_CONFIG` > `./config.json`.
    /// Returns an empty (all-None) config if the implicit default file doesn't exist.
    /// Prints an error and exits(1) if an explicitly requested file is missing, or
    /// if any chosen file exists but cannot be read or is not valid JSON.
    pub fn load(explicit_path: Option<&str>) -> Self {
        let noxa_config_env = std::env::var("NOXA_CONFIG").ok();
        // A path the user asked for (flag or env var) that is missing is an error;
        // the implicit ./config.json default is allowed to be absent.
        let was_explicit = explicit_path.is_some() || noxa_config_env.is_some();
        let path_str = explicit_path
            .map(String::from)
            .or(noxa_config_env)
            .unwrap_or_else(|| "config.json".to_string());
        let path = Path::new(&path_str);
        // Short name for user-facing messages, computed once for all branches
        // (falls back to the full string if the path has no printable file name).
        let display_name = path
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or(&path_str);

        if !path.exists() {
            if was_explicit {
                eprintln!("error: config file not found: {display_name}");
                std::process::exit(1);
            }
            return Self::default();
        }

        eprintln!(
            "noxa: config loaded from {display_name} \
             (API keys and secrets belong in .env, not config.json)"
        );
        tracing::debug!("config path: {}", path.display());

        let content = match std::fs::read_to_string(path) {
            Ok(s) => s,
            Err(e) => {
                eprintln!("error: cannot read config file {display_name}: {e}");
                std::process::exit(1);
            }
        };
        match serde_json::from_str(&content) {
            Ok(cfg) => cfg,
            Err(e) => {
                eprintln!("error: invalid JSON in config file {display_name}: {e}");
                std::process::exit(1);
            }
        }
    }
}
/// Fully resolved configuration after merging CLI flags > config file > hard defaults.
/// All fields are concrete — no Option<T>. This is what the rest of main.rs reads.
///
/// The merge uses clap's ValueSource to detect which fields were explicitly set on
/// the command line. CLI-explicit values always win. Config fills in the rest.
/// Hard defaults are the fallback of last resort.
pub struct ResolvedConfig {
    // Output
    pub format: OutputFormat,
    pub metadata: bool,
    pub verbose: bool,
    // Fetch
    pub browser: Browser,
    pub timeout: u64,    // seconds
    pub pdf_mode: PdfModeArg,
    pub only_main_content: bool,
    /// CLI-only output flag — not configurable via config.json (it is a per-run mode, not a persistent default).
    pub raw_html: bool,
    // CSS selectors
    /// Vec<String> — CSS selectors passed directly to extraction filter.
    pub include_selectors: Vec<String>,
    /// Vec<String> — CSS selectors passed directly to extraction filter.
    pub exclude_selectors: Vec<String>,
    // Crawl
    pub depth: usize,
    pub max_pages: usize,
    pub concurrency: usize,
    pub delay: u64,      // milliseconds between requests
    /// None means "no prefix restriction" — intentionally still optional.
    pub path_prefix: Option<String>,
    /// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
    pub include_paths: Vec<String>,
    /// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
    pub exclude_paths: Vec<String>,
    pub use_sitemap: bool,
    // LLM — None means "let the provider chain pick"; kept optional on purpose.
    pub llm_provider: Option<String>,
    pub llm_model: Option<String>,
}
use clap::parser::ValueSource;

/// Split a comma-separated CLI value into trimmed parts; `None` yields an empty Vec.
/// Shared by the four list-valued flags (include/exclude paths and selectors) so the
/// split-and-trim policy is defined in exactly one place.
fn split_csv(raw: Option<&str>) -> Vec<String> {
    raw.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
        .unwrap_or_default()
}

/// Merge CLI flags (detected via ValueSource), config file, and hard defaults
/// into a single ResolvedConfig. CLI explicit values always win.
///
/// Bool flags are OR-ed (CLI true wins; config true cannot be un-set per run —
/// see the BOOL FLAG LIMITATION note on NoxaConfig).
pub fn resolve(
    cli: &crate::Cli,
    matches: &clap::ArgMatches,
    cfg: &NoxaConfig,
) -> ResolvedConfig {
    // True only when the user typed the flag on this invocation (not a clap
    // default and not an env-var value), so config.json can fill the rest.
    let explicit = |name: &str| {
        matches.value_source(name) == Some(ValueSource::CommandLine)
    };
    ResolvedConfig {
        format: if explicit("format") {
            cli.format.clone()
        } else {
            cfg.format.clone().unwrap_or(crate::OutputFormat::Markdown)
        },
        browser: if explicit("browser") {
            cli.browser.clone()
        } else {
            cfg.browser.clone().unwrap_or(crate::Browser::Chrome)
        },
        pdf_mode: if explicit("pdf_mode") {
            cli.pdf_mode.clone()
        } else {
            cfg.pdf_mode.clone().unwrap_or(crate::PdfModeArg::Auto)
        },
        timeout: if explicit("timeout") {
            cli.timeout
        } else {
            cfg.timeout.unwrap_or(30)
        },
        depth: if explicit("depth") {
            cli.depth
        } else {
            cfg.depth.unwrap_or(1)
        },
        max_pages: if explicit("max_pages") {
            cli.max_pages
        } else {
            cfg.max_pages.unwrap_or(20)
        },
        concurrency: if explicit("concurrency") {
            cli.concurrency
        } else {
            cfg.concurrency.unwrap_or(5)
        },
        delay: if explicit("delay") {
            cli.delay
        } else {
            cfg.delay.unwrap_or(100)
        },
        path_prefix: if explicit("path_prefix") {
            cli.path_prefix.clone()
        } else {
            cfg.path_prefix.clone()
        },
        include_paths: if explicit("include_paths") {
            split_csv(cli.include_paths.as_deref())
        } else {
            cfg.include_paths.clone().unwrap_or_default()
        },
        exclude_paths: if explicit("exclude_paths") {
            split_csv(cli.exclude_paths.as_deref())
        } else {
            cfg.exclude_paths.clone().unwrap_or_default()
        },
        include_selectors: if explicit("include") {
            split_csv(cli.include.as_deref())
        } else {
            cfg.include_selectors.clone().unwrap_or_default()
        },
        exclude_selectors: if explicit("exclude") {
            split_csv(cli.exclude.as_deref())
        } else {
            cfg.exclude_selectors.clone().unwrap_or_default()
        },
        // Bool flags: CLI presence (true) always wins; config supplies the default.
        only_main_content: cli.only_main_content || cfg.only_main_content.unwrap_or(false),
        metadata: cli.metadata || cfg.metadata.unwrap_or(false),
        verbose: cli.verbose || cfg.verbose.unwrap_or(false),
        use_sitemap: cli.sitemap || cfg.use_sitemap.unwrap_or(false),
        // Deliberately CLI-only; no config fallback.
        raw_html: cli.raw_html,
        llm_provider: if cli.llm_provider.is_some() {
            cli.llm_provider.clone()
        } else {
            cfg.llm_provider.clone()
        },
        llm_model: if cli.llm_model.is_some() {
            cli.llm_model.clone()
        } else {
            cfg.llm_model.clone()
        },
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A config file exercising every supported field must round-trip through serde.
    #[test]
    fn test_noxa_config_deserialize_full() {
        let json = r#"{
            "format": "llm",
            "depth": 3,
            "max_pages": 100,
            "concurrency": 10,
            "delay": 200,
            "browser": "firefox",
            "timeout": 60,
            "only_main_content": true,
            "use_sitemap": true,
            "path_prefix": "/docs/",
            "include_paths": ["/docs/*", "/api/*"],
            "exclude_paths": ["/changelog/*", "/blog/*"],
            "include_selectors": ["article", ".content"],
            "exclude_selectors": ["nav", "footer"],
            "llm_provider": "gemini",
            "llm_model": "gemini-2.5-pro",
            "pdf_mode": "fast",
            "metadata": true,
            "verbose": false
        }"#;
        let parsed: NoxaConfig = serde_json::from_str(json).unwrap();
        assert_eq!(parsed.depth, Some(3));
        let expected_excludes = vec!["/changelog/*".to_string(), "/blog/*".to_string()];
        assert_eq!(parsed.exclude_paths, Some(expected_excludes));
        assert!(matches!(parsed.format, Some(crate::OutputFormat::Llm)));
        assert!(matches!(parsed.pdf_mode, Some(crate::PdfModeArg::Fast)));
    }

    /// An empty object is valid and leaves every field unset.
    #[test]
    fn test_noxa_config_empty() {
        let parsed: NoxaConfig = serde_json::from_str("{}").unwrap();
        assert!(parsed.depth.is_none());
        assert!(parsed.format.is_none());
    }

    /// Fields from future noxa versions must not break parsing.
    #[test]
    fn test_noxa_config_unknown_fields_ignored() {
        let raw = r#"{"depth": 2, "future_field": true}"#;
        let parsed: NoxaConfig = serde_json::from_str(raw).unwrap();
        assert_eq!(parsed.depth, Some(2));
    }

    /// With no explicit path and no ./config.json, load() quietly yields defaults.
    #[test]
    fn test_load_implicit_missing_file_returns_default() {
        // Skip rather than flake if the test CWD happens to contain a config.json.
        let cwd_has_config = std::path::Path::new("config.json").exists();
        if cwd_has_config {
            return;
        }
        let loaded = NoxaConfig::load(None);
        assert!(loaded.format.is_none());
    }
}

View file

@ -2,6 +2,7 @@
/// CLI entry point -- wires noxa-core and noxa-fetch into a single command.
/// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
mod cloud;
mod config;
use std::io::{self, Read as _};
use std::path::{Path, PathBuf};
@ -9,8 +10,7 @@ use std::process;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use clap::{Parser, ValueEnum};
use tracing_subscriber::EnvFilter;
use clap::{CommandFactory, FromArgMatches, Parser, ValueEnum};
use noxa_core::{
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
to_llm_text,
@ -20,7 +20,10 @@ use noxa_fetch::{
FetchConfig, FetchResult, PageResult, SitemapEntry,
};
use noxa_llm::LlmProvider;
use noxa_mcp;
use noxa_pdf::PdfMode;
use serde::Deserialize;
use tracing_subscriber::EnvFilter;
/// Known anti-bot challenge page titles (case-insensitive prefix match).
const ANTIBOT_TITLES: &[&str] = &[
@ -87,6 +90,10 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
#[derive(Parser)]
#[command(name = "noxa", about = "Extract web content for LLMs", version)]
struct Cli {
/// Path to config.json (default: ./config.json, override with NOXA_CONFIG env var)
#[arg(long, global = true)]
config: Option<String>,
/// URLs to fetch (multiple allowed)
#[arg()]
urls: Vec<String>,
@ -247,7 +254,7 @@ struct Cli {
#[arg(long, num_args = 0..=1, default_missing_value = "3")]
summarize: Option<usize>,
/// Force a specific LLM provider (ollama, openai, anthropic)
/// Force a specific LLM provider (gemini, ollama, openai, anthropic)
#[arg(long, env = "NOXA_LLM_PROVIDER")]
llm_provider: Option<String>,
@ -284,7 +291,8 @@ struct Cli {
output_dir: Option<PathBuf>,
}
#[derive(Clone, ValueEnum)]
#[derive(Clone, Debug, ValueEnum, Deserialize)]
#[serde(rename_all = "lowercase")]
enum OutputFormat {
Markdown,
Json,
@ -293,14 +301,16 @@ enum OutputFormat {
Html,
}
#[derive(Clone, ValueEnum)]
#[derive(Clone, Debug, ValueEnum, Deserialize)]
#[serde(rename_all = "lowercase")]
enum Browser {
Chrome,
Firefox,
Random,
}
#[derive(Clone, ValueEnum, Default)]
#[derive(Clone, Debug, ValueEnum, Default, Deserialize)]
#[serde(rename_all = "lowercase")]
enum PdfModeArg {
/// Error if PDF has no extractable text (catches scanned PDFs)
#[default]
@ -338,12 +348,21 @@ fn init_logging(verbose: bool) {
tracing_subscriber::fmt().with_env_filter(filter).init();
}
/// Initialize tracing for MCP mode.
///
/// Log output goes to stderr with ANSI colors disabled, and the filter is
/// taken from the default environment (`RUST_LOG`). Uses `try_init` so that
/// an already-installed global subscriber is tolerated instead of panicking.
fn init_mcp_logging() {
    let filter = tracing_subscriber::EnvFilter::from_default_env();
    let builder = tracing_subscriber::fmt()
        .with_env_filter(filter)
        .with_writer(std::io::stderr)
        .with_ansi(false);
    // Discard the error: init may race with another subscriber installation.
    builder.try_init().ok();
}
/// Build FetchConfig from CLI flags.
///
/// `--proxy` sets a single static proxy (no rotation).
/// `--proxy-file` loads a pool of proxies and rotates per-request.
/// `--proxy` takes priority: if both are set, only the single proxy is used.
fn build_fetch_config(cli: &Cli) -> FetchConfig {
fn build_fetch_config(cli: &Cli, resolved: &config::ResolvedConfig) -> FetchConfig {
let (proxy, proxy_pool) = if cli.proxy.is_some() {
(cli.proxy.clone(), Vec::new())
} else if let Some(ref path) = cli.proxy_file {
@ -403,11 +422,11 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
}
FetchConfig {
browser: cli.browser.clone().into(),
browser: resolved.browser.clone().into(),
proxy,
proxy_pool,
timeout: std::time::Duration::from_secs(cli.timeout),
pdf_mode: cli.pdf_mode.clone().into(),
timeout: std::time::Duration::from_secs(resolved.timeout),
pdf_mode: resolved.pdf_mode.clone().into(),
headers,
..Default::default()
}
@ -436,20 +455,12 @@ fn parse_cookie_file(path: &str) -> Result<String, String> {
Ok(pairs.join("; "))
}
fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
fn build_extraction_options(resolved: &config::ResolvedConfig) -> ExtractionOptions {
ExtractionOptions {
include_selectors: cli
.include
.as_deref()
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
.unwrap_or_default(),
exclude_selectors: cli
.exclude
.as_deref()
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
.unwrap_or_default(),
only_main_content: cli.only_main_content,
include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html),
include_selectors: resolved.include_selectors.clone(),
exclude_selectors: resolved.exclude_selectors.clone(),
only_main_content: resolved.only_main_content,
include_raw_html: resolved.raw_html || matches!(resolved.format, OutputFormat::Html),
}
}
@ -618,14 +629,17 @@ impl FetchOutput {
/// Fetch a URL and extract content, handling PDF detection automatically.
/// Falls back to cloud API when bot protection or JS rendering is detected.
async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
async fn fetch_and_extract(
cli: &Cli,
resolved: &config::ResolvedConfig,
) -> Result<FetchOutput, String> {
// Local sources: read and extract as HTML
if cli.stdin {
let mut buf = String::new();
io::stdin()
.read_to_string(&mut buf)
.map_err(|e| format!("failed to read stdin: {e}"))?;
let options = build_extraction_options(cli);
let options = build_extraction_options(resolved);
return extract_with_options(&buf, None, &options)
.map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}"));
@ -634,7 +648,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
if let Some(ref path) = cli.file {
let html =
std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
let options = build_extraction_options(cli);
let options = build_extraction_options(resolved);
return extract_with_options(&html, None, &options)
.map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}"));
@ -651,10 +665,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
// --cloud: skip local, go straight to cloud API
if cli.cloud {
let c =
cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?;
let options = build_extraction_options(cli);
let format_str = match cli.format {
let c = cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?;
let options = build_extraction_options(resolved);
let format_str = match resolved.format {
OutputFormat::Markdown => "markdown",
OutputFormat::Json => "json",
OutputFormat::Text => "text",
@ -674,9 +687,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
}
// Normal path: try local first
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli);
let client = FetchClient::new(build_fetch_config(cli, resolved))
.map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(resolved);
let result = client
.fetch_and_extract_with_options(url, &options)
.await
@ -687,7 +700,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
if !matches!(reason, EmptyReason::None) {
if let Some(ref c) = cloud_client {
eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API...");
let format_str = match cli.format {
let format_str = match resolved.format {
OutputFormat::Markdown => "markdown",
OutputFormat::Json => "json",
OutputFormat::Text => "text",
@ -718,7 +731,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
}
/// Fetch raw HTML from a URL (no extraction). Used for --raw-html and brand extraction.
async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> {
async fn fetch_html(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<FetchResult, String> {
if cli.stdin {
let mut buf = String::new();
io::stdin()
@ -751,8 +764,8 @@ async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> {
.ok_or("no input provided -- pass a URL, --file, or --stdin")?;
let url = normalize_url(raw_url);
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let client = FetchClient::new(build_fetch_config(cli, resolved))
.map_err(|e| format!("client error: {e}"))?;
client
.fetch(&url)
.await
@ -1166,7 +1179,7 @@ fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String
)
}
async fn run_crawl(cli: &Cli) -> Result<(), String> {
async fn run_crawl(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
let url = cli
.urls
.first()
@ -1178,16 +1191,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
return Err("--crawl cannot be used with --file or --stdin".into());
}
let include_patterns: Vec<String> = cli
.include_paths
.as_deref()
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
.unwrap_or_default();
let exclude_patterns: Vec<String> = cli
.exclude_paths
.as_deref()
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
.unwrap_or_default();
let include_patterns = resolved.include_paths.clone();
let exclude_patterns = resolved.exclude_paths.clone();
// Set up streaming progress channel
let (progress_tx, mut progress_rx) = tokio::sync::broadcast::channel::<PageResult>(100);
@ -1207,13 +1212,13 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
}
let config = CrawlConfig {
fetch: build_fetch_config(cli),
max_depth: cli.depth,
max_pages: cli.max_pages,
concurrency: cli.concurrency,
delay: std::time::Duration::from_millis(cli.delay),
path_prefix: cli.path_prefix.clone(),
use_sitemap: cli.sitemap,
fetch: build_fetch_config(cli, resolved),
max_depth: resolved.depth,
max_pages: resolved.max_pages,
concurrency: resolved.concurrency,
delay: std::time::Duration::from_millis(resolved.delay),
path_prefix: resolved.path_prefix.clone(),
use_sitemap: resolved.use_sitemap,
include_patterns,
exclude_patterns,
progress_tx: Some(progress_tx),
@ -1232,7 +1237,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
);
});
let max_pages = cli.max_pages;
let max_pages = resolved.max_pages;
let completed_offset = resume_state.as_ref().map_or(0, |s| s.completed_pages);
// Spawn background task to print streaming progress to stderr
@ -1261,8 +1266,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
&result.visited,
&result.remaining_frontier,
completed_offset + result.pages.len(),
cli.max_pages,
cli.depth,
resolved.max_pages,
resolved.depth,
)?;
eprintln!(
"Crawl state saved to {} ({} pages completed). Resume with --crawl-state {}",
@ -1294,15 +1299,15 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
let mut saved = 0usize;
for page in &result.pages {
if let Some(ref extraction) = page.extraction {
let filename = url_to_filename(&page.url, &cli.format);
let content = format_output(extraction, &cli.format, cli.metadata);
let filename = url_to_filename(&page.url, &resolved.format);
let content = format_output(extraction, &resolved.format, resolved.metadata);
write_to_file(dir, &filename, &content)?;
saved += 1;
}
}
eprintln!("Saved {saved} files to {}", dir.display());
} else {
print_crawl_output(&result, &cli.format, cli.metadata);
print_crawl_output(&result, &resolved.format, resolved.metadata);
}
eprintln!(
@ -1338,7 +1343,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
}
}
async fn run_map(cli: &Cli) -> Result<(), String> {
async fn run_map(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
let url = cli
.urls
.first()
@ -1346,8 +1351,8 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
.map(|u| normalize_url(u))?;
let url = url.as_str();
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let client = FetchClient::new(build_fetch_config(cli, resolved))
.map_err(|e| format!("client error: {e}"))?;
let entries = noxa_fetch::sitemap::discover(&client, url)
.await
@ -1359,19 +1364,24 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
eprintln!("discovered {} URLs", entries.len());
}
print_map_output(&entries, &cli.format);
print_map_output(&entries, &resolved.format);
Ok(())
}
async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
async fn run_batch(
cli: &Cli,
resolved: &config::ResolvedConfig,
entries: &[(String, Option<String>)],
) -> Result<(), String> {
let client = Arc::new(
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
FetchClient::new(build_fetch_config(cli, resolved))
.map_err(|e| format!("client error: {e}"))?,
);
let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect();
let options = build_extraction_options(cli);
let options = build_extraction_options(resolved);
let results = client
.fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options)
.fetch_and_extract_batch_with_options(&urls, resolved.concurrency, &options)
.await;
let ok = results.iter().filter(|r| r.result.is_ok()).count();
@ -1402,15 +1412,15 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
let filename = custom_names
.get(r.url.as_str())
.map(|s| s.to_string())
.unwrap_or_else(|| url_to_filename(&r.url, &cli.format));
let content = format_output(extraction, &cli.format, cli.metadata);
.unwrap_or_else(|| url_to_filename(&r.url, &resolved.format));
let content = format_output(extraction, &resolved.format, resolved.metadata);
write_to_file(dir, &filename, &content)?;
saved += 1;
}
}
eprintln!("Saved {saved} files to {}", dir.display());
} else {
print_batch_output(&results, &cli.format, cli.metadata);
print_batch_output(&results, &resolved.format, resolved.metadata);
}
eprintln!(
@ -1514,15 +1524,20 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
});
}
async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
async fn run_watch(
cli: &Cli,
resolved: &config::ResolvedConfig,
urls: &[String],
) -> Result<(), String> {
if urls.is_empty() {
return Err("--watch requires at least one URL".into());
}
let client = Arc::new(
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
FetchClient::new(build_fetch_config(cli, resolved))
.map_err(|e| format!("client error: {e}"))?,
);
let options = build_extraction_options(cli);
let options = build_extraction_options(resolved);
// Ctrl+C handler
let cancelled = Arc::new(AtomicBool::new(false));
@ -1534,16 +1549,17 @@ async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
// Single-URL mode: preserve original behavior exactly
if urls.len() == 1 {
return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await;
return run_watch_single(cli, resolved, &client, &options, &urls[0], &cancelled).await;
}
// Multi-URL mode: batch fetch, diff each, report aggregate
run_watch_multi(cli, &client, &options, urls, &cancelled).await
run_watch_multi(cli, resolved, &client, &options, urls, &cancelled).await
}
/// Original single-URL watch loop -- backward compatible.
async fn run_watch_single(
cli: &Cli,
resolved: &config::ResolvedConfig,
client: &Arc<FetchClient>,
options: &ExtractionOptions,
url: &str,
@ -1580,7 +1596,7 @@ async fn run_watch_single(
if diff.status == ChangeStatus::Same {
eprintln!("[watch] No changes ({})", timestamp());
} else {
print_diff_output(&diff, &cli.format);
print_diff_output(&diff, &resolved.format);
eprintln!("[watch] Changes detected! ({})", timestamp());
if let Some(ref cmd) = cli.on_change {
@ -1627,6 +1643,7 @@ async fn run_watch_single(
/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
async fn run_watch_multi(
cli: &Cli,
resolved: &config::ResolvedConfig,
client: &Arc<FetchClient>,
options: &ExtractionOptions,
urls: &[String],
@ -1636,7 +1653,7 @@ async fn run_watch_multi(
// Initial pass: fetch all URLs in parallel
let initial_results = client
.fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
.fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options)
.await;
let mut snapshots = std::collections::HashMap::new();
@ -1676,7 +1693,7 @@ async fn run_watch_multi(
check_number += 1;
let current_results = client
.fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
.fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options)
.await;
let mut changed: Vec<serde_json::Value> = Vec::new();
@ -1780,7 +1797,11 @@ async fn run_watch_multi(
Ok(())
}
async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
async fn run_diff(
cli: &Cli,
resolved: &config::ResolvedConfig,
snapshot_path: &str,
) -> Result<(), String> {
// Load previous snapshot
let snapshot_json = std::fs::read_to_string(snapshot_path)
.map_err(|e| format!("failed to read snapshot {snapshot_path}: {e}"))?;
@ -1788,16 +1809,16 @@ async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
.map_err(|e| format!("failed to parse snapshot JSON: {e}"))?;
// Extract current version (handles PDF detection for URLs)
let new_result = fetch_and_extract(cli).await?.into_extraction()?;
let new_result = fetch_and_extract(cli, resolved).await?.into_extraction()?;
let diff = noxa_core::diff::diff(&old, &new_result);
print_diff_output(&diff, &cli.format);
print_diff_output(&diff, &resolved.format);
Ok(())
}
async fn run_brand(cli: &Cli) -> Result<(), String> {
let result = fetch_html(cli).await?;
async fn run_brand(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
let result = fetch_html(cli, resolved).await?;
let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await;
let brand = noxa_core::brand::extract_brand(
&enriched,
@ -1811,13 +1832,27 @@ async fn run_brand(cli: &Cli) -> Result<(), String> {
}
/// Build an LLM provider based on CLI flags, or fall back to the default chain.
async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
if let Some(ref name) = cli.llm_provider {
async fn build_llm_provider(
cli: &Cli,
resolved: &config::ResolvedConfig,
) -> Result<Box<dyn LlmProvider>, String> {
if let Some(ref name) = resolved.llm_provider {
match name.as_str() {
"gemini" => {
let provider = noxa_llm::providers::gemini_cli::GeminiCliProvider::new(
resolved.llm_model.clone(),
);
if !provider.is_available().await {
return Err(
"gemini CLI not found on PATH -- install it or omit --llm-provider".into(),
);
}
Ok(Box::new(provider))
}
"ollama" => {
let provider = noxa_llm::providers::ollama::OllamaProvider::new(
cli.llm_base_url.clone(),
cli.llm_model.clone(),
resolved.llm_model.clone(),
);
if !provider.is_available().await {
return Err("ollama is not running or unreachable".into());
@ -1828,7 +1863,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
let provider = noxa_llm::providers::openai::OpenAiProvider::new(
None,
cli.llm_base_url.clone(),
cli.llm_model.clone(),
resolved.llm_model.clone(),
)
.ok_or("OPENAI_API_KEY not set")?;
Ok(Box::new(provider))
@ -1836,20 +1871,20 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
"anthropic" => {
let provider = noxa_llm::providers::anthropic::AnthropicProvider::new(
None,
cli.llm_model.clone(),
resolved.llm_model.clone(),
)
.ok_or("ANTHROPIC_API_KEY not set")?;
Ok(Box::new(provider))
}
other => Err(format!(
"unknown LLM provider: {other} (use ollama, openai, or anthropic)"
"unknown LLM provider: {other} (use gemini, ollama, openai, or anthropic)"
)),
}
} else {
let chain = noxa_llm::ProviderChain::default().await;
if chain.is_empty() {
return Err(
"no LLM providers available -- start Ollama or set OPENAI_API_KEY / ANTHROPIC_API_KEY"
"no LLM providers available -- install the gemini CLI, start Ollama, or set OPENAI_API_KEY / ANTHROPIC_API_KEY"
.into(),
);
}
@ -1857,12 +1892,12 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
}
}
async fn run_llm(cli: &Cli) -> Result<(), String> {
async fn run_llm(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
// Extract content from source first (handles PDF detection for URLs)
let result = fetch_and_extract(cli).await?.into_extraction()?;
let result = fetch_and_extract(cli, resolved).await?.into_extraction()?;
let provider = build_llm_provider(cli).await?;
let model = cli.llm_model.as_deref();
let provider = build_llm_provider(cli, resolved).await?;
let model = resolved.llm_model.as_deref();
if let Some(ref schema_input) = cli.extract_json {
// Support @file syntax for loading schema from file
@ -1876,6 +1911,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
let schema: serde_json::Value =
serde_json::from_str(&schema_str).map_err(|e| format!("invalid JSON schema: {e}"))?;
let t = std::time::Instant::now();
let extracted = noxa_llm::extract::extract_json(
&result.content.plain_text,
&schema,
@ -1884,12 +1920,14 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
)
.await
.map_err(|e| format!("LLM extraction failed: {e}"))?;
eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64());
println!(
"{}",
serde_json::to_string_pretty(&extracted).expect("serialization failed")
);
} else if let Some(ref prompt) = cli.extract_prompt {
let t = std::time::Instant::now();
let extracted = noxa_llm::extract::extract_with_prompt(
&result.content.plain_text,
prompt,
@ -1898,12 +1936,14 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
)
.await
.map_err(|e| format!("LLM extraction failed: {e}"))?;
eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64());
println!(
"{}",
serde_json::to_string_pretty(&extracted).expect("serialization failed")
);
} else if let Some(sentences) = cli.summarize {
let t = std::time::Instant::now();
let summary = noxa_llm::summarize::summarize(
&result.content.plain_text,
Some(sentences),
@ -1912,6 +1952,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
)
.await
.map_err(|e| format!("LLM summarization failed: {e}"))?;
eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64());
println!("{summary}");
}
@ -1921,12 +1962,16 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
/// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results.
/// URLs are processed sequentially to respect LLM provider rate limits.
async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli);
let provider = build_llm_provider(cli).await?;
let model = cli.llm_model.as_deref();
async fn run_batch_llm(
cli: &Cli,
resolved: &config::ResolvedConfig,
entries: &[(String, Option<String>)],
) -> Result<(), String> {
let client = FetchClient::new(build_fetch_config(cli, resolved))
.map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(resolved);
let provider = build_llm_provider(cli, resolved).await?;
let model = resolved.llm_model.as_deref();
// Pre-parse schema once if --extract-json is used
let schema = if let Some(ref schema_input) = cli.extract_json {
@ -1974,6 +2019,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
let text = &extraction.content.plain_text;
// Run the appropriate LLM operation
let llm_start = std::time::Instant::now();
let llm_result = if let Some(ref schema) = schema {
noxa_llm::extract::extract_json(text, schema, provider.as_ref(), model)
.await
@ -1989,6 +2035,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
} else {
unreachable!("run_batch_llm called without LLM flags")
};
let llm_elapsed = llm_start.elapsed();
match llm_result {
Ok(output) => {
@ -2018,7 +2065,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
format!("{words} words")
}
};
eprintln!("-> extracted {detail}");
eprintln!("-> extracted {detail} ({:.1}s)", llm_elapsed.as_secs_f64());
if let Some(ref dir) = cli.output_dir {
let filename = custom_names
@ -2215,12 +2262,29 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
async fn main() {
dotenvy::dotenv().ok();
let cli = Cli::parse();
init_logging(cli.verbose);
if matches!(std::env::args().nth(1).as_deref(), Some("mcp")) {
init_mcp_logging();
if let Err(e) = noxa_mcp::run().await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
// Use low-level API to get both typed Cli and ArgMatches for ValueSource detection.
let matches = Cli::command().get_matches();
let cli = Cli::from_arg_matches(&matches).unwrap_or_else(|e| e.exit());
// Load config BEFORE init_logging so verbose from config takes effect.
let cfg = config::NoxaConfig::load(cli.config.as_deref());
let resolved = config::resolve(&cli, &matches, &cfg);
init_logging(resolved.verbose);
// --map: sitemap discovery mode
if cli.map {
if let Err(e) = run_map(&cli).await {
if let Err(e) = run_map(&cli, &resolved).await {
eprintln!("error: {e}");
process::exit(1);
}
@ -2229,7 +2293,7 @@ async fn main() {
// --crawl: recursive crawl mode
if cli.crawl {
if let Err(e) = run_crawl(&cli).await {
if let Err(e) = run_crawl(&cli, &resolved).await {
eprintln!("error: {e}");
process::exit(1);
}
@ -2245,7 +2309,7 @@ async fn main() {
process::exit(1);
}
};
if let Err(e) = run_watch(&cli, &watch_urls).await {
if let Err(e) = run_watch(&cli, &resolved, &watch_urls).await {
eprintln!("error: {e}");
process::exit(1);
}
@ -2254,7 +2318,7 @@ async fn main() {
// --diff-with: change tracking mode
if let Some(ref snapshot_path) = cli.diff_with {
if let Err(e) = run_diff(&cli, snapshot_path).await {
if let Err(e) = run_diff(&cli, &resolved, snapshot_path).await {
eprintln!("error: {e}");
process::exit(1);
}
@ -2263,7 +2327,7 @@ async fn main() {
// --brand: brand identity extraction mode
if cli.brand {
if let Err(e) = run_brand(&cli).await {
if let Err(e) = run_brand(&cli, &resolved).await {
eprintln!("error: {e}");
process::exit(1);
}
@ -2292,11 +2356,11 @@ async fn main() {
// When multiple URLs are provided, run batch LLM extraction over all of them.
if has_llm_flags(&cli) {
if entries.len() > 1 {
if let Err(e) = run_batch_llm(&cli, &entries).await {
if let Err(e) = run_batch_llm(&cli, &resolved, &entries).await {
eprintln!("error: {e}");
process::exit(1);
}
} else if let Err(e) = run_llm(&cli).await {
} else if let Err(e) = run_llm(&cli, &resolved).await {
eprintln!("error: {e}");
process::exit(1);
}
@ -2305,7 +2369,7 @@ async fn main() {
// Multi-URL batch mode
if entries.len() > 1 {
if let Err(e) = run_batch(&cli, &entries).await {
if let Err(e) = run_batch(&cli, &resolved, &entries).await {
eprintln!("error: {e}");
process::exit(1);
}
@ -2313,8 +2377,11 @@ async fn main() {
}
// --raw-html: skip extraction, dump the fetched HTML
if cli.raw_html && cli.include.is_none() && cli.exclude.is_none() {
match fetch_html(&cli).await {
if resolved.raw_html
&& resolved.include_selectors.is_empty()
&& resolved.exclude_selectors.is_empty()
{
match fetch_html(&cli, &resolved).await {
Ok(r) => println!("{}", r.html),
Err(e) => {
eprintln!("error: {e}");
@ -2325,7 +2392,7 @@ async fn main() {
}
// Single-page extraction (handles both HTML and PDF via content-type detection)
match fetch_and_extract(&cli).await {
match fetch_and_extract(&cli, &resolved).await {
Ok(FetchOutput::Local(result)) => {
if let Some(ref dir) = cli.output_dir {
let url = cli
@ -2334,18 +2401,19 @@ async fn main() {
.map(|u| normalize_url(u))
.unwrap_or_default();
let custom_name = entries.first().and_then(|(_, name)| name.clone());
let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format));
let content = format_output(&result, &cli.format, cli.metadata);
let filename =
custom_name.unwrap_or_else(|| url_to_filename(&url, &resolved.format));
let content = format_output(&result, &resolved.format, resolved.metadata);
if let Err(e) = write_to_file(dir, &filename, &content) {
eprintln!("error: {e}");
process::exit(1);
}
} else {
print_output(&result, &cli.format, cli.metadata);
print_output(&result, &resolved.format, resolved.metadata);
}
}
Ok(FetchOutput::Cloud(resp)) => {
print_cloud_output(&resp, &cli.format);
print_cloud_output(&resp, &resolved.format);
}
Err(e) => {
eprintln!("{e}");
@ -2456,3 +2524,28 @@ mod tests {
let _ = std::fs::remove_dir_all(&dir);
}
}
#[cfg(test)]
mod enum_deserialize_tests {
    use super::*;

    /// `OutputFormat` deserializes from its lowercase serde names.
    #[test]
    fn test_output_format_deserialize() {
        let llm: OutputFormat = serde_json::from_str("\"llm\"").unwrap();
        assert!(matches!(llm, OutputFormat::Llm));
        let markdown: OutputFormat = serde_json::from_str("\"markdown\"").unwrap();
        assert!(matches!(markdown, OutputFormat::Markdown));
    }

    /// `Browser` deserializes from its lowercase serde names.
    #[test]
    fn test_browser_deserialize() {
        let firefox: Browser = serde_json::from_str("\"firefox\"").unwrap();
        assert!(matches!(firefox, Browser::Firefox));
    }

    /// `PdfModeArg` deserializes from its lowercase serde names.
    #[test]
    fn test_pdf_mode_deserialize() {
        let fast: PdfModeArg = serde_json::from_str("\"fast\"").unwrap();
        assert!(matches!(fast, PdfModeArg::Fast));
    }
}

View file

@ -8,6 +8,7 @@ license.workspace = true
[dependencies]
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
async-trait = "0.1"
jsonschema = { version = "0.46", default-features = false }
serde = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }

View file

@ -2,12 +2,15 @@
/// Default order: Gemini CLI -> OpenAI -> Ollama (local, free) -> Anthropic.
/// Only includes providers that are actually configured/available.
use async_trait::async_trait;
use tracing::{debug, warn};
use tracing::{debug, info, warn};
use crate::error::LlmError;
use crate::provider::{CompletionRequest, LlmProvider};
use crate::providers::{
anthropic::AnthropicProvider, ollama::OllamaProvider, openai::OpenAiProvider,
anthropic::AnthropicProvider,
gemini_cli::GeminiCliProvider,
ollama::OllamaProvider,
openai::OpenAiProvider,
};
pub struct ProviderChain {
@ -15,12 +18,26 @@ pub struct ProviderChain {
}
impl ProviderChain {
/// Build the default chain: Ollama -> OpenAI -> Anthropic.
/// Ollama is always added (availability checked at call time).
/// Build the default chain: Gemini CLI -> OpenAI -> Ollama -> Anthropic.
/// Gemini CLI is the primary backend (subprocess-based, requires `gemini` on PATH).
/// Cloud providers are only added if their API keys are configured.
/// Ollama is added if reachable at call time.
pub async fn default() -> Self {
let mut providers: Vec<Box<dyn LlmProvider>> = Vec::new();
let gemini = GeminiCliProvider::new(None);
if gemini.is_available().await {
debug!("gemini cli available, adding as primary provider");
providers.push(Box::new(gemini));
} else {
debug!("gemini cli not found on PATH, skipping");
}
if let Some(openai) = OpenAiProvider::new(None, None, None) {
debug!("openai configured, adding to chain");
providers.push(Box::new(openai));
}
let ollama = OllamaProvider::new(None, None);
if ollama.is_available().await {
debug!("ollama is available, adding to chain");
@ -29,11 +46,6 @@ impl ProviderChain {
debug!("ollama not available, skipping");
}
if let Some(openai) = OpenAiProvider::new(None, None, None) {
debug!("openai configured, adding to chain");
providers.push(Box::new(openai));
}
if let Some(anthropic) = AnthropicProvider::new(None, None) {
debug!("anthropic configured, adding to chain");
providers.push(Box::new(anthropic));
@ -79,9 +91,10 @@ impl LlmProvider for ProviderChain {
for provider in &self.providers {
debug!(provider = provider.name(), "attempting completion");
let t = std::time::Instant::now();
match provider.complete(request).await {
Ok(response) => {
debug!(provider = provider.name(), "completion succeeded");
info!(provider = provider.name(), elapsed_ms = t.elapsed().as_millis(), "completion succeeded");
return Ok(response);
}
Err(e) => {
@ -202,4 +215,46 @@ mod tests {
assert_eq!(chain.len(), 2);
assert!(!chain.is_empty());
}
// ── Gemini-first chain ordering ───────────────────────────────────────────
#[tokio::test]
async fn gemini_first_in_single_provider_chain() {
    // A chain built with a mock "gemini" provider in front must serve the
    // completion from it rather than from any later fallback.
    let gemini = MockProvider {
        name: "gemini",
        response: Ok("from gemini".into()),
        available: true,
    };
    let openai = MockProvider {
        name: "openai",
        response: Ok("from openai".into()),
        available: true,
    };
    let chain = ProviderChain::from_providers(vec![Box::new(gemini), Box::new(openai)]);
    // Ordering sanity check: gemini sits at the head of the chain.
    assert_eq!(chain.providers[0].name(), "gemini");
    let result = chain.complete(&test_request()).await.unwrap();
    assert_eq!(result, "from gemini");
}
#[tokio::test]
async fn gemini_failure_falls_back_to_openai() {
    // A failing head provider must not poison the chain: the next
    // provider's successful response is returned instead.
    let chain = ProviderChain::from_providers(vec![
        Box::new(MockProvider {
            name: "gemini",
            response: Err("subprocess timed out".into()),
            available: true,
        }),
        Box::new(MockProvider {
            name: "openai",
            response: Ok("from openai".into()),
            available: true,
        }),
    ]);
    assert_eq!(chain.complete(&test_request()).await.unwrap(), "from openai");
}
}

View file

@ -4,6 +4,12 @@ pub enum LlmError {
#[error("HTTP error: {0}")]
Http(#[from] reqwest::Error),
#[error("subprocess error: {0}")]
Subprocess(#[from] std::io::Error),
#[error("subprocess timed out")]
Timeout,
#[error("no providers available")]
NoProviders,

View file

@ -1,11 +1,45 @@
/// Schema-based and prompt-based LLM extraction.
/// Both functions build a system prompt, send content to the LLM, and parse JSON back.
use jsonschema;
use crate::clean::strip_thinking_tags;
use crate::error::LlmError;
use crate::provider::{CompletionRequest, LlmProvider, Message};
/// Check `value` against `schema`, reporting every violation in a single
/// `LlmError::InvalidJson` message. A schema that itself fails to compile is
/// also surfaced as `InvalidJson`.
fn validate_schema(
    value: &serde_json::Value,
    schema: &serde_json::Value,
) -> Result<(), LlmError> {
    let compiled = jsonschema::validator_for(schema)
        .map_err(|e| LlmError::InvalidJson(format!("invalid schema: {e}")))?;
    // Gather every violation ("<error> at <path>") rather than bailing on the
    // first, so the caller's error message describes the whole mismatch.
    let problems = compiled
        .iter_errors(value)
        .map(|e| format!("{} at {}", e, e.instance_path()))
        .collect::<Vec<_>>();
    if problems.is_empty() {
        return Ok(());
    }
    Err(LlmError::InvalidJson(format!(
        "schema validation failed: {}",
        problems.join("; ")
    )))
}
/// Extract structured JSON from content using a JSON schema.
/// The schema tells the LLM exactly what fields to extract and their types.
///
/// Retry policy:
/// - If the response cannot be parsed as JSON at all: retry once with the
/// identical request (handles transient formatting issues).
/// - If the response is valid JSON but fails schema validation: return
/// `LlmError::InvalidJson` immediately — the schema is likely unsatisfiable
/// for this content, so retrying would produce the same result.
pub async fn extract_json(
content: &str,
schema: &serde_json::Value,
@ -37,7 +71,22 @@ pub async fn extract_json(
};
let response = provider.complete(&request).await?;
parse_json_response(&response)
match parse_json_response(&response) {
Ok(value) => {
// Valid JSON — now validate against the schema.
// Schema mismatches do not retry (unsatisfiable → same result).
validate_schema(&value, schema)?;
Ok(value)
}
Err(_parse_err) => {
// Unparseable JSON — retry once with the identical request.
let retry_response = provider.complete(&request).await?;
let value = parse_json_response(&retry_response)?;
validate_schema(&value, schema)?;
Ok(value)
}
}
}
/// Extract information using a natural language prompt.
@ -184,4 +233,130 @@ mod tests {
assert_eq!(result["emails"][0], "test@example.com");
}
// ── Schema validation ─────────────────────────────────────────────────────
#[tokio::test]
async fn schema_validation_passes_for_matching_json() {
    // A response that satisfies the schema round-trips unchanged.
    let schema = serde_json::json!({
        "type": "object",
        "required": ["price"],
        "properties": { "price": { "type": "number" } }
    });
    let provider = MockProvider::ok(r#"{"price": 9.99}"#);
    let value = extract_json("content", &schema, &provider, None)
        .await
        .unwrap();
    assert_eq!(value["price"], 9.99);
}
#[tokio::test]
async fn schema_validation_fails_for_wrong_type() {
    let schema = serde_json::json!({
        "type": "object",
        "required": ["price"],
        "properties": { "price": { "type": "number" } }
    });
    // The model answers with parseable JSON whose "price" is a string.
    // A schema mismatch is not a parse failure, so no retry happens and
    // InvalidJson comes back immediately.
    let provider = MockProvider::ok(r#"{"price": "not-a-number"}"#);
    let outcome = extract_json("content", &schema, &provider, None).await;
    assert!(
        matches!(outcome, Err(LlmError::InvalidJson(_))),
        "expected InvalidJson for schema mismatch, got {outcome:?}"
    );
}
#[tokio::test]
async fn schema_validation_fails_for_missing_required_field() {
    // "title" is required but absent from the response → InvalidJson.
    let schema = serde_json::json!({
        "type": "object",
        "required": ["title"],
        "properties": { "title": { "type": "string" } }
    });
    let provider = MockProvider::ok(r#"{"other": "value"}"#);
    assert!(matches!(
        extract_json("content", &schema, &provider, None).await,
        Err(LlmError::InvalidJson(_))
    ));
}
#[tokio::test]
async fn parse_failure_triggers_one_retry() {
    use crate::testing::mock::SequenceMockProvider;
    let schema = serde_json::json!({
        "type": "object",
        "properties": { "title": { "type": "string" } }
    });
    // Call 1 returns garbage; call 2 returns schema-conforming JSON,
    // so the single retry must rescue the extraction.
    let responses = vec![
        Ok("this is not json at all".to_string()),
        Ok(r#"{"title": "Retry succeeded"}"#.to_string()),
    ];
    let provider = SequenceMockProvider::new("mock-seq", responses);
    let value = extract_json("content", &schema, &provider, None)
        .await
        .unwrap();
    assert_eq!(value["title"], "Retry succeeded");
}
#[tokio::test]
async fn both_attempts_fail_returns_invalid_json() {
    use crate::testing::mock::SequenceMockProvider;
    let schema = serde_json::json!({
        "type": "object",
        "properties": { "title": { "type": "string" } }
    });
    // Neither the first attempt nor the single retry yields parseable JSON.
    let provider = SequenceMockProvider::new(
        "mock-seq",
        vec![Ok("not json".to_string()), Ok("also not json".to_string())],
    );
    let outcome = extract_json("content", &schema, &provider, None).await;
    assert!(
        matches!(outcome, Err(LlmError::InvalidJson(_))),
        "expected InvalidJson after both attempts fail"
    );
}
#[tokio::test]
async fn schema_mismatch_does_not_retry() {
    use crate::testing::mock::SequenceMockProvider;
    let schema = serde_json::json!({
        "type": "object",
        "required": ["price"],
        "properties": { "price": { "type": "number" } }
    });
    // The second scripted response WOULD satisfy the schema, so if a retry
    // ever happened this test would observe Ok instead of InvalidJson.
    let provider = SequenceMockProvider::new(
        "mock-seq",
        vec![
            Ok(r#"{"price": "wrong-type"}"#.to_string()),
            Ok(r#"{"price": 9.99}"#.to_string()),
        ],
    );
    let outcome = extract_json("content", &schema, &provider, None).await;
    assert!(
        matches!(outcome, Err(LlmError::InvalidJson(_))),
        "schema mismatch should not trigger retry"
    );
}
}

View file

@ -1,8 +1,9 @@
/// noxa-llm: LLM integration with local-first hybrid architecture.
/// noxa-llm: LLM integration with Gemini-CLI-first hybrid architecture.
///
/// Provider chain tries Ollama (local) first, falls back to OpenAI, then Anthropic.
/// Provides schema-based extraction, prompt extraction, and summarization
/// on top of noxa-core's content pipeline.
/// Provider chain: Gemini CLI (primary) → OpenAI → Ollama → Anthropic.
/// Gemini CLI requires the `gemini` binary on PATH; GEMINI_MODEL env var sets the model.
/// Provides schema-validated extraction (with one retry on parse failure),
/// prompt extraction, and summarization on top of noxa-core's content pipeline.
pub mod chain;
pub mod clean;
pub mod error;

View file

@ -0,0 +1,392 @@
/// Gemini CLI provider — shells out to `gemini -p` for completions.
/// Primary provider in the default chain; requires the `gemini` binary on PATH.
///
/// Prompts are passed via the `-p` flag (not via stdin or as a positional) to prevent
/// command injection from web-scraped content. Output is parsed from `--output-format json`.
///
/// # Startup optimizations
///
/// The gemini CLI is an agentic Node.js application that connects to every configured MCP
/// server at startup (the user has 6). Without mitigation this can add 10-60+ seconds per
/// call as those servers spin up and time out.
///
/// Two flags reduce this:
/// - `--extensions ""` — skips extension loading (~3 s saved)
/// - `current_dir` set to a temp workdir containing `.gemini/settings.json` with
/// `{"mcpServers":{}}` — workspace settings override user settings, so all 6 MCP
/// servers are skipped at subprocess startup (major speedup).
///
/// The workdir is created once at construction and reused for every call.
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use tokio::process::Command;
use tokio::sync::Semaphore;
use tokio::time::timeout;
use tracing::debug;
use crate::clean::strip_thinking_tags;
use crate::error::LlmError;
use crate::provider::{CompletionRequest, LlmProvider};
/// Maximum concurrent Gemini subprocess calls.
const MAX_CONCURRENT: usize = 6;
/// Subprocess deadline — prevents hung `gemini` processes blocking the chain.
const SUBPROCESS_TIMEOUT: Duration = Duration::from_secs(60);
/// Fixed workdir used for every subprocess call.
/// A workspace-level `.gemini/settings.json` here overrides the user's MCP server config.
const NOXA_GEMINI_WORKDIR: &str = "/tmp/noxa-gemini";
/// LLM provider backed by the `gemini` CLI binary (see module docs above).
pub struct GeminiCliProvider {
    /// Model passed via `--model` when a request does not specify one.
    default_model: String,
    /// Bounds concurrent subprocess spawns to `MAX_CONCURRENT`.
    semaphore: Arc<Semaphore>,
    /// Workdir with a minimal `.gemini/settings.json` that disables MCP servers.
    workdir: PathBuf,
}
impl GeminiCliProvider {
    /// Construct the provider.
    ///
    /// Model resolution order: explicit `model` arg → non-empty `GEMINI_MODEL`
    /// env var → `"gemini-2.5-pro"`. The shared workdir is set up eagerly so
    /// every completion call can simply `cd` into it.
    pub fn new(model: Option<String>) -> Self {
        let configured = model.or_else(|| std::env::var("GEMINI_MODEL").ok());
        // An empty string (arg or env) counts as "not configured".
        let default_model = match configured {
            Some(m) if !m.is_empty() => m,
            _ => String::from("gemini-2.5-pro"),
        };
        let workdir = PathBuf::from(NOXA_GEMINI_WORKDIR);
        ensure_gemini_workdir(&workdir);
        Self {
            default_model,
            semaphore: Arc::new(Semaphore::new(MAX_CONCURRENT)),
            workdir,
        }
    }

    /// Test-only accessor for the resolved default model.
    #[cfg(test)]
    fn default_model(&self) -> &str {
        &self.default_model
    }
}
#[async_trait]
impl LlmProvider for GeminiCliProvider {
    /// Run one completion by shelling out to the `gemini` CLI.
    ///
    /// The prompt is always passed as the value of `-p` (never a positional
    /// arg or stdin) so scraped content cannot be interpreted as a shell
    /// command. Concurrency is bounded by `MAX_CONCURRENT`; each call is
    /// bounded by `SUBPROCESS_TIMEOUT`.
    ///
    /// # Errors
    /// - `LlmError::Subprocess` if spawning or waiting fails
    /// - `LlmError::Timeout` if the deadline elapses (the child is killed)
    /// - `LlmError::ProviderError` on non-zero exit or unparseable output
    async fn complete(&self, request: &CompletionRequest) -> Result<String, LlmError> {
        // Per-request model override; empty string means "use the default".
        let model = if request.model.is_empty() {
            &self.default_model
        } else {
            &request.model
        };
        // Build the prompt text from all messages.
        let prompt = build_prompt(&request.messages);
        // Acquire concurrency slot before spawning.
        let _permit = self
            .semaphore
            .acquire()
            .await
            .map_err(|_| LlmError::ProviderError("gemini semaphore closed".into()))?;
        let mut cmd = Command::new("gemini");
        // -p STRING: headless mode with prompt as the flag value (never positional arg).
        // Passing via -p prevents command injection; the value is never interpreted as a shell command.
        cmd.arg("-p").arg(&prompt);
        cmd.arg("--model").arg(model);
        // Always request structured JSON output so we can extract the `response` field
        // and skip any preceding noise lines (e.g. MCP status warnings).
        cmd.arg("--output-format").arg("json");
        // --yolo suppresses any interactive confirmation prompts in headless mode.
        cmd.arg("--yolo");
        // --extensions "" skips loading user extensions (~3 s startup savings).
        cmd.arg("--extensions").arg("");
        // Workspace settings in self.workdir override the user's ~/.gemini/settings.json,
        // replacing the user's MCP server list with {} so none are spawned at startup.
        // Without this, each of the user's MCP servers adds latency to every call.
        cmd.current_dir(&self.workdir);
        cmd.stdin(std::process::Stdio::null());
        cmd.stdout(std::process::Stdio::piped());
        cmd.stderr(std::process::Stdio::piped());
        // BUGFIX: kill the child when its handle is dropped. Without this, a
        // timeout below drops the wait future but leaves the `gemini` process
        // running in the background, leaking a subprocess per timed-out call.
        cmd.kill_on_drop(true);
        debug!(model, workdir = %self.workdir.display(), "spawning gemini subprocess");
        let child = cmd.spawn().map_err(LlmError::Subprocess)?;
        // Bounded wait — prevents indefinite hangs on auth expiry or network stall.
        let output = match timeout(SUBPROCESS_TIMEOUT, child.wait_with_output()).await {
            Ok(Ok(out)) => out,
            Ok(Err(e)) => return Err(LlmError::Subprocess(e)),
            // Dropping the wait future reaps the child via kill_on_drop above.
            Err(_elapsed) => return Err(LlmError::Timeout),
        };
        if !output.status.success() {
            let stderr_text = String::from_utf8_lossy(&output.stderr);
            // BUGFIX: truncate by characters, not bytes — slicing a UTF-8
            // string at a fixed byte offset can panic mid-codepoint.
            let preview: String = stderr_text.chars().take(500).collect();
            return Err(LlmError::ProviderError(format!(
                "gemini exited with {}: {preview}",
                output.status
            )));
        }
        let stdout = String::from_utf8_lossy(&output.stdout);
        let response = extract_response_from_output(&stdout)?;
        let cleaned = strip_code_fences(strip_thinking_tags(&response).trim());
        Ok(cleaned)
    }

    /// Fast availability probe: `gemini --version` exits 0.
    /// Pure PATH check — no inference call.
    async fn is_available(&self) -> bool {
        matches!(
            Command::new("gemini")
                .arg("--version")
                .stdout(std::process::Stdio::null())
                .stderr(std::process::Stdio::null())
                .status()
                .await,
            Ok(s) if s.success()
        )
    }

    fn name(&self) -> &str {
        "gemini"
    }
}
/// Parse the `response` field from gemini's `--output-format json` output.
///
/// The CLI emits lines before the JSON object (e.g. MCP status warnings).
/// We find the first `{` to locate the JSON, parse it, and extract `.response`.
///
/// # Errors
/// `LlmError::ProviderError` when no JSON object is present, the JSON fails to
/// parse, or the `response` field is missing or not a string. Error messages
/// carry a character-bounded preview of the offending output.
fn extract_response_from_output(stdout: &str) -> Result<String, LlmError> {
    // BUGFIX: previews are truncated by characters, not bytes — byte slicing
    // at a fixed offset (`&s[..300]`) can panic inside a multi-byte UTF-8
    // sequence when the CLI echoes non-ASCII content.
    fn preview(s: &str, max_chars: usize) -> String {
        s.chars().take(max_chars).collect()
    }
    // `find('{')` returns a byte index of an ASCII char, so slicing at it is
    // always on a char boundary.
    let json_start = stdout.find('{').ok_or_else(|| {
        LlmError::ProviderError(format!(
            "gemini produced no JSON output: {}",
            preview(stdout, 300)
        ))
    })?;
    let json_str = &stdout[json_start..];
    let outer: serde_json::Value = serde_json::from_str(json_str).map_err(|e| {
        LlmError::ProviderError(format!(
            "failed to parse gemini JSON output: {e}: {}",
            preview(json_str, 300)
        ))
    })?;
    // `response` holds the model's actual text output.
    outer["response"]
        .as_str()
        .map(|s| s.to_string())
        .ok_or_else(|| {
            LlmError::ProviderError(format!(
                "gemini JSON output missing 'response' field: {}",
                preview(json_str, 300)
            ))
        })
}
/// Create the noxa gemini workdir with a minimal workspace settings file.
///
/// The `.gemini/settings.json` written here overrides the user's
/// `~/.gemini/settings.json` for any `gemini` subprocess run from this
/// directory; `"mcpServers": {}` keeps the CLI from spawning the user's
/// configured MCP servers on every headless call.
///
/// Failures are logged and swallowed on purpose: without the settings file
/// the subprocess still works, just without the startup optimization.
fn ensure_gemini_workdir(workdir: &std::path::Path) {
    let settings_dir = workdir.join(".gemini");
    let settings_path = settings_dir.join("settings.json");
    // Already initialized by a previous run — nothing to do.
    if settings_path.exists() {
        return;
    }
    match std::fs::create_dir_all(&settings_dir) {
        Err(e) => {
            tracing::warn!(path = %settings_dir.display(), error = %e, "failed to create gemini workdir");
        }
        Ok(()) => {
            // Minimal workspace settings: disable all MCP servers.
            // Workspace settings override ~/.gemini/settings.json per gemini CLI docs.
            if let Err(e) = std::fs::write(&settings_path, r#"{"mcpServers":{}}"#) {
                tracing::warn!(path = %settings_path.display(), error = %e, "failed to write gemini workspace settings");
            }
        }
    }
}
/// Concatenate all messages into a single prompt string for the CLI.
///
/// System and assistant turns get a bracketed role prefix; user turns pass
/// through verbatim. Turns are separated by a blank line.
fn build_prompt(messages: &[crate::provider::Message]) -> String {
    let mut parts = Vec::with_capacity(messages.len());
    for message in messages {
        let rendered = match message.role.as_str() {
            "system" => format!("[System]: {}", message.content),
            "assistant" => format!("[Assistant]: {}", message.content),
            _ => message.content.clone(),
        };
        parts.push(rendered);
    }
    parts.join("\n\n")
}
/// Strip a surrounding markdown code fence from a response string.
///
/// Generalized to handle any info string on the opening fence (```json,
/// ```text, ```js, …) by discarding the entire opening-fence line — the
/// original only recognized the literal `json` tag, letting other language
/// tags leak into the result. Input without a fence is returned trimmed but
/// otherwise untouched.
fn strip_code_fences(s: &str) -> String {
    let trimmed = s.trim();
    let Some(after_fence) = trimmed.strip_prefix("```") else {
        return trimmed.to_string();
    };
    // Drop the rest of the opening-fence line (any language tag). When the
    // fence has no newline at all, fall back to stripping a literal "json"
    // tag, preserving the previous behavior for one-line input.
    let body = match after_fence.split_once('\n') {
        Some((_info_string, rest)) => rest,
        None => after_fence.strip_prefix("json").unwrap_or(after_fence),
    };
    body.strip_suffix("```").unwrap_or(body).trim().to_string()
}
#[cfg(test)]
mod tests {
    use super::*;
    // Unit tests for construction and the pure helpers. Subprocess behavior
    // (spawn / timeout / JSON round-trip through the real CLI) is not
    // exercised here — that would require a `gemini` binary on PATH.

    // ── Construction ──────────────────────────────────────────────────────────
    #[test]
    fn explicit_model_used() {
        let p = GeminiCliProvider::new(Some("gemini-1.5-flash".into()));
        assert_eq!(p.default_model(), "gemini-1.5-flash");
        assert_eq!(p.name(), "gemini");
    }
    #[test]
    fn default_model_fallback() {
        // Explicit None + no GEMINI_MODEL env → hardcoded default.
        // We unset the env to avoid flakiness (it may or may not be set).
        // NOTE(review): env removal is process-global and can race with other
        // env-mutating tests when run in parallel — consider marking this
        // #[ignore] like env_model_override below.
        unsafe { std::env::remove_var("GEMINI_MODEL") };
        let p = GeminiCliProvider::new(None);
        assert_eq!(p.default_model(), "gemini-2.5-pro");
    }
    // Env var tests mutate process-global state and race with parallel tests.
    // Run in isolation if needed:
    // cargo test -p noxa-llm env_model_override -- --ignored --test-threads=1
    #[test]
    #[ignore = "mutates process env; run with --test-threads=1"]
    fn env_model_override() {
        unsafe { std::env::set_var("GEMINI_MODEL", "gemini-1.5-pro") };
        let p = GeminiCliProvider::new(None);
        assert_eq!(p.default_model(), "gemini-1.5-pro");
        unsafe { std::env::remove_var("GEMINI_MODEL") };
    }

    // ── build_prompt ──────────────────────────────────────────────────────────
    #[test]
    fn build_prompt_user_only() {
        use crate::provider::Message;
        // A lone user message passes through without any role prefix.
        let messages = vec![Message {
            role: "user".into(),
            content: "hello world".into(),
        }];
        assert_eq!(build_prompt(&messages), "hello world");
    }
    #[test]
    fn build_prompt_system_and_user() {
        use crate::provider::Message;
        let messages = vec![
            Message {
                role: "system".into(),
                content: "You are helpful.".into(),
            },
            Message {
                role: "user".into(),
                content: "Tell me something.".into(),
            },
        ];
        let result = build_prompt(&messages);
        // System turns get a "[System]:" prefix; user content stays bare.
        assert!(result.contains("[System]: You are helpful."));
        assert!(result.contains("Tell me something."));
    }

    // ── extract_response_from_output ──────────────────────────────────────────
    #[test]
    fn extracts_response_from_clean_json() {
        let stdout = r#"{"session_id":"abc","response":"Hello world","stats":{}}"#;
        assert_eq!(extract_response_from_output(stdout).unwrap(), "Hello world");
    }
    #[test]
    fn extracts_response_skipping_mcp_noise() {
        // MCP warning line appears before the JSON object in real gemini output.
        let stdout = "MCP issues detected. Run /mcp list for status.\n{\"session_id\":\"abc\",\"response\":\"the answer\",\"stats\":{}}";
        assert_eq!(
            extract_response_from_output(stdout).unwrap(),
            "the answer"
        );
    }
    #[test]
    fn error_when_no_json_in_output() {
        let result = extract_response_from_output("MCP issues detected. No JSON follows.");
        assert!(matches!(result, Err(LlmError::ProviderError(_))));
    }
    #[test]
    fn error_when_response_field_missing() {
        let stdout = r#"{"session_id":"abc","stats":{}}"#;
        let result = extract_response_from_output(stdout);
        assert!(matches!(result, Err(LlmError::ProviderError(_))));
    }

    // ── strip_code_fences ─────────────────────────────────────────────────────
    #[test]
    fn strips_json_fence() {
        let input = "```json\n{\"key\": \"value\"}\n```";
        assert_eq!(strip_code_fences(input), "{\"key\": \"value\"}");
    }
    #[test]
    fn strips_plain_fence() {
        let input = "```\nhello\n```";
        assert_eq!(strip_code_fences(input), "hello");
    }
    #[test]
    fn passthrough_no_fence() {
        let input = "{\"key\": \"value\"}";
        assert_eq!(strip_code_fences(input), "{\"key\": \"value\"}");
    }

    // ── is_available returns false when binary absent ──────────────────────────
    #[tokio::test]
    async fn unavailable_when_binary_missing() {
        // NOTE(review): this verifies the spawn-failure path that
        // `is_available` relies on, but it does not call `is_available`
        // itself — that method hardcodes the `gemini` binary name, so the
        // missing-binary case cannot be triggered through the public API.
        let result = tokio::process::Command::new("__noxa_nonexistent_binary_xyz__")
            .arg("--version")
            .stdout(std::process::Stdio::null())
            .stderr(std::process::Stdio::null())
            .status()
            .await;
        assert!(result.is_err(), "missing binary should fail to spawn");
    }

    // ── thinking tag stripping ────────────────────────────────────────────────
    #[test]
    fn strips_thinking_tags_from_output() {
        // Mirrors the post-processing order in `complete`: tags, then fences.
        let raw = "<think>internal reasoning</think>{\"result\": true}";
        let after_thinking = strip_thinking_tags(raw);
        let after_fences = strip_code_fences(after_thinking.trim());
        assert_eq!(after_fences, "{\"result\": true}");
    }
    #[test]
    fn strips_code_fence_after_thinking() {
        let raw = "<think>let me check</think>\n```json\n{\"ok\": 1}\n```";
        let after_thinking = strip_thinking_tags(raw);
        let after_fences = strip_code_fences(after_thinking.trim());
        assert_eq!(after_fences, "{\"ok\": 1}");
    }
}

View file

@ -1,4 +1,5 @@
pub mod anthropic;
pub mod gemini_cli;
pub mod ollama;
pub mod openai;

View file

@ -2,6 +2,7 @@
/// First choice in the provider chain: free, private, fast on Apple Silicon.
use async_trait::async_trait;
use serde_json::json;
use std::time::Duration;
use crate::clean::strip_thinking_tags;
use crate::error::LlmError;
@ -96,7 +97,10 @@ impl LlmProvider for OllamaProvider {
async fn is_available(&self) -> bool {
let url = format!("{}/api/tags", self.base_url);
matches!(self.client.get(&url).send().await, Ok(r) if r.status().is_success())
matches!(
tokio::time::timeout(Duration::from_millis(500), self.client.get(&url).send()).await,
Ok(Ok(r)) if r.status().is_success()
)
}
fn name(&self) -> &str {

View file

@ -4,6 +4,9 @@
/// extract, chain, and other modules that need a fake LLM backend.
#[cfg(test)]
pub(crate) mod mock {
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use async_trait::async_trait;
use crate::error::LlmError;
@ -45,4 +48,48 @@ pub(crate) mod mock {
self.name
}
}
/// A mock provider that returns responses from a sequence.
/// Call N → returns responses[N]; once the sequence is exhausted the LAST
/// response is repeated (the index is clamped, not wrapped around).
/// Useful for testing first-failure / second-success retry paths.
pub struct SequenceMockProvider {
    /// Provider name reported via `LlmProvider::name`.
    pub name: &'static str,
    /// Scripted responses, consumed in call order.
    pub responses: Vec<Result<String, String>>,
    /// Value returned by `is_available`.
    pub available: bool,
    // Number of `complete` calls so far; incremented atomically per call.
    call_count: Arc<AtomicUsize>,
}
impl SequenceMockProvider {
    /// Create a provider that is available and replays `responses` in order.
    pub fn new(
        name: &'static str,
        responses: Vec<Result<String, String>>,
    ) -> Self {
        Self {
            name,
            responses,
            available: true,
            call_count: Arc::new(AtomicUsize::new(0)),
        }
    }
}
#[async_trait]
impl LlmProvider for SequenceMockProvider {
    /// Return the next scripted response; after the sequence is exhausted the
    /// last response is repeated.
    ///
    /// BUGFIX: an empty `responses` vec now yields a `ProviderError` instead
    /// of panicking — the old `idx.min(self.responses.len() - 1)` underflowed
    /// `usize` when `len() == 0`.
    async fn complete(&self, _request: &CompletionRequest) -> Result<String, LlmError> {
        let idx = self.call_count.fetch_add(1, Ordering::SeqCst);
        let response = self
            .responses
            .get(idx)
            .or_else(|| self.responses.last())
            .ok_or_else(|| {
                LlmError::ProviderError("SequenceMockProvider has no responses".into())
            })?;
        match response {
            Ok(text) => Ok(text.clone()),
            Err(msg) => Err(LlmError::ProviderError(msg.clone())),
        }
    }
    async fn is_available(&self) -> bool {
        self.available
    }
    fn name(&self) -> &str {
        self.name
    }
}
}

View file

@ -5,6 +5,10 @@ version.workspace = true
edition.workspace = true
license.workspace = true
[lib]
name = "noxa_mcp"
path = "src/lib.rs"
[[bin]]
name = "noxa-mcp"
path = "src/main.rs"
@ -14,8 +18,8 @@ noxa-core = { workspace = true }
noxa-fetch = { workspace = true }
noxa-llm = { workspace = true }
noxa-pdf = { workspace = true }
rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] }
schemars = "1.0"
rmcp = { workspace = true }
schemars = { workspace = true }
dotenvy = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
@ -24,4 +28,4 @@ tracing = { workspace = true }
tracing-subscriber = { workspace = true }
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
url = "2"
dirs = "6.0.0"
dirs = { workspace = true }

View file

@ -0,0 +1,20 @@
/// noxa-mcp library wrapper.
///
/// This exposes the MCP server so it can be embedded by the `noxa` CLI via
/// `noxa mcp` without duplicating the transport/bootstrap code.
///
/// Callers must initialize tracing before calling `run()`. Stdout must remain
/// untouched after `run()` begins because it carries the MCP wire protocol.
pub(crate) mod cloud;
pub(crate) mod server;
pub(crate) mod tools;
use rmcp::ServiceExt;
use rmcp::transport::stdio;
/// Start the MCP server over stdio and block until the client disconnects.
///
/// Callers must have initialized tracing already; stdout carries the MCP wire
/// protocol once this returns the running service.
pub async fn run() -> Result<(), Box<dyn std::error::Error>> {
    let server = server::NoxaMcp::new().await;
    let running = server.serve(stdio()).await?;
    running.waiting().await?;
    Ok(())
}

View file

@ -1,15 +1,6 @@
/// noxa-mcp: MCP (Model Context Protocol) server for noxa.
/// Exposes web extraction tools over stdio transport for AI agents
/// like Claude Desktop, Claude Code, and other MCP clients.
mod cloud;
mod server;
mod tools;
use rmcp::ServiceExt;
use rmcp::transport::stdio;
use server::NoxaMcp;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
dotenvy::dotenv().ok();
@ -21,8 +12,5 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.with_ansi(false)
.init();
let service = NoxaMcp::new().await.serve(stdio()).await?;
service.waiting().await?;
Ok(())
noxa_mcp::run().await
}

View file

@ -89,7 +89,7 @@ impl NoxaMcp {
let chain = noxa_llm::ProviderChain::default().await;
let llm_chain = if chain.is_empty() {
warn!("no LLM providers available -- extract/summarize tools will fail");
warn!("no LLM providers available (gemini CLI, OPENAI_API_KEY, ANTHROPIC_API_KEY) -- extract/summarize tools will fail");
None
} else {
info!(providers = chain.len(), "LLM provider chain ready");
@ -334,7 +334,7 @@ impl NoxaMcp {
// No local LLM — fall back to cloud API directly
if self.llm_chain.is_none() {
let cloud = self.cloud.as_ref().ok_or(
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
"No LLM providers available. Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
)?;
let mut body = json!({"url": params.url});
if let Some(ref schema) = params.schema {
@ -387,7 +387,7 @@ impl NoxaMcp {
// No local LLM — fall back to cloud API directly
if self.llm_chain.is_none() {
let cloud = self.cloud.as_ref().ok_or(
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
"No LLM providers available. Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
)?;
let mut body = json!({"url": params.url});
if let Some(sentences) = params.max_sentences {

View file

@ -1,43 +1,20 @@
# ============================================
# Noxa Configuration
# Copy to .env and fill in your values
# ============================================
# Secrets, URLs, and path overrides only — everything else goes in config.json
# See config.example.json for the full list of configurable defaults.
# --- LLM Providers ---
# Cloud API key (required for --cloud / --research)
NOXA_API_KEY=
# Ollama (local, default provider)
OLLAMA_HOST=http://localhost:11434
OLLAMA_MODEL=qwen3:8b
# Single proxy URL (or use NOXA_PROXY_FILE for pool rotation)
NOXA_PROXY=
# OpenAI (optional cloud fallback)
# OPENAI_API_KEY — set your OpenAI key
# OPENAI_BASE_URL — defaults to https://api.openai.com/v1
# OPENAI_MODEL — defaults to gpt-4o-mini
# Proxy pool file path for rotating proxies
NOXA_PROXY_FILE=
# Anthropic (optional cloud fallback)
# ANTHROPIC_API_KEY — set your Anthropic key
# ANTHROPIC_MODEL — defaults to claude-sonnet-4-20250514
# Webhook URL for completion notifications
NOXA_WEBHOOK_URL=
# --- Proxy ---
# LLM base URL (Ollama or OpenAI-compatible endpoint)
NOXA_LLM_BASE_URL=
# Single proxy
# NOXA_PROXY=http://user:pass@host:port
# Proxy file (one per line: host:port:user:pass)
# NOXA_PROXY_FILE=/path/to/proxies.txt
# --- Server (noxa-server only) ---
# NOXA_PORT=3000
# NOXA_HOST=0.0.0.0
# NOXA_AUTH_KEY=your-auth-key
# NOXA_MAX_CONCURRENCY=50
# NOXA_JOB_TTL_SECS=3600
# NOXA_MAX_JOBS=100
# --- CLI LLM overrides ---
# NOXA_LLM_PROVIDER=ollama
# NOXA_LLM_MODEL=qwen3:8b
# NOXA_LLM_BASE_URL=http://localhost:11434
# --- Logging ---
# NOXA_LOG=info
# Optional: path to a non-default config file (default: ./config.json)
# NOXA_CONFIG=/path/to/my-config.json