diff --git a/.gitignore b/.gitignore
index 63934d6..6293f80 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,21 @@
target/
.DS_Store
.env
+config.json
proxies.txt
.claude/skills/
+.omc
+.lavra
+.beads
+.cache
+docs/plans
+docs/superpowers
+docs/reports
+docs/sessions
+benchmarks
+docs
+
+# Beads / Dolt files (added by bd init)
+.dolt/
+*.db
+.beads-credential-key
diff --git a/CLAUDE.md b/CLAUDE.md
index 0f3b388..6e6ab01 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -15,8 +15,8 @@ noxa/
# + proxy pool rotation (per-request)
# + PDF content-type detection
# + document parsing (DOCX, XLSX, CSV)
- noxa-llm/ # LLM provider chain (Ollama -> OpenAI -> Anthropic)
- # + JSON schema extraction, prompt extraction, summarization
+ noxa-llm/ # LLM provider chain (Gemini CLI -> OpenAI -> Ollama -> Anthropic)
+ # + JSON schema extraction (validated + retry), prompt extraction, summarization
noxa-pdf/ # PDF text extraction via pdf-extract
noxa-mcp/ # MCP server (Model Context Protocol) for AI agents
noxa/ # CLI binary
@@ -48,8 +48,10 @@ Two binaries: `noxa` (CLI), `noxa-mcp` (MCP server).
- `search.rs` — Web search via Serper.dev with parallel result scraping
### LLM Modules (`noxa-llm`)
-- Provider chain: Ollama (local-first) -> OpenAI -> Anthropic
-- JSON schema extraction, prompt-based extraction, summarization
+- Provider chain: Gemini CLI (primary) -> OpenAI -> Ollama -> Anthropic
+- Gemini CLI requires the `gemini` binary on PATH; `GEMINI_MODEL` env var controls model (default: `gemini-2.5-pro`)
+- JSON schema extraction with jsonschema validation; parse failures retry once; schema mismatches fail immediately
+- Prompt-based extraction, summarization
### PDF Modules (`noxa-pdf`)
- PDF text extraction via pdf-extract crate
@@ -105,11 +107,15 @@ noxa https://example.com --diff-with snap.json
# Brand extraction
noxa https://example.com --brand
-# LLM features (Ollama local-first)
+# LLM features (Gemini CLI primary; requires `gemini` on PATH)
noxa https://example.com --summarize
noxa https://example.com --extract-prompt "Get all pricing tiers"
noxa https://example.com --extract-json '{"type":"object","properties":{"title":{"type":"string"}}}'
+# Force a specific LLM provider
+noxa https://example.com --llm-provider gemini --summarize
+noxa https://example.com --llm-provider openai --summarize
+
# PDF (auto-detected via Content-Type)
noxa https://example.com/report.pdf
diff --git a/Cargo.lock b/Cargo.lock
index 0b9cb9d..f9ca781 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -35,7 +35,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [
"cfg-if",
+ "getrandom 0.3.4",
"once_cell",
+ "serde",
"version_check",
"zerocopy",
]
@@ -64,6 +66,12 @@ dependencies = [
"alloc-no-stdlib",
]
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
[[package]]
name = "android_system_properties"
version = "0.1.5"
@@ -206,6 +214,21 @@ dependencies = [
"syn",
]
+[[package]]
+name = "bit-set"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
+dependencies = [
+ "bit-vec",
+]
+
+[[package]]
+name = "bit-vec"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
+
[[package]]
name = "bitflags"
version = "2.11.0"
@@ -246,6 +269,12 @@ dependencies = [
"openssl-macros",
]
+[[package]]
+name = "borrow-or-share"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc0b364ead1874514c8c2855ab558056ebfeb775653e7ae45ff72f28f8f3166c"
+
[[package]]
name = "brotli"
version = "8.0.2"
@@ -273,6 +302,12 @@ version = "3.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
+[[package]]
+name = "bytecount"
+version = "0.6.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e"
+
[[package]]
name = "byteorder"
version = "1.5.0"
@@ -601,6 +636,12 @@ dependencies = [
"syn",
]
+[[package]]
+name = "data-encoding"
+version = "2.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea"
+
[[package]]
name = "debug_unsafe"
version = "0.1.4"
@@ -726,6 +767,15 @@ version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+[[package]]
+name = "email_address"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e079f19b08ca6239f47f8ba8509c11cf3ea30095831f7fed61441475edd8c449"
+dependencies = [
+ "serde",
+]
+
[[package]]
name = "encoding_rs"
version = "0.8.35"
@@ -760,6 +810,17 @@ dependencies = [
"num-traits",
]
+[[package]]
+name = "fancy-regex"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
+dependencies = [
+ "bit-set",
+ "regex-automata",
+ "regex-syntax",
+]
+
[[package]]
name = "fast-float2"
version = "0.2.3"
@@ -789,6 +850,17 @@ dependencies = [
"zlib-rs",
]
+[[package]]
+name = "fluent-uri"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc74ac4d8359ae70623506d512209619e5cf8f347124910440dbc221714b328e"
+dependencies = [
+ "borrow-or-share",
+ "ref-cast",
+ "serde",
+]
+
[[package]]
name = "fnv"
version = "1.0.7"
@@ -801,6 +873,12 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
[[package]]
name = "foreign-types"
version = "0.5.0"
@@ -837,6 +915,16 @@ dependencies = [
"percent-encoding",
]
+[[package]]
+name = "fraction"
+version = "0.15.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f158e3ff0a1b334408dc9fb811cd99b446986f4d8b741bb08f9df1604085ae7"
+dependencies = [
+ "lazy_static",
+ "num",
+]
+
[[package]]
name = "fs_extra"
version = "1.3.0"
@@ -1037,7 +1125,7 @@ version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
- "foldhash",
+ "foldhash 0.1.5",
]
[[package]]
@@ -1045,6 +1133,11 @@ name = "hashbrown"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash 0.2.0",
+]
[[package]]
name = "heck"
@@ -1410,6 +1503,33 @@ dependencies = [
"wasm-bindgen",
]
+[[package]]
+name = "jsonschema"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84695c6689b01384700a3d93acecbd07231ee6fff1bf22ae980b4c307e6ddfd5"
+dependencies = [
+ "ahash",
+ "bytecount",
+ "data-encoding",
+ "email_address",
+ "fancy-regex",
+ "fraction",
+ "getrandom 0.3.4",
+ "idna",
+ "itoa",
+ "num-cmp",
+ "num-traits",
+ "percent-encoding",
+ "referencing",
+ "regex",
+ "regex-syntax",
+ "serde",
+ "serde_json",
+ "unicode-general-category",
+ "uuid-simd",
+]
+
[[package]]
name = "lazy_static"
version = "1.5.0"
@@ -1575,6 +1695,12 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
+[[package]]
+name = "micromap"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2a86d3146ed3995b5913c414f6664344b9617457320782e64f0bb44afd49d74"
+
[[package]]
name = "minimal-lexical"
version = "0.2.1"
@@ -1627,10 +1753,12 @@ dependencies = [
"noxa-core",
"noxa-fetch",
"noxa-llm",
+ "noxa-mcp",
"noxa-pdf",
"rand 0.8.5",
"regex",
"reqwest",
+ "serde",
"serde_json",
"tokio",
"tracing",
@@ -1683,6 +1811,7 @@ name = "noxa-llm"
version = "0.3.11"
dependencies = [
"async-trait",
+ "jsonschema",
"reqwest",
"serde",
"serde_json",
@@ -1730,12 +1859,82 @@ dependencies = [
"windows-sys 0.61.2",
]
+[[package]]
+name = "num"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
+dependencies = [
+ "num-bigint",
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-cmp"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63335b2e2c34fae2fb0aa2cecfd9f0832a1e24b3b32ecec612c3426d46dc8aaa"
+
+[[package]]
+name = "num-complex"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
+dependencies = [
+ "num-traits",
+]
+
[[package]]
name = "num-conv"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967"
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-iter"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
+dependencies = [
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+]
+
[[package]]
name = "num-traits"
version = "0.2.19"
@@ -1774,6 +1973,12 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
+[[package]]
+name = "outref"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e"
+
[[package]]
name = "parking_lot"
version = "0.12.5"
@@ -2160,6 +2365,23 @@ dependencies = [
"syn",
]
+[[package]]
+name = "referencing"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2d5554bf79f4acf770dc3193b44b2d63b348f5f7b7448a0ea1191b37b620728"
+dependencies = [
+ "ahash",
+ "fluent-uri",
+ "getrandom 0.3.4",
+ "hashbrown 0.16.1",
+ "itoa",
+ "micromap",
+ "parking_lot",
+ "percent-encoding",
+ "serde_json",
+]
+
[[package]]
name = "regex"
version = "1.12.3"
@@ -2985,6 +3207,12 @@ version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
+[[package]]
+name = "unicode-general-category"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b993bddc193ae5bd0d623b49ec06ac3e9312875fdae725a975c51db1cc1677f"
+
[[package]]
name = "unicode-ident"
version = "1.0.24"
@@ -3049,6 +3277,16 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+[[package]]
+name = "uuid-simd"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23b082222b4f6619906941c17eb2297fff4c2fb96cb60164170522942a200bd8"
+dependencies = [
+ "outref",
+ "vsimd",
+]
+
[[package]]
name = "valuable"
version = "0.1.1"
@@ -3061,6 +3299,12 @@ version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
+[[package]]
+name = "vsimd"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64"
+
[[package]]
name = "want"
version = "0.3.1"
diff --git a/Cargo.toml b/Cargo.toml
index 1b90acd..81bfd4b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,6 +13,7 @@ noxa-core = { path = "crates/noxa-core" }
noxa-fetch = { path = "crates/noxa-fetch" }
noxa-llm = { path = "crates/noxa-llm" }
noxa-pdf = { path = "crates/noxa-pdf" }
+noxa-mcp = { path = "crates/noxa-mcp" }
tokio = { version = "1", features = ["full"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
@@ -21,3 +22,6 @@ tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
clap = { version = "4", features = ["derive", "env"] }
dotenvy = "0.15"
+rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] }
+schemars = "1.0"
+dirs = "6.0.0"
diff --git a/README.md b/README.md
index cd3cba4..fea03dc 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ Download from [GitHub Releases](https://github.com/jmagar/noxa/releases) for mac
### Cargo (from source)
```bash
-cargo install --git https://github.com/jmagar/noxa.git noxa
+cargo install --git https://github.com/jmagar/noxa.git noxa-cli --bin noxa
cargo install --git https://github.com/jmagar/noxa.git noxa-mcp
```
@@ -159,6 +159,271 @@ Crawling... 50/50 pages extracted
---
+## Examples
+
+### Basic Extraction
+
+```bash
+# Extract as markdown (default)
+noxa https://example.com
+
+# Multiple output formats
+noxa https://example.com -f markdown # Clean markdown
+noxa https://example.com -f json # Full structured JSON
+noxa https://example.com -f text # Plain text (no formatting)
+noxa https://example.com -f llm # Token-optimized for LLMs (67% fewer tokens)
+
+# Bare domains work (auto-prepends https://)
+noxa example.com
+```
+
+### Content Filtering
+
+```bash
+# Only extract main content (skip nav, sidebar, footer)
+noxa https://docs.rs/tokio --only-main-content
+
+# Include specific CSS selectors
+noxa https://news.ycombinator.com --include ".titleline,.score"
+
+# Exclude specific elements
+noxa https://example.com --exclude "nav,footer,.ads,.sidebar"
+
+# Combine both
+noxa https://docs.rs/reqwest --only-main-content --exclude ".sidebar"
+```
+
+### Brand Identity Extraction
+
+```bash
+# Extract colors, fonts, logos from any website
+noxa --brand https://stripe.com
+# Output: { "name": "Stripe", "colors": [...], "fonts": ["Sohne"], "logos": [...] }
+
+noxa --brand https://github.com
+# Output: { "name": "GitHub", "colors": [{"hex": "#1F2328", ...}], "fonts": ["Mona Sans"], ... }
+
+noxa --brand wikipedia.org
+# Output: 10 colors, 5 fonts, favicon, logo URL
+```
+
+### Sitemap Discovery
+
+```bash
+# Discover all URLs from a site's sitemaps
+noxa --map https://sitemaps.org
+# Output: one URL per line (84 URLs found)
+
+# JSON output with metadata
+noxa --map https://sitemaps.org -f json
+# Output: [{ "url": "...", "last_modified": "...", "priority": 0.8 }]
+```
+
+### Recursive Crawling
+
+```bash
+# Crawl a site (default: depth 1, max 20 pages)
+noxa --crawl https://example.com
+
+# Control depth and page limit
+noxa --crawl --depth 2 --max-pages 50 https://docs.rs/tokio
+
+# Crawl with sitemap seeding (finds more pages)
+noxa --crawl --sitemap --depth 2 https://docs.rs/tokio
+
+# Filter crawl paths
+noxa --crawl --include-paths "/api/*,/guide/*" https://docs.example.com
+noxa --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com
+
+# Control concurrency and delay
+noxa --crawl --concurrency 10 --delay 200 https://example.com
+```
+
+### Change Detection (Diff)
+
+```bash
+# Step 1: Save a snapshot
+noxa https://example.com -f json > snapshot.json
+
+# Step 2: Later, compare against the snapshot
+noxa --diff-with snapshot.json https://example.com
+# Output:
+# Status: Same
+# Word count delta: +0
+
+# If the page changed:
+# Status: Changed
+# Word count delta: +42
+# --- old
+# +++ new
+# @@ -1,3 +1,3 @@
+# -Old content here
+# +New content here
+```
+
+### PDF Extraction
+
+```bash
+# PDF URLs are auto-detected via Content-Type
+noxa https://example.com/report.pdf
+
+# Control PDF mode
+noxa --pdf-mode auto https://example.com/report.pdf # Error on empty (catches scanned PDFs)
+noxa --pdf-mode fast https://example.com/report.pdf # Return whatever text is found
+```
+
+### Batch Processing
+
+```bash
+# Multiple URLs in one command
+noxa https://example.com https://httpbin.org/html https://rust-lang.org
+
+# URLs from a file (one per line, # comments supported)
+noxa --urls-file urls.txt
+
+# Batch with JSON output
+noxa --urls-file urls.txt -f json
+
+# Proxy rotation for large batches
+noxa --urls-file urls.txt --proxy-file proxies.txt --concurrency 10
+```
+
+### Local Files & Stdin
+
+```bash
+# Extract from a local HTML file
+noxa --file page.html
+
+# Pipe HTML from another command
+curl -s https://example.com | noxa --stdin
+
+# Chain with other tools
+noxa https://example.com -f text | wc -w # Word count
+noxa https://example.com -f json | jq '.metadata.title' # Extract title with jq
+```
+
+### Browser Impersonation
+
+```bash
+# Chrome (default) — latest Chrome TLS fingerprint
+noxa https://example.com
+
+# Firefox fingerprint
+noxa --browser firefox https://example.com
+
+# Random browser per request (good for batch)
+noxa --browser random --urls-file urls.txt
+```
+
+### Custom Headers & Cookies
+
+```bash
+# Custom headers
+noxa -H "Authorization: Bearer token123" https://api.example.com
+noxa -H "Accept-Language: de-DE" https://example.com
+
+# Cookies
+noxa --cookie "session=abc123; theme=dark" https://example.com
+
+# Multiple headers
+noxa -H "X-Custom: value" -H "Authorization: Bearer token" https://example.com
+```
+
+### LLM-Powered Features
+
+These require an LLM provider (Gemini CLI on PATH — the primary provider — or local Ollama, or an OpenAI/Anthropic API key).
+
+```bash
+# Summarize a page (default: 3 sentences)
+noxa --summarize https://example.com
+
+# Control summary length
+noxa --summarize 5 https://example.com
+
+# Extract structured JSON with a schema
+noxa --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product
+
+# Extract with a schema from file
+noxa --extract-json @schema.json https://example.com/product
+
+# Extract with natural language prompt
+noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing
+
+# Use a specific LLM provider
+noxa --llm-provider ollama --summarize https://example.com
+noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..." https://example.com
+noxa --llm-provider anthropic --summarize https://example.com
+```
+
+### Raw HTML Output
+
+```bash
+# Get the raw fetched HTML (no extraction)
+noxa --raw-html https://example.com
+
+# Useful for debugging extraction issues
+noxa --raw-html https://example.com > raw.html
+noxa --file raw.html # Then extract locally
+```
+
+### Metadata & Verbose Mode
+
+```bash
+# Include YAML frontmatter with metadata
+noxa --metadata https://example.com
+# Output:
+# ---
+# title: "Example Domain"
+# source: "https://example.com"
+# word_count: 20
+# ---
+# # Example Domain
+# ...
+
+# Verbose logging (debug extraction pipeline)
+noxa -v https://example.com
+```
+
+### Proxy Usage
+
+```bash
+# Single proxy
+noxa --proxy http://user:pass@proxy.example.com:8080 https://example.com
+
+# SOCKS5 proxy
+noxa --proxy socks5://proxy.example.com:1080 https://example.com
+
+# Proxy rotation from file (one per line: host:port:user:pass)
+noxa --proxy-file proxies.txt https://example.com
+
+# Auto-load proxies.txt from current directory
+echo "proxy1.com:8080:user:pass" > proxies.txt
+noxa https://example.com # Automatically detects and uses proxies.txt
+```
+
+### Real-World Recipes
+
+```bash
+# Monitor competitor pricing — save today's pricing
+noxa --extract-json '{"type":"array","items":{"type":"object","properties":{"plan":{"type":"string"},"price":{"type":"string"}}}}' \
+ https://competitor.com/pricing -f json > pricing-$(date +%Y%m%d).json
+
+# Build a documentation search index
+noxa --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt
+
+# Extract all images from a page
+noxa https://example.com -f json | jq -r '.content.images[].src'
+
+# Get all external links
+noxa https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href'
+
+# Compare two pages
+noxa https://site-a.com -f json > a.json
+noxa https://site-b.com --diff-with a.json
+```
+
+---
+
## MCP Server — 10 tools for AI agents
@@ -327,6 +592,31 @@ noxa/
## Configuration
+Non-secret defaults live in `config.json` in your working directory. Copy the example:
+
+```bash
+cp config.example.json config.json
+```
+
+**Precedence:** CLI flags > `config.json` > built-in defaults
+
+**Secrets and URLs** (API keys, proxy, webhook, LLM base URL) always go in `.env`, not `config.json`:
+
+```bash
+cp env.example .env
+```
+
+**Override config path** for a single run:
+
+```bash
+NOXA_CONFIG=/path/to/other-config.json noxa https://example.com
+NOXA_CONFIG=/dev/null noxa https://example.com # bypass config entirely
+```
+
+**Bool flag limitation:** flags like `--metadata`, `--only-main-content`, `--verbose` set to `true` in `config.json` cannot be overridden to `false` from the CLI for a single run (clap has no `--no-flag` variant). Use `NOXA_CONFIG=/dev/null` to bypass.
+
+### Environment variables
+
| Variable | Description |
|----------|-------------|
| `NOXA_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) |
diff --git a/config.example.json b/config.example.json
new file mode 100644
index 0000000..db863eb
--- /dev/null
+++ b/config.example.json
@@ -0,0 +1,34 @@
+{
+ "_doc": [
+ "Copy to config.json and remove fields you don't need.",
+ "Secrets (api_key, proxy, webhook, llm_base_url) go in .env — NOT here.",
+ "BOOL FLAG LIMITATION: once set to true here, cannot be overridden to false",
+ "from the CLI for a single run (no --no-flag support). Use NOXA_CONFIG=/dev/null",
+ "on the command line to bypass this config entirely.",
+ "on_change is intentionally absent — it must remain a CLI-only flag.",
+ "Unknown fields are silently ignored, so this file works across noxa versions."
+ ],
+
+ "format": "markdown",
+ "browser": "chrome",
+ "timeout": 30,
+ "pdf_mode": "auto",
+ "metadata": false,
+ "verbose": false,
+ "only_main_content": false,
+
+ "include_selectors": [],
+ "exclude_selectors": ["nav", "footer", ".sidebar", ".cookie-banner"],
+
+ "depth": 1,
+ "max_pages": 20,
+ "concurrency": 5,
+ "delay": 100,
+ "path_prefix": null,
+ "include_paths": [],
+ "exclude_paths": ["/changelog/*", "/blog/*", "/releases/*"],
+ "use_sitemap": false,
+
+ "llm_provider": "gemini",
+ "llm_model": "gemini-2.5-pro"
+}
diff --git a/crates/noxa-cli/Cargo.toml b/crates/noxa-cli/Cargo.toml
index 911cdf9..a874a7f 100644
--- a/crates/noxa-cli/Cargo.toml
+++ b/crates/noxa-cli/Cargo.toml
@@ -14,9 +14,11 @@ noxa-core = { workspace = true }
noxa-fetch = { workspace = true }
noxa-llm = { workspace = true }
noxa-pdf = { workspace = true }
+noxa-mcp = { workspace = true }
dotenvy = { workspace = true }
rand = "0.8"
serde_json = { workspace = true }
+serde = { workspace = true }
tokio = { workspace = true }
clap = { workspace = true }
tracing = { workspace = true }
diff --git a/crates/noxa-cli/src/config.rs b/crates/noxa-cli/src/config.rs
new file mode 100644
index 0000000..894716f
--- /dev/null
+++ b/crates/noxa-cli/src/config.rs
@@ -0,0 +1,315 @@
+use serde::Deserialize;
+use std::path::Path;
+
+use crate::{Browser, OutputFormat, PdfModeArg};
+
+/// Non-secret, non-URL configuration defaults loaded from config.json.
+/// All fields optional — absent means "use the hard default".
+/// Unknown fields are silently ignored (serde default) so config files
+/// written for a newer version of noxa work on older binaries.
+///
+/// DELIBERATELY EXCLUDED:
+/// - on_change: passes content to sh -c; must remain CLI-only to prevent
+/// shell injection via config file writes.
+/// - Secrets/URLs (api_key, proxy, webhook, llm_base_url): stay in .env.
+///
+/// BOOL FLAG LIMITATION:
+/// only_main_content, metadata, verbose, use_sitemap set to true here
+/// cannot be overridden to false from the CLI for a single run (no --no-flag
+/// variant in clap). Edit config.json or use NOXA_CONFIG=/dev/null to bypass.
+#[derive(Debug, Default, Deserialize)]
+pub struct NoxaConfig {
+ // Output
+    pub format: Option<OutputFormat>,
+    pub metadata: Option<bool>,
+    pub verbose: Option<bool>,
+
+    // Fetch
+    pub browser: Option<Browser>,
+    pub timeout: Option<u64>,
+    pub pdf_mode: Option<PdfModeArg>,
+    pub only_main_content: Option<bool>,
+
+    // CSS selectors
+    pub include_selectors: Option<Vec<String>>,
+    pub exclude_selectors: Option<Vec<String>>,
+
+    // Crawl
+    pub depth: Option<usize>,
+    pub max_pages: Option<usize>,
+    pub concurrency: Option<usize>,
+    pub delay: Option<u64>,
+    pub path_prefix: Option<String>,
+    pub include_paths: Option<Vec<String>>,
+    pub exclude_paths: Option<Vec<String>>,
+    pub use_sitemap: Option<bool>,
+
+    // LLM (non-secret: provider name and model only; base URL stays in .env)
+    pub llm_provider: Option<String>,
+    pub llm_model: Option<String>,
+}
+
+impl NoxaConfig {
+ /// Load config from an explicit path, NOXA_CONFIG env var, or ./config.json.
+ /// Returns an empty (all-None) config if the file doesn't exist.
+ /// Prints an error and exits if the file exists but is invalid JSON.
+ pub fn load(explicit_path: Option<&str>) -> Self {
+ let noxa_config_env = std::env::var("NOXA_CONFIG").ok();
+ let was_explicit = explicit_path.is_some() || noxa_config_env.is_some();
+
+ let path_str = explicit_path
+ .map(String::from)
+ .or(noxa_config_env)
+ .unwrap_or_else(|| "config.json".to_string());
+
+ let path = Path::new(&path_str);
+ if !path.exists() {
+ if was_explicit {
+ let display_name = path.file_name()
+ .and_then(|n| n.to_str())
+ .unwrap_or(&path_str);
+ eprintln!("error: config file not found: {display_name}");
+ std::process::exit(1);
+ }
+ return Self::default();
+ }
+
+ let display_name = path.file_name()
+ .and_then(|n| n.to_str())
+ .unwrap_or(&path_str);
+ eprintln!(
+ "noxa: config loaded from {display_name} \
+ (API keys and secrets belong in .env, not config.json)"
+ );
+ tracing::debug!("config path: {}", path.display());
+
+ let content = match std::fs::read_to_string(path) {
+ Ok(s) => s,
+ Err(e) => {
+ eprintln!("error: cannot read config file {display_name}: {e}");
+ std::process::exit(1);
+ }
+ };
+
+ match serde_json::from_str(&content) {
+ Ok(cfg) => cfg,
+ Err(e) => {
+ eprintln!("error: invalid JSON in config file {display_name}: {e}");
+ std::process::exit(1);
+ }
+ }
+ }
+}
+
+/// Fully resolved configuration after merging CLI flags > config file > hard defaults.
+/// All fields are concrete — no Option. This is what the rest of main.rs reads.
+///
+/// The merge uses clap's ValueSource to detect which fields were explicitly set on
+/// the command line. CLI-explicit values always win. Config fills in the rest.
+/// Hard defaults are the fallback of last resort.
+pub struct ResolvedConfig {
+ // Output
+ pub format: OutputFormat,
+ pub metadata: bool,
+ pub verbose: bool,
+
+ // Fetch
+ pub browser: Browser,
+ pub timeout: u64,
+ pub pdf_mode: PdfModeArg,
+ pub only_main_content: bool,
+ /// CLI-only output flag — not configurable via config.json (it is a per-run mode, not a persistent default).
+ pub raw_html: bool,
+
+ // CSS selectors
+    /// Vec<String> — CSS selectors passed directly to extraction filter.
+    pub include_selectors: Vec<String>,
+    /// Vec<String> — CSS selectors passed directly to extraction filter.
+    pub exclude_selectors: Vec<String>,
+
+    // Crawl
+    pub depth: usize,
+    pub max_pages: usize,
+    pub concurrency: usize,
+    pub delay: u64,
+    pub path_prefix: Option<String>,
+    /// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
+    pub include_paths: Vec<String>,
+    /// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
+    pub exclude_paths: Vec<String>,
+    pub use_sitemap: bool,
+
+    // LLM
+    pub llm_provider: Option<String>,
+    pub llm_model: Option<String>,
+}
+
+use clap::parser::ValueSource;
+
+/// Merge CLI flags (detected via ValueSource), config file, and hard defaults
+/// into a single ResolvedConfig. CLI explicit values always win.
+pub fn resolve(
+ cli: &crate::Cli,
+ matches: &clap::ArgMatches,
+ cfg: &NoxaConfig,
+) -> ResolvedConfig {
+ let explicit = |name: &str| {
+ matches.value_source(name) == Some(ValueSource::CommandLine)
+ };
+
+ ResolvedConfig {
+ format: if explicit("format") {
+ cli.format.clone()
+ } else {
+ cfg.format.clone().unwrap_or(crate::OutputFormat::Markdown)
+ },
+ browser: if explicit("browser") {
+ cli.browser.clone()
+ } else {
+ cfg.browser.clone().unwrap_or(crate::Browser::Chrome)
+ },
+ pdf_mode: if explicit("pdf_mode") {
+ cli.pdf_mode.clone()
+ } else {
+ cfg.pdf_mode.clone().unwrap_or(crate::PdfModeArg::Auto)
+ },
+ timeout: if explicit("timeout") {
+ cli.timeout
+ } else {
+ cfg.timeout.unwrap_or(30)
+ },
+ depth: if explicit("depth") {
+ cli.depth
+ } else {
+ cfg.depth.unwrap_or(1)
+ },
+ max_pages: if explicit("max_pages") {
+ cli.max_pages
+ } else {
+ cfg.max_pages.unwrap_or(20)
+ },
+ concurrency: if explicit("concurrency") {
+ cli.concurrency
+ } else {
+ cfg.concurrency.unwrap_or(5)
+ },
+ delay: if explicit("delay") {
+ cli.delay
+ } else {
+ cfg.delay.unwrap_or(100)
+ },
+ path_prefix: if explicit("path_prefix") {
+ cli.path_prefix.clone()
+ } else {
+ cfg.path_prefix.clone()
+ },
+ include_paths: if explicit("include_paths") {
+ cli.include_paths
+ .as_deref()
+ .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
+ .unwrap_or_default()
+ } else {
+ cfg.include_paths.clone().unwrap_or_default()
+ },
+ exclude_paths: if explicit("exclude_paths") {
+ cli.exclude_paths
+ .as_deref()
+ .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
+ .unwrap_or_default()
+ } else {
+ cfg.exclude_paths.clone().unwrap_or_default()
+ },
+ include_selectors: if explicit("include") {
+ cli.include
+ .as_deref()
+ .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
+ .unwrap_or_default()
+ } else {
+ cfg.include_selectors.clone().unwrap_or_default()
+ },
+ exclude_selectors: if explicit("exclude") {
+ cli.exclude
+ .as_deref()
+ .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
+ .unwrap_or_default()
+ } else {
+ cfg.exclude_selectors.clone().unwrap_or_default()
+ },
+ only_main_content: cli.only_main_content || cfg.only_main_content.unwrap_or(false),
+ metadata: cli.metadata || cfg.metadata.unwrap_or(false),
+ verbose: cli.verbose || cfg.verbose.unwrap_or(false),
+ use_sitemap: cli.sitemap || cfg.use_sitemap.unwrap_or(false),
+ raw_html: cli.raw_html,
+ llm_provider: if cli.llm_provider.is_some() {
+ cli.llm_provider.clone()
+ } else {
+ cfg.llm_provider.clone()
+ },
+ llm_model: if cli.llm_model.is_some() {
+ cli.llm_model.clone()
+ } else {
+ cfg.llm_model.clone()
+ },
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_noxa_config_deserialize_full() {
+ let json = r#"{
+ "format": "llm",
+ "depth": 3,
+ "max_pages": 100,
+ "concurrency": 10,
+ "delay": 200,
+ "browser": "firefox",
+ "timeout": 60,
+ "only_main_content": true,
+ "use_sitemap": true,
+ "path_prefix": "/docs/",
+ "include_paths": ["/docs/*", "/api/*"],
+ "exclude_paths": ["/changelog/*", "/blog/*"],
+ "include_selectors": ["article", ".content"],
+ "exclude_selectors": ["nav", "footer"],
+ "llm_provider": "gemini",
+ "llm_model": "gemini-2.5-pro",
+ "pdf_mode": "fast",
+ "metadata": true,
+ "verbose": false
+ }"#;
+ let cfg: NoxaConfig = serde_json::from_str(json).unwrap();
+ assert!(matches!(cfg.format, Some(crate::OutputFormat::Llm)));
+ assert_eq!(cfg.depth, Some(3));
+ assert_eq!(cfg.exclude_paths, Some(vec!["/changelog/*".to_string(), "/blog/*".to_string()]));
+ assert!(matches!(cfg.pdf_mode, Some(crate::PdfModeArg::Fast)));
+ }
+
+ #[test]
+ fn test_noxa_config_empty() {
+ let cfg: NoxaConfig = serde_json::from_str("{}").unwrap();
+ assert!(cfg.format.is_none());
+ assert!(cfg.depth.is_none());
+ }
+
+ #[test]
+ fn test_noxa_config_unknown_fields_ignored() {
+ // Unknown fields must NOT cause a parse failure
+ let cfg: NoxaConfig = serde_json::from_str(r#"{"depth": 2, "future_field": true}"#).unwrap();
+ assert_eq!(cfg.depth, Some(2));
+ }
+
+ #[test]
+ fn test_load_implicit_missing_file_returns_default() {
+ // When no explicit path and ./config.json doesn't exist, silently return default.
+ // The simplest test: call with None and rely on ./config.json not existing in test env.
+ // If CWD has config.json this test is skipped to avoid flakiness.
+ if std::path::Path::new("config.json").exists() {
+ return; // skip: CWD has config.json
+ }
+ let cfg = NoxaConfig::load(None);
+ assert!(cfg.format.is_none());
+ }
+}
diff --git a/crates/noxa-cli/src/main.rs b/crates/noxa-cli/src/main.rs
index 7bb0a88..7144c24 100644
--- a/crates/noxa-cli/src/main.rs
+++ b/crates/noxa-cli/src/main.rs
@@ -2,6 +2,7 @@
/// CLI entry point -- wires noxa-core and noxa-fetch into a single command.
/// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
mod cloud;
+mod config;
use std::io::{self, Read as _};
use std::path::{Path, PathBuf};
@@ -9,8 +10,7 @@ use std::process;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
-use clap::{Parser, ValueEnum};
-use tracing_subscriber::EnvFilter;
+use clap::{CommandFactory, FromArgMatches, Parser, ValueEnum};
use noxa_core::{
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
to_llm_text,
@@ -20,7 +20,10 @@ use noxa_fetch::{
FetchConfig, FetchResult, PageResult, SitemapEntry,
};
use noxa_llm::LlmProvider;
+use noxa_mcp;
use noxa_pdf::PdfMode;
+use serde::Deserialize;
+use tracing_subscriber::EnvFilter;
/// Known anti-bot challenge page titles (case-insensitive prefix match).
const ANTIBOT_TITLES: &[&str] = &[
@@ -87,6 +90,10 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
#[derive(Parser)]
#[command(name = "noxa", about = "Extract web content for LLMs", version)]
struct Cli {
+ /// Path to config.json (default: ./config.json, override with NOXA_CONFIG env var)
+ #[arg(long, global = true)]
+ config: Option<PathBuf>,
+
/// URLs to fetch (multiple allowed)
#[arg()]
urls: Vec<String>,
@@ -247,7 +254,7 @@ struct Cli {
#[arg(long, num_args = 0..=1, default_missing_value = "3")]
summarize: Option<usize>,
- /// Force a specific LLM provider (ollama, openai, anthropic)
+ /// Force a specific LLM provider (gemini, ollama, openai, anthropic)
#[arg(long, env = "NOXA_LLM_PROVIDER")]
llm_provider: Option<String>,
@@ -284,7 +291,8 @@ struct Cli {
output_dir: Option<PathBuf>,
}
-#[derive(Clone, ValueEnum)]
+#[derive(Clone, Debug, ValueEnum, Deserialize)]
+#[serde(rename_all = "lowercase")]
enum OutputFormat {
Markdown,
Json,
@@ -293,14 +301,16 @@ enum OutputFormat {
Html,
}
-#[derive(Clone, ValueEnum)]
+#[derive(Clone, Debug, ValueEnum, Deserialize)]
+#[serde(rename_all = "lowercase")]
enum Browser {
Chrome,
Firefox,
Random,
}
-#[derive(Clone, ValueEnum, Default)]
+#[derive(Clone, Debug, ValueEnum, Default, Deserialize)]
+#[serde(rename_all = "lowercase")]
enum PdfModeArg {
/// Error if PDF has no extractable text (catches scanned PDFs)
#[default]
@@ -338,12 +348,21 @@ fn init_logging(verbose: bool) {
tracing_subscriber::fmt().with_env_filter(filter).init();
}
+fn init_mcp_logging() {
+ tracing_subscriber::fmt()
+ .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
+ .with_writer(std::io::stderr)
+ .with_ansi(false)
+ .try_init()
+ .ok();
+}
+
/// Build FetchConfig from CLI flags.
///
/// `--proxy` sets a single static proxy (no rotation).
/// `--proxy-file` loads a pool of proxies and rotates per-request.
/// `--proxy` takes priority: if both are set, only the single proxy is used.
-fn build_fetch_config(cli: &Cli) -> FetchConfig {
+fn build_fetch_config(cli: &Cli, resolved: &config::ResolvedConfig) -> FetchConfig {
let (proxy, proxy_pool) = if cli.proxy.is_some() {
(cli.proxy.clone(), Vec::new())
} else if let Some(ref path) = cli.proxy_file {
@@ -403,11 +422,11 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
}
FetchConfig {
- browser: cli.browser.clone().into(),
+ browser: resolved.browser.clone().into(),
proxy,
proxy_pool,
- timeout: std::time::Duration::from_secs(cli.timeout),
- pdf_mode: cli.pdf_mode.clone().into(),
+ timeout: std::time::Duration::from_secs(resolved.timeout),
+ pdf_mode: resolved.pdf_mode.clone().into(),
headers,
..Default::default()
}
@@ -436,20 +455,12 @@ fn parse_cookie_file(path: &str) -> Result<String, String> {
Ok(pairs.join("; "))
}
-fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
+fn build_extraction_options(resolved: &config::ResolvedConfig) -> ExtractionOptions {
ExtractionOptions {
- include_selectors: cli
- .include
- .as_deref()
- .map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
- .unwrap_or_default(),
- exclude_selectors: cli
- .exclude
- .as_deref()
- .map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
- .unwrap_or_default(),
- only_main_content: cli.only_main_content,
- include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html),
+ include_selectors: resolved.include_selectors.clone(),
+ exclude_selectors: resolved.exclude_selectors.clone(),
+ only_main_content: resolved.only_main_content,
+ include_raw_html: resolved.raw_html || matches!(resolved.format, OutputFormat::Html),
}
}
@@ -618,14 +629,17 @@ impl FetchOutput {
/// Fetch a URL and extract content, handling PDF detection automatically.
/// Falls back to cloud API when bot protection or JS rendering is detected.
-async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
+async fn fetch_and_extract(
+ cli: &Cli,
+ resolved: &config::ResolvedConfig,
+) -> Result<FetchOutput, String> {
// Local sources: read and extract as HTML
if cli.stdin {
let mut buf = String::new();
io::stdin()
.read_to_string(&mut buf)
.map_err(|e| format!("failed to read stdin: {e}"))?;
- let options = build_extraction_options(cli);
+ let options = build_extraction_options(resolved);
return extract_with_options(&buf, None, &options)
.map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}"));
@@ -634,7 +648,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result {
if let Some(ref path) = cli.file {
let html =
std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
- let options = build_extraction_options(cli);
+ let options = build_extraction_options(resolved);
return extract_with_options(&html, None, &options)
.map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}"));
@@ -651,10 +665,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result {
// --cloud: skip local, go straight to cloud API
if cli.cloud {
- let c =
- cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?;
- let options = build_extraction_options(cli);
- let format_str = match cli.format {
+ let c = cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?;
+ let options = build_extraction_options(resolved);
+ let format_str = match resolved.format {
OutputFormat::Markdown => "markdown",
OutputFormat::Json => "json",
OutputFormat::Text => "text",
@@ -674,9 +687,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result {
}
// Normal path: try local first
- let client =
- FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
- let options = build_extraction_options(cli);
+ let client = FetchClient::new(build_fetch_config(cli, resolved))
+ .map_err(|e| format!("client error: {e}"))?;
+ let options = build_extraction_options(resolved);
let result = client
.fetch_and_extract_with_options(url, &options)
.await
@@ -687,7 +700,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result {
if !matches!(reason, EmptyReason::None) {
if let Some(ref c) = cloud_client {
eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API...");
- let format_str = match cli.format {
+ let format_str = match resolved.format {
OutputFormat::Markdown => "markdown",
OutputFormat::Json => "json",
OutputFormat::Text => "text",
@@ -718,7 +731,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result {
}
/// Fetch raw HTML from a URL (no extraction). Used for --raw-html and brand extraction.
-async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> {
+async fn fetch_html(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<FetchResult, String> {
if cli.stdin {
let mut buf = String::new();
io::stdin()
@@ -751,8 +764,8 @@ async fn fetch_html(cli: &Cli) -> Result {
.ok_or("no input provided -- pass a URL, --file, or --stdin")?;
let url = normalize_url(raw_url);
- let client =
- FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
+ let client = FetchClient::new(build_fetch_config(cli, resolved))
+ .map_err(|e| format!("client error: {e}"))?;
client
.fetch(&url)
.await
@@ -1166,7 +1179,7 @@ fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String
)
}
-async fn run_crawl(cli: &Cli) -> Result<(), String> {
+async fn run_crawl(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
let url = cli
.urls
.first()
@@ -1178,16 +1191,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
return Err("--crawl cannot be used with --file or --stdin".into());
}
- let include_patterns: Vec = cli
- .include_paths
- .as_deref()
- .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
- .unwrap_or_default();
- let exclude_patterns: Vec = cli
- .exclude_paths
- .as_deref()
- .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
- .unwrap_or_default();
+ let include_patterns = resolved.include_paths.clone();
+ let exclude_patterns = resolved.exclude_paths.clone();
// Set up streaming progress channel
let (progress_tx, mut progress_rx) = tokio::sync::broadcast::channel::<PageResult>(100);
@@ -1207,13 +1212,13 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
}
let config = CrawlConfig {
- fetch: build_fetch_config(cli),
- max_depth: cli.depth,
- max_pages: cli.max_pages,
- concurrency: cli.concurrency,
- delay: std::time::Duration::from_millis(cli.delay),
- path_prefix: cli.path_prefix.clone(),
- use_sitemap: cli.sitemap,
+ fetch: build_fetch_config(cli, resolved),
+ max_depth: resolved.depth,
+ max_pages: resolved.max_pages,
+ concurrency: resolved.concurrency,
+ delay: std::time::Duration::from_millis(resolved.delay),
+ path_prefix: resolved.path_prefix.clone(),
+ use_sitemap: resolved.use_sitemap,
include_patterns,
exclude_patterns,
progress_tx: Some(progress_tx),
@@ -1232,7 +1237,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
);
});
- let max_pages = cli.max_pages;
+ let max_pages = resolved.max_pages;
let completed_offset = resume_state.as_ref().map_or(0, |s| s.completed_pages);
// Spawn background task to print streaming progress to stderr
@@ -1261,8 +1266,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
&result.visited,
&result.remaining_frontier,
completed_offset + result.pages.len(),
- cli.max_pages,
- cli.depth,
+ resolved.max_pages,
+ resolved.depth,
)?;
eprintln!(
"Crawl state saved to {} ({} pages completed). Resume with --crawl-state {}",
@@ -1294,15 +1299,15 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
let mut saved = 0usize;
for page in &result.pages {
if let Some(ref extraction) = page.extraction {
- let filename = url_to_filename(&page.url, &cli.format);
- let content = format_output(extraction, &cli.format, cli.metadata);
+ let filename = url_to_filename(&page.url, &resolved.format);
+ let content = format_output(extraction, &resolved.format, resolved.metadata);
write_to_file(dir, &filename, &content)?;
saved += 1;
}
}
eprintln!("Saved {saved} files to {}", dir.display());
} else {
- print_crawl_output(&result, &cli.format, cli.metadata);
+ print_crawl_output(&result, &resolved.format, resolved.metadata);
}
eprintln!(
@@ -1338,7 +1343,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
}
}
-async fn run_map(cli: &Cli) -> Result<(), String> {
+async fn run_map(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
let url = cli
.urls
.first()
@@ -1346,8 +1351,8 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
.map(|u| normalize_url(u))?;
let url = url.as_str();
- let client =
- FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
+ let client = FetchClient::new(build_fetch_config(cli, resolved))
+ .map_err(|e| format!("client error: {e}"))?;
let entries = noxa_fetch::sitemap::discover(&client, url)
.await
@@ -1359,19 +1364,24 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
eprintln!("discovered {} URLs", entries.len());
}
- print_map_output(&entries, &cli.format);
+ print_map_output(&entries, &resolved.format);
Ok(())
}
-async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
+async fn run_batch(
+ cli: &Cli,
+ resolved: &config::ResolvedConfig,
+ entries: &[(String, Option<String>)],
+) -> Result<(), String> {
let client = Arc::new(
- FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
+ FetchClient::new(build_fetch_config(cli, resolved))
+ .map_err(|e| format!("client error: {e}"))?,
);
let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect();
- let options = build_extraction_options(cli);
+ let options = build_extraction_options(resolved);
let results = client
- .fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options)
+ .fetch_and_extract_batch_with_options(&urls, resolved.concurrency, &options)
.await;
let ok = results.iter().filter(|r| r.result.is_ok()).count();
@@ -1402,15 +1412,15 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option)]) -> Result<()
let filename = custom_names
.get(r.url.as_str())
.map(|s| s.to_string())
- .unwrap_or_else(|| url_to_filename(&r.url, &cli.format));
- let content = format_output(extraction, &cli.format, cli.metadata);
+ .unwrap_or_else(|| url_to_filename(&r.url, &resolved.format));
+ let content = format_output(extraction, &resolved.format, resolved.metadata);
write_to_file(dir, &filename, &content)?;
saved += 1;
}
}
eprintln!("Saved {saved} files to {}", dir.display());
} else {
- print_batch_output(&results, &cli.format, cli.metadata);
+ print_batch_output(&results, &resolved.format, resolved.metadata);
}
eprintln!(
@@ -1514,15 +1524,20 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
});
}
-async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
+async fn run_watch(
+ cli: &Cli,
+ resolved: &config::ResolvedConfig,
+ urls: &[String],
+) -> Result<(), String> {
if urls.is_empty() {
return Err("--watch requires at least one URL".into());
}
let client = Arc::new(
- FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
+ FetchClient::new(build_fetch_config(cli, resolved))
+ .map_err(|e| format!("client error: {e}"))?,
);
- let options = build_extraction_options(cli);
+ let options = build_extraction_options(resolved);
// Ctrl+C handler
let cancelled = Arc::new(AtomicBool::new(false));
@@ -1534,16 +1549,17 @@ async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
// Single-URL mode: preserve original behavior exactly
if urls.len() == 1 {
- return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await;
+ return run_watch_single(cli, resolved, &client, &options, &urls[0], &cancelled).await;
}
// Multi-URL mode: batch fetch, diff each, report aggregate
- run_watch_multi(cli, &client, &options, urls, &cancelled).await
+ run_watch_multi(cli, resolved, &client, &options, urls, &cancelled).await
}
/// Original single-URL watch loop -- backward compatible.
async fn run_watch_single(
cli: &Cli,
+ resolved: &config::ResolvedConfig,
client: &Arc<FetchClient>,
options: &ExtractionOptions,
url: &str,
@@ -1580,7 +1596,7 @@ async fn run_watch_single(
if diff.status == ChangeStatus::Same {
eprintln!("[watch] No changes ({})", timestamp());
} else {
- print_diff_output(&diff, &cli.format);
+ print_diff_output(&diff, &resolved.format);
eprintln!("[watch] Changes detected! ({})", timestamp());
if let Some(ref cmd) = cli.on_change {
@@ -1627,6 +1643,7 @@ async fn run_watch_single(
/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
async fn run_watch_multi(
cli: &Cli,
+ resolved: &config::ResolvedConfig,
client: &Arc<FetchClient>,
options: &ExtractionOptions,
urls: &[String],
@@ -1636,7 +1653,7 @@ async fn run_watch_multi(
// Initial pass: fetch all URLs in parallel
let initial_results = client
- .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
+ .fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options)
.await;
let mut snapshots = std::collections::HashMap::new();
@@ -1676,7 +1693,7 @@ async fn run_watch_multi(
check_number += 1;
let current_results = client
- .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
+ .fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options)
.await;
let mut changed: Vec<String> = Vec::new();
@@ -1780,7 +1797,11 @@ async fn run_watch_multi(
Ok(())
}
-async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
+async fn run_diff(
+ cli: &Cli,
+ resolved: &config::ResolvedConfig,
+ snapshot_path: &str,
+) -> Result<(), String> {
// Load previous snapshot
let snapshot_json = std::fs::read_to_string(snapshot_path)
.map_err(|e| format!("failed to read snapshot {snapshot_path}: {e}"))?;
@@ -1788,16 +1809,16 @@ async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
.map_err(|e| format!("failed to parse snapshot JSON: {e}"))?;
// Extract current version (handles PDF detection for URLs)
- let new_result = fetch_and_extract(cli).await?.into_extraction()?;
+ let new_result = fetch_and_extract(cli, resolved).await?.into_extraction()?;
let diff = noxa_core::diff::diff(&old, &new_result);
- print_diff_output(&diff, &cli.format);
+ print_diff_output(&diff, &resolved.format);
Ok(())
}
-async fn run_brand(cli: &Cli) -> Result<(), String> {
- let result = fetch_html(cli).await?;
+async fn run_brand(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
+ let result = fetch_html(cli, resolved).await?;
let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await;
let brand = noxa_core::brand::extract_brand(
&enriched,
@@ -1811,13 +1832,27 @@ async fn run_brand(cli: &Cli) -> Result<(), String> {
}
/// Build an LLM provider based on CLI flags, or fall back to the default chain.
-async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
-    if let Some(ref name) = cli.llm_provider {
+async fn build_llm_provider(
+ cli: &Cli,
+ resolved: &config::ResolvedConfig,
+) -> Result<Box<dyn LlmProvider>, String> {
+ if let Some(ref name) = resolved.llm_provider {
match name.as_str() {
+ "gemini" => {
+ let provider = noxa_llm::providers::gemini_cli::GeminiCliProvider::new(
+ resolved.llm_model.clone(),
+ );
+ if !provider.is_available().await {
+ return Err(
+ "gemini CLI not found on PATH -- install it or omit --llm-provider".into(),
+ );
+ }
+ Ok(Box::new(provider))
+ }
"ollama" => {
let provider = noxa_llm::providers::ollama::OllamaProvider::new(
cli.llm_base_url.clone(),
- cli.llm_model.clone(),
+ resolved.llm_model.clone(),
);
if !provider.is_available().await {
return Err("ollama is not running or unreachable".into());
@@ -1828,7 +1863,7 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> {
let provider = noxa_llm::providers::openai::OpenAiProvider::new(
None,
cli.llm_base_url.clone(),
- cli.llm_model.clone(),
+ resolved.llm_model.clone(),
)
.ok_or("OPENAI_API_KEY not set")?;
Ok(Box::new(provider))
@@ -1836,20 +1871,20 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> {
"anthropic" => {
let provider = noxa_llm::providers::anthropic::AnthropicProvider::new(
None,
- cli.llm_model.clone(),
+ resolved.llm_model.clone(),
)
.ok_or("ANTHROPIC_API_KEY not set")?;
Ok(Box::new(provider))
}
other => Err(format!(
- "unknown LLM provider: {other} (use ollama, openai, or anthropic)"
+ "unknown LLM provider: {other} (use gemini, ollama, openai, or anthropic)"
)),
}
} else {
let chain = noxa_llm::ProviderChain::default().await;
if chain.is_empty() {
return Err(
- "no LLM providers available -- start Ollama or set OPENAI_API_KEY / ANTHROPIC_API_KEY"
+ "no LLM providers available -- install the gemini CLI, start Ollama, or set OPENAI_API_KEY / ANTHROPIC_API_KEY"
.into(),
);
}
@@ -1857,12 +1892,12 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> {
}
}
-async fn run_llm(cli: &Cli) -> Result<(), String> {
+async fn run_llm(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
// Extract content from source first (handles PDF detection for URLs)
- let result = fetch_and_extract(cli).await?.into_extraction()?;
+ let result = fetch_and_extract(cli, resolved).await?.into_extraction()?;
- let provider = build_llm_provider(cli).await?;
- let model = cli.llm_model.as_deref();
+ let provider = build_llm_provider(cli, resolved).await?;
+ let model = resolved.llm_model.as_deref();
if let Some(ref schema_input) = cli.extract_json {
// Support @file syntax for loading schema from file
@@ -1876,6 +1911,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
let schema: serde_json::Value =
serde_json::from_str(&schema_str).map_err(|e| format!("invalid JSON schema: {e}"))?;
+ let t = std::time::Instant::now();
let extracted = noxa_llm::extract::extract_json(
&result.content.plain_text,
&schema,
@@ -1884,12 +1920,14 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
)
.await
.map_err(|e| format!("LLM extraction failed: {e}"))?;
+ eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64());
println!(
"{}",
serde_json::to_string_pretty(&extracted).expect("serialization failed")
);
} else if let Some(ref prompt) = cli.extract_prompt {
+ let t = std::time::Instant::now();
let extracted = noxa_llm::extract::extract_with_prompt(
&result.content.plain_text,
prompt,
@@ -1898,12 +1936,14 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
)
.await
.map_err(|e| format!("LLM extraction failed: {e}"))?;
+ eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64());
println!(
"{}",
serde_json::to_string_pretty(&extracted).expect("serialization failed")
);
} else if let Some(sentences) = cli.summarize {
+ let t = std::time::Instant::now();
let summary = noxa_llm::summarize::summarize(
&result.content.plain_text,
Some(sentences),
@@ -1912,6 +1952,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
)
.await
.map_err(|e| format!("LLM summarization failed: {e}"))?;
+ eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64());
println!("{summary}");
}
@@ -1921,12 +1962,16 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
/// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results.
/// URLs are processed sequentially to respect LLM provider rate limits.
-async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
- let client =
- FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
- let options = build_extraction_options(cli);
- let provider = build_llm_provider(cli).await?;
- let model = cli.llm_model.as_deref();
+async fn run_batch_llm(
+ cli: &Cli,
+ resolved: &config::ResolvedConfig,
+ entries: &[(String, Option<String>)],
+) -> Result<(), String> {
+ let client = FetchClient::new(build_fetch_config(cli, resolved))
+ .map_err(|e| format!("client error: {e}"))?;
+ let options = build_extraction_options(resolved);
+ let provider = build_llm_provider(cli, resolved).await?;
+ let model = resolved.llm_model.as_deref();
// Pre-parse schema once if --extract-json is used
let schema = if let Some(ref schema_input) = cli.extract_json {
@@ -1974,6 +2019,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option)]) -> Resul
let text = &extraction.content.plain_text;
// Run the appropriate LLM operation
+ let llm_start = std::time::Instant::now();
let llm_result = if let Some(ref schema) = schema {
noxa_llm::extract::extract_json(text, schema, provider.as_ref(), model)
.await
@@ -1989,6 +2035,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option)]) -> Resul
} else {
unreachable!("run_batch_llm called without LLM flags")
};
+ let llm_elapsed = llm_start.elapsed();
match llm_result {
Ok(output) => {
@@ -2018,7 +2065,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option)]) -> Resul
format!("{words} words")
}
};
- eprintln!("-> extracted {detail}");
+ eprintln!("-> extracted {detail} ({:.1}s)", llm_elapsed.as_secs_f64());
if let Some(ref dir) = cli.output_dir {
let filename = custom_names
@@ -2215,12 +2262,29 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
async fn main() {
dotenvy::dotenv().ok();
- let cli = Cli::parse();
- init_logging(cli.verbose);
+ if matches!(std::env::args().nth(1).as_deref(), Some("mcp")) {
+ init_mcp_logging();
+
+ if let Err(e) = noxa_mcp::run().await {
+ eprintln!("error: {e}");
+ process::exit(1);
+ }
+ return;
+ }
+
+ // Use low-level API to get both typed Cli and ArgMatches for ValueSource detection.
+ let matches = Cli::command().get_matches();
+ let cli = Cli::from_arg_matches(&matches).unwrap_or_else(|e| e.exit());
+
+ // Load config BEFORE init_logging so verbose from config takes effect.
+ let cfg = config::NoxaConfig::load(cli.config.as_deref());
+ let resolved = config::resolve(&cli, &matches, &cfg);
+
+ init_logging(resolved.verbose);
// --map: sitemap discovery mode
if cli.map {
- if let Err(e) = run_map(&cli).await {
+ if let Err(e) = run_map(&cli, &resolved).await {
eprintln!("error: {e}");
process::exit(1);
}
@@ -2229,7 +2293,7 @@ async fn main() {
// --crawl: recursive crawl mode
if cli.crawl {
- if let Err(e) = run_crawl(&cli).await {
+ if let Err(e) = run_crawl(&cli, &resolved).await {
eprintln!("error: {e}");
process::exit(1);
}
@@ -2245,7 +2309,7 @@ async fn main() {
process::exit(1);
}
};
- if let Err(e) = run_watch(&cli, &watch_urls).await {
+ if let Err(e) = run_watch(&cli, &resolved, &watch_urls).await {
eprintln!("error: {e}");
process::exit(1);
}
@@ -2254,7 +2318,7 @@ async fn main() {
// --diff-with: change tracking mode
if let Some(ref snapshot_path) = cli.diff_with {
- if let Err(e) = run_diff(&cli, snapshot_path).await {
+ if let Err(e) = run_diff(&cli, &resolved, snapshot_path).await {
eprintln!("error: {e}");
process::exit(1);
}
@@ -2263,7 +2327,7 @@ async fn main() {
// --brand: brand identity extraction mode
if cli.brand {
- if let Err(e) = run_brand(&cli).await {
+ if let Err(e) = run_brand(&cli, &resolved).await {
eprintln!("error: {e}");
process::exit(1);
}
@@ -2292,11 +2356,11 @@ async fn main() {
// When multiple URLs are provided, run batch LLM extraction over all of them.
if has_llm_flags(&cli) {
if entries.len() > 1 {
- if let Err(e) = run_batch_llm(&cli, &entries).await {
+ if let Err(e) = run_batch_llm(&cli, &resolved, &entries).await {
eprintln!("error: {e}");
process::exit(1);
}
- } else if let Err(e) = run_llm(&cli).await {
+ } else if let Err(e) = run_llm(&cli, &resolved).await {
eprintln!("error: {e}");
process::exit(1);
}
@@ -2305,7 +2369,7 @@ async fn main() {
// Multi-URL batch mode
if entries.len() > 1 {
- if let Err(e) = run_batch(&cli, &entries).await {
+ if let Err(e) = run_batch(&cli, &resolved, &entries).await {
eprintln!("error: {e}");
process::exit(1);
}
@@ -2313,8 +2377,11 @@ async fn main() {
}
// --raw-html: skip extraction, dump the fetched HTML
- if cli.raw_html && cli.include.is_none() && cli.exclude.is_none() {
- match fetch_html(&cli).await {
+ if resolved.raw_html
+ && resolved.include_selectors.is_empty()
+ && resolved.exclude_selectors.is_empty()
+ {
+ match fetch_html(&cli, &resolved).await {
Ok(r) => println!("{}", r.html),
Err(e) => {
eprintln!("error: {e}");
@@ -2325,7 +2392,7 @@ async fn main() {
}
// Single-page extraction (handles both HTML and PDF via content-type detection)
- match fetch_and_extract(&cli).await {
+ match fetch_and_extract(&cli, &resolved).await {
Ok(FetchOutput::Local(result)) => {
if let Some(ref dir) = cli.output_dir {
let url = cli
@@ -2334,18 +2401,19 @@ async fn main() {
.map(|u| normalize_url(u))
.unwrap_or_default();
let custom_name = entries.first().and_then(|(_, name)| name.clone());
- let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format));
- let content = format_output(&result, &cli.format, cli.metadata);
+ let filename =
+ custom_name.unwrap_or_else(|| url_to_filename(&url, &resolved.format));
+ let content = format_output(&result, &resolved.format, resolved.metadata);
if let Err(e) = write_to_file(dir, &filename, &content) {
eprintln!("error: {e}");
process::exit(1);
}
} else {
- print_output(&result, &cli.format, cli.metadata);
+ print_output(&result, &resolved.format, resolved.metadata);
}
}
Ok(FetchOutput::Cloud(resp)) => {
- print_cloud_output(&resp, &cli.format);
+ print_cloud_output(&resp, &resolved.format);
}
Err(e) => {
eprintln!("{e}");
@@ -2456,3 +2524,28 @@ mod tests {
let _ = std::fs::remove_dir_all(&dir);
}
}
+
+#[cfg(test)]
+mod enum_deserialize_tests {
+ use super::*;
+
+ #[test]
+ fn test_output_format_deserialize() {
+ let f: OutputFormat = serde_json::from_str("\"llm\"").unwrap();
+ assert!(matches!(f, OutputFormat::Llm));
+ let f: OutputFormat = serde_json::from_str("\"markdown\"").unwrap();
+ assert!(matches!(f, OutputFormat::Markdown));
+ }
+
+ #[test]
+ fn test_browser_deserialize() {
+ let b: Browser = serde_json::from_str("\"firefox\"").unwrap();
+ assert!(matches!(b, Browser::Firefox));
+ }
+
+ #[test]
+ fn test_pdf_mode_deserialize() {
+ let p: PdfModeArg = serde_json::from_str("\"fast\"").unwrap();
+ assert!(matches!(p, PdfModeArg::Fast));
+ }
+}
diff --git a/crates/noxa-llm/Cargo.toml b/crates/noxa-llm/Cargo.toml
index caf656f..4575cdb 100644
--- a/crates/noxa-llm/Cargo.toml
+++ b/crates/noxa-llm/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
[dependencies]
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
async-trait = "0.1"
+jsonschema = { version = "0.46", default-features = false }
serde = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
diff --git a/crates/noxa-llm/src/chain.rs b/crates/noxa-llm/src/chain.rs
index 314bf2a..43f3de9 100644
--- a/crates/noxa-llm/src/chain.rs
+++ b/crates/noxa-llm/src/chain.rs
@@ -2,12 +2,15 @@
/// Default order: Ollama (local, free) -> OpenAI -> Anthropic.
/// Only includes providers that are actually configured/available.
use async_trait::async_trait;
-use tracing::{debug, warn};
+use tracing::{debug, info, warn};
use crate::error::LlmError;
use crate::provider::{CompletionRequest, LlmProvider};
use crate::providers::{
- anthropic::AnthropicProvider, ollama::OllamaProvider, openai::OpenAiProvider,
+ anthropic::AnthropicProvider,
+ gemini_cli::GeminiCliProvider,
+ ollama::OllamaProvider,
+ openai::OpenAiProvider,
};
pub struct ProviderChain {
@@ -15,12 +18,26 @@ pub struct ProviderChain {
}
impl ProviderChain {
- /// Build the default chain: Ollama -> OpenAI -> Anthropic.
- /// Ollama is always added (availability checked at call time).
+ /// Build the default chain: Gemini CLI -> OpenAI -> Ollama -> Anthropic.
+ /// Gemini CLI is the primary backend (subprocess-based, requires `gemini` on PATH).
/// Cloud providers are only added if their API keys are configured.
+ /// Ollama is added if reachable at call time.
pub async fn default() -> Self {
let mut providers: Vec<Box<dyn LlmProvider>> = Vec::new();
+ let gemini = GeminiCliProvider::new(None);
+ if gemini.is_available().await {
+ debug!("gemini cli available, adding as primary provider");
+ providers.push(Box::new(gemini));
+ } else {
+ debug!("gemini cli not found on PATH, skipping");
+ }
+
+ if let Some(openai) = OpenAiProvider::new(None, None, None) {
+ debug!("openai configured, adding to chain");
+ providers.push(Box::new(openai));
+ }
+
let ollama = OllamaProvider::new(None, None);
if ollama.is_available().await {
debug!("ollama is available, adding to chain");
@@ -29,11 +46,6 @@ impl ProviderChain {
debug!("ollama not available, skipping");
}
- if let Some(openai) = OpenAiProvider::new(None, None, None) {
- debug!("openai configured, adding to chain");
- providers.push(Box::new(openai));
- }
-
if let Some(anthropic) = AnthropicProvider::new(None, None) {
debug!("anthropic configured, adding to chain");
providers.push(Box::new(anthropic));
@@ -79,9 +91,10 @@ impl LlmProvider for ProviderChain {
for provider in &self.providers {
debug!(provider = provider.name(), "attempting completion");
+ let t = std::time::Instant::now();
match provider.complete(request).await {
Ok(response) => {
- debug!(provider = provider.name(), "completion succeeded");
+ info!(provider = provider.name(), elapsed_ms = t.elapsed().as_millis(), "completion succeeded");
return Ok(response);
}
Err(e) => {
@@ -202,4 +215,46 @@ mod tests {
assert_eq!(chain.len(), 2);
assert!(!chain.is_empty());
}
+
+ // ── Gemini-first chain ordering ───────────────────────────────────────────
+
+ #[tokio::test]
+ async fn gemini_first_in_single_provider_chain() {
+ // When we build a chain with a mock "gemini" provider first, it should
+ // be used before any fallback.
+ let chain = ProviderChain::from_providers(vec![
+ Box::new(MockProvider {
+ name: "gemini",
+ response: Ok("from gemini".into()),
+ available: true,
+ }),
+ Box::new(MockProvider {
+ name: "openai",
+ response: Ok("from openai".into()),
+ available: true,
+ }),
+ ]);
+ let result = chain.complete(&test_request()).await.unwrap();
+ assert_eq!(result, "from gemini");
+ // Confirm order: first provider name is "gemini"
+ assert_eq!(chain.providers[0].name(), "gemini");
+ }
+
+ #[tokio::test]
+ async fn gemini_failure_falls_back_to_openai() {
+ let chain = ProviderChain::from_providers(vec![
+ Box::new(MockProvider {
+ name: "gemini",
+ response: Err("subprocess timed out".into()),
+ available: true,
+ }),
+ Box::new(MockProvider {
+ name: "openai",
+ response: Ok("from openai".into()),
+ available: true,
+ }),
+ ]);
+ let result = chain.complete(&test_request()).await.unwrap();
+ assert_eq!(result, "from openai");
+ }
}
diff --git a/crates/noxa-llm/src/error.rs b/crates/noxa-llm/src/error.rs
index 19f75f3..ecc12d8 100644
--- a/crates/noxa-llm/src/error.rs
+++ b/crates/noxa-llm/src/error.rs
@@ -4,6 +4,12 @@ pub enum LlmError {
#[error("HTTP error: {0}")]
Http(#[from] reqwest::Error),
+ #[error("subprocess error: {0}")]
+ Subprocess(#[from] std::io::Error),
+
+ #[error("subprocess timed out")]
+ Timeout,
+
#[error("no providers available")]
NoProviders,
diff --git a/crates/noxa-llm/src/extract.rs b/crates/noxa-llm/src/extract.rs
index 35c6f77..9216b0d 100644
--- a/crates/noxa-llm/src/extract.rs
+++ b/crates/noxa-llm/src/extract.rs
@@ -1,11 +1,45 @@
/// Schema-based and prompt-based LLM extraction.
/// Both functions build a system prompt, send content to the LLM, and parse JSON back.
+use jsonschema;
+
use crate::clean::strip_thinking_tags;
use crate::error::LlmError;
use crate::provider::{CompletionRequest, LlmProvider, Message};
+/// Validate a JSON value against a schema. Returns Ok(()) on success or
+/// Err(LlmError::InvalidJson) with a concise error message on failure.
+fn validate_schema(
+ value: &serde_json::Value,
+ schema: &serde_json::Value,
+) -> Result<(), LlmError> {
+ let compiled = jsonschema::validator_for(schema).map_err(|e| {
+ LlmError::InvalidJson(format!("invalid schema: {e}"))
+ })?;
+
+ let errors: Vec<String> = compiled
+ .iter_errors(value)
+ .map(|e| format!("{} at {}", e, e.instance_path()))
+ .collect();
+
+ if errors.is_empty() {
+ Ok(())
+ } else {
+ Err(LlmError::InvalidJson(format!(
+ "schema validation failed: {}",
+ errors.join("; ")
+ )))
+ }
+}
+
/// Extract structured JSON from content using a JSON schema.
/// The schema tells the LLM exactly what fields to extract and their types.
+///
+/// Retry policy:
+/// - If the response cannot be parsed as JSON at all: retry once with the
+/// identical request (handles transient formatting issues).
+/// - If the response is valid JSON but fails schema validation: return
+/// `LlmError::InvalidJson` immediately — the schema is likely unsatisfiable
+/// for this content, so retrying would produce the same result.
pub async fn extract_json(
content: &str,
schema: &serde_json::Value,
@@ -37,7 +71,22 @@ pub async fn extract_json(
};
let response = provider.complete(&request).await?;
- parse_json_response(&response)
+
+ match parse_json_response(&response) {
+ Ok(value) => {
+ // Valid JSON — now validate against the schema.
+ // Schema mismatches do not retry (unsatisfiable → same result).
+ validate_schema(&value, schema)?;
+ Ok(value)
+ }
+ Err(_parse_err) => {
+ // Unparseable JSON — retry once with the identical request.
+ let retry_response = provider.complete(&request).await?;
+ let value = parse_json_response(&retry_response)?;
+ validate_schema(&value, schema)?;
+ Ok(value)
+ }
+ }
}
/// Extract information using a natural language prompt.
@@ -184,4 +233,130 @@ mod tests {
assert_eq!(result["emails"][0], "test@example.com");
}
+
+ // ── Schema validation ─────────────────────────────────────────────────────
+
+ #[tokio::test]
+ async fn schema_validation_passes_for_matching_json() {
+ let schema = serde_json::json!({
+ "type": "object",
+ "required": ["price"],
+ "properties": {
+ "price": { "type": "number" }
+ }
+ });
+ let mock = MockProvider::ok(r#"{"price": 9.99}"#);
+ let result = extract_json("content", &schema, &mock, None).await.unwrap();
+ assert_eq!(result["price"], 9.99);
+ }
+
+ #[tokio::test]
+ async fn schema_validation_fails_for_wrong_type() {
+ let schema = serde_json::json!({
+ "type": "object",
+ "required": ["price"],
+ "properties": {
+ "price": { "type": "number" }
+ }
+ });
+ // Model returns valid JSON but wrong type ("string" instead of number).
+ // Should NOT retry (schema mismatch ≠ parse failure) — returns InvalidJson immediately.
+ let mock = MockProvider::ok(r#"{"price": "not-a-number"}"#);
+ let result = extract_json("content", &schema, &mock, None).await;
+ assert!(
+ matches!(result, Err(LlmError::InvalidJson(_))),
+ "expected InvalidJson for schema mismatch, got {result:?}"
+ );
+ }
+
+ #[tokio::test]
+ async fn schema_validation_fails_for_missing_required_field() {
+ let schema = serde_json::json!({
+ "type": "object",
+ "required": ["title"],
+ "properties": {
+ "title": { "type": "string" }
+ }
+ });
+ let mock = MockProvider::ok(r#"{"other": "value"}"#);
+ let result = extract_json("content", &schema, &mock, None).await;
+ assert!(matches!(result, Err(LlmError::InvalidJson(_))));
+ }
+
+ #[tokio::test]
+ async fn parse_failure_triggers_one_retry() {
+ use crate::testing::mock::SequenceMockProvider;
+
+ let schema = serde_json::json!({
+ "type": "object",
+ "properties": { "title": { "type": "string" } }
+ });
+
+ // First call: unparseable JSON. Second call: valid JSON matching schema.
+ let mock = SequenceMockProvider::new(
+ "mock-seq",
+ vec![
+ Ok("this is not json at all".to_string()),
+ Ok(r#"{"title": "Retry succeeded"}"#.to_string()),
+ ],
+ );
+
+ let result = extract_json("content", &schema, &mock, None)
+ .await
+ .unwrap();
+ assert_eq!(result["title"], "Retry succeeded");
+ }
+
+ #[tokio::test]
+ async fn both_attempts_fail_returns_invalid_json() {
+ use crate::testing::mock::SequenceMockProvider;
+
+ let schema = serde_json::json!({
+ "type": "object",
+ "properties": { "title": { "type": "string" } }
+ });
+
+ let mock = SequenceMockProvider::new(
+ "mock-seq",
+ vec![
+ Ok("not json".to_string()),
+ Ok("also not json".to_string()),
+ ],
+ );
+
+ let result = extract_json("content", &schema, &mock, None).await;
+ assert!(
+ matches!(result, Err(LlmError::InvalidJson(_))),
+ "expected InvalidJson after both attempts fail"
+ );
+ }
+
+ #[tokio::test]
+ async fn schema_mismatch_does_not_retry() {
+ use crate::testing::mock::SequenceMockProvider;
+
+ let schema = serde_json::json!({
+ "type": "object",
+ "required": ["price"],
+ "properties": {
+ "price": { "type": "number" }
+ }
+ });
+
+ // Both calls return valid JSON with wrong schema — but only one call should happen.
+ let mock = SequenceMockProvider::new(
+ "mock-seq",
+ vec![
+ Ok(r#"{"price": "wrong-type"}"#.to_string()),
+ Ok(r#"{"price": 9.99}"#.to_string()), // would succeed — but shouldn't be called
+ ],
+ );
+
+ // Should return InvalidJson without calling second response.
+ let result = extract_json("content", &schema, &mock, None).await;
+ assert!(
+ matches!(result, Err(LlmError::InvalidJson(_))),
+ "schema mismatch should not trigger retry"
+ );
+ }
}
diff --git a/crates/noxa-llm/src/lib.rs b/crates/noxa-llm/src/lib.rs
index 15664b9..250ae88 100644
--- a/crates/noxa-llm/src/lib.rs
+++ b/crates/noxa-llm/src/lib.rs
@@ -1,8 +1,9 @@
-/// noxa-llm: LLM integration with local-first hybrid architecture.
+/// noxa-llm: LLM integration with Gemini-CLI-first hybrid architecture.
///
-/// Provider chain tries Ollama (local) first, falls back to OpenAI, then Anthropic.
-/// Provides schema-based extraction, prompt extraction, and summarization
-/// on top of noxa-core's content pipeline.
+/// Provider chain: Gemini CLI (primary) → OpenAI → Ollama → Anthropic.
+/// Gemini CLI requires the `gemini` binary on PATH; GEMINI_MODEL env var sets the model.
+/// Provides schema-validated extraction (with one retry on parse failure),
+/// prompt extraction, and summarization on top of noxa-core's content pipeline.
pub mod chain;
pub mod clean;
pub mod error;
diff --git a/crates/noxa-llm/src/providers/gemini_cli.rs b/crates/noxa-llm/src/providers/gemini_cli.rs
new file mode 100644
index 0000000..9d2d2d7
--- /dev/null
+++ b/crates/noxa-llm/src/providers/gemini_cli.rs
@@ -0,0 +1,392 @@
+/// Gemini CLI provider — shells out to `gemini -p` for completions.
+/// Primary provider in the default chain; requires the `gemini` binary on PATH.
+///
+/// Prompts are passed via the `-p` flag (not via stdin or as a positional) to prevent
+/// command injection from web-scraped content. Output is parsed from `--output-format json`.
+///
+/// # Startup optimizations
+///
+/// The gemini CLI is an agentic Node.js application that connects to every configured MCP
+/// server at startup (the user has 6). Without mitigation this can add 10-60+ seconds per
+/// call as those servers spin up and time out.
+///
+/// Two flags reduce this:
+/// - `--extensions ""` — skips extension loading (~3 s saved)
+/// - `current_dir` set to a temp workdir containing `.gemini/settings.json` with
+/// `{"mcpServers":{}}` — workspace settings override user settings, so all 6 MCP
+/// servers are skipped at subprocess startup (major speedup).
+///
+/// The workdir is created once at construction and reused for every call.
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use tokio::process::Command;
+use tokio::sync::Semaphore;
+use tokio::time::timeout;
+use tracing::debug;
+
+use crate::clean::strip_thinking_tags;
+use crate::error::LlmError;
+use crate::provider::{CompletionRequest, LlmProvider};
+
+/// Maximum concurrent Gemini subprocess calls.
+const MAX_CONCURRENT: usize = 6;
+/// Subprocess deadline — prevents hung `gemini` processes blocking the chain.
+const SUBPROCESS_TIMEOUT: Duration = Duration::from_secs(60);
+
+/// Fixed workdir used for every subprocess call.
+/// A workspace-level `.gemini/settings.json` here overrides the user's MCP server config.
+const NOXA_GEMINI_WORKDIR: &str = "/tmp/noxa-gemini";
+
+pub struct GeminiCliProvider {
+ default_model: String,
+ semaphore: Arc<Semaphore>,
+ /// Workdir with a minimal `.gemini/settings.json` that disables MCP servers.
+ workdir: PathBuf,
+}
+
+impl GeminiCliProvider {
+ /// Construct the provider.
+ /// Model resolves as: `model` arg → `GEMINI_MODEL` env → `"gemini-2.5-pro"`.
+ pub fn new(model: Option<String>) -> Self {
+ let default_model = model
+ .or_else(|| std::env::var("GEMINI_MODEL").ok())
+ .filter(|s| !s.is_empty())
+ .unwrap_or_else(|| "gemini-2.5-pro".into());
+
+ let workdir = PathBuf::from(NOXA_GEMINI_WORKDIR);
+ ensure_gemini_workdir(&workdir);
+
+ Self {
+ default_model,
+ semaphore: Arc::new(Semaphore::new(MAX_CONCURRENT)),
+ workdir,
+ }
+ }
+
+ #[cfg(test)]
+ fn default_model(&self) -> &str {
+ &self.default_model
+ }
+}
+
+#[async_trait]
+impl LlmProvider for GeminiCliProvider {
+ async fn complete(&self, request: &CompletionRequest) -> Result<String, LlmError> {
+ let model = if request.model.is_empty() {
+ &self.default_model
+ } else {
+ &request.model
+ };
+
+ // Build the prompt text from all messages.
+ let prompt = build_prompt(&request.messages);
+
+ // Acquire concurrency slot before spawning.
+ let _permit = self
+ .semaphore
+ .acquire()
+ .await
+ .map_err(|_| LlmError::ProviderError("gemini semaphore closed".into()))?;
+
+ let mut cmd = Command::new("gemini");
+ // -p STRING: headless mode with prompt as the flag value (never positional arg).
+ // Passing via -p prevents command injection; the value is never interpreted as a shell command.
+ cmd.arg("-p").arg(&prompt);
+ cmd.arg("--model").arg(model);
+ // Always request structured JSON output so we can extract the `response` field
+ // and skip any preceding noise lines (e.g. MCP status warnings).
+ cmd.arg("--output-format").arg("json");
+ // --yolo suppresses any interactive confirmation prompts in headless mode.
+ cmd.arg("--yolo");
+ // --extensions "" skips loading user extensions (~3 s startup savings).
+ cmd.arg("--extensions").arg("");
+ // Workspace settings in self.workdir override the user's ~/.gemini/settings.json,
+ // replacing the user's MCP server list with {} so none are spawned at startup.
+ // Without this, each of the user's MCP servers adds latency to every call.
+ cmd.current_dir(&self.workdir);
+
+ cmd.stdin(std::process::Stdio::null());
+ cmd.stdout(std::process::Stdio::piped());
+ cmd.stderr(std::process::Stdio::piped());
+
+ debug!(model, workdir = %self.workdir.display(), "spawning gemini subprocess");
+
+ let child = cmd.spawn().map_err(LlmError::Subprocess)?;
+
+ // Bounded wait — prevents indefinite hangs on auth expiry or network stall.
+ let output = match timeout(SUBPROCESS_TIMEOUT, child.wait_with_output()).await {
+ Ok(Ok(out)) => out,
+ Ok(Err(e)) => return Err(LlmError::Subprocess(e)),
+ Err(_elapsed) => return Err(LlmError::Timeout),
+ };
+
+ if !output.status.success() {
+ let stderr_preview = String::from_utf8_lossy(&output.stderr);
+ let preview = &stderr_preview[..stderr_preview.len().min(500)];
+ return Err(LlmError::ProviderError(format!(
+ "gemini exited with {}: {preview}",
+ output.status
+ )));
+ }
+
+ let stdout = String::from_utf8_lossy(&output.stdout);
+ let response = extract_response_from_output(&stdout)?;
+ let cleaned = strip_code_fences(strip_thinking_tags(&response).trim());
+ Ok(cleaned)
+ }
+
+ async fn is_available(&self) -> bool {
+ // Pure PATH check — no inference call, fast.
+ matches!(
+ Command::new("gemini")
+ .arg("--version")
+ .stdout(std::process::Stdio::null())
+ .stderr(std::process::Stdio::null())
+ .status()
+ .await,
+ Ok(s) if s.success()
+ )
+ }
+
+ fn name(&self) -> &str {
+ "gemini"
+ }
+}
+
+/// Parse the `response` field from gemini's `--output-format json` output.
+///
+/// The CLI emits lines before the JSON object (e.g. MCP status warnings).
+/// We find the first `{` to locate the JSON, parse it, and extract `.response`.
+fn extract_response_from_output(stdout: &str) -> Result<String, LlmError> {
+ let json_start = stdout.find('{').ok_or_else(|| {
+ let preview = &stdout[..stdout.len().min(300)];
+ LlmError::ProviderError(format!("gemini produced no JSON output: {preview}"))
+ })?;
+
+ let json_str = &stdout[json_start..];
+ let outer: serde_json::Value = serde_json::from_str(json_str).map_err(|e| {
+ let preview = &json_str[..json_str.len().min(300)];
+ LlmError::ProviderError(format!("failed to parse gemini JSON output: {e} — {preview}"))
+ })?;
+
+ // `response` holds the model's actual text output.
+ outer["response"]
+ .as_str()
+ .ok_or_else(|| {
+ LlmError::ProviderError(format!(
+ "gemini JSON output missing 'response' field: {}",
+ &json_str[..json_str.len().min(300)]
+ ))
+ })
+ .map(|s| s.to_string())
+}
+
+/// Create the noxa gemini workdir with a minimal workspace settings file.
+///
+/// The `.gemini/settings.json` written here overrides the user's `~/.gemini/settings.json`
+/// for any `gemini` subprocess run from this directory. Setting `mcpServers` to `{}` prevents
+/// the CLI from spawning the user's configured MCP servers on every headless call.
+///
+/// Errors are intentionally ignored — if the write fails, the subprocess still works,
+/// just without the startup optimization (and with a warning in the logs).
+fn ensure_gemini_workdir(workdir: &std::path::Path) {
+ let settings_dir = workdir.join(".gemini");
+ let settings_path = settings_dir.join("settings.json");
+
+ if settings_path.exists() {
+ return;
+ }
+
+ if let Err(e) = std::fs::create_dir_all(&settings_dir) {
+ tracing::warn!(path = %settings_dir.display(), error = %e, "failed to create gemini workdir");
+ return;
+ }
+
+ // Minimal workspace settings: disable all MCP servers.
+ // Workspace settings override ~/.gemini/settings.json per gemini CLI docs.
+ let content = r#"{"mcpServers":{}}"#;
+ if let Err(e) = std::fs::write(&settings_path, content) {
+ tracing::warn!(path = %settings_path.display(), error = %e, "failed to write gemini workspace settings");
+ }
+}
+
+/// Concatenate all messages into a single prompt string for the CLI.
+fn build_prompt(messages: &[crate::provider::Message]) -> String {
+ messages
+ .iter()
+ .map(|m| match m.role.as_str() {
+ "system" => format!("[System]: {}", m.content),
+ "assistant" => format!("[Assistant]: {}", m.content),
+ _ => m.content.clone(),
+ })
+ .collect::<Vec<_>>()
+ .join("\n\n")
+}
+
+/// Strip markdown code fences from a response string.
+fn strip_code_fences(s: &str) -> String {
+ let trimmed = s.trim();
+ if trimmed.starts_with("```") {
+ let without_opener = trimmed
+ .strip_prefix("```json")
+ .or_else(|| trimmed.strip_prefix("```"))
+ .unwrap_or(trimmed);
+ without_opener
+ .strip_suffix("```")
+ .unwrap_or(without_opener)
+ .trim()
+ .to_string()
+ } else {
+ trimmed.to_string()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // ── Construction ──────────────────────────────────────────────────────────
+
+ #[test]
+ fn explicit_model_used() {
+ let p = GeminiCliProvider::new(Some("gemini-1.5-flash".into()));
+ assert_eq!(p.default_model(), "gemini-1.5-flash");
+ assert_eq!(p.name(), "gemini");
+ }
+
+ #[test]
+ fn default_model_fallback() {
+ // Explicit None + no GEMINI_MODEL env → hardcoded default.
+ // We unset the env to avoid flakiness (it may or may not be set).
+ unsafe { std::env::remove_var("GEMINI_MODEL") };
+ let p = GeminiCliProvider::new(None);
+ assert_eq!(p.default_model(), "gemini-2.5-pro");
+ }
+
+ // Env var tests mutate process-global state and race with parallel tests.
+ // Run in isolation if needed:
+ // cargo test -p noxa-llm env_model_override -- --ignored --test-threads=1
+ #[test]
+ #[ignore = "mutates process env; run with --test-threads=1"]
+ fn env_model_override() {
+ unsafe { std::env::set_var("GEMINI_MODEL", "gemini-1.5-pro") };
+ let p = GeminiCliProvider::new(None);
+ assert_eq!(p.default_model(), "gemini-1.5-pro");
+ unsafe { std::env::remove_var("GEMINI_MODEL") };
+ }
+
+ // ── build_prompt ──────────────────────────────────────────────────────────
+
+ #[test]
+ fn build_prompt_user_only() {
+ use crate::provider::Message;
+ let messages = vec![Message {
+ role: "user".into(),
+ content: "hello world".into(),
+ }];
+ assert_eq!(build_prompt(&messages), "hello world");
+ }
+
+ #[test]
+ fn build_prompt_system_and_user() {
+ use crate::provider::Message;
+ let messages = vec![
+ Message {
+ role: "system".into(),
+ content: "You are helpful.".into(),
+ },
+ Message {
+ role: "user".into(),
+ content: "Tell me something.".into(),
+ },
+ ];
+ let result = build_prompt(&messages);
+ assert!(result.contains("[System]: You are helpful."));
+ assert!(result.contains("Tell me something."));
+ }
+
+ // ── extract_response_from_output ──────────────────────────────────────────
+
+ #[test]
+ fn extracts_response_from_clean_json() {
+ let stdout = r#"{"session_id":"abc","response":"Hello world","stats":{}}"#;
+ assert_eq!(extract_response_from_output(stdout).unwrap(), "Hello world");
+ }
+
+ #[test]
+ fn extracts_response_skipping_mcp_noise() {
+ // MCP warning line appears before the JSON object in real gemini output.
+ let stdout = "MCP issues detected. Run /mcp list for status.\n{\"session_id\":\"abc\",\"response\":\"the answer\",\"stats\":{}}";
+ assert_eq!(
+ extract_response_from_output(stdout).unwrap(),
+ "the answer"
+ );
+ }
+
+ #[test]
+ fn error_when_no_json_in_output() {
+ let result = extract_response_from_output("MCP issues detected. No JSON follows.");
+ assert!(matches!(result, Err(LlmError::ProviderError(_))));
+ }
+
+ #[test]
+ fn error_when_response_field_missing() {
+ let stdout = r#"{"session_id":"abc","stats":{}}"#;
+ let result = extract_response_from_output(stdout);
+ assert!(matches!(result, Err(LlmError::ProviderError(_))));
+ }
+
+ // ── strip_code_fences ─────────────────────────────────────────────────────
+
+ #[test]
+ fn strips_json_fence() {
+ let input = "```json\n{\"key\": \"value\"}\n```";
+ assert_eq!(strip_code_fences(input), "{\"key\": \"value\"}");
+ }
+
+ #[test]
+ fn strips_plain_fence() {
+ let input = "```\nhello\n```";
+ assert_eq!(strip_code_fences(input), "hello");
+ }
+
+ #[test]
+ fn passthrough_no_fence() {
+ let input = "{\"key\": \"value\"}";
+ assert_eq!(strip_code_fences(input), "{\"key\": \"value\"}");
+ }
+
+ // ── is_available returns false when binary absent ──────────────────────────
+
+ #[tokio::test]
+ async fn unavailable_when_binary_missing() {
+ let result = tokio::process::Command::new("__noxa_nonexistent_binary_xyz__")
+ .arg("--version")
+ .stdout(std::process::Stdio::null())
+ .stderr(std::process::Stdio::null())
+ .status()
+ .await;
+ assert!(result.is_err(), "missing binary should fail to spawn");
+ }
+
+ // ── thinking tag stripping ────────────────────────────────────────────────
+
+ #[test]
+ fn strips_thinking_tags_from_output() {
+ let raw = "<think>internal reasoning</think>{\"result\": true}";
+ let after_thinking = strip_thinking_tags(raw);
+ let after_fences = strip_code_fences(after_thinking.trim());
+ assert_eq!(after_fences, "{\"result\": true}");
+ }
+
+ #[test]
+ fn strips_code_fence_after_thinking() {
+ let raw = "<think>let me check</think>\n```json\n{\"ok\": 1}\n```";
+ let after_thinking = strip_thinking_tags(raw);
+ let after_fences = strip_code_fences(after_thinking.trim());
+ assert_eq!(after_fences, "{\"ok\": 1}");
+ }
+}
diff --git a/crates/noxa-llm/src/providers/mod.rs b/crates/noxa-llm/src/providers/mod.rs
index c6b8f60..b1a8736 100644
--- a/crates/noxa-llm/src/providers/mod.rs
+++ b/crates/noxa-llm/src/providers/mod.rs
@@ -1,4 +1,5 @@
pub mod anthropic;
+pub mod gemini_cli;
pub mod ollama;
pub mod openai;
diff --git a/crates/noxa-llm/src/providers/ollama.rs b/crates/noxa-llm/src/providers/ollama.rs
index b42a584..d728e67 100644
--- a/crates/noxa-llm/src/providers/ollama.rs
+++ b/crates/noxa-llm/src/providers/ollama.rs
@@ -2,6 +2,7 @@
/// First choice in the provider chain: free, private, fast on Apple Silicon.
use async_trait::async_trait;
use serde_json::json;
+use std::time::Duration;
use crate::clean::strip_thinking_tags;
use crate::error::LlmError;
@@ -96,7 +97,10 @@ impl LlmProvider for OllamaProvider {
async fn is_available(&self) -> bool {
let url = format!("{}/api/tags", self.base_url);
- matches!(self.client.get(&url).send().await, Ok(r) if r.status().is_success())
+ matches!(
+ tokio::time::timeout(Duration::from_millis(500), self.client.get(&url).send()).await,
+ Ok(Ok(r)) if r.status().is_success()
+ )
}
fn name(&self) -> &str {
diff --git a/crates/noxa-llm/src/testing.rs b/crates/noxa-llm/src/testing.rs
index 66157a2..da5cc0b 100644
--- a/crates/noxa-llm/src/testing.rs
+++ b/crates/noxa-llm/src/testing.rs
@@ -4,6 +4,9 @@
/// extract, chain, and other modules that need a fake LLM backend.
#[cfg(test)]
pub(crate) mod mock {
+ use std::sync::atomic::{AtomicUsize, Ordering};
+ use std::sync::Arc;
+
use async_trait::async_trait;
use crate::error::LlmError;
@@ -45,4 +48,48 @@ pub(crate) mod mock {
self.name
}
}
+
+ /// A mock provider that returns responses from a sequence.
+ /// Call N → returns responses[N], clamping to the last response once the sequence is exhausted.
+ /// Useful for testing first-failure / second-success retry paths.
+ pub struct SequenceMockProvider {
+ pub name: &'static str,
+ pub responses: Vec<Result<String, String>>,
+ pub available: bool,
+ call_count: Arc<AtomicUsize>,
+ }
+
+ impl SequenceMockProvider {
+ pub fn new(
+ name: &'static str,
+ responses: Vec<Result<String, String>>,
+ ) -> Self {
+ Self {
+ name,
+ responses,
+ available: true,
+ call_count: Arc::new(AtomicUsize::new(0)),
+ }
+ }
+ }
+
+ #[async_trait]
+ impl LlmProvider for SequenceMockProvider {
+ async fn complete(&self, _request: &CompletionRequest) -> Result<String, LlmError> {
+ let idx = self.call_count.fetch_add(1, Ordering::SeqCst);
+ let response = &self.responses[idx.min(self.responses.len() - 1)];
+ match response {
+ Ok(text) => Ok(text.clone()),
+ Err(msg) => Err(LlmError::ProviderError(msg.clone())),
+ }
+ }
+
+ async fn is_available(&self) -> bool {
+ self.available
+ }
+
+ fn name(&self) -> &str {
+ self.name
+ }
+ }
}
diff --git a/crates/noxa-mcp/Cargo.toml b/crates/noxa-mcp/Cargo.toml
index 16f4f2e..a82757b 100644
--- a/crates/noxa-mcp/Cargo.toml
+++ b/crates/noxa-mcp/Cargo.toml
@@ -5,6 +5,10 @@ version.workspace = true
edition.workspace = true
license.workspace = true
+[lib]
+name = "noxa_mcp"
+path = "src/lib.rs"
+
[[bin]]
name = "noxa-mcp"
path = "src/main.rs"
@@ -14,8 +18,8 @@ noxa-core = { workspace = true }
noxa-fetch = { workspace = true }
noxa-llm = { workspace = true }
noxa-pdf = { workspace = true }
-rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] }
-schemars = "1.0"
+rmcp = { workspace = true }
+schemars = { workspace = true }
dotenvy = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
@@ -24,4 +28,4 @@ tracing = { workspace = true }
tracing-subscriber = { workspace = true }
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
url = "2"
-dirs = "6.0.0"
+dirs = { workspace = true }
diff --git a/crates/noxa-mcp/src/lib.rs b/crates/noxa-mcp/src/lib.rs
new file mode 100644
index 0000000..fe75c97
--- /dev/null
+++ b/crates/noxa-mcp/src/lib.rs
@@ -0,0 +1,20 @@
+/// noxa-mcp library wrapper.
+///
+/// This exposes the MCP server so it can be embedded by the `noxa` CLI via
+/// `noxa mcp` without duplicating the transport/bootstrap code.
+///
+/// Callers must initialize tracing before calling `run()`. Stdout must remain
+/// untouched after `run()` begins because it carries the MCP wire protocol.
+pub(crate) mod cloud;
+pub(crate) mod server;
+pub(crate) mod tools;
+
+use rmcp::ServiceExt;
+use rmcp::transport::stdio;
+
+/// Start the MCP server over stdio and block until the client disconnects.
+pub async fn run() -> Result<(), Box<dyn std::error::Error>> {
+ let service = server::NoxaMcp::new().await.serve(stdio()).await?;
+ service.waiting().await?;
+ Ok(())
+}
diff --git a/crates/noxa-mcp/src/main.rs b/crates/noxa-mcp/src/main.rs
index 5abde92..fdc71c0 100644
--- a/crates/noxa-mcp/src/main.rs
+++ b/crates/noxa-mcp/src/main.rs
@@ -1,15 +1,6 @@
/// noxa-mcp: MCP (Model Context Protocol) server for noxa.
/// Exposes web extraction tools over stdio transport for AI agents
/// like Claude Desktop, Claude Code, and other MCP clients.
-mod cloud;
-mod server;
-mod tools;
-
-use rmcp::ServiceExt;
-use rmcp::transport::stdio;
-
-use server::NoxaMcp;
-
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
dotenvy::dotenv().ok();
@@ -21,8 +12,5 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.with_ansi(false)
.init();
- let service = NoxaMcp::new().await.serve(stdio()).await?;
-
- service.waiting().await?;
- Ok(())
+ noxa_mcp::run().await
}
diff --git a/crates/noxa-mcp/src/server.rs b/crates/noxa-mcp/src/server.rs
index 804861d..4b7bb44 100644
--- a/crates/noxa-mcp/src/server.rs
+++ b/crates/noxa-mcp/src/server.rs
@@ -89,7 +89,7 @@ impl NoxaMcp {
let chain = noxa_llm::ProviderChain::default().await;
let llm_chain = if chain.is_empty() {
- warn!("no LLM providers available -- extract/summarize tools will fail");
+ warn!("no LLM providers available (gemini CLI, OPENAI_API_KEY, ANTHROPIC_API_KEY) -- extract/summarize tools will fail");
None
} else {
info!(providers = chain.len(), "LLM provider chain ready");
@@ -334,7 +334,7 @@ impl NoxaMcp {
// No local LLM — fall back to cloud API directly
if self.llm_chain.is_none() {
let cloud = self.cloud.as_ref().ok_or(
- "No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
+ "No LLM providers available. Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
)?;
let mut body = json!({"url": params.url});
if let Some(ref schema) = params.schema {
@@ -387,7 +387,7 @@ impl NoxaMcp {
// No local LLM — fall back to cloud API directly
if self.llm_chain.is_none() {
let cloud = self.cloud.as_ref().ok_or(
- "No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
+ "No LLM providers available. Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
)?;
let mut body = json!({"url": params.url});
if let Some(sentences) = params.max_sentences {
diff --git a/env.example b/env.example
index d15b729..aad81c5 100644
--- a/env.example
+++ b/env.example
@@ -1,43 +1,20 @@
-# ============================================
-# Noxa Configuration
-# Copy to .env and fill in your values
-# ============================================
+# Secrets, URLs, and path overrides only — everything else goes in config.json
+# See config.example.json for the full list of configurable defaults.
-# --- LLM Providers ---
+# Cloud API key (required for --cloud / --research)
+NOXA_API_KEY=
-# Ollama (local, default provider)
-OLLAMA_HOST=http://localhost:11434
-OLLAMA_MODEL=qwen3:8b
+# Single proxy URL (or use NOXA_PROXY_FILE for pool rotation)
+NOXA_PROXY=
-# OpenAI (optional cloud fallback)
-# OPENAI_API_KEY — set your OpenAI key
-# OPENAI_BASE_URL — defaults to https://api.openai.com/v1
-# OPENAI_MODEL — defaults to gpt-4o-mini
+# Proxy pool file path for rotating proxies
+NOXA_PROXY_FILE=
-# Anthropic (optional cloud fallback)
-# ANTHROPIC_API_KEY — set your Anthropic key
-# ANTHROPIC_MODEL — defaults to claude-sonnet-4-20250514
+# Webhook URL for completion notifications
+NOXA_WEBHOOK_URL=
-# --- Proxy ---
+# LLM base URL (Ollama or OpenAI-compatible endpoint)
+NOXA_LLM_BASE_URL=
-# Single proxy
-# NOXA_PROXY=http://user:pass@host:port
-
-# Proxy file (one per line: host:port:user:pass)
-# NOXA_PROXY_FILE=/path/to/proxies.txt
-
-# --- Server (noxa-server only) ---
-# NOXA_PORT=3000
-# NOXA_HOST=0.0.0.0
-# NOXA_AUTH_KEY=your-auth-key
-# NOXA_MAX_CONCURRENCY=50
-# NOXA_JOB_TTL_SECS=3600
-# NOXA_MAX_JOBS=100
-
-# --- CLI LLM overrides ---
-# NOXA_LLM_PROVIDER=ollama
-# NOXA_LLM_MODEL=qwen3:8b
-# NOXA_LLM_BASE_URL=http://localhost:11434
-
-# --- Logging ---
-# NOXA_LOG=info
+# Optional: path to a non-default config file (default: ./config.json)
+# NOXA_CONFIG=/path/to/my-config.json