diff --git a/.gitignore b/.gitignore index 63934d6..6293f80 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,21 @@ target/ .DS_Store .env +config.json proxies.txt .claude/skills/ +.omc +.lavra +.beads +.cache +docs/plans +docs/superpowers +docs/reports +docs/sessions +benchmarks +docs + +# Beads / Dolt files (added by bd init) +.dolt/ +*.db +.beads-credential-key diff --git a/CLAUDE.md b/CLAUDE.md index 0f3b388..6e6ab01 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -15,8 +15,8 @@ noxa/ # + proxy pool rotation (per-request) # + PDF content-type detection # + document parsing (DOCX, XLSX, CSV) - noxa-llm/ # LLM provider chain (Ollama -> OpenAI -> Anthropic) - # + JSON schema extraction, prompt extraction, summarization + noxa-llm/ # LLM provider chain (Gemini CLI -> OpenAI -> Ollama -> Anthropic) + # + JSON schema extraction (validated + retry), prompt extraction, summarization noxa-pdf/ # PDF text extraction via pdf-extract noxa-mcp/ # MCP server (Model Context Protocol) for AI agents noxa/ # CLI binary @@ -48,8 +48,10 @@ Two binaries: `noxa` (CLI), `noxa-mcp` (MCP server). 
- `search.rs` — Web search via Serper.dev with parallel result scraping ### LLM Modules (`noxa-llm`) -- Provider chain: Ollama (local-first) -> OpenAI -> Anthropic -- JSON schema extraction, prompt-based extraction, summarization +- Provider chain: Gemini CLI (primary) -> OpenAI -> Ollama -> Anthropic +- Gemini CLI requires the `gemini` binary on PATH; `GEMINI_MODEL` env var controls model (default: `gemini-2.5-pro`) +- JSON schema extraction with jsonschema validation; parse failures retry once; schema mismatches fail immediately +- Prompt-based extraction, summarization ### PDF Modules (`noxa-pdf`) - PDF text extraction via pdf-extract crate @@ -105,11 +107,15 @@ noxa https://example.com --diff-with snap.json # Brand extraction noxa https://example.com --brand -# LLM features (Ollama local-first) +# LLM features (Gemini CLI primary; requires `gemini` on PATH) noxa https://example.com --summarize noxa https://example.com --extract-prompt "Get all pricing tiers" noxa https://example.com --extract-json '{"type":"object","properties":{"title":{"type":"string"}}}' +# Force a specific LLM provider +noxa https://example.com --llm-provider gemini --summarize +noxa https://example.com --llm-provider openai --summarize + # PDF (auto-detected via Content-Type) noxa https://example.com/report.pdf diff --git a/Cargo.lock b/Cargo.lock index 0b9cb9d..f9ca781 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -35,7 +35,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", + "getrandom 0.3.4", "once_cell", + "serde", "version_check", "zerocopy", ] @@ -64,6 +66,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "android_system_properties" version = "0.1.5" 
@@ -206,6 +214,21 @@ dependencies = [ "syn", ] +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "2.11.0" @@ -246,6 +269,12 @@ dependencies = [ "openssl-macros", ] +[[package]] +name = "borrow-or-share" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc0b364ead1874514c8c2855ab558056ebfeb775653e7ae45ff72f28f8f3166c" + [[package]] name = "brotli" version = "8.0.2" @@ -273,6 +302,12 @@ version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +[[package]] +name = "bytecount" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" + [[package]] name = "byteorder" version = "1.5.0" @@ -601,6 +636,12 @@ dependencies = [ "syn", ] +[[package]] +name = "data-encoding" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea" + [[package]] name = "debug_unsafe" version = "0.1.4" @@ -726,6 +767,15 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "email_address" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e079f19b08ca6239f47f8ba8509c11cf3ea30095831f7fed61441475edd8c449" +dependencies = [ + "serde", +] + [[package]] 
name = "encoding_rs" version = "0.8.35" @@ -760,6 +810,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "fancy-regex" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fast-float2" version = "0.2.3" @@ -789,6 +850,17 @@ dependencies = [ "zlib-rs", ] +[[package]] +name = "fluent-uri" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc74ac4d8359ae70623506d512209619e5cf8f347124910440dbc221714b328e" +dependencies = [ + "borrow-or-share", + "ref-cast", + "serde", +] + [[package]] name = "fnv" version = "1.0.7" @@ -801,6 +873,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "foreign-types" version = "0.5.0" @@ -837,6 +915,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fraction" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f158e3ff0a1b334408dc9fb811cd99b446986f4d8b741bb08f9df1604085ae7" +dependencies = [ + "lazy_static", + "num", +] + [[package]] name = "fs_extra" version = "1.3.0" @@ -1037,7 +1125,7 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash", + "foldhash 0.1.5", ] [[package]] @@ -1045,6 +1133,11 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] [[package]] name = "heck" @@ -1410,6 +1503,33 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "jsonschema" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84695c6689b01384700a3d93acecbd07231ee6fff1bf22ae980b4c307e6ddfd5" +dependencies = [ + "ahash", + "bytecount", + "data-encoding", + "email_address", + "fancy-regex", + "fraction", + "getrandom 0.3.4", + "idna", + "itoa", + "num-cmp", + "num-traits", + "percent-encoding", + "referencing", + "regex", + "regex-syntax", + "serde", + "serde_json", + "unicode-general-category", + "uuid-simd", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -1575,6 +1695,12 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "micromap" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a86d3146ed3995b5913c414f6664344b9617457320782e64f0bb44afd49d74" + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -1627,10 +1753,12 @@ dependencies = [ "noxa-core", "noxa-fetch", "noxa-llm", + "noxa-mcp", "noxa-pdf", "rand 0.8.5", "regex", "reqwest", + "serde", "serde_json", "tokio", "tracing", @@ -1683,6 +1811,7 @@ name = "noxa-llm" version = "0.3.11" dependencies = [ "async-trait", + "jsonschema", "reqwest", "serde", "serde_json", @@ -1730,12 +1859,82 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-cmp" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63335b2e2c34fae2fb0aa2cecfd9f0832a1e24b3b32ecec612c3426d46dc8aaa" + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name = "num-conv" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1774,6 +1973,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "outref" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" + [[package]] name = "parking_lot" version = "0.12.5" @@ -2160,6 +2365,23 @@ dependencies = [ "syn", ] +[[package]] +name = "referencing" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2d5554bf79f4acf770dc3193b44b2d63b348f5f7b7448a0ea1191b37b620728" +dependencies = [ + "ahash", + "fluent-uri", + "getrandom 0.3.4", + "hashbrown 0.16.1", + "itoa", + "micromap", + "parking_lot", + "percent-encoding", + "serde_json", +] + [[package]] name = "regex" version = "1.12.3" @@ -2985,6 +3207,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +[[package]] +name = "unicode-general-category" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b993bddc193ae5bd0d623b49ec06ac3e9312875fdae725a975c51db1cc1677f" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -3049,6 +3277,16 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "uuid-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b082222b4f6619906941c17eb2297fff4c2fb96cb60164170522942a200bd8" +dependencies = [ + "outref", + "vsimd", +] + [[package]] name = "valuable" version = "0.1.1" @@ -3061,6 +3299,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + [[package]] name = "want" version = "0.3.1" diff --git a/Cargo.toml b/Cargo.toml index 1b90acd..81bfd4b 
100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ noxa-core = { path = "crates/noxa-core" } noxa-fetch = { path = "crates/noxa-fetch" } noxa-llm = { path = "crates/noxa-llm" } noxa-pdf = { path = "crates/noxa-pdf" } +noxa-mcp = { path = "crates/noxa-mcp" } tokio = { version = "1", features = ["full"] } serde = { version = "1", features = ["derive"] } serde_json = "1" @@ -21,3 +22,6 @@ tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } clap = { version = "4", features = ["derive", "env"] } dotenvy = "0.15" +rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] } +schemars = "1.0" +dirs = "6.0.0" diff --git a/README.md b/README.md index cd3cba4..fea03dc 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Download from [GitHub Releases](https://github.com/jmagar/noxa/releases) for mac ### Cargo (from source) ```bash -cargo install --git https://github.com/jmagar/noxa.git noxa +cargo install --git https://github.com/jmagar/noxa.git noxa-cli --bin noxa cargo install --git https://github.com/jmagar/noxa.git noxa-mcp ``` @@ -159,6 +159,271 @@ Crawling... 
50/50 pages extracted --- +## Examples + +### Basic Extraction + +```bash +# Extract as markdown (default) +noxa https://example.com + +# Multiple output formats +noxa https://example.com -f markdown # Clean markdown +noxa https://example.com -f json # Full structured JSON +noxa https://example.com -f text # Plain text (no formatting) +noxa https://example.com -f llm # Token-optimized for LLMs (67% fewer tokens) + +# Bare domains work (auto-prepends https://) +noxa example.com +``` + +### Content Filtering + +```bash +# Only extract main content (skip nav, sidebar, footer) +noxa https://docs.rs/tokio --only-main-content + +# Include specific CSS selectors +noxa https://news.ycombinator.com --include ".titleline,.score" + +# Exclude specific elements +noxa https://example.com --exclude "nav,footer,.ads,.sidebar" + +# Combine both +noxa https://docs.rs/reqwest --only-main-content --exclude ".sidebar" +``` + +### Brand Identity Extraction + +```bash +# Extract colors, fonts, logos from any website +noxa --brand https://stripe.com +# Output: { "name": "Stripe", "colors": [...], "fonts": ["Sohne"], "logos": [...] } + +noxa --brand https://github.com +# Output: { "name": "GitHub", "colors": [{"hex": "#1F2328", ...}], "fonts": ["Mona Sans"], ... 
} + +noxa --brand wikipedia.org +# Output: 10 colors, 5 fonts, favicon, logo URL +``` + +### Sitemap Discovery + +```bash +# Discover all URLs from a site's sitemaps +noxa --map https://sitemaps.org +# Output: one URL per line (84 URLs found) + +# JSON output with metadata +noxa --map https://sitemaps.org -f json +# Output: [{ "url": "...", "last_modified": "...", "priority": 0.8 }] +``` + +### Recursive Crawling + +```bash +# Crawl a site (default: depth 1, max 20 pages) +noxa --crawl https://example.com + +# Control depth and page limit +noxa --crawl --depth 2 --max-pages 50 https://docs.rs/tokio + +# Crawl with sitemap seeding (finds more pages) +noxa --crawl --sitemap --depth 2 https://docs.rs/tokio + +# Filter crawl paths +noxa --crawl --include-paths "/api/*,/guide/*" https://docs.example.com +noxa --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com + +# Control concurrency and delay +noxa --crawl --concurrency 10 --delay 200 https://example.com +``` + +### Change Detection (Diff) + +```bash +# Step 1: Save a snapshot +noxa https://example.com -f json > snapshot.json + +# Step 2: Later, compare against the snapshot +noxa --diff-with snapshot.json https://example.com +# Output: +# Status: Same +# Word count delta: +0 + +# If the page changed: +# Status: Changed +# Word count delta: +42 +# --- old +# +++ new +# @@ -1,3 +1,3 @@ +# -Old content here +# +New content here +``` + +### PDF Extraction + +```bash +# PDF URLs are auto-detected via Content-Type +noxa https://example.com/report.pdf + +# Control PDF mode +noxa --pdf-mode auto https://example.com/report.pdf # Error on empty (catches scanned PDFs) +noxa --pdf-mode fast https://example.com/report.pdf # Return whatever text is found +``` + +### Batch Processing + +```bash +# Multiple URLs in one command +noxa https://example.com https://httpbin.org/html https://rust-lang.org + +# URLs from a file (one per line, # comments supported) +noxa --urls-file urls.txt + +# Batch with JSON output 
+noxa --urls-file urls.txt -f json + +# Proxy rotation for large batches +noxa --urls-file urls.txt --proxy-file proxies.txt --concurrency 10 +``` + +### Local Files & Stdin + +```bash +# Extract from a local HTML file +noxa --file page.html + +# Pipe HTML from another command +curl -s https://example.com | noxa --stdin + +# Chain with other tools +noxa https://example.com -f text | wc -w # Word count +noxa https://example.com -f json | jq '.metadata.title' # Extract title with jq +``` + +### Browser Impersonation + +```bash +# Chrome (default) — latest Chrome TLS fingerprint +noxa https://example.com + +# Firefox fingerprint +noxa --browser firefox https://example.com + +# Random browser per request (good for batch) +noxa --browser random --urls-file urls.txt +``` + +### Custom Headers & Cookies + +```bash +# Custom headers +noxa -H "Authorization: Bearer token123" https://api.example.com +noxa -H "Accept-Language: de-DE" https://example.com + +# Cookies +noxa --cookie "session=abc123; theme=dark" https://example.com + +# Multiple headers +noxa -H "X-Custom: value" -H "Authorization: Bearer token" https://example.com +``` + +### LLM-Powered Features + +These require an LLM provider (Ollama local, or OpenAI/Anthropic API key). + +```bash +# Summarize a page (default: 3 sentences) +noxa --summarize https://example.com + +# Control summary length +noxa --summarize 5 https://example.com + +# Extract structured JSON with a schema +noxa --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product + +# Extract with a schema from file +noxa --extract-json @schema.json https://example.com/product + +# Extract with natural language prompt +noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing + +# Use a specific LLM provider +noxa --llm-provider ollama --summarize https://example.com +noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..." 
https://example.com +noxa --llm-provider anthropic --summarize https://example.com +``` + +### Raw HTML Output + +```bash +# Get the raw fetched HTML (no extraction) +noxa --raw-html https://example.com + +# Useful for debugging extraction issues +noxa --raw-html https://example.com > raw.html +noxa --file raw.html # Then extract locally +``` + +### Metadata & Verbose Mode + +```bash +# Include YAML frontmatter with metadata +noxa --metadata https://example.com +# Output: +# --- +# title: "Example Domain" +# source: "https://example.com" +# word_count: 20 +# --- +# # Example Domain +# ... + +# Verbose logging (debug extraction pipeline) +noxa -v https://example.com +``` + +### Proxy Usage + +```bash +# Single proxy +noxa --proxy http://user:pass@proxy.example.com:8080 https://example.com + +# SOCKS5 proxy +noxa --proxy socks5://proxy.example.com:1080 https://example.com + +# Proxy rotation from file (one per line: host:port:user:pass) +noxa --proxy-file proxies.txt https://example.com + +# Auto-load proxies.txt from current directory +echo "proxy1.com:8080:user:pass" > proxies.txt +noxa https://example.com # Automatically detects and uses proxies.txt +``` + +### Real-World Recipes + +```bash +# Monitor competitor pricing — save today's pricing +noxa --extract-json '{"type":"array","items":{"type":"object","properties":{"plan":{"type":"string"},"price":{"type":"string"}}}}' \ + https://competitor.com/pricing -f json > pricing-$(date +%Y%m%d).json + +# Build a documentation search index +noxa --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt + +# Extract all images from a page +noxa https://example.com -f json | jq -r '.content.images[].src' + +# Get all external links +noxa https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href' + +# Compare two pages +noxa https://site-a.com -f json > a.json +noxa https://site-b.com --diff-with a.json +``` + +--- + ## MCP Server — 10 tools for AI 
agents noxa MCP server @@ -327,6 +592,31 @@ noxa/ ## Configuration +Non-secret defaults live in `config.json` in your working directory. Copy the example: + +```bash +cp config.example.json config.json +``` + +**Precedence:** CLI flags > `config.json` > built-in defaults + +**Secrets and URLs** (API keys, proxy, webhook, LLM base URL) always go in `.env`, not `config.json`: + +```bash +cp env.example .env +``` + +**Override config path** for a single run: + +```bash +NOXA_CONFIG=/path/to/other-config.json noxa https://example.com +NOXA_CONFIG=/dev/null noxa https://example.com # bypass config entirely +``` + +**Bool flag limitation:** flags like `--metadata`, `--only-main-content`, `--verbose` set to `true` in `config.json` cannot be overridden to `false` from the CLI for a single run (clap has no `--no-flag` variant). Use `NOXA_CONFIG=/dev/null` to bypass. + +### Environment variables + | Variable | Description | |----------|-------------| | `NOXA_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) | diff --git a/config.example.json b/config.example.json new file mode 100644 index 0000000..db863eb --- /dev/null +++ b/config.example.json @@ -0,0 +1,34 @@ +{ + "_doc": [ + "Copy to config.json and remove fields you don't need.", + "Secrets (api_key, proxy, webhook, llm_base_url) go in .env — NOT here.", + "BOOL FLAG LIMITATION: once set to true here, cannot be overridden to false", + "from the CLI for a single run (no --no-flag support). Use NOXA_CONFIG=/dev/null", + "on the command line to bypass this config entirely.", + "on_change is intentionally absent — it must remain a CLI-only flag.", + "Unknown fields are silently ignored, so this file works across noxa versions." 
+ ], + + "format": "markdown", + "browser": "chrome", + "timeout": 30, + "pdf_mode": "auto", + "metadata": false, + "verbose": false, + "only_main_content": false, + + "include_selectors": [], + "exclude_selectors": ["nav", "footer", ".sidebar", ".cookie-banner"], + + "depth": 1, + "max_pages": 20, + "concurrency": 5, + "delay": 100, + "path_prefix": null, + "include_paths": [], + "exclude_paths": ["/changelog/*", "/blog/*", "/releases/*"], + "use_sitemap": false, + + "llm_provider": "gemini", + "llm_model": "gemini-2.5-pro" +} diff --git a/crates/noxa-cli/Cargo.toml b/crates/noxa-cli/Cargo.toml index 911cdf9..a874a7f 100644 --- a/crates/noxa-cli/Cargo.toml +++ b/crates/noxa-cli/Cargo.toml @@ -14,9 +14,11 @@ noxa-core = { workspace = true } noxa-fetch = { workspace = true } noxa-llm = { workspace = true } noxa-pdf = { workspace = true } +noxa-mcp = { workspace = true } dotenvy = { workspace = true } rand = "0.8" serde_json = { workspace = true } +serde = { workspace = true } tokio = { workspace = true } clap = { workspace = true } tracing = { workspace = true } diff --git a/crates/noxa-cli/src/config.rs b/crates/noxa-cli/src/config.rs new file mode 100644 index 0000000..894716f --- /dev/null +++ b/crates/noxa-cli/src/config.rs @@ -0,0 +1,315 @@ +use serde::Deserialize; +use std::path::Path; + +use crate::{Browser, OutputFormat, PdfModeArg}; + +/// Non-secret, non-URL configuration defaults loaded from config.json. +/// All fields optional — absent means "use the hard default". +/// Unknown fields are silently ignored (serde default) so config files +/// written for a newer version of noxa work on older binaries. +/// +/// DELIBERATELY EXCLUDED: +/// - on_change: passes content to sh -c; must remain CLI-only to prevent +/// shell injection via config file writes. +/// - Secrets/URLs (api_key, proxy, webhook, llm_base_url): stay in .env. 
+/// +/// BOOL FLAG LIMITATION: +/// only_main_content, metadata, verbose, use_sitemap set to true here +/// cannot be overridden to false from the CLI for a single run (no --no-flag +/// variant in clap). Edit config.json or use NOXA_CONFIG=/dev/null to bypass. +#[derive(Debug, Default, Deserialize)] +pub struct NoxaConfig { + // Output + pub format: Option, + pub metadata: Option, + pub verbose: Option, + + // Fetch + pub browser: Option, + pub timeout: Option, + pub pdf_mode: Option, + pub only_main_content: Option, + + // CSS selectors + pub include_selectors: Option>, + pub exclude_selectors: Option>, + + // Crawl + pub depth: Option, + pub max_pages: Option, + pub concurrency: Option, + pub delay: Option, + pub path_prefix: Option, + pub include_paths: Option>, + pub exclude_paths: Option>, + pub use_sitemap: Option, + + // LLM (non-secret: provider name and model only; base URL stays in .env) + pub llm_provider: Option, + pub llm_model: Option, +} + +impl NoxaConfig { + /// Load config from an explicit path, NOXA_CONFIG env var, or ./config.json. + /// Returns an empty (all-None) config if the file doesn't exist. + /// Prints an error and exits if the file exists but is invalid JSON. 
+ pub fn load(explicit_path: Option<&str>) -> Self { + let noxa_config_env = std::env::var("NOXA_CONFIG").ok(); + let was_explicit = explicit_path.is_some() || noxa_config_env.is_some(); + + let path_str = explicit_path + .map(String::from) + .or(noxa_config_env) + .unwrap_or_else(|| "config.json".to_string()); + + let path = Path::new(&path_str); + if !path.exists() { + if was_explicit { + let display_name = path.file_name() + .and_then(|n| n.to_str()) + .unwrap_or(&path_str); + eprintln!("error: config file not found: {display_name}"); + std::process::exit(1); + } + return Self::default(); + } + + let display_name = path.file_name() + .and_then(|n| n.to_str()) + .unwrap_or(&path_str); + eprintln!( + "noxa: config loaded from {display_name} \ + (API keys and secrets belong in .env, not config.json)" + ); + tracing::debug!("config path: {}", path.display()); + + let content = match std::fs::read_to_string(path) { + Ok(s) => s, + Err(e) => { + eprintln!("error: cannot read config file {display_name}: {e}"); + std::process::exit(1); + } + }; + + match serde_json::from_str(&content) { + Ok(cfg) => cfg, + Err(e) => { + eprintln!("error: invalid JSON in config file {display_name}: {e}"); + std::process::exit(1); + } + } + } +} + +/// Fully resolved configuration after merging CLI flags > config file > hard defaults. +/// All fields are concrete — no Option. This is what the rest of main.rs reads. +/// +/// The merge uses clap's ValueSource to detect which fields were explicitly set on +/// the command line. CLI-explicit values always win. Config fills in the rest. +/// Hard defaults are the fallback of last resort. +pub struct ResolvedConfig { + // Output + pub format: OutputFormat, + pub metadata: bool, + pub verbose: bool, + + // Fetch + pub browser: Browser, + pub timeout: u64, + pub pdf_mode: PdfModeArg, + pub only_main_content: bool, + /// CLI-only output flag — not configurable via config.json (it is a per-run mode, not a persistent default). 
+ pub raw_html: bool, + + // CSS selectors + /// Vec — CSS selectors passed directly to extraction filter. + pub include_selectors: Vec, + /// Vec — CSS selectors passed directly to extraction filter. + pub exclude_selectors: Vec, + + // Crawl + pub depth: usize, + pub max_pages: usize, + pub concurrency: usize, + pub delay: u64, + pub path_prefix: Option, + /// Vec — never joined to a comma-string. Passed directly to CrawlConfig. + pub include_paths: Vec, + /// Vec — never joined to a comma-string. Passed directly to CrawlConfig. + pub exclude_paths: Vec, + pub use_sitemap: bool, + + // LLM + pub llm_provider: Option, + pub llm_model: Option, +} + +use clap::parser::ValueSource; + +/// Merge CLI flags (detected via ValueSource), config file, and hard defaults +/// into a single ResolvedConfig. CLI explicit values always win. +pub fn resolve( + cli: &crate::Cli, + matches: &clap::ArgMatches, + cfg: &NoxaConfig, +) -> ResolvedConfig { + let explicit = |name: &str| { + matches.value_source(name) == Some(ValueSource::CommandLine) + }; + + ResolvedConfig { + format: if explicit("format") { + cli.format.clone() + } else { + cfg.format.clone().unwrap_or(crate::OutputFormat::Markdown) + }, + browser: if explicit("browser") { + cli.browser.clone() + } else { + cfg.browser.clone().unwrap_or(crate::Browser::Chrome) + }, + pdf_mode: if explicit("pdf_mode") { + cli.pdf_mode.clone() + } else { + cfg.pdf_mode.clone().unwrap_or(crate::PdfModeArg::Auto) + }, + timeout: if explicit("timeout") { + cli.timeout + } else { + cfg.timeout.unwrap_or(30) + }, + depth: if explicit("depth") { + cli.depth + } else { + cfg.depth.unwrap_or(1) + }, + max_pages: if explicit("max_pages") { + cli.max_pages + } else { + cfg.max_pages.unwrap_or(20) + }, + concurrency: if explicit("concurrency") { + cli.concurrency + } else { + cfg.concurrency.unwrap_or(5) + }, + delay: if explicit("delay") { + cli.delay + } else { + cfg.delay.unwrap_or(100) + }, + path_prefix: if explicit("path_prefix") { + 
cli.path_prefix.clone() + } else { + cfg.path_prefix.clone() + }, + include_paths: if explicit("include_paths") { + cli.include_paths + .as_deref() + .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) + .unwrap_or_default() + } else { + cfg.include_paths.clone().unwrap_or_default() + }, + exclude_paths: if explicit("exclude_paths") { + cli.exclude_paths + .as_deref() + .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) + .unwrap_or_default() + } else { + cfg.exclude_paths.clone().unwrap_or_default() + }, + include_selectors: if explicit("include") { + cli.include + .as_deref() + .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) + .unwrap_or_default() + } else { + cfg.include_selectors.clone().unwrap_or_default() + }, + exclude_selectors: if explicit("exclude") { + cli.exclude + .as_deref() + .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) + .unwrap_or_default() + } else { + cfg.exclude_selectors.clone().unwrap_or_default() + }, + only_main_content: cli.only_main_content || cfg.only_main_content.unwrap_or(false), + metadata: cli.metadata || cfg.metadata.unwrap_or(false), + verbose: cli.verbose || cfg.verbose.unwrap_or(false), + use_sitemap: cli.sitemap || cfg.use_sitemap.unwrap_or(false), + raw_html: cli.raw_html, + llm_provider: if cli.llm_provider.is_some() { + cli.llm_provider.clone() + } else { + cfg.llm_provider.clone() + }, + llm_model: if cli.llm_model.is_some() { + cli.llm_model.clone() + } else { + cfg.llm_model.clone() + }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_noxa_config_deserialize_full() { + let json = r#"{ + "format": "llm", + "depth": 3, + "max_pages": 100, + "concurrency": 10, + "delay": 200, + "browser": "firefox", + "timeout": 60, + "only_main_content": true, + "use_sitemap": true, + "path_prefix": "/docs/", + "include_paths": ["/docs/*", "/api/*"], + "exclude_paths": ["/changelog/*", "/blog/*"], + "include_selectors": ["article", ".content"], + 
"exclude_selectors": ["nav", "footer"], + "llm_provider": "gemini", + "llm_model": "gemini-2.5-pro", + "pdf_mode": "fast", + "metadata": true, + "verbose": false + }"#; + let cfg: NoxaConfig = serde_json::from_str(json).unwrap(); + assert!(matches!(cfg.format, Some(crate::OutputFormat::Llm))); + assert_eq!(cfg.depth, Some(3)); + assert_eq!(cfg.exclude_paths, Some(vec!["/changelog/*".to_string(), "/blog/*".to_string()])); + assert!(matches!(cfg.pdf_mode, Some(crate::PdfModeArg::Fast))); + } + + #[test] + fn test_noxa_config_empty() { + let cfg: NoxaConfig = serde_json::from_str("{}").unwrap(); + assert!(cfg.format.is_none()); + assert!(cfg.depth.is_none()); + } + + #[test] + fn test_noxa_config_unknown_fields_ignored() { + // Unknown fields must NOT cause a parse failure + let cfg: NoxaConfig = serde_json::from_str(r#"{"depth": 2, "future_field": true}"#).unwrap(); + assert_eq!(cfg.depth, Some(2)); + } + + #[test] + fn test_load_implicit_missing_file_returns_default() { + // When no explicit path and ./config.json doesn't exist, silently return default. + // The simplest test: call with None and rely on ./config.json not existing in test env. + // If CWD has config.json this test is skipped to avoid flakiness. + if std::path::Path::new("config.json").exists() { + return; // skip: CWD has config.json + } + let cfg = NoxaConfig::load(None); + assert!(cfg.format.is_none()); + } +} diff --git a/crates/noxa-cli/src/main.rs b/crates/noxa-cli/src/main.rs index 7bb0a88..7144c24 100644 --- a/crates/noxa-cli/src/main.rs +++ b/crates/noxa-cli/src/main.rs @@ -2,6 +2,7 @@ /// CLI entry point -- wires noxa-core and noxa-fetch into a single command. /// All extraction and fetching logic lives in sibling crates; this is pure plumbing. 
mod cloud; +mod config; use std::io::{self, Read as _}; use std::path::{Path, PathBuf}; @@ -9,8 +10,7 @@ use std::process; use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; -use clap::{Parser, ValueEnum}; -use tracing_subscriber::EnvFilter; +use clap::{CommandFactory, FromArgMatches, Parser, ValueEnum}; use noxa_core::{ ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options, to_llm_text, @@ -20,7 +20,10 @@ use noxa_fetch::{ FetchConfig, FetchResult, PageResult, SitemapEntry, }; use noxa_llm::LlmProvider; +use noxa_mcp; use noxa_pdf::PdfMode; +use serde::Deserialize; +use tracing_subscriber::EnvFilter; /// Known anti-bot challenge page titles (case-insensitive prefix match). const ANTIBOT_TITLES: &[&str] = &[ @@ -87,6 +90,10 @@ fn warn_empty(url: &str, reason: &EmptyReason) { #[derive(Parser)] #[command(name = "noxa", about = "Extract web content for LLMs", version)] struct Cli { + /// Path to config.json (default: ./config.json, override with NOXA_CONFIG env var) + #[arg(long, global = true)] + config: Option, + /// URLs to fetch (multiple allowed) #[arg()] urls: Vec, @@ -247,7 +254,7 @@ struct Cli { #[arg(long, num_args = 0..=1, default_missing_value = "3")] summarize: Option, - /// Force a specific LLM provider (ollama, openai, anthropic) + /// Force a specific LLM provider (gemini, ollama, openai, anthropic) #[arg(long, env = "NOXA_LLM_PROVIDER")] llm_provider: Option, @@ -284,7 +291,8 @@ struct Cli { output_dir: Option, } -#[derive(Clone, ValueEnum)] +#[derive(Clone, Debug, ValueEnum, Deserialize)] +#[serde(rename_all = "lowercase")] enum OutputFormat { Markdown, Json, @@ -293,14 +301,16 @@ enum OutputFormat { Html, } -#[derive(Clone, ValueEnum)] +#[derive(Clone, Debug, ValueEnum, Deserialize)] +#[serde(rename_all = "lowercase")] enum Browser { Chrome, Firefox, Random, } -#[derive(Clone, ValueEnum, Default)] +#[derive(Clone, Debug, ValueEnum, Default, Deserialize)] +#[serde(rename_all = "lowercase")] enum 
PdfModeArg { /// Error if PDF has no extractable text (catches scanned PDFs) #[default] @@ -338,12 +348,21 @@ fn init_logging(verbose: bool) { tracing_subscriber::fmt().with_env_filter(filter).init(); } +fn init_mcp_logging() { + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .with_writer(std::io::stderr) + .with_ansi(false) + .try_init() + .ok(); +} + /// Build FetchConfig from CLI flags. /// /// `--proxy` sets a single static proxy (no rotation). /// `--proxy-file` loads a pool of proxies and rotates per-request. /// `--proxy` takes priority: if both are set, only the single proxy is used. -fn build_fetch_config(cli: &Cli) -> FetchConfig { +fn build_fetch_config(cli: &Cli, resolved: &config::ResolvedConfig) -> FetchConfig { let (proxy, proxy_pool) = if cli.proxy.is_some() { (cli.proxy.clone(), Vec::new()) } else if let Some(ref path) = cli.proxy_file { @@ -403,11 +422,11 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig { } FetchConfig { - browser: cli.browser.clone().into(), + browser: resolved.browser.clone().into(), proxy, proxy_pool, - timeout: std::time::Duration::from_secs(cli.timeout), - pdf_mode: cli.pdf_mode.clone().into(), + timeout: std::time::Duration::from_secs(resolved.timeout), + pdf_mode: resolved.pdf_mode.clone().into(), headers, ..Default::default() } @@ -436,20 +455,12 @@ fn parse_cookie_file(path: &str) -> Result { Ok(pairs.join("; ")) } -fn build_extraction_options(cli: &Cli) -> ExtractionOptions { +fn build_extraction_options(resolved: &config::ResolvedConfig) -> ExtractionOptions { ExtractionOptions { - include_selectors: cli - .include - .as_deref() - .map(|s| s.split(',').map(|s| s.trim().to_string()).collect()) - .unwrap_or_default(), - exclude_selectors: cli - .exclude - .as_deref() - .map(|s| s.split(',').map(|s| s.trim().to_string()).collect()) - .unwrap_or_default(), - only_main_content: cli.only_main_content, - include_raw_html: cli.raw_html || matches!(cli.format, 
OutputFormat::Html), + include_selectors: resolved.include_selectors.clone(), + exclude_selectors: resolved.exclude_selectors.clone(), + only_main_content: resolved.only_main_content, + include_raw_html: resolved.raw_html || matches!(resolved.format, OutputFormat::Html), } } @@ -618,14 +629,17 @@ impl FetchOutput { /// Fetch a URL and extract content, handling PDF detection automatically. /// Falls back to cloud API when bot protection or JS rendering is detected. -async fn fetch_and_extract(cli: &Cli) -> Result { +async fn fetch_and_extract( + cli: &Cli, + resolved: &config::ResolvedConfig, +) -> Result { // Local sources: read and extract as HTML if cli.stdin { let mut buf = String::new(); io::stdin() .read_to_string(&mut buf) .map_err(|e| format!("failed to read stdin: {e}"))?; - let options = build_extraction_options(cli); + let options = build_extraction_options(resolved); return extract_with_options(&buf, None, &options) .map(|r| FetchOutput::Local(Box::new(r))) .map_err(|e| format!("extraction error: {e}")); @@ -634,7 +648,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result { if let Some(ref path) = cli.file { let html = std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?; - let options = build_extraction_options(cli); + let options = build_extraction_options(resolved); return extract_with_options(&html, None, &options) .map(|r| FetchOutput::Local(Box::new(r))) .map_err(|e| format!("extraction error: {e}")); @@ -651,10 +665,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result { // --cloud: skip local, go straight to cloud API if cli.cloud { - let c = - cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?; - let options = build_extraction_options(cli); - let format_str = match cli.format { + let c = cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?; + let options = build_extraction_options(resolved); + let format_str = match resolved.format { OutputFormat::Markdown => 
"markdown", OutputFormat::Json => "json", OutputFormat::Text => "text", @@ -674,9 +687,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result { } // Normal path: try local first - let client = - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; - let options = build_extraction_options(cli); + let client = FetchClient::new(build_fetch_config(cli, resolved)) + .map_err(|e| format!("client error: {e}"))?; + let options = build_extraction_options(resolved); let result = client .fetch_and_extract_with_options(url, &options) .await @@ -687,7 +700,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result { if !matches!(reason, EmptyReason::None) { if let Some(ref c) = cloud_client { eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API..."); - let format_str = match cli.format { + let format_str = match resolved.format { OutputFormat::Markdown => "markdown", OutputFormat::Json => "json", OutputFormat::Text => "text", @@ -718,7 +731,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result { } /// Fetch raw HTML from a URL (no extraction). Used for --raw-html and brand extraction. 
-async fn fetch_html(cli: &Cli) -> Result { +async fn fetch_html(cli: &Cli, resolved: &config::ResolvedConfig) -> Result { if cli.stdin { let mut buf = String::new(); io::stdin() @@ -751,8 +764,8 @@ async fn fetch_html(cli: &Cli) -> Result { .ok_or("no input provided -- pass a URL, --file, or --stdin")?; let url = normalize_url(raw_url); - let client = - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; + let client = FetchClient::new(build_fetch_config(cli, resolved)) + .map_err(|e| format!("client error: {e}"))?; client .fetch(&url) .await @@ -1166,7 +1179,7 @@ fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String ) } -async fn run_crawl(cli: &Cli) -> Result<(), String> { +async fn run_crawl(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> { let url = cli .urls .first() @@ -1178,16 +1191,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { return Err("--crawl cannot be used with --file or --stdin".into()); } - let include_patterns: Vec = cli - .include_paths - .as_deref() - .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) - .unwrap_or_default(); - let exclude_patterns: Vec = cli - .exclude_paths - .as_deref() - .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) - .unwrap_or_default(); + let include_patterns = resolved.include_paths.clone(); + let exclude_patterns = resolved.exclude_paths.clone(); // Set up streaming progress channel let (progress_tx, mut progress_rx) = tokio::sync::broadcast::channel::(100); @@ -1207,13 +1212,13 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { } let config = CrawlConfig { - fetch: build_fetch_config(cli), - max_depth: cli.depth, - max_pages: cli.max_pages, - concurrency: cli.concurrency, - delay: std::time::Duration::from_millis(cli.delay), - path_prefix: cli.path_prefix.clone(), - use_sitemap: cli.sitemap, + fetch: build_fetch_config(cli, resolved), + max_depth: resolved.depth, + max_pages: resolved.max_pages, 
+ concurrency: resolved.concurrency, + delay: std::time::Duration::from_millis(resolved.delay), + path_prefix: resolved.path_prefix.clone(), + use_sitemap: resolved.use_sitemap, include_patterns, exclude_patterns, progress_tx: Some(progress_tx), @@ -1232,7 +1237,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { ); }); - let max_pages = cli.max_pages; + let max_pages = resolved.max_pages; let completed_offset = resume_state.as_ref().map_or(0, |s| s.completed_pages); // Spawn background task to print streaming progress to stderr @@ -1261,8 +1266,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { &result.visited, &result.remaining_frontier, completed_offset + result.pages.len(), - cli.max_pages, - cli.depth, + resolved.max_pages, + resolved.depth, )?; eprintln!( "Crawl state saved to {} ({} pages completed). Resume with --crawl-state {}", @@ -1294,15 +1299,15 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { let mut saved = 0usize; for page in &result.pages { if let Some(ref extraction) = page.extraction { - let filename = url_to_filename(&page.url, &cli.format); - let content = format_output(extraction, &cli.format, cli.metadata); + let filename = url_to_filename(&page.url, &resolved.format); + let content = format_output(extraction, &resolved.format, resolved.metadata); write_to_file(dir, &filename, &content)?; saved += 1; } } eprintln!("Saved {saved} files to {}", dir.display()); } else { - print_crawl_output(&result, &cli.format, cli.metadata); + print_crawl_output(&result, &resolved.format, resolved.metadata); } eprintln!( @@ -1338,7 +1343,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { } } -async fn run_map(cli: &Cli) -> Result<(), String> { +async fn run_map(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> { let url = cli .urls .first() @@ -1346,8 +1351,8 @@ async fn run_map(cli: &Cli) -> Result<(), String> { .map(|u| normalize_url(u))?; let url = url.as_str(); - let client = - 
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; + let client = FetchClient::new(build_fetch_config(cli, resolved)) + .map_err(|e| format!("client error: {e}"))?; let entries = noxa_fetch::sitemap::discover(&client, url) .await @@ -1359,19 +1364,24 @@ async fn run_map(cli: &Cli) -> Result<(), String> { eprintln!("discovered {} URLs", entries.len()); } - print_map_output(&entries, &cli.format); + print_map_output(&entries, &resolved.format); Ok(()) } -async fn run_batch(cli: &Cli, entries: &[(String, Option)]) -> Result<(), String> { +async fn run_batch( + cli: &Cli, + resolved: &config::ResolvedConfig, + entries: &[(String, Option)], +) -> Result<(), String> { let client = Arc::new( - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?, + FetchClient::new(build_fetch_config(cli, resolved)) + .map_err(|e| format!("client error: {e}"))?, ); let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect(); - let options = build_extraction_options(cli); + let options = build_extraction_options(resolved); let results = client - .fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options) + .fetch_and_extract_batch_with_options(&urls, resolved.concurrency, &options) .await; let ok = results.iter().filter(|r| r.result.is_ok()).count(); @@ -1402,15 +1412,15 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option)]) -> Result<() let filename = custom_names .get(r.url.as_str()) .map(|s| s.to_string()) - .unwrap_or_else(|| url_to_filename(&r.url, &cli.format)); - let content = format_output(extraction, &cli.format, cli.metadata); + .unwrap_or_else(|| url_to_filename(&r.url, &resolved.format)); + let content = format_output(extraction, &resolved.format, resolved.metadata); write_to_file(dir, &filename, &content)?; saved += 1; } } eprintln!("Saved {saved} files to {}", dir.display()); } else { - print_batch_output(&results, &cli.format, cli.metadata); + print_batch_output(&results, 
&resolved.format, resolved.metadata); } eprintln!( @@ -1514,15 +1524,20 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) { }); } -async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> { +async fn run_watch( + cli: &Cli, + resolved: &config::ResolvedConfig, + urls: &[String], +) -> Result<(), String> { if urls.is_empty() { return Err("--watch requires at least one URL".into()); } let client = Arc::new( - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?, + FetchClient::new(build_fetch_config(cli, resolved)) + .map_err(|e| format!("client error: {e}"))?, ); - let options = build_extraction_options(cli); + let options = build_extraction_options(resolved); // Ctrl+C handler let cancelled = Arc::new(AtomicBool::new(false)); @@ -1534,16 +1549,17 @@ async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> { // Single-URL mode: preserve original behavior exactly if urls.len() == 1 { - return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await; + return run_watch_single(cli, resolved, &client, &options, &urls[0], &cancelled).await; } // Multi-URL mode: batch fetch, diff each, report aggregate - run_watch_multi(cli, &client, &options, urls, &cancelled).await + run_watch_multi(cli, resolved, &client, &options, urls, &cancelled).await } /// Original single-URL watch loop -- backward compatible. async fn run_watch_single( cli: &Cli, + resolved: &config::ResolvedConfig, client: &Arc, options: &ExtractionOptions, url: &str, @@ -1580,7 +1596,7 @@ async fn run_watch_single( if diff.status == ChangeStatus::Same { eprintln!("[watch] No changes ({})", timestamp()); } else { - print_diff_output(&diff, &cli.format); + print_diff_output(&diff, &resolved.format); eprintln!("[watch] Changes detected! ({})", timestamp()); if let Some(ref cmd) = cli.on_change { @@ -1627,6 +1643,7 @@ async fn run_watch_single( /// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate. 
async fn run_watch_multi( cli: &Cli, + resolved: &config::ResolvedConfig, client: &Arc, options: &ExtractionOptions, urls: &[String], @@ -1636,7 +1653,7 @@ async fn run_watch_multi( // Initial pass: fetch all URLs in parallel let initial_results = client - .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options) + .fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options) .await; let mut snapshots = std::collections::HashMap::new(); @@ -1676,7 +1693,7 @@ async fn run_watch_multi( check_number += 1; let current_results = client - .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options) + .fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options) .await; let mut changed: Vec = Vec::new(); @@ -1780,7 +1797,11 @@ async fn run_watch_multi( Ok(()) } -async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> { +async fn run_diff( + cli: &Cli, + resolved: &config::ResolvedConfig, + snapshot_path: &str, +) -> Result<(), String> { // Load previous snapshot let snapshot_json = std::fs::read_to_string(snapshot_path) .map_err(|e| format!("failed to read snapshot {snapshot_path}: {e}"))?; @@ -1788,16 +1809,16 @@ async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> { .map_err(|e| format!("failed to parse snapshot JSON: {e}"))?; // Extract current version (handles PDF detection for URLs) - let new_result = fetch_and_extract(cli).await?.into_extraction()?; + let new_result = fetch_and_extract(cli, resolved).await?.into_extraction()?; let diff = noxa_core::diff::diff(&old, &new_result); - print_diff_output(&diff, &cli.format); + print_diff_output(&diff, &resolved.format); Ok(()) } -async fn run_brand(cli: &Cli) -> Result<(), String> { - let result = fetch_html(cli).await?; +async fn run_brand(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> { + let result = fetch_html(cli, resolved).await?; let enriched = enrich_html_with_stylesheets(&result.html, 
&result.url).await; let brand = noxa_core::brand::extract_brand( &enriched, @@ -1811,13 +1832,27 @@ async fn run_brand(cli: &Cli) -> Result<(), String> { } /// Build an LLM provider based on CLI flags, or fall back to the default chain. -async fn build_llm_provider(cli: &Cli) -> Result, String> { - if let Some(ref name) = cli.llm_provider { +async fn build_llm_provider( + cli: &Cli, + resolved: &config::ResolvedConfig, +) -> Result, String> { + if let Some(ref name) = resolved.llm_provider { match name.as_str() { + "gemini" => { + let provider = noxa_llm::providers::gemini_cli::GeminiCliProvider::new( + resolved.llm_model.clone(), + ); + if !provider.is_available().await { + return Err( + "gemini CLI not found on PATH -- install it or omit --llm-provider".into(), + ); + } + Ok(Box::new(provider)) + } "ollama" => { let provider = noxa_llm::providers::ollama::OllamaProvider::new( cli.llm_base_url.clone(), - cli.llm_model.clone(), + resolved.llm_model.clone(), ); if !provider.is_available().await { return Err("ollama is not running or unreachable".into()); @@ -1828,7 +1863,7 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> { let provider = noxa_llm::providers::openai::OpenAiProvider::new( None, cli.llm_base_url.clone(), - cli.llm_model.clone(), + resolved.llm_model.clone(), ) .ok_or("OPENAI_API_KEY not set")?; Ok(Box::new(provider)) @@ -1836,20 +1871,20 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> { "anthropic" => { let provider = noxa_llm::providers::anthropic::AnthropicProvider::new( None, - cli.llm_model.clone(), + resolved.llm_model.clone(), ) .ok_or("ANTHROPIC_API_KEY not set")?; Ok(Box::new(provider)) } other => Err(format!( - "unknown LLM provider: {other} (use ollama, openai, or anthropic)" + "unknown LLM provider: {other} (use gemini, ollama, openai, or anthropic)" )), } } else { let chain = noxa_llm::ProviderChain::default().await; if chain.is_empty() { return Err( - "no LLM providers available -- start Ollama or set 
OPENAI_API_KEY / ANTHROPIC_API_KEY" + "no LLM providers available -- install the gemini CLI, start Ollama, or set OPENAI_API_KEY / ANTHROPIC_API_KEY" .into(), ); } @@ -1857,12 +1892,12 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> { } } -async fn run_llm(cli: &Cli) -> Result<(), String> { +async fn run_llm(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> { // Extract content from source first (handles PDF detection for URLs) - let result = fetch_and_extract(cli).await?.into_extraction()?; + let result = fetch_and_extract(cli, resolved).await?.into_extraction()?; - let provider = build_llm_provider(cli).await?; - let model = cli.llm_model.as_deref(); + let provider = build_llm_provider(cli, resolved).await?; + let model = resolved.llm_model.as_deref(); if let Some(ref schema_input) = cli.extract_json { // Support @file syntax for loading schema from file @@ -1876,6 +1911,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> { let schema: serde_json::Value = serde_json::from_str(&schema_str).map_err(|e| format!("invalid JSON schema: {e}"))?; + let t = std::time::Instant::now(); let extracted = noxa_llm::extract::extract_json( &result.content.plain_text, &schema, @@ -1884,12 +1920,14 @@ async fn run_llm(cli: &Cli) -> Result<(), String> { ) .await .map_err(|e| format!("LLM extraction failed: {e}"))?; + eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64()); println!( "{}", serde_json::to_string_pretty(&extracted).expect("serialization failed") ); } else if let Some(ref prompt) = cli.extract_prompt { + let t = std::time::Instant::now(); let extracted = noxa_llm::extract::extract_with_prompt( &result.content.plain_text, prompt, @@ -1898,12 +1936,14 @@ async fn run_llm(cli: &Cli) -> Result<(), String> { ) .await .map_err(|e| format!("LLM extraction failed: {e}"))?; + eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64()); println!( "{}", serde_json::to_string_pretty(&extracted).expect("serialization failed") ); } else if let Some(sentences) = 
cli.summarize { + let t = std::time::Instant::now(); let summary = noxa_llm::summarize::summarize( &result.content.plain_text, Some(sentences), @@ -1912,6 +1952,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> { ) .await .map_err(|e| format!("LLM summarization failed: {e}"))?; + eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64()); println!("{summary}"); } @@ -1921,12 +1962,16 @@ async fn run_llm(cli: &Cli) -> Result<(), String> { /// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results. /// URLs are processed sequentially to respect LLM provider rate limits. -async fn run_batch_llm(cli: &Cli, entries: &[(String, Option)]) -> Result<(), String> { - let client = - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; - let options = build_extraction_options(cli); - let provider = build_llm_provider(cli).await?; - let model = cli.llm_model.as_deref(); +async fn run_batch_llm( + cli: &Cli, + resolved: &config::ResolvedConfig, + entries: &[(String, Option)], +) -> Result<(), String> { + let client = FetchClient::new(build_fetch_config(cli, resolved)) + .map_err(|e| format!("client error: {e}"))?; + let options = build_extraction_options(resolved); + let provider = build_llm_provider(cli, resolved).await?; + let model = resolved.llm_model.as_deref(); // Pre-parse schema once if --extract-json is used let schema = if let Some(ref schema_input) = cli.extract_json { @@ -1974,6 +2019,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option)]) -> Resul let text = &extraction.content.plain_text; // Run the appropriate LLM operation + let llm_start = std::time::Instant::now(); let llm_result = if let Some(ref schema) = schema { noxa_llm::extract::extract_json(text, schema, provider.as_ref(), model) .await @@ -1989,6 +2035,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option)]) -> Resul } else { unreachable!("run_batch_llm called without LLM flags") }; + let llm_elapsed = 
llm_start.elapsed(); match llm_result { Ok(output) => { @@ -2018,7 +2065,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option)]) -> Resul format!("{words} words") } }; - eprintln!("-> extracted {detail}"); + eprintln!("-> extracted {detail} ({:.1}s)", llm_elapsed.as_secs_f64()); if let Some(ref dir) = cli.output_dir { let filename = custom_names @@ -2215,12 +2262,29 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> { async fn main() { dotenvy::dotenv().ok(); - let cli = Cli::parse(); - init_logging(cli.verbose); + if matches!(std::env::args().nth(1).as_deref(), Some("mcp")) { + init_mcp_logging(); + + if let Err(e) = noxa_mcp::run().await { + eprintln!("error: {e}"); + process::exit(1); + } + return; + } + + // Use low-level API to get both typed Cli and ArgMatches for ValueSource detection. + let matches = Cli::command().get_matches(); + let cli = Cli::from_arg_matches(&matches).unwrap_or_else(|e| e.exit()); + + // Load config BEFORE init_logging so verbose from config takes effect. 
+ let cfg = config::NoxaConfig::load(cli.config.as_deref()); + let resolved = config::resolve(&cli, &matches, &cfg); + + init_logging(resolved.verbose); // --map: sitemap discovery mode if cli.map { - if let Err(e) = run_map(&cli).await { + if let Err(e) = run_map(&cli, &resolved).await { eprintln!("error: {e}"); process::exit(1); } @@ -2229,7 +2293,7 @@ async fn main() { // --crawl: recursive crawl mode if cli.crawl { - if let Err(e) = run_crawl(&cli).await { + if let Err(e) = run_crawl(&cli, &resolved).await { eprintln!("error: {e}"); process::exit(1); } @@ -2245,7 +2309,7 @@ async fn main() { process::exit(1); } }; - if let Err(e) = run_watch(&cli, &watch_urls).await { + if let Err(e) = run_watch(&cli, &resolved, &watch_urls).await { eprintln!("error: {e}"); process::exit(1); } @@ -2254,7 +2318,7 @@ async fn main() { // --diff-with: change tracking mode if let Some(ref snapshot_path) = cli.diff_with { - if let Err(e) = run_diff(&cli, snapshot_path).await { + if let Err(e) = run_diff(&cli, &resolved, snapshot_path).await { eprintln!("error: {e}"); process::exit(1); } @@ -2263,7 +2327,7 @@ async fn main() { // --brand: brand identity extraction mode if cli.brand { - if let Err(e) = run_brand(&cli).await { + if let Err(e) = run_brand(&cli, &resolved).await { eprintln!("error: {e}"); process::exit(1); } @@ -2292,11 +2356,11 @@ async fn main() { // When multiple URLs are provided, run batch LLM extraction over all of them. 
if has_llm_flags(&cli) { if entries.len() > 1 { - if let Err(e) = run_batch_llm(&cli, &entries).await { + if let Err(e) = run_batch_llm(&cli, &resolved, &entries).await { eprintln!("error: {e}"); process::exit(1); } - } else if let Err(e) = run_llm(&cli).await { + } else if let Err(e) = run_llm(&cli, &resolved).await { eprintln!("error: {e}"); process::exit(1); } @@ -2305,7 +2369,7 @@ async fn main() { // Multi-URL batch mode if entries.len() > 1 { - if let Err(e) = run_batch(&cli, &entries).await { + if let Err(e) = run_batch(&cli, &resolved, &entries).await { eprintln!("error: {e}"); process::exit(1); } @@ -2313,8 +2377,11 @@ async fn main() { } // --raw-html: skip extraction, dump the fetched HTML - if cli.raw_html && cli.include.is_none() && cli.exclude.is_none() { - match fetch_html(&cli).await { + if resolved.raw_html + && resolved.include_selectors.is_empty() + && resolved.exclude_selectors.is_empty() + { + match fetch_html(&cli, &resolved).await { Ok(r) => println!("{}", r.html), Err(e) => { eprintln!("error: {e}"); @@ -2325,7 +2392,7 @@ async fn main() { } // Single-page extraction (handles both HTML and PDF via content-type detection) - match fetch_and_extract(&cli).await { + match fetch_and_extract(&cli, &resolved).await { Ok(FetchOutput::Local(result)) => { if let Some(ref dir) = cli.output_dir { let url = cli @@ -2334,18 +2401,19 @@ async fn main() { .map(|u| normalize_url(u)) .unwrap_or_default(); let custom_name = entries.first().and_then(|(_, name)| name.clone()); - let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format)); - let content = format_output(&result, &cli.format, cli.metadata); + let filename = + custom_name.unwrap_or_else(|| url_to_filename(&url, &resolved.format)); + let content = format_output(&result, &resolved.format, resolved.metadata); if let Err(e) = write_to_file(dir, &filename, &content) { eprintln!("error: {e}"); process::exit(1); } } else { - print_output(&result, &cli.format, cli.metadata); + 
print_output(&result, &resolved.format, resolved.metadata); } } Ok(FetchOutput::Cloud(resp)) => { - print_cloud_output(&resp, &cli.format); + print_cloud_output(&resp, &resolved.format); } Err(e) => { eprintln!("{e}"); @@ -2456,3 +2524,28 @@ mod tests { let _ = std::fs::remove_dir_all(&dir); } } + +#[cfg(test)] +mod enum_deserialize_tests { + use super::*; + + #[test] + fn test_output_format_deserialize() { + let f: OutputFormat = serde_json::from_str("\"llm\"").unwrap(); + assert!(matches!(f, OutputFormat::Llm)); + let f: OutputFormat = serde_json::from_str("\"markdown\"").unwrap(); + assert!(matches!(f, OutputFormat::Markdown)); + } + + #[test] + fn test_browser_deserialize() { + let b: Browser = serde_json::from_str("\"firefox\"").unwrap(); + assert!(matches!(b, Browser::Firefox)); + } + + #[test] + fn test_pdf_mode_deserialize() { + let p: PdfModeArg = serde_json::from_str("\"fast\"").unwrap(); + assert!(matches!(p, PdfModeArg::Fast)); + } +} diff --git a/crates/noxa-llm/Cargo.toml b/crates/noxa-llm/Cargo.toml index caf656f..4575cdb 100644 --- a/crates/noxa-llm/Cargo.toml +++ b/crates/noxa-llm/Cargo.toml @@ -8,6 +8,7 @@ license.workspace = true [dependencies] reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } async-trait = "0.1" +jsonschema = { version = "0.46", default-features = false } serde = { workspace = true } serde_json = { workspace = true } tokio = { workspace = true } diff --git a/crates/noxa-llm/src/chain.rs b/crates/noxa-llm/src/chain.rs index 314bf2a..43f3de9 100644 --- a/crates/noxa-llm/src/chain.rs +++ b/crates/noxa-llm/src/chain.rs @@ -2,12 +2,15 @@ /// Default order: Ollama (local, free) -> OpenAI -> Anthropic. /// Only includes providers that are actually configured/available. 
use async_trait::async_trait; -use tracing::{debug, warn}; +use tracing::{debug, info, warn}; use crate::error::LlmError; use crate::provider::{CompletionRequest, LlmProvider}; use crate::providers::{ - anthropic::AnthropicProvider, ollama::OllamaProvider, openai::OpenAiProvider, + anthropic::AnthropicProvider, + gemini_cli::GeminiCliProvider, + ollama::OllamaProvider, + openai::OpenAiProvider, }; pub struct ProviderChain { @@ -15,12 +18,26 @@ pub struct ProviderChain { } impl ProviderChain { - /// Build the default chain: Ollama -> OpenAI -> Anthropic. - /// Ollama is always added (availability checked at call time). + /// Build the default chain: Gemini CLI -> OpenAI -> Ollama -> Anthropic. + /// Gemini CLI is the primary backend (subprocess-based, requires `gemini` on PATH). /// Cloud providers are only added if their API keys are configured. + /// Ollama is added if reachable at call time. pub async fn default() -> Self { let mut providers: Vec<Box<dyn LlmProvider>> = Vec::new(); + let gemini = GeminiCliProvider::new(None); + if gemini.is_available().await { + debug!("gemini cli available, adding as primary provider"); + providers.push(Box::new(gemini)); + } else { + debug!("gemini cli not found on PATH, skipping"); + } + + if let Some(openai) = OpenAiProvider::new(None, None, None) { + debug!("openai configured, adding to chain"); + providers.push(Box::new(openai)); + } + let ollama = OllamaProvider::new(None, None); if ollama.is_available().await { debug!("ollama is available, adding to chain"); @@ -29,11 +46,6 @@ impl ProviderChain { debug!("ollama not available, skipping"); } - if let Some(openai) = OpenAiProvider::new(None, None, None) { - debug!("openai configured, adding to chain"); - providers.push(Box::new(openai)); - } - if let Some(anthropic) = AnthropicProvider::new(None, None) { debug!("anthropic configured, adding to chain"); providers.push(Box::new(anthropic)); @@ -79,9 +91,10 @@ impl LlmProvider for ProviderChain { for provider in &self.providers { debug!(provider = 
provider.name(), "attempting completion"); + let t = std::time::Instant::now(); match provider.complete(request).await { Ok(response) => { - debug!(provider = provider.name(), "completion succeeded"); + info!(provider = provider.name(), elapsed_ms = t.elapsed().as_millis(), "completion succeeded"); return Ok(response); } Err(e) => { @@ -202,4 +215,46 @@ mod tests { assert_eq!(chain.len(), 2); assert!(!chain.is_empty()); } + + // ── Gemini-first chain ordering ─────────────────────────────────────────── + + #[tokio::test] + async fn gemini_first_in_single_provider_chain() { + // When we build a chain with a mock "gemini" provider first, it should + // be used before any fallback. + let chain = ProviderChain::from_providers(vec![ + Box::new(MockProvider { + name: "gemini", + response: Ok("from gemini".into()), + available: true, + }), + Box::new(MockProvider { + name: "openai", + response: Ok("from openai".into()), + available: true, + }), + ]); + let result = chain.complete(&test_request()).await.unwrap(); + assert_eq!(result, "from gemini"); + // Confirm order: first provider name is "gemini" + assert_eq!(chain.providers[0].name(), "gemini"); + } + + #[tokio::test] + async fn gemini_failure_falls_back_to_openai() { + let chain = ProviderChain::from_providers(vec![ + Box::new(MockProvider { + name: "gemini", + response: Err("subprocess timed out".into()), + available: true, + }), + Box::new(MockProvider { + name: "openai", + response: Ok("from openai".into()), + available: true, + }), + ]); + let result = chain.complete(&test_request()).await.unwrap(); + assert_eq!(result, "from openai"); + } } diff --git a/crates/noxa-llm/src/error.rs b/crates/noxa-llm/src/error.rs index 19f75f3..ecc12d8 100644 --- a/crates/noxa-llm/src/error.rs +++ b/crates/noxa-llm/src/error.rs @@ -4,6 +4,12 @@ pub enum LlmError { #[error("HTTP error: {0}")] Http(#[from] reqwest::Error), + #[error("subprocess error: {0}")] + Subprocess(#[from] std::io::Error), + + #[error("subprocess timed out")] 
+ Timeout, + #[error("no providers available")] NoProviders, diff --git a/crates/noxa-llm/src/extract.rs b/crates/noxa-llm/src/extract.rs index 35c6f77..9216b0d 100644 --- a/crates/noxa-llm/src/extract.rs +++ b/crates/noxa-llm/src/extract.rs @@ -1,11 +1,45 @@ /// Schema-based and prompt-based LLM extraction. /// Both functions build a system prompt, send content to the LLM, and parse JSON back. +use jsonschema; + use crate::clean::strip_thinking_tags; use crate::error::LlmError; use crate::provider::{CompletionRequest, LlmProvider, Message}; +/// Validate a JSON value against a schema. Returns Ok(()) on success or +/// Err(LlmError::InvalidJson) with a concise error message on failure. +fn validate_schema( + value: &serde_json::Value, + schema: &serde_json::Value, +) -> Result<(), LlmError> { + let compiled = jsonschema::validator_for(schema).map_err(|e| { + LlmError::InvalidJson(format!("invalid schema: {e}")) + })?; + + let errors: Vec<String> = compiled + .iter_errors(value) + .map(|e| format!("{} at {}", e, e.instance_path())) + .collect(); + + if errors.is_empty() { + Ok(()) + } else { + Err(LlmError::InvalidJson(format!( + "schema validation failed: {}", + errors.join("; ") + ))) + } +} + /// Extract structured JSON from content using a JSON schema. /// The schema tells the LLM exactly what fields to extract and their types. +/// +/// Retry policy: +/// - If the response cannot be parsed as JSON at all: retry once with the +/// identical request (handles transient formatting issues). +/// - If the response is valid JSON but fails schema validation: return +/// `LlmError::InvalidJson` immediately — the schema is likely unsatisfiable +/// for this content, so retrying would produce the same result. 
pub async fn extract_json( content: &str, schema: &serde_json::Value, @@ -37,7 +71,22 @@ pub async fn extract_json( }; let response = provider.complete(&request).await?; - parse_json_response(&response) + + match parse_json_response(&response) { + Ok(value) => { + // Valid JSON — now validate against the schema. + // Schema mismatches do not retry (unsatisfiable → same result). + validate_schema(&value, schema)?; + Ok(value) + } + Err(_parse_err) => { + // Unparseable JSON — retry once with the identical request. + let retry_response = provider.complete(&request).await?; + let value = parse_json_response(&retry_response)?; + validate_schema(&value, schema)?; + Ok(value) + } + } } /// Extract information using a natural language prompt. @@ -184,4 +233,130 @@ mod tests { assert_eq!(result["emails"][0], "test@example.com"); } + + // ── Schema validation ───────────────────────────────────────────────────── + + #[tokio::test] + async fn schema_validation_passes_for_matching_json() { + let schema = serde_json::json!({ + "type": "object", + "required": ["price"], + "properties": { + "price": { "type": "number" } + } + }); + let mock = MockProvider::ok(r#"{"price": 9.99}"#); + let result = extract_json("content", &schema, &mock, None).await.unwrap(); + assert_eq!(result["price"], 9.99); + } + + #[tokio::test] + async fn schema_validation_fails_for_wrong_type() { + let schema = serde_json::json!({ + "type": "object", + "required": ["price"], + "properties": { + "price": { "type": "number" } + } + }); + // Model returns valid JSON but wrong type ("string" instead of number). + // Should NOT retry (schema mismatch ≠ parse failure) — returns InvalidJson immediately. 
+ let mock = MockProvider::ok(r#"{"price": "not-a-number"}"#); + let result = extract_json("content", &schema, &mock, None).await; + assert!( + matches!(result, Err(LlmError::InvalidJson(_))), + "expected InvalidJson for schema mismatch, got {result:?}" + ); + } + + #[tokio::test] + async fn schema_validation_fails_for_missing_required_field() { + let schema = serde_json::json!({ + "type": "object", + "required": ["title"], + "properties": { + "title": { "type": "string" } + } + }); + let mock = MockProvider::ok(r#"{"other": "value"}"#); + let result = extract_json("content", &schema, &mock, None).await; + assert!(matches!(result, Err(LlmError::InvalidJson(_)))); + } + + #[tokio::test] + async fn parse_failure_triggers_one_retry() { + use crate::testing::mock::SequenceMockProvider; + + let schema = serde_json::json!({ + "type": "object", + "properties": { "title": { "type": "string" } } + }); + + // First call: unparseable JSON. Second call: valid JSON matching schema. + let mock = SequenceMockProvider::new( + "mock-seq", + vec![ + Ok("this is not json at all".to_string()), + Ok(r#"{"title": "Retry succeeded"}"#.to_string()), + ], + ); + + let result = extract_json("content", &schema, &mock, None) + .await + .unwrap(); + assert_eq!(result["title"], "Retry succeeded"); + } + + #[tokio::test] + async fn both_attempts_fail_returns_invalid_json() { + use crate::testing::mock::SequenceMockProvider; + + let schema = serde_json::json!({ + "type": "object", + "properties": { "title": { "type": "string" } } + }); + + let mock = SequenceMockProvider::new( + "mock-seq", + vec![ + Ok("not json".to_string()), + Ok("also not json".to_string()), + ], + ); + + let result = extract_json("content", &schema, &mock, None).await; + assert!( + matches!(result, Err(LlmError::InvalidJson(_))), + "expected InvalidJson after both attempts fail" + ); + } + + #[tokio::test] + async fn schema_mismatch_does_not_retry() { + use crate::testing::mock::SequenceMockProvider; + + let schema = 
serde_json::json!({ + "type": "object", + "required": ["price"], + "properties": { + "price": { "type": "number" } + } + }); + + // Both calls return valid JSON with wrong schema — but only one call should happen. + let mock = SequenceMockProvider::new( + "mock-seq", + vec![ + Ok(r#"{"price": "wrong-type"}"#.to_string()), + Ok(r#"{"price": 9.99}"#.to_string()), // would succeed — but shouldn't be called + ], + ); + + // Should return InvalidJson without calling second response. + let result = extract_json("content", &schema, &mock, None).await; + assert!( + matches!(result, Err(LlmError::InvalidJson(_))), + "schema mismatch should not trigger retry" + ); + } } diff --git a/crates/noxa-llm/src/lib.rs b/crates/noxa-llm/src/lib.rs index 15664b9..250ae88 100644 --- a/crates/noxa-llm/src/lib.rs +++ b/crates/noxa-llm/src/lib.rs @@ -1,8 +1,9 @@ -/// noxa-llm: LLM integration with local-first hybrid architecture. +/// noxa-llm: LLM integration with Gemini-CLI-first hybrid architecture. /// -/// Provider chain tries Ollama (local) first, falls back to OpenAI, then Anthropic. -/// Provides schema-based extraction, prompt extraction, and summarization -/// on top of noxa-core's content pipeline. +/// Provider chain: Gemini CLI (primary) → OpenAI → Ollama → Anthropic. +/// Gemini CLI requires the `gemini` binary on PATH; GEMINI_MODEL env var sets the model. +/// Provides schema-validated extraction (with one retry on parse failure), +/// prompt extraction, and summarization on top of noxa-core's content pipeline. pub mod chain; pub mod clean; pub mod error; diff --git a/crates/noxa-llm/src/providers/gemini_cli.rs b/crates/noxa-llm/src/providers/gemini_cli.rs new file mode 100644 index 0000000..9d2d2d7 --- /dev/null +++ b/crates/noxa-llm/src/providers/gemini_cli.rs @@ -0,0 +1,392 @@ +/// Gemini CLI provider — shells out to `gemini -p` for completions. +/// Primary provider in the default chain; requires the `gemini` binary on PATH. 
+/// +/// Prompts are passed via the `-p` flag (not via stdin or as a positional) to prevent +/// command injection from web-scraped content. Output is parsed from `--output-format json`. +/// +/// # Startup optimizations +/// +/// The gemini CLI is an agentic Node.js application that connects to every configured MCP +/// server at startup (the user has 6). Without mitigation this can add 10-60+ seconds per +/// call as those servers spin up and time out. +/// +/// Two flags reduce this: +/// - `--extensions ""` — skips extension loading (~3 s saved) +/// - `current_dir` set to a temp workdir containing `.gemini/settings.json` with +/// `{"mcpServers":{}}` — workspace settings override user settings, so all 6 MCP +/// servers are skipped at subprocess startup (major speedup). +/// +/// The workdir is created once at construction and reused for every call. +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Duration; + +use async_trait::async_trait; +use tokio::process::Command; +use tokio::sync::Semaphore; +use tokio::time::timeout; +use tracing::debug; + +use crate::clean::strip_thinking_tags; +use crate::error::LlmError; +use crate::provider::{CompletionRequest, LlmProvider}; + +/// Maximum concurrent Gemini subprocess calls. +const MAX_CONCURRENT: usize = 6; +/// Subprocess deadline — prevents hung `gemini` processes blocking the chain. +const SUBPROCESS_TIMEOUT: Duration = Duration::from_secs(60); + +/// Fixed workdir used for every subprocess call. +/// A workspace-level `.gemini/settings.json` here overrides the user's MCP server config. +const NOXA_GEMINI_WORKDIR: &str = "/tmp/noxa-gemini"; + +pub struct GeminiCliProvider { + default_model: String, + semaphore: Arc<Semaphore>, + /// Workdir with a minimal `.gemini/settings.json` that disables MCP servers. + workdir: PathBuf, +} + +impl GeminiCliProvider { + /// Construct the provider. + /// Model resolves as: `model` arg → `GEMINI_MODEL` env → `"gemini-2.5-pro"`. 
+ pub fn new(model: Option<String>) -> Self { + let default_model = model + .or_else(|| std::env::var("GEMINI_MODEL").ok()) + .filter(|s| !s.is_empty()) + .unwrap_or_else(|| "gemini-2.5-pro".into()); + + let workdir = PathBuf::from(NOXA_GEMINI_WORKDIR); + ensure_gemini_workdir(&workdir); + + Self { + default_model, + semaphore: Arc::new(Semaphore::new(MAX_CONCURRENT)), + workdir, + } + } + + #[cfg(test)] + fn default_model(&self) -> &str { + &self.default_model + } +} + +#[async_trait] +impl LlmProvider for GeminiCliProvider { + async fn complete(&self, request: &CompletionRequest) -> Result<String, LlmError> { + let model = if request.model.is_empty() { + &self.default_model + } else { + &request.model + }; + + // Build the prompt text from all messages. + let prompt = build_prompt(&request.messages); + + // Acquire concurrency slot before spawning. + let _permit = self + .semaphore + .acquire() + .await + .map_err(|_| LlmError::ProviderError("gemini semaphore closed".into()))?; + + let mut cmd = Command::new("gemini"); + // -p STRING: headless mode with prompt as the flag value (never positional arg). + // Passing via -p prevents command injection; the value is never interpreted as a shell command. + cmd.arg("-p").arg(&prompt); + cmd.arg("--model").arg(model); + // Always request structured JSON output so we can extract the `response` field + // and skip any preceding noise lines (e.g. MCP status warnings). + cmd.arg("--output-format").arg("json"); + // --yolo suppresses any interactive confirmation prompts in headless mode. + cmd.arg("--yolo"); + // --extensions "" skips loading user extensions (~3 s startup savings). + cmd.arg("--extensions").arg(""); + // Workspace settings in self.workdir override the user's ~/.gemini/settings.json, + // replacing the user's MCP server list with {} so none are spawned at startup. + // Without this, each of the user's MCP servers adds latency to every call. 
+ cmd.current_dir(&self.workdir); + + cmd.stdin(std::process::Stdio::null()); + cmd.stdout(std::process::Stdio::piped()); + cmd.stderr(std::process::Stdio::piped()); + + debug!(model, workdir = %self.workdir.display(), "spawning gemini subprocess"); + + let child = cmd.spawn().map_err(LlmError::Subprocess)?; + + // Bounded wait — prevents indefinite hangs on auth expiry or network stall. + let output = match timeout(SUBPROCESS_TIMEOUT, child.wait_with_output()).await { + Ok(Ok(out)) => out, + Ok(Err(e)) => return Err(LlmError::Subprocess(e)), + Err(_elapsed) => return Err(LlmError::Timeout), + }; + + if !output.status.success() { + let stderr_preview = String::from_utf8_lossy(&output.stderr); + let preview = &stderr_preview[..stderr_preview.len().min(500)]; + return Err(LlmError::ProviderError(format!( + "gemini exited with {}: {preview}", + output.status + ))); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + let response = extract_response_from_output(&stdout)?; + let cleaned = strip_code_fences(strip_thinking_tags(&response).trim()); + Ok(cleaned) + } + + async fn is_available(&self) -> bool { + // Pure PATH check — no inference call, fast. + matches!( + Command::new("gemini") + .arg("--version") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .await, + Ok(s) if s.success() + ) + } + + fn name(&self) -> &str { + "gemini" + } +} + +/// Parse the `response` field from gemini's `--output-format json` output. +/// +/// The CLI emits lines before the JSON object (e.g. MCP status warnings). +/// We find the first `{` to locate the JSON, parse it, and extract `.response`. 
+fn extract_response_from_output(stdout: &str) -> Result<String, LlmError> { + let json_start = stdout.find('{').ok_or_else(|| { + let preview = &stdout[..stdout.len().min(300)]; + LlmError::ProviderError(format!("gemini produced no JSON output: {preview}")) + })?; + + let json_str = &stdout[json_start..]; + let outer: serde_json::Value = serde_json::from_str(json_str).map_err(|e| { + let preview = &json_str[..json_str.len().min(300)]; + LlmError::ProviderError(format!("failed to parse gemini JSON output: {e} — {preview}")) + })?; + + // `response` holds the model's actual text output. + outer["response"] + .as_str() + .ok_or_else(|| { + LlmError::ProviderError(format!( + "gemini JSON output missing 'response' field: {}", + &json_str[..json_str.len().min(300)] + )) + }) + .map(|s| s.to_string()) +} + +/// Create the noxa gemini workdir with a minimal workspace settings file. + +/// The `.gemini/settings.json` written here overrides the user's `~/.gemini/settings.json` +/// for any `gemini` subprocess run from this directory. Setting `mcpServers` to `{}` prevents +/// the CLI from spawning the user's configured MCP servers on every headless call. +/// +/// Errors are intentionally ignored — if the write fails, the subprocess still works, +/// just without the startup optimization (and with a warning in the logs). +fn ensure_gemini_workdir(workdir: &std::path::Path) { + let settings_dir = workdir.join(".gemini"); + let settings_path = settings_dir.join("settings.json"); + + if settings_path.exists() { + return; + } + + if let Err(e) = std::fs::create_dir_all(&settings_dir) { + tracing::warn!(path = %settings_dir.display(), error = %e, "failed to create gemini workdir"); + return; + } + + // Minimal workspace settings: disable all MCP servers. + // Workspace settings override ~/.gemini/settings.json per gemini CLI docs. 
+ let content = r#"{"mcpServers":{}}"#; + if let Err(e) = std::fs::write(&settings_path, content) { + tracing::warn!(path = %settings_path.display(), error = %e, "failed to write gemini workspace settings"); + } +} + +/// Concatenate all messages into a single prompt string for the CLI. +fn build_prompt(messages: &[crate::provider::Message]) -> String { + messages + .iter() + .map(|m| match m.role.as_str() { + "system" => format!("[System]: {}", m.content), + "assistant" => format!("[Assistant]: {}", m.content), + _ => m.content.clone(), + }) + .collect::<Vec<_>>() + .join("\n\n") +} + +/// Strip markdown code fences from a response string. +fn strip_code_fences(s: &str) -> String { + let trimmed = s.trim(); + if trimmed.starts_with("```") { + let without_opener = trimmed + .strip_prefix("```json") + .or_else(|| trimmed.strip_prefix("```")) + .unwrap_or(trimmed); + without_opener + .strip_suffix("```") + .unwrap_or(without_opener) + .trim() + .to_string() + } else { + trimmed.to_string() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // ── Construction ────────────────────────────────────────────────────────── + + #[test] + fn explicit_model_used() { + let p = GeminiCliProvider::new(Some("gemini-1.5-flash".into())); + assert_eq!(p.default_model(), "gemini-1.5-flash"); + assert_eq!(p.name(), "gemini"); + } + + #[test] + fn default_model_fallback() { + // Explicit None + no GEMINI_MODEL env → hardcoded default. + // We unset the env to avoid flakiness (it may or may not be set). + unsafe { std::env::remove_var("GEMINI_MODEL") }; + let p = GeminiCliProvider::new(None); + assert_eq!(p.default_model(), "gemini-2.5-pro"); + } + + // Env var tests mutate process-global state and race with parallel tests. 
+ // Run in isolation if needed: + // cargo test -p noxa-llm env_model_override -- --ignored --test-threads=1 + #[test] + #[ignore = "mutates process env; run with --test-threads=1"] + fn env_model_override() { + unsafe { std::env::set_var("GEMINI_MODEL", "gemini-1.5-pro") }; + let p = GeminiCliProvider::new(None); + assert_eq!(p.default_model(), "gemini-1.5-pro"); + unsafe { std::env::remove_var("GEMINI_MODEL") }; + } + + // ── build_prompt ────────────────────────────────────────────────────────── + + #[test] + fn build_prompt_user_only() { + use crate::provider::Message; + let messages = vec![Message { + role: "user".into(), + content: "hello world".into(), + }]; + assert_eq!(build_prompt(&messages), "hello world"); + } + + #[test] + fn build_prompt_system_and_user() { + use crate::provider::Message; + let messages = vec![ + Message { + role: "system".into(), + content: "You are helpful.".into(), + }, + Message { + role: "user".into(), + content: "Tell me something.".into(), + }, + ]; + let result = build_prompt(&messages); + assert!(result.contains("[System]: You are helpful.")); + assert!(result.contains("Tell me something.")); + } + + // ── extract_response_from_output ────────────────────────────────────────── + + #[test] + fn extracts_response_from_clean_json() { + let stdout = r#"{"session_id":"abc","response":"Hello world","stats":{}}"#; + assert_eq!(extract_response_from_output(stdout).unwrap(), "Hello world"); + } + + #[test] + fn extracts_response_skipping_mcp_noise() { + // MCP warning line appears before the JSON object in real gemini output. + let stdout = "MCP issues detected. Run /mcp list for status.\n{\"session_id\":\"abc\",\"response\":\"the answer\",\"stats\":{}}"; + assert_eq!( + extract_response_from_output(stdout).unwrap(), + "the answer" + ); + } + + #[test] + fn error_when_no_json_in_output() { + let result = extract_response_from_output("MCP issues detected. 
No JSON follows."); + assert!(matches!(result, Err(LlmError::ProviderError(_)))); + } + + #[test] + fn error_when_response_field_missing() { + let stdout = r#"{"session_id":"abc","stats":{}}"#; + let result = extract_response_from_output(stdout); + assert!(matches!(result, Err(LlmError::ProviderError(_)))); + } + + // ── strip_code_fences ───────────────────────────────────────────────────── + + #[test] + fn strips_json_fence() { + let input = "```json\n{\"key\": \"value\"}\n```"; + assert_eq!(strip_code_fences(input), "{\"key\": \"value\"}"); + } + + #[test] + fn strips_plain_fence() { + let input = "```\nhello\n```"; + assert_eq!(strip_code_fences(input), "hello"); + } + + #[test] + fn passthrough_no_fence() { + let input = "{\"key\": \"value\"}"; + assert_eq!(strip_code_fences(input), "{\"key\": \"value\"}"); + } + + // ── is_available returns false when binary absent ────────────────────────── + + #[tokio::test] + async fn unavailable_when_binary_missing() { + let result = tokio::process::Command::new("__noxa_nonexistent_binary_xyz__") + .arg("--version") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .await; + assert!(result.is_err(), "missing binary should fail to spawn"); + } + + // ── thinking tag stripping ──────────────────────────────────────────────── + + #[test] + fn strips_thinking_tags_from_output() { + let raw = "internal reasoning{\"result\": true}"; + let after_thinking = strip_thinking_tags(raw); + let after_fences = strip_code_fences(after_thinking.trim()); + assert_eq!(after_fences, "{\"result\": true}"); + } + + #[test] + fn strips_code_fence_after_thinking() { + let raw = "let me check\n```json\n{\"ok\": 1}\n```"; + let after_thinking = strip_thinking_tags(raw); + let after_fences = strip_code_fences(after_thinking.trim()); + assert_eq!(after_fences, "{\"ok\": 1}"); + } +} diff --git a/crates/noxa-llm/src/providers/mod.rs b/crates/noxa-llm/src/providers/mod.rs index c6b8f60..b1a8736 100644 --- 
a/crates/noxa-llm/src/providers/mod.rs +++ b/crates/noxa-llm/src/providers/mod.rs @@ -1,4 +1,5 @@ pub mod anthropic; +pub mod gemini_cli; pub mod ollama; pub mod openai; diff --git a/crates/noxa-llm/src/providers/ollama.rs b/crates/noxa-llm/src/providers/ollama.rs index b42a584..d728e67 100644 --- a/crates/noxa-llm/src/providers/ollama.rs +++ b/crates/noxa-llm/src/providers/ollama.rs @@ -2,6 +2,7 @@ /// First choice in the provider chain: free, private, fast on Apple Silicon. use async_trait::async_trait; use serde_json::json; +use std::time::Duration; use crate::clean::strip_thinking_tags; use crate::error::LlmError; @@ -96,7 +97,10 @@ impl LlmProvider for OllamaProvider { async fn is_available(&self) -> bool { let url = format!("{}/api/tags", self.base_url); - matches!(self.client.get(&url).send().await, Ok(r) if r.status().is_success()) + matches!( + tokio::time::timeout(Duration::from_millis(500), self.client.get(&url).send()).await, + Ok(Ok(r)) if r.status().is_success() + ) } fn name(&self) -> &str { diff --git a/crates/noxa-llm/src/testing.rs b/crates/noxa-llm/src/testing.rs index 66157a2..da5cc0b 100644 --- a/crates/noxa-llm/src/testing.rs +++ b/crates/noxa-llm/src/testing.rs @@ -4,6 +4,9 @@ /// extract, chain, and other modules that need a fake LLM backend. #[cfg(test)] pub(crate) mod mock { + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + use async_trait::async_trait; use crate::error::LlmError; @@ -45,4 +48,48 @@ pub(crate) mod mock { self.name } } + + /// A mock provider that returns responses from a sequence. + /// Call N → returns responses[N], clamping at the end (calls past the last entry repeat the final response). + /// Useful for testing first-failure / second-success retry paths. 
+ pub struct SequenceMockProvider { + pub name: &'static str, + pub responses: Vec<Result<String, String>>, + pub available: bool, + call_count: Arc<AtomicUsize>, + } + + impl SequenceMockProvider { + pub fn new( + name: &'static str, + responses: Vec<Result<String, String>>, + ) -> Self { + Self { + name, + responses, + available: true, + call_count: Arc::new(AtomicUsize::new(0)), + } + } + } + + #[async_trait] + impl LlmProvider for SequenceMockProvider { + async fn complete(&self, _request: &CompletionRequest) -> Result<String, LlmError> { + let idx = self.call_count.fetch_add(1, Ordering::SeqCst); + let response = &self.responses[idx.min(self.responses.len() - 1)]; + match response { + Ok(text) => Ok(text.clone()), + Err(msg) => Err(LlmError::ProviderError(msg.clone())), + } + } + + async fn is_available(&self) -> bool { + self.available + } + + fn name(&self) -> &str { + self.name + } + } } diff --git a/crates/noxa-mcp/Cargo.toml b/crates/noxa-mcp/Cargo.toml index 16f4f2e..a82757b 100644 --- a/crates/noxa-mcp/Cargo.toml +++ b/crates/noxa-mcp/Cargo.toml @@ -5,6 +5,10 @@ version.workspace = true edition.workspace = true license.workspace = true +[lib] +name = "noxa_mcp" +path = "src/lib.rs" + [[bin]] name = "noxa-mcp" path = "src/main.rs" @@ -14,8 +18,8 @@ noxa-core = { workspace = true } noxa-fetch = { workspace = true } noxa-llm = { workspace = true } noxa-pdf = { workspace = true } -rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] } -schemars = "1.0" +rmcp = { workspace = true } +schemars = { workspace = true } dotenvy = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } @@ -24,4 +28,4 @@ tracing = { workspace = true } tracing-subscriber = { workspace = true } reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } url = "2" -dirs = "6.0.0" +dirs = { workspace = true } diff --git a/crates/noxa-mcp/src/lib.rs b/crates/noxa-mcp/src/lib.rs new file mode 100644 index 0000000..fe75c97 --- /dev/null +++ b/crates/noxa-mcp/src/lib.rs @@ 
-0,0 +1,20 @@ +/// noxa-mcp library wrapper. +/// +/// This exposes the MCP server so it can be embedded by the `noxa` CLI via +/// `noxa mcp` without duplicating the transport/bootstrap code. +/// +/// Callers must initialize tracing before calling `run()`. Stdout must remain +/// untouched after `run()` begins because it carries the MCP wire protocol. +pub(crate) mod cloud; +pub(crate) mod server; +pub(crate) mod tools; + +use rmcp::ServiceExt; +use rmcp::transport::stdio; + +/// Start the MCP server over stdio and block until the client disconnects. +pub async fn run() -> Result<(), Box<dyn std::error::Error>> { + let service = server::NoxaMcp::new().await.serve(stdio()).await?; + service.waiting().await?; + Ok(()) +} diff --git a/crates/noxa-mcp/src/main.rs b/crates/noxa-mcp/src/main.rs index 5abde92..fdc71c0 100644 --- a/crates/noxa-mcp/src/main.rs +++ b/crates/noxa-mcp/src/main.rs @@ -1,15 +1,6 @@ /// noxa-mcp: MCP (Model Context Protocol) server for noxa. /// Exposes web extraction tools over stdio transport for AI agents /// like Claude Desktop, Claude Code, and other MCP clients. 
-mod cloud; -mod server; -mod tools; - -use rmcp::ServiceExt; -use rmcp::transport::stdio; - -use server::NoxaMcp; - #[tokio::main] async fn main() -> Result<(), Box<dyn std::error::Error>> { dotenvy::dotenv().ok(); @@ -21,8 +12,5 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> { .with_ansi(false) .init(); - let service = NoxaMcp::new().await.serve(stdio()).await?; - - service.waiting().await?; - Ok(()) + noxa_mcp::run().await } diff --git a/crates/noxa-mcp/src/server.rs b/crates/noxa-mcp/src/server.rs index 804861d..4b7bb44 100644 --- a/crates/noxa-mcp/src/server.rs +++ b/crates/noxa-mcp/src/server.rs @@ -89,7 +89,7 @@ impl NoxaMcp { let chain = noxa_llm::ProviderChain::default().await; let llm_chain = if chain.is_empty() { - warn!("no LLM providers available -- extract/summarize tools will fail"); + warn!("no LLM providers available (gemini CLI, OPENAI_API_KEY, ANTHROPIC_API_KEY) -- extract/summarize tools will fail"); None } else { info!(providers = chain.len(), "LLM provider chain ready"); @@ -334,7 +334,7 @@ impl NoxaMcp { // No local LLM — fall back to cloud API directly if self.llm_chain.is_none() { let cloud = self.cloud.as_ref().ok_or( - "No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.", + "No LLM providers available. Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.", )?; let mut body = json!({"url": params.url}); if let Some(ref schema) = params.schema { @@ -387,7 +387,7 @@ impl NoxaMcp { // No local LLM — fall back to cloud API directly if self.llm_chain.is_none() { let cloud = self.cloud.as_ref().ok_or( - "No LLM providers available. 
Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.", )?; let mut body = json!({"url": params.url}); if let Some(sentences) = params.max_sentences { diff --git a/env.example b/env.example index d15b729..aad81c5 100644 --- a/env.example +++ b/env.example @@ -1,43 +1,20 @@ -# ============================================ -# Noxa Configuration -# Copy to .env and fill in your values -# ============================================ +# Secrets, URLs, and path overrides only — everything else goes in config.json +# See config.example.json for the full list of configurable defaults. -# --- LLM Providers --- +# Cloud API key (required for --cloud / --research) +NOXA_API_KEY= -# Ollama (local, default provider) -OLLAMA_HOST=http://localhost:11434 -OLLAMA_MODEL=qwen3:8b +# Single proxy URL (or use NOXA_PROXY_FILE for pool rotation) +NOXA_PROXY= -# OpenAI (optional cloud fallback) -# OPENAI_API_KEY — set your OpenAI key -# OPENAI_BASE_URL — defaults to https://api.openai.com/v1 -# OPENAI_MODEL — defaults to gpt-4o-mini +# Proxy pool file path for rotating proxies +NOXA_PROXY_FILE= -# Anthropic (optional cloud fallback) -# ANTHROPIC_API_KEY — set your Anthropic key -# ANTHROPIC_MODEL — defaults to claude-sonnet-4-20250514 +# Webhook URL for completion notifications +NOXA_WEBHOOK_URL= -# --- Proxy --- +# LLM base URL (Ollama or OpenAI-compatible endpoint) +NOXA_LLM_BASE_URL= -# Single proxy -# NOXA_PROXY=http://user:pass@host:port - -# Proxy file (one per line: host:port:user:pass) -# NOXA_PROXY_FILE=/path/to/proxies.txt - -# --- Server (noxa-server only) --- -# NOXA_PORT=3000 -# NOXA_HOST=0.0.0.0 -# NOXA_AUTH_KEY=your-auth-key -# NOXA_MAX_CONCURRENCY=50 -# NOXA_JOB_TTL_SECS=3600 -# NOXA_MAX_JOBS=100 - -# --- CLI LLM overrides --- -# NOXA_LLM_PROVIDER=ollama -# NOXA_LLM_MODEL=qwen3:8b -# NOXA_LLM_BASE_URL=http://localhost:11434 - -# --- Logging --- -# NOXA_LOG=info +# Optional: path to a non-default config file (default: 
./config.json) +# NOXA_CONFIG=/path/to/my-config.json