mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
Merge pull request #2 from jmagar/feature/noxa-mcp-subcommand
refactor: add noxa mcp subcommand
This commit is contained in:
commit
464eb1baec
23 changed files with 1874 additions and 199 deletions
16
.gitignore
vendored
16
.gitignore
vendored
|
|
@ -1,5 +1,21 @@
|
|||
target/
|
||||
.DS_Store
|
||||
.env
|
||||
config.json
|
||||
proxies.txt
|
||||
.claude/skills/
|
||||
.omc
|
||||
.lavra
|
||||
.beads
|
||||
.cache
|
||||
docs/plans
|
||||
docs/superpowers
|
||||
docs/reports
|
||||
docs/sessions
|
||||
benchmarks
|
||||
docs
|
||||
|
||||
# Beads / Dolt files (added by bd init)
|
||||
.dolt/
|
||||
*.db
|
||||
.beads-credential-key
|
||||
|
|
|
|||
16
CLAUDE.md
16
CLAUDE.md
|
|
@ -15,8 +15,8 @@ noxa/
|
|||
# + proxy pool rotation (per-request)
|
||||
# + PDF content-type detection
|
||||
# + document parsing (DOCX, XLSX, CSV)
|
||||
noxa-llm/ # LLM provider chain (Ollama -> OpenAI -> Anthropic)
|
||||
# + JSON schema extraction, prompt extraction, summarization
|
||||
noxa-llm/ # LLM provider chain (Gemini CLI -> OpenAI -> Ollama -> Anthropic)
|
||||
# + JSON schema extraction (validated + retry), prompt extraction, summarization
|
||||
noxa-pdf/ # PDF text extraction via pdf-extract
|
||||
noxa-mcp/ # MCP server (Model Context Protocol) for AI agents
|
||||
noxa/ # CLI binary
|
||||
|
|
@ -48,8 +48,10 @@ Two binaries: `noxa` (CLI), `noxa-mcp` (MCP server).
|
|||
- `search.rs` — Web search via Serper.dev with parallel result scraping
|
||||
|
||||
### LLM Modules (`noxa-llm`)
|
||||
- Provider chain: Ollama (local-first) -> OpenAI -> Anthropic
|
||||
- JSON schema extraction, prompt-based extraction, summarization
|
||||
- Provider chain: Gemini CLI (primary) -> OpenAI -> Ollama -> Anthropic
|
||||
- Gemini CLI requires the `gemini` binary on PATH; `GEMINI_MODEL` env var controls model (default: `gemini-2.5-pro`)
|
||||
- JSON schema extraction with jsonschema validation; parse failures retry once; schema mismatches fail immediately
|
||||
- Prompt-based extraction, summarization
|
||||
|
||||
### PDF Modules (`noxa-pdf`)
|
||||
- PDF text extraction via pdf-extract crate
|
||||
|
|
@ -105,11 +107,15 @@ noxa https://example.com --diff-with snap.json
|
|||
# Brand extraction
|
||||
noxa https://example.com --brand
|
||||
|
||||
# LLM features (Ollama local-first)
|
||||
# LLM features (Gemini CLI primary; requires `gemini` on PATH)
|
||||
noxa https://example.com --summarize
|
||||
noxa https://example.com --extract-prompt "Get all pricing tiers"
|
||||
noxa https://example.com --extract-json '{"type":"object","properties":{"title":{"type":"string"}}}'
|
||||
|
||||
# Force a specific LLM provider
|
||||
noxa https://example.com --llm-provider gemini --summarize
|
||||
noxa https://example.com --llm-provider openai --summarize
|
||||
|
||||
# PDF (auto-detected via Content-Type)
|
||||
noxa https://example.com/report.pdf
|
||||
|
||||
|
|
|
|||
246
Cargo.lock
generated
246
Cargo.lock
generated
|
|
@ -35,7 +35,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"getrandom 0.3.4",
|
||||
"once_cell",
|
||||
"serde",
|
||||
"version_check",
|
||||
"zerocopy",
|
||||
]
|
||||
|
|
@ -64,6 +66,12 @@ dependencies = [
|
|||
"alloc-no-stdlib",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "allocator-api2"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
|
|
@ -206,6 +214,21 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit-set"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
|
||||
dependencies = [
|
||||
"bit-vec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit-vec"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.11.0"
|
||||
|
|
@ -246,6 +269,12 @@ dependencies = [
|
|||
"openssl-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "borrow-or-share"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc0b364ead1874514c8c2855ab558056ebfeb775653e7ae45ff72f28f8f3166c"
|
||||
|
||||
[[package]]
|
||||
name = "brotli"
|
||||
version = "8.0.2"
|
||||
|
|
@ -273,6 +302,12 @@ version = "3.20.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
|
||||
|
||||
[[package]]
|
||||
name = "bytecount"
|
||||
version = "0.6.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.5.0"
|
||||
|
|
@ -601,6 +636,12 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "data-encoding"
|
||||
version = "2.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea"
|
||||
|
||||
[[package]]
|
||||
name = "debug_unsafe"
|
||||
version = "0.1.4"
|
||||
|
|
@ -726,6 +767,15 @@ version = "1.15.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
||||
|
||||
[[package]]
|
||||
name = "email_address"
|
||||
version = "0.2.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e079f19b08ca6239f47f8ba8509c11cf3ea30095831f7fed61441475edd8c449"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.35"
|
||||
|
|
@ -760,6 +810,17 @@ dependencies = [
|
|||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fancy-regex"
|
||||
version = "0.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
|
||||
dependencies = [
|
||||
"bit-set",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fast-float2"
|
||||
version = "0.2.3"
|
||||
|
|
@ -789,6 +850,17 @@ dependencies = [
|
|||
"zlib-rs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fluent-uri"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc74ac4d8359ae70623506d512209619e5cf8f347124910440dbc221714b328e"
|
||||
dependencies = [
|
||||
"borrow-or-share",
|
||||
"ref-cast",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
|
|
@ -801,6 +873,12 @@ version = "0.1.5"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||
|
||||
[[package]]
|
||||
name = "foldhash"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types"
|
||||
version = "0.5.0"
|
||||
|
|
@ -837,6 +915,16 @@ dependencies = [
|
|||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fraction"
|
||||
version = "0.15.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0f158e3ff0a1b334408dc9fb811cd99b446986f4d8b741bb08f9df1604085ae7"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"num",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fs_extra"
|
||||
version = "1.3.0"
|
||||
|
|
@ -1037,7 +1125,7 @@ version = "0.15.5"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
|
||||
dependencies = [
|
||||
"foldhash",
|
||||
"foldhash 0.1.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1045,6 +1133,11 @@ name = "hashbrown"
|
|||
version = "0.16.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
|
||||
dependencies = [
|
||||
"allocator-api2",
|
||||
"equivalent",
|
||||
"foldhash 0.2.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
|
|
@ -1410,6 +1503,33 @@ dependencies = [
|
|||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jsonschema"
|
||||
version = "0.46.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "84695c6689b01384700a3d93acecbd07231ee6fff1bf22ae980b4c307e6ddfd5"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"bytecount",
|
||||
"data-encoding",
|
||||
"email_address",
|
||||
"fancy-regex",
|
||||
"fraction",
|
||||
"getrandom 0.3.4",
|
||||
"idna",
|
||||
"itoa",
|
||||
"num-cmp",
|
||||
"num-traits",
|
||||
"percent-encoding",
|
||||
"referencing",
|
||||
"regex",
|
||||
"regex-syntax",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"unicode-general-category",
|
||||
"uuid-simd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.5.0"
|
||||
|
|
@ -1575,6 +1695,12 @@ version = "2.8.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
|
||||
|
||||
[[package]]
|
||||
name = "micromap"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c2a86d3146ed3995b5913c414f6664344b9617457320782e64f0bb44afd49d74"
|
||||
|
||||
[[package]]
|
||||
name = "minimal-lexical"
|
||||
version = "0.2.1"
|
||||
|
|
@ -1627,10 +1753,12 @@ dependencies = [
|
|||
"noxa-core",
|
||||
"noxa-fetch",
|
||||
"noxa-llm",
|
||||
"noxa-mcp",
|
||||
"noxa-pdf",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tracing",
|
||||
|
|
@ -1683,6 +1811,7 @@ name = "noxa-llm"
|
|||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"jsonschema",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
|
@ -1730,12 +1859,82 @@ dependencies = [
|
|||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
|
||||
dependencies = [
|
||||
"num-bigint",
|
||||
"num-complex",
|
||||
"num-integer",
|
||||
"num-iter",
|
||||
"num-rational",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
|
||||
dependencies = [
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-cmp"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "63335b2e2c34fae2fb0aa2cecfd9f0832a1e24b3b32ecec612c3426d46dc8aaa"
|
||||
|
||||
[[package]]
|
||||
name = "num-complex"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-conv"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967"
|
||||
|
||||
[[package]]
|
||||
name = "num-integer"
|
||||
version = "0.1.46"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-iter"
|
||||
version = "0.1.45"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-rational"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
|
||||
dependencies = [
|
||||
"num-bigint",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.19"
|
||||
|
|
@ -1774,6 +1973,12 @@ version = "0.2.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
|
||||
|
||||
[[package]]
|
||||
name = "outref"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e"
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.5"
|
||||
|
|
@ -2160,6 +2365,23 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "referencing"
|
||||
version = "0.46.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2d5554bf79f4acf770dc3193b44b2d63b348f5f7b7448a0ea1191b37b620728"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"fluent-uri",
|
||||
"getrandom 0.3.4",
|
||||
"hashbrown 0.16.1",
|
||||
"itoa",
|
||||
"micromap",
|
||||
"parking_lot",
|
||||
"percent-encoding",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.12.3"
|
||||
|
|
@ -2985,6 +3207,12 @@ version = "1.19.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-general-category"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b993bddc193ae5bd0d623b49ec06ac3e9312875fdae725a975c51db1cc1677f"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.24"
|
||||
|
|
@ -3049,6 +3277,16 @@ version = "0.2.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||
|
||||
[[package]]
|
||||
name = "uuid-simd"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "23b082222b4f6619906941c17eb2297fff4c2fb96cb60164170522942a200bd8"
|
||||
dependencies = [
|
||||
"outref",
|
||||
"vsimd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "valuable"
|
||||
version = "0.1.1"
|
||||
|
|
@ -3061,6 +3299,12 @@ version = "0.9.5"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||
|
||||
[[package]]
|
||||
name = "vsimd"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64"
|
||||
|
||||
[[package]]
|
||||
name = "want"
|
||||
version = "0.3.1"
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ noxa-core = { path = "crates/noxa-core" }
|
|||
noxa-fetch = { path = "crates/noxa-fetch" }
|
||||
noxa-llm = { path = "crates/noxa-llm" }
|
||||
noxa-pdf = { path = "crates/noxa-pdf" }
|
||||
noxa-mcp = { path = "crates/noxa-mcp" }
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
|
@ -21,3 +22,6 @@ tracing = "0.1"
|
|||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
clap = { version = "4", features = ["derive", "env"] }
|
||||
dotenvy = "0.15"
|
||||
rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] }
|
||||
schemars = "1.0"
|
||||
dirs = "6.0.0"
|
||||
|
|
|
|||
292
README.md
292
README.md
|
|
@ -77,7 +77,7 @@ Download from [GitHub Releases](https://github.com/jmagar/noxa/releases) for mac
|
|||
### Cargo (from source)
|
||||
|
||||
```bash
|
||||
cargo install --git https://github.com/jmagar/noxa.git noxa
|
||||
cargo install --git https://github.com/jmagar/noxa.git noxa-cli --bin noxa
|
||||
cargo install --git https://github.com/jmagar/noxa.git noxa-mcp
|
||||
```
|
||||
|
||||
|
|
@ -159,6 +159,271 @@ Crawling... 50/50 pages extracted
|
|||
|
||||
---
|
||||
|
||||
## Examples
|
||||
|
||||
### Basic Extraction
|
||||
|
||||
```bash
|
||||
# Extract as markdown (default)
|
||||
noxa https://example.com
|
||||
|
||||
# Multiple output formats
|
||||
noxa https://example.com -f markdown # Clean markdown
|
||||
noxa https://example.com -f json # Full structured JSON
|
||||
noxa https://example.com -f text # Plain text (no formatting)
|
||||
noxa https://example.com -f llm # Token-optimized for LLMs (67% fewer tokens)
|
||||
|
||||
# Bare domains work (auto-prepends https://)
|
||||
noxa example.com
|
||||
```
|
||||
|
||||
### Content Filtering
|
||||
|
||||
```bash
|
||||
# Only extract main content (skip nav, sidebar, footer)
|
||||
noxa https://docs.rs/tokio --only-main-content
|
||||
|
||||
# Include specific CSS selectors
|
||||
noxa https://news.ycombinator.com --include ".titleline,.score"
|
||||
|
||||
# Exclude specific elements
|
||||
noxa https://example.com --exclude "nav,footer,.ads,.sidebar"
|
||||
|
||||
# Combine both
|
||||
noxa https://docs.rs/reqwest --only-main-content --exclude ".sidebar"
|
||||
```
|
||||
|
||||
### Brand Identity Extraction
|
||||
|
||||
```bash
|
||||
# Extract colors, fonts, logos from any website
|
||||
noxa --brand https://stripe.com
|
||||
# Output: { "name": "Stripe", "colors": [...], "fonts": ["Sohne"], "logos": [...] }
|
||||
|
||||
noxa --brand https://github.com
|
||||
# Output: { "name": "GitHub", "colors": [{"hex": "#1F2328", ...}], "fonts": ["Mona Sans"], ... }
|
||||
|
||||
noxa --brand wikipedia.org
|
||||
# Output: 10 colors, 5 fonts, favicon, logo URL
|
||||
```
|
||||
|
||||
### Sitemap Discovery
|
||||
|
||||
```bash
|
||||
# Discover all URLs from a site's sitemaps
|
||||
noxa --map https://sitemaps.org
|
||||
# Output: one URL per line (84 URLs found)
|
||||
|
||||
# JSON output with metadata
|
||||
noxa --map https://sitemaps.org -f json
|
||||
# Output: [{ "url": "...", "last_modified": "...", "priority": 0.8 }]
|
||||
```
|
||||
|
||||
### Recursive Crawling
|
||||
|
||||
```bash
|
||||
# Crawl a site (default: depth 1, max 20 pages)
|
||||
noxa --crawl https://example.com
|
||||
|
||||
# Control depth and page limit
|
||||
noxa --crawl --depth 2 --max-pages 50 https://docs.rs/tokio
|
||||
|
||||
# Crawl with sitemap seeding (finds more pages)
|
||||
noxa --crawl --sitemap --depth 2 https://docs.rs/tokio
|
||||
|
||||
# Filter crawl paths
|
||||
noxa --crawl --include-paths "/api/*,/guide/*" https://docs.example.com
|
||||
noxa --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com
|
||||
|
||||
# Control concurrency and delay
|
||||
noxa --crawl --concurrency 10 --delay 200 https://example.com
|
||||
```
|
||||
|
||||
### Change Detection (Diff)
|
||||
|
||||
```bash
|
||||
# Step 1: Save a snapshot
|
||||
noxa https://example.com -f json > snapshot.json
|
||||
|
||||
# Step 2: Later, compare against the snapshot
|
||||
noxa --diff-with snapshot.json https://example.com
|
||||
# Output:
|
||||
# Status: Same
|
||||
# Word count delta: +0
|
||||
|
||||
# If the page changed:
|
||||
# Status: Changed
|
||||
# Word count delta: +42
|
||||
# --- old
|
||||
# +++ new
|
||||
# @@ -1,3 +1,3 @@
|
||||
# -Old content here
|
||||
# +New content here
|
||||
```
|
||||
|
||||
### PDF Extraction
|
||||
|
||||
```bash
|
||||
# PDF URLs are auto-detected via Content-Type
|
||||
noxa https://example.com/report.pdf
|
||||
|
||||
# Control PDF mode
|
||||
noxa --pdf-mode auto https://example.com/report.pdf # Error on empty (catches scanned PDFs)
|
||||
noxa --pdf-mode fast https://example.com/report.pdf # Return whatever text is found
|
||||
```
|
||||
|
||||
### Batch Processing
|
||||
|
||||
```bash
|
||||
# Multiple URLs in one command
|
||||
noxa https://example.com https://httpbin.org/html https://rust-lang.org
|
||||
|
||||
# URLs from a file (one per line, # comments supported)
|
||||
noxa --urls-file urls.txt
|
||||
|
||||
# Batch with JSON output
|
||||
noxa --urls-file urls.txt -f json
|
||||
|
||||
# Proxy rotation for large batches
|
||||
noxa --urls-file urls.txt --proxy-file proxies.txt --concurrency 10
|
||||
```
|
||||
|
||||
### Local Files & Stdin
|
||||
|
||||
```bash
|
||||
# Extract from a local HTML file
|
||||
noxa --file page.html
|
||||
|
||||
# Pipe HTML from another command
|
||||
curl -s https://example.com | noxa --stdin
|
||||
|
||||
# Chain with other tools
|
||||
noxa https://example.com -f text | wc -w # Word count
|
||||
noxa https://example.com -f json | jq '.metadata.title' # Extract title with jq
|
||||
```
|
||||
|
||||
### Browser Impersonation
|
||||
|
||||
```bash
|
||||
# Chrome (default) — latest Chrome TLS fingerprint
|
||||
noxa https://example.com
|
||||
|
||||
# Firefox fingerprint
|
||||
noxa --browser firefox https://example.com
|
||||
|
||||
# Random browser per request (good for batch)
|
||||
noxa --browser random --urls-file urls.txt
|
||||
```
|
||||
|
||||
### Custom Headers & Cookies
|
||||
|
||||
```bash
|
||||
# Custom headers
|
||||
noxa -H "Authorization: Bearer token123" https://api.example.com
|
||||
noxa -H "Accept-Language: de-DE" https://example.com
|
||||
|
||||
# Cookies
|
||||
noxa --cookie "session=abc123; theme=dark" https://example.com
|
||||
|
||||
# Multiple headers
|
||||
noxa -H "X-Custom: value" -H "Authorization: Bearer token" https://example.com
|
||||
```
|
||||
|
||||
### LLM-Powered Features
|
||||
|
||||
These require an LLM provider (Ollama local, or OpenAI/Anthropic API key).
|
||||
|
||||
```bash
|
||||
# Summarize a page (default: 3 sentences)
|
||||
noxa --summarize https://example.com
|
||||
|
||||
# Control summary length
|
||||
noxa --summarize 5 https://example.com
|
||||
|
||||
# Extract structured JSON with a schema
|
||||
noxa --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product
|
||||
|
||||
# Extract with a schema from file
|
||||
noxa --extract-json @schema.json https://example.com/product
|
||||
|
||||
# Extract with natural language prompt
|
||||
noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing
|
||||
|
||||
# Use a specific LLM provider
|
||||
noxa --llm-provider ollama --summarize https://example.com
|
||||
noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..." https://example.com
|
||||
noxa --llm-provider anthropic --summarize https://example.com
|
||||
```
|
||||
|
||||
### Raw HTML Output
|
||||
|
||||
```bash
|
||||
# Get the raw fetched HTML (no extraction)
|
||||
noxa --raw-html https://example.com
|
||||
|
||||
# Useful for debugging extraction issues
|
||||
noxa --raw-html https://example.com > raw.html
|
||||
noxa --file raw.html # Then extract locally
|
||||
```
|
||||
|
||||
### Metadata & Verbose Mode
|
||||
|
||||
```bash
|
||||
# Include YAML frontmatter with metadata
|
||||
noxa --metadata https://example.com
|
||||
# Output:
|
||||
# ---
|
||||
# title: "Example Domain"
|
||||
# source: "https://example.com"
|
||||
# word_count: 20
|
||||
# ---
|
||||
# # Example Domain
|
||||
# ...
|
||||
|
||||
# Verbose logging (debug extraction pipeline)
|
||||
noxa -v https://example.com
|
||||
```
|
||||
|
||||
### Proxy Usage
|
||||
|
||||
```bash
|
||||
# Single proxy
|
||||
noxa --proxy http://user:pass@proxy.example.com:8080 https://example.com
|
||||
|
||||
# SOCKS5 proxy
|
||||
noxa --proxy socks5://proxy.example.com:1080 https://example.com
|
||||
|
||||
# Proxy rotation from file (one per line: host:port:user:pass)
|
||||
noxa --proxy-file proxies.txt https://example.com
|
||||
|
||||
# Auto-load proxies.txt from current directory
|
||||
echo "proxy1.com:8080:user:pass" > proxies.txt
|
||||
noxa https://example.com # Automatically detects and uses proxies.txt
|
||||
```
|
||||
|
||||
### Real-World Recipes
|
||||
|
||||
```bash
|
||||
# Monitor competitor pricing — save today's pricing
|
||||
noxa --extract-json '{"type":"array","items":{"type":"object","properties":{"plan":{"type":"string"},"price":{"type":"string"}}}}' \
|
||||
https://competitor.com/pricing -f json > pricing-$(date +%Y%m%d).json
|
||||
|
||||
# Build a documentation search index
|
||||
noxa --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt
|
||||
|
||||
# Extract all images from a page
|
||||
noxa https://example.com -f json | jq -r '.content.images[].src'
|
||||
|
||||
# Get all external links
|
||||
noxa https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href'
|
||||
|
||||
# Compare two pages
|
||||
noxa https://site-a.com -f json > a.json
|
||||
noxa https://site-b.com --diff-with a.json
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## MCP Server — 10 tools for AI agents
|
||||
|
||||
<a href="https://glama.ai/mcp/servers/jmagar/noxa"><img src="https://glama.ai/mcp/servers/jmagar/noxa/badge" alt="noxa MCP server" /></a>
|
||||
|
|
@ -327,6 +592,31 @@ noxa/
|
|||
|
||||
## Configuration
|
||||
|
||||
Non-secret defaults live in `config.json` in your working directory. Copy the example:
|
||||
|
||||
```bash
|
||||
cp config.example.json config.json
|
||||
```
|
||||
|
||||
**Precedence:** CLI flags > `config.json` > built-in defaults
|
||||
|
||||
**Secrets and URLs** (API keys, proxy, webhook, LLM base URL) always go in `.env`, not `config.json`:
|
||||
|
||||
```bash
|
||||
cp env.example .env
|
||||
```
|
||||
|
||||
**Override config path** for a single run:
|
||||
|
||||
```bash
|
||||
NOXA_CONFIG=/path/to/other-config.json noxa https://example.com
|
||||
NOXA_CONFIG=/dev/null noxa https://example.com # bypass config entirely
|
||||
```
|
||||
|
||||
**Bool flag limitation:** flags like `--metadata`, `--only-main-content`, `--verbose` set to `true` in `config.json` cannot be overridden to `false` from the CLI for a single run (clap has no `--no-flag` variant). Use `NOXA_CONFIG=/dev/null` to bypass.
|
||||
|
||||
### Environment variables
|
||||
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `NOXA_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) |
|
||||
|
|
|
|||
34
config.example.json
Normal file
34
config.example.json
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
{
|
||||
"_doc": [
|
||||
"Copy to config.json and remove fields you don't need.",
|
||||
"Secrets (api_key, proxy, webhook, llm_base_url) go in .env — NOT here.",
|
||||
"BOOL FLAG LIMITATION: once set to true here, cannot be overridden to false",
|
||||
"from the CLI for a single run (no --no-flag support). Use NOXA_CONFIG=/dev/null",
|
||||
"on the command line to bypass this config entirely.",
|
||||
"on_change is intentionally absent — it must remain a CLI-only flag.",
|
||||
"Unknown fields are silently ignored, so this file works across noxa versions."
|
||||
],
|
||||
|
||||
"format": "markdown",
|
||||
"browser": "chrome",
|
||||
"timeout": 30,
|
||||
"pdf_mode": "auto",
|
||||
"metadata": false,
|
||||
"verbose": false,
|
||||
"only_main_content": false,
|
||||
|
||||
"include_selectors": [],
|
||||
"exclude_selectors": ["nav", "footer", ".sidebar", ".cookie-banner"],
|
||||
|
||||
"depth": 1,
|
||||
"max_pages": 20,
|
||||
"concurrency": 5,
|
||||
"delay": 100,
|
||||
"path_prefix": null,
|
||||
"include_paths": [],
|
||||
"exclude_paths": ["/changelog/*", "/blog/*", "/releases/*"],
|
||||
"use_sitemap": false,
|
||||
|
||||
"llm_provider": "gemini",
|
||||
"llm_model": "gemini-2.5-pro"
|
||||
}
|
||||
|
|
@ -14,9 +14,11 @@ noxa-core = { workspace = true }
|
|||
noxa-fetch = { workspace = true }
|
||||
noxa-llm = { workspace = true }
|
||||
noxa-pdf = { workspace = true }
|
||||
noxa-mcp = { workspace = true }
|
||||
dotenvy = { workspace = true }
|
||||
rand = "0.8"
|
||||
serde_json = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
clap = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
|
|
|
|||
315
crates/noxa-cli/src/config.rs
Normal file
315
crates/noxa-cli/src/config.rs
Normal file
|
|
@ -0,0 +1,315 @@
|
|||
use serde::Deserialize;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::{Browser, OutputFormat, PdfModeArg};
|
||||
|
||||
/// Non-secret, non-URL configuration defaults loaded from config.json.
|
||||
/// All fields optional — absent means "use the hard default".
|
||||
/// Unknown fields are silently ignored (serde default) so config files
|
||||
/// written for a newer version of noxa work on older binaries.
|
||||
///
|
||||
/// DELIBERATELY EXCLUDED:
|
||||
/// - on_change: passes content to sh -c; must remain CLI-only to prevent
|
||||
/// shell injection via config file writes.
|
||||
/// - Secrets/URLs (api_key, proxy, webhook, llm_base_url): stay in .env.
|
||||
///
|
||||
/// BOOL FLAG LIMITATION:
|
||||
/// only_main_content, metadata, verbose, use_sitemap set to true here
|
||||
/// cannot be overridden to false from the CLI for a single run (no --no-flag
|
||||
/// variant in clap). Edit config.json or use NOXA_CONFIG=/dev/null to bypass.
|
||||
#[derive(Debug, Default, Deserialize)]
|
||||
pub struct NoxaConfig {
|
||||
// Output
|
||||
pub format: Option<OutputFormat>,
|
||||
pub metadata: Option<bool>,
|
||||
pub verbose: Option<bool>,
|
||||
|
||||
// Fetch
|
||||
pub browser: Option<Browser>,
|
||||
pub timeout: Option<u64>,
|
||||
pub pdf_mode: Option<PdfModeArg>,
|
||||
pub only_main_content: Option<bool>,
|
||||
|
||||
// CSS selectors
|
||||
pub include_selectors: Option<Vec<String>>,
|
||||
pub exclude_selectors: Option<Vec<String>>,
|
||||
|
||||
// Crawl
|
||||
pub depth: Option<usize>,
|
||||
pub max_pages: Option<usize>,
|
||||
pub concurrency: Option<usize>,
|
||||
pub delay: Option<u64>,
|
||||
pub path_prefix: Option<String>,
|
||||
pub include_paths: Option<Vec<String>>,
|
||||
pub exclude_paths: Option<Vec<String>>,
|
||||
pub use_sitemap: Option<bool>,
|
||||
|
||||
// LLM (non-secret: provider name and model only; base URL stays in .env)
|
||||
pub llm_provider: Option<String>,
|
||||
pub llm_model: Option<String>,
|
||||
}
|
||||
|
||||
impl NoxaConfig {
|
||||
/// Load config from an explicit path, NOXA_CONFIG env var, or ./config.json.
|
||||
/// Returns an empty (all-None) config if the file doesn't exist.
|
||||
/// Prints an error and exits if the file exists but is invalid JSON.
|
||||
pub fn load(explicit_path: Option<&str>) -> Self {
|
||||
let noxa_config_env = std::env::var("NOXA_CONFIG").ok();
|
||||
let was_explicit = explicit_path.is_some() || noxa_config_env.is_some();
|
||||
|
||||
let path_str = explicit_path
|
||||
.map(String::from)
|
||||
.or(noxa_config_env)
|
||||
.unwrap_or_else(|| "config.json".to_string());
|
||||
|
||||
let path = Path::new(&path_str);
|
||||
if !path.exists() {
|
||||
if was_explicit {
|
||||
let display_name = path.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or(&path_str);
|
||||
eprintln!("error: config file not found: {display_name}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
return Self::default();
|
||||
}
|
||||
|
||||
let display_name = path.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or(&path_str);
|
||||
eprintln!(
|
||||
"noxa: config loaded from {display_name} \
|
||||
(API keys and secrets belong in .env, not config.json)"
|
||||
);
|
||||
tracing::debug!("config path: {}", path.display());
|
||||
|
||||
let content = match std::fs::read_to_string(path) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
eprintln!("error: cannot read config file {display_name}: {e}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
match serde_json::from_str(&content) {
|
||||
Ok(cfg) => cfg,
|
||||
Err(e) => {
|
||||
eprintln!("error: invalid JSON in config file {display_name}: {e}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Fully resolved configuration after merging CLI flags > config file > hard defaults.
|
||||
/// All fields are concrete — no Option<T>. This is what the rest of main.rs reads.
|
||||
///
|
||||
/// The merge uses clap's ValueSource to detect which fields were explicitly set on
|
||||
/// the command line. CLI-explicit values always win. Config fills in the rest.
|
||||
/// Hard defaults are the fallback of last resort.
|
||||
pub struct ResolvedConfig {
|
||||
// Output
|
||||
pub format: OutputFormat,
|
||||
pub metadata: bool,
|
||||
pub verbose: bool,
|
||||
|
||||
// Fetch
|
||||
pub browser: Browser,
|
||||
pub timeout: u64,
|
||||
pub pdf_mode: PdfModeArg,
|
||||
pub only_main_content: bool,
|
||||
/// CLI-only output flag — not configurable via config.json (it is a per-run mode, not a persistent default).
|
||||
pub raw_html: bool,
|
||||
|
||||
// CSS selectors
|
||||
/// Vec<String> — CSS selectors passed directly to extraction filter.
|
||||
pub include_selectors: Vec<String>,
|
||||
/// Vec<String> — CSS selectors passed directly to extraction filter.
|
||||
pub exclude_selectors: Vec<String>,
|
||||
|
||||
// Crawl
|
||||
pub depth: usize,
|
||||
pub max_pages: usize,
|
||||
pub concurrency: usize,
|
||||
pub delay: u64,
|
||||
pub path_prefix: Option<String>,
|
||||
/// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
|
||||
pub include_paths: Vec<String>,
|
||||
/// Vec<String> — never joined to a comma-string. Passed directly to CrawlConfig.
|
||||
pub exclude_paths: Vec<String>,
|
||||
pub use_sitemap: bool,
|
||||
|
||||
// LLM
|
||||
pub llm_provider: Option<String>,
|
||||
pub llm_model: Option<String>,
|
||||
}
|
||||
|
||||
use clap::parser::ValueSource;
|
||||
|
||||
/// Merge CLI flags (detected via ValueSource), config file, and hard defaults
|
||||
/// into a single ResolvedConfig. CLI explicit values always win.
|
||||
pub fn resolve(
|
||||
cli: &crate::Cli,
|
||||
matches: &clap::ArgMatches,
|
||||
cfg: &NoxaConfig,
|
||||
) -> ResolvedConfig {
|
||||
let explicit = |name: &str| {
|
||||
matches.value_source(name) == Some(ValueSource::CommandLine)
|
||||
};
|
||||
|
||||
ResolvedConfig {
|
||||
format: if explicit("format") {
|
||||
cli.format.clone()
|
||||
} else {
|
||||
cfg.format.clone().unwrap_or(crate::OutputFormat::Markdown)
|
||||
},
|
||||
browser: if explicit("browser") {
|
||||
cli.browser.clone()
|
||||
} else {
|
||||
cfg.browser.clone().unwrap_or(crate::Browser::Chrome)
|
||||
},
|
||||
pdf_mode: if explicit("pdf_mode") {
|
||||
cli.pdf_mode.clone()
|
||||
} else {
|
||||
cfg.pdf_mode.clone().unwrap_or(crate::PdfModeArg::Auto)
|
||||
},
|
||||
timeout: if explicit("timeout") {
|
||||
cli.timeout
|
||||
} else {
|
||||
cfg.timeout.unwrap_or(30)
|
||||
},
|
||||
depth: if explicit("depth") {
|
||||
cli.depth
|
||||
} else {
|
||||
cfg.depth.unwrap_or(1)
|
||||
},
|
||||
max_pages: if explicit("max_pages") {
|
||||
cli.max_pages
|
||||
} else {
|
||||
cfg.max_pages.unwrap_or(20)
|
||||
},
|
||||
concurrency: if explicit("concurrency") {
|
||||
cli.concurrency
|
||||
} else {
|
||||
cfg.concurrency.unwrap_or(5)
|
||||
},
|
||||
delay: if explicit("delay") {
|
||||
cli.delay
|
||||
} else {
|
||||
cfg.delay.unwrap_or(100)
|
||||
},
|
||||
path_prefix: if explicit("path_prefix") {
|
||||
cli.path_prefix.clone()
|
||||
} else {
|
||||
cfg.path_prefix.clone()
|
||||
},
|
||||
include_paths: if explicit("include_paths") {
|
||||
cli.include_paths
|
||||
.as_deref()
|
||||
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
|
||||
.unwrap_or_default()
|
||||
} else {
|
||||
cfg.include_paths.clone().unwrap_or_default()
|
||||
},
|
||||
exclude_paths: if explicit("exclude_paths") {
|
||||
cli.exclude_paths
|
||||
.as_deref()
|
||||
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
|
||||
.unwrap_or_default()
|
||||
} else {
|
||||
cfg.exclude_paths.clone().unwrap_or_default()
|
||||
},
|
||||
include_selectors: if explicit("include") {
|
||||
cli.include
|
||||
.as_deref()
|
||||
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
|
||||
.unwrap_or_default()
|
||||
} else {
|
||||
cfg.include_selectors.clone().unwrap_or_default()
|
||||
},
|
||||
exclude_selectors: if explicit("exclude") {
|
||||
cli.exclude
|
||||
.as_deref()
|
||||
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
|
||||
.unwrap_or_default()
|
||||
} else {
|
||||
cfg.exclude_selectors.clone().unwrap_or_default()
|
||||
},
|
||||
only_main_content: cli.only_main_content || cfg.only_main_content.unwrap_or(false),
|
||||
metadata: cli.metadata || cfg.metadata.unwrap_or(false),
|
||||
verbose: cli.verbose || cfg.verbose.unwrap_or(false),
|
||||
use_sitemap: cli.sitemap || cfg.use_sitemap.unwrap_or(false),
|
||||
raw_html: cli.raw_html,
|
||||
llm_provider: if cli.llm_provider.is_some() {
|
||||
cli.llm_provider.clone()
|
||||
} else {
|
||||
cfg.llm_provider.clone()
|
||||
},
|
||||
llm_model: if cli.llm_model.is_some() {
|
||||
cli.llm_model.clone()
|
||||
} else {
|
||||
cfg.llm_model.clone()
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_noxa_config_deserialize_full() {
|
||||
let json = r#"{
|
||||
"format": "llm",
|
||||
"depth": 3,
|
||||
"max_pages": 100,
|
||||
"concurrency": 10,
|
||||
"delay": 200,
|
||||
"browser": "firefox",
|
||||
"timeout": 60,
|
||||
"only_main_content": true,
|
||||
"use_sitemap": true,
|
||||
"path_prefix": "/docs/",
|
||||
"include_paths": ["/docs/*", "/api/*"],
|
||||
"exclude_paths": ["/changelog/*", "/blog/*"],
|
||||
"include_selectors": ["article", ".content"],
|
||||
"exclude_selectors": ["nav", "footer"],
|
||||
"llm_provider": "gemini",
|
||||
"llm_model": "gemini-2.5-pro",
|
||||
"pdf_mode": "fast",
|
||||
"metadata": true,
|
||||
"verbose": false
|
||||
}"#;
|
||||
let cfg: NoxaConfig = serde_json::from_str(json).unwrap();
|
||||
assert!(matches!(cfg.format, Some(crate::OutputFormat::Llm)));
|
||||
assert_eq!(cfg.depth, Some(3));
|
||||
assert_eq!(cfg.exclude_paths, Some(vec!["/changelog/*".to_string(), "/blog/*".to_string()]));
|
||||
assert!(matches!(cfg.pdf_mode, Some(crate::PdfModeArg::Fast)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_noxa_config_empty() {
|
||||
let cfg: NoxaConfig = serde_json::from_str("{}").unwrap();
|
||||
assert!(cfg.format.is_none());
|
||||
assert!(cfg.depth.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_noxa_config_unknown_fields_ignored() {
|
||||
// Unknown fields must NOT cause a parse failure
|
||||
let cfg: NoxaConfig = serde_json::from_str(r#"{"depth": 2, "future_field": true}"#).unwrap();
|
||||
assert_eq!(cfg.depth, Some(2));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_load_implicit_missing_file_returns_default() {
|
||||
// When no explicit path and ./config.json doesn't exist, silently return default.
|
||||
// The simplest test: call with None and rely on ./config.json not existing in test env.
|
||||
// If CWD has config.json this test is skipped to avoid flakiness.
|
||||
if std::path::Path::new("config.json").exists() {
|
||||
return; // skip: CWD has config.json
|
||||
}
|
||||
let cfg = NoxaConfig::load(None);
|
||||
assert!(cfg.format.is_none());
|
||||
}
|
||||
}
|
||||
|
|
@ -2,6 +2,7 @@
|
|||
/// CLI entry point -- wires noxa-core and noxa-fetch into a single command.
|
||||
/// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
|
||||
mod cloud;
|
||||
mod config;
|
||||
|
||||
use std::io::{self, Read as _};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
|
@ -9,8 +10,7 @@ use std::process;
|
|||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
use clap::{Parser, ValueEnum};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
use clap::{CommandFactory, FromArgMatches, Parser, ValueEnum};
|
||||
use noxa_core::{
|
||||
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
|
||||
to_llm_text,
|
||||
|
|
@ -20,7 +20,10 @@ use noxa_fetch::{
|
|||
FetchConfig, FetchResult, PageResult, SitemapEntry,
|
||||
};
|
||||
use noxa_llm::LlmProvider;
|
||||
use noxa_mcp;
|
||||
use noxa_pdf::PdfMode;
|
||||
use serde::Deserialize;
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
/// Known anti-bot challenge page titles (case-insensitive prefix match).
|
||||
const ANTIBOT_TITLES: &[&str] = &[
|
||||
|
|
@ -87,6 +90,10 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
|
|||
#[derive(Parser)]
|
||||
#[command(name = "noxa", about = "Extract web content for LLMs", version)]
|
||||
struct Cli {
|
||||
/// Path to config.json (default: ./config.json, override with NOXA_CONFIG env var)
|
||||
#[arg(long, global = true)]
|
||||
config: Option<String>,
|
||||
|
||||
/// URLs to fetch (multiple allowed)
|
||||
#[arg()]
|
||||
urls: Vec<String>,
|
||||
|
|
@ -247,7 +254,7 @@ struct Cli {
|
|||
#[arg(long, num_args = 0..=1, default_missing_value = "3")]
|
||||
summarize: Option<usize>,
|
||||
|
||||
/// Force a specific LLM provider (ollama, openai, anthropic)
|
||||
/// Force a specific LLM provider (gemini, ollama, openai, anthropic)
|
||||
#[arg(long, env = "NOXA_LLM_PROVIDER")]
|
||||
llm_provider: Option<String>,
|
||||
|
||||
|
|
@ -284,7 +291,8 @@ struct Cli {
|
|||
output_dir: Option<PathBuf>,
|
||||
}
|
||||
|
||||
#[derive(Clone, ValueEnum)]
|
||||
#[derive(Clone, Debug, ValueEnum, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
enum OutputFormat {
|
||||
Markdown,
|
||||
Json,
|
||||
|
|
@ -293,14 +301,16 @@ enum OutputFormat {
|
|||
Html,
|
||||
}
|
||||
|
||||
#[derive(Clone, ValueEnum)]
|
||||
#[derive(Clone, Debug, ValueEnum, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
enum Browser {
|
||||
Chrome,
|
||||
Firefox,
|
||||
Random,
|
||||
}
|
||||
|
||||
#[derive(Clone, ValueEnum, Default)]
|
||||
#[derive(Clone, Debug, ValueEnum, Default, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
enum PdfModeArg {
|
||||
/// Error if PDF has no extractable text (catches scanned PDFs)
|
||||
#[default]
|
||||
|
|
@ -338,12 +348,21 @@ fn init_logging(verbose: bool) {
|
|||
tracing_subscriber::fmt().with_env_filter(filter).init();
|
||||
}
|
||||
|
||||
fn init_mcp_logging() {
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
|
||||
.with_writer(std::io::stderr)
|
||||
.with_ansi(false)
|
||||
.try_init()
|
||||
.ok();
|
||||
}
|
||||
|
||||
/// Build FetchConfig from CLI flags.
|
||||
///
|
||||
/// `--proxy` sets a single static proxy (no rotation).
|
||||
/// `--proxy-file` loads a pool of proxies and rotates per-request.
|
||||
/// `--proxy` takes priority: if both are set, only the single proxy is used.
|
||||
fn build_fetch_config(cli: &Cli) -> FetchConfig {
|
||||
fn build_fetch_config(cli: &Cli, resolved: &config::ResolvedConfig) -> FetchConfig {
|
||||
let (proxy, proxy_pool) = if cli.proxy.is_some() {
|
||||
(cli.proxy.clone(), Vec::new())
|
||||
} else if let Some(ref path) = cli.proxy_file {
|
||||
|
|
@ -403,11 +422,11 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
|
|||
}
|
||||
|
||||
FetchConfig {
|
||||
browser: cli.browser.clone().into(),
|
||||
browser: resolved.browser.clone().into(),
|
||||
proxy,
|
||||
proxy_pool,
|
||||
timeout: std::time::Duration::from_secs(cli.timeout),
|
||||
pdf_mode: cli.pdf_mode.clone().into(),
|
||||
timeout: std::time::Duration::from_secs(resolved.timeout),
|
||||
pdf_mode: resolved.pdf_mode.clone().into(),
|
||||
headers,
|
||||
..Default::default()
|
||||
}
|
||||
|
|
@ -436,20 +455,12 @@ fn parse_cookie_file(path: &str) -> Result<String, String> {
|
|||
Ok(pairs.join("; "))
|
||||
}
|
||||
|
||||
fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
|
||||
fn build_extraction_options(resolved: &config::ResolvedConfig) -> ExtractionOptions {
|
||||
ExtractionOptions {
|
||||
include_selectors: cli
|
||||
.include
|
||||
.as_deref()
|
||||
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
|
||||
.unwrap_or_default(),
|
||||
exclude_selectors: cli
|
||||
.exclude
|
||||
.as_deref()
|
||||
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
|
||||
.unwrap_or_default(),
|
||||
only_main_content: cli.only_main_content,
|
||||
include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html),
|
||||
include_selectors: resolved.include_selectors.clone(),
|
||||
exclude_selectors: resolved.exclude_selectors.clone(),
|
||||
only_main_content: resolved.only_main_content,
|
||||
include_raw_html: resolved.raw_html || matches!(resolved.format, OutputFormat::Html),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -618,14 +629,17 @@ impl FetchOutput {
|
|||
|
||||
/// Fetch a URL and extract content, handling PDF detection automatically.
|
||||
/// Falls back to cloud API when bot protection or JS rendering is detected.
|
||||
async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
||||
async fn fetch_and_extract(
|
||||
cli: &Cli,
|
||||
resolved: &config::ResolvedConfig,
|
||||
) -> Result<FetchOutput, String> {
|
||||
// Local sources: read and extract as HTML
|
||||
if cli.stdin {
|
||||
let mut buf = String::new();
|
||||
io::stdin()
|
||||
.read_to_string(&mut buf)
|
||||
.map_err(|e| format!("failed to read stdin: {e}"))?;
|
||||
let options = build_extraction_options(cli);
|
||||
let options = build_extraction_options(resolved);
|
||||
return extract_with_options(&buf, None, &options)
|
||||
.map(|r| FetchOutput::Local(Box::new(r)))
|
||||
.map_err(|e| format!("extraction error: {e}"));
|
||||
|
|
@ -634,7 +648,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
if let Some(ref path) = cli.file {
|
||||
let html =
|
||||
std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
|
||||
let options = build_extraction_options(cli);
|
||||
let options = build_extraction_options(resolved);
|
||||
return extract_with_options(&html, None, &options)
|
||||
.map(|r| FetchOutput::Local(Box::new(r)))
|
||||
.map_err(|e| format!("extraction error: {e}"));
|
||||
|
|
@ -651,10 +665,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
|
||||
// --cloud: skip local, go straight to cloud API
|
||||
if cli.cloud {
|
||||
let c =
|
||||
cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?;
|
||||
let options = build_extraction_options(cli);
|
||||
let format_str = match cli.format {
|
||||
let c = cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?;
|
||||
let options = build_extraction_options(resolved);
|
||||
let format_str = match resolved.format {
|
||||
OutputFormat::Markdown => "markdown",
|
||||
OutputFormat::Json => "json",
|
||||
OutputFormat::Text => "text",
|
||||
|
|
@ -674,9 +687,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
}
|
||||
|
||||
// Normal path: try local first
|
||||
let client =
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
||||
let options = build_extraction_options(cli);
|
||||
let client = FetchClient::new(build_fetch_config(cli, resolved))
|
||||
.map_err(|e| format!("client error: {e}"))?;
|
||||
let options = build_extraction_options(resolved);
|
||||
let result = client
|
||||
.fetch_and_extract_with_options(url, &options)
|
||||
.await
|
||||
|
|
@ -687,7 +700,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
if !matches!(reason, EmptyReason::None) {
|
||||
if let Some(ref c) = cloud_client {
|
||||
eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API...");
|
||||
let format_str = match cli.format {
|
||||
let format_str = match resolved.format {
|
||||
OutputFormat::Markdown => "markdown",
|
||||
OutputFormat::Json => "json",
|
||||
OutputFormat::Text => "text",
|
||||
|
|
@ -718,7 +731,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
}
|
||||
|
||||
/// Fetch raw HTML from a URL (no extraction). Used for --raw-html and brand extraction.
|
||||
async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> {
|
||||
async fn fetch_html(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<FetchResult, String> {
|
||||
if cli.stdin {
|
||||
let mut buf = String::new();
|
||||
io::stdin()
|
||||
|
|
@ -751,8 +764,8 @@ async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> {
|
|||
.ok_or("no input provided -- pass a URL, --file, or --stdin")?;
|
||||
let url = normalize_url(raw_url);
|
||||
|
||||
let client =
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
||||
let client = FetchClient::new(build_fetch_config(cli, resolved))
|
||||
.map_err(|e| format!("client error: {e}"))?;
|
||||
client
|
||||
.fetch(&url)
|
||||
.await
|
||||
|
|
@ -1166,7 +1179,7 @@ fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String
|
|||
)
|
||||
}
|
||||
|
||||
async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
||||
async fn run_crawl(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
|
||||
let url = cli
|
||||
.urls
|
||||
.first()
|
||||
|
|
@ -1178,16 +1191,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
|||
return Err("--crawl cannot be used with --file or --stdin".into());
|
||||
}
|
||||
|
||||
let include_patterns: Vec<String> = cli
|
||||
.include_paths
|
||||
.as_deref()
|
||||
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
|
||||
.unwrap_or_default();
|
||||
let exclude_patterns: Vec<String> = cli
|
||||
.exclude_paths
|
||||
.as_deref()
|
||||
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
|
||||
.unwrap_or_default();
|
||||
let include_patterns = resolved.include_paths.clone();
|
||||
let exclude_patterns = resolved.exclude_paths.clone();
|
||||
|
||||
// Set up streaming progress channel
|
||||
let (progress_tx, mut progress_rx) = tokio::sync::broadcast::channel::<PageResult>(100);
|
||||
|
|
@ -1207,13 +1212,13 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
|||
}
|
||||
|
||||
let config = CrawlConfig {
|
||||
fetch: build_fetch_config(cli),
|
||||
max_depth: cli.depth,
|
||||
max_pages: cli.max_pages,
|
||||
concurrency: cli.concurrency,
|
||||
delay: std::time::Duration::from_millis(cli.delay),
|
||||
path_prefix: cli.path_prefix.clone(),
|
||||
use_sitemap: cli.sitemap,
|
||||
fetch: build_fetch_config(cli, resolved),
|
||||
max_depth: resolved.depth,
|
||||
max_pages: resolved.max_pages,
|
||||
concurrency: resolved.concurrency,
|
||||
delay: std::time::Duration::from_millis(resolved.delay),
|
||||
path_prefix: resolved.path_prefix.clone(),
|
||||
use_sitemap: resolved.use_sitemap,
|
||||
include_patterns,
|
||||
exclude_patterns,
|
||||
progress_tx: Some(progress_tx),
|
||||
|
|
@ -1232,7 +1237,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
|||
);
|
||||
});
|
||||
|
||||
let max_pages = cli.max_pages;
|
||||
let max_pages = resolved.max_pages;
|
||||
let completed_offset = resume_state.as_ref().map_or(0, |s| s.completed_pages);
|
||||
|
||||
// Spawn background task to print streaming progress to stderr
|
||||
|
|
@ -1261,8 +1266,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
|||
&result.visited,
|
||||
&result.remaining_frontier,
|
||||
completed_offset + result.pages.len(),
|
||||
cli.max_pages,
|
||||
cli.depth,
|
||||
resolved.max_pages,
|
||||
resolved.depth,
|
||||
)?;
|
||||
eprintln!(
|
||||
"Crawl state saved to {} ({} pages completed). Resume with --crawl-state {}",
|
||||
|
|
@ -1294,15 +1299,15 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
|||
let mut saved = 0usize;
|
||||
for page in &result.pages {
|
||||
if let Some(ref extraction) = page.extraction {
|
||||
let filename = url_to_filename(&page.url, &cli.format);
|
||||
let content = format_output(extraction, &cli.format, cli.metadata);
|
||||
let filename = url_to_filename(&page.url, &resolved.format);
|
||||
let content = format_output(extraction, &resolved.format, resolved.metadata);
|
||||
write_to_file(dir, &filename, &content)?;
|
||||
saved += 1;
|
||||
}
|
||||
}
|
||||
eprintln!("Saved {saved} files to {}", dir.display());
|
||||
} else {
|
||||
print_crawl_output(&result, &cli.format, cli.metadata);
|
||||
print_crawl_output(&result, &resolved.format, resolved.metadata);
|
||||
}
|
||||
|
||||
eprintln!(
|
||||
|
|
@ -1338,7 +1343,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
|||
}
|
||||
}
|
||||
|
||||
async fn run_map(cli: &Cli) -> Result<(), String> {
|
||||
async fn run_map(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
|
||||
let url = cli
|
||||
.urls
|
||||
.first()
|
||||
|
|
@ -1346,8 +1351,8 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
|
|||
.map(|u| normalize_url(u))?;
|
||||
let url = url.as_str();
|
||||
|
||||
let client =
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
||||
let client = FetchClient::new(build_fetch_config(cli, resolved))
|
||||
.map_err(|e| format!("client error: {e}"))?;
|
||||
|
||||
let entries = noxa_fetch::sitemap::discover(&client, url)
|
||||
.await
|
||||
|
|
@ -1359,19 +1364,24 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
|
|||
eprintln!("discovered {} URLs", entries.len());
|
||||
}
|
||||
|
||||
print_map_output(&entries, &cli.format);
|
||||
print_map_output(&entries, &resolved.format);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
|
||||
async fn run_batch(
|
||||
cli: &Cli,
|
||||
resolved: &config::ResolvedConfig,
|
||||
entries: &[(String, Option<String>)],
|
||||
) -> Result<(), String> {
|
||||
let client = Arc::new(
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
|
||||
FetchClient::new(build_fetch_config(cli, resolved))
|
||||
.map_err(|e| format!("client error: {e}"))?,
|
||||
);
|
||||
|
||||
let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect();
|
||||
let options = build_extraction_options(cli);
|
||||
let options = build_extraction_options(resolved);
|
||||
let results = client
|
||||
.fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options)
|
||||
.fetch_and_extract_batch_with_options(&urls, resolved.concurrency, &options)
|
||||
.await;
|
||||
|
||||
let ok = results.iter().filter(|r| r.result.is_ok()).count();
|
||||
|
|
@ -1402,15 +1412,15 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
|
|||
let filename = custom_names
|
||||
.get(r.url.as_str())
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| url_to_filename(&r.url, &cli.format));
|
||||
let content = format_output(extraction, &cli.format, cli.metadata);
|
||||
.unwrap_or_else(|| url_to_filename(&r.url, &resolved.format));
|
||||
let content = format_output(extraction, &resolved.format, resolved.metadata);
|
||||
write_to_file(dir, &filename, &content)?;
|
||||
saved += 1;
|
||||
}
|
||||
}
|
||||
eprintln!("Saved {saved} files to {}", dir.display());
|
||||
} else {
|
||||
print_batch_output(&results, &cli.format, cli.metadata);
|
||||
print_batch_output(&results, &resolved.format, resolved.metadata);
|
||||
}
|
||||
|
||||
eprintln!(
|
||||
|
|
@ -1514,15 +1524,20 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
|||
});
|
||||
}
|
||||
|
||||
async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
|
||||
async fn run_watch(
|
||||
cli: &Cli,
|
||||
resolved: &config::ResolvedConfig,
|
||||
urls: &[String],
|
||||
) -> Result<(), String> {
|
||||
if urls.is_empty() {
|
||||
return Err("--watch requires at least one URL".into());
|
||||
}
|
||||
|
||||
let client = Arc::new(
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
|
||||
FetchClient::new(build_fetch_config(cli, resolved))
|
||||
.map_err(|e| format!("client error: {e}"))?,
|
||||
);
|
||||
let options = build_extraction_options(cli);
|
||||
let options = build_extraction_options(resolved);
|
||||
|
||||
// Ctrl+C handler
|
||||
let cancelled = Arc::new(AtomicBool::new(false));
|
||||
|
|
@ -1534,16 +1549,17 @@ async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
|
|||
|
||||
// Single-URL mode: preserve original behavior exactly
|
||||
if urls.len() == 1 {
|
||||
return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await;
|
||||
return run_watch_single(cli, resolved, &client, &options, &urls[0], &cancelled).await;
|
||||
}
|
||||
|
||||
// Multi-URL mode: batch fetch, diff each, report aggregate
|
||||
run_watch_multi(cli, &client, &options, urls, &cancelled).await
|
||||
run_watch_multi(cli, resolved, &client, &options, urls, &cancelled).await
|
||||
}
|
||||
|
||||
/// Original single-URL watch loop -- backward compatible.
|
||||
async fn run_watch_single(
|
||||
cli: &Cli,
|
||||
resolved: &config::ResolvedConfig,
|
||||
client: &Arc<FetchClient>,
|
||||
options: &ExtractionOptions,
|
||||
url: &str,
|
||||
|
|
@ -1580,7 +1596,7 @@ async fn run_watch_single(
|
|||
if diff.status == ChangeStatus::Same {
|
||||
eprintln!("[watch] No changes ({})", timestamp());
|
||||
} else {
|
||||
print_diff_output(&diff, &cli.format);
|
||||
print_diff_output(&diff, &resolved.format);
|
||||
eprintln!("[watch] Changes detected! ({})", timestamp());
|
||||
|
||||
if let Some(ref cmd) = cli.on_change {
|
||||
|
|
@ -1627,6 +1643,7 @@ async fn run_watch_single(
|
|||
/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
|
||||
async fn run_watch_multi(
|
||||
cli: &Cli,
|
||||
resolved: &config::ResolvedConfig,
|
||||
client: &Arc<FetchClient>,
|
||||
options: &ExtractionOptions,
|
||||
urls: &[String],
|
||||
|
|
@ -1636,7 +1653,7 @@ async fn run_watch_multi(
|
|||
|
||||
// Initial pass: fetch all URLs in parallel
|
||||
let initial_results = client
|
||||
.fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
|
||||
.fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options)
|
||||
.await;
|
||||
|
||||
let mut snapshots = std::collections::HashMap::new();
|
||||
|
|
@ -1676,7 +1693,7 @@ async fn run_watch_multi(
|
|||
check_number += 1;
|
||||
|
||||
let current_results = client
|
||||
.fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
|
||||
.fetch_and_extract_batch_with_options(&url_refs, resolved.concurrency, options)
|
||||
.await;
|
||||
|
||||
let mut changed: Vec<serde_json::Value> = Vec::new();
|
||||
|
|
@ -1780,7 +1797,11 @@ async fn run_watch_multi(
|
|||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
|
||||
async fn run_diff(
|
||||
cli: &Cli,
|
||||
resolved: &config::ResolvedConfig,
|
||||
snapshot_path: &str,
|
||||
) -> Result<(), String> {
|
||||
// Load previous snapshot
|
||||
let snapshot_json = std::fs::read_to_string(snapshot_path)
|
||||
.map_err(|e| format!("failed to read snapshot {snapshot_path}: {e}"))?;
|
||||
|
|
@ -1788,16 +1809,16 @@ async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
|
|||
.map_err(|e| format!("failed to parse snapshot JSON: {e}"))?;
|
||||
|
||||
// Extract current version (handles PDF detection for URLs)
|
||||
let new_result = fetch_and_extract(cli).await?.into_extraction()?;
|
||||
let new_result = fetch_and_extract(cli, resolved).await?.into_extraction()?;
|
||||
|
||||
let diff = noxa_core::diff::diff(&old, &new_result);
|
||||
print_diff_output(&diff, &cli.format);
|
||||
print_diff_output(&diff, &resolved.format);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_brand(cli: &Cli) -> Result<(), String> {
|
||||
let result = fetch_html(cli).await?;
|
||||
async fn run_brand(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
|
||||
let result = fetch_html(cli, resolved).await?;
|
||||
let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await;
|
||||
let brand = noxa_core::brand::extract_brand(
|
||||
&enriched,
|
||||
|
|
@ -1811,13 +1832,27 @@ async fn run_brand(cli: &Cli) -> Result<(), String> {
|
|||
}
|
||||
|
||||
/// Build an LLM provider based on CLI flags, or fall back to the default chain.
|
||||
async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
|
||||
if let Some(ref name) = cli.llm_provider {
|
||||
async fn build_llm_provider(
|
||||
cli: &Cli,
|
||||
resolved: &config::ResolvedConfig,
|
||||
) -> Result<Box<dyn LlmProvider>, String> {
|
||||
if let Some(ref name) = resolved.llm_provider {
|
||||
match name.as_str() {
|
||||
"gemini" => {
|
||||
let provider = noxa_llm::providers::gemini_cli::GeminiCliProvider::new(
|
||||
resolved.llm_model.clone(),
|
||||
);
|
||||
if !provider.is_available().await {
|
||||
return Err(
|
||||
"gemini CLI not found on PATH -- install it or omit --llm-provider".into(),
|
||||
);
|
||||
}
|
||||
Ok(Box::new(provider))
|
||||
}
|
||||
"ollama" => {
|
||||
let provider = noxa_llm::providers::ollama::OllamaProvider::new(
|
||||
cli.llm_base_url.clone(),
|
||||
cli.llm_model.clone(),
|
||||
resolved.llm_model.clone(),
|
||||
);
|
||||
if !provider.is_available().await {
|
||||
return Err("ollama is not running or unreachable".into());
|
||||
|
|
@ -1828,7 +1863,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
|
|||
let provider = noxa_llm::providers::openai::OpenAiProvider::new(
|
||||
None,
|
||||
cli.llm_base_url.clone(),
|
||||
cli.llm_model.clone(),
|
||||
resolved.llm_model.clone(),
|
||||
)
|
||||
.ok_or("OPENAI_API_KEY not set")?;
|
||||
Ok(Box::new(provider))
|
||||
|
|
@ -1836,20 +1871,20 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
|
|||
"anthropic" => {
|
||||
let provider = noxa_llm::providers::anthropic::AnthropicProvider::new(
|
||||
None,
|
||||
cli.llm_model.clone(),
|
||||
resolved.llm_model.clone(),
|
||||
)
|
||||
.ok_or("ANTHROPIC_API_KEY not set")?;
|
||||
Ok(Box::new(provider))
|
||||
}
|
||||
other => Err(format!(
|
||||
"unknown LLM provider: {other} (use ollama, openai, or anthropic)"
|
||||
"unknown LLM provider: {other} (use gemini, ollama, openai, or anthropic)"
|
||||
)),
|
||||
}
|
||||
} else {
|
||||
let chain = noxa_llm::ProviderChain::default().await;
|
||||
if chain.is_empty() {
|
||||
return Err(
|
||||
"no LLM providers available -- start Ollama or set OPENAI_API_KEY / ANTHROPIC_API_KEY"
|
||||
"no LLM providers available -- install the gemini CLI, start Ollama, or set OPENAI_API_KEY / ANTHROPIC_API_KEY"
|
||||
.into(),
|
||||
);
|
||||
}
|
||||
|
|
@ -1857,12 +1892,12 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
|
|||
}
|
||||
}
|
||||
|
||||
async fn run_llm(cli: &Cli) -> Result<(), String> {
|
||||
async fn run_llm(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), String> {
|
||||
// Extract content from source first (handles PDF detection for URLs)
|
||||
let result = fetch_and_extract(cli).await?.into_extraction()?;
|
||||
let result = fetch_and_extract(cli, resolved).await?.into_extraction()?;
|
||||
|
||||
let provider = build_llm_provider(cli).await?;
|
||||
let model = cli.llm_model.as_deref();
|
||||
let provider = build_llm_provider(cli, resolved).await?;
|
||||
let model = resolved.llm_model.as_deref();
|
||||
|
||||
if let Some(ref schema_input) = cli.extract_json {
|
||||
// Support @file syntax for loading schema from file
|
||||
|
|
@ -1876,6 +1911,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
|
|||
let schema: serde_json::Value =
|
||||
serde_json::from_str(&schema_str).map_err(|e| format!("invalid JSON schema: {e}"))?;
|
||||
|
||||
let t = std::time::Instant::now();
|
||||
let extracted = noxa_llm::extract::extract_json(
|
||||
&result.content.plain_text,
|
||||
&schema,
|
||||
|
|
@ -1884,12 +1920,14 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
|
|||
)
|
||||
.await
|
||||
.map_err(|e| format!("LLM extraction failed: {e}"))?;
|
||||
eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64());
|
||||
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&extracted).expect("serialization failed")
|
||||
);
|
||||
} else if let Some(ref prompt) = cli.extract_prompt {
|
||||
let t = std::time::Instant::now();
|
||||
let extracted = noxa_llm::extract::extract_with_prompt(
|
||||
&result.content.plain_text,
|
||||
prompt,
|
||||
|
|
@ -1898,12 +1936,14 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
|
|||
)
|
||||
.await
|
||||
.map_err(|e| format!("LLM extraction failed: {e}"))?;
|
||||
eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64());
|
||||
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&extracted).expect("serialization failed")
|
||||
);
|
||||
} else if let Some(sentences) = cli.summarize {
|
||||
let t = std::time::Instant::now();
|
||||
let summary = noxa_llm::summarize::summarize(
|
||||
&result.content.plain_text,
|
||||
Some(sentences),
|
||||
|
|
@ -1912,6 +1952,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
|
|||
)
|
||||
.await
|
||||
.map_err(|e| format!("LLM summarization failed: {e}"))?;
|
||||
eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64());
|
||||
|
||||
println!("{summary}");
|
||||
}
|
||||
|
|
@ -1921,12 +1962,16 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
|
|||
|
||||
/// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results.
|
||||
/// URLs are processed sequentially to respect LLM provider rate limits.
|
||||
async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
|
||||
let client =
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
||||
let options = build_extraction_options(cli);
|
||||
let provider = build_llm_provider(cli).await?;
|
||||
let model = cli.llm_model.as_deref();
|
||||
async fn run_batch_llm(
|
||||
cli: &Cli,
|
||||
resolved: &config::ResolvedConfig,
|
||||
entries: &[(String, Option<String>)],
|
||||
) -> Result<(), String> {
|
||||
let client = FetchClient::new(build_fetch_config(cli, resolved))
|
||||
.map_err(|e| format!("client error: {e}"))?;
|
||||
let options = build_extraction_options(resolved);
|
||||
let provider = build_llm_provider(cli, resolved).await?;
|
||||
let model = resolved.llm_model.as_deref();
|
||||
|
||||
// Pre-parse schema once if --extract-json is used
|
||||
let schema = if let Some(ref schema_input) = cli.extract_json {
|
||||
|
|
@ -1974,6 +2019,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
|
|||
let text = &extraction.content.plain_text;
|
||||
|
||||
// Run the appropriate LLM operation
|
||||
let llm_start = std::time::Instant::now();
|
||||
let llm_result = if let Some(ref schema) = schema {
|
||||
noxa_llm::extract::extract_json(text, schema, provider.as_ref(), model)
|
||||
.await
|
||||
|
|
@ -1989,6 +2035,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
|
|||
} else {
|
||||
unreachable!("run_batch_llm called without LLM flags")
|
||||
};
|
||||
let llm_elapsed = llm_start.elapsed();
|
||||
|
||||
match llm_result {
|
||||
Ok(output) => {
|
||||
|
|
@ -2018,7 +2065,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
|
|||
format!("{words} words")
|
||||
}
|
||||
};
|
||||
eprintln!("-> extracted {detail}");
|
||||
eprintln!("-> extracted {detail} ({:.1}s)", llm_elapsed.as_secs_f64());
|
||||
|
||||
if let Some(ref dir) = cli.output_dir {
|
||||
let filename = custom_names
|
||||
|
|
@ -2215,12 +2262,29 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
|
|||
async fn main() {
|
||||
dotenvy::dotenv().ok();
|
||||
|
||||
let cli = Cli::parse();
|
||||
init_logging(cli.verbose);
|
||||
if matches!(std::env::args().nth(1).as_deref(), Some("mcp")) {
|
||||
init_mcp_logging();
|
||||
|
||||
if let Err(e) = noxa_mcp::run().await {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Use low-level API to get both typed Cli and ArgMatches for ValueSource detection.
|
||||
let matches = Cli::command().get_matches();
|
||||
let cli = Cli::from_arg_matches(&matches).unwrap_or_else(|e| e.exit());
|
||||
|
||||
// Load config BEFORE init_logging so verbose from config takes effect.
|
||||
let cfg = config::NoxaConfig::load(cli.config.as_deref());
|
||||
let resolved = config::resolve(&cli, &matches, &cfg);
|
||||
|
||||
init_logging(resolved.verbose);
|
||||
|
||||
// --map: sitemap discovery mode
|
||||
if cli.map {
|
||||
if let Err(e) = run_map(&cli).await {
|
||||
if let Err(e) = run_map(&cli, &resolved).await {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
|
|
@ -2229,7 +2293,7 @@ async fn main() {
|
|||
|
||||
// --crawl: recursive crawl mode
|
||||
if cli.crawl {
|
||||
if let Err(e) = run_crawl(&cli).await {
|
||||
if let Err(e) = run_crawl(&cli, &resolved).await {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
|
|
@ -2245,7 +2309,7 @@ async fn main() {
|
|||
process::exit(1);
|
||||
}
|
||||
};
|
||||
if let Err(e) = run_watch(&cli, &watch_urls).await {
|
||||
if let Err(e) = run_watch(&cli, &resolved, &watch_urls).await {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
|
|
@ -2254,7 +2318,7 @@ async fn main() {
|
|||
|
||||
// --diff-with: change tracking mode
|
||||
if let Some(ref snapshot_path) = cli.diff_with {
|
||||
if let Err(e) = run_diff(&cli, snapshot_path).await {
|
||||
if let Err(e) = run_diff(&cli, &resolved, snapshot_path).await {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
|
|
@ -2263,7 +2327,7 @@ async fn main() {
|
|||
|
||||
// --brand: brand identity extraction mode
|
||||
if cli.brand {
|
||||
if let Err(e) = run_brand(&cli).await {
|
||||
if let Err(e) = run_brand(&cli, &resolved).await {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
|
|
@ -2292,11 +2356,11 @@ async fn main() {
|
|||
// When multiple URLs are provided, run batch LLM extraction over all of them.
|
||||
if has_llm_flags(&cli) {
|
||||
if entries.len() > 1 {
|
||||
if let Err(e) = run_batch_llm(&cli, &entries).await {
|
||||
if let Err(e) = run_batch_llm(&cli, &resolved, &entries).await {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
} else if let Err(e) = run_llm(&cli).await {
|
||||
} else if let Err(e) = run_llm(&cli, &resolved).await {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
|
|
@ -2305,7 +2369,7 @@ async fn main() {
|
|||
|
||||
// Multi-URL batch mode
|
||||
if entries.len() > 1 {
|
||||
if let Err(e) = run_batch(&cli, &entries).await {
|
||||
if let Err(e) = run_batch(&cli, &resolved, &entries).await {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
|
|
@ -2313,8 +2377,11 @@ async fn main() {
|
|||
}
|
||||
|
||||
// --raw-html: skip extraction, dump the fetched HTML
|
||||
if cli.raw_html && cli.include.is_none() && cli.exclude.is_none() {
|
||||
match fetch_html(&cli).await {
|
||||
if resolved.raw_html
|
||||
&& resolved.include_selectors.is_empty()
|
||||
&& resolved.exclude_selectors.is_empty()
|
||||
{
|
||||
match fetch_html(&cli, &resolved).await {
|
||||
Ok(r) => println!("{}", r.html),
|
||||
Err(e) => {
|
||||
eprintln!("error: {e}");
|
||||
|
|
@ -2325,7 +2392,7 @@ async fn main() {
|
|||
}
|
||||
|
||||
// Single-page extraction (handles both HTML and PDF via content-type detection)
|
||||
match fetch_and_extract(&cli).await {
|
||||
match fetch_and_extract(&cli, &resolved).await {
|
||||
Ok(FetchOutput::Local(result)) => {
|
||||
if let Some(ref dir) = cli.output_dir {
|
||||
let url = cli
|
||||
|
|
@ -2334,18 +2401,19 @@ async fn main() {
|
|||
.map(|u| normalize_url(u))
|
||||
.unwrap_or_default();
|
||||
let custom_name = entries.first().and_then(|(_, name)| name.clone());
|
||||
let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format));
|
||||
let content = format_output(&result, &cli.format, cli.metadata);
|
||||
let filename =
|
||||
custom_name.unwrap_or_else(|| url_to_filename(&url, &resolved.format));
|
||||
let content = format_output(&result, &resolved.format, resolved.metadata);
|
||||
if let Err(e) = write_to_file(dir, &filename, &content) {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
} else {
|
||||
print_output(&result, &cli.format, cli.metadata);
|
||||
print_output(&result, &resolved.format, resolved.metadata);
|
||||
}
|
||||
}
|
||||
Ok(FetchOutput::Cloud(resp)) => {
|
||||
print_cloud_output(&resp, &cli.format);
|
||||
print_cloud_output(&resp, &resolved.format);
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("{e}");
|
||||
|
|
@ -2456,3 +2524,28 @@ mod tests {
|
|||
let _ = std::fs::remove_dir_all(&dir);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod enum_deserialize_tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_output_format_deserialize() {
|
||||
let f: OutputFormat = serde_json::from_str("\"llm\"").unwrap();
|
||||
assert!(matches!(f, OutputFormat::Llm));
|
||||
let f: OutputFormat = serde_json::from_str("\"markdown\"").unwrap();
|
||||
assert!(matches!(f, OutputFormat::Markdown));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_browser_deserialize() {
|
||||
let b: Browser = serde_json::from_str("\"firefox\"").unwrap();
|
||||
assert!(matches!(b, Browser::Firefox));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pdf_mode_deserialize() {
|
||||
let p: PdfModeArg = serde_json::from_str("\"fast\"").unwrap();
|
||||
assert!(matches!(p, PdfModeArg::Fast));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ license.workspace = true
|
|||
[dependencies]
|
||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
||||
async-trait = "0.1"
|
||||
jsonschema = { version = "0.46", default-features = false }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
|
|
|
|||
|
|
@ -2,12 +2,15 @@
|
|||
/// Default order: Ollama (local, free) -> OpenAI -> Anthropic.
|
||||
/// Only includes providers that are actually configured/available.
|
||||
use async_trait::async_trait;
|
||||
use tracing::{debug, warn};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::error::LlmError;
|
||||
use crate::provider::{CompletionRequest, LlmProvider};
|
||||
use crate::providers::{
|
||||
anthropic::AnthropicProvider, ollama::OllamaProvider, openai::OpenAiProvider,
|
||||
anthropic::AnthropicProvider,
|
||||
gemini_cli::GeminiCliProvider,
|
||||
ollama::OllamaProvider,
|
||||
openai::OpenAiProvider,
|
||||
};
|
||||
|
||||
pub struct ProviderChain {
|
||||
|
|
@ -15,12 +18,26 @@ pub struct ProviderChain {
|
|||
}
|
||||
|
||||
impl ProviderChain {
|
||||
/// Build the default chain: Ollama -> OpenAI -> Anthropic.
|
||||
/// Ollama is always added (availability checked at call time).
|
||||
/// Build the default chain: Gemini CLI -> OpenAI -> Ollama -> Anthropic.
|
||||
/// Gemini CLI is the primary backend (subprocess-based, requires `gemini` on PATH).
|
||||
/// Cloud providers are only added if their API keys are configured.
|
||||
/// Ollama is added if reachable at call time.
|
||||
pub async fn default() -> Self {
|
||||
let mut providers: Vec<Box<dyn LlmProvider>> = Vec::new();
|
||||
|
||||
let gemini = GeminiCliProvider::new(None);
|
||||
if gemini.is_available().await {
|
||||
debug!("gemini cli available, adding as primary provider");
|
||||
providers.push(Box::new(gemini));
|
||||
} else {
|
||||
debug!("gemini cli not found on PATH, skipping");
|
||||
}
|
||||
|
||||
if let Some(openai) = OpenAiProvider::new(None, None, None) {
|
||||
debug!("openai configured, adding to chain");
|
||||
providers.push(Box::new(openai));
|
||||
}
|
||||
|
||||
let ollama = OllamaProvider::new(None, None);
|
||||
if ollama.is_available().await {
|
||||
debug!("ollama is available, adding to chain");
|
||||
|
|
@ -29,11 +46,6 @@ impl ProviderChain {
|
|||
debug!("ollama not available, skipping");
|
||||
}
|
||||
|
||||
if let Some(openai) = OpenAiProvider::new(None, None, None) {
|
||||
debug!("openai configured, adding to chain");
|
||||
providers.push(Box::new(openai));
|
||||
}
|
||||
|
||||
if let Some(anthropic) = AnthropicProvider::new(None, None) {
|
||||
debug!("anthropic configured, adding to chain");
|
||||
providers.push(Box::new(anthropic));
|
||||
|
|
@ -79,9 +91,10 @@ impl LlmProvider for ProviderChain {
|
|||
for provider in &self.providers {
|
||||
debug!(provider = provider.name(), "attempting completion");
|
||||
|
||||
let t = std::time::Instant::now();
|
||||
match provider.complete(request).await {
|
||||
Ok(response) => {
|
||||
debug!(provider = provider.name(), "completion succeeded");
|
||||
info!(provider = provider.name(), elapsed_ms = t.elapsed().as_millis(), "completion succeeded");
|
||||
return Ok(response);
|
||||
}
|
||||
Err(e) => {
|
||||
|
|
@ -202,4 +215,46 @@ mod tests {
|
|||
assert_eq!(chain.len(), 2);
|
||||
assert!(!chain.is_empty());
|
||||
}
|
||||
|
||||
// ── Gemini-first chain ordering ───────────────────────────────────────────
|
||||
|
||||
#[tokio::test]
|
||||
async fn gemini_first_in_single_provider_chain() {
|
||||
// When we build a chain with a mock "gemini" provider first, it should
|
||||
// be used before any fallback.
|
||||
let chain = ProviderChain::from_providers(vec![
|
||||
Box::new(MockProvider {
|
||||
name: "gemini",
|
||||
response: Ok("from gemini".into()),
|
||||
available: true,
|
||||
}),
|
||||
Box::new(MockProvider {
|
||||
name: "openai",
|
||||
response: Ok("from openai".into()),
|
||||
available: true,
|
||||
}),
|
||||
]);
|
||||
let result = chain.complete(&test_request()).await.unwrap();
|
||||
assert_eq!(result, "from gemini");
|
||||
// Confirm order: first provider name is "gemini"
|
||||
assert_eq!(chain.providers[0].name(), "gemini");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn gemini_failure_falls_back_to_openai() {
|
||||
let chain = ProviderChain::from_providers(vec![
|
||||
Box::new(MockProvider {
|
||||
name: "gemini",
|
||||
response: Err("subprocess timed out".into()),
|
||||
available: true,
|
||||
}),
|
||||
Box::new(MockProvider {
|
||||
name: "openai",
|
||||
response: Ok("from openai".into()),
|
||||
available: true,
|
||||
}),
|
||||
]);
|
||||
let result = chain.complete(&test_request()).await.unwrap();
|
||||
assert_eq!(result, "from openai");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,12 @@ pub enum LlmError {
|
|||
#[error("HTTP error: {0}")]
|
||||
Http(#[from] reqwest::Error),
|
||||
|
||||
#[error("subprocess error: {0}")]
|
||||
Subprocess(#[from] std::io::Error),
|
||||
|
||||
#[error("subprocess timed out")]
|
||||
Timeout,
|
||||
|
||||
#[error("no providers available")]
|
||||
NoProviders,
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,45 @@
|
|||
/// Schema-based and prompt-based LLM extraction.
|
||||
/// Both functions build a system prompt, send content to the LLM, and parse JSON back.
|
||||
use jsonschema;
|
||||
|
||||
use crate::clean::strip_thinking_tags;
|
||||
use crate::error::LlmError;
|
||||
use crate::provider::{CompletionRequest, LlmProvider, Message};
|
||||
|
||||
/// Validate a JSON value against a schema. Returns Ok(()) on success or
|
||||
/// Err(LlmError::InvalidJson) with a concise error message on failure.
|
||||
fn validate_schema(
|
||||
value: &serde_json::Value,
|
||||
schema: &serde_json::Value,
|
||||
) -> Result<(), LlmError> {
|
||||
let compiled = jsonschema::validator_for(schema).map_err(|e| {
|
||||
LlmError::InvalidJson(format!("invalid schema: {e}"))
|
||||
})?;
|
||||
|
||||
let errors: Vec<String> = compiled
|
||||
.iter_errors(value)
|
||||
.map(|e| format!("{} at {}", e, e.instance_path()))
|
||||
.collect();
|
||||
|
||||
if errors.is_empty() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(LlmError::InvalidJson(format!(
|
||||
"schema validation failed: {}",
|
||||
errors.join("; ")
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract structured JSON from content using a JSON schema.
|
||||
/// The schema tells the LLM exactly what fields to extract and their types.
|
||||
///
|
||||
/// Retry policy:
|
||||
/// - If the response cannot be parsed as JSON at all: retry once with the
|
||||
/// identical request (handles transient formatting issues).
|
||||
/// - If the response is valid JSON but fails schema validation: return
|
||||
/// `LlmError::InvalidJson` immediately — the schema is likely unsatisfiable
|
||||
/// for this content, so retrying would produce the same result.
|
||||
pub async fn extract_json(
|
||||
content: &str,
|
||||
schema: &serde_json::Value,
|
||||
|
|
@ -37,7 +71,22 @@ pub async fn extract_json(
|
|||
};
|
||||
|
||||
let response = provider.complete(&request).await?;
|
||||
parse_json_response(&response)
|
||||
|
||||
match parse_json_response(&response) {
|
||||
Ok(value) => {
|
||||
// Valid JSON — now validate against the schema.
|
||||
// Schema mismatches do not retry (unsatisfiable → same result).
|
||||
validate_schema(&value, schema)?;
|
||||
Ok(value)
|
||||
}
|
||||
Err(_parse_err) => {
|
||||
// Unparseable JSON — retry once with the identical request.
|
||||
let retry_response = provider.complete(&request).await?;
|
||||
let value = parse_json_response(&retry_response)?;
|
||||
validate_schema(&value, schema)?;
|
||||
Ok(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract information using a natural language prompt.
|
||||
|
|
@ -184,4 +233,130 @@ mod tests {
|
|||
|
||||
assert_eq!(result["emails"][0], "test@example.com");
|
||||
}
|
||||
|
||||
// ── Schema validation ─────────────────────────────────────────────────────
|
||||
|
||||
#[tokio::test]
|
||||
async fn schema_validation_passes_for_matching_json() {
|
||||
let schema = serde_json::json!({
|
||||
"type": "object",
|
||||
"required": ["price"],
|
||||
"properties": {
|
||||
"price": { "type": "number" }
|
||||
}
|
||||
});
|
||||
let mock = MockProvider::ok(r#"{"price": 9.99}"#);
|
||||
let result = extract_json("content", &schema, &mock, None).await.unwrap();
|
||||
assert_eq!(result["price"], 9.99);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn schema_validation_fails_for_wrong_type() {
|
||||
let schema = serde_json::json!({
|
||||
"type": "object",
|
||||
"required": ["price"],
|
||||
"properties": {
|
||||
"price": { "type": "number" }
|
||||
}
|
||||
});
|
||||
// Model returns valid JSON but wrong type ("string" instead of number).
|
||||
// Should NOT retry (schema mismatch ≠ parse failure) — returns InvalidJson immediately.
|
||||
let mock = MockProvider::ok(r#"{"price": "not-a-number"}"#);
|
||||
let result = extract_json("content", &schema, &mock, None).await;
|
||||
assert!(
|
||||
matches!(result, Err(LlmError::InvalidJson(_))),
|
||||
"expected InvalidJson for schema mismatch, got {result:?}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn schema_validation_fails_for_missing_required_field() {
|
||||
let schema = serde_json::json!({
|
||||
"type": "object",
|
||||
"required": ["title"],
|
||||
"properties": {
|
||||
"title": { "type": "string" }
|
||||
}
|
||||
});
|
||||
let mock = MockProvider::ok(r#"{"other": "value"}"#);
|
||||
let result = extract_json("content", &schema, &mock, None).await;
|
||||
assert!(matches!(result, Err(LlmError::InvalidJson(_))));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn parse_failure_triggers_one_retry() {
|
||||
use crate::testing::mock::SequenceMockProvider;
|
||||
|
||||
let schema = serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": { "title": { "type": "string" } }
|
||||
});
|
||||
|
||||
// First call: unparseable JSON. Second call: valid JSON matching schema.
|
||||
let mock = SequenceMockProvider::new(
|
||||
"mock-seq",
|
||||
vec![
|
||||
Ok("this is not json at all".to_string()),
|
||||
Ok(r#"{"title": "Retry succeeded"}"#.to_string()),
|
||||
],
|
||||
);
|
||||
|
||||
let result = extract_json("content", &schema, &mock, None)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(result["title"], "Retry succeeded");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn both_attempts_fail_returns_invalid_json() {
|
||||
use crate::testing::mock::SequenceMockProvider;
|
||||
|
||||
let schema = serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": { "title": { "type": "string" } }
|
||||
});
|
||||
|
||||
let mock = SequenceMockProvider::new(
|
||||
"mock-seq",
|
||||
vec![
|
||||
Ok("not json".to_string()),
|
||||
Ok("also not json".to_string()),
|
||||
],
|
||||
);
|
||||
|
||||
let result = extract_json("content", &schema, &mock, None).await;
|
||||
assert!(
|
||||
matches!(result, Err(LlmError::InvalidJson(_))),
|
||||
"expected InvalidJson after both attempts fail"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn schema_mismatch_does_not_retry() {
|
||||
use crate::testing::mock::SequenceMockProvider;
|
||||
|
||||
let schema = serde_json::json!({
|
||||
"type": "object",
|
||||
"required": ["price"],
|
||||
"properties": {
|
||||
"price": { "type": "number" }
|
||||
}
|
||||
});
|
||||
|
||||
// Both calls return valid JSON with wrong schema — but only one call should happen.
|
||||
let mock = SequenceMockProvider::new(
|
||||
"mock-seq",
|
||||
vec![
|
||||
Ok(r#"{"price": "wrong-type"}"#.to_string()),
|
||||
Ok(r#"{"price": 9.99}"#.to_string()), // would succeed — but shouldn't be called
|
||||
],
|
||||
);
|
||||
|
||||
// Should return InvalidJson without calling second response.
|
||||
let result = extract_json("content", &schema, &mock, None).await;
|
||||
assert!(
|
||||
matches!(result, Err(LlmError::InvalidJson(_))),
|
||||
"schema mismatch should not trigger retry"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
/// noxa-llm: LLM integration with local-first hybrid architecture.
|
||||
/// noxa-llm: LLM integration with Gemini-CLI-first hybrid architecture.
|
||||
///
|
||||
/// Provider chain tries Ollama (local) first, falls back to OpenAI, then Anthropic.
|
||||
/// Provides schema-based extraction, prompt extraction, and summarization
|
||||
/// on top of noxa-core's content pipeline.
|
||||
/// Provider chain: Gemini CLI (primary) → OpenAI → Ollama → Anthropic.
|
||||
/// Gemini CLI requires the `gemini` binary on PATH; GEMINI_MODEL env var sets the model.
|
||||
/// Provides schema-validated extraction (with one retry on parse failure),
|
||||
/// prompt extraction, and summarization on top of noxa-core's content pipeline.
|
||||
pub mod chain;
|
||||
pub mod clean;
|
||||
pub mod error;
|
||||
|
|
|
|||
392
crates/noxa-llm/src/providers/gemini_cli.rs
Normal file
392
crates/noxa-llm/src/providers/gemini_cli.rs
Normal file
|
|
@ -0,0 +1,392 @@
|
|||
/// Gemini CLI provider — shells out to `gemini -p` for completions.
|
||||
/// Primary provider in the default chain; requires the `gemini` binary on PATH.
|
||||
///
|
||||
/// Prompts are passed via the `-p` flag (not via stdin or as a positional) to prevent
|
||||
/// command injection from web-scraped content. Output is parsed from `--output-format json`.
|
||||
///
|
||||
/// # Startup optimizations
|
||||
///
|
||||
/// The gemini CLI is an agentic Node.js application that connects to every configured MCP
|
||||
/// server at startup (the user has 6). Without mitigation this can add 10-60+ seconds per
|
||||
/// call as those servers spin up and time out.
|
||||
///
|
||||
/// Two flags reduce this:
|
||||
/// - `--extensions ""` — skips extension loading (~3 s saved)
|
||||
/// - `current_dir` set to a temp workdir containing `.gemini/settings.json` with
|
||||
/// `{"mcpServers":{}}` — workspace settings override user settings, so all 6 MCP
|
||||
/// servers are skipped at subprocess startup (major speedup).
|
||||
///
|
||||
/// The workdir is created once at construction and reused for every call.
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use tokio::process::Command;
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio::time::timeout;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::clean::strip_thinking_tags;
|
||||
use crate::error::LlmError;
|
||||
use crate::provider::{CompletionRequest, LlmProvider};
|
||||
|
||||
/// Maximum concurrent Gemini subprocess calls.
|
||||
const MAX_CONCURRENT: usize = 6;
|
||||
/// Subprocess deadline — prevents hung `gemini` processes blocking the chain.
|
||||
const SUBPROCESS_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
/// Fixed workdir used for every subprocess call.
|
||||
/// A workspace-level `.gemini/settings.json` here overrides the user's MCP server config.
|
||||
const NOXA_GEMINI_WORKDIR: &str = "/tmp/noxa-gemini";
|
||||
|
||||
pub struct GeminiCliProvider {
|
||||
default_model: String,
|
||||
semaphore: Arc<Semaphore>,
|
||||
/// Workdir with a minimal `.gemini/settings.json` that disables MCP servers.
|
||||
workdir: PathBuf,
|
||||
}
|
||||
|
||||
impl GeminiCliProvider {
|
||||
/// Construct the provider.
|
||||
/// Model resolves as: `model` arg → `GEMINI_MODEL` env → `"gemini-2.5-pro"`.
|
||||
pub fn new(model: Option<String>) -> Self {
|
||||
let default_model = model
|
||||
.or_else(|| std::env::var("GEMINI_MODEL").ok())
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or_else(|| "gemini-2.5-pro".into());
|
||||
|
||||
let workdir = PathBuf::from(NOXA_GEMINI_WORKDIR);
|
||||
ensure_gemini_workdir(&workdir);
|
||||
|
||||
Self {
|
||||
default_model,
|
||||
semaphore: Arc::new(Semaphore::new(MAX_CONCURRENT)),
|
||||
workdir,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn default_model(&self) -> &str {
|
||||
&self.default_model
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl LlmProvider for GeminiCliProvider {
|
||||
async fn complete(&self, request: &CompletionRequest) -> Result<String, LlmError> {
|
||||
let model = if request.model.is_empty() {
|
||||
&self.default_model
|
||||
} else {
|
||||
&request.model
|
||||
};
|
||||
|
||||
// Build the prompt text from all messages.
|
||||
let prompt = build_prompt(&request.messages);
|
||||
|
||||
// Acquire concurrency slot before spawning.
|
||||
let _permit = self
|
||||
.semaphore
|
||||
.acquire()
|
||||
.await
|
||||
.map_err(|_| LlmError::ProviderError("gemini semaphore closed".into()))?;
|
||||
|
||||
let mut cmd = Command::new("gemini");
|
||||
// -p STRING: headless mode with prompt as the flag value (never positional arg).
|
||||
// Passing via -p prevents command injection; the value is never interpreted as a shell command.
|
||||
cmd.arg("-p").arg(&prompt);
|
||||
cmd.arg("--model").arg(model);
|
||||
// Always request structured JSON output so we can extract the `response` field
|
||||
// and skip any preceding noise lines (e.g. MCP status warnings).
|
||||
cmd.arg("--output-format").arg("json");
|
||||
// --yolo suppresses any interactive confirmation prompts in headless mode.
|
||||
cmd.arg("--yolo");
|
||||
// --extensions "" skips loading user extensions (~3 s startup savings).
|
||||
cmd.arg("--extensions").arg("");
|
||||
// Workspace settings in self.workdir override the user's ~/.gemini/settings.json,
|
||||
// replacing the user's MCP server list with {} so none are spawned at startup.
|
||||
// Without this, each of the user's MCP servers adds latency to every call.
|
||||
cmd.current_dir(&self.workdir);
|
||||
|
||||
cmd.stdin(std::process::Stdio::null());
|
||||
cmd.stdout(std::process::Stdio::piped());
|
||||
cmd.stderr(std::process::Stdio::piped());
|
||||
|
||||
debug!(model, workdir = %self.workdir.display(), "spawning gemini subprocess");
|
||||
|
||||
let child = cmd.spawn().map_err(LlmError::Subprocess)?;
|
||||
|
||||
// Bounded wait — prevents indefinite hangs on auth expiry or network stall.
|
||||
let output = match timeout(SUBPROCESS_TIMEOUT, child.wait_with_output()).await {
|
||||
Ok(Ok(out)) => out,
|
||||
Ok(Err(e)) => return Err(LlmError::Subprocess(e)),
|
||||
Err(_elapsed) => return Err(LlmError::Timeout),
|
||||
};
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr_preview = String::from_utf8_lossy(&output.stderr);
|
||||
let preview = &stderr_preview[..stderr_preview.len().min(500)];
|
||||
return Err(LlmError::ProviderError(format!(
|
||||
"gemini exited with {}: {preview}",
|
||||
output.status
|
||||
)));
|
||||
}
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
let response = extract_response_from_output(&stdout)?;
|
||||
let cleaned = strip_code_fences(strip_thinking_tags(&response).trim());
|
||||
Ok(cleaned)
|
||||
}
|
||||
|
||||
async fn is_available(&self) -> bool {
|
||||
// Pure PATH check — no inference call, fast.
|
||||
matches!(
|
||||
Command::new("gemini")
|
||||
.arg("--version")
|
||||
.stdout(std::process::Stdio::null())
|
||||
.stderr(std::process::Stdio::null())
|
||||
.status()
|
||||
.await,
|
||||
Ok(s) if s.success()
|
||||
)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"gemini"
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse the `response` field from gemini's `--output-format json` output.
|
||||
///
|
||||
/// The CLI emits lines before the JSON object (e.g. MCP status warnings).
|
||||
/// We find the first `{` to locate the JSON, parse it, and extract `.response`.
|
||||
fn extract_response_from_output(stdout: &str) -> Result<String, LlmError> {
|
||||
let json_start = stdout.find('{').ok_or_else(|| {
|
||||
let preview = &stdout[..stdout.len().min(300)];
|
||||
LlmError::ProviderError(format!("gemini produced no JSON output: {preview}"))
|
||||
})?;
|
||||
|
||||
let json_str = &stdout[json_start..];
|
||||
let outer: serde_json::Value = serde_json::from_str(json_str).map_err(|e| {
|
||||
let preview = &json_str[..json_str.len().min(300)];
|
||||
LlmError::ProviderError(format!("failed to parse gemini JSON output: {e} — {preview}"))
|
||||
})?;
|
||||
|
||||
// `response` holds the model's actual text output.
|
||||
outer["response"]
|
||||
.as_str()
|
||||
.ok_or_else(|| {
|
||||
LlmError::ProviderError(format!(
|
||||
"gemini JSON output missing 'response' field: {}",
|
||||
&json_str[..json_str.len().min(300)]
|
||||
))
|
||||
})
|
||||
.map(|s| s.to_string())
|
||||
}
|
||||
|
||||
/// Create the noxa gemini workdir with a minimal workspace settings file.
|
||||
///
|
||||
/// The `.gemini/settings.json` written here overrides the user's `~/.gemini/settings.json`
|
||||
/// for any `gemini` subprocess run from this directory. Setting `mcpServers` to `{}` prevents
|
||||
/// the CLI from spawning the user's configured MCP servers on every headless call.
|
||||
///
|
||||
/// Errors are intentionally ignored — if the write fails, the subprocess still works,
|
||||
/// just without the startup optimization (and with a warning in the logs).
|
||||
fn ensure_gemini_workdir(workdir: &std::path::Path) {
|
||||
let settings_dir = workdir.join(".gemini");
|
||||
let settings_path = settings_dir.join("settings.json");
|
||||
|
||||
if settings_path.exists() {
|
||||
return;
|
||||
}
|
||||
|
||||
if let Err(e) = std::fs::create_dir_all(&settings_dir) {
|
||||
tracing::warn!(path = %settings_dir.display(), error = %e, "failed to create gemini workdir");
|
||||
return;
|
||||
}
|
||||
|
||||
// Minimal workspace settings: disable all MCP servers.
|
||||
// Workspace settings override ~/.gemini/settings.json per gemini CLI docs.
|
||||
let content = r#"{"mcpServers":{}}"#;
|
||||
if let Err(e) = std::fs::write(&settings_path, content) {
|
||||
tracing::warn!(path = %settings_path.display(), error = %e, "failed to write gemini workspace settings");
|
||||
}
|
||||
}
|
||||
|
||||
/// Concatenate all messages into a single prompt string for the CLI.
|
||||
fn build_prompt(messages: &[crate::provider::Message]) -> String {
|
||||
messages
|
||||
.iter()
|
||||
.map(|m| match m.role.as_str() {
|
||||
"system" => format!("[System]: {}", m.content),
|
||||
"assistant" => format!("[Assistant]: {}", m.content),
|
||||
_ => m.content.clone(),
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n\n")
|
||||
}
|
||||
|
||||
/// Strip a surrounding markdown code fence (```` ```json ```` or plain
/// ```` ``` ````) from a response string; non-fenced input is just trimmed.
fn strip_code_fences(s: &str) -> String {
    let body = s.trim();
    if !body.starts_with("```") {
        return body.to_string();
    }
    // Drop the opening fence (preferring the "json" language tag), then the
    // closing fence if present, and trim what remains.
    let opened = body
        .strip_prefix("```json")
        .or_else(|| body.strip_prefix("```"))
        .unwrap_or(body);
    let closed = opened.strip_suffix("```").unwrap_or(opened);
    closed.trim().to_string()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    // Unit tests for the gemini CLI provider helpers: construction/model
    // selection, prompt building, JSON response extraction, fence stripping,
    // availability probing, and thinking-tag cleanup.
    use super::*;

    // ── Construction ──────────────────────────────────────────────────────────

    #[test]
    fn explicit_model_used() {
        let p = GeminiCliProvider::new(Some("gemini-1.5-flash".into()));
        assert_eq!(p.default_model(), "gemini-1.5-flash");
        assert_eq!(p.name(), "gemini");
    }

    #[test]
    fn default_model_fallback() {
        // Explicit None + no GEMINI_MODEL env → hardcoded default.
        // We unset the env to avoid flakiness (it may or may not be set).
        unsafe { std::env::remove_var("GEMINI_MODEL") };
        let p = GeminiCliProvider::new(None);
        assert_eq!(p.default_model(), "gemini-2.5-pro");
    }

    // Env var tests mutate process-global state and race with parallel tests.
    // Run in isolation if needed:
    // cargo test -p noxa-llm env_model_override -- --ignored --test-threads=1
    #[test]
    #[ignore = "mutates process env; run with --test-threads=1"]
    fn env_model_override() {
        unsafe { std::env::set_var("GEMINI_MODEL", "gemini-1.5-pro") };
        let p = GeminiCliProvider::new(None);
        assert_eq!(p.default_model(), "gemini-1.5-pro");
        unsafe { std::env::remove_var("GEMINI_MODEL") };
    }

    // ── build_prompt ──────────────────────────────────────────────────────────

    #[test]
    fn build_prompt_user_only() {
        use crate::provider::Message;
        let messages = vec![Message {
            role: "user".into(),
            content: "hello world".into(),
        }];
        assert_eq!(build_prompt(&messages), "hello world");
    }

    #[test]
    fn build_prompt_system_and_user() {
        use crate::provider::Message;
        let messages = vec![
            Message {
                role: "system".into(),
                content: "You are helpful.".into(),
            },
            Message {
                role: "user".into(),
                content: "Tell me something.".into(),
            },
        ];
        let result = build_prompt(&messages);
        assert!(result.contains("[System]: You are helpful."));
        assert!(result.contains("Tell me something."));
    }

    // ── extract_response_from_output ──────────────────────────────────────────

    #[test]
    fn extracts_response_from_clean_json() {
        let stdout = r#"{"session_id":"abc","response":"Hello world","stats":{}}"#;
        assert_eq!(extract_response_from_output(stdout).unwrap(), "Hello world");
    }

    #[test]
    fn extracts_response_skipping_mcp_noise() {
        // MCP warning line appears before the JSON object in real gemini output.
        let stdout = "MCP issues detected. Run /mcp list for status.\n{\"session_id\":\"abc\",\"response\":\"the answer\",\"stats\":{}}";
        assert_eq!(
            extract_response_from_output(stdout).unwrap(),
            "the answer"
        );
    }

    #[test]
    fn error_when_no_json_in_output() {
        let result = extract_response_from_output("MCP issues detected. No JSON follows.");
        assert!(matches!(result, Err(LlmError::ProviderError(_))));
    }

    #[test]
    fn error_when_response_field_missing() {
        let stdout = r#"{"session_id":"abc","stats":{}}"#;
        let result = extract_response_from_output(stdout);
        assert!(matches!(result, Err(LlmError::ProviderError(_))));
    }

    // ── strip_code_fences ─────────────────────────────────────────────────────

    #[test]
    fn strips_json_fence() {
        let input = "```json\n{\"key\": \"value\"}\n```";
        assert_eq!(strip_code_fences(input), "{\"key\": \"value\"}");
    }

    #[test]
    fn strips_plain_fence() {
        let input = "```\nhello\n```";
        assert_eq!(strip_code_fences(input), "hello");
    }

    #[test]
    fn passthrough_no_fence() {
        let input = "{\"key\": \"value\"}";
        assert_eq!(strip_code_fences(input), "{\"key\": \"value\"}");
    }

    // ── is_available returns false when binary absent ──────────────────────────

    #[tokio::test]
    async fn unavailable_when_binary_missing() {
        let result = tokio::process::Command::new("__noxa_nonexistent_binary_xyz__")
            .arg("--version")
            .stdout(std::process::Stdio::null())
            .stderr(std::process::Stdio::null())
            .status()
            .await;
        assert!(result.is_err(), "missing binary should fail to spawn");
    }

    // ── thinking tag stripping ────────────────────────────────────────────────

    #[test]
    fn strips_thinking_tags_from_output() {
        let raw = "<think>internal reasoning</think>{\"result\": true}";
        let after_thinking = strip_thinking_tags(raw);
        let after_fences = strip_code_fences(after_thinking.trim());
        assert_eq!(after_fences, "{\"result\": true}");
    }

    #[test]
    fn strips_code_fence_after_thinking() {
        let raw = "<think>let me check</think>\n```json\n{\"ok\": 1}\n```";
        let after_thinking = strip_thinking_tags(raw);
        let after_fences = strip_code_fences(after_thinking.trim());
        assert_eq!(after_fences, "{\"ok\": 1}");
    }
}
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
pub mod anthropic;
|
||||
pub mod gemini_cli;
|
||||
pub mod ollama;
|
||||
pub mod openai;
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
/// First choice in the provider chain: free, private, fast on Apple Silicon.
|
||||
use async_trait::async_trait;
|
||||
use serde_json::json;
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::clean::strip_thinking_tags;
|
||||
use crate::error::LlmError;
|
||||
|
|
@ -96,7 +97,10 @@ impl LlmProvider for OllamaProvider {
|
|||
|
||||
async fn is_available(&self) -> bool {
|
||||
let url = format!("{}/api/tags", self.base_url);
|
||||
matches!(self.client.get(&url).send().await, Ok(r) if r.status().is_success())
|
||||
matches!(
|
||||
tokio::time::timeout(Duration::from_millis(500), self.client.get(&url).send()).await,
|
||||
Ok(Ok(r)) if r.status().is_success()
|
||||
)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
|
|
|
|||
|
|
@ -4,6 +4,9 @@
|
|||
/// extract, chain, and other modules that need a fake LLM backend.
|
||||
#[cfg(test)]
|
||||
pub(crate) mod mock {
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
|
||||
use crate::error::LlmError;
|
||||
|
|
@ -45,4 +48,48 @@ pub(crate) mod mock {
|
|||
self.name
|
||||
}
|
||||
}
|
||||
|
||||
    /// A mock provider that returns responses from a sequence.
    /// Call N returns `responses[min(N, len - 1)]` — once the sequence is
    /// exhausted, the last response repeats (clamped, not wrapped).
    /// Useful for testing first-failure / second-success retry paths.
    pub struct SequenceMockProvider {
        pub name: &'static str,
        pub responses: Vec<Result<String, String>>,
        pub available: bool,
        // Number of complete() calls made so far; drives the sequence position.
        call_count: Arc<AtomicUsize>,
    }
|
||||
|
||||
impl SequenceMockProvider {
|
||||
pub fn new(
|
||||
name: &'static str,
|
||||
responses: Vec<Result<String, String>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
name,
|
||||
responses,
|
||||
available: true,
|
||||
call_count: Arc::new(AtomicUsize::new(0)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl LlmProvider for SequenceMockProvider {
|
||||
async fn complete(&self, _request: &CompletionRequest) -> Result<String, LlmError> {
|
||||
let idx = self.call_count.fetch_add(1, Ordering::SeqCst);
|
||||
let response = &self.responses[idx.min(self.responses.len() - 1)];
|
||||
match response {
|
||||
Ok(text) => Ok(text.clone()),
|
||||
Err(msg) => Err(LlmError::ProviderError(msg.clone())),
|
||||
}
|
||||
}
|
||||
|
||||
async fn is_available(&self) -> bool {
|
||||
self.available
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
self.name
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,6 +5,10 @@ version.workspace = true
|
|||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[lib]
|
||||
name = "noxa_mcp"
|
||||
path = "src/lib.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "noxa-mcp"
|
||||
path = "src/main.rs"
|
||||
|
|
@ -14,8 +18,8 @@ noxa-core = { workspace = true }
|
|||
noxa-fetch = { workspace = true }
|
||||
noxa-llm = { workspace = true }
|
||||
noxa-pdf = { workspace = true }
|
||||
rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] }
|
||||
schemars = "1.0"
|
||||
rmcp = { workspace = true }
|
||||
schemars = { workspace = true }
|
||||
dotenvy = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
|
|
@ -24,4 +28,4 @@ tracing = { workspace = true }
|
|||
tracing-subscriber = { workspace = true }
|
||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
||||
url = "2"
|
||||
dirs = "6.0.0"
|
||||
dirs = { workspace = true }
|
||||
|
|
|
|||
20
crates/noxa-mcp/src/lib.rs
Normal file
20
crates/noxa-mcp/src/lib.rs
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
/// noxa-mcp library wrapper.
|
||||
///
|
||||
/// This exposes the MCP server so it can be embedded by the `noxa` CLI via
|
||||
/// `noxa mcp` without duplicating the transport/bootstrap code.
|
||||
///
|
||||
/// Callers must initialize tracing before calling `run()`. Stdout must remain
|
||||
/// untouched after `run()` begins because it carries the MCP wire protocol.
|
||||
pub(crate) mod cloud;
|
||||
pub(crate) mod server;
|
||||
pub(crate) mod tools;
|
||||
|
||||
use rmcp::ServiceExt;
|
||||
use rmcp::transport::stdio;
|
||||
|
||||
/// Start the MCP server over stdio and block until the client disconnects.
|
||||
pub async fn run() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let service = server::NoxaMcp::new().await.serve(stdio()).await?;
|
||||
service.waiting().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -1,15 +1,6 @@
|
|||
/// noxa-mcp: MCP (Model Context Protocol) server for noxa.
|
||||
/// Exposes web extraction tools over stdio transport for AI agents
|
||||
/// like Claude Desktop, Claude Code, and other MCP clients.
|
||||
mod cloud;
|
||||
mod server;
|
||||
mod tools;
|
||||
|
||||
use rmcp::ServiceExt;
|
||||
use rmcp::transport::stdio;
|
||||
|
||||
use server::NoxaMcp;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
dotenvy::dotenv().ok();
|
||||
|
|
@ -21,8 +12,5 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
.with_ansi(false)
|
||||
.init();
|
||||
|
||||
let service = NoxaMcp::new().await.serve(stdio()).await?;
|
||||
|
||||
service.waiting().await?;
|
||||
Ok(())
|
||||
noxa_mcp::run().await
|
||||
}
|
||||
|
|
|
|||
|
|
@ -89,7 +89,7 @@ impl NoxaMcp {
|
|||
|
||||
let chain = noxa_llm::ProviderChain::default().await;
|
||||
let llm_chain = if chain.is_empty() {
|
||||
warn!("no LLM providers available -- extract/summarize tools will fail");
|
||||
warn!("no LLM providers available (gemini CLI, OPENAI_API_KEY, ANTHROPIC_API_KEY) -- extract/summarize tools will fail");
|
||||
None
|
||||
} else {
|
||||
info!(providers = chain.len(), "LLM provider chain ready");
|
||||
|
|
@ -334,7 +334,7 @@ impl NoxaMcp {
|
|||
// No local LLM — fall back to cloud API directly
|
||||
if self.llm_chain.is_none() {
|
||||
let cloud = self.cloud.as_ref().ok_or(
|
||||
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
|
||||
"No LLM providers available. Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
|
||||
)?;
|
||||
let mut body = json!({"url": params.url});
|
||||
if let Some(ref schema) = params.schema {
|
||||
|
|
@ -387,7 +387,7 @@ impl NoxaMcp {
|
|||
// No local LLM — fall back to cloud API directly
|
||||
if self.llm_chain.is_none() {
|
||||
let cloud = self.cloud.as_ref().ok_or(
|
||||
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
|
||||
"No LLM providers available. Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
|
||||
)?;
|
||||
let mut body = json!({"url": params.url});
|
||||
if let Some(sentences) = params.max_sentences {
|
||||
|
|
|
|||
51
env.example
51
env.example
|
|
@ -1,43 +1,20 @@
|
|||
# ============================================
|
||||
# Noxa Configuration
|
||||
# Copy to .env and fill in your values
|
||||
# ============================================
|
||||
# Secrets, URLs, and path overrides only — everything else goes in config.json
|
||||
# See config.example.json for the full list of configurable defaults.
|
||||
|
||||
# --- LLM Providers ---
|
||||
# Cloud API key (required for --cloud / --research)
|
||||
NOXA_API_KEY=
|
||||
|
||||
# Ollama (local, default provider)
|
||||
OLLAMA_HOST=http://localhost:11434
|
||||
OLLAMA_MODEL=qwen3:8b
|
||||
# Single proxy URL (or use NOXA_PROXY_FILE for pool rotation)
|
||||
NOXA_PROXY=
|
||||
|
||||
# OpenAI (optional cloud fallback)
|
||||
# OPENAI_API_KEY — set your OpenAI key
|
||||
# OPENAI_BASE_URL — defaults to https://api.openai.com/v1
|
||||
# OPENAI_MODEL — defaults to gpt-4o-mini
|
||||
# Proxy pool file path for rotating proxies
|
||||
NOXA_PROXY_FILE=
|
||||
|
||||
# Anthropic (optional cloud fallback)
|
||||
# ANTHROPIC_API_KEY — set your Anthropic key
|
||||
# ANTHROPIC_MODEL — defaults to claude-sonnet-4-20250514
|
||||
# Webhook URL for completion notifications
|
||||
NOXA_WEBHOOK_URL=
|
||||
|
||||
# --- Proxy ---
|
||||
# LLM base URL (Ollama or OpenAI-compatible endpoint)
|
||||
NOXA_LLM_BASE_URL=
|
||||
|
||||
# Single proxy
|
||||
# NOXA_PROXY=http://user:pass@host:port
|
||||
|
||||
# Proxy file (one per line: host:port:user:pass)
|
||||
# NOXA_PROXY_FILE=/path/to/proxies.txt
|
||||
|
||||
# --- Server (noxa-server only) ---
|
||||
# NOXA_PORT=3000
|
||||
# NOXA_HOST=0.0.0.0
|
||||
# NOXA_AUTH_KEY=your-auth-key
|
||||
# NOXA_MAX_CONCURRENCY=50
|
||||
# NOXA_JOB_TTL_SECS=3600
|
||||
# NOXA_MAX_JOBS=100
|
||||
|
||||
# --- CLI LLM overrides ---
|
||||
# NOXA_LLM_PROVIDER=ollama
|
||||
# NOXA_LLM_MODEL=qwen3:8b
|
||||
# NOXA_LLM_BASE_URL=http://localhost:11434
|
||||
|
||||
# --- Logging ---
|
||||
# NOXA_LOG=info
|
||||
# Optional: path to a non-default config file (default: ./config.json)
|
||||
# NOXA_CONFIG=/path/to/my-config.json
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue