diff --git a/.agents/plugins/marketplace.json b/.agents/plugins/marketplace.json new file mode 100644 index 0000000..288f6a2 --- /dev/null +++ b/.agents/plugins/marketplace.json @@ -0,0 +1,20 @@ +{ + "name": "noxa-marketplace", + "interface": { + "displayName": "noxa" + }, + "plugins": [ + { + "name": "noxa", + "source": { + "source": "local", + "path": "./" + }, + "policy": { + "installation": "AVAILABLE", + "authentication": "ON_INSTALL" + }, + "category": "Productivity" + } + ] +} diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json new file mode 100644 index 0000000..e8a74fb --- /dev/null +++ b/.claude-plugin/marketplace.json @@ -0,0 +1,21 @@ +{ + "name": "noxa-marketplace", + "owner": { + "name": "jmagar" + }, + "metadata": { + "description": "Marketplace for the noxa plugin", + "version": "0.4.0", + "pluginRoot": "./" + }, + "plugins": [ + { + "name": "noxa", + "source": "./", + "description": "noxa CLI, MCP server, and skills for AI-assisted web extraction", + "version": "0.4.0", + "category": "Productivity", + "tags": ["web", "extraction", "mcp", "skills"] + } + ] +} diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..dfdeec4 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,14 @@ +{ + "name": "noxa", + "version": "0.4.0", + "description": "noxa CLI, MCP server, and skills for AI-assisted web extraction", + "author": { + "name": "jmagar" + }, + "homepage": "https://noxa.io", + "repository": "https://github.com/jmagar/noxa", + "license": "AGPL-3.0", + "keywords": ["web", "extraction", "mcp", "ai"], + "skills": "./skills/", + "mcpServers": "./.mcp.json" +} diff --git a/.codex-plugin/plugin.json b/.codex-plugin/plugin.json new file mode 100644 index 0000000..629ae84 --- /dev/null +++ b/.codex-plugin/plugin.json @@ -0,0 +1,19 @@ +{ + "name": "noxa", + "version": "0.4.0", + "description": "noxa CLI, MCP server, and skills for AI-assisted web extraction", + "author": { + 
"name": "jmagar" + }, + "homepage": "https://noxa.io", + "repository": "https://github.com/jmagar/noxa", + "license": "AGPL-3.0", + "keywords": ["web", "extraction", "mcp", "ai"], + "skills": "./skills/", + "mcpServers": "./.mcp.json", + "interface": { + "displayName": "noxa", + "shortDescription": "AI-assisted web extraction", + "longDescription": "Bundle the noxa skill and MCP server for Codex workflows." + } +} diff --git a/.gitignore b/.gitignore index 6293f80..2a48456 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,9 @@ docs/superpowers docs/reports docs/sessions benchmarks -docs +docs/* +!docs/config.md +.worktrees/ # Beads / Dolt files (added by bd init) .dolt/ diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 0000000..e8d56c1 --- /dev/null +++ b/.mcp.json @@ -0,0 +1,8 @@ +{ + "mcpServers": { + "noxa": { + "command": "noxa", + "args": ["mcp"] + } + } +} diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..2080244 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,84 @@ +# Agent Instructions + +This project uses **bd** (beads) for issue tracking. Run `bd onboard` to get started. + +## Quick Reference + +```bash +bd ready # Find available work +bd show <id> # View issue details +bd update <id> --claim # Claim work atomically +bd close <id> # Complete work +bd dolt push # Push beads data to remote +``` + +## Non-Interactive Shell Commands + +**ALWAYS use non-interactive flags** with file operations to avoid hanging on confirmation prompts. + +Shell commands like `cp`, `mv`, and `rm` may be aliased to include `-i` (interactive) mode on some systems, causing the agent to hang indefinitely waiting for y/n input. 
+ +**Use these forms instead:** +```bash +# Force overwrite without prompting +cp -f source dest # NOT: cp source dest +mv -f source dest # NOT: mv source dest +rm -f file # NOT: rm file + +# For recursive operations +rm -rf directory # NOT: rm -r directory +cp -rf source dest # NOT: cp -r source dest +``` + +**Other commands that may prompt:** +- `scp` - use `-o BatchMode=yes` for non-interactive +- `ssh` - use `-o BatchMode=yes` to fail instead of prompting +- `apt-get` - use `-y` flag +- `brew` - use `HOMEBREW_NO_AUTO_UPDATE=1` env var + + +## Beads Issue Tracker + +This project uses **bd (beads)** for issue tracking. Run `bd prime` to see full workflow context and commands. + +### Quick Reference + +```bash +bd ready # Find available work +bd show <id> # View issue details +bd update <id> --claim # Claim work +bd close <id> # Complete work +``` + +### Rules + +- Use `bd` for ALL task tracking — do NOT use TodoWrite, TaskCreate, or markdown TODO lists +- Run `bd prime` for detailed command reference and session close protocol +- Use `bd remember` for persistent knowledge — do NOT use MEMORY.md files + +## Session Completion + +**When ending a work session**, you MUST complete ALL steps below. Work is NOT complete until `git push` succeeds. + +**MANDATORY WORKFLOW:** + +1. **File issues for remaining work** - Create issues for anything that needs follow-up +2. **Run quality gates** (if code changed) - Tests, linters, builds +3. **Update issue status** - Close finished work, update in-progress items +4. **PUSH TO REMOTE** - This is MANDATORY: + ```bash + git pull --rebase + bd dolt push + git push + git status # MUST show "up to date with origin" + ``` +5. **Clean up** - Clear stashes, prune remote branches +6. **Verify** - All changes committed AND pushed +7. 
**Hand off** - Provide context for next session + +**CRITICAL RULES:** +- Work is NOT complete until `git push` succeeds +- NEVER stop before pushing - that leaves work stranded locally +- NEVER say "ready to push when you are" - YOU must push +- If push fails, resolve and retry until it succeeds + diff --git a/CHANGELOG.md b/CHANGELOG.md index c65ca0d..4697dc0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,24 @@ All notable changes to noxa are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.4.0] — 2026-04-12 + +### Added +- **Gemini CLI provider**: new primary LLM provider that shells out to the `gemini` binary. Passes prompts via `-p` flag (injection-safe), requests `--output-format json`, and suppresses MCP server startup via a temp workdir with `{"mcpServers":{}}`. Concurrency limited to 6 parallel subprocess calls with a 60s deadline. +- **`--llm-provider` flag**: force a specific provider (`gemini`, `ollama`, `openai`, `anthropic`) per invocation. +- **`--llm-model` flag**: override the model name for the selected provider. +- **`--llm-base-url` flag**: override the base URL for Ollama or OpenAI-compatible endpoints. +- **MCP `noxa mcp` subcommand**: expose the MCP server via a dedicated CLI subcommand. +- **LLM benchmark report**: `docs/reports/llm-benchmark-2026-04-11.md` — timing and quality comparison of Gemini CLI vs qwen3.5:4b vs qwen3.5:9b across summarize, prompt extract, and schema extract tasks. + +### Changed +- **Provider chain order**: Gemini CLI → OpenAI → Ollama → Anthropic (Gemini is now the default primary). +- **Default Ollama model**: changed from `qwen3:8b` to `qwen3.5:9b` based on benchmark results showing better quality on schema extraction. +- **LLM timing moved to dispatch layer**: `LLM: Xs` line printed to stderr at the call site rather than inside individual providers. 
+- **Gemini startup optimization**: workspace settings override disables all MCP servers for subprocess calls, saving 10–60s of startup latency per call. + +--- + ## [0.3.11] — 2026-04-10 ### Added diff --git a/CLAUDE.md b/CLAUDE.md index 6e6ab01..27a244f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -50,7 +50,7 @@ Two binaries: `noxa` (CLI), `noxa-mcp` (MCP server). ### LLM Modules (`noxa-llm`) - Provider chain: Gemini CLI (primary) -> OpenAI -> Ollama -> Anthropic - Gemini CLI requires the `gemini` binary on PATH; `GEMINI_MODEL` env var controls model (default: `gemini-2.5-pro`) -- JSON schema extraction with jsonschema validation; parse failures retry once; schema mismatches fail immediately +- JSON schema extraction with jsonschema validation; retries once with a correction prompt on both parse failures and schema mismatches. - Prompt-based extraction, summarization ### PDF Modules (`noxa-pdf`) diff --git a/Cargo.lock b/Cargo.lock index f9ca781..16bbe05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1746,7 +1746,7 @@ dependencies = [ [[package]] name = "noxa-cli" -version = "0.3.11" +version = "0.4.0" dependencies = [ "clap", "dotenvy", @@ -1768,7 +1768,7 @@ dependencies = [ [[package]] name = "noxa-core" -version = "0.3.11" +version = "0.4.0" dependencies = [ "ego-tree", "once_cell", @@ -1786,7 +1786,7 @@ dependencies = [ [[package]] name = "noxa-fetch" -version = "0.3.11" +version = "0.4.0" dependencies = [ "bytes", "calamine", @@ -1808,7 +1808,7 @@ dependencies = [ [[package]] name = "noxa-llm" -version = "0.3.11" +version = "0.4.0" dependencies = [ "async-trait", "jsonschema", @@ -1822,7 +1822,7 @@ dependencies = [ [[package]] name = "noxa-mcp" -version = "0.3.11" +version = "0.4.0" dependencies = [ "dirs", "dotenvy", @@ -1843,7 +1843,7 @@ dependencies = [ [[package]] name = "noxa-pdf" -version = "0.3.11" +version = "0.4.0" dependencies = [ "pdf-extract", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index 81bfd4b..aea979f 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.3.11" +version = "0.4.0" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/jmagar/noxa" diff --git a/README.md b/README.md index fea03dc..c52eab3 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ X / Twitter Website Docs + Config guide

--- @@ -55,6 +56,8 @@ It extracts clean, structured content from any URL using Chrome-level TLS finger ## Get Started (30 seconds) +Need config details? See [`docs/config.md`](docs/config.md) for how `config.json` and `.env` work together. + ### For AI agents (Claude, Cursor, Windsurf, VS Code) ```bash @@ -78,9 +81,12 @@ Download from [GitHub Releases](https://github.com/jmagar/noxa/releases) for mac ```bash cargo install --git https://github.com/jmagar/noxa.git noxa-cli --bin noxa -cargo install --git https://github.com/jmagar/noxa.git noxa-mcp ``` +After installing the CLI, run the embedded MCP entrypoint with `noxa mcp`. +If you still need the standalone server binary for legacy setups, `noxa-mcp` +remains available as a separate package. + ### Docker ```bash @@ -331,7 +337,11 @@ noxa -H "X-Custom: value" -H "Authorization: Bearer token" https://example.com ### LLM-Powered Features -These require an LLM provider (Ollama local, or OpenAI/Anthropic API key). +These require an LLM provider. noxa tries Gemini CLI first (requires the `gemini` +binary on PATH and uses `GEMINI_MODEL`, default `gemini-2.5-pro`), then falls +back to OpenAI, Ollama local, and Anthropic. Structured JSON extraction is +automatically validated against your schema and retried once with a correction +prompt if the first attempt fails. ```bash # Summarize a page (default: 3 sentences) @@ -350,6 +360,7 @@ noxa --extract-json @schema.json https://example.com/product noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing # Use a specific LLM provider +noxa --llm-provider gemini --summarize https://example.com noxa --llm-provider ollama --summarize https://example.com noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..." 
https://example.com noxa --llm-provider anthropic --summarize https://example.com @@ -440,7 +451,8 @@ Or manual setup — add to your Claude Desktop config: { "mcpServers": { "noxa": { - "command": "~/.noxa/noxa-mcp" + "command": "noxa", + "args": ["mcp"] } } } @@ -580,7 +592,7 @@ noxa/ crates/ noxa-core Pure extraction engine. Zero network deps. WASM-safe. noxa-fetch HTTP client + TLS fingerprinting (wreq/BoringSSL). Crawler. Batch ops. - noxa-llm LLM provider chain (Ollama -> OpenAI -> Anthropic) + noxa-llm LLM provider chain (Gemini CLI -> OpenAI -> Ollama -> Anthropic) noxa-pdf PDF text extraction noxa-mcp MCP server (10 tools for AI agents) noxa CLI binary @@ -592,7 +604,10 @@ noxa/ ## Configuration -Non-secret defaults live in `config.json` in your working directory. Copy the example: +Non-secret defaults live in `config.json` in your working directory. The full behavior contract is documented in [`docs/config.md`](docs/config.md). +Set `output_dir` in `config.json` if you want results written to files instead of stdout. + +Copy the example: ```bash cp config.example.json config.json @@ -600,7 +615,11 @@ cp config.example.json config.json **Precedence:** CLI flags > `config.json` > built-in defaults -**Secrets and URLs** (API keys, proxy, webhook, LLM base URL) always go in `.env`, not `config.json`: +For `llm_provider` and `llm_model`, leaving the keys unset preserves the +Gemini -> OpenAI -> Ollama -> Anthropic fallback chain. Setting them in +`config.json` or on the CLI forces that specific provider/model. 
+ +**Secrets and URLs** always go in `.env`, not `config.json`: ```bash cp env.example .env @@ -613,18 +632,64 @@ NOXA_CONFIG=/path/to/other-config.json noxa https://example.com NOXA_CONFIG=/dev/null noxa https://example.com # bypass config entirely ``` -**Bool flag limitation:** flags like `--metadata`, `--only-main-content`, `--verbose` set to `true` in `config.json` cannot be overridden to `false` from the CLI for a single run (clap has no `--no-flag` variant). Use `NOXA_CONFIG=/dev/null` to bypass. +**Bool flag limitation:** flags like `--metadata`, `--only-main-content`, `--verbose`, and `--use-sitemap` set to `true` in `config.json` cannot be overridden to `false` from the CLI for a single run (clap has no `--no-flag` variant). Use `NOXA_CONFIG=/dev/null` to bypass. + +### Cloud configuration + +The `cloud` block in `config.json` allows you to configure the cloud provider settings. + +```json +{ + "cloud": { + "provider": "gcp", + "project": "my-gcp-project", + "zone": "us-central1-a", + "cluster": "my-cluster", + "service_account_key": "/path/to/key.json", + "disabled": false + } +} +``` + +These settings can also be controlled via command-line flags: + +- `--cloud-provider`: Cloud provider to use (e.g. 
"gcp", "aws") +- `--cloud-project`: Cloud project ID +- `--cloud-zone`: Cloud zone or region +- `--cloud-cluster`: Cloud cluster name +- `--cloud-service-account-key`: Path to cloud service account key file +- `--cloud-disabled`: Disable cloud features ### Environment variables | Variable | Description | |----------|-------------| | `NOXA_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) | -| `OLLAMA_HOST` | Ollama URL for local LLM features (default: `http://localhost:11434`) | -| `OPENAI_API_KEY` | OpenAI API key for LLM features | -| `ANTHROPIC_API_KEY` | Anthropic API key for LLM features | | `NOXA_PROXY` | Single proxy URL | | `NOXA_PROXY_FILE` | Path to proxy pool file | +| `NOXA_WEBHOOK_URL` | Webhook URL for notifications | +| `NOXA_LLM_BASE_URL` | LLM base URL for Ollama or OpenAI-compatible endpoints | +| `NOXA_LLM_PROVIDER` | Default LLM provider | +| `NOXA_LLM_MODEL` | Default LLM model | +| `OLLAMA_HEALTH_TIMEOUT_MS` | Ollama availability check timeout in milliseconds | +| `NOXA_CONFIG` | Path to `config.json` or `/dev/null` to bypass it | + +The `env.example` file covers the runtime noxa variables above. + +If you use `setup.sh` or the Docker Compose stack, they also rely on these local deployment variables: + +- `NOXA_PORT` +- `NOXA_HOST` +- `NOXA_AUTH_KEY` +- `NOXA_LOG` +- `OLLAMA_HOST` +- `OLLAMA_MODEL` + +LLM provider backends may also use these environment variables: + +- `OPENAI_API_KEY` +- `ANTHROPIC_API_KEY` +- `GEMINI_MODEL` --- diff --git a/SKILL.md b/SKILL.md deleted file mode 100644 index a68cd96..0000000 --- a/SKILL.md +++ /dev/null @@ -1,634 +0,0 @@ ---- -name: noxa -description: Web extraction engine with antibot bypass. Scrape, crawl, extract, summarize, search, map, diff, monitor, research, and analyze any URL — including Cloudflare-protected sites. Use when you need reliable web content, the built-in web_fetch fails, or you need structured data extraction from web pages. 
-homepage: https://noxa.io -user-invocable: true -metadata: {"openclaw":{"emoji":"🦀","requires":{"env":["NOXA_API_KEY"]},"primaryEnv":"NOXA_API_KEY","homepage":"https://noxa.io","install":[{"id":"npx","kind":"node","bins":["noxa-mcp"],"label":"npx create-noxa"}]}} ---- - -# noxa - -High-quality web extraction with automatic antibot bypass. Beats Firecrawl on extraction quality and handles Cloudflare, DataDome, and JS-rendered pages automatically. - -## When to use this skill - -- **Always** when you need to fetch web content and want reliable results -- When `web_fetch` returns empty/blocked content (403, Cloudflare challenges) -- When you need structured data extraction (pricing tables, product info) -- When you need to crawl an entire site or discover all URLs -- When you need LLM-optimized content (cleaner than raw markdown) -- When you need to summarize a page without reading the full content -- When you need to detect content changes between visits -- When you need brand identity analysis (colors, fonts, logos) -- When you need web search results with optional page scraping -- When you need deep multi-source research on a topic -- When you need AI-guided scraping to accomplish a goal on a page -- When you need to monitor a URL for changes over time - -## API base - -All requests go to `https://api.noxa.io/v1/`. - -Authentication: `Authorization: Bearer $NOXA_API_KEY` - -## Endpoints - -### 1. 
Scrape — extract content from a single URL - -```bash -curl -X POST https://api.noxa.io/v1/scrape \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "url": "https://example.com", - "formats": ["markdown"], - "only_main_content": true - }' -``` - -**Request fields:** - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `url` | string | required | URL to scrape | -| `formats` | string[] | `["markdown"]` | Output formats: `markdown`, `text`, `llm`, `json` | -| `include_selectors` | string[] | `[]` | CSS selectors to keep (e.g. `["article", ".content"]`) | -| `exclude_selectors` | string[] | `[]` | CSS selectors to remove (e.g. `["nav", "footer", ".ads"]`) | -| `only_main_content` | bool | `false` | Extract only the main article/content area | -| `no_cache` | bool | `false` | Skip cache, fetch fresh | -| `max_cache_age` | int | server default | Max acceptable cache age in seconds | - -**Response:** - -```json -{ - "url": "https://example.com", - "metadata": { - "title": "Example", - "description": "...", - "language": "en", - "word_count": 1234 - }, - "markdown": "# Page Title\n\nContent here...", - "cache": { "status": "miss" } -} -``` - -**Format options:** -- `markdown` — clean markdown, best for general use -- `text` — plain text without formatting -- `llm` — optimized for LLM consumption: includes page title, URL, and cleaned content with link references. Best for feeding to AI models. -- `json` — full extraction result with all metadata - -**When antibot bypass activates** (automatic, no extra config): -```json -{ - "antibot": { - "bypass": true, - "elapsed_ms": 3200 - } -} -``` - -### 2. Crawl — scrape an entire website - -Starts an async job. Poll for results. 
- -**Start crawl:** -```bash -curl -X POST https://api.noxa.io/v1/crawl \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "url": "https://docs.example.com", - "max_depth": 3, - "max_pages": 50, - "use_sitemap": true - }' -``` - -Response: `{ "job_id": "abc-123", "status": "running" }` - -**Poll status:** -```bash -curl https://api.noxa.io/v1/crawl/abc-123 \ - -H "Authorization: Bearer $NOXA_API_KEY" -``` - -Response when complete: -```json -{ - "job_id": "abc-123", - "status": "completed", - "total": 47, - "completed": 45, - "errors": 2, - "pages": [ - { - "url": "https://docs.example.com/intro", - "markdown": "# Introduction\n...", - "metadata": { "title": "Intro", "word_count": 500 } - } - ] -} -``` - -**Request fields:** - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `url` | string | required | Starting URL | -| `max_depth` | int | `3` | How many links deep to follow | -| `max_pages` | int | `100` | Maximum pages to crawl | -| `use_sitemap` | bool | `false` | Seed URLs from sitemap.xml | -| `formats` | string[] | `["markdown"]` | Output formats per page | -| `include_selectors` | string[] | `[]` | CSS selectors to keep | -| `exclude_selectors` | string[] | `[]` | CSS selectors to remove | -| `only_main_content` | bool | `false` | Main content only | - -### 3. Map — discover all URLs on a site - -Fast URL discovery without full content extraction. - -```bash -curl -X POST https://api.noxa.io/v1/map \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{"url": "https://example.com"}' -``` - -Response: -```json -{ - "url": "https://example.com", - "count": 142, - "urls": [ - "https://example.com/about", - "https://example.com/pricing", - "https://example.com/docs/intro" - ] -} -``` - -### 4. 
Batch — scrape multiple URLs in parallel - -```bash -curl -X POST https://api.noxa.io/v1/batch \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "urls": [ - "https://a.com", - "https://b.com", - "https://c.com" - ], - "formats": ["markdown"], - "concurrency": 5 - }' -``` - -Response: -```json -{ - "total": 3, - "completed": 3, - "errors": 0, - "results": [ - { "url": "https://a.com", "markdown": "...", "metadata": {} }, - { "url": "https://b.com", "markdown": "...", "metadata": {} }, - { "url": "https://c.com", "error": "timeout" } - ] -} -``` - -### 5. Extract — LLM-powered structured extraction - -Pull structured data from any page using a JSON schema or plain-text prompt. - -**With JSON schema:** -```bash -curl -X POST https://api.noxa.io/v1/extract \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "url": "https://example.com/pricing", - "schema": { - "type": "object", - "properties": { - "plans": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { "type": "string" }, - "price": { "type": "string" }, - "features": { "type": "array", "items": { "type": "string" } } - } - } - } - } - } - }' -``` - -**With prompt:** -```bash -curl -X POST https://api.noxa.io/v1/extract \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "url": "https://example.com/pricing", - "prompt": "Extract all pricing tiers with names, monthly prices, and key features" - }' -``` - -Response: -```json -{ - "url": "https://example.com/pricing", - "data": { - "plans": [ - { "name": "Starter", "price": "$49/mo", "features": ["10k pages", "Email support"] }, - { "name": "Pro", "price": "$99/mo", "features": ["100k pages", "Priority support", "API access"] } - ] - } -} -``` - -### 6. 
Summarize — get a quick summary of any page - -```bash -curl -X POST https://api.noxa.io/v1/summarize \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "url": "https://example.com/long-article", - "max_sentences": 3 - }' -``` - -Response: -```json -{ - "url": "https://example.com/long-article", - "summary": "The article discusses... Key findings include... The author concludes that..." -} -``` - -### 7. Diff — detect content changes - -Compare current page content against a previous snapshot. - -```bash -curl -X POST https://api.noxa.io/v1/diff \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "url": "https://example.com", - "previous": { - "markdown": "# Old content...", - "metadata": { "title": "Old Title" } - } - }' -``` - -Response: -```json -{ - "url": "https://example.com", - "status": "changed", - "diff": "--- previous\n+++ current\n@@ -1 +1 @@\n-# Old content\n+# New content", - "metadata_changes": [ - { "field": "title", "old": "Old Title", "new": "New Title" } - ] -} -``` - -### 8. Brand — extract brand identity - -Analyze a website's visual identity: colors, fonts, logo. - -```bash -curl -X POST https://api.noxa.io/v1/brand \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{"url": "https://example.com"}' -``` - -Response: -```json -{ - "url": "https://example.com", - "brand": { - "colors": [ - { "hex": "#FF6B35", "usage": "primary" }, - { "hex": "#1A1A2E", "usage": "background" } - ], - "fonts": ["Inter", "JetBrains Mono"], - "logo_url": "https://example.com/logo.svg", - "favicon_url": "https://example.com/favicon.ico" - } -} -``` - -### 9. Search — web search with optional scraping - -Search the web and optionally scrape each result page. 
- -```bash -curl -X POST https://api.noxa.io/v1/search \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "query": "best rust web frameworks 2026", - "num_results": 5, - "scrape": true, - "formats": ["markdown"] - }' -``` - -**Request fields:** - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `query` | string | required | Search query | -| `num_results` | int | `10` | Number of search results to return | -| `scrape` | bool | `false` | Also scrape each result page for full content | -| `formats` | string[] | `["markdown"]` | Output formats when `scrape` is true | -| `country` | string | none | Country code for localized results (e.g. `"us"`, `"de"`) | -| `lang` | string | none | Language code for results (e.g. `"en"`, `"fr"`) | - -**Response:** - -```json -{ - "query": "best rust web frameworks 2026", - "results": [ - { - "title": "Top Rust Web Frameworks in 2026", - "url": "https://blog.example.com/rust-frameworks", - "snippet": "A comprehensive comparison of Axum, Actix, and Rocket...", - "position": 1, - "markdown": "# Top Rust Web Frameworks\n\n..." - }, - { - "title": "Choosing a Rust Backend Framework", - "url": "https://dev.to/rust-backends", - "snippet": "When starting a new Rust web project...", - "position": 2, - "markdown": "# Choosing a Rust Backend\n\n..." - } - ] -} -``` - -The `markdown` field on each result is only present when `scrape: true`. Without it, you get titles, URLs, snippets, and positions only. - -### 10. Research — deep multi-source research - -Starts an async research job that searches, scrapes, and synthesizes information across multiple sources. Poll for results. 
- -**Start research:** -```bash -curl -X POST https://api.noxa.io/v1/research \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "query": "How does Cloudflare Turnstile work and what are its known bypass methods?", - "max_iterations": 5, - "max_sources": 10, - "topic": "security", - "deep": true - }' -``` - -**Request fields:** - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `query` | string | required | Research question or topic | -| `max_iterations` | int | server default | Maximum research iterations (search-read-analyze cycles) | -| `max_sources` | int | server default | Maximum number of sources to consult | -| `topic` | string | none | Topic hint to guide search strategy (e.g. `"security"`, `"finance"`, `"engineering"`) | -| `deep` | bool | `false` | Enable deep research mode for more thorough analysis (costs 10 credits instead of 1) | - -Response: `{ "id": "res-abc-123", "status": "running" }` - -**Poll results:** -```bash -curl https://api.noxa.io/v1/research/res-abc-123 \ - -H "Authorization: Bearer $NOXA_API_KEY" -``` - -Response when complete: -```json -{ - "id": "res-abc-123", - "status": "completed", - "query": "How does Cloudflare Turnstile work and what are its known bypass methods?", - "report": "# Cloudflare Turnstile Analysis\n\n## Overview\nCloudflare Turnstile is a CAPTCHA replacement that...\n\n## How It Works\n...\n\n## Known Bypass Methods\n...", - "sources": [ - { "url": "https://developers.cloudflare.com/turnstile/", "title": "Turnstile Documentation" }, - { "url": "https://blog.cloudflare.com/turnstile-ga/", "title": "Turnstile GA Announcement" } - ], - "findings": [ - "Turnstile uses browser environment signals and proof-of-work challenges", - "Managed mode auto-selects challenge difficulty based on visitor risk score", - "Known bypass approaches include instrumented browser automation" - ], - "iterations": 5, - "elapsed_ms": 34200 -} -``` - -**Status 
values:** `running`, `completed`, `failed` - -### 11. Agent Scrape — AI-guided scraping - -Use an AI agent to navigate and interact with a page to accomplish a specific goal. The agent can click, scroll, fill forms, and extract data across multiple steps. - -```bash -curl -X POST https://api.noxa.io/v1/agent-scrape \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "url": "https://example.com/products", - "goal": "Find the cheapest laptop with at least 16GB RAM and extract its full specs", - "max_steps": 10 - }' -``` - -**Request fields:** - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `url` | string | required | Starting URL | -| `goal` | string | required | What the agent should accomplish | -| `max_steps` | int | server default | Maximum number of actions the agent can take | - -**Response:** - -```json -{ - "url": "https://example.com/products", - "result": "The cheapest laptop with 16GB+ RAM is the ThinkPad E14 Gen 6 at $649. Specs: AMD Ryzen 5 7535U, 16GB DDR4, 512GB SSD, 14\" FHD IPS display, 57Wh battery.", - "steps": [ - { "action": "navigate", "detail": "Loaded products page" }, - { "action": "click", "detail": "Clicked 'Laptops' category filter" }, - { "action": "click", "detail": "Applied '16GB+' RAM filter" }, - { "action": "click", "detail": "Sorted by price: low to high" }, - { "action": "extract", "detail": "Extracted specs from first matching product" } - ] -} -``` - -### 12. Watch — monitor a URL for changes - -Create persistent monitors that check a URL on a schedule and notify via webhook when content changes. 
- -**Create a monitor:** -```bash -curl -X POST https://api.noxa.io/v1/watch \ - -H "Authorization: Bearer $NOXA_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "url": "https://example.com/pricing", - "interval": "0 */6 * * *", - "webhook_url": "https://hooks.example.com/pricing-changed", - "formats": ["markdown"] - }' -``` - -**Request fields:** - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `url` | string | required | URL to monitor | -| `interval` | string | required | Check frequency as cron expression or seconds (e.g. `"0 */6 * * *"` or `"3600"`) | -| `webhook_url` | string | none | URL to POST when changes are detected | -| `formats` | string[] | `["markdown"]` | Output formats for snapshots | - -Response: -```json -{ - "id": "watch-abc-123", - "url": "https://example.com/pricing", - "interval": "0 */6 * * *", - "webhook_url": "https://hooks.example.com/pricing-changed", - "formats": ["markdown"], - "created_at": "2026-03-20T10:00:00Z", - "last_check": null, - "status": "active" -} -``` - -**List all monitors:** -```bash -curl https://api.noxa.io/v1/watch \ - -H "Authorization: Bearer $NOXA_API_KEY" -``` - -Response: -```json -{ - "monitors": [ - { - "id": "watch-abc-123", - "url": "https://example.com/pricing", - "interval": "0 */6 * * *", - "status": "active", - "last_check": "2026-03-20T16:00:00Z", - "checks": 4 - } - ] -} -``` - -**Get a monitor with snapshots:** -```bash -curl https://api.noxa.io/v1/watch/watch-abc-123 \ - -H "Authorization: Bearer $NOXA_API_KEY" -``` - -Response: -```json -{ - "id": "watch-abc-123", - "url": "https://example.com/pricing", - "interval": "0 */6 * * *", - "status": "active", - "snapshots": [ - { - "checked_at": "2026-03-20T16:00:00Z", - "status": "changed", - "diff": "--- previous\n+++ current\n@@ -5 +5 @@\n-Pro: $99/mo\n+Pro: $119/mo" - }, - { - "checked_at": "2026-03-20T10:00:00Z", - "status": "baseline" - } - ] -} -``` - -**Trigger an immediate check:** -```bash 
-curl -X POST https://api.noxa.io/v1/watch/watch-abc-123/check \ - -H "Authorization: Bearer $NOXA_API_KEY" -``` - -**Delete a monitor:** -```bash -curl -X DELETE https://api.noxa.io/v1/watch/watch-abc-123 \ - -H "Authorization: Bearer $NOXA_API_KEY" -``` - -## Choosing the right format - -| Goal | Format | Why | -|------|--------|-----| -| Read and understand a page | `markdown` | Clean structure, headings, links preserved | -| Feed content to an AI model | `llm` | Optimized: includes title + URL header, clean link refs | -| Search or index content | `text` | Plain text, no formatting noise | -| Programmatic analysis | `json` | Full metadata, structured data, DOM statistics | - -## Tips - -- **Use `llm` format** when passing content to yourself or another AI — it's specifically optimized for LLM consumption with better context framing. -- **Use `only_main_content: true`** to skip navigation, sidebars, and footers. Reduces noise significantly. -- **Use `include_selectors`/`exclude_selectors`** for fine-grained control when `only_main_content` isn't enough. -- **Batch over individual scrapes** when fetching multiple URLs — it's faster and more efficient. -- **Use `map` before `crawl`** to discover the site structure first, then crawl specific sections. -- **Use `extract` with a JSON schema** for reliable structured output (e.g., pricing tables, product specs, contact info). -- **Antibot bypass is automatic** — no extra configuration needed. Works on Cloudflare, DataDome, AWS WAF, and JS-rendered SPAs. -- **Use `search` with `scrape: true`** to get full page content for each search result in one call instead of searching then scraping separately. -- **Use `research` for complex questions** that need multiple sources — it handles the search-read-synthesize loop automatically. Enable `deep: true` for thorough analysis. -- **Use `agent-scrape` for interactive pages** where data is behind filters, pagination, or form submissions that a simple scrape cannot reach. 
-- **Use `watch` for ongoing monitoring** — set up a cron schedule and a webhook to get notified when a page changes without polling manually. - -## Smart Fetch Architecture - -The noxa MCP server uses a **local-first** approach: - -1. **Local fetch** — fast, free, no API credits used (~80% of sites) -2. **Cloud API fallback** — automatic when bot protection or JS rendering is detected - -This means: -- Most scrapes cost zero credits (local extraction) -- Cloudflare, DataDome, AWS WAF sites automatically fall back to the cloud API -- JS-rendered SPAs (React, Next.js, Vue) also fall back automatically -- Set `NOXA_API_KEY` to enable cloud fallback - -## vs web_fetch - -| | noxa | web_fetch | -|---|---------|-----------| -| Cloudflare bypass | Automatic (cloud fallback) | Fails (403) | -| JS-rendered pages | Automatic fallback | Readability only | -| Output quality | 20-step optimization pipeline | Basic HTML parsing | -| Structured extraction | LLM-powered, schema-based | None | -| Crawling | Full site crawl with sitemap | Single page only | -| Caching | Built-in, configurable TTL | Per-session | -| Rate limiting | Managed server-side | Client responsibility | - -Use `web_fetch` for simple, fast lookups. Use noxa when you need reliability, quality, or advanced features. diff --git a/config.example.json b/config.example.json index db863eb..10f200a 100644 --- a/config.example.json +++ b/config.example.json @@ -1,12 +1,16 @@ { + "$schema": "./config.schema.json", "_doc": [ "Copy to config.json and remove fields you don't need.", "Secrets (api_key, proxy, webhook, llm_base_url) go in .env — NOT here.", "BOOL FLAG LIMITATION: once set to true here, cannot be overridden to false", "from the CLI for a single run (no --no-flag support). Use NOXA_CONFIG=/dev/null", "on the command line to bypass this config entirely.", + "LLM provider/model are optional overrides. 
Leave them unset to keep the", + "Gemini -> OpenAI -> Ollama -> Anthropic fallback chain intact.", "on_change is intentionally absent — it must remain a CLI-only flag.", - "Unknown fields are silently ignored, so this file works across noxa versions." + "Unknown fields are silently ignored, so this file works across noxa versions.", + "Set output_dir to write results to files instead of stdout." ], "format": "markdown", @@ -27,8 +31,5 @@ "path_prefix": null, "include_paths": [], "exclude_paths": ["/changelog/*", "/blog/*", "/releases/*"], - "use_sitemap": false, - - "llm_provider": "gemini", - "llm_model": "gemini-2.5-pro" + "use_sitemap": false } diff --git a/config.schema.json b/config.schema.json new file mode 100644 index 0000000..f16d1c3 --- /dev/null +++ b/config.schema.json @@ -0,0 +1,140 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "./config.schema.json", + "title": "Noxa config.json", + "description": "Optional non-secret defaults for the noxa CLI. Unknown fields are ignored by the binary, and secrets/URLs belong in .env.", + "type": "object", + "additionalProperties": true, + "properties": { + "$schema": { + "type": "string", + "description": "Editor hint pointing at this schema." + }, + "_doc": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Human-readable notes. Ignored by noxa." + }, + "format": { + "type": "string", + "enum": ["markdown", "json", "text", "llm", "html"], + "default": "markdown", + "description": "Default output format." + }, + "browser": { + "type": "string", + "enum": ["chrome", "firefox", "random"], + "default": "chrome", + "description": "TLS/browser fingerprint profile." + }, + "timeout": { + "type": "integer", + "minimum": 0, + "default": 30, + "description": "Request timeout in seconds." + }, + "pdf_mode": { + "type": "string", + "enum": ["auto", "fast"], + "default": "auto", + "description": "How PDFs are handled." 
+ }, + "metadata": { + "type": "boolean", + "default": false, + "description": "Include metadata in output." + }, + "verbose": { + "type": "boolean", + "default": false, + "description": "Enable verbose logging." + }, + "output_dir": { + "type": ["string", "null"], + "default": null, + "description": "Write outputs to files in this directory instead of stdout." + }, + "only_main_content": { + "type": "boolean", + "default": false, + "description": "Strip nav/sidebar/footer noise automatically." + }, + "include_selectors": { + "type": "array", + "items": { + "type": "string" + }, + "default": [], + "description": "CSS selectors to force-include." + }, + "exclude_selectors": { + "type": "array", + "items": { + "type": "string" + }, + "default": [], + "description": "CSS selectors to exclude." + }, + "depth": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "Maximum crawl depth." + }, + "max_pages": { + "type": "integer", + "minimum": 0, + "default": 20, + "description": "Maximum number of pages to crawl." + }, + "concurrency": { + "type": "integer", + "minimum": 0, + "default": 5, + "description": "Maximum concurrent requests." + }, + "delay": { + "type": "integer", + "minimum": 0, + "default": 100, + "description": "Delay between requests in milliseconds." + }, + "path_prefix": { + "type": ["string", "null"], + "default": null, + "description": "Only crawl paths with this prefix." + }, + "include_paths": { + "type": "array", + "items": { + "type": "string" + }, + "default": [], + "description": "Glob patterns for crawl paths to include." + }, + "exclude_paths": { + "type": "array", + "items": { + "type": "string" + }, + "default": [], + "description": "Glob patterns for crawl paths to exclude." + }, + "use_sitemap": { + "type": "boolean", + "default": false, + "description": "Seed crawl traversal from sitemap discovery." 
+    },
+    "llm_provider": {
+      "type": "string",
+      "enum": ["gemini", "ollama", "openai", "anthropic"],
+      "description": "Optional LLM provider name."
+    },
+    "llm_model": {
+      "type": "string",
+      "description": "Optional LLM model override."
+    }
+  }
+}
diff --git a/crates/noxa-cli/src/config.rs b/crates/noxa-cli/src/config.rs
index 894716f..19eef66 100644
--- a/crates/noxa-cli/src/config.rs
+++ b/crates/noxa-cli/src/config.rs
@@ -1,5 +1,5 @@
 use serde::Deserialize;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 
 use crate::{Browser, OutputFormat, PdfModeArg};
 
@@ -16,7 +16,8 @@ use crate::{Browser, OutputFormat, PdfModeArg};
 /// BOOL FLAG LIMITATION:
 /// only_main_content, metadata, verbose, use_sitemap set to true here
 /// cannot be overridden to false from the CLI for a single run (no --no-flag
-/// variant in clap). Edit config.json or use NOXA_CONFIG=/dev/null to bypass.
+/// variant in clap). Edit config.json or use NOXA_CONFIG=/dev/null (or an
+/// empty file) to bypass.
 #[derive(Debug, Default, Deserialize)]
 pub struct NoxaConfig {
     // Output
@@ -47,6 +48,22 @@ pub struct NoxaConfig {
     // LLM (non-secret: provider name and model only; base URL stays in .env)
     pub llm_provider: Option<String>,
     pub llm_model: Option<String>,
+    pub output_dir: Option<PathBuf>,
+
+    #[serde(default)]
+    pub cloud: Option<CloudConfig>,
+}
+
+/// Optional `cloud` object of config.json. Every field may be omitted; an
+/// absent field deserializes to `None`.
+#[derive(Debug, Default, Deserialize, Clone)]
+pub struct CloudConfig {
+    pub provider: Option<String>,
+    pub project: Option<String>,
+    pub zone: Option<String>,
+    pub cluster: Option<String>,
+    pub service_account_key: Option<String>,
+    pub disabled: Option<bool>,
 }
 
 impl NoxaConfig {
@@ -65,7 +80,8 @@ impl NoxaConfig {
         let path = Path::new(&path_str);
         if !path.exists() {
             if was_explicit {
-                let display_name = path.file_name()
+                let display_name = path
+                    .file_name()
                     .and_then(|n| n.to_str())
                     .unwrap_or(&path_str);
                 eprintln!("error: config file not found: {display_name}");
@@ -74,7 +90,24 @@ impl NoxaConfig {
             return Self::default();
         }
 
-        let display_name = path.file_name()
+        let content = match std::fs::read_to_string(path) { +
Ok(s) => s,
+            Err(e) => {
+                let display_name = path
+                    .file_name()
+                    .and_then(|n| n.to_str())
+                    .unwrap_or(&path_str);
+                eprintln!("error: cannot read config file {display_name}: {e}");
+                std::process::exit(1);
+            }
+        };
+
+        if path_str == "/dev/null" || content.trim().is_empty() {
+            return Self::default();
+        }
+
+        let display_name = path
+            .file_name()
             .and_then(|n| n.to_str())
             .unwrap_or(&path_str);
         eprintln!(
@@ -83,14 +116,6 @@ impl NoxaConfig {
         );
         tracing::debug!("config path: {}", path.display());
 
-        let content = match std::fs::read_to_string(path) {
-            Ok(s) => s,
-            Err(e) => {
-                eprintln!("error: cannot read config file {display_name}: {e}");
-                std::process::exit(1);
-            }
-        };
-
         match serde_json::from_str(&content) {
             Ok(cfg) => cfg,
             Err(e) => {
@@ -142,20 +167,34 @@ pub struct ResolvedConfig {
     // LLM
     pub llm_provider: Option<String>,
     pub llm_model: Option<String>,
+    pub output_dir: Option<PathBuf>,
+
+    // Cloud
+    pub cloud: Option<CloudConfig>,
 }
 
 use clap::parser::ValueSource;
 
+/// Resolve an optional string setting. A value given explicitly on the
+/// command line always wins; otherwise the config-file value is used, and
+/// only then a non-empty environment value.
+fn resolve_optional_setting(
+    cli_explicit: bool,
+    cli_value: Option<String>,
+    cfg_value: Option<String>,
+    env_value: Option<String>,
+) -> Option<String> {
+    if cli_explicit {
+        cli_value
+    } else {
+        // An empty environment variable counts as unset.
+        cfg_value.or_else(|| env_value.filter(|s| !s.is_empty()))
+    }
+}
+
 /// Merge CLI flags (detected via ValueSource), config file, and hard defaults
 /// into a single ResolvedConfig. CLI explicit values always win.
-pub fn resolve(
-    cli: &crate::Cli,
-    matches: &clap::ArgMatches,
-    cfg: &NoxaConfig,
-) -> ResolvedConfig {
-    let explicit = |name: &str| {
-        matches.value_source(name) == Some(ValueSource::CommandLine)
-    };
+pub fn resolve(cli: &crate::Cli, matches: &clap::ArgMatches, cfg: &NoxaConfig) -> ResolvedConfig {
+    let explicit = |name: &str| matches.value_source(name) == Some(ValueSource::CommandLine);
 
     ResolvedConfig {
         format: if explicit("format") {
@@ -240,15 +276,63 @@ pub fn resolve(
         verbose: cli.verbose || cfg.verbose.unwrap_or(false),
         use_sitemap: cli.sitemap || cfg.use_sitemap.unwrap_or(false),
         raw_html: cli.raw_html,
-        llm_provider: if cli.llm_provider.is_some() {
-            cli.llm_provider.clone()
+        llm_provider: resolve_optional_setting(
+            explicit("llm_provider"),
+            cli.llm_provider.clone(),
+            cfg.llm_provider.clone(),
+            std::env::var("NOXA_LLM_PROVIDER").ok(),
+        ),
+        llm_model: resolve_optional_setting(
+            explicit("llm_model"),
+            cli.llm_model.clone(),
+            cfg.llm_model.clone(),
+            std::env::var("NOXA_LLM_MODEL").ok(),
+        ),
+        output_dir: if explicit("output_dir") {
+            cli.output_dir.clone()
         } else {
-            cfg.llm_provider.clone()
+            cfg.output_dir.clone()
         },
-        llm_model: if cli.llm_model.is_some() {
-            cli.llm_model.clone()
-        } else {
-            cfg.llm_model.clone()
-        },
+        cloud: {
+            // Merge field by field so one --cloud-* flag does not discard the
+            // rest of the config file's `cloud` object. clap reports values that
+            // came from NOXA_CLOUD_* as ValueSource::EnvVariable, so a CLI value
+            // that is not `explicit` is the environment fallback. Precedence is
+            // CLI flag > config file > environment, matching llm_provider above.
+            let file = cfg.cloud.clone().unwrap_or_default();
+            let pick = |flag: &str, cli_value: &Option<String>, cfg_value: Option<String>| {
+                resolve_optional_setting(
+                    explicit(flag),
+                    cli_value.clone(),
+                    cfg_value,
+                    cli_value.clone(),
+                )
+            };
+            let merged = CloudConfig {
+                provider: pick("cloud_provider", &cli.cloud_provider, file.provider),
+                project: pick("cloud_project", &cli.cloud_project, file.project),
+                zone: pick("cloud_zone", &cli.cloud_zone, file.zone),
+                cluster: pick("cloud_cluster", &cli.cloud_cluster, file.cluster),
+                service_account_key: pick(
+                    "cloud_service_account_key",
+                    &cli.cloud_service_account_key,
+                    file.service_account_key,
+                ),
+                // A bare switch can only turn the flag on; when it is absent the
+                // config value must survive instead of being reset to false.
+                disabled: if cli.cloud_disabled {
+                    Some(true)
+                } else {
+                    file.disabled
+                },
+            };
+            let any_set = merged.provider.is_some()
+                || merged.project.is_some()
+                || merged.zone.is_some()
+                || merged.cluster.is_some()
+                || merged.service_account_key.is_some()
+                || merged.disabled.is_some();
+            if any_set || cfg.cloud.is_some() { Some(merged) } else { None }
+        },
     }
 }
@@ -256,6 +317,7 @@ pub fn resolve(
 #[cfg(test)]
 mod tests {
     use super::*;
+    use clap::{CommandFactory, Parser};
 
     #[test]
     fn test_noxa_config_deserialize_full() {
@@ -283,7 +345,10 @@ mod tests {
         let cfg: NoxaConfig =
serde_json::from_str(json).unwrap(); assert!(matches!(cfg.format, Some(crate::OutputFormat::Llm))); assert_eq!(cfg.depth, Some(3)); - assert_eq!(cfg.exclude_paths, Some(vec!["/changelog/*".to_string(), "/blog/*".to_string()])); + assert_eq!( + cfg.exclude_paths, + Some(vec!["/changelog/*".to_string(), "/blog/*".to_string()]) + ); assert!(matches!(cfg.pdf_mode, Some(crate::PdfModeArg::Fast))); } @@ -297,10 +362,39 @@ mod tests { #[test] fn test_noxa_config_unknown_fields_ignored() { // Unknown fields must NOT cause a parse failure - let cfg: NoxaConfig = serde_json::from_str(r#"{"depth": 2, "future_field": true}"#).unwrap(); + let cfg: NoxaConfig = + serde_json::from_str(r#"{"depth": 2, "future_field": true}"#).unwrap(); assert_eq!(cfg.depth, Some(2)); } + #[test] + fn test_noxa_config_output_dir_deserialize() { + let cfg: NoxaConfig = serde_json::from_str(r#"{"output_dir":"out"}"#).unwrap(); + assert_eq!(cfg.output_dir, Some(PathBuf::from("out"))); + } + + #[test] + fn test_resolve_uses_config_output_dir() { + let cli = crate::Cli::parse_from(["noxa"]); + let matches = crate::Cli::command().get_matches_from(["noxa"]); + let cfg: NoxaConfig = serde_json::from_str(r#"{"output_dir":"out"}"#).unwrap(); + let resolved = resolve(&cli, &matches, &cfg); + assert_eq!(resolved.output_dir, Some(PathBuf::from("out"))); + } + + #[test] + fn test_resolve_prefers_config_llm_provider_over_env_default() { + let resolved = + resolve_optional_setting(false, None, Some("gemini".into()), Some("ollama".into())); + assert_eq!(resolved, Some("gemini".into())); + } + + #[test] + fn test_resolve_uses_env_llm_provider_when_config_missing() { + let resolved = resolve_optional_setting(false, None, None, Some("ollama".into())); + assert_eq!(resolved, Some("ollama".into())); + } + #[test] fn test_load_implicit_missing_file_returns_default() { // When no explicit path and ./config.json doesn't exist, silently return default. 
@@ -312,4 +406,57 @@ mod tests { let cfg = NoxaConfig::load(None); assert!(cfg.format.is_none()); } + + #[test] + fn test_load_dev_null_returns_default() { + let cfg = NoxaConfig::load(Some("/dev/null")); + assert!(cfg.format.is_none()); + assert!(cfg.llm_provider.is_none()); + } + + #[test] + fn test_load_whitespace_file_returns_default() { + let mut path = std::env::temp_dir(); + let suffix = format!( + "noxa-config-{}-{}.json", + std::process::id(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + ); + path.push(suffix); + std::fs::write(&path, " \n\t ").unwrap(); + + let cfg = NoxaConfig::load(Some(path.to_str().unwrap())); + assert!(cfg.format.is_none()); + assert!(cfg.llm_model.is_none()); + + let _ = std::fs::remove_file(&path); + } + + #[test] + fn test_noxa_config_cloud_fields() { + let json = r#"{ + "cloud": { + "provider": "gcp", + "project": "my-gcp-project", + "zone": "us-central1-a", + "cluster": "my-cluster", + "service_account_key": "/path/to/key.json", + "disabled": false + } + }"#; + let cfg: NoxaConfig = serde_json::from_str(json).unwrap(); + let cloud = cfg.cloud.unwrap(); + assert_eq!(cloud.provider, Some("gcp".to_string())); + assert_eq!(cloud.project, Some("my-gcp-project".to_string())); + assert_eq!(cloud.zone, Some("us-central1-a".to_string())); + assert_eq!(cloud.cluster, Some("my-cluster".to_string())); + assert_eq!( + cloud.service_account_key, + Some("/path/to/key.json".to_string()) + ); + assert_eq!(cloud.disabled, Some(false)); + } } diff --git a/crates/noxa-cli/src/main.rs b/crates/noxa-cli/src/main.rs index 7144c24..8940af2 100644 --- a/crates/noxa-cli/src/main.rs +++ b/crates/noxa-cli/src/main.rs @@ -255,11 +255,11 @@ struct Cli { summarize: Option, /// Force a specific LLM provider (gemini, ollama, openai, anthropic) - #[arg(long, env = "NOXA_LLM_PROVIDER")] + #[arg(long)] llm_provider: Option, /// Override the LLM model name - #[arg(long, env = "NOXA_LLM_MODEL")] + #[arg(long)] 
llm_model: Option, /// Override the LLM base URL (Ollama or OpenAI-compatible) @@ -275,6 +275,30 @@ struct Cli { #[arg(long)] cloud: bool, + /// Cloud provider to use (e.g. "gcp", "aws") + #[arg(long, env = "NOXA_CLOUD_PROVIDER")] + cloud_provider: Option, + + /// Cloud project ID + #[arg(long, env = "NOXA_CLOUD_PROJECT")] + cloud_project: Option, + + /// Cloud zone or region + #[arg(long, env = "NOXA_CLOUD_ZONE")] + cloud_zone: Option, + + /// Cloud cluster name + #[arg(long, env = "NOXA_CLOUD_CLUSTER")] + cloud_cluster: Option, + + /// Path to cloud service account key file + #[arg(long, env = "NOXA_CLOUD_SERVICE_ACCOUNT_KEY")] + cloud_service_account_key: Option, + + /// Disable cloud features + #[arg(long)] + cloud_disabled: bool, + /// Run deep research on a topic via the cloud API. Requires --api-key. /// Saves full result (report + sources + findings) to a JSON file. #[arg(long)] @@ -571,6 +595,103 @@ fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata } } +fn file_extension_for_format(format: &OutputFormat) -> &'static str { + match format { + OutputFormat::Markdown | OutputFormat::Llm => "md", + OutputFormat::Json => "json", + OutputFormat::Text => "txt", + OutputFormat::Html => "html", + } +} + +fn format_cloud_output(resp: &serde_json::Value, format: &OutputFormat) -> String { + match format { + OutputFormat::Json => serde_json::to_string_pretty(resp).expect("serialization failed"), + OutputFormat::Markdown => resp + .get("content") + .and_then(|c| c.get("markdown")) + .and_then(|m| m.as_str()) + .or_else(|| resp.get("markdown").and_then(|m| m.as_str())) + .map(str::to_string) + .unwrap_or_else(|| serde_json::to_string_pretty(resp).expect("serialization failed")), + OutputFormat::Text => resp + .get("content") + .and_then(|c| c.get("plain_text")) + .and_then(|t| t.as_str()) + .map(str::to_string) + .unwrap_or_else(|| format_cloud_output(resp, &OutputFormat::Markdown)), + OutputFormat::Llm => resp + .get("content") + 
.and_then(|c| c.get("llm_text")) + .and_then(|t| t.as_str()) + .map(str::to_string) + .unwrap_or_else(|| format_cloud_output(resp, &OutputFormat::Markdown)), + OutputFormat::Html => resp + .get("content") + .and_then(|c| c.get("raw_html")) + .and_then(|h| h.as_str()) + .map(str::to_string) + .unwrap_or_else(|| format_cloud_output(resp, &OutputFormat::Markdown)), + } +} + +fn format_diff_output(diff: &ContentDiff, format: &OutputFormat) -> String { + match format { + OutputFormat::Json => serde_json::to_string_pretty(diff).expect("serialization failed"), + _ => { + let mut out = String::new(); + out.push_str(&format!("Status: {:?}\n", diff.status)); + out.push_str(&format!("Word count delta: {:+}\n", diff.word_count_delta)); + + if !diff.metadata_changes.is_empty() { + out.push_str("\nMetadata changes:\n"); + for change in &diff.metadata_changes { + out.push_str(&format!( + " {}: {} -> {}\n", + change.field, + change.old.as_deref().unwrap_or("(none)"), + change.new.as_deref().unwrap_or("(none)"), + )); + } + } + + if !diff.links_added.is_empty() { + out.push_str("\nLinks added:\n"); + for link in &diff.links_added { + out.push_str(&format!(" + {} ({})\n", link.href, link.text)); + } + } + + if !diff.links_removed.is_empty() { + out.push_str("\nLinks removed:\n"); + for link in &diff.links_removed { + out.push_str(&format!(" - {} ({})\n", link.href, link.text)); + } + } + + if let Some(ref text_diff) = diff.text_diff { + out.push_str(&format!("\n{text_diff}\n")); + } + + out + } + } +} + +fn format_map_output(entries: &[SitemapEntry], format: &OutputFormat) -> String { + match format { + OutputFormat::Json => serde_json::to_string_pretty(entries).expect("serialization failed"), + _ => { + let mut out = String::new(); + for entry in entries { + out.push_str(&entry.url); + out.push('\n'); + } + out + } + } +} + /// Collect all URLs from positional args + --urls-file, normalizing bare domains. /// /// Returns `(url, optional_custom_filename)` pairs. 
Custom filenames come from @@ -1295,7 +1416,7 @@ async fn run_crawl(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), S } } - if let Some(ref dir) = cli.output_dir { + if let Some(ref dir) = resolved.output_dir { let mut saved = 0usize; for page in &result.pages { if let Some(ref extraction) = page.extraction { @@ -1364,7 +1485,20 @@ async fn run_map(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), Str eprintln!("discovered {} URLs", entries.len()); } - print_map_output(&entries, &resolved.format); + if let Some(ref dir) = resolved.output_dir { + let content = format_map_output(&entries, &resolved.format); + let filename = format!( + "sitemap.{}", + if matches!(resolved.format, OutputFormat::Json) { + "json" + } else { + "txt" + } + ); + write_to_file(dir, &filename, &content)?; + } else { + print_map_output(&entries, &resolved.format); + } Ok(()) } @@ -1405,7 +1539,7 @@ async fn run_batch( .filter_map(|(url, name)| name.as_deref().map(|n| (url.as_str(), n))) .collect(); - if let Some(ref dir) = cli.output_dir { + if let Some(ref dir) = resolved.output_dir { let mut saved = 0usize; for r in &results { if let Ok(ref extraction) = r.result { @@ -1749,6 +1883,20 @@ async fn run_watch_multi( eprintln!(" -> {url} (word delta: {delta:+})"); } + if let Some(ref dir) = resolved.output_dir { + let payload = serde_json::json!({ + "event": "watch_changes", + "check_number": check_number, + "total_urls": urls.len(), + "changed": changed.len(), + "same": same_count, + "changes": changed, + }); + let filename = format!("watch-{}.json", ts.replace(':', "-")); + let content = serde_json::to_string_pretty(&payload).unwrap_or_default(); + write_to_file(dir, &filename, &content)?; + } + // Fire --on-change once with all changes if let Some(ref cmd) = cli.on_change { let payload = serde_json::json!({ @@ -1812,7 +1960,20 @@ async fn run_diff( let new_result = fetch_and_extract(cli, resolved).await?.into_extraction()?; let diff = noxa_core::diff::diff(&old, 
&new_result); - print_diff_output(&diff, &resolved.format); + if let Some(ref dir) = resolved.output_dir { + let content = format_diff_output(&diff, &resolved.format); + let filename = format!( + "diff.{}", + if matches!(resolved.format, OutputFormat::Json) { + "json" + } else { + "txt" + } + ); + write_to_file(dir, &filename, &content)?; + } else { + print_diff_output(&diff, &resolved.format); + } Ok(()) } @@ -1824,10 +1985,12 @@ async fn run_brand(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), S &enriched, Some(result.url.as_str()).filter(|s| !s.is_empty()), ); - println!( - "{}", - serde_json::to_string_pretty(&brand).expect("serialization failed") - ); + let output = serde_json::to_string_pretty(&brand).expect("serialization failed"); + if let Some(ref dir) = resolved.output_dir { + write_to_file(dir, "brand.json", &output)?; + } else { + println!("{output}"); + } Ok(()) } @@ -1884,7 +2047,7 @@ async fn build_llm_provider( let chain = noxa_llm::ProviderChain::default().await; if chain.is_empty() { return Err( - "no LLM providers available -- install the gemini CLI, start Ollama, or set OPENAI_API_KEY / ANTHROPIC_API_KEY" + "no LLM providers available (priority: Gemini CLI -> OpenAI -> Ollama -> Anthropic) -- install gemini on PATH, set OPENAI_API_KEY, OLLAMA_HOST / OLLAMA_MODEL, or ANTHROPIC_API_KEY" .into(), ); } @@ -1898,6 +2061,7 @@ async fn run_llm(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), Str let provider = build_llm_provider(cli, resolved).await?; let model = resolved.llm_model.as_deref(); + let mut file_output: Option<(String, OutputFormat)> = None; if let Some(ref schema_input) = cli.extract_json { // Support @file syntax for loading schema from file @@ -1922,10 +2086,10 @@ async fn run_llm(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), Str .map_err(|e| format!("LLM extraction failed: {e}"))?; eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64()); - println!( - "{}", - 
serde_json::to_string_pretty(&extracted).expect("serialization failed") - ); + file_output = Some(( + serde_json::to_string_pretty(&extracted).expect("serialization failed"), + OutputFormat::Json, + )); } else if let Some(ref prompt) = cli.extract_prompt { let t = std::time::Instant::now(); let extracted = noxa_llm::extract::extract_with_prompt( @@ -1938,10 +2102,10 @@ async fn run_llm(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), Str .map_err(|e| format!("LLM extraction failed: {e}"))?; eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64()); - println!( - "{}", - serde_json::to_string_pretty(&extracted).expect("serialization failed") - ); + file_output = Some(( + serde_json::to_string_pretty(&extracted).expect("serialization failed"), + OutputFormat::Json, + )); } else if let Some(sentences) = cli.summarize { let t = std::time::Instant::now(); let summary = noxa_llm::summarize::summarize( @@ -1954,7 +2118,21 @@ async fn run_llm(cli: &Cli, resolved: &config::ResolvedConfig) -> Result<(), Str .map_err(|e| format!("LLM summarization failed: {e}"))?; eprintln!("LLM: {:.1}s", t.elapsed().as_secs_f64()); - println!("{summary}"); + file_output = Some((summary, OutputFormat::Text)); + } + + if let Some((output_str, file_format)) = file_output { + if let Some(ref dir) = resolved.output_dir { + let url = cli + .urls + .first() + .map(|u| normalize_url(u)) + .unwrap_or_default(); + let filename = url_to_filename(&url, &file_format); + write_to_file(dir, &filename, &output_str)?; + } else { + println!("{output_str}"); + } } Ok(()) @@ -2067,11 +2245,16 @@ async fn run_batch_llm( }; eprintln!("-> extracted {detail} ({:.1}s)", llm_elapsed.as_secs_f64()); - if let Some(ref dir) = cli.output_dir { + if let Some(ref dir) = resolved.output_dir { + let file_format = if cli.summarize.is_some() { + OutputFormat::Text + } else { + OutputFormat::Json + }; let filename = custom_names .get(url.as_str()) .map(|s| s.to_string()) - .unwrap_or_else(|| url_to_filename(url, 
&OutputFormat::Json)); + .unwrap_or_else(|| url_to_filename(url, &file_format)); write_to_file(dir, &filename, &output_str)?; } else { println!("--- {url}"); @@ -2123,7 +2306,11 @@ fn has_llm_flags(cli: &Cli) -> bool { cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some() } -async fn run_research(cli: &Cli, query: &str) -> Result<(), String> { +async fn run_research( + cli: &Cli, + resolved: &config::ResolvedConfig, + query: &str, +) -> Result<(), String> { let api_key = cli .api_key .as_deref() @@ -2209,8 +2396,12 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> { let filename = format!("research-{slug}.json"); let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default(); - std::fs::write(&filename, &json) - .map_err(|e| format!("failed to write {filename}: {e}"))?; + if let Some(ref dir) = resolved.output_dir { + write_to_file(dir, &filename, &json)?; + } else { + std::fs::write(&filename, &json) + .map_err(|e| format!("failed to write {filename}: {e}"))?; + } let elapsed = status_resp .get("elapsed_ms") @@ -2336,7 +2527,7 @@ async fn main() { // --research: deep research via cloud API if let Some(ref query) = cli.research { - if let Err(e) = run_research(&cli, query).await { + if let Err(e) = run_research(&cli, &resolved, query).await { eprintln!("error: {e}"); process::exit(1); } @@ -2377,10 +2568,7 @@ async fn main() { } // --raw-html: skip extraction, dump the fetched HTML - if resolved.raw_html - && resolved.include_selectors.is_empty() - && resolved.exclude_selectors.is_empty() - { + if resolved.raw_html { match fetch_html(&cli, &resolved).await { Ok(r) => println!("{}", r.html), Err(e) => { @@ -2394,7 +2582,7 @@ async fn main() { // Single-page extraction (handles both HTML and PDF via content-type detection) match fetch_and_extract(&cli, &resolved).await { Ok(FetchOutput::Local(result)) => { - if let Some(ref dir) = cli.output_dir { + if let Some(ref dir) = resolved.output_dir { let url = 
cli .urls .first() @@ -2413,7 +2601,23 @@ async fn main() { } } Ok(FetchOutput::Cloud(resp)) => { - print_cloud_output(&resp, &resolved.format); + if let Some(ref dir) = resolved.output_dir { + let url = cli + .urls + .first() + .map(|u| normalize_url(u)) + .unwrap_or_default(); + let custom_name = entries.first().and_then(|(_, name)| name.clone()); + let filename = + custom_name.unwrap_or_else(|| url_to_filename(&url, &resolved.format)); + let content = format_cloud_output(&resp, &resolved.format); + if let Err(e) = write_to_file(dir, &filename, &content) { + eprintln!("error: {e}"); + process::exit(1); + } + } else { + print_cloud_output(&resp, &resolved.format); + } } Err(e) => { eprintln!("{e}"); diff --git a/crates/noxa-core/src/llm/mod.rs b/crates/noxa-core/src/llm/mod.rs index 126558f..edbd993 100644 --- a/crates/noxa-core/src/llm/mod.rs +++ b/crates/noxa-core/src/llm/mod.rs @@ -91,6 +91,51 @@ mod tests { } } + #[test] + fn strips_emphasis_from_body() { + let md = "# Hello\n\nThis is **bold** and this is *italic*. Also __underbold__ and _underitalic_."; + let result = make_result(md); + let out = to_llm_text(&result, None); + + assert!(out.contains("This is bold and this is italic. 
Also underbold and underitalic."));
+        assert!(!out.contains("**"));
+        assert!(!out.contains("__"));
+        // Check the real single-asterisk marker; a spaced "* italic *" can never occur.
+        assert!(!out.contains("*italic*"));
+        assert!(!out.contains("_underitalic_"));
+    }
+
+    #[test]
+    fn dedups_repeated_phrases_in_line() {
+        let md = "Read more Read more Read more\n\nSome other text.";
+        let result = make_result(md);
+        let out = to_llm_text(&result, None);
+
+        assert!(out.contains("Read more"));
+        assert_eq!(out.matches("Read more").count(), 1);
+    }
+
+    #[test]
+    fn dedups_repeated_content_blocks() {
+        let md = "This is a block of text that is long enough to be deduped properly by the fingerprinting logic.\n\n\
+                  This is a block of text that is long enough to be deduped properly by the fingerprinting logic.";
+        let result = make_result(md);
+        let out = to_llm_text(&result, None);
+
+        // Should only appear once
+        assert_eq!(out.matches("fingerprinting logic").count(), 1);
+    }
+
+    #[test]
+    fn dedups_near_duplicate_content_blocks() {
+        let md = "First ten words of this block should be unique enough for prefix matching.\n\n\
+                  First ten words of this block should be unique enough for prefix matching but with extra text.";
+        let result = make_result(md);
+        let out = to_llm_text(&result, None);
+
+        // Near duplicate (same first 10 words) should be removed
+        assert_eq!(out.matches("First ten words").count(), 1);
+    }
+
     #[test]
     fn metadata_header_includes_populated_fields() {
         let result = make_result("# Hello");
diff --git a/crates/noxa-fetch/src/client.rs b/crates/noxa-fetch/src/client.rs
index 1438dc3..e20066a 100644
--- a/crates/noxa-fetch/src/client.rs
+++ b/crates/noxa-fetch/src/client.rs
@@ -12,10 +12,10 @@ use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 
+use noxa_pdf::PdfMode;
 use rand::seq::SliceRandom;
 use tokio::sync::Semaphore;
 use tracing::{debug, instrument, warn};
-use noxa_pdf::PdfMode;
 
 use crate::browser::{self, BrowserProfile,
BrowserVariant}; use crate::error::FetchError; @@ -573,10 +573,7 @@ fn extract_homepage(url: &str) -> Option { } /// Convert a noxa-pdf PdfResult into a noxa-core ExtractionResult. -fn pdf_to_extraction_result( - pdf: &noxa_pdf::PdfResult, - url: &str, -) -> noxa_core::ExtractionResult { +fn pdf_to_extraction_result(pdf: &noxa_pdf::PdfResult, url: &str) -> noxa_core::ExtractionResult { let markdown = noxa_pdf::to_markdown(pdf); let word_count = markdown.split_whitespace().count(); diff --git a/crates/noxa-fetch/src/lib.rs b/crates/noxa-fetch/src/lib.rs index 54f2034..1bc933c 100644 --- a/crates/noxa-fetch/src/lib.rs +++ b/crates/noxa-fetch/src/lib.rs @@ -17,6 +17,6 @@ pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, Fetc pub use crawler::{CrawlConfig, CrawlResult, CrawlState, Crawler, PageResult}; pub use error::FetchError; pub use http::HeaderMap; +pub use noxa_pdf::PdfMode; pub use proxy::{parse_proxy_file, parse_proxy_line}; pub use sitemap::SitemapEntry; -pub use noxa_pdf::PdfMode; diff --git a/crates/noxa-fetch/src/linkedin.rs b/crates/noxa-fetch/src/linkedin.rs index b9a42cd..1c0bb69 100644 --- a/crates/noxa-fetch/src/linkedin.rs +++ b/crates/noxa-fetch/src/linkedin.rs @@ -1,3 +1,4 @@ +use noxa_core::{Content, ExtractionResult, Metadata}; /// LinkedIn post extraction from authenticated HTML. /// /// LinkedIn's SPA stores all data in `` tags as HTML-escaped JSON. @@ -5,7 +6,6 @@ /// Profile, etc. We parse these to reconstruct post + comments as markdown. use serde_json::Value; use tracing::debug; -use noxa_core::{Content, ExtractionResult, Metadata}; /// Check if a URL is a LinkedIn post/activity. 
pub fn is_linkedin_post(url: &str) -> bool { diff --git a/crates/noxa-fetch/src/reddit.rs b/crates/noxa-fetch/src/reddit.rs index be8622c..4d11c0f 100644 --- a/crates/noxa-fetch/src/reddit.rs +++ b/crates/noxa-fetch/src/reddit.rs @@ -1,3 +1,4 @@ +use noxa_core::{Content, ExtractionResult, Metadata}; /// Reddit JSON API fallback for extracting posts + comments without JS rendering. /// /// Reddit's new `shreddit` frontend only SSRs the post body — comments are @@ -5,7 +6,6 @@ /// comment tree as structured JSON, which we convert to clean markdown. use serde::Deserialize; use tracing::debug; -use noxa_core::{Content, ExtractionResult, Metadata}; /// Check if a URL points to a Reddit post/comment page. pub fn is_reddit_url(url: &str) -> bool { diff --git a/crates/noxa-llm/src/chain.rs b/crates/noxa-llm/src/chain.rs index 43f3de9..6e1561d 100644 --- a/crates/noxa-llm/src/chain.rs +++ b/crates/noxa-llm/src/chain.rs @@ -1,5 +1,5 @@ /// Provider chain — tries providers in order until one succeeds. -/// Default order: Ollama (local, free) -> OpenAI -> Anthropic. +/// Default order: Gemini CLI (primary) -> OpenAI -> Ollama -> Anthropic. /// Only includes providers that are actually configured/available. 
use async_trait::async_trait; use tracing::{debug, info, warn}; @@ -7,9 +7,7 @@ use tracing::{debug, info, warn}; use crate::error::LlmError; use crate::provider::{CompletionRequest, LlmProvider}; use crate::providers::{ - anthropic::AnthropicProvider, - gemini_cli::GeminiCliProvider, - ollama::OllamaProvider, + anthropic::AnthropicProvider, gemini_cli::GeminiCliProvider, ollama::OllamaProvider, openai::OpenAiProvider, }; @@ -94,7 +92,11 @@ impl LlmProvider for ProviderChain { let t = std::time::Instant::now(); match provider.complete(request).await { Ok(response) => { - info!(provider = provider.name(), elapsed_ms = t.elapsed().as_millis(), "completion succeeded"); + info!( + provider = provider.name(), + elapsed_ms = t.elapsed().as_millis(), + "completion succeeded" + ); return Ok(response); } Err(e) => { diff --git a/crates/noxa-llm/src/extract.rs b/crates/noxa-llm/src/extract.rs index 9216b0d..e637628 100644 --- a/crates/noxa-llm/src/extract.rs +++ b/crates/noxa-llm/src/extract.rs @@ -8,44 +8,47 @@ use crate::provider::{CompletionRequest, LlmProvider, Message}; /// Validate a JSON value against a schema. Returns Ok(()) on success or /// Err(LlmError::InvalidJson) with a concise error message on failure. 
-fn validate_schema( - value: &serde_json::Value, - schema: &serde_json::Value, -) -> Result<(), LlmError> { - let compiled = jsonschema::validator_for(schema).map_err(|e| { - LlmError::InvalidJson(format!("invalid schema: {e}")) - })?; +fn validate_schema(value: &serde_json::Value, schema: &serde_json::Value) -> Result<(), LlmError> { + let compiled = jsonschema::validator_for(schema) + .map_err(|e| LlmError::InvalidJson(format!("invalid schema: {e}")))?; - let errors: Vec = compiled - .iter_errors(value) - .map(|e| format!("{} at {}", e, e.instance_path())) - .collect(); + let first_error = compiled.iter_errors(value).next(); - if errors.is_empty() { - Ok(()) - } else { - Err(LlmError::InvalidJson(format!( - "schema validation failed: {}", - errors.join("; ") - ))) + match first_error { + None => Ok(()), + Some(e) => { + let msg = format!("{} at {}", e, e.instance_path()); + Err(LlmError::InvalidJson(format!( + "schema validation failed: {msg}" + ))) + } } } +/// Compile a schema up front so invalid schemas fail before any provider call. +fn validate_schema_definition(schema: &serde_json::Value) -> Result<(), LlmError> { + jsonschema::validator_for(schema) + .map(|_| ()) + .map_err(|e| LlmError::InvalidJson(format!("invalid schema: {e}"))) +} + /// Extract structured JSON from content using a JSON schema. /// The schema tells the LLM exactly what fields to extract and their types. /// /// Retry policy: -/// - If the response cannot be parsed as JSON at all: retry once with the -/// identical request (handles transient formatting issues). -/// - If the response is valid JSON but fails schema validation: return -/// `LlmError::InvalidJson` immediately — the schema is likely unsatisfiable -/// for this content, so retrying would produce the same result. +/// - If the response cannot be parsed as JSON: retry once with a correction prompt. 
+/// - If the response is valid JSON but fails schema validation: retry once with +/// a tighter correction prompt that includes the specific validation error. +/// - Both retry attempts add the previous failed response as an 'assistant' message +/// and the correction instructions as a 'user' message to improve success. pub async fn extract_json( content: &str, schema: &serde_json::Value, provider: &dyn LlmProvider, model: Option<&str>, ) -> Result { + validate_schema_definition(schema)?; + let system = format!( "You are a JSON extraction engine. Extract data from the content according to this schema.\n\ Return ONLY valid JSON matching the schema. No explanations, no markdown, no commentary.\n\n\ @@ -53,18 +56,20 @@ pub async fn extract_json( serde_json::to_string_pretty(schema).unwrap_or_else(|_| schema.to_string()) ); - let request = CompletionRequest { + let mut messages = vec![ + Message { + role: "system".into(), + content: system, + }, + Message { + role: "user".into(), + content: content.to_string(), + }, + ]; + + let mut request = CompletionRequest { model: model.unwrap_or_default().to_string(), - messages: vec![ - Message { - role: "system".into(), - content: system, - }, - Message { - role: "user".into(), - content: content.to_string(), - }, - ], + messages: messages.clone(), temperature: Some(0.0), max_tokens: None, json_mode: true, @@ -72,23 +77,54 @@ pub async fn extract_json( let response = provider.complete(&request).await?; - match parse_json_response(&response) { - Ok(value) => { - // Valid JSON — now validate against the schema. - // Schema mismatches do not retry (unsatisfiable → same result). - validate_schema(&value, schema)?; - Ok(value) - } - Err(_parse_err) => { - // Unparseable JSON — retry once with the identical request. + match parse_and_validate(&response, schema) { + Ok(value) => Ok(value), + Err(e) => { + // First attempt failed — retry once with a correction prompt. + // Construct a concise correction prompt based on the error type. 
+ let correction_prompt = match &e { + LlmError::InvalidJson(msg) if msg.contains("schema validation failed") => { + let error_msg = msg.replace("schema validation failed: ", ""); + format!("Correction required: {}. Return ONLY the corrected JSON.", error_msg) + } + _ => { + "Your response was not valid JSON. Please return ONLY valid JSON matching the schema.".to_string() + } + }; + + // Limit correction context to prevent token blowup on large hallucinated outputs. + let capped_response = if response.len() > 2000 { + format!("{}... [truncated]", &response[..2000]) + } else { + response.clone() + }; + + messages.push(Message { + role: "assistant".into(), + content: capped_response, + }); + messages.push(Message { + role: "user".into(), + content: correction_prompt, + }); + + request.messages = messages; let retry_response = provider.complete(&request).await?; - let value = parse_json_response(&retry_response)?; - validate_schema(&value, schema)?; - Ok(value) + parse_and_validate(&retry_response, schema) } } } +/// Helper: parse response string as JSON and validate it against the schema. +fn parse_and_validate( + response: &str, + schema: &serde_json::Value, +) -> Result { + let value = parse_json_response(response)?; + validate_schema(&value, schema)?; + Ok(value) +} + /// Extract information using a natural language prompt. /// More flexible than schema extraction — the user describes what they want. 
pub async fn extract_with_prompt( @@ -301,9 +337,7 @@ mod tests { ], ); - let result = extract_json("content", &schema, &mock, None) - .await - .unwrap(); + let result = extract_json("content", &schema, &mock, None).await.unwrap(); assert_eq!(result["title"], "Retry succeeded"); } @@ -318,10 +352,7 @@ mod tests { let mock = SequenceMockProvider::new( "mock-seq", - vec![ - Ok("not json".to_string()), - Ok("also not json".to_string()), - ], + vec![Ok("not json".to_string()), Ok("also not json".to_string())], ); let result = extract_json("content", &schema, &mock, None).await; @@ -332,7 +363,7 @@ mod tests { } #[tokio::test] - async fn schema_mismatch_does_not_retry() { + async fn schema_mismatch_triggers_retry() { use crate::testing::mock::SequenceMockProvider; let schema = serde_json::json!({ @@ -343,20 +374,17 @@ mod tests { } }); - // Both calls return valid JSON with wrong schema — but only one call should happen. + // First call: valid JSON but schema mismatch (price is string). + // Second call: valid JSON matching schema. let mock = SequenceMockProvider::new( "mock-seq", vec![ Ok(r#"{"price": "wrong-type"}"#.to_string()), - Ok(r#"{"price": 9.99}"#.to_string()), // would succeed — but shouldn't be called + Ok(r#"{"price": 9.99}"#.to_string()), ], ); - // Should return InvalidJson without calling second response. - let result = extract_json("content", &schema, &mock, None).await; - assert!( - matches!(result, Err(LlmError::InvalidJson(_))), - "schema mismatch should not trigger retry" - ); + let result = extract_json("content", &schema, &mock, None).await.unwrap(); + assert_eq!(result["price"], 9.99); } } diff --git a/crates/noxa-llm/src/lib.rs b/crates/noxa-llm/src/lib.rs index 250ae88..129b148 100644 --- a/crates/noxa-llm/src/lib.rs +++ b/crates/noxa-llm/src/lib.rs @@ -2,7 +2,7 @@ /// /// Provider chain: Gemini CLI (primary) → OpenAI → Ollama → Anthropic. /// Gemini CLI requires the `gemini` binary on PATH; GEMINI_MODEL env var sets the model. 
-/// Provides schema-validated extraction (with one retry on parse failure), +/// Provides schema-validated extraction (with one retry on parse or schema mismatch), /// prompt extraction, and summarization on top of noxa-core's content pipeline. pub mod chain; pub mod clean; diff --git a/crates/noxa-llm/src/providers/gemini_cli.rs b/crates/noxa-llm/src/providers/gemini_cli.rs index 9d2d2d7..54f137e 100644 --- a/crates/noxa-llm/src/providers/gemini_cli.rs +++ b/crates/noxa-llm/src/providers/gemini_cli.rs @@ -12,11 +12,11 @@ /// /// Two flags reduce this: /// - `--extensions ""` — skips extension loading (~3 s saved) -/// - `current_dir` set to a temp workdir containing `.gemini/settings.json` with -/// `{"mcpServers":{}}` — workspace settings override user settings, so all 6 MCP +/// - `current_dir` set to a best-effort temp workdir containing `.gemini/settings.json` +/// with `{"mcpServers":{}}` — workspace settings override user settings, so all 6 MCP /// servers are skipped at subprocess startup (major speedup). /// -/// The workdir is created once at construction and reused for every call. +/// The workdir is created once at construction and reused for every call when available. use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; @@ -36,10 +36,6 @@ const MAX_CONCURRENT: usize = 6; /// Subprocess deadline — prevents hung `gemini` processes blocking the chain. const SUBPROCESS_TIMEOUT: Duration = Duration::from_secs(60); -/// Fixed workdir used for every subprocess call. -/// A workspace-level `.gemini/settings.json` here overrides the user's MCP server config. 
-const NOXA_GEMINI_WORKDIR: &str = "/tmp/noxa-gemini"; - pub struct GeminiCliProvider { default_model: String, semaphore: Arc, @@ -56,7 +52,7 @@ impl GeminiCliProvider { .filter(|s| !s.is_empty()) .unwrap_or_else(|| "gemini-2.5-pro".into()); - let workdir = PathBuf::from(NOXA_GEMINI_WORKDIR); + let workdir = std::env::temp_dir().join("noxa-gemini"); ensure_gemini_workdir(&workdir); Self { @@ -106,11 +102,14 @@ impl LlmProvider for GeminiCliProvider { // Workspace settings in self.workdir override the user's ~/.gemini/settings.json, // replacing the user's MCP server list with {} so none are spawned at startup. // Without this, each of the user's MCP servers adds latency to every call. - cmd.current_dir(&self.workdir); + if self.workdir.is_dir() { + cmd.current_dir(&self.workdir); + } cmd.stdin(std::process::Stdio::null()); cmd.stdout(std::process::Stdio::piped()); cmd.stderr(std::process::Stdio::piped()); + cmd.kill_on_drop(true); debug!(model, workdir = %self.workdir.display(), "spawning gemini subprocess"); @@ -169,7 +168,9 @@ fn extract_response_from_output(stdout: &str) -> Result { let json_str = &stdout[json_start..]; let outer: serde_json::Value = serde_json::from_str(json_str).map_err(|e| { let preview = &json_str[..json_str.len().min(300)]; - LlmError::ProviderError(format!("failed to parse gemini JSON output: {e} — {preview}")) + LlmError::ProviderError(format!( + "failed to parse gemini JSON output: {e} — {preview}" + )) })?; // `response` holds the model's actual text output. @@ -320,10 +321,7 @@ mod tests { fn extracts_response_skipping_mcp_noise() { // MCP warning line appears before the JSON object in real gemini output. let stdout = "MCP issues detected. 
Run /mcp list for status.\n{\"session_id\":\"abc\",\"response\":\"the answer\",\"stats\":{}}"; - assert_eq!( - extract_response_from_output(stdout).unwrap(), - "the answer" - ); + assert_eq!(extract_response_from_output(stdout).unwrap(), "the answer"); } #[test] diff --git a/crates/noxa-llm/src/providers/mod.rs b/crates/noxa-llm/src/providers/mod.rs index b1a8736..53dc760 100644 --- a/crates/noxa-llm/src/providers/mod.rs +++ b/crates/noxa-llm/src/providers/mod.rs @@ -29,9 +29,6 @@ mod tests { #[test] fn none_override_with_no_env_returns_none() { - assert_eq!( - load_api_key(None, "NOXA_TEST_NONEXISTENT_KEY_12345"), - None - ); + assert_eq!(load_api_key(None, "NOXA_TEST_NONEXISTENT_KEY_12345"), None); } } diff --git a/crates/noxa-llm/src/providers/ollama.rs b/crates/noxa-llm/src/providers/ollama.rs index d728e67..dbdbecb 100644 --- a/crates/noxa-llm/src/providers/ollama.rs +++ b/crates/noxa-llm/src/providers/ollama.rs @@ -8,6 +8,8 @@ use crate::clean::strip_thinking_tags; use crate::error::LlmError; use crate::provider::{CompletionRequest, LlmProvider}; +const DEFAULT_HEALTH_TIMEOUT_MS: u64 = 2_000; + pub struct OllamaProvider { client: reqwest::Client, base_url: String, @@ -22,7 +24,7 @@ impl OllamaProvider { let default_model = model .or_else(|| std::env::var("OLLAMA_MODEL").ok()) - .unwrap_or_else(|| "qwen3:8b".into()); + .unwrap_or_else(|| "qwen3.5:9b".into()); Self { client: reqwest::Client::new(), @@ -98,7 +100,7 @@ impl LlmProvider for OllamaProvider { async fn is_available(&self) -> bool { let url = format!("{}/api/tags", self.base_url); matches!( - tokio::time::timeout(Duration::from_millis(500), self.client.get(&url).send()).await, + tokio::time::timeout(health_timeout(), self.client.get(&url).send()).await, Ok(Ok(r)) if r.status().is_success() ) } @@ -108,6 +110,18 @@ impl LlmProvider for OllamaProvider { } } +fn health_timeout() -> Duration { + health_timeout_from_env(std::env::var("OLLAMA_HEALTH_TIMEOUT_MS").ok()) +} + +fn health_timeout_from_env(value: 
Option) -> Duration { + value + .and_then(|v| v.parse::().ok()) + .filter(|ms| *ms > 0) + .map(Duration::from_millis) + .unwrap_or_else(|| Duration::from_millis(DEFAULT_HEALTH_TIMEOUT_MS)) +} + #[cfg(test)] mod tests { use super::*; @@ -142,6 +156,27 @@ mod tests { assert_eq!(provider.default_model(), "phi3:mini"); } + #[test] + fn health_timeout_from_env_defaults_when_unset() { + assert_eq!(health_timeout_from_env(None), Duration::from_millis(2000)); + } + + #[test] + fn health_timeout_from_env_parses_override() { + assert_eq!( + health_timeout_from_env(Some("1500".into())), + Duration::from_millis(1500) + ); + } + + #[test] + fn health_timeout_from_env_ignores_invalid_values() { + assert_eq!( + health_timeout_from_env(Some("not-a-number".into())), + Duration::from_millis(2000) + ); + } + // Env var fallback is a trivial `env::var().ok()` -- not worth the flakiness // of manipulating process-global state. Run in isolation if needed: // cargo test -p noxa-llm env_var_fallback -- --ignored --test-threads=1 diff --git a/crates/noxa-llm/src/testing.rs b/crates/noxa-llm/src/testing.rs index da5cc0b..98a0693 100644 --- a/crates/noxa-llm/src/testing.rs +++ b/crates/noxa-llm/src/testing.rs @@ -4,8 +4,8 @@ /// extract, chain, and other modules that need a fake LLM backend. #[cfg(test)] pub(crate) mod mock { - use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; use async_trait::async_trait; @@ -50,7 +50,7 @@ pub(crate) mod mock { } /// A mock provider that returns responses from a sequence. - /// Call N → returns responses[N], wrapping at the end. + /// Call N → returns responses[N], clamping to the final response. /// Useful for testing first-failure / second-success retry paths. 
pub struct SequenceMockProvider { pub name: &'static str, @@ -60,10 +60,11 @@ pub(crate) mod mock { } impl SequenceMockProvider { - pub fn new( - name: &'static str, - responses: Vec>, - ) -> Self { + pub fn new(name: &'static str, responses: Vec>) -> Self { + assert!( + !responses.is_empty(), + "SequenceMockProvider requires at least one response" + ); Self { name, responses, diff --git a/crates/noxa-mcp/src/cloud.rs b/crates/noxa-mcp/src/cloud.rs index ee4d259..315ef59 100644 --- a/crates/noxa-mcp/src/cloud.rs +++ b/crates/noxa-mcp/src/cloud.rs @@ -7,7 +7,6 @@ use std::time::Duration; use serde_json::{Value, json}; use tracing::info; - const API_BASE: &str = "https://api.noxa.io/v1"; /// Lightweight client for the noxa cloud API. diff --git a/crates/noxa-mcp/src/server.rs b/crates/noxa-mcp/src/server.rs index 4b7bb44..db926e7 100644 --- a/crates/noxa-mcp/src/server.rs +++ b/crates/noxa-mcp/src/server.rs @@ -18,6 +18,8 @@ use url::Url; use crate::cloud::{self, CloudClient, SmartFetchResult}; use crate::tools::*; +const NO_LLM_PROVIDERS_MESSAGE: &str = "No LLM providers available (priority: Gemini CLI -> OpenAI -> Ollama -> Anthropic). Install gemini on PATH, set OPENAI_API_KEY, OLLAMA_HOST / OLLAMA_MODEL, or ANTHROPIC_API_KEY, or set NOXA_API_KEY for cloud fallback."; + pub struct NoxaMcp { tool_router: ToolRouter, fetch_client: Arc, @@ -89,7 +91,7 @@ impl NoxaMcp { let chain = noxa_llm::ProviderChain::default().await; let llm_chain = if chain.is_empty() { - warn!("no LLM providers available (gemini CLI, OPENAI_API_KEY, ANTHROPIC_API_KEY) -- extract/summarize tools will fail"); + warn!("{NO_LLM_PROVIDERS_MESSAGE} -- extract/summarize tools will fail"); None } else { info!(providers = chain.len(), "LLM provider chain ready"); @@ -333,9 +335,7 @@ impl NoxaMcp { // No local LLM — fall back to cloud API directly if self.llm_chain.is_none() { - let cloud = self.cloud.as_ref().ok_or( - "No LLM providers available. 
Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.", - )?; + let cloud = self.cloud.as_ref().ok_or(NO_LLM_PROVIDERS_MESSAGE)?; let mut body = json!({"url": params.url}); if let Some(ref schema) = params.schema { body["schema"] = json!(schema); @@ -386,9 +386,7 @@ impl NoxaMcp { // No local LLM — fall back to cloud API directly if self.llm_chain.is_none() { - let cloud = self.cloud.as_ref().ok_or( - "No LLM providers available. Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.", - )?; + let cloud = self.cloud.as_ref().ok_or(NO_LLM_PROVIDERS_MESSAGE)?; let mut body = json!({"url": params.url}); if let Some(sentences) = params.max_sentences { body["max_sentences"] = json!(sentences); @@ -425,9 +423,8 @@ impl NoxaMcp { #[tool] async fn diff(&self, Parameters(params): Parameters) -> Result { validate_url(&params.url)?; - let previous: noxa_core::ExtractionResult = - serde_json::from_str(&params.previous_snapshot) - .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?; + let previous: noxa_core::ExtractionResult = serde_json::from_str(&params.previous_snapshot) + .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?; let result = cloud::smart_fetch( &self.fetch_client, @@ -515,8 +512,7 @@ impl NoxaMcp { } } - let identity = - noxa_core::brand::extract_brand(&fetch_result.html, Some(&fetch_result.url)); + let identity = noxa_core::brand::extract_brand(&fetch_result.html, Some(&fetch_result.url)); Ok(serde_json::to_string_pretty(&identity).unwrap_or_default()) } diff --git a/docs/config.md b/docs/config.md new file mode 100644 index 0000000..0a89454 --- /dev/null +++ b/docs/config.md @@ -0,0 +1,273 @@ +# Config and Environment + +This document explains how `noxa` loads configuration, how it merges `config.json` with environment variables and CLI flags, and which settings belong in each place.
+ +## Quick Summary + +- `config.json` is for non-secret defaults. +- `.env` is for secrets and URLs. +- CLI flags always win over config and environment variables. +- Unknown keys in `config.json` are ignored. +- `config.json` uses `snake_case` keys. + +## Load Order + +`noxa` resolves settings in this order: + +1. CLI flags +2. `config.json` +3. Environment variables +4. Built-in defaults + +That means you can set a default in `config.json`, override it for a single run with a CLI flag, and keep secrets in `.env` without checking them into source control. + +## Where `config.json` Comes From + +By default, `noxa` loads `./config.json` from the current working directory. + +You can override that in two ways: + +- `--config <path>` on the CLI +- `NOXA_CONFIG=<path>` in the environment + +If the file does not exist: + +- an explicit `--config` path or `NOXA_CONFIG` path is an error +- the default `./config.json` is optional and missing files are ignored + +To bypass config entirely for one run: + +```bash +NOXA_CONFIG=/dev/null noxa https://example.com +``` + +## What Belongs Where + +### `config.json` + +Use `config.json` for stable, non-secret defaults such as: + +- output format +- output directory +- browser fingerprint +- timeout +- crawl depth and page limits +- selector filters +- LLM provider and model + +### `.env` + +Use `.env` for secrets, URLs, and a small number of runtime overrides: + +- `NOXA_API_KEY` +- `NOXA_PROXY` +- `NOXA_PROXY_FILE` +- `NOXA_WEBHOOK_URL` +- `NOXA_LLM_BASE_URL` + +Those values are intentionally excluded from `config.json`. + +If you run `setup.sh` or the Docker Compose stack, the generated `.env` may also include local deployment settings such as `NOXA_PORT`, `NOXA_HOST`, `NOXA_AUTH_KEY`, `NOXA_LOG`, `OLLAMA_HOST`, and `OLLAMA_MODEL`. + +### CLI-only + +These options stay on the command line and do not belong in `config.json`: + +- `--on-change` +- `--raw-html` + +`--on-change` is CLI-only because it executes shell commands.
`--raw-html` is a per-run mode, not a persistent default. + +## Config File Rules + +- Keys are `snake_case`. +- All fields are optional. +- Unknown fields are ignored. +- Arrays are used for selector and path lists. +- Boolean flags have one important limitation: if you set them to `true` in `config.json`, you cannot disable them for a single CLI run with a `--no-...` flag because `noxa` does not define one. + +The boolean fields with this limitation are: + +- `metadata` +- `verbose` +- `only_main_content` +- `use_sitemap` + +If you need to turn one of those off temporarily, bypass the config file with `NOXA_CONFIG=/dev/null`. + +## Supported `config.json` Keys + +### Output + +| Key | Type | Default | Notes | +|---|---|---:|---| +| `format` | string | `markdown` | One of `markdown`, `json`, `text`, `llm`, `html` | +| `metadata` | boolean | `false` | Include metadata in output | +| `verbose` | boolean | `false` | Enable verbose logging | +| `output_dir` | string or null | `null` | Write outputs to files in this directory instead of stdout | + +When `output_dir` is set, noxa writes results to files instead of printing them for the modes that support file output: + +- single URL extraction +- multi-URL batch extraction +- crawl +- LLM extraction and summarization +- sitemap discovery +- diff output +- brand extraction +- research reports +- watch changes + +File names are derived from the URL or mode name, and the directory is created on demand. 
+ +### Output Directory Layout + +For URL-based output, noxa mirrors the URL path under `output_dir`: + +| URL | Written file | +|---|---| +| `https://example.com/` | `output_dir/example_com/index.md` | +| `https://example.com/docs/api` | `output_dir/docs/api.md` | +| `https://example.com/docs/api/` | `output_dir/docs/api.md` | +| `https://example.com/blog/post?id=123` | `output_dir/blog/post_id_123.md` | + +The extension comes from the selected output format: + +| Format | Extension | +|---|---| +| `markdown` | `.md` | +| `llm` | `.md` | +| `json` | `.json` | +| `text` | `.txt` | +| `html` | `.html` | + +For `--urls-file`, a CSV entry of `url,filename` uses the custom filename instead of the URL-derived name. + +Examples: + +```txt +https://example.com/docs/api,api.md +https://example.com/blog/post +``` + +Becomes: + +```txt +output_dir/api.md +output_dir/blog/post.md +``` + +Mode-specific outputs use fixed filenames in the root of `output_dir`: + +| Mode | File | +|---|---| +| `--map` | `sitemap.json` or `sitemap.txt` | +| `--diff-with` | `diff.json` or `diff.txt` | +| `--brand` | `brand.json` | +| `--research` | `research-.json` | +| `--watch` | `watch-.json` | + +The directory tree is created automatically, so nested paths do not need to exist ahead of time. 
+ +### Fetch + +| Key | Type | Default | Notes | +|---|---|---:|---| +| `browser` | string | `chrome` | One of `chrome`, `firefox`, `random` | +| `timeout` | integer | `30` | Request timeout in seconds | +| `pdf_mode` | string | `auto` | One of `auto`, `fast` | +| `only_main_content` | boolean | `false` | Auto-detect the main content area | + +### Content Filtering + +| Key | Type | Default | Notes | +|---|---|---:|---| +| `include_selectors` | array of strings | `[]` | CSS selectors to include | +| `exclude_selectors` | array of strings | `[]` | CSS selectors to exclude | + +### Crawl + +| Key | Type | Default | Notes | +|---|---|---:|---| +| `depth` | integer | `1` | Crawl depth | +| `max_pages` | integer | `20` | Maximum pages to crawl | +| `concurrency` | integer | `5` | Concurrent requests | +| `delay` | integer | `100` | Delay between requests in ms | +| `path_prefix` | string or null | `null` | Only crawl URLs whose path starts with this prefix | +| `include_paths` | array of strings | `[]` | Glob patterns to include | +| `exclude_paths` | array of strings | `[]` | Glob patterns to exclude | +| `use_sitemap` | boolean | `false` | Seed the crawl from sitemap discovery | + +### LLM + +| Key | Type | Default | Notes | +|---|---|---:|---| +| `llm_provider` | string | unset | Optional provider name: `gemini`, `ollama`, `openai`, `anthropic` | +| `llm_model` | string | unset | Optional model override | + +## Environment Variables + +| Variable | Purpose | Notes | +|---|---|---| +| `NOXA_API_KEY` | Cloud API key | Used for cloud fallback and cloud-only features | +| `NOXA_PROXY` | Single proxy URL | Takes priority over proxy file when set | +| `NOXA_PROXY_FILE` | Proxy pool file path | One proxy per line | +| `NOXA_WEBHOOK_URL` | Notification webhook | Used by watch/crawl/batch notifications | +| `NOXA_LLM_BASE_URL` | LLM endpoint URL | For Ollama or OpenAI-compatible endpoints | +| `NOXA_LLM_PROVIDER` | Default LLM provider | Environment override for the provider 
name | +| `NOXA_LLM_MODEL` | Default LLM model | Environment override for the model name | +| `NOXA_CONFIG` | Config file path | Override `./config.json` or bypass with `/dev/null` | + +The following variables are not part of the `config.json` contract, but they still matter for LLM provider behavior: + +- `OPENAI_API_KEY` +- `ANTHROPIC_API_KEY` +- `OLLAMA_HOST` +- `OLLAMA_MODEL` +- `GEMINI_MODEL` + +## Example + +`config.example.json` shows the recommended baseline: + +```json +{ + "$schema": "./config.schema.json", + "_doc": [ + "Copy to config.json and remove fields you don't need.", + "Secrets (api_key, proxy, webhook, llm_base_url) go in .env — NOT here." + ], + "format": "markdown", + "browser": "chrome", + "timeout": 30, + "pdf_mode": "auto", + "metadata": false, + "verbose": false, + "only_main_content": false, + "include_selectors": [], + "exclude_selectors": ["nav", "footer", ".sidebar", ".cookie-banner"], + "depth": 1, + "max_pages": 20, + "concurrency": 5, + "delay": 100, + "path_prefix": null, + "include_paths": [], + "exclude_paths": ["/changelog/*", "/blog/*", "/releases/*"], + "use_sitemap": false, + "llm_provider": "gemini", + "llm_model": "gemini-2.5-pro" +} +``` + +## Gotchas + +- `config.json` is permissive by design: unknown fields are ignored so newer config files still work on older binaries. +- `llm_provider` is validated by the CLI at runtime; invalid values will fail when the provider is selected. +- `browser`, `timeout`, `depth`, `max_pages`, `concurrency`, and `delay` are ordinary defaults, so CLI flags can override them per run. +- Boolean defaults set to `true` in config are sticky for that run unless you bypass the file. 
+ +## Related Files + +- [`config.schema.json`](../config.schema.json) +- [`config.example.json`](../config.example.json) +- [`env.example`](../env.example) diff --git a/env.example b/env.example index aad81c5..f85a0c9 100644 --- a/env.example +++ b/env.example @@ -13,8 +13,16 @@ NOXA_PROXY_FILE= # Webhook URL for completion notifications NOXA_WEBHOOK_URL= -# LLM base URL (Ollama or OpenAI-compatible endpoint) -NOXA_LLM_BASE_URL= +# LLM provider configuration and backend defaults +# NOXA_LLM_PROVIDER=gemini +# NOXA_LLM_MODEL=gemini-2.5-pro +# NOXA_LLM_BASE_URL= (Ollama or OpenAI-compatible endpoint) +# GEMINI_MODEL=gemini-2.5-pro +# OLLAMA_HOST=http://localhost:11434 +# OLLAMA_MODEL=qwen3.5:9b +# OLLAMA_HEALTH_TIMEOUT_MS=2000 +# OPENAI_API_KEY= +# ANTHROPIC_API_KEY= # Optional: path to a non-default config file (default: ./config.json) # NOXA_CONFIG=/path/to/my-config.json diff --git a/examples/README.md b/examples/README.md deleted file mode 100644 index f9aee68..0000000 --- a/examples/README.md +++ /dev/null @@ -1,320 +0,0 @@ -# Examples - -Practical examples showing what noxa can do. Each example is a self-contained command you can run immediately. 
- -## Basic Extraction - -```bash -# Extract as markdown (default) -noxa https://example.com - -# Multiple output formats -noxa https://example.com -f markdown # Clean markdown -noxa https://example.com -f json # Full structured JSON -noxa https://example.com -f text # Plain text (no formatting) -noxa https://example.com -f llm # Token-optimized for LLMs (67% fewer tokens) - -# Bare domains work (auto-prepends https://) -noxa example.com -``` - -## Content Filtering - -```bash -# Only extract main content (skip nav, sidebar, footer) -noxa https://docs.rs/tokio --only-main-content - -# Include specific CSS selectors -noxa https://news.ycombinator.com --include ".titleline,.score" - -# Exclude specific elements -noxa https://example.com --exclude "nav,footer,.ads,.sidebar" - -# Combine both -noxa https://docs.rs/reqwest --only-main-content --exclude ".sidebar" -``` - -## Brand Identity Extraction - -```bash -# Extract colors, fonts, logos from any website -noxa --brand https://stripe.com -# Output: { "name": "Stripe", "colors": [...], "fonts": ["Sohne"], "logos": [...] } - -noxa --brand https://github.com -# Output: { "name": "GitHub", "colors": [{"hex": "#1F2328", ...}], "fonts": ["Mona Sans"], ... 
} - -noxa --brand wikipedia.org -# Output: 10 colors, 5 fonts, favicon, logo URL -``` - -## Sitemap Discovery - -```bash -# Discover all URLs from a site's sitemaps -noxa --map https://sitemaps.org -# Output: one URL per line (84 URLs found) - -# JSON output with metadata -noxa --map https://sitemaps.org -f json -# Output: [{ "url": "...", "last_modified": "...", "priority": 0.8 }] -``` - -## Recursive Crawling - -```bash -# Crawl a site (default: depth 1, max 20 pages) -noxa --crawl https://example.com - -# Control depth and page limit -noxa --crawl --depth 2 --max-pages 50 https://docs.rs/tokio - -# Crawl with sitemap seeding (finds more pages) -noxa --crawl --sitemap --depth 2 https://docs.rs/tokio - -# Filter crawl paths -noxa --crawl --include-paths "/api/*,/guide/*" https://docs.example.com -noxa --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com - -# Control concurrency and delay -noxa --crawl --concurrency 10 --delay 200 https://example.com -``` - -## Change Detection (Diff) - -```bash -# Step 1: Save a snapshot -noxa https://example.com -f json > snapshot.json - -# Step 2: Later, compare against the snapshot -noxa --diff-with snapshot.json https://example.com -# Output: -# Status: Same -# Word count delta: +0 - -# If the page changed: -# Status: Changed -# Word count delta: +42 -# --- old -# +++ new -# @@ -1,3 +1,3 @@ -# -Old content here -# +New content here -``` - -## PDF Extraction - -```bash -# PDF URLs are auto-detected via Content-Type -noxa https://example.com/report.pdf - -# Control PDF mode -noxa --pdf-mode auto https://example.com/report.pdf # Error on empty (catches scanned PDFs) -noxa --pdf-mode fast https://example.com/report.pdf # Return whatever text is found -``` - -## Batch Processing - -```bash -# Multiple URLs in one command -noxa https://example.com https://httpbin.org/html https://rust-lang.org - -# URLs from a file (one per line, # comments supported) -noxa --urls-file urls.txt - -# Batch with JSON output -noxa 
--urls-file urls.txt -f json - -# Proxy rotation for large batches -noxa --urls-file urls.txt --proxy-file proxies.txt --concurrency 10 -``` - -## Local Files & Stdin - -```bash -# Extract from a local HTML file -noxa --file page.html - -# Pipe HTML from another command -curl -s https://example.com | noxa --stdin - -# Chain with other tools -noxa https://example.com -f text | wc -w # Word count -noxa https://example.com -f json | jq '.metadata.title' # Extract title with jq -``` - -## Cloud API Mode - -When you have a noxa API key, the CLI can route through the cloud for bot protection bypass, JS rendering, and proxy rotation. - -```bash -# Set API key (one time) -export NOXA_API_KEY=wc_your_key_here - -# Automatic fallback: tries local first, cloud on bot detection -noxa https://protected-site.com - -# Force cloud mode (skip local, always use API) -noxa --cloud https://spa-site.com - -# Cloud mode works with all features -noxa --cloud --brand https://stripe.com -noxa --cloud -f json https://producthunt.com -noxa --cloud --crawl --depth 2 https://protected-docs.com -``` - -## Browser Impersonation - -```bash -# Chrome (default) — latest Chrome TLS fingerprint -noxa https://example.com - -# Firefox fingerprint -noxa --browser firefox https://example.com - -# Random browser per request (good for batch) -noxa --browser random --urls-file urls.txt -``` - -## Custom Headers & Cookies - -```bash -# Custom headers -noxa -H "Authorization: Bearer token123" https://api.example.com -noxa -H "Accept-Language: de-DE" https://example.com - -# Cookies -noxa --cookie "session=abc123; theme=dark" https://example.com - -# Multiple headers -noxa -H "X-Custom: value" -H "Authorization: Bearer token" https://example.com -``` - -## LLM-Powered Features - -These require an LLM provider (Ollama local, or OpenAI/Anthropic API key). 
- -```bash -# Summarize a page (default: 3 sentences) -noxa --summarize https://example.com - -# Control summary length -noxa --summarize 5 https://example.com - -# Extract structured JSON with a schema -noxa --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product - -# Extract with a schema from file -noxa --extract-json @schema.json https://example.com/product - -# Extract with natural language prompt -noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing - -# Use a specific LLM provider -noxa --llm-provider ollama --summarize https://example.com -noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..." https://example.com -noxa --llm-provider anthropic --summarize https://example.com -``` - -## Raw HTML Output - -```bash -# Get the raw fetched HTML (no extraction) -noxa --raw-html https://example.com - -# Useful for debugging extraction issues -noxa --raw-html https://example.com > raw.html -noxa --file raw.html # Then extract locally -``` - -## Metadata & Verbose Mode - -```bash -# Include YAML frontmatter with metadata -noxa --metadata https://example.com -# Output: -# --- -# title: "Example Domain" -# source: "https://example.com" -# word_count: 20 -# --- -# # Example Domain -# ... 
- -# Verbose logging (debug extraction pipeline) -noxa -v https://example.com -``` - -## Proxy Usage - -```bash -# Single proxy -noxa --proxy http://user:pass@proxy.example.com:8080 https://example.com - -# SOCKS5 proxy -noxa --proxy socks5://proxy.example.com:1080 https://example.com - -# Proxy rotation from file (one per line: host:port:user:pass) -noxa --proxy-file proxies.txt https://example.com - -# Auto-load proxies.txt from current directory -echo "proxy1.com:8080:user:pass" > proxies.txt -noxa https://example.com # Automatically detects and uses proxies.txt -``` - -## MCP Server (AI Agent Integration) - -```bash -# Start the MCP server (stdio transport) -noxa-mcp - -# Configure in Claude Desktop (~/.config/claude/claude_desktop_config.json): -# { -# "mcpServers": { -# "noxa": { -# "command": "/path/to/noxa-mcp", -# "env": { -# "NOXA_API_KEY": "wc_your_key" // optional, enables cloud fallback -# } -# } -# } -# } - -# Available tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search -``` - -## Real-World Recipes - -### Monitor competitor pricing - -```bash -# Save today's pricing -noxa --extract-json '{"type":"array","items":{"type":"object","properties":{"plan":{"type":"string"},"price":{"type":"string"}}}}' \ - https://competitor.com/pricing -f json > pricing-$(date +%Y%m%d).json -``` - -### Build a documentation search index - -```bash -# Crawl docs and extract as LLM-optimized text -noxa --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt -``` - -### Extract all images from a page - -```bash -noxa https://example.com -f json | jq -r '.content.images[].src' -``` - -### Get all external links - -```bash -noxa https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href' -``` - -### Compare two pages - -```bash -noxa https://site-a.com -f json > a.json -noxa https://site-b.com --diff-with a.json -``` diff --git a/gemini-extension.json b/gemini-extension.json 
new file mode 100644 index 0000000..a696c3e --- /dev/null +++ b/gemini-extension.json @@ -0,0 +1,11 @@ +{ + "name": "noxa", + "version": "0.4.0", + "description": "noxa CLI, MCP server, and skills for AI-assisted web extraction", + "mcpServers": { + "noxa": { + "command": "${extensionPath}${/}bin${/}noxa-mcp", + "cwd": "${extensionPath}" + } + } +} diff --git a/setup.sh b/setup.sh index 5e7ccc8..4ea7244 100755 --- a/setup.sh +++ b/setup.sh @@ -3,6 +3,8 @@ # # Checks prerequisites, builds binaries, configures .env, # optionally installs Ollama, and wires up the MCP server. +# The generated .env is broader than env.example: it includes local +# deployment and Ollama settings used by the setup script and compose stack. # # Usage: # ./setup.sh # Interactive full setup @@ -214,6 +216,8 @@ configure_env() { fi # Write .env + # env.example covers the runtime noxa variables; this file adds local + # deployment and Ollama settings used by setup.sh and docker-compose.yml. cat > "$SCRIPT_DIR/.env" <- + This skill should be used when the user wants to scrape, extract, or fetch content from + a URL using the noxa CLI, crawl a website, get the text of a web page, monitor or watch + a page for changes, extract brand identity (colors, fonts, logos) from a site, + batch-process URLs, summarize a web page with an LLM, extract structured data from a + page, run deep research on a topic, or save crawl output to files. + Trigger on phrases like: "scrape", "extract from", "get content from", "crawl", "fetch + this page", "what does this site say", "get the text of", "monitor changes", "watch this + URL", "brand colors of", "sitemap of", "summarize this URL", "deep research". Use this + skill before running noxa — it covers the correct flag combinations for every workflow + and prevents common mistakes. +--- + +# Noxa — Web Content Extraction for AI + +noxa extracts clean, LLM-optimized content from any URL using Chrome-level TLS fingerprinting. +No browser required. 
Output is 67% fewer tokens than raw HTML. + +Binary: `noxa` (CLI) — assumed to be on PATH. Verify with `which noxa`. + +> **Complete flag reference:** See `references/flags.md` for every flag, its default, env var binding, and the full config.json schema. + +--- + +## Choosing the right mode + +To choose the right mode, identify what the user wants from this URL: + +| Goal | Mode | +|------|------| +| Read a page | Basic extraction | +| Read docs / whole site | Crawl | +| Find all URLs on a site | Map | +| Multiple URLs at once | Batch | +| Extract structured fields | LLM extraction | +| Summarize a page | Summarize | +| Deep research on a topic | Research (cloud) | +| Track changes once | Diff | +| Continuously watch for changes | Watch | +| Get brand colors/fonts/logos | Brand | +| Debug a 403 or bad output | Raw HTML | + +--- + +## Basic extraction + +```bash +# Default: clean markdown, great for reading +noxa https://example.com + +# Format options +noxa https://example.com -f llm # Token-optimized (best for feeding to Claude) +noxa https://example.com -f json # Full structured JSON with metadata +noxa https://example.com -f text # Plain text, no formatting +noxa https://example.com -f markdown # Markdown (same as default) +noxa https://example.com -f html # Raw extracted HTML + +# Skip nav/sidebar/footer noise +noxa https://example.com --only-main-content + +# Include/exclude specific elements via CSS selectors +noxa https://example.com --include "article,.content" +noxa https://example.com --exclude "nav,footer,.sidebar,.ads" + +# Include metadata as YAML frontmatter +noxa https://example.com --metadata + +# Request timeout (default: 30s) +noxa --timeout 60 https://slow-site.com +``` + +Use `-f llm` when passing content to Claude — it cuts token usage by ~67%. 
+ +--- + +## Crawling a site + +```bash +# Crawl with defaults (depth 1, up to 20 pages) +noxa --crawl https://docs.example.com + +# Control scope +noxa --crawl --depth 3 --max-pages 100 https://docs.example.com + +# Seed from sitemap first (finds more pages) +noxa --crawl --sitemap --depth 2 https://docs.example.com + +# Filter by path prefix (strict prefix match) +noxa --crawl --path-prefix /docs https://docs.example.com + +# Filter by glob patterns (more flexible than --path-prefix) +noxa --crawl --include-paths "/api/*,/guide/*" https://docs.example.com +noxa --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com + +# Control concurrency and delay (ms between requests) +noxa --crawl --concurrency 5 --delay 500 https://example.com + +# Save/resume crawl state (Ctrl+C saves progress; rerunning resumes) +noxa --crawl --crawl-state state.json --max-pages 500 https://docs.example.com + +# Save each page to a separate file instead of stdout +noxa --crawl --output-dir ./output https://docs.example.com +``` + +Good for: building search indexes, ingesting documentation, research. + +--- + +## Sitemap discovery + +```bash +# List all URLs from the site's sitemaps +noxa --map https://example.com + +# JSON with last_modified and priority +noxa --map https://example.com -f json +``` + +Use `--map` when you want to know what's on a site before crawling. + +--- + +## Batch processing + +```bash +# Multiple URLs in one command +noxa https://site-a.com https://site-b.com https://site-c.com + +# From a file (one URL per line, # comments OK) +# Also supports CSV format: url,custom-filename +noxa --urls-file urls.txt + +# Save each result to a separate file +noxa --urls-file urls.txt --output-dir ./pages + +# With concurrency and proxy rotation +noxa --urls-file urls.txt --concurrency 10 -f llm --proxy-file proxies.txt +``` + +--- + +## LLM-powered extraction + +These require an LLM provider. noxa tries Gemini CLI first, then Ollama, then OpenAI, then Anthropic. 
+ +Configure whichever provider you have available: +```bash +# Gemini CLI (primary — requires `gemini` binary on PATH) +# Model controlled by GEMINI_MODEL env var (default: gemini-2.5-pro) + +# Ollama (local, no key needed — default endpoint http://localhost:11434) +export OLLAMA_HOST=http://localhost:11434 # only needed if non-default + +# OpenAI +export OPENAI_API_KEY=sk-... + +# Anthropic +export ANTHROPIC_API_KEY=sk-ant-... + +# Override provider/model/URL via env vars +export NOXA_LLM_PROVIDER=openai # gemini | ollama | openai | anthropic +export NOXA_LLM_MODEL=gpt-4o +export NOXA_LLM_BASE_URL=http://localhost:11434 # for Ollama or OpenAI-compatible endpoints +``` + +```bash +# Summarize (default: 3 sentences) +noxa --summarize https://example.com +noxa --summarize 5 https://example.com # optional sentence count goes right after the flag, before the URL + +# Extract with natural language +noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing + +# Extract as structured JSON +noxa --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product + +# Schema from file +noxa --extract-json @schema.json https://example.com/product + +# Force a specific provider via flag +noxa --llm-provider ollama --summarize https://example.com +noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..."
https://example.com +noxa --llm-provider anthropic --summarize https://example.com + +# Override LLM base URL (for self-hosted OpenAI-compatible endpoints) +noxa --llm-base-url http://my-server:8080 --llm-provider openai --summarize https://example.com +``` + +--- + +## Change detection (diff) + +```bash +# Step 1: snapshot +noxa https://example.com -f json > snapshot.json + +# Step 2: compare later +noxa --diff-with snapshot.json https://example.com +# Output: Status: Same | Changed, word delta, unified diff +``` + +Good for: one-off comparisons, price monitoring, detecting updates. + +--- + +## Watch mode (continuous monitoring) + +Watch polls a URL on a schedule and reports diffs whenever the content changes. + +```bash +# Watch with default interval (300s / 5 minutes) +noxa --watch https://example.com + +# Custom interval +noxa --watch --watch-interval 60 https://example.com # check every 60s + +# Run a command when a change is detected (receives diff JSON on stdin) +noxa --watch --on-change "python notify.py" https://example.com + +# Post to a webhook on change (also works with --crawl and batch) +noxa --watch --webhook https://hooks.slack.com/... https://example.com +export NOXA_WEBHOOK_URL=https://hooks.discord.com/... # or via env var +``` + +Webhook auto-detects Discord and Slack URLs and wraps the payload accordingly. + +--- + +## Deep research (cloud) + +Runs multi-source research on a topic via the noxa.io cloud API. Saves a full report (findings + sources) to a JSON file. Requires an API key. + +```bash +export NOXA_API_KEY=wc_your_key + +# Standard research +noxa --research "best practices for Rust error handling" --api-key $NOXA_API_KEY + +# Deep mode (longer, more thorough report) +noxa --research "Rust async runtimes compared" --deep --api-key $NOXA_API_KEY +``` + +--- + +## Brand identity extraction + +```bash +noxa --brand https://stripe.com +# Returns: name, colors (hex + usage), fonts, logos, favicon +``` + +Output is JSON. 
Useful for design audits, competitive analysis, or building themed UIs. + +--- + +## PDF extraction + +```bash +# Auto-detected via Content-Type header +noxa https://example.com/report.pdf + +# Control behavior on scanned PDFs (no extractable text) +noxa --pdf-mode auto https://example.com/report.pdf # error on empty (default) +noxa --pdf-mode fast https://example.com/report.pdf # return whatever text exists +``` + +--- + +## Auth, headers, cookies, proxies + +```bash +# Custom headers +noxa -H "Authorization: Bearer token123" https://api.example.com +noxa -H "Accept-Language: fr-FR" -H "X-Custom: value" https://example.com + +# Cookie string (shorthand) +noxa --cookie "session=abc123; theme=dark" https://example.com + +# Cookie file (Chrome extension JSON export format) +noxa --cookie-file cookies.json https://example.com + +# Browser impersonation (default: Chrome) +noxa --browser firefox https://example.com +noxa --browser random https://example.com # random per request, good for batch + +# Single proxy +noxa --proxy http://user:pass@proxy.example.com:8080 https://example.com +noxa --proxy socks5://proxy.example.com:1080 https://example.com + +# Proxy pool rotation +noxa --proxy-file proxies.txt https://example.com # host:port:user:pass per line +``` + +--- + +## Bot-protected sites / JS rendering + +noxa.io is the optional hosted cloud rendering service — it handles Cloudflare, DataDome, and JS-rendered SPAs that local TLS fingerprinting can't bypass. Get an API key at [noxa.io](https://noxa.io). 
+ +```bash +# Pass key via env var or --api-key flag +export NOXA_API_KEY=wc_your_key +# or: noxa --api-key wc_your_key https://example.com + +# Auto: tries local TLS fingerprinting first, falls back to cloud on bot detection +noxa https://cloudflare-protected-site.com + +# Force cloud (for SPA / JS-heavy pages) +noxa --cloud https://spa-site.com +``` + +--- + +## Output to files + +```bash +# Save crawl output — one file per page, filenames derived from URL paths +noxa --crawl --output-dir ./docs https://docs.example.com + +# Save batch output +noxa --urls-file urls.txt --output-dir ./pages -f llm + +# Single URL to file +noxa --output-dir ./out https://example.com +``` + +--- + +## Config file + +noxa loads `./config.json` by default. Override with `--config` or `NOXA_CONFIG`: + +```bash +noxa --config ~/.noxa/config.json https://example.com +export NOXA_CONFIG=/etc/noxa/config.json +``` + +Config uses snake_case keys that match `config.example.json` and the Rust config struct. Useful for setting defaults like `llm_provider`, `browser`, `concurrency`, `timeout`. + +--- + +## Local files and stdin + +```bash +# Local HTML file +noxa --file page.html + +# Pipe HTML +curl -s https://example.com | noxa --stdin +``` + +--- + +## Debugging + +```bash +# Get the raw fetched HTML to see what noxa received +noxa --raw-html https://example.com + +# Verbose extraction pipeline logging +noxa -v https://example.com +``` + +If a site returns 403, try `--browser firefox` or `--browser random`. If still blocked, use `--cloud` with an API key. 
+ +--- + +## Environment variables reference + +| Variable | Flag equivalent | Description | +|----------|----------------|-------------| +| `NOXA_API_KEY` | `--api-key` | Cloud API key | +| `NOXA_PROXY` | `--proxy` | Single proxy URL | +| `NOXA_PROXY_FILE` | `--proxy-file` | Proxy pool file path | +| `NOXA_WEBHOOK_URL` | `--webhook` | Webhook URL for notifications | +| `NOXA_LLM_PROVIDER` | `--llm-provider` | LLM provider (gemini/ollama/openai/anthropic) | +| `NOXA_LLM_MODEL` | `--llm-model` | LLM model name override | +| `NOXA_LLM_BASE_URL` | `--llm-base-url` | LLM base URL (Ollama/OpenAI-compatible) | +| `NOXA_CONFIG` | `--config` | Path to config.json | +| `OPENAI_API_KEY` | — | OpenAI API key | +| `ANTHROPIC_API_KEY` | — | Anthropic API key | +| `OLLAMA_HOST` | — | Ollama endpoint (default: http://localhost:11434) | + +--- + +## Common recipes + +```bash +# Read docs site as a single LLM-optimized text file +noxa --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt + +# Save full crawl to individual files +noxa --crawl --sitemap --depth 2 --output-dir ./docs -f llm https://docs.example.com + +# Extract all external links from a page +noxa https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href' + +# Monitor competitor pricing — snapshot daily, then diff against yesterday's snapshot (GNU date) +noxa https://competitor.com/pricing -f json > pricing-$(date +%Y%m%d).json +noxa https://competitor.com/pricing --diff-with pricing-$(date -d yesterday +%Y%m%d).json + +# Watch a page and notify on Slack when it changes +noxa --watch --watch-interval 3600 --webhook https://hooks.slack.com/...
https://example.com + +# Resumable large crawl +noxa --crawl --crawl-state state.json --depth 4 --max-pages 2000 https://docs.example.com + +# Word count of a page +noxa https://example.com -f text | wc -w + +# Extract article title with jq +noxa https://example.com -f json | jq '.metadata.title' +``` diff --git a/skills/noxa/references/flags.md b/skills/noxa/references/flags.md new file mode 100644 index 0000000..ed6a08c --- /dev/null +++ b/skills/noxa/references/flags.md @@ -0,0 +1,246 @@ +# Noxa CLI — Complete Flag Reference + +All flags for the `noxa` binary. Sourced directly from `crates/noxa-cli/src/main.rs`. + +Priority order when the same setting appears in multiple places: +**CLI flag > config.json > environment variable > hard default** + +--- + +## Table of Contents + +- [Input](#input) +- [Output](#output) +- [Content Filtering](#content-filtering) +- [Request / Network](#request--network) +- [Auth & Identity](#auth--identity) +- [Crawl](#crawl) +- [LLM](#llm) +- [Change Detection](#change-detection) +- [Watch Mode](#watch-mode) +- [Brand Extraction](#brand-extraction) +- [PDF](#pdf) +- [Cloud API](#cloud-api) +- [Config File](#config-file) +- [Environment Variables](#environment-variables) +- [config.json Reference](#configjson-reference) + +--- + +## Input + +| Flag | Type | Description | +|------|------|-------------| +| `[URLS]...` | positional | One or more URLs to fetch. Bare domains are auto-prefixed with `https://`. | +| `--urls-file ` | string | File with URLs, one per line. `#` comments supported. CSV format `url,filename` sets a custom output filename. | +| `--file ` | string | Extract from a local HTML file instead of fetching. | +| `--stdin` | bool | Read HTML from stdin. | + +--- + +## Output + +| Flag | Short | Default | Description | +|------|-------|---------|-------------| +| `--format ` | `-f` | `markdown` | Output format: `markdown`, `json`, `text`, `llm`, `html`. Use `llm` when feeding to Claude — 67% fewer tokens than raw HTML. 
| +| `--metadata` | | false | Include YAML frontmatter with title, source URL, word count. Always included in JSON format. | +| `--raw-html` | | false | Output the raw fetched HTML with no extraction. Useful for debugging. CLI-only — not settable in config.json. | +| `--output-dir ` | | — | Save each page to a separate file instead of stdout. Works with `--crawl`, batch, and single-URL mode. Filenames derived from URL paths (e.g. `/docs/api` → `docs/api.md`). | +| `--verbose` / `-v` | `-v` | false | Enable verbose extraction pipeline logging to stderr. | + +--- + +## Content Filtering + +| Flag | Description | +|------|-------------| +| `--only-main-content` | Auto-detect and extract only the main content element (`
<article>`, `<main>
`). Strips nav, sidebar, footer. | +| `--include ` | Comma-separated CSS selectors to include (e.g. `"article,.content"`). In config.json: `include_selectors` array. | +| `--exclude ` | Comma-separated CSS selectors to exclude (e.g. `"nav,footer,.ads"`). In config.json: `exclude_selectors` array. | + +--- + +## Request / Network + +| Flag | Short | Env | Default | Description | +|------|-------|-----|---------|-------------| +| `--browser ` | `-b` | — | `chrome` | TLS fingerprint to impersonate: `chrome`, `firefox`, `random`. `random` picks a different profile per request. | +| `--timeout ` | `-t` | — | `30` | Request timeout in seconds. | +| `--proxy ` | `-p` | `NOXA_PROXY` | — | Single proxy URL. Formats: `http://user:pass@host:port`, `socks5://host:port`. Takes priority over `--proxy-file` if both are set. | +| `--proxy-file ` | | `NOXA_PROXY_FILE` | — | Proxy pool file — one proxy per line as `host:port:user:pass`. Rotates per request. | +| `--concurrency ` | | — | `5` | Max concurrent requests (also used for crawl). | +| `--delay ` | | — | `100` | Delay between requests in milliseconds. | + +--- + +## Auth & Identity + +| Flag | Description | +|------|-------------| +| `-H / --header ` | Custom request header, repeatable. Format: `"Name: value"`. | +| `--cookie ` | Cookie string, shorthand for `-H "Cookie: ..."`. | +| `--cookie-file ` | JSON cookie file in Chrome extension export format: `[{name, value, domain, path, secure, ...}]`. | + +--- + +## Crawl + +All crawl flags require `--crawl` to be active, except `--map` and `--sitemap` which are standalone. + +| Flag | Default | Description | +|------|---------|-------------| +| `--crawl` | false | Enable recursive BFS crawl of same-origin links. | +| `--depth ` | `1` | Max crawl depth from the start URL. | +| `--max-pages ` | `20` | Maximum number of pages to crawl. | +| `--concurrency ` | `5` | Max concurrent fetch workers during crawl. | +| `--delay ` | `100` | Delay between requests in milliseconds. 
| +| `--path-prefix ` | — | Only crawl URLs whose path starts with this prefix (strict string match). | +| `--include-paths ` | — | Comma-separated glob patterns for paths to include (e.g. `"/api/*,/guides/**"`). More flexible than `--path-prefix`. In config.json: `include_paths` array. | +| `--exclude-paths ` | — | Comma-separated glob patterns for paths to exclude (e.g. `"/changelog/*,/blog/*"`). In config.json: `exclude_paths` array. | +| `--sitemap` | false | Seed the crawl frontier from sitemap discovery (checks `robots.txt` and `/sitemap.xml`). Also usable standalone to enable sitemaps without crawling. In config.json: `use_sitemap`. | +| `--map` | false | Discover and print all URLs from the site's sitemaps without fetching content. One URL per line; JSON array with `-f json`. | +| `--crawl-state ` | — | Path to a JSON file for saving/resuming crawl state. On Ctrl+C: saves progress. On next run: resumes from where it left off. | + +--- + +## LLM + +Requires a configured LLM provider. noxa tries Gemini CLI → Ollama → OpenAI → Anthropic in order. + +| Flag | Env | Description | +|------|-----|-------------| +| `--summarize [N]` | — | Summarize extracted content. Optional sentence count (default: 3). Pass the count as the flag's value, directly after it: `--summarize 5`. | +| `--extract-prompt ` | — | Extract content using a natural language prompt. | +| `--extract-json ` | — | Extract structured JSON conforming to a JSON Schema string. Pass `@file.json` to load schema from a file. | +| `--llm-provider ` | `NOXA_LLM_PROVIDER` | Force a specific provider: `gemini`, `ollama`, `openai`, `anthropic`. | +| `--llm-model ` | `NOXA_LLM_MODEL` | Override the model name (e.g. `gpt-4o`, `gemini-2.5-pro`). | +| `--llm-base-url ` | `NOXA_LLM_BASE_URL` | Override the LLM base URL. Use for self-hosted Ollama or OpenAI-compatible endpoints. | + +Provider setup: +- **Gemini CLI**: requires `gemini` binary on PATH. Model via `GEMINI_MODEL` (default: `gemini-2.5-pro`).
+- **Ollama**: set `OLLAMA_HOST` if not on `http://localhost:11434`. +- **OpenAI**: set `OPENAI_API_KEY`. +- **Anthropic**: set `ANTHROPIC_API_KEY`. + +--- + +## Change Detection + +| Flag | Description | +|------|-------------| +| `--diff-with ` | Compare current extraction against a previously saved JSON snapshot. Reports status (Same/Changed), word delta, and a unified diff. Take a snapshot with `noxa -f json > snapshot.json`. | + +--- + +## Watch Mode + +| Flag | Default | Description | +|------|---------|-------------| +| `--watch` | false | Continuously poll a URL for changes and report diffs. | +| `--watch-interval ` | `300` | Poll interval in seconds. | +| `--on-change ` | — | Shell command to run when a change is detected. Receives the diff JSON on stdin. CLI-only — intentionally excluded from config.json to prevent shell injection via config file writes. | +| `--webhook ` | `NOXA_WEBHOOK_URL` | POST a JSON payload when changes are detected (watch), a crawl completes, or a batch finishes. Auto-detects Discord and Slack URLs and wraps the payload accordingly. | + +--- + +## Brand Extraction + +| Flag | Description | +|------|-------------| +| `--brand` | Extract brand identity: colors (hex + usage), fonts, logos, favicon. Output is JSON. | + +--- + +## PDF + +| Flag | Default | Description | +|------|---------|-------------| +| `--pdf-mode ` | `auto` | How to handle PDFs: `auto` errors on empty text (catches scanned/image PDFs), `fast` returns whatever text is found. PDFs are auto-detected via `Content-Type` header. | + +--- + +## Cloud API + +noxa.io is the optional hosted rendering service. Handles Cloudflare, DataDome, WAF, and JS-rendered SPAs. Get a key at [noxa.io](https://noxa.io). + +| Flag | Env | Description | +|------|-----|-------------| +| `--api-key ` | `NOXA_API_KEY` | Cloud API key. When set, enables automatic fallback to cloud on bot detection. 
| +| `--cloud` | — | Force all requests through the cloud API, skipping local extraction entirely. | +| `--research ` | — | Run deep multi-source research on a topic via the cloud API. Saves full result (report + sources + findings) to a JSON file. Requires `--api-key`. | +| `--deep` | — | Enable deep research mode (longer, more thorough report). Used with `--research`. | + +--- + +## Config File + +noxa loads `./config.json` by default. Override with `--config ` or `NOXA_CONFIG`. + +```bash +noxa --config ~/.noxa/config.json https://example.com +export NOXA_CONFIG=/etc/noxa/config.json +``` + +**Important caveats:** +- CLI flags always win over config.json values. +- `on_change` is intentionally excluded from config.json (security: prevents shell injection via config writes). +- Secrets and URLs (`api_key`, `proxy`, `webhook`, `llm_base_url`) belong in `.env`, not config.json. +- Bool flags set to `true` in config.json (`only_main_content`, `metadata`, `verbose`, `use_sitemap`) **cannot** be overridden to `false` from the CLI for a single run (clap has no `--no-flag` variant). Use `NOXA_CONFIG=/dev/null` to bypass the config entirely. 
+ +--- + +## Environment Variables + +| Variable | Flag equivalent | Description | +|----------|----------------|-------------| +| `NOXA_API_KEY` | `--api-key` | Cloud API key | +| `NOXA_PROXY` | `--proxy` | Single proxy URL | +| `NOXA_PROXY_FILE` | `--proxy-file` | Proxy pool file path | +| `NOXA_WEBHOOK_URL` | `--webhook` | Webhook URL for notifications | +| `NOXA_LLM_PROVIDER` | `--llm-provider` | LLM provider (`gemini`/`ollama`/`openai`/`anthropic`) | +| `NOXA_LLM_MODEL` | `--llm-model` | LLM model name override | +| `NOXA_LLM_BASE_URL` | `--llm-base-url` | LLM base URL for Ollama or OpenAI-compatible endpoints | +| `NOXA_CONFIG` | `--config` | Path to config.json | +| `OPENAI_API_KEY` | — | OpenAI API key | +| `ANTHROPIC_API_KEY` | — | Anthropic API key | +| `OLLAMA_HOST` | — | Ollama endpoint (default: `http://localhost:11434`) | +| `GEMINI_MODEL` | — | Gemini model override (default: `gemini-2.5-pro`) | + +--- + +## config.json Reference + +All fields are optional. Unknown fields are silently ignored. + +```json +{ + "format": "llm", + "metadata": true, + "verbose": false, + + "browser": "firefox", + "timeout": 60, + "pdf_mode": "fast", + "only_main_content": true, + + "include_selectors": ["article", ".content"], + "exclude_selectors": ["nav", "footer"], + + "depth": 3, + "max_pages": 100, + "concurrency": 10, + "delay": 200, + "path_prefix": "/docs/", + "include_paths": ["/docs/*", "/api/*"], + "exclude_paths": ["/changelog/*", "/blog/*"], + "use_sitemap": true, + + "llm_provider": "gemini", + "llm_model": "gemini-2.5-pro" +} +``` + +**Not configurable via config.json** (CLI-only or secrets): +- `on_change` — shell injection risk +- `api_key`, `proxy`, `webhook`, `llm_base_url` — secrets/URLs belong in `.env` +- `raw_html` — per-run mode, not a persistent default