webclaw/config.schema.json
Jacob Magar adf4b6ba55 feat(llm): add Gemini CLI provider as primary; set qwen3.5:9b as default Ollama model
- Add GeminiCliProvider: shells out to `gemini -p` with --output-format json,
  injection-safe prompt passing, MCP server suppression via temp workdir,
  6-slot concurrency semaphore, 60s subprocess deadline
- Add --llm-provider, --llm-model, --llm-base-url CLI flags for per-call overrides
- Provider chain: Gemini CLI → OpenAI → Ollama → Anthropic
- Move LLM timing to dispatch layer (LLM: Xs on stderr)
- Default Ollama model: qwen3:8b → qwen3.5:9b (benchmark shows better schema extraction)
- Add noxa mcp subcommand
- Add docs/reports/llm-benchmark-2026-04-11.md (Gemini vs qwen3.5:4b vs qwen3.5:9b)
- Bump version 0.3.11 → 0.4.0

Co-authored-by: Claude <claude@anthropic.com>
2026-04-12 00:52:53 -04:00

140 lines
3.6 KiB
JSON

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "./config.schema.json",
  "title": "Noxa config.json",
  "description": "Optional non-secret defaults for the noxa CLI. Unknown fields are ignored by the binary, and secrets/URLs belong in .env.",
  "type": "object",
  "additionalProperties": true,
  "properties": {
    "$schema": {
      "type": "string",
      "description": "Editor hint pointing at this schema."
    },
    "_doc": {
      "type": "array",
      "items": { "type": "string" },
      "description": "Human-readable notes. Ignored by noxa."
    },
    "format": {
      "type": "string",
      "enum": ["markdown", "json", "text", "llm", "html"],
      "default": "markdown",
      "description": "Default output format."
    },
    "browser": {
      "type": "string",
      "enum": ["chrome", "firefox", "random"],
      "default": "chrome",
      "description": "TLS/browser fingerprint profile."
    },
    "timeout": {
      "type": "integer",
      "minimum": 0,
      "default": 30,
      "description": "Request timeout in seconds."
    },
    "pdf_mode": {
      "type": "string",
      "enum": ["auto", "fast"],
      "default": "auto",
      "description": "How PDFs are handled."
    },
    "metadata": {
      "type": "boolean",
      "default": false,
      "description": "Include metadata in output."
    },
    "verbose": {
      "type": "boolean",
      "default": false,
      "description": "Enable verbose logging."
    },
    "output_dir": {
      "type": ["string", "null"],
      "default": null,
      "description": "Write outputs to files in this directory instead of stdout."
    },
    "only_main_content": {
      "type": "boolean",
      "default": false,
      "description": "Strip nav/sidebar/footer noise automatically."
    },
    "include_selectors": {
      "type": "array",
      "items": { "type": "string" },
      "default": [],
      "description": "CSS selectors to force-include."
    },
    "exclude_selectors": {
      "type": "array",
      "items": { "type": "string" },
      "default": [],
      "description": "CSS selectors to exclude."
    },
    "depth": {
      "type": "integer",
      "minimum": 0,
      "default": 1,
      "description": "Maximum crawl depth."
    },
    "max_pages": {
      "type": "integer",
      "minimum": 0,
      "default": 20,
      "description": "Maximum number of pages to crawl."
    },
    "concurrency": {
      "type": "integer",
      "minimum": 0,
      "default": 5,
      "description": "Maximum concurrent requests."
    },
    "delay": {
      "type": "integer",
      "minimum": 0,
      "default": 100,
      "description": "Delay between requests in milliseconds."
    },
    "path_prefix": {
      "type": ["string", "null"],
      "default": null,
      "description": "Only crawl paths with this prefix."
    },
    "include_paths": {
      "type": "array",
      "items": { "type": "string" },
      "default": [],
      "description": "Glob patterns for crawl paths to include."
    },
    "exclude_paths": {
      "type": "array",
      "items": { "type": "string" },
      "default": [],
      "description": "Glob patterns for crawl paths to exclude."
    },
    "use_sitemap": {
      "type": "boolean",
      "default": false,
      "description": "Seed crawl traversal from sitemap discovery."
    },
    "llm_provider": {
      "type": "string",
      "enum": ["gemini", "ollama", "openai", "anthropic"],
      "description": "Optional LLM provider name."
    },
    "llm_model": {
      "type": "string",
      "description": "Optional LLM model override."
    }
  }
}