diff --git a/.agents/plugins/marketplace.json b/.agents/plugins/marketplace.json
new file mode 100644
index 0000000..288f6a2
--- /dev/null
+++ b/.agents/plugins/marketplace.json
@@ -0,0 +1,20 @@
+{
+ "name": "noxa-marketplace",
+ "interface": {
+ "displayName": "noxa"
+ },
+ "plugins": [
+ {
+ "name": "noxa",
+ "source": {
+ "source": "local",
+ "path": "./"
+ },
+ "policy": {
+ "installation": "AVAILABLE",
+ "authentication": "ON_INSTALL"
+ },
+ "category": "Productivity"
+ }
+ ]
+}
diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
new file mode 100644
index 0000000..e8a74fb
--- /dev/null
+++ b/.claude-plugin/marketplace.json
@@ -0,0 +1,21 @@
+{
+ "name": "noxa-marketplace",
+ "owner": {
+ "name": "jmagar"
+ },
+ "metadata": {
+ "description": "Marketplace for the noxa plugin",
+ "version": "0.3.11",
+ "pluginRoot": "./"
+ },
+ "plugins": [
+ {
+ "name": "noxa",
+ "source": "./",
+ "description": "noxa CLI, MCP server, and skills for AI-assisted web extraction",
+ "version": "0.3.11",
+ "category": "Productivity",
+ "tags": ["web", "extraction", "mcp", "skills"]
+ }
+ ]
+}
diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
new file mode 100644
index 0000000..dfdeec4
--- /dev/null
+++ b/.claude-plugin/plugin.json
@@ -0,0 +1,14 @@
+{
+ "name": "noxa",
+ "version": "0.4.0",
+ "description": "noxa CLI, MCP server, and skills for AI-assisted web extraction",
+ "author": {
+ "name": "jmagar"
+ },
+ "homepage": "https://noxa.io",
+ "repository": "https://github.com/jmagar/noxa",
+ "license": "AGPL-3.0",
+ "keywords": ["web", "extraction", "mcp", "ai"],
+ "skills": "./skills/",
+ "mcpServers": "./.mcp.json"
+}
diff --git a/.codex-plugin/plugin.json b/.codex-plugin/plugin.json
new file mode 100644
index 0000000..629ae84
--- /dev/null
+++ b/.codex-plugin/plugin.json
@@ -0,0 +1,19 @@
+{
+ "name": "noxa",
+ "version": "0.4.0",
+ "description": "noxa CLI, MCP server, and skills for AI-assisted web extraction",
+ "author": {
+ "name": "jmagar"
+ },
+ "homepage": "https://noxa.io",
+ "repository": "https://github.com/jmagar/noxa",
+ "license": "AGPL-3.0",
+ "keywords": ["web", "extraction", "mcp", "ai"],
+ "skills": "./skills/",
+ "mcpServers": "./.mcp.json",
+ "interface": {
+ "displayName": "noxa",
+ "shortDescription": "AI-assisted web extraction",
+ "longDescription": "Bundle the noxa skill and MCP server for Codex workflows."
+ }
+}
diff --git a/.gitignore b/.gitignore
index 6293f80..2a48456 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,7 +13,9 @@ docs/superpowers
docs/reports
docs/sessions
benchmarks
-docs
+docs/*
+!docs/config.md
+.worktrees/
# Beads / Dolt files (added by bd init)
.dolt/
diff --git a/.mcp.json b/.mcp.json
new file mode 100644
index 0000000..e8d56c1
--- /dev/null
+++ b/.mcp.json
@@ -0,0 +1,7 @@
+{
+ "mcpServers": {
+ "noxa": {
+ "command": "/home/jmagar/workspace/noxa/target/debug/noxa-mcp"
+ }
+ }
+}
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..2080244
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,84 @@
+# Agent Instructions
+
+This project uses **bd** (beads) for issue tracking. Run `bd onboard` to get started.
+
+## Quick Reference
+
+```bash
+bd ready # Find available work
+bd show
+
` tags as HTML-escaped JSON.
@@ -5,7 +6,6 @@
/// Profile, etc. We parse these to reconstruct post + comments as markdown.
use serde_json::Value;
use tracing::debug;
-use noxa_core::{Content, ExtractionResult, Metadata};
/// Check if a URL is a LinkedIn post/activity.
pub fn is_linkedin_post(url: &str) -> bool {
diff --git a/crates/noxa-fetch/src/reddit.rs b/crates/noxa-fetch/src/reddit.rs
index be8622c..4d11c0f 100644
--- a/crates/noxa-fetch/src/reddit.rs
+++ b/crates/noxa-fetch/src/reddit.rs
@@ -1,3 +1,4 @@
+use noxa_core::{Content, ExtractionResult, Metadata};
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
///
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
@@ -5,7 +6,6 @@
/// comment tree as structured JSON, which we convert to clean markdown.
use serde::Deserialize;
use tracing::debug;
-use noxa_core::{Content, ExtractionResult, Metadata};
/// Check if a URL points to a Reddit post/comment page.
pub fn is_reddit_url(url: &str) -> bool {
diff --git a/crates/noxa-llm/src/chain.rs b/crates/noxa-llm/src/chain.rs
index 43f3de9..6e1561d 100644
--- a/crates/noxa-llm/src/chain.rs
+++ b/crates/noxa-llm/src/chain.rs
@@ -1,5 +1,5 @@
/// Provider chain — tries providers in order until one succeeds.
-/// Default order: Ollama (local, free) -> OpenAI -> Anthropic.
+/// Default order: Gemini CLI (primary) -> OpenAI -> Ollama -> Anthropic.
/// Only includes providers that are actually configured/available.
use async_trait::async_trait;
use tracing::{debug, info, warn};
@@ -7,9 +7,7 @@ use tracing::{debug, info, warn};
use crate::error::LlmError;
use crate::provider::{CompletionRequest, LlmProvider};
use crate::providers::{
- anthropic::AnthropicProvider,
- gemini_cli::GeminiCliProvider,
- ollama::OllamaProvider,
+ anthropic::AnthropicProvider, gemini_cli::GeminiCliProvider, ollama::OllamaProvider,
openai::OpenAiProvider,
};
@@ -94,7 +92,11 @@ impl LlmProvider for ProviderChain {
let t = std::time::Instant::now();
match provider.complete(request).await {
Ok(response) => {
- info!(provider = provider.name(), elapsed_ms = t.elapsed().as_millis(), "completion succeeded");
+ info!(
+ provider = provider.name(),
+ elapsed_ms = t.elapsed().as_millis(),
+ "completion succeeded"
+ );
return Ok(response);
}
Err(e) => {
diff --git a/crates/noxa-llm/src/extract.rs b/crates/noxa-llm/src/extract.rs
index 9216b0d..e637628 100644
--- a/crates/noxa-llm/src/extract.rs
+++ b/crates/noxa-llm/src/extract.rs
@@ -8,44 +8,47 @@ use crate::provider::{CompletionRequest, LlmProvider, Message};
/// Validate a JSON value against a schema. Returns Ok(()) on success or
/// Err(LlmError::InvalidJson) with a concise error message on failure.
-fn validate_schema(
- value: &serde_json::Value,
- schema: &serde_json::Value,
-) -> Result<(), LlmError> {
- let compiled = jsonschema::validator_for(schema).map_err(|e| {
- LlmError::InvalidJson(format!("invalid schema: {e}"))
- })?;
+fn validate_schema(value: &serde_json::Value, schema: &serde_json::Value) -> Result<(), LlmError> {
+ let compiled = jsonschema::validator_for(schema)
+ .map_err(|e| LlmError::InvalidJson(format!("invalid schema: {e}")))?;
- let errors: Vec = compiled
- .iter_errors(value)
- .map(|e| format!("{} at {}", e, e.instance_path()))
- .collect();
+ let first_error = compiled.iter_errors(value).next();
- if errors.is_empty() {
- Ok(())
- } else {
- Err(LlmError::InvalidJson(format!(
- "schema validation failed: {}",
- errors.join("; ")
- )))
+ match first_error {
+ None => Ok(()),
+ Some(e) => {
+ let msg = format!("{} at {}", e, e.instance_path());
+ Err(LlmError::InvalidJson(format!(
+ "schema validation failed: {msg}"
+ )))
+ }
}
}
+/// Compile a schema up front so invalid schemas fail before any provider call.
+///
+/// Builds a validator with `jsonschema::validator_for` purely as a well-formedness
+/// check; the compiled validator itself is discarded.
+///
+/// # Errors
+/// Returns `LlmError::InvalidJson` carrying an `invalid schema: …` message when the
+/// schema cannot be compiled.
+fn validate_schema_definition(schema: &serde_json::Value) -> Result<(), LlmError> {
+ jsonschema::validator_for(schema)
+ // Only success/failure matters here, so drop the validator.
+ .map(|_| ())
+ .map_err(|e| LlmError::InvalidJson(format!("invalid schema: {e}")))
+}
+
/// Extract structured JSON from content using a JSON schema.
/// The schema tells the LLM exactly what fields to extract and their types.
///
/// Retry policy:
-/// - If the response cannot be parsed as JSON at all: retry once with the
-/// identical request (handles transient formatting issues).
-/// - If the response is valid JSON but fails schema validation: return
-/// `LlmError::InvalidJson` immediately — the schema is likely unsatisfiable
-/// for this content, so retrying would produce the same result.
+/// - If the response cannot be parsed as JSON: retry once with a correction prompt.
+/// - If the response is valid JSON but fails schema validation: retry once with
+/// a tighter correction prompt that includes the specific validation error.
+/// - In both cases the retry appends the failed response as an 'assistant' message,
+///   followed by the correction instructions as a 'user' message.
pub async fn extract_json(
content: &str,
schema: &serde_json::Value,
provider: &dyn LlmProvider,
model: Option<&str>,
 ) -> Result<serde_json::Value, LlmError> {
+ validate_schema_definition(schema)?;
+
let system = format!(
"You are a JSON extraction engine. Extract data from the content according to this schema.\n\
Return ONLY valid JSON matching the schema. No explanations, no markdown, no commentary.\n\n\
@@ -53,18 +56,20 @@ pub async fn extract_json(
serde_json::to_string_pretty(schema).unwrap_or_else(|_| schema.to_string())
);
- let request = CompletionRequest {
+ let mut messages = vec![
+ Message {
+ role: "system".into(),
+ content: system,
+ },
+ Message {
+ role: "user".into(),
+ content: content.to_string(),
+ },
+ ];
+
+ let mut request = CompletionRequest {
model: model.unwrap_or_default().to_string(),
- messages: vec![
- Message {
- role: "system".into(),
- content: system,
- },
- Message {
- role: "user".into(),
- content: content.to_string(),
- },
- ],
+ messages: messages.clone(),
temperature: Some(0.0),
max_tokens: None,
json_mode: true,
@@ -72,23 +77,54 @@ pub async fn extract_json(
let response = provider.complete(&request).await?;
- match parse_json_response(&response) {
- Ok(value) => {
- // Valid JSON — now validate against the schema.
- // Schema mismatches do not retry (unsatisfiable → same result).
- validate_schema(&value, schema)?;
- Ok(value)
- }
- Err(_parse_err) => {
- // Unparseable JSON — retry once with the identical request.
+ match parse_and_validate(&response, schema) {
+ Ok(value) => Ok(value),
+ Err(e) => {
+ // First attempt failed — retry once with a correction prompt.
+ // Construct a concise correction prompt based on the error type.
+ let correction_prompt = match &e {
+ LlmError::InvalidJson(msg) if msg.contains("schema validation failed") => {
+ let error_msg = msg.replace("schema validation failed: ", "");
+ format!("Correction required: {}. Return ONLY the corrected JSON.", error_msg)
+ }
+ _ => {
+ "Your response was not valid JSON. Please return ONLY valid JSON matching the schema.".to_string()
+ }
+ };
+
+            // Limit correction context to prevent token blowup on large hallucinated outputs.
+            // The cap is measured in bytes, so back the cut off to the nearest UTF-8
+            // character boundary: slicing a `String` at an offset that lands inside a
+            // multi-byte character panics.
+            let capped_response = if response.len() > 2000 {
+                let mut cut = 2000;
+                // Offset 0 is always a boundary, so this loop terminates.
+                while !response.is_char_boundary(cut) {
+                    cut -= 1;
+                }
+                format!("{}... [truncated]", &response[..cut])
+            } else {
+                response.clone()
+            };
+
+ messages.push(Message {
+ role: "assistant".into(),
+ content: capped_response,
+ });
+ messages.push(Message {
+ role: "user".into(),
+ content: correction_prompt,
+ });
+
+ request.messages = messages;
let retry_response = provider.complete(&request).await?;
- let value = parse_json_response(&retry_response)?;
- validate_schema(&value, schema)?;
- Ok(value)
+ parse_and_validate(&retry_response, schema)
}
}
}
+/// Helper: parse a raw provider response as JSON and validate it against `schema`.
+///
+/// Returns the parsed value only when both steps succeed.
+///
+/// # Errors
+/// Propagates the error from `parse_json_response` when the text is not JSON, and
+/// the `LlmError::InvalidJson` produced by `validate_schema` when the parsed value
+/// does not satisfy the schema.
+fn parse_and_validate(
+    response: &str,
+    schema: &serde_json::Value,
+) -> Result<serde_json::Value, LlmError> {
+    let value = parse_json_response(response)?;
+    validate_schema(&value, schema)?;
+    Ok(value)
+}
+
/// Extract information using a natural language prompt.
/// More flexible than schema extraction — the user describes what they want.
pub async fn extract_with_prompt(
@@ -301,9 +337,7 @@ mod tests {
],
);
- let result = extract_json("content", &schema, &mock, None)
- .await
- .unwrap();
+ let result = extract_json("content", &schema, &mock, None).await.unwrap();
assert_eq!(result["title"], "Retry succeeded");
}
@@ -318,10 +352,7 @@ mod tests {
let mock = SequenceMockProvider::new(
"mock-seq",
- vec![
- Ok("not json".to_string()),
- Ok("also not json".to_string()),
- ],
+ vec![Ok("not json".to_string()), Ok("also not json".to_string())],
);
let result = extract_json("content", &schema, &mock, None).await;
@@ -332,7 +363,7 @@ mod tests {
}
#[tokio::test]
- async fn schema_mismatch_does_not_retry() {
+ async fn schema_mismatch_triggers_retry() {
use crate::testing::mock::SequenceMockProvider;
let schema = serde_json::json!({
@@ -343,20 +374,17 @@ mod tests {
}
});
- // Both calls return valid JSON with wrong schema — but only one call should happen.
+ // First call: valid JSON but schema mismatch (price is string).
+ // Second call: valid JSON matching schema.
let mock = SequenceMockProvider::new(
"mock-seq",
vec![
Ok(r#"{"price": "wrong-type"}"#.to_string()),
- Ok(r#"{"price": 9.99}"#.to_string()), // would succeed — but shouldn't be called
+ Ok(r#"{"price": 9.99}"#.to_string()),
],
);
- // Should return InvalidJson without calling second response.
- let result = extract_json("content", &schema, &mock, None).await;
- assert!(
- matches!(result, Err(LlmError::InvalidJson(_))),
- "schema mismatch should not trigger retry"
- );
+ let result = extract_json("content", &schema, &mock, None).await.unwrap();
+ assert_eq!(result["price"], 9.99);
}
}
diff --git a/crates/noxa-llm/src/lib.rs b/crates/noxa-llm/src/lib.rs
index 250ae88..129b148 100644
--- a/crates/noxa-llm/src/lib.rs
+++ b/crates/noxa-llm/src/lib.rs
@@ -2,7 +2,7 @@
///
/// Provider chain: Gemini CLI (primary) → OpenAI → Ollama → Anthropic.
/// Gemini CLI requires the `gemini` binary on PATH; GEMINI_MODEL env var sets the model.
-/// Provides schema-validated extraction (with one retry on parse failure),
+/// Provides schema-validated extraction (with one retry on parse or schema mismatch),
/// prompt extraction, and summarization on top of noxa-core's content pipeline.
pub mod chain;
pub mod clean;
diff --git a/crates/noxa-llm/src/providers/gemini_cli.rs b/crates/noxa-llm/src/providers/gemini_cli.rs
index 9d2d2d7..54f137e 100644
--- a/crates/noxa-llm/src/providers/gemini_cli.rs
+++ b/crates/noxa-llm/src/providers/gemini_cli.rs
@@ -12,11 +12,11 @@
///
/// Two flags reduce this:
/// - `--extensions ""` — skips extension loading (~3 s saved)
-/// - `current_dir` set to a temp workdir containing `.gemini/settings.json` with
-/// `{"mcpServers":{}}` — workspace settings override user settings, so all 6 MCP
+/// - `current_dir` set to a best-effort temp workdir containing `.gemini/settings.json`
+/// with `{"mcpServers":{}}` — workspace settings override user settings, so all 6 MCP
/// servers are skipped at subprocess startup (major speedup).
///
-/// The workdir is created once at construction and reused for every call.
+/// The workdir is created once at construction and reused for every call when available.
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
@@ -36,10 +36,6 @@ const MAX_CONCURRENT: usize = 6;
/// Subprocess deadline — prevents hung `gemini` processes blocking the chain.
const SUBPROCESS_TIMEOUT: Duration = Duration::from_secs(60);
-/// Fixed workdir used for every subprocess call.
-/// A workspace-level `.gemini/settings.json` here overrides the user's MCP server config.
-const NOXA_GEMINI_WORKDIR: &str = "/tmp/noxa-gemini";
-
pub struct GeminiCliProvider {
default_model: String,
semaphore: Arc,
@@ -56,7 +52,7 @@ impl GeminiCliProvider {
.filter(|s| !s.is_empty())
.unwrap_or_else(|| "gemini-2.5-pro".into());
- let workdir = PathBuf::from(NOXA_GEMINI_WORKDIR);
+ let workdir = std::env::temp_dir().join("noxa-gemini");
ensure_gemini_workdir(&workdir);
Self {
@@ -106,11 +102,14 @@ impl LlmProvider for GeminiCliProvider {
// Workspace settings in self.workdir override the user's ~/.gemini/settings.json,
// replacing the user's MCP server list with {} so none are spawned at startup.
// Without this, each of the user's MCP servers adds latency to every call.
- cmd.current_dir(&self.workdir);
+ if self.workdir.is_dir() {
+ cmd.current_dir(&self.workdir);
+ }
cmd.stdin(std::process::Stdio::null());
cmd.stdout(std::process::Stdio::piped());
cmd.stderr(std::process::Stdio::piped());
+ cmd.kill_on_drop(true);
debug!(model, workdir = %self.workdir.display(), "spawning gemini subprocess");
@@ -169,7 +168,9 @@ fn extract_response_from_output(stdout: &str) -> Result {
let json_str = &stdout[json_start..];
let outer: serde_json::Value = serde_json::from_str(json_str).map_err(|e| {
let preview = &json_str[..json_str.len().min(300)];
- LlmError::ProviderError(format!("failed to parse gemini JSON output: {e} — {preview}"))
+ LlmError::ProviderError(format!(
+ "failed to parse gemini JSON output: {e} — {preview}"
+ ))
})?;
// `response` holds the model's actual text output.
@@ -320,10 +321,7 @@ mod tests {
fn extracts_response_skipping_mcp_noise() {
// MCP warning line appears before the JSON object in real gemini output.
let stdout = "MCP issues detected. Run /mcp list for status.\n{\"session_id\":\"abc\",\"response\":\"the answer\",\"stats\":{}}";
- assert_eq!(
- extract_response_from_output(stdout).unwrap(),
- "the answer"
- );
+ assert_eq!(extract_response_from_output(stdout).unwrap(), "the answer");
}
#[test]
diff --git a/crates/noxa-llm/src/providers/mod.rs b/crates/noxa-llm/src/providers/mod.rs
index b1a8736..53dc760 100644
--- a/crates/noxa-llm/src/providers/mod.rs
+++ b/crates/noxa-llm/src/providers/mod.rs
@@ -29,9 +29,6 @@ mod tests {
#[test]
fn none_override_with_no_env_returns_none() {
- assert_eq!(
- load_api_key(None, "NOXA_TEST_NONEXISTENT_KEY_12345"),
- None
- );
+ assert_eq!(load_api_key(None, "NOXA_TEST_NONEXISTENT_KEY_12345"), None);
}
}
diff --git a/crates/noxa-llm/src/providers/ollama.rs b/crates/noxa-llm/src/providers/ollama.rs
index d728e67..dbdbecb 100644
--- a/crates/noxa-llm/src/providers/ollama.rs
+++ b/crates/noxa-llm/src/providers/ollama.rs
@@ -8,6 +8,8 @@ use crate::clean::strip_thinking_tags;
use crate::error::LlmError;
use crate::provider::{CompletionRequest, LlmProvider};
+const DEFAULT_HEALTH_TIMEOUT_MS: u64 = 2_000;
+
pub struct OllamaProvider {
client: reqwest::Client,
base_url: String,
@@ -22,7 +24,7 @@ impl OllamaProvider {
let default_model = model
.or_else(|| std::env::var("OLLAMA_MODEL").ok())
- .unwrap_or_else(|| "qwen3:8b".into());
+ .unwrap_or_else(|| "qwen3.5:9b".into());
Self {
client: reqwest::Client::new(),
@@ -98,7 +100,7 @@ impl LlmProvider for OllamaProvider {
async fn is_available(&self) -> bool {
let url = format!("{}/api/tags", self.base_url);
matches!(
- tokio::time::timeout(Duration::from_millis(500), self.client.get(&url).send()).await,
+ tokio::time::timeout(health_timeout(), self.client.get(&url).send()).await,
Ok(Ok(r)) if r.status().is_success()
)
}
@@ -108,6 +110,18 @@ impl LlmProvider for OllamaProvider {
}
}
+/// Resolve the timeout for the Ollama availability probe.
+///
+/// Reads the `OLLAMA_HEALTH_TIMEOUT_MS` environment variable and hands the raw
+/// value (or `None` when it cannot be read) to `health_timeout_from_env`.
+fn health_timeout() -> Duration {
+ health_timeout_from_env(std::env::var("OLLAMA_HEALTH_TIMEOUT_MS").ok())
+}
+
+/// Turn the raw `OLLAMA_HEALTH_TIMEOUT_MS` value into a timeout.
+///
+/// Takes the value as an argument instead of reading the environment itself, so
+/// it can be exercised without touching process-global state.
+///
+/// Returns the parsed number of milliseconds when `value` is a positive integer;
+/// otherwise (unset, non-numeric, or zero) falls back to
+/// `DEFAULT_HEALTH_TIMEOUT_MS`.
+fn health_timeout_from_env(value: Option<String>) -> Duration {
+    value
+        .and_then(|v| v.parse::<u64>().ok())
+        // Zero is treated as invalid and falls through to the default.
+        .filter(|ms| *ms > 0)
+        .map(Duration::from_millis)
+        .unwrap_or_else(|| Duration::from_millis(DEFAULT_HEALTH_TIMEOUT_MS))
+}
+
#[cfg(test)]
mod tests {
use super::*;
@@ -142,6 +156,27 @@ mod tests {
assert_eq!(provider.default_model(), "phi3:mini");
}
+ // With no value supplied, the 2000 ms default applies.
+ #[test]
+ fn health_timeout_from_env_defaults_when_unset() {
+ assert_eq!(health_timeout_from_env(None), Duration::from_millis(2000));
+ }
+
+ // A positive integer string is used verbatim as the timeout in milliseconds.
+ #[test]
+ fn health_timeout_from_env_parses_override() {
+ assert_eq!(
+ health_timeout_from_env(Some("1500".into())),
+ Duration::from_millis(1500)
+ );
+ }
+
+ // Non-numeric input is ignored and the 2000 ms default is used instead.
+ #[test]
+ fn health_timeout_from_env_ignores_invalid_values() {
+ assert_eq!(
+ health_timeout_from_env(Some("not-a-number".into())),
+ Duration::from_millis(2000)
+ );
+ }
+
// Env var fallback is a trivial `env::var().ok()` -- not worth the flakiness
// of manipulating process-global state. Run in isolation if needed:
// cargo test -p noxa-llm env_var_fallback -- --ignored --test-threads=1
diff --git a/crates/noxa-llm/src/testing.rs b/crates/noxa-llm/src/testing.rs
index da5cc0b..98a0693 100644
--- a/crates/noxa-llm/src/testing.rs
+++ b/crates/noxa-llm/src/testing.rs
@@ -4,8 +4,8 @@
/// extract, chain, and other modules that need a fake LLM backend.
#[cfg(test)]
pub(crate) mod mock {
- use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
+ use std::sync::atomic::{AtomicUsize, Ordering};
use async_trait::async_trait;
@@ -50,7 +50,7 @@ pub(crate) mod mock {
}
/// A mock provider that returns responses from a sequence.
- /// Call N → returns responses[N], wrapping at the end.
+ /// Call N → returns responses[N], clamping to the final response.
/// Useful for testing first-failure / second-success retry paths.
pub struct SequenceMockProvider {
pub name: &'static str,
@@ -60,10 +60,11 @@ pub(crate) mod mock {
}
impl SequenceMockProvider {
- pub fn new(
- name: &'static str,
- responses: Vec>,
- ) -> Self {
+ pub fn new(name: &'static str, responses: Vec>) -> Self {
+ assert!(
+ !responses.is_empty(),
+ "SequenceMockProvider requires at least one response"
+ );
Self {
name,
responses,
diff --git a/crates/noxa-mcp/src/cloud.rs b/crates/noxa-mcp/src/cloud.rs
index ee4d259..315ef59 100644
--- a/crates/noxa-mcp/src/cloud.rs
+++ b/crates/noxa-mcp/src/cloud.rs
@@ -7,7 +7,6 @@ use std::time::Duration;
use serde_json::{Value, json};
use tracing::info;
-
const API_BASE: &str = "https://api.noxa.io/v1";
/// Lightweight client for the noxa cloud API.
diff --git a/crates/noxa-mcp/src/server.rs b/crates/noxa-mcp/src/server.rs
index 4b7bb44..db926e7 100644
--- a/crates/noxa-mcp/src/server.rs
+++ b/crates/noxa-mcp/src/server.rs
@@ -18,6 +18,8 @@ use url::Url;
use crate::cloud::{self, CloudClient, SmartFetchResult};
use crate::tools::*;
+const NO_LLM_PROVIDERS_MESSAGE: &str = "No LLM providers available (priority: Gemini CLI -> OpenAI -> Ollama -> Anthropic). Install gemini on PATH, set OPENAI_API_KEY, OLLAMA_HOST / OLLAMA_MODEL, or ANTHROPIC_API_KEY, or set NOXA_API_KEY for cloud fallback.";
+
pub struct NoxaMcp {
tool_router: ToolRouter,
fetch_client: Arc,
@@ -89,7 +91,7 @@ impl NoxaMcp {
let chain = noxa_llm::ProviderChain::default().await;
let llm_chain = if chain.is_empty() {
- warn!("no LLM providers available (gemini CLI, OPENAI_API_KEY, ANTHROPIC_API_KEY) -- extract/summarize tools will fail");
+ warn!("{NO_LLM_PROVIDERS_MESSAGE} -- extract/summarize tools will fail");
None
} else {
info!(providers = chain.len(), "LLM provider chain ready");
@@ -333,9 +335,7 @@ impl NoxaMcp {
// No local LLM — fall back to cloud API directly
if self.llm_chain.is_none() {
- let cloud = self.cloud.as_ref().ok_or(
- "No LLM providers available. Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
- )?;
+ let cloud = self.cloud.as_ref().ok_or(NO_LLM_PROVIDERS_MESSAGE)?;
let mut body = json!({"url": params.url});
if let Some(ref schema) = params.schema {
body["schema"] = json!(schema);
@@ -386,9 +386,7 @@ impl NoxaMcp {
// No local LLM — fall back to cloud API directly
if self.llm_chain.is_none() {
- let cloud = self.cloud.as_ref().ok_or(
- "No LLM providers available. Install the gemini CLI, set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
- )?;
+ let cloud = self.cloud.as_ref().ok_or(NO_LLM_PROVIDERS_MESSAGE)?;
let mut body = json!({"url": params.url});
if let Some(sentences) = params.max_sentences {
body["max_sentences"] = json!(sentences);
@@ -425,9 +423,8 @@ impl NoxaMcp {
#[tool]
async fn diff(&self, Parameters(params): Parameters) -> Result {
 validate_url(&params.url)?;
- let previous: noxa_core::ExtractionResult =
- serde_json::from_str(&params.previous_snapshot)
- .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
+ let previous: noxa_core::ExtractionResult = serde_json::from_str(&params.previous_snapshot)
+ .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
let result = cloud::smart_fetch(
&self.fetch_client,
@@ -515,8 +512,7 @@ impl NoxaMcp {
}
}
- let identity =
- noxa_core::brand::extract_brand(&fetch_result.html, Some(&fetch_result.url));
+ let identity = noxa_core::brand::extract_brand(&fetch_result.html, Some(&fetch_result.url));
Ok(serde_json::to_string_pretty(&identity).unwrap_or_default())
}
diff --git a/docs/config.md b/docs/config.md
new file mode 100644
index 0000000..0a89454
--- /dev/null
+++ b/docs/config.md
@@ -0,0 +1,273 @@
+# Config and Environment
+
+This document explains how `noxa` loads configuration, how it merges `config.json` with environment variables and CLI flags, and which settings belong in each place.
+
+## Quick Summary
+
+- `config.json` is for non-secret defaults.
+- `.env` is for secrets and URLs.
+- CLI flags always win over config and environment variables.
+- Unknown keys in `config.json` are ignored.
+- `config.json` uses `snake_case` keys.
+
+## Load Order
+
+`noxa` resolves settings in this order:
+
+1. CLI flags
+2. `config.json`
+3. Environment variables
+4. Built-in defaults
+
+That means you can set a default in `config.json`, override it for a single run with a CLI flag, and keep secrets in `.env` without checking them into source control.
+
+## Where `config.json` Comes From
+
+By default, `noxa` loads `./config.json` from the current working directory.
+
+You can override that in two ways:
+
+- `--config <path>` on the CLI
+- `NOXA_CONFIG=<path>` in the environment
+
+If the file does not exist:
+
+- an explicit `--config` path or `NOXA_CONFIG` path is an error
+- the default `./config.json` is optional and missing files are ignored
+
+To bypass config entirely for one run:
+
+```bash
+NOXA_CONFIG=/dev/null noxa https://example.com
+```
+
+## What Belongs Where
+
+### `config.json`
+
+Use `config.json` for stable, non-secret defaults such as:
+
+- output format
+- output directory
+- browser fingerprint
+- timeout
+- crawl depth and page limits
+- selector filters
+- LLM provider and model
+
+### `.env`
+
+Use `.env` for secrets, URLs, and a small number of runtime overrides:
+
+- `NOXA_API_KEY`
+- `NOXA_PROXY`
+- `NOXA_PROXY_FILE`
+- `NOXA_WEBHOOK_URL`
+- `NOXA_LLM_BASE_URL`
+
+Those values are intentionally excluded from `config.json`.
+
+If you run `setup.sh` or the Docker Compose stack, the generated `.env` may also include local deployment settings such as `NOXA_PORT`, `NOXA_HOST`, `NOXA_AUTH_KEY`, `NOXA_LOG`, `OLLAMA_HOST`, and `OLLAMA_MODEL`.
+
+### CLI-only
+
+These options stay on the command line and do not belong in `config.json`:
+
+- `--on-change`
+- `--raw-html`
+
+`--on-change` is CLI-only because it executes shell commands. `--raw-html` is a per-run mode, not a persistent default.
+
+## Config File Rules
+
+- Keys are `snake_case`.
+- All fields are optional.
+- Unknown fields are ignored.
+- Arrays are used for selector and path lists.
+- Boolean flags have one important limitation: if you set them to `true` in `config.json`, you cannot disable them for a single CLI run with a `--no-...` flag because `noxa` does not define one.
+
+The boolean fields with this limitation are:
+
+- `metadata`
+- `verbose`
+- `only_main_content`
+- `use_sitemap`
+
+If you need to turn one of those off temporarily, bypass the config file with `NOXA_CONFIG=/dev/null`.
+
+## Supported `config.json` Keys
+
+### Output
+
+| Key | Type | Default | Notes |
+|---|---|---:|---|
+| `format` | string | `markdown` | One of `markdown`, `json`, `text`, `llm`, `html` |
+| `metadata` | boolean | `false` | Include metadata in output |
+| `verbose` | boolean | `false` | Enable verbose logging |
+| `output_dir` | string or null | `null` | Write outputs to files in this directory instead of stdout |
+
+When `output_dir` is set, the following modes write their results to files instead of printing them to stdout:
+
+- single URL extraction
+- multi-URL batch extraction
+- crawl
+- LLM extraction and summarization
+- sitemap discovery
+- diff output
+- brand extraction
+- research reports
+- watch changes
+
+File names are derived from the URL or mode name, and the directory is created on demand.
+
+### Output Directory Layout
+
+For URL-based output, noxa mirrors the URL path under `output_dir`:
+
+| URL | Written file |
+|---|---|
+| `https://example.com/` | `output_dir/example_com/index.md` |
+| `https://example.com/docs/api` | `output_dir/docs/api.md` |
+| `https://example.com/docs/api/` | `output_dir/docs/api.md` |
+| `https://example.com/blog/post?id=123` | `output_dir/blog/post_id_123.md` |
+
+The extension comes from the selected output format:
+
+| Format | Extension |
+|---|---|
+| `markdown` | `.md` |
+| `llm` | `.md` |
+| `json` | `.json` |
+| `text` | `.txt` |
+| `html` | `.html` |
+
+For `--urls-file`, a CSV entry of `url,filename` uses the custom filename instead of the URL-derived name.
+
+Examples:
+
+```txt
+https://example.com/docs/api,api.md
+https://example.com/blog/post
+```
+
+Becomes:
+
+```txt
+output_dir/api.md
+output_dir/blog/post.md
+```
+
+Mode-specific outputs use fixed filenames in the root of `output_dir`:
+
+| Mode | File |
+|---|---|
+| `--map` | `sitemap.json` or `sitemap.txt` |
+| `--diff-with` | `diff.json` or `diff.txt` |
+| `--brand` | `brand.json` |
+| `--research` | `research-.json` |
+| `--watch` | `watch-.json` |
+
+The directory tree is created automatically, so nested paths do not need to exist ahead of time.
+
+### Fetch
+
+| Key | Type | Default | Notes |
+|---|---|---:|---|
+| `browser` | string | `chrome` | One of `chrome`, `firefox`, `random` |
+| `timeout` | integer | `30` | Request timeout in seconds |
+| `pdf_mode` | string | `auto` | One of `auto`, `fast` |
+| `only_main_content` | boolean | `false` | Auto-detect the main content area |
+
+### Content Filtering
+
+| Key | Type | Default | Notes |
+|---|---|---:|---|
+| `include_selectors` | array of strings | `[]` | CSS selectors to include |
+| `exclude_selectors` | array of strings | `[]` | CSS selectors to exclude |
+
+### Crawl
+
+| Key | Type | Default | Notes |
+|---|---|---:|---|
+| `depth` | integer | `1` | Crawl depth |
+| `max_pages` | integer | `20` | Maximum pages to crawl |
+| `concurrency` | integer | `5` | Concurrent requests |
+| `delay` | integer | `100` | Delay between requests in ms |
+| `path_prefix` | string or null | `null` | Only crawl URLs whose path starts with this prefix |
+| `include_paths` | array of strings | `[]` | Glob patterns to include |
+| `exclude_paths` | array of strings | `[]` | Glob patterns to exclude |
+| `use_sitemap` | boolean | `false` | Seed the crawl from sitemap discovery |
+
+### LLM
+
+| Key | Type | Default | Notes |
+|---|---|---:|---|
+| `llm_provider` | string | unset | Optional provider name: `gemini`, `ollama`, `openai`, `anthropic` |
+| `llm_model` | string | unset | Optional model override |
+
+## Environment Variables
+
+| Variable | Purpose | Notes |
+|---|---|---|
+| `NOXA_API_KEY` | Cloud API key | Used for cloud fallback and cloud-only features |
+| `NOXA_PROXY` | Single proxy URL | Takes priority over proxy file when set |
+| `NOXA_PROXY_FILE` | Proxy pool file path | One proxy per line |
+| `NOXA_WEBHOOK_URL` | Notification webhook | Used by watch/crawl/batch notifications |
+| `NOXA_LLM_BASE_URL` | LLM endpoint URL | For Ollama or OpenAI-compatible endpoints |
+| `NOXA_LLM_PROVIDER` | Default LLM provider | Used when neither a CLI flag nor `config.json` sets the provider |
+| `NOXA_LLM_MODEL` | Default LLM model | Used when neither a CLI flag nor `config.json` sets the model |
+| `NOXA_CONFIG` | Config file path | Override `./config.json` or bypass with `/dev/null` |
+
+The following variables are not part of the `config.json` contract, but they still matter for LLM provider behavior:
+
+- `OPENAI_API_KEY`
+- `ANTHROPIC_API_KEY`
+- `OLLAMA_HOST`
+- `OLLAMA_MODEL`
+- `GEMINI_MODEL`
+
+## Example
+
+`config.example.json` shows the recommended baseline:
+
+```json
+{
+ "$schema": "./config.schema.json",
+ "_doc": [
+ "Copy to config.json and remove fields you don't need.",
+ "Secrets (api_key, proxy, webhook, llm_base_url) go in .env — NOT here."
+ ],
+ "format": "markdown",
+ "browser": "chrome",
+ "timeout": 30,
+ "pdf_mode": "auto",
+ "metadata": false,
+ "verbose": false,
+ "only_main_content": false,
+ "include_selectors": [],
+ "exclude_selectors": ["nav", "footer", ".sidebar", ".cookie-banner"],
+ "depth": 1,
+ "max_pages": 20,
+ "concurrency": 5,
+ "delay": 100,
+ "path_prefix": null,
+ "include_paths": [],
+ "exclude_paths": ["/changelog/*", "/blog/*", "/releases/*"],
+ "use_sitemap": false,
+ "llm_provider": "gemini",
+ "llm_model": "gemini-2.5-pro"
+}
+```
+
+## Gotchas
+
+- `config.json` is permissive by design: unknown fields are ignored so newer config files still work on older binaries.
+- `llm_provider` is validated by the CLI at runtime; invalid values will fail when the provider is selected.
+- `browser`, `timeout`, `depth`, `max_pages`, `concurrency`, and `delay` are ordinary defaults, so CLI flags can override them per run.
+- Boolean defaults set to `true` in config cannot be switched off from the CLI; to disable one for a single run, bypass the file with `NOXA_CONFIG=/dev/null`.
+
+## Related Files
+
+- [`config.schema.json`](../config.schema.json)
+- [`config.example.json`](../config.example.json)
+- [`env.example`](../env.example)
diff --git a/env.example b/env.example
index aad81c5..f85a0c9 100644
--- a/env.example
+++ b/env.example
@@ -13,8 +13,16 @@ NOXA_PROXY_FILE=
# Webhook URL for completion notifications
NOXA_WEBHOOK_URL=
-# LLM base URL (Ollama or OpenAI-compatible endpoint)
-NOXA_LLM_BASE_URL=
+# LLM provider configuration and backend defaults
+# NOXA_LLM_PROVIDER=gemini
+# NOXA_LLM_MODEL=gemini-2.5-pro
+# NOXA_LLM_BASE_URL=  # Ollama or OpenAI-compatible endpoint
+# GEMINI_MODEL=gemini-2.5-pro
+# OLLAMA_HOST=http://localhost:11434
+# OLLAMA_MODEL=qwen3.5:9b
+# OLLAMA_HEALTH_TIMEOUT_MS=2000
+# OPENAI_API_KEY=
+# ANTHROPIC_API_KEY=
# Optional: path to a non-default config file (default: ./config.json)
# NOXA_CONFIG=/path/to/my-config.json
diff --git a/examples/README.md b/examples/README.md
deleted file mode 100644
index f9aee68..0000000
--- a/examples/README.md
+++ /dev/null
@@ -1,320 +0,0 @@
-# Examples
-
-Practical examples showing what noxa can do. Each example is a self-contained command you can run immediately.
-
-## Basic Extraction
-
-```bash
-# Extract as markdown (default)
-noxa https://example.com
-
-# Multiple output formats
-noxa https://example.com -f markdown # Clean markdown
-noxa https://example.com -f json # Full structured JSON
-noxa https://example.com -f text # Plain text (no formatting)
-noxa https://example.com -f llm # Token-optimized for LLMs (67% fewer tokens)
-
-# Bare domains work (auto-prepends https://)
-noxa example.com
-```
-
-## Content Filtering
-
-```bash
-# Only extract main content (skip nav, sidebar, footer)
-noxa https://docs.rs/tokio --only-main-content
-
-# Include specific CSS selectors
-noxa https://news.ycombinator.com --include ".titleline,.score"
-
-# Exclude specific elements
-noxa https://example.com --exclude "nav,footer,.ads,.sidebar"
-
-# Combine both
-noxa https://docs.rs/reqwest --only-main-content --exclude ".sidebar"
-```
-
-## Brand Identity Extraction
-
-```bash
-# Extract colors, fonts, logos from any website
-noxa --brand https://stripe.com
-# Output: { "name": "Stripe", "colors": [...], "fonts": ["Sohne"], "logos": [...] }
-
-noxa --brand https://github.com
-# Output: { "name": "GitHub", "colors": [{"hex": "#1F2328", ...}], "fonts": ["Mona Sans"], ... }
-
-noxa --brand wikipedia.org
-# Output: 10 colors, 5 fonts, favicon, logo URL
-```
-
-## Sitemap Discovery
-
-```bash
-# Discover all URLs from a site's sitemaps
-noxa --map https://sitemaps.org
-# Output: one URL per line (84 URLs found)
-
-# JSON output with metadata
-noxa --map https://sitemaps.org -f json
-# Output: [{ "url": "...", "last_modified": "...", "priority": 0.8 }]
-```
-
-## Recursive Crawling
-
-```bash
-# Crawl a site (default: depth 1, max 20 pages)
-noxa --crawl https://example.com
-
-# Control depth and page limit
-noxa --crawl --depth 2 --max-pages 50 https://docs.rs/tokio
-
-# Crawl with sitemap seeding (finds more pages)
-noxa --crawl --sitemap --depth 2 https://docs.rs/tokio
-
-# Filter crawl paths
-noxa --crawl --include-paths "/api/*,/guide/*" https://docs.example.com
-noxa --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com
-
-# Control concurrency and delay
-noxa --crawl --concurrency 10 --delay 200 https://example.com
-```
-
-## Change Detection (Diff)
-
-```bash
-# Step 1: Save a snapshot
-noxa https://example.com -f json > snapshot.json
-
-# Step 2: Later, compare against the snapshot
-noxa --diff-with snapshot.json https://example.com
-# Output:
-# Status: Same
-# Word count delta: +0
-
-# If the page changed:
-# Status: Changed
-# Word count delta: +42
-# --- old
-# +++ new
-# @@ -1,3 +1,3 @@
-# -Old content here
-# +New content here
-```
-
-## PDF Extraction
-
-```bash
-# PDF URLs are auto-detected via Content-Type
-noxa https://example.com/report.pdf
-
-# Control PDF mode
-noxa --pdf-mode auto https://example.com/report.pdf # Error on empty (catches scanned PDFs)
-noxa --pdf-mode fast https://example.com/report.pdf # Return whatever text is found
-```
-
-## Batch Processing
-
-```bash
-# Multiple URLs in one command
-noxa https://example.com https://httpbin.org/html https://rust-lang.org
-
-# URLs from a file (one per line, # comments supported)
-noxa --urls-file urls.txt
-
-# Batch with JSON output
-noxa --urls-file urls.txt -f json
-
-# Proxy rotation for large batches
-noxa --urls-file urls.txt --proxy-file proxies.txt --concurrency 10
-```
-
-## Local Files & Stdin
-
-```bash
-# Extract from a local HTML file
-noxa --file page.html
-
-# Pipe HTML from another command
-curl -s https://example.com | noxa --stdin
-
-# Chain with other tools
-noxa https://example.com -f text | wc -w # Word count
-noxa https://example.com -f json | jq '.metadata.title' # Extract title with jq
-```
-
-## Cloud API Mode
-
-When you have a noxa API key, the CLI can route through the cloud for bot protection bypass, JS rendering, and proxy rotation.
-
-```bash
-# Set API key (one time)
-export NOXA_API_KEY=wc_your_key_here
-
-# Automatic fallback: tries local first, cloud on bot detection
-noxa https://protected-site.com
-
-# Force cloud mode (skip local, always use API)
-noxa --cloud https://spa-site.com
-
-# Cloud mode works with all features
-noxa --cloud --brand https://stripe.com
-noxa --cloud -f json https://producthunt.com
-noxa --cloud --crawl --depth 2 https://protected-docs.com
-```
-
-## Browser Impersonation
-
-```bash
-# Chrome (default) — latest Chrome TLS fingerprint
-noxa https://example.com
-
-# Firefox fingerprint
-noxa --browser firefox https://example.com
-
-# Random browser per request (good for batch)
-noxa --browser random --urls-file urls.txt
-```
-
-## Custom Headers & Cookies
-
-```bash
-# Custom headers
-noxa -H "Authorization: Bearer token123" https://api.example.com
-noxa -H "Accept-Language: de-DE" https://example.com
-
-# Cookies
-noxa --cookie "session=abc123; theme=dark" https://example.com
-
-# Multiple headers
-noxa -H "X-Custom: value" -H "Authorization: Bearer token" https://example.com
-```
-
-## LLM-Powered Features
-
-These require an LLM provider (Ollama local, or OpenAI/Anthropic API key).
-
-```bash
-# Summarize a page (default: 3 sentences)
-noxa --summarize https://example.com
-
-# Control summary length
-noxa --summarize 5 https://example.com
-
-# Extract structured JSON with a schema
-noxa --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product
-
-# Extract with a schema from file
-noxa --extract-json @schema.json https://example.com/product
-
-# Extract with natural language prompt
-noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing
-
-# Use a specific LLM provider
-noxa --llm-provider ollama --summarize https://example.com
-noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..." https://example.com
-noxa --llm-provider anthropic --summarize https://example.com
-```
-
-## Raw HTML Output
-
-```bash
-# Get the raw fetched HTML (no extraction)
-noxa --raw-html https://example.com
-
-# Useful for debugging extraction issues
-noxa --raw-html https://example.com > raw.html
-noxa --file raw.html # Then extract locally
-```
-
-## Metadata & Verbose Mode
-
-```bash
-# Include YAML frontmatter with metadata
-noxa --metadata https://example.com
-# Output:
-# ---
-# title: "Example Domain"
-# source: "https://example.com"
-# word_count: 20
-# ---
-# # Example Domain
-# ...
-
-# Verbose logging (debug extraction pipeline)
-noxa -v https://example.com
-```
-
-## Proxy Usage
-
-```bash
-# Single proxy
-noxa --proxy http://user:pass@proxy.example.com:8080 https://example.com
-
-# SOCKS5 proxy
-noxa --proxy socks5://proxy.example.com:1080 https://example.com
-
-# Proxy rotation from file (one per line: host:port:user:pass)
-noxa --proxy-file proxies.txt https://example.com
-
-# Auto-load proxies.txt from current directory
-echo "proxy1.com:8080:user:pass" > proxies.txt
-noxa https://example.com # Automatically detects and uses proxies.txt
-```
-
-## MCP Server (AI Agent Integration)
-
-```bash
-# Start the MCP server (stdio transport)
-noxa-mcp
-
-# Configure in Claude Desktop (~/.config/claude/claude_desktop_config.json):
-# {
-# "mcpServers": {
-# "noxa": {
-# "command": "/path/to/noxa-mcp",
-# "env": {
-# "NOXA_API_KEY": "wc_your_key" // optional, enables cloud fallback
-# }
-# }
-# }
-# }
-
-# Available tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search
-```
-
-## Real-World Recipes
-
-### Monitor competitor pricing
-
-```bash
-# Save today's pricing
-noxa --extract-json '{"type":"array","items":{"type":"object","properties":{"plan":{"type":"string"},"price":{"type":"string"}}}}' \
- https://competitor.com/pricing -f json > pricing-$(date +%Y%m%d).json
-```
-
-### Build a documentation search index
-
-```bash
-# Crawl docs and extract as LLM-optimized text
-noxa --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt
-```
-
-### Extract all images from a page
-
-```bash
-noxa https://example.com -f json | jq -r '.content.images[].src'
-```
-
-### Get all external links
-
-```bash
-noxa https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href'
-```
-
-### Compare two pages
-
-```bash
-noxa https://site-a.com -f json > a.json
-noxa https://site-b.com --diff-with a.json
-```
diff --git a/gemini-extension.json b/gemini-extension.json
new file mode 100644
index 0000000..a696c3e
--- /dev/null
+++ b/gemini-extension.json
@@ -0,0 +1,11 @@
+{
+ "name": "noxa",
+ "version": "0.4.0",
+ "description": "noxa CLI, MCP server, and skills for AI-assisted web extraction",
+ "mcpServers": {
+ "noxa": {
+ "command": "${extensionPath}${/}bin${/}noxa-mcp",
+ "cwd": "${extensionPath}"
+ }
+ }
+}
diff --git a/setup.sh b/setup.sh
index 5e7ccc8..4ea7244 100755
--- a/setup.sh
+++ b/setup.sh
@@ -3,6 +3,8 @@
#
# Checks prerequisites, builds binaries, configures .env,
# optionally installs Ollama, and wires up the MCP server.
+# The generated .env is broader than env.example: it includes local
+# deployment and Ollama settings used by the setup script and compose stack.
#
# Usage:
# ./setup.sh # Interactive full setup
@@ -214,6 +216,8 @@ configure_env() {
fi
# Write .env
+ # env.example covers the runtime noxa variables; this file adds local
+ # deployment and Ollama settings used by setup.sh and docker-compose.yml.
cat > "$SCRIPT_DIR/.env" <-
+ This skill should be used when the user wants to scrape, extract, or fetch content from
+ a URL using the noxa CLI, crawl a website, get the text of a web page, monitor or watch
+ a page for changes, extract brand identity (colors, fonts, logos) from a site,
+ batch-process URLs, summarize a web page with an LLM, extract structured data from a
+ page, run deep research on a topic, or save crawl output to files.
+ Trigger on phrases like: "scrape", "extract from", "get content from", "crawl", "fetch
+ this page", "what does this site say", "get the text of", "monitor changes", "watch this
+ URL", "brand colors of", "sitemap of", "summarize this URL", "deep research". Use this
+ skill before running noxa — it covers the correct flag combinations for every workflow
+ and prevents common mistakes.
+---
+
+# Noxa — Web Content Extraction for AI
+
+noxa extracts clean, LLM-optimized content from any URL using Chrome-level TLS fingerprinting.
+No browser required. Output uses 67% fewer tokens than raw HTML.
+
+Binary: `noxa` (CLI) — assumed to be on PATH. Verify with `which noxa`.
+
+> **Complete flag reference:** See `references/flags.md` for every flag, its default, env var binding, and the full config.json schema.
+
+---
+
+## Choosing the right mode
+
+Identify what the user wants to accomplish, then pick the matching mode:
+
+| Goal | Mode |
+|------|------|
+| Read a page | Basic extraction |
+| Read docs / whole site | Crawl |
+| Find all URLs on a site | Map |
+| Multiple URLs at once | Batch |
+| Extract structured fields | LLM extraction |
+| Summarize a page | Summarize |
+| Deep research on a topic | Research (cloud) |
+| Track changes once | Diff |
+| Continuously watch for changes | Watch |
+| Get brand colors/fonts/logos | Brand |
+| Debug a 403 or bad output | Raw HTML |
+
+---
+
+## Basic extraction
+
+```bash
+# Default: clean markdown, great for reading
+noxa https://example.com
+
+# Format options
+noxa https://example.com -f llm # Token-optimized (best for feeding to Claude)
+noxa https://example.com -f json # Full structured JSON with metadata
+noxa https://example.com -f text # Plain text, no formatting
+noxa https://example.com -f markdown # Markdown (same as default)
+noxa https://example.com -f html # Raw extracted HTML
+
+# Skip nav/sidebar/footer noise
+noxa https://example.com --only-main-content
+
+# Include/exclude specific elements via CSS selectors
+noxa https://example.com --include "article,.content"
+noxa https://example.com --exclude "nav,footer,.sidebar,.ads"
+
+# Include metadata as YAML frontmatter
+noxa https://example.com --metadata
+
+# Request timeout (default: 30s)
+noxa --timeout 60 https://slow-site.com
+```
+
+Use `-f llm` when passing content to Claude — it cuts token usage by ~67%.
+
+---
+
+## Crawling a site
+
+```bash
+# Crawl with defaults (depth 1, up to 20 pages)
+noxa --crawl https://docs.example.com
+
+# Control scope
+noxa --crawl --depth 3 --max-pages 100 https://docs.example.com
+
+# Seed from sitemap first (finds more pages)
+noxa --crawl --sitemap --depth 2 https://docs.example.com
+
+# Filter by path prefix (strict prefix match)
+noxa --crawl --path-prefix /docs https://docs.example.com
+
+# Filter by glob patterns (more flexible than --path-prefix)
+noxa --crawl --include-paths "/api/*,/guide/*" https://docs.example.com
+noxa --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com
+
+# Control concurrency and delay (ms between requests)
+noxa --crawl --concurrency 5 --delay 500 https://example.com
+
+# Save/resume crawl state (Ctrl+C saves progress; rerunning resumes)
+noxa --crawl --crawl-state state.json --max-pages 500 https://docs.example.com
+
+# Save each page to a separate file instead of stdout
+noxa --crawl --output-dir ./output https://docs.example.com
+```
+
+Good for: building search indexes, ingesting documentation, research.
+
+---
+
+## Sitemap discovery
+
+```bash
+# List all URLs from the site's sitemaps
+noxa --map https://example.com
+
+# JSON with last_modified and priority
+noxa --map https://example.com -f json
+```
+
+Use `--map` when you want to know what's on a site before crawling.
+
+---
+
+## Batch processing
+
+```bash
+# Multiple URLs in one command
+noxa https://site-a.com https://site-b.com https://site-c.com
+
+# From a file (one URL per line, # comments OK)
+# Also supports CSV format: url,custom-filename
+noxa --urls-file urls.txt
+
+# Save each result to a separate file
+noxa --urls-file urls.txt --output-dir ./pages
+
+# With concurrency and proxy rotation
+noxa --urls-file urls.txt --concurrency 10 -f llm --proxy-file proxies.txt
+```
+
+---
+
+## LLM-powered extraction
+
+These require an LLM provider. noxa tries Gemini CLI first, then Ollama, then OpenAI, then Anthropic.
+
+Configure whichever provider you have available:
+```bash
+# Gemini CLI (primary — requires `gemini` binary on PATH)
+# Model controlled by GEMINI_MODEL env var (default: gemini-2.5-pro)
+
+# Ollama (local, no key needed — default endpoint http://localhost:11434)
+export OLLAMA_HOST=http://localhost:11434 # only needed if non-default
+
+# OpenAI
+export OPENAI_API_KEY=sk-...
+
+# Anthropic
+export ANTHROPIC_API_KEY=sk-ant-...
+
+# Override provider/model/URL via env vars
+export NOXA_LLM_PROVIDER=openai # gemini | ollama | openai | anthropic
+export NOXA_LLM_MODEL=gpt-4o
+export NOXA_LLM_BASE_URL=http://localhost:11434 # for Ollama or OpenAI-compatible endpoints
+```
+
+```bash
+# Summarize (default: 3 sentences)
+noxa --summarize https://example.com
+noxa --summarize 5 https://example.com # optional number right after the flag sets the sentence count
+
+# Extract with natural language
+noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing
+
+# Extract as structured JSON
+noxa --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product
+
+# Schema from file
+noxa --extract-json @schema.json https://example.com/product
+
+# Force a specific provider via flag
+noxa --llm-provider ollama --summarize https://example.com
+noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..." https://example.com
+noxa --llm-provider anthropic --summarize https://example.com
+
+# Override LLM base URL (for self-hosted OpenAI-compatible endpoints)
+noxa --llm-base-url http://my-server:8080 --llm-provider openai --summarize https://example.com
+```
+
+---
+
+## Change detection (diff)
+
+```bash
+# Step 1: snapshot
+noxa https://example.com -f json > snapshot.json
+
+# Step 2: compare later
+noxa --diff-with snapshot.json https://example.com
+# Output: Status: Same | Changed, word delta, unified diff
+```
+
+Good for: one-off comparisons, price monitoring, detecting updates.
+
+---
+
+## Watch mode (continuous monitoring)
+
+Watch polls a URL on a schedule and reports diffs whenever the content changes.
+
+```bash
+# Watch with default interval (300s / 5 minutes)
+noxa --watch https://example.com
+
+# Custom interval
+noxa --watch --watch-interval 60 https://example.com # check every 60s
+
+# Run a command when a change is detected (receives diff JSON on stdin)
+noxa --watch --on-change "python notify.py" https://example.com
+
+# Post to a webhook on change (also works with --crawl and batch)
+noxa --watch --webhook https://hooks.slack.com/... https://example.com
+export NOXA_WEBHOOK_URL=https://hooks.discord.com/... # or via env var
+```
+
+noxa auto-detects Discord and Slack webhook URLs and wraps the payload in the format each service expects.
+
+---
+
+## Deep research (cloud)
+
+Runs multi-source research on a topic via the noxa.io cloud API. Saves a full report (findings + sources) to a JSON file. Requires an API key.
+
+```bash
+export NOXA_API_KEY=wc_your_key
+
+# Standard research
+noxa --research "best practices for Rust error handling" --api-key $NOXA_API_KEY
+
+# Deep mode (longer, more thorough report)
+noxa --research "Rust async runtimes compared" --deep --api-key $NOXA_API_KEY
+```
+
+---
+
+## Brand identity extraction
+
+```bash
+noxa --brand https://stripe.com
+# Returns: name, colors (hex + usage), fonts, logos, favicon
+```
+
+Output is JSON. Useful for design audits, competitive analysis, or building themed UIs.
+
+---
+
+## PDF extraction
+
+```bash
+# Auto-detected via Content-Type header
+noxa https://example.com/report.pdf
+
+# Control behavior on scanned PDFs (no extractable text)
+noxa --pdf-mode auto https://example.com/report.pdf # error on empty (default)
+noxa --pdf-mode fast https://example.com/report.pdf # return whatever text exists
+```
+
+---
+
+## Auth, headers, cookies, proxies
+
+```bash
+# Custom headers
+noxa -H "Authorization: Bearer token123" https://api.example.com
+noxa -H "Accept-Language: fr-FR" -H "X-Custom: value" https://example.com
+
+# Cookie string (shorthand)
+noxa --cookie "session=abc123; theme=dark" https://example.com
+
+# Cookie file (Chrome extension JSON export format)
+noxa --cookie-file cookies.json https://example.com
+
+# Browser impersonation (default: Chrome)
+noxa --browser firefox https://example.com
+noxa --browser random https://example.com # random per request, good for batch
+
+# Single proxy
+noxa --proxy http://user:pass@proxy.example.com:8080 https://example.com
+noxa --proxy socks5://proxy.example.com:1080 https://example.com
+
+# Proxy pool rotation
+noxa --proxy-file proxies.txt https://example.com # host:port:user:pass per line
+```
+
+---
+
+## Bot-protected sites / JS rendering
+
+noxa.io is the optional hosted cloud rendering service — it handles Cloudflare, DataDome, and JS-rendered SPAs that local TLS fingerprinting can't bypass. Get an API key at [noxa.io](https://noxa.io).
+
+```bash
+# Pass key via env var or --api-key flag
+export NOXA_API_KEY=wc_your_key
+# or: noxa --api-key wc_your_key https://example.com
+
+# Auto: tries local TLS fingerprinting first, falls back to cloud on bot detection
+noxa https://cloudflare-protected-site.com
+
+# Force cloud (for SPA / JS-heavy pages)
+noxa --cloud https://spa-site.com
+```
+
+---
+
+## Output to files
+
+```bash
+# Save crawl output — one file per page, filenames derived from URL paths
+noxa --crawl --output-dir ./docs https://docs.example.com
+
+# Save batch output
+noxa --urls-file urls.txt --output-dir ./pages -f llm
+
+# Single URL to file
+noxa --output-dir ./out https://example.com
+```
+
+---
+
+## Config file
+
+noxa loads `./config.json` by default. Override with `--config` or `NOXA_CONFIG`:
+
+```bash
+noxa --config ~/.noxa/config.json https://example.com
+export NOXA_CONFIG=/etc/noxa/config.json
+```
+
+Config uses snake_case keys that match `config.example.json` and the Rust config struct. Use it to set defaults such as `llm_provider`, `browser`, `concurrency`, and `timeout`.
+
+---
+
+## Local files and stdin
+
+```bash
+# Local HTML file
+noxa --file page.html
+
+# Pipe HTML
+curl -s https://example.com | noxa --stdin
+```
+
+---
+
+## Debugging
+
+```bash
+# Get the raw fetched HTML to see what noxa received
+noxa --raw-html https://example.com
+
+# Verbose extraction pipeline logging
+noxa -v https://example.com
+```
+
+If a site returns 403, try `--browser firefox` or `--browser random`. If still blocked, use `--cloud` with an API key.
+
+---
+
+## Environment variables reference
+
+| Variable | Flag equivalent | Description |
+|----------|----------------|-------------|
+| `NOXA_API_KEY` | `--api-key` | Cloud API key |
+| `NOXA_PROXY` | `--proxy` | Single proxy URL |
+| `NOXA_PROXY_FILE` | `--proxy-file` | Proxy pool file path |
+| `NOXA_WEBHOOK_URL` | `--webhook` | Webhook URL for notifications |
+| `NOXA_LLM_PROVIDER` | `--llm-provider` | LLM provider (gemini/ollama/openai/anthropic) |
+| `NOXA_LLM_MODEL` | `--llm-model` | LLM model name override |
+| `NOXA_LLM_BASE_URL` | `--llm-base-url` | LLM base URL (Ollama/OpenAI-compatible) |
+| `NOXA_CONFIG` | `--config` | Path to config.json |
+| `OPENAI_API_KEY` | — | OpenAI API key |
+| `ANTHROPIC_API_KEY` | — | Anthropic API key |
+| `OLLAMA_HOST` | — | Ollama endpoint (default: http://localhost:11434) |
+
+---
+
+## Common recipes
+
+```bash
+# Read docs site as a single LLM-optimized text file
+noxa --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt
+
+# Save full crawl to individual files
+noxa --crawl --sitemap --depth 2 --output-dir ./docs -f llm https://docs.example.com
+
+# Extract all external links from a page
+noxa https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href'
+
+# Monitor competitor pricing — snapshot then diff
+noxa https://competitor.com/pricing -f json > pricing-$(date +%Y%m%d).json
+noxa https://competitor.com/pricing --diff-with pricing-yesterday.json
+
+# Watch a page and notify on Slack when it changes
+noxa --watch --watch-interval 3600 --webhook https://hooks.slack.com/... https://example.com
+
+# Resumable large crawl
+noxa --crawl --crawl-state state.json --depth 4 --max-pages 2000 https://docs.example.com
+
+# Word count of a page
+noxa https://example.com -f text | wc -w
+
+# Extract article title with jq
+noxa https://example.com -f json | jq '.metadata.title'
+```
diff --git a/skills/noxa/references/flags.md b/skills/noxa/references/flags.md
new file mode 100644
index 0000000..ed6a08c
--- /dev/null
+++ b/skills/noxa/references/flags.md
@@ -0,0 +1,246 @@
+# Noxa CLI — Complete Flag Reference
+
+All flags for the `noxa` binary. Sourced directly from `crates/noxa-cli/src/main.rs`.
+
+Priority order when the same setting appears in multiple places:
+**CLI flag > config.json > environment variable > hard default**
+
+---
+
+## Table of Contents
+
+- [Input](#input)
+- [Output](#output)
+- [Content Filtering](#content-filtering)
+- [Request / Network](#request--network)
+- [Auth & Identity](#auth--identity)
+- [Crawl](#crawl)
+- [LLM](#llm)
+- [Change Detection](#change-detection)
+- [Watch Mode](#watch-mode)
+- [Brand Extraction](#brand-extraction)
+- [PDF](#pdf)
+- [Cloud API](#cloud-api)
+- [Config File](#config-file)
+- [Environment Variables](#environment-variables)
+- [config.json Reference](#configjson-reference)
+
+---
+
+## Input
+
+| Flag | Type | Description |
+|------|------|-------------|
+| `[URLS]...` | positional | One or more URLs to fetch. Bare domains are auto-prefixed with `https://`. |
+| `--urls-file <FILE>` | string | File with URLs, one per line. `#` comments supported. CSV format `url,filename` sets a custom output filename. |
+| `--file <FILE>` | string | Extract from a local HTML file instead of fetching. |
+| `--stdin` | bool | Read HTML from stdin. |
+
+---
+
+## Output
+
+| Flag | Short | Default | Description |
+|------|-------|---------|-------------|
+| `--format <FORMAT>` | `-f` | `markdown` | Output format: `markdown`, `json`, `text`, `llm`, `html`. Use `llm` when feeding to Claude — 67% fewer tokens than raw HTML. |
+| `--metadata` | | false | Include YAML frontmatter with title, source URL, word count. Always included in JSON format. |
+| `--raw-html` | | false | Output the raw fetched HTML with no extraction. Useful for debugging. CLI-only — not settable in config.json. |
+| `--output-dir <DIR>` | | — | Save each page to a separate file instead of stdout. Works with `--crawl`, batch, and single-URL mode. Filenames derived from URL paths (e.g. `/docs/api` → `docs/api.md`). |
+| `--verbose` | `-v` | false | Enable verbose extraction pipeline logging to stderr. |
+
+---
+
+## Content Filtering
+
+| Flag | Description |
+|------|-------------|
+| `--only-main-content` | Auto-detect and extract only the main content element (`<article>`, `<main>`). Strips nav, sidebar, footer. |
+| `--include <SELECTORS>` | Comma-separated CSS selectors to include (e.g. `"article,.content"`). In config.json: `include_selectors` array. |
+| `--exclude <SELECTORS>` | Comma-separated CSS selectors to exclude (e.g. `"nav,footer,.ads"`). In config.json: `exclude_selectors` array. |
+
+---
+
+## Request / Network
+
+| Flag | Short | Env | Default | Description |
+|------|-------|-----|---------|-------------|
+| `--browser <BROWSER>` | `-b` | — | `chrome` | TLS fingerprint to impersonate: `chrome`, `firefox`, `random`. `random` picks a different profile per request. |
+| `--timeout <SECS>` | `-t` | — | `30` | Request timeout in seconds. |
+| `--proxy <URL>` | `-p` | `NOXA_PROXY` | — | Single proxy URL. Formats: `http://user:pass@host:port`, `socks5://host:port`. Takes priority over `--proxy-file` if both are set. |
+| `--proxy-file <FILE>` | | `NOXA_PROXY_FILE` | — | Proxy pool file — one proxy per line as `host:port:user:pass`. Rotates per request. |
+| `--concurrency <N>` | | — | `5` | Max concurrent requests (also used for crawl). |
+| `--delay <MS>` | | — | `100` | Delay between requests in milliseconds. |
+
+---
+
+## Auth & Identity
+
+| Flag | Description |
+|------|-------------|
+| `-H / --header <HEADER>` | Custom request header, repeatable. Format: `"Name: value"`. |
+| `--cookie <COOKIE>` | Cookie string, shorthand for `-H "Cookie: ..."`. |
+| `--cookie-file <FILE>` | JSON cookie file in Chrome extension export format: `[{name, value, domain, path, secure, ...}]`. |
+
+---
+
+## Crawl
+
+All crawl flags require `--crawl` to be active, except `--map` and `--sitemap` which are standalone.
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--crawl` | false | Enable recursive BFS crawl of same-origin links. |
+| `--depth <N>` | `1` | Max crawl depth from the start URL. |
+| `--max-pages <N>` | `20` | Maximum number of pages to crawl. |
+| `--concurrency <N>` | `5` | Max concurrent fetch workers during crawl. |
+| `--delay <MS>` | `100` | Delay between requests in milliseconds. |
+| `--path-prefix <PREFIX>` | — | Only crawl URLs whose path starts with this prefix (strict string match). |
+| `--include-paths <PATTERNS>` | — | Comma-separated glob patterns for paths to include (e.g. `"/api/*,/guides/**"`). More flexible than `--path-prefix`. In config.json: `include_paths` array. |
+| `--exclude-paths <PATTERNS>` | — | Comma-separated glob patterns for paths to exclude (e.g. `"/changelog/*,/blog/*"`). In config.json: `exclude_paths` array. |
+| `--sitemap` | false | Seed the crawl frontier from sitemap discovery (checks `robots.txt` and `/sitemap.xml`). Also usable standalone to enable sitemaps without crawling. In config.json: `use_sitemap`. |
+| `--map` | false | Discover and print all URLs from the site's sitemaps without fetching content. One URL per line; JSON array with `-f json`. |
+| `--crawl-state <FILE>` | — | Path to a JSON file for saving/resuming crawl state. On Ctrl+C: saves progress. On next run: resumes from where it left off. |
+
+---
+
+## LLM
+
+Requires a configured LLM provider. noxa tries Gemini CLI → Ollama → OpenAI → Anthropic in order.
+
+| Flag | Env | Description |
+|------|-----|-------------|
+| `--summarize [N]` | — | Summarize extracted content. Optional sentence count (default: 3), given as the flag's value: `--summarize 5`. |
+| `--extract-prompt <PROMPT>` | — | Extract content using a natural language prompt. |
+| `--extract-json <SCHEMA>` | — | Extract structured JSON conforming to a JSON Schema string. Pass `@file.json` to load schema from a file. |
+| `--llm-provider <NAME>` | `NOXA_LLM_PROVIDER` | Force a specific provider: `gemini`, `ollama`, `openai`, `anthropic`. |
+| `--llm-model <MODEL>` | `NOXA_LLM_MODEL` | Override the model name (e.g. `gpt-4o`, `gemini-2.5-pro`). |
+| `--llm-base-url <URL>` | `NOXA_LLM_BASE_URL` | Override the LLM base URL. Use for self-hosted Ollama or OpenAI-compatible endpoints. |
+
+Provider setup:
+- **Gemini CLI**: requires `gemini` binary on PATH. Model via `GEMINI_MODEL` (default: `gemini-2.5-pro`).
+- **Ollama**: set `OLLAMA_HOST` if not on `http://localhost:11434`.
+- **OpenAI**: set `OPENAI_API_KEY`.
+- **Anthropic**: set `ANTHROPIC_API_KEY`.
+
+---
+
+## Change Detection
+
+| Flag | Description |
+|------|-------------|
+| `--diff-with <FILE>` | Compare current extraction against a previously saved JSON snapshot. Reports status (Same/Changed), word delta, and a unified diff. Take a snapshot with `noxa <URL> -f json > snapshot.json`. |
+
+---
+
+## Watch Mode
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--watch` | false | Continuously poll a URL for changes and report diffs. |
+| `--watch-interval <SECS>` | `300` | Poll interval in seconds. |
+| `--on-change <CMD>` | — | Shell command to run when a change is detected. Receives the diff JSON on stdin. CLI-only — intentionally excluded from config.json to prevent shell injection via config file writes. |
+| `--webhook <URL>` | — | POST a JSON payload when changes are detected (watch), a crawl completes, or a batch finishes. Auto-detects Discord and Slack URLs and wraps the payload accordingly. Can also be set via the `NOXA_WEBHOOK_URL` environment variable. |
+
+---
+
+## Brand Extraction
+
+| Flag | Description |
+|------|-------------|
+| `--brand` | Extract brand identity: colors (hex + usage), fonts, logos, favicon. Output is JSON. |
+
+---
+
+## PDF
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--pdf-mode <MODE>` | `auto` | How to handle PDFs: `auto` errors on empty text (catches scanned/image PDFs), `fast` returns whatever text is found. PDFs are auto-detected via `Content-Type` header. |
+
+---
+
+## Cloud API
+
+noxa.io is the optional hosted rendering service. Handles Cloudflare, DataDome, WAF, and JS-rendered SPAs. Get a key at [noxa.io](https://noxa.io).
+
+| Flag | Env | Description |
+|------|-----|-------------|
+| `--api-key <KEY>` | `NOXA_API_KEY` | Cloud API key. When set, enables automatic fallback to cloud on bot detection. |
+| `--cloud` | — | Force all requests through the cloud API, skipping local extraction entirely. |
+| `--research <TOPIC>` | — | Run deep multi-source research on a topic via the cloud API. Saves full result (report + sources + findings) to a JSON file. Requires `--api-key`. |
+| `--deep` | — | Enable deep research mode (longer, more thorough report). Used with `--research`. |
+
+---
+
+## Config File
+
+noxa loads `./config.json` by default. Override with `--config <PATH>` or `NOXA_CONFIG`.
+
+```bash
+noxa --config ~/.noxa/config.json https://example.com
+export NOXA_CONFIG=/etc/noxa/config.json
+```
+
+**Important caveats:**
+- CLI flags always win over config.json values.
+- `on_change` is intentionally excluded from config.json (security: prevents shell injection via config writes).
+- Secrets and URLs (`api_key`, `proxy`, `webhook`, `llm_base_url`) belong in `.env`, not config.json.
+- Bool flags set to `true` in config.json (`only_main_content`, `metadata`, `verbose`, `use_sitemap`) **cannot** be overridden to `false` from the CLI for a single run (clap has no `--no-flag` variant). Use `NOXA_CONFIG=/dev/null` to bypass the config entirely.
+
+---
+
+## Environment Variables
+
+| Variable | Flag equivalent | Description |
+|----------|----------------|-------------|
+| `NOXA_API_KEY` | `--api-key` | Cloud API key |
+| `NOXA_PROXY` | `--proxy` | Single proxy URL |
+| `NOXA_PROXY_FILE` | `--proxy-file` | Proxy pool file path |
+| `NOXA_WEBHOOK_URL` | `--webhook` | Webhook URL for notifications |
+| `NOXA_LLM_PROVIDER` | `--llm-provider` | LLM provider (`gemini`/`ollama`/`openai`/`anthropic`) |
+| `NOXA_LLM_MODEL` | `--llm-model` | LLM model name override |
+| `NOXA_LLM_BASE_URL` | `--llm-base-url` | LLM base URL for Ollama or OpenAI-compatible endpoints |
+| `NOXA_CONFIG` | `--config` | Path to config.json |
+| `OPENAI_API_KEY` | — | OpenAI API key |
+| `ANTHROPIC_API_KEY` | — | Anthropic API key |
+| `OLLAMA_HOST` | — | Ollama endpoint (default: `http://localhost:11434`) |
+| `GEMINI_MODEL` | — | Gemini model override (default: `gemini-2.5-pro`) |
+
+---
+
+## config.json Reference
+
+All fields are optional. Unknown fields are silently ignored.
+
+```json
+{
+ "format": "llm",
+ "metadata": true,
+ "verbose": false,
+
+ "browser": "firefox",
+ "timeout": 60,
+ "pdf_mode": "fast",
+ "only_main_content": true,
+
+ "include_selectors": ["article", ".content"],
+ "exclude_selectors": ["nav", "footer"],
+
+ "depth": 3,
+ "max_pages": 100,
+ "concurrency": 10,
+ "delay": 200,
+ "path_prefix": "/docs/",
+ "include_paths": ["/docs/*", "/api/*"],
+ "exclude_paths": ["/changelog/*", "/blog/*"],
+ "use_sitemap": true,
+
+ "llm_provider": "gemini",
+ "llm_model": "gemini-2.5-pro"
+}
+```
+
+**Not configurable via config.json** (CLI-only or secrets):
+- `on_change` — shell injection risk
+- `api_key`, `proxy`, `webhook`, `llm_base_url` — secrets/URLs belong in `.env`
+- `raw_html` — per-run mode, not a persistent default