mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
chore: rebrand webclaw to noxa
This commit is contained in:
parent
a4c351d5ae
commit
8674b60b4e
86 changed files with 781 additions and 2121 deletions
BIN
.github/banner.png
vendored
BIN
.github/banner.png
vendored
Binary file not shown.
|
Before Width: | Height: | Size: 44 KiB |
10
.github/workflows/deps.yml
vendored
10
.github/workflows/deps.yml
vendored
|
|
@ -11,7 +11,7 @@ env:
|
|||
|
||||
jobs:
|
||||
sync-tls:
|
||||
name: Update webclaw-tls dependencies
|
||||
name: Update noxa-tls dependencies
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
|
@ -20,9 +20,9 @@ jobs:
|
|||
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
|
||||
- name: Update webclaw-tls crates
|
||||
- name: Update noxa-tls crates
|
||||
run: |
|
||||
cargo update -p webclaw-http -p rustls -p 'h2@0.4' -p 'hyper@1' -p 'hyper-util@0.1' -p 'reqwest@0.13' 2>&1
|
||||
cargo update -p noxa-http -p rustls -p 'h2@0.4' -p 'hyper@1' -p 'hyper-util@0.1' -p 'reqwest@0.13' 2>&1
|
||||
|
||||
- name: Check for changes
|
||||
id: diff
|
||||
|
|
@ -39,13 +39,13 @@ jobs:
|
|||
git config user.name "github-actions[bot]"
|
||||
git config user.email "github-actions[bot]@users.noreply.github.com"
|
||||
git add Cargo.lock
|
||||
git commit -m "chore: update webclaw-tls dependencies"
|
||||
git commit -m "chore: update noxa-tls dependencies"
|
||||
git push
|
||||
|
||||
- name: Trigger server update
|
||||
if: steps.diff.outputs.changed == 'true'
|
||||
run: |
|
||||
gh api repos/0xMassi/webclaw-server/dispatches \
|
||||
gh api repos/0xMassi/noxa-server/dispatches \
|
||||
-f event_type=core-updated \
|
||||
-f client_payload[sha]=${{ github.sha }}
|
||||
env:
|
||||
|
|
|
|||
68
.github/workflows/release.yml
vendored
68
.github/workflows/release.yml
vendored
|
|
@ -64,10 +64,10 @@ jobs:
|
|||
shell: bash
|
||||
run: |
|
||||
tag="${GITHUB_REF#refs/tags/}"
|
||||
staging="webclaw-${tag}-${{ matrix.target }}"
|
||||
staging="noxa-${tag}-${{ matrix.target }}"
|
||||
mkdir "$staging"
|
||||
cp target/${{ matrix.target }}/release/webclaw "$staging/" 2>/dev/null || true
|
||||
cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/" 2>/dev/null || true
|
||||
cp target/${{ matrix.target }}/release/noxa "$staging/" 2>/dev/null || true
|
||||
cp target/${{ matrix.target }}/release/noxa-mcp "$staging/" 2>/dev/null || true
|
||||
cp README.md LICENSE "$staging/"
|
||||
tar czf "$staging.tar.gz" "$staging"
|
||||
echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV
|
||||
|
|
@ -128,12 +128,12 @@ jobs:
|
|||
run: |
|
||||
tag="${GITHUB_REF#refs/tags/}"
|
||||
for target in x86_64-unknown-linux-gnu aarch64-unknown-linux-gnu; do
|
||||
dir="webclaw-${tag}-${target}"
|
||||
curl -sSL "https://github.com/0xMassi/webclaw/releases/download/${tag}/${dir}.tar.gz" -o "${target}.tar.gz"
|
||||
dir="noxa-${tag}-${target}"
|
||||
curl -sSL "https://github.com/0xMassi/noxa/releases/download/${tag}/${dir}.tar.gz" -o "${target}.tar.gz"
|
||||
tar xzf "${target}.tar.gz"
|
||||
mkdir -p "binaries-${target}"
|
||||
cp "${dir}/webclaw" "binaries-${target}/webclaw"
|
||||
cp "${dir}/webclaw-mcp" "binaries-${target}/webclaw-mcp"
|
||||
cp "${dir}/noxa" "binaries-${target}/noxa"
|
||||
cp "${dir}/noxa-mcp" "binaries-${target}/noxa-mcp"
|
||||
chmod +x "binaries-${target}"/*
|
||||
done
|
||||
ls -laR binaries-*/
|
||||
|
|
@ -145,22 +145,22 @@ jobs:
|
|||
|
||||
# amd64
|
||||
docker build -f Dockerfile.ci --build-arg BINARY_DIR=binaries-x86_64-unknown-linux-gnu \
|
||||
--platform linux/amd64 -t ghcr.io/0xmassi/webclaw:${tag}-amd64 --push .
|
||||
--platform linux/amd64 -t ghcr.io/0xmassi/noxa:${tag}-amd64 --push .
|
||||
|
||||
# arm64
|
||||
docker build -f Dockerfile.ci --build-arg BINARY_DIR=binaries-aarch64-unknown-linux-gnu \
|
||||
--platform linux/arm64 -t ghcr.io/0xmassi/webclaw:${tag}-arm64 --push .
|
||||
--platform linux/arm64 -t ghcr.io/0xmassi/noxa:${tag}-arm64 --push .
|
||||
|
||||
# Multi-arch manifest
|
||||
docker manifest create ghcr.io/0xmassi/webclaw:${tag} \
|
||||
ghcr.io/0xmassi/webclaw:${tag}-amd64 \
|
||||
ghcr.io/0xmassi/webclaw:${tag}-arm64
|
||||
docker manifest push ghcr.io/0xmassi/webclaw:${tag}
|
||||
docker manifest create ghcr.io/0xmassi/noxa:${tag} \
|
||||
ghcr.io/0xmassi/noxa:${tag}-amd64 \
|
||||
ghcr.io/0xmassi/noxa:${tag}-arm64
|
||||
docker manifest push ghcr.io/0xmassi/noxa:${tag}
|
||||
|
||||
docker manifest create ghcr.io/0xmassi/webclaw:latest \
|
||||
ghcr.io/0xmassi/webclaw:${tag}-amd64 \
|
||||
ghcr.io/0xmassi/webclaw:${tag}-arm64
|
||||
docker manifest push ghcr.io/0xmassi/webclaw:latest
|
||||
docker manifest create ghcr.io/0xmassi/noxa:latest \
|
||||
ghcr.io/0xmassi/noxa:${tag}-amd64 \
|
||||
ghcr.io/0xmassi/noxa:${tag}-arm64
|
||||
docker manifest push ghcr.io/0xmassi/noxa:latest
|
||||
|
||||
homebrew:
|
||||
name: Update Homebrew
|
||||
|
|
@ -172,11 +172,11 @@ jobs:
|
|||
COMMITTER_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }}
|
||||
run: |
|
||||
tag="${GITHUB_REF#refs/tags/}"
|
||||
base="https://github.com/0xMassi/webclaw/releases/download/${tag}"
|
||||
base="https://github.com/0xMassi/noxa/releases/download/${tag}"
|
||||
|
||||
# Download all 4 tarballs and compute SHAs
|
||||
for target in aarch64-apple-darwin x86_64-apple-darwin aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu; do
|
||||
curl -sSL "${base}/webclaw-${tag}-${target}.tar.gz" -o "${target}.tar.gz"
|
||||
curl -sSL "${base}/noxa-${tag}-${target}.tar.gz" -o "${target}.tar.gz"
|
||||
done
|
||||
|
||||
SHA_MAC_ARM=$(sha256sum aarch64-apple-darwin.tar.gz | cut -d' ' -f1)
|
||||
|
|
@ -190,53 +190,53 @@ jobs:
|
|||
echo "Linux x86_64: $SHA_LINUX_X86"
|
||||
|
||||
# Generate formula
|
||||
cat > webclaw.rb << FORMULA
|
||||
class Webclaw < Formula
|
||||
cat > noxa.rb << FORMULA
|
||||
class Noxa < Formula
|
||||
desc "The fastest web scraper for AI agents. 67% fewer tokens. Sub-ms extraction."
|
||||
homepage "https://webclaw.io"
|
||||
homepage "https://noxa.io"
|
||||
license "AGPL-3.0"
|
||||
version "${tag#v}"
|
||||
|
||||
on_macos do
|
||||
if Hardware::CPU.arm?
|
||||
url "${base}/webclaw-${tag}-aarch64-apple-darwin.tar.gz"
|
||||
url "${base}/noxa-${tag}-aarch64-apple-darwin.tar.gz"
|
||||
sha256 "${SHA_MAC_ARM}"
|
||||
else
|
||||
url "${base}/webclaw-${tag}-x86_64-apple-darwin.tar.gz"
|
||||
url "${base}/noxa-${tag}-x86_64-apple-darwin.tar.gz"
|
||||
sha256 "${SHA_MAC_X86}"
|
||||
end
|
||||
end
|
||||
|
||||
on_linux do
|
||||
if Hardware::CPU.arm?
|
||||
url "${base}/webclaw-${tag}-aarch64-unknown-linux-gnu.tar.gz"
|
||||
url "${base}/noxa-${tag}-aarch64-unknown-linux-gnu.tar.gz"
|
||||
sha256 "${SHA_LINUX_ARM}"
|
||||
else
|
||||
url "${base}/webclaw-${tag}-x86_64-unknown-linux-gnu.tar.gz"
|
||||
url "${base}/noxa-${tag}-x86_64-unknown-linux-gnu.tar.gz"
|
||||
sha256 "${SHA_LINUX_X86}"
|
||||
end
|
||||
end
|
||||
|
||||
def install
|
||||
bin.install "webclaw"
|
||||
bin.install "webclaw-mcp"
|
||||
bin.install "noxa"
|
||||
bin.install "noxa-mcp"
|
||||
end
|
||||
|
||||
test do
|
||||
assert_match "webclaw", shell_output("#{bin}/webclaw --version")
|
||||
assert_match "noxa", shell_output("#{bin}/noxa --version")
|
||||
end
|
||||
end
|
||||
FORMULA
|
||||
|
||||
# Remove leading whitespace from heredoc
|
||||
sed -i 's/^ //' webclaw.rb
|
||||
sed -i 's/^ //' noxa.rb
|
||||
|
||||
# Push to homebrew tap
|
||||
git clone "https://x-access-token:${COMMITTER_TOKEN}@github.com/0xMassi/homebrew-webclaw.git" tap
|
||||
cp webclaw.rb tap/Formula/webclaw.rb
|
||||
git clone "https://x-access-token:${COMMITTER_TOKEN}@github.com/0xMassi/homebrew-noxa.git" tap
|
||||
cp noxa.rb tap/Formula/noxa.rb
|
||||
cd tap
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "github-actions[bot]@users.noreply.github.com"
|
||||
git add Formula/webclaw.rb
|
||||
git diff --cached --quiet || git commit -m "Update webclaw to ${tag}"
|
||||
git add Formula/noxa.rb
|
||||
git diff --cached --quiet || git commit -m "Update noxa to ${tag}"
|
||||
git push
|
||||
|
|
|
|||
24
CHANGELOG.md
24
CHANGELOG.md
|
|
@ -1,6 +1,6 @@
|
|||
# Changelog
|
||||
|
||||
All notable changes to webclaw are documented here.
|
||||
All notable changes to noxa are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.3.11] — 2026-04-10
|
||||
|
|
@ -33,19 +33,19 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
|
|||
## [0.3.8] — 2026-04-03
|
||||
|
||||
### Fixed
|
||||
- **MCP research token overflow**: research results are now saved to `~/.webclaw/research/` and the MCP tool returns file paths + findings instead of the full report. Prevents "exceeds maximum allowed tokens" errors in Claude/Cursor.
|
||||
- **MCP research token overflow**: research results are now saved to `~/.noxa/research/` and the MCP tool returns file paths + findings instead of the full report. Prevents "exceeds maximum allowed tokens" errors in Claude/Cursor.
|
||||
- **Research caching**: same query returns cached result instantly without spending credits.
|
||||
- **Anthropic rate limit throttling**: 60s delay between LLM calls in research to stay under Tier 1 limits (50K input tokens/min).
|
||||
|
||||
### Added
|
||||
- **`dirs` dependency** for `~/.webclaw/research/` path resolution.
|
||||
- **`dirs` dependency** for `~/.noxa/research/` path resolution.
|
||||
|
||||
---
|
||||
## [0.3.7] — 2026-04-03
|
||||
|
||||
### Added
|
||||
- **`--research` CLI flag**: run deep research via the cloud API. Prints report to stdout and saves full result (report + sources + findings) to a JSON file. Supports `--deep` for longer reports.
|
||||
- **MCP extract/summarize cloud fallback**: when no local LLM is available, these tools now fall back to the cloud API instead of erroring. Set `WEBCLAW_API_KEY` for automatic fallback.
|
||||
- **MCP extract/summarize cloud fallback**: when no local LLM is available, these tools now fall back to the cloud API instead of erroring. Set `NOXA_API_KEY` for automatic fallback.
|
||||
- **MCP research structured output**: the research tool now returns structured JSON (report + sources + findings + metadata) instead of raw text, so agents can reference individual findings and source URLs.
|
||||
|
||||
---
|
||||
|
|
@ -80,13 +80,13 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
|
|||
## [0.3.3] — 2026-04-01
|
||||
|
||||
### Changed
|
||||
- **Replaced custom TLS stack with wreq**: migrated from webclaw-tls (patched rustls/h2/hyper/reqwest) to [wreq](https://github.com/0x676e67/wreq) by [@0x676e67](https://github.com/0x676e67). wreq uses BoringSSL for TLS and the [http2](https://github.com/0x676e67/http2) crate for HTTP/2 fingerprinting — both battle-tested with 60+ browser profiles.
|
||||
- **Removed all `[patch.crates-io]` entries**: consumers no longer need to patch rustls, h2, hyper, hyper-util, or reqwest. Just depend on webclaw normally.
|
||||
- **Replaced custom TLS stack with wreq**: migrated from noxa-tls (patched rustls/h2/hyper/reqwest) to [wreq](https://github.com/0x676e67/wreq) by [@0x676e67](https://github.com/0x676e67). wreq uses BoringSSL for TLS and the [http2](https://github.com/0x676e67/http2) crate for HTTP/2 fingerprinting — both battle-tested with 60+ browser profiles.
|
||||
- **Removed all `[patch.crates-io]` entries**: consumers no longer need to patch rustls, h2, hyper, hyper-util, or reqwest. Just depend on noxa normally.
|
||||
- **Browser profiles rebuilt on wreq's Emulation API**: Chrome 145, Firefox 135, Safari 18, Edge 145 with correct TLS options (cipher suites, curves, GREASE, ECH, PSK session resumption), HTTP/2 SETTINGS ordering, pseudo-header order, and header wire order.
|
||||
- **Better TLS compatibility**: BoringSSL handles more server configurations than patched rustls (e.g. servers that previously returned IllegalParameter alerts).
|
||||
|
||||
### Removed
|
||||
- webclaw-tls dependency and all 5 forked crates (webclaw-rustls, webclaw-h2, webclaw-hyper, webclaw-hyper-util, webclaw-reqwest).
|
||||
- noxa-tls dependency and all 5 forked crates (noxa-rustls, noxa-h2, noxa-hyper, noxa-hyper-util, noxa-reqwest).
|
||||
|
||||
### Acknowledgments
|
||||
- TLS and HTTP/2 fingerprinting powered by [wreq](https://github.com/0x676e67/wreq) and [http2](https://github.com/0x676e67/http2) by [@0x676e67](https://github.com/0x676e67), who pioneered browser-grade HTTP/2 fingerprinting in Rust.
|
||||
|
|
@ -114,7 +114,7 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
|
|||
## [0.3.0] — 2026-03-29
|
||||
|
||||
### Changed
|
||||
- **Replaced primp with webclaw-tls**: switched to custom TLS fingerprinting stack.
|
||||
- **Replaced primp with noxa-tls**: switched to custom TLS fingerprinting stack.
|
||||
- **Browser profiles**: Chrome 146 (Win/Mac), Firefox 135+, Safari 18, Edge 146 — captured from real browsers.
|
||||
- **HTTP/2 fingerprinting**: SETTINGS frame ordering and pseudo-header ordering based on concepts pioneered by [@0x676e67](https://github.com/0x676e67).
|
||||
|
||||
|
|
@ -141,7 +141,7 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
|
|||
## [0.2.1] — 2026-03-27
|
||||
|
||||
### Added
|
||||
- **Docker image on GHCR**: `docker run ghcr.io/0xmassi/webclaw` — auto-built on every release
|
||||
- **Docker image on GHCR**: `docker run ghcr.io/0xmassi/noxa` — auto-built on every release
|
||||
- **QuickJS data island extraction**: inline `<script>` execution catches `window.__PRELOADED_STATE__`, Next.js hydration data, and other JS-embedded content
|
||||
|
||||
### Fixed
|
||||
|
|
@ -205,7 +205,7 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
|
|||
### Added
|
||||
- Crawl streaming: real-time progress on stderr as pages complete (`[2/50] OK https://... (234ms, 1523 words)`)
|
||||
- Crawl resume/cancel: `--crawl-state <path>` saves progress on Ctrl+C and resumes from where it left off
|
||||
- MCP server proxy support via `WEBCLAW_PROXY` and `WEBCLAW_PROXY_FILE` env vars
|
||||
- MCP server proxy support via `NOXA_PROXY` and `NOXA_PROXY_FILE` env vars
|
||||
|
||||
### Changed
|
||||
- Crawl results now expose visited set and remaining frontier for accurate state persistence
|
||||
|
|
@ -222,14 +222,14 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
|
|||
- Reddit scraping: use plain HTTP client for `.json` endpoint (TLS fingerprinting was getting blocked)
|
||||
|
||||
### Added
|
||||
- YouTube transcript extraction infrastructure in webclaw-core (caption track parsing, timed text XML parser) — wired up when cloud API launches
|
||||
- YouTube transcript extraction infrastructure in noxa-core (caption track parsing, timed text XML parser) — wired up when cloud API launches
|
||||
|
||||
---
|
||||
|
||||
## [0.1.1] — 2026-03-24
|
||||
|
||||
### Fixed
|
||||
- MCP server now identifies as `webclaw-mcp` instead of `rmcp` in the MCP handshake
|
||||
- MCP server now identifies as `noxa-mcp` instead of `rmcp` in the MCP handshake
|
||||
- Research tool polling caps at 200 iterations (~10 min) instead of looping forever
|
||||
- CLI returns non-zero exit codes on errors (invalid format, fetch failures, missing LLM)
|
||||
- Text format output strips markdown table syntax (`| --- |` pipes)
|
||||
|
|
|
|||
76
CLAUDE.md
76
CLAUDE.md
|
|
@ -1,30 +1,30 @@
|
|||
# Webclaw
|
||||
# Noxa
|
||||
|
||||
Rust workspace: CLI + MCP server for web content extraction into LLM-optimized formats.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
webclaw/
|
||||
noxa/
|
||||
crates/
|
||||
webclaw-core/ # Pure extraction engine. WASM-safe. Zero network deps.
|
||||
noxa-core/ # Pure extraction engine. WASM-safe. Zero network deps.
|
||||
# + ExtractionOptions (include/exclude CSS selectors)
|
||||
# + diff engine (change tracking)
|
||||
# + brand extraction (DOM/CSS analysis)
|
||||
webclaw-fetch/ # HTTP client via primp. Crawler. Sitemap discovery. Batch ops.
|
||||
noxa-fetch/ # HTTP client via primp. Crawler. Sitemap discovery. Batch ops.
|
||||
# + proxy pool rotation (per-request)
|
||||
# + PDF content-type detection
|
||||
# + document parsing (DOCX, XLSX, CSV)
|
||||
webclaw-llm/ # LLM provider chain (Ollama -> OpenAI -> Anthropic)
|
||||
noxa-llm/ # LLM provider chain (Ollama -> OpenAI -> Anthropic)
|
||||
# + JSON schema extraction, prompt extraction, summarization
|
||||
webclaw-pdf/ # PDF text extraction via pdf-extract
|
||||
webclaw-mcp/ # MCP server (Model Context Protocol) for AI agents
|
||||
webclaw-cli/ # CLI binary
|
||||
noxa-pdf/ # PDF text extraction via pdf-extract
|
||||
noxa-mcp/ # MCP server (Model Context Protocol) for AI agents
|
||||
noxa/ # CLI binary
|
||||
```
|
||||
|
||||
Two binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server).
|
||||
Two binaries: `noxa` (CLI), `noxa-mcp` (MCP server).
|
||||
|
||||
### Core Modules (`webclaw-core`)
|
||||
### Core Modules (`noxa-core`)
|
||||
- `extractor.rs` — Readability-style scoring: text density, semantic tags, link density penalty
|
||||
- `noise.rs` — Shared noise filter: tags, ARIA roles, class/ID patterns. Tailwind-safe.
|
||||
- `data_island.rs` — JSON data island extraction for React SPAs, Next.js, Contentful CMS
|
||||
|
|
@ -37,7 +37,7 @@ Two binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server).
|
|||
- `diff.rs` — Content change tracking engine (snapshot diffing)
|
||||
- `brand.rs` — Brand identity extraction from DOM structure and CSS
|
||||
|
||||
### Fetch Modules (`webclaw-fetch`)
|
||||
### Fetch Modules (`noxa-fetch`)
|
||||
- `client.rs` — FetchClient with primp TLS impersonation
|
||||
- `browser.rs` — Browser profiles: Chrome (142/136/133/131), Firefox (144/135/133/128)
|
||||
- `crawler.rs` — BFS same-origin crawler with configurable depth/concurrency/delay
|
||||
|
|
@ -47,14 +47,14 @@ Two binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server).
|
|||
- `document.rs` — Document parsing: DOCX, XLSX, CSV auto-detection and extraction
|
||||
- `search.rs` — Web search via Serper.dev with parallel result scraping
|
||||
|
||||
### LLM Modules (`webclaw-llm`)
|
||||
### LLM Modules (`noxa-llm`)
|
||||
- Provider chain: Ollama (local-first) -> OpenAI -> Anthropic
|
||||
- JSON schema extraction, prompt-based extraction, summarization
|
||||
|
||||
### PDF Modules (`webclaw-pdf`)
|
||||
### PDF Modules (`noxa-pdf`)
|
||||
- PDF text extraction via pdf-extract crate
|
||||
|
||||
### MCP Server (`webclaw-mcp`)
|
||||
### MCP Server (`noxa-mcp`)
|
||||
- Model Context Protocol server over stdio transport
|
||||
- 8 tools: scrape, crawl, map, batch, extract, summarize, diff, brand
|
||||
- Works with Claude Desktop, Claude Code, and any MCP client
|
||||
|
|
@ -65,7 +65,7 @@ Two binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server).
|
|||
- **Core has ZERO network dependencies** — takes `&str` HTML, returns structured output. Keep it WASM-compatible.
|
||||
- **primp requires `[patch.crates-io]`** for patched rustls/h2 forks at workspace level.
|
||||
- **RUSTFLAGS are set in `.cargo/config.toml`** — no need to pass manually.
|
||||
- **webclaw-llm uses plain reqwest** (NOT primp-patched). LLM APIs don't need TLS fingerprinting.
|
||||
- **noxa-llm uses plain reqwest** (NOT primp-patched). LLM APIs don't need TLS fingerprinting.
|
||||
- **qwen3 thinking tags** (`<think>`) are stripped at both provider and consumer levels.
|
||||
|
||||
## Build & Test
|
||||
|
|
@ -73,52 +73,52 @@ Two binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server).
|
|||
```bash
|
||||
cargo build --release # Both binaries
|
||||
cargo test --workspace # All tests
|
||||
cargo test -p webclaw-core # Core only
|
||||
cargo test -p webclaw-llm # LLM only
|
||||
cargo test -p noxa-core # Core only
|
||||
cargo test -p noxa-llm # LLM only
|
||||
```
|
||||
|
||||
## CLI
|
||||
|
||||
```bash
|
||||
# Basic extraction
|
||||
webclaw https://example.com
|
||||
webclaw https://example.com --format llm
|
||||
noxa https://example.com
|
||||
noxa https://example.com --format llm
|
||||
|
||||
# Content filtering
|
||||
webclaw https://example.com --include "article" --exclude "nav,footer"
|
||||
webclaw https://example.com --only-main-content
|
||||
noxa https://example.com --include "article" --exclude "nav,footer"
|
||||
noxa https://example.com --only-main-content
|
||||
|
||||
# Batch + proxy rotation
|
||||
webclaw url1 url2 url3 --proxy-file proxies.txt
|
||||
webclaw --urls-file urls.txt --concurrency 10
|
||||
noxa url1 url2 url3 --proxy-file proxies.txt
|
||||
noxa --urls-file urls.txt --concurrency 10
|
||||
|
||||
# Sitemap discovery
|
||||
webclaw https://docs.example.com --map
|
||||
noxa https://docs.example.com --map
|
||||
|
||||
# Crawling (with sitemap seeding)
|
||||
webclaw https://docs.example.com --crawl --depth 2 --max-pages 50 --sitemap
|
||||
noxa https://docs.example.com --crawl --depth 2 --max-pages 50 --sitemap
|
||||
|
||||
# Change tracking
|
||||
webclaw https://example.com -f json > snap.json
|
||||
webclaw https://example.com --diff-with snap.json
|
||||
noxa https://example.com -f json > snap.json
|
||||
noxa https://example.com --diff-with snap.json
|
||||
|
||||
# Brand extraction
|
||||
webclaw https://example.com --brand
|
||||
noxa https://example.com --brand
|
||||
|
||||
# LLM features (Ollama local-first)
|
||||
webclaw https://example.com --summarize
|
||||
webclaw https://example.com --extract-prompt "Get all pricing tiers"
|
||||
webclaw https://example.com --extract-json '{"type":"object","properties":{"title":{"type":"string"}}}'
|
||||
noxa https://example.com --summarize
|
||||
noxa https://example.com --extract-prompt "Get all pricing tiers"
|
||||
noxa https://example.com --extract-json '{"type":"object","properties":{"title":{"type":"string"}}}'
|
||||
|
||||
# PDF (auto-detected via Content-Type)
|
||||
webclaw https://example.com/report.pdf
|
||||
noxa https://example.com/report.pdf
|
||||
|
||||
# Browser impersonation: chrome (default), firefox, random
|
||||
webclaw https://example.com --browser firefox
|
||||
noxa https://example.com --browser firefox
|
||||
|
||||
# Local file / stdin
|
||||
webclaw --file page.html
|
||||
cat page.html | webclaw --stdin
|
||||
noxa --file page.html
|
||||
cat page.html | noxa --stdin
|
||||
```
|
||||
|
||||
## Key Thresholds
|
||||
|
|
@ -135,8 +135,8 @@ Add to Claude Desktop config (`~/Library/Application Support/Claude/claude_deskt
|
|||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"webclaw": {
|
||||
"command": "/path/to/webclaw-mcp"
|
||||
"noxa": {
|
||||
"command": "/path/to/noxa-mcp"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -152,5 +152,5 @@ Add to Claude Desktop config (`~/Library/Application Support/Claude/claude_deskt
|
|||
|
||||
## Git
|
||||
|
||||
- Remote: `git@github.com:0xMassi/webclaw.git`
|
||||
- Remote: `git@github.com:jmagar/noxa.git`
|
||||
- Use `/commit` skill for commits
|
||||
|
|
|
|||
|
|
@ -1,38 +0,0 @@
|
|||
# Contributor Covenant Code of Conduct
|
||||
|
||||
## Our Pledge
|
||||
|
||||
We as members, contributors, and leaders pledge to make participation in our
|
||||
community a harassment-free experience for everyone, regardless of age, body
|
||||
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||
identity and expression, level of experience, education, socio-economic status,
|
||||
nationality, personal appearance, race, caste, color, religion, or sexual
|
||||
identity and orientation.
|
||||
|
||||
## Our Standards
|
||||
|
||||
Examples of behavior that contributes to a positive environment:
|
||||
|
||||
* Using welcoming and inclusive language
|
||||
* Being respectful of differing viewpoints and experiences
|
||||
* Gracefully accepting constructive criticism
|
||||
* Focusing on what is best for the community
|
||||
* Showing empathy towards other community members
|
||||
|
||||
Examples of unacceptable behavior:
|
||||
|
||||
* The use of sexualized language or imagery, and sexual attention or advances of any kind
|
||||
* Trolling, insulting or derogatory comments, and personal or political attacks
|
||||
* Public or private harassment
|
||||
* Publishing others' private information without explicit permission
|
||||
* Other conduct which could reasonably be considered inappropriate in a professional setting
|
||||
|
||||
## Enforcement
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
reported to the project maintainers at **admin@webclaw.io**. All complaints
|
||||
will be reviewed and investigated promptly and fairly.
|
||||
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), version 2.1.
|
||||
118
CONTRIBUTING.md
118
CONTRIBUTING.md
|
|
@ -1,118 +0,0 @@
|
|||
# Contributing to Webclaw
|
||||
|
||||
Thanks for your interest in contributing. This document covers the essentials.
|
||||
|
||||
## Development Setup
|
||||
|
||||
1. Install Rust 1.85+ (edition 2024 required):
|
||||
```bash
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
```
|
||||
|
||||
2. Clone and build:
|
||||
```bash
|
||||
git clone https://github.com/0xMassi/webclaw.git
|
||||
cd webclaw
|
||||
cargo build --release
|
||||
```
|
||||
|
||||
RUSTFLAGS are configured in `.cargo/config.toml` -- no manual flags needed.
|
||||
|
||||
3. Optional: run `./setup.sh` for environment bootstrapping.
|
||||
|
||||
## Running Tests
|
||||
|
||||
```bash
|
||||
cargo test --workspace # All crates
|
||||
cargo test -p webclaw-core # Single crate
|
||||
```
|
||||
|
||||
## Linting
|
||||
|
||||
```bash
|
||||
cargo clippy --all -- -D warnings
|
||||
cargo fmt --check --all
|
||||
```
|
||||
|
||||
Both must pass cleanly before submitting a PR.
|
||||
|
||||
## Code Style
|
||||
|
||||
- Rust edition 2024, formatted with `rustfmt` (see `rustfmt.toml`, `style_edition = "2024"`)
|
||||
- `webclaw-core` has zero network dependencies -- keep it WASM-safe
|
||||
- `webclaw-llm` uses plain `reqwest` — LLM APIs don't need TLS fingerprinting
|
||||
- Prefer returning `Result` over panicking. No `.unwrap()` on untrusted input.
|
||||
- Doc comments on all public items. Explain *why*, not *what*.
|
||||
|
||||
## Pull Request Process
|
||||
|
||||
1. Fork the repository and create a feature branch:
|
||||
```bash
|
||||
git checkout -b feat/my-feature
|
||||
```
|
||||
|
||||
2. Make your changes. Write tests for new functionality.
|
||||
|
||||
3. Ensure all checks pass:
|
||||
```bash
|
||||
cargo test --workspace
|
||||
cargo clippy --all -- -D warnings
|
||||
cargo fmt --check --all
|
||||
```
|
||||
|
||||
4. Push and open a pull request against `main`.
|
||||
|
||||
5. PRs require review before merging. Keep changes focused -- one concern per PR.
|
||||
|
||||
## Commit Messages
|
||||
|
||||
Follow [Conventional Commits](https://www.conventionalcommits.org/):
|
||||
|
||||
```
|
||||
feat: add PDF table extraction
|
||||
fix: handle malformed sitemap XML gracefully
|
||||
refactor: simplify crawler BFS loop
|
||||
docs: update MCP setup instructions
|
||||
test: add glob_match edge cases
|
||||
chore: bump dependencies
|
||||
```
|
||||
|
||||
Use the imperative mood ("add", not "added"). Keep the subject under 72 characters.
|
||||
Body is optional but encouraged for non-trivial changes.
|
||||
|
||||
## Reporting Issues
|
||||
|
||||
- Search existing issues before opening a new one
|
||||
- Include: Rust version, OS, steps to reproduce, expected vs actual behavior
|
||||
- For extraction bugs: include the URL (or HTML snippet) and the output format used
|
||||
- Security issues: email directly instead of opening a public issue
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
webclaw (this repo)
|
||||
├── crates/
|
||||
│ ├── webclaw-core/ # Pure extraction engine (HTML → markdown/json/text)
|
||||
│ ├── webclaw-fetch/ # HTTP client + crawler + sitemap + batch
|
||||
│ ├── webclaw-llm/ # LLM provider chain (Ollama → OpenAI → Anthropic)
|
||||
│ ├── webclaw-pdf/ # PDF text extraction
|
||||
│ ├── webclaw-cli/ # CLI binary
|
||||
│ └── webclaw-mcp/ # MCP server binary
|
||||
│
|
||||
└── [patch.crates-io] # Points to webclaw-tls for TLS fingerprinting
|
||||
```
|
||||
|
||||
TLS fingerprinting lives in a separate repo: [webclaw-tls](https://github.com/0xMassi/webclaw-tls). The `[patch.crates-io]` section in `Cargo.toml` overrides rustls, h2, hyper, hyper-util, and reqwest with our patched forks for browser-grade JA4 + HTTP/2 Akamai fingerprinting.
|
||||
|
||||
## Crate Boundaries
|
||||
|
||||
Changes that cross crate boundaries need extra care:
|
||||
|
||||
| Crate | Network? | Key constraint |
|
||||
|-------|----------|----------------|
|
||||
| webclaw-core | No | Zero network deps, WASM-safe |
|
||||
| webclaw-fetch | Yes (webclaw-http) | Uses [webclaw-tls](https://github.com/0xMassi/webclaw-tls) for TLS fingerprinting |
|
||||
| webclaw-llm | Yes (reqwest) | Plain reqwest — LLM APIs don't need TLS fingerprinting |
|
||||
| webclaw-pdf | No | Minimal, wraps pdf-extract |
|
||||
| webclaw-cli | Yes | Depends on all above |
|
||||
| webclaw-mcp | Yes | MCP server via rmcp |
|
||||
206
Cargo.lock
generated
206
Cargo.lock
generated
|
|
@ -1618,6 +1618,109 @@ dependencies = [
|
|||
"minimal-lexical",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "noxa-cli"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
"noxa-core",
|
||||
"noxa-fetch",
|
||||
"noxa-llm",
|
||||
"noxa-pdf",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "noxa-core"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"rquickjs",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"similar",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "noxa-fetch"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"calamine",
|
||||
"http",
|
||||
"noxa-core",
|
||||
"noxa-pdf",
|
||||
"quick-xml 0.37.5",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"url",
|
||||
"wreq",
|
||||
"zip 2.4.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "noxa-llm"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "noxa-mcp"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
"noxa-core",
|
||||
"noxa-fetch",
|
||||
"noxa-llm",
|
||||
"noxa-pdf",
|
||||
"reqwest",
|
||||
"rmcp",
|
||||
"schemars",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "noxa-pdf"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.50.3"
|
||||
|
|
@ -3100,109 +3203,6 @@ dependencies = [
|
|||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"webclaw-core",
|
||||
"webclaw-fetch",
|
||||
"webclaw-llm",
|
||||
"webclaw-pdf",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"rquickjs",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"similar",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"calamine",
|
||||
"http",
|
||||
"quick-xml 0.37.5",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"url",
|
||||
"webclaw-core",
|
||||
"webclaw-pdf",
|
||||
"wreq",
|
||||
"zip 2.4.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
"reqwest",
|
||||
"rmcp",
|
||||
"schemars",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"webclaw-core",
|
||||
"webclaw-fetch",
|
||||
"webclaw-llm",
|
||||
"webclaw-pdf",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki-root-certs"
|
||||
version = "1.0.6"
|
||||
|
|
|
|||
11
Cargo.toml
11
Cargo.toml
|
|
@ -6,13 +6,13 @@ members = ["crates/*"]
|
|||
version = "0.3.11"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
repository = "https://github.com/jmagar/noxa"
|
||||
|
||||
[workspace.dependencies]
|
||||
webclaw-core = { path = "crates/webclaw-core" }
|
||||
webclaw-fetch = { path = "crates/webclaw-fetch" }
|
||||
webclaw-llm = { path = "crates/webclaw-llm" }
|
||||
webclaw-pdf = { path = "crates/webclaw-pdf" }
|
||||
noxa-core = { path = "crates/noxa-core" }
|
||||
noxa-fetch = { path = "crates/noxa-fetch" }
|
||||
noxa-llm = { path = "crates/noxa-llm" }
|
||||
noxa-pdf = { path = "crates/noxa-pdf" }
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
|
@ -21,4 +21,3 @@ tracing = "0.1"
|
|||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
clap = { version = "4", features = ["derive", "env"] }
|
||||
dotenvy = "0.15"
|
||||
|
||||
|
|
|
|||
34
Dockerfile
34
Dockerfile
|
|
@ -1,5 +1,5 @@
|
|||
# webclaw — Multi-stage Docker build
|
||||
# Produces 2 binaries: webclaw (CLI) and webclaw-mcp (MCP server)
|
||||
# noxa — Multi-stage Docker build
|
||||
# Produces 2 binaries: noxa (CLI) and noxa-mcp (MCP server)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stage 1: Build all binaries in release mode
|
||||
|
|
@ -19,23 +19,23 @@ WORKDIR /build
|
|||
# Copy manifests + lock first for better layer caching.
|
||||
# If only source changes, cargo doesn't re-download deps.
|
||||
COPY Cargo.toml Cargo.lock ./
|
||||
COPY crates/webclaw-core/Cargo.toml crates/webclaw-core/Cargo.toml
|
||||
COPY crates/webclaw-fetch/Cargo.toml crates/webclaw-fetch/Cargo.toml
|
||||
COPY crates/webclaw-llm/Cargo.toml crates/webclaw-llm/Cargo.toml
|
||||
COPY crates/webclaw-pdf/Cargo.toml crates/webclaw-pdf/Cargo.toml
|
||||
COPY crates/webclaw-mcp/Cargo.toml crates/webclaw-mcp/Cargo.toml
|
||||
COPY crates/webclaw-cli/Cargo.toml crates/webclaw-cli/Cargo.toml
|
||||
COPY crates/noxa-core/Cargo.toml crates/noxa-core/Cargo.toml
|
||||
COPY crates/noxa-fetch/Cargo.toml crates/noxa-fetch/Cargo.toml
|
||||
COPY crates/noxa-llm/Cargo.toml crates/noxa-llm/Cargo.toml
|
||||
COPY crates/noxa-pdf/Cargo.toml crates/noxa-pdf/Cargo.toml
|
||||
COPY crates/noxa-mcp/Cargo.toml crates/noxa-mcp/Cargo.toml
|
||||
COPY crates/noxa-cli/Cargo.toml crates/noxa-cli/Cargo.toml
|
||||
|
||||
# Copy .cargo config if present (optional build flags)
|
||||
COPY .cargo .cargo
|
||||
|
||||
# Create dummy source files so cargo can resolve deps and cache them.
|
||||
RUN mkdir -p crates/webclaw-core/src && echo "" > crates/webclaw-core/src/lib.rs \
|
||||
&& mkdir -p crates/webclaw-fetch/src && echo "" > crates/webclaw-fetch/src/lib.rs \
|
||||
&& mkdir -p crates/webclaw-llm/src && echo "" > crates/webclaw-llm/src/lib.rs \
|
||||
&& mkdir -p crates/webclaw-pdf/src && echo "" > crates/webclaw-pdf/src/lib.rs \
|
||||
&& mkdir -p crates/webclaw-mcp/src && echo "fn main() {}" > crates/webclaw-mcp/src/main.rs \
|
||||
&& mkdir -p crates/webclaw-cli/src && echo "fn main() {}" > crates/webclaw-cli/src/main.rs
|
||||
RUN mkdir -p crates/noxa-core/src && echo "" > crates/noxa-core/src/lib.rs \
|
||||
&& mkdir -p crates/noxa-fetch/src && echo "" > crates/noxa-fetch/src/lib.rs \
|
||||
&& mkdir -p crates/noxa-llm/src && echo "" > crates/noxa-llm/src/lib.rs \
|
||||
&& mkdir -p crates/noxa-pdf/src && echo "" > crates/noxa-pdf/src/lib.rs \
|
||||
&& mkdir -p crates/noxa-mcp/src && echo "fn main() {}" > crates/noxa-mcp/src/main.rs \
|
||||
&& mkdir -p crates/noxa-cli/src && echo "fn main() {}" > crates/noxa-cli/src/main.rs
|
||||
|
||||
# Pre-build dependencies (this layer is cached until Cargo.toml/lock changes)
|
||||
RUN cargo build --release 2>/dev/null || true
|
||||
|
|
@ -55,8 +55,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy both binaries
|
||||
COPY --from=builder /build/target/release/webclaw /usr/local/bin/webclaw
|
||||
COPY --from=builder /build/target/release/webclaw-mcp /usr/local/bin/webclaw-mcp
|
||||
COPY --from=builder /build/target/release/noxa /usr/local/bin/noxa
|
||||
COPY --from=builder /build/target/release/noxa-mcp /usr/local/bin/noxa-mcp
|
||||
|
||||
# Default: run the CLI
|
||||
CMD ["webclaw"]
|
||||
CMD ["noxa"]
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ARG BINARY_DIR
|
||||
COPY ${BINARY_DIR}/webclaw /usr/local/bin/webclaw
|
||||
COPY ${BINARY_DIR}/webclaw-mcp /usr/local/bin/webclaw-mcp
|
||||
COPY ${BINARY_DIR}/noxa /usr/local/bin/noxa
|
||||
COPY ${BINARY_DIR}/noxa-mcp /usr/local/bin/noxa-mcp
|
||||
|
||||
CMD ["webclaw"]
|
||||
CMD ["noxa"]
|
||||
|
|
|
|||
661
LICENSE
661
LICENSE
|
|
@ -1,661 +0,0 @@
|
|||
GNU AFFERO GENERAL PUBLIC LICENSE
|
||||
Version 3, 19 November 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU Affero General Public License is a free, copyleft license for
|
||||
software and other kinds of works, specifically designed to ensure
|
||||
cooperation with the community in the case of network server software.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
our General Public Licenses are intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
Developers that use our General Public Licenses protect your rights
|
||||
with two steps: (1) assert copyright on the software, and (2) offer
|
||||
you this License which gives you legal permission to copy, distribute
|
||||
and/or modify the software.
|
||||
|
||||
A secondary benefit of defending all users' freedom is that
|
||||
improvements made in alternate versions of the program, if they
|
||||
receive widespread use, become available for other developers to
|
||||
incorporate. Many developers of free software are heartened and
|
||||
encouraged by the resulting cooperation. However, in the case of
|
||||
software used on network servers, this result may fail to come about.
|
||||
The GNU General Public License permits making a modified version and
|
||||
letting the public access it on a server without ever releasing its
|
||||
source code to the public.
|
||||
|
||||
The GNU Affero General Public License is designed specifically to
|
||||
ensure that, in such cases, the modified source code becomes available
|
||||
to the community. It requires the operator of a network server to
|
||||
provide the source code of the modified version running there to the
|
||||
users of that server. Therefore, public use of a modified version, on
|
||||
a publicly accessible server, gives the public access to the source
|
||||
code of the modified version.
|
||||
|
||||
An older license, called the Affero General Public License and
|
||||
published by Affero, was designed to accomplish similar goals. This is
|
||||
a different license, not a version of the Affero GPL, but Affero has
|
||||
released a new version of the Affero GPL which permits relicensing under
|
||||
this license.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU Affero General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Remote Network Interaction; Use with the GNU General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, if you modify the
|
||||
Program, your modified version must prominently offer all users
|
||||
interacting with it remotely through a computer network (if your version
|
||||
supports such interaction) an opportunity to receive the Corresponding
|
||||
Source of your version by providing access to the Corresponding Source
|
||||
from a network server at no charge, through some standard or customary
|
||||
means of facilitating copying of software. This Corresponding Source
|
||||
shall include the Corresponding Source for any work covered by version 3
|
||||
of the GNU General Public License that is incorporated pursuant to the
|
||||
following paragraph.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the work with which it is combined will remain governed by version
|
||||
3 of the GNU General Public License.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU Affero General Public License from time to time. Such new versions
|
||||
will be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU Affero General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU Affero General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU Affero General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If your software can interact with users remotely through a computer
|
||||
network, you should also make sure that it provides a way for users to
|
||||
get its source. For example, if your program is a web application, its
|
||||
interface could display a "Source" link that leads users to an archive
|
||||
of the code. There are many ways you could offer source, and different
|
||||
solutions will be better for different programs; see section 13 for the
|
||||
specific requirements.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU AGPL, see
|
||||
<https://www.gnu.org/licenses/>.
|
||||
144
README.md
144
README.md
|
|
@ -1,6 +1,6 @@
|
|||
<p align="center">
|
||||
<a href="https://webclaw.io">
|
||||
<img src=".github/banner.png" alt="webclaw" width="700" />
|
||||
<a href="https://noxa.io">
|
||||
<img src=".github/banner.png" alt="noxa" width="700" />
|
||||
</a>
|
||||
</p>
|
||||
|
||||
|
|
@ -10,34 +10,33 @@
|
|||
</h3>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://github.com/0xMassi/webclaw/stargazers"><img src="https://img.shields.io/github/stars/0xMassi/webclaw?style=for-the-badge&logo=github&logoColor=white&label=Stars&color=181717" alt="Stars" /></a>
|
||||
<a href="https://github.com/0xMassi/webclaw/releases"><img src="https://img.shields.io/github/v/release/0xMassi/webclaw?style=for-the-badge&logo=rust&logoColor=white&label=Version&color=B7410E" alt="Version" /></a>
|
||||
<a href="https://github.com/0xMassi/webclaw/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-AGPL--3.0-10B981?style=for-the-badge" alt="License" /></a>
|
||||
<a href="https://www.npmjs.com/package/create-webclaw"><img src="https://img.shields.io/npm/dt/create-webclaw?style=for-the-badge&logo=npm&logoColor=white&label=Installs&color=CB3837" alt="npm installs" /></a>
|
||||
<a href="https://github.com/jmagar/noxa/stargazers"><img src="https://img.shields.io/github/stars/jmagar/noxa?style=for-the-badge&logo=github&logoColor=white&label=Stars&color=181717" alt="Stars" /></a>
|
||||
<a href="https://github.com/jmagar/noxa/releases"><img src="https://img.shields.io/github/v/release/jmagar/noxa?style=for-the-badge&logo=rust&logoColor=white&label=Version&color=B7410E" alt="Version" /></a>
|
||||
<a href="https://github.com/jmagar/noxa/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-AGPL--3.0-10B981?style=for-the-badge" alt="License" /></a>
|
||||
<a href="https://www.npmjs.com/package/create-noxa"><img src="https://img.shields.io/npm/dt/create-noxa?style=for-the-badge&logo=npm&logoColor=white&label=Installs&color=CB3837" alt="npm installs" /></a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<a href="https://discord.gg/KDfd48EpnW"><img src="https://img.shields.io/badge/Discord-Join-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord" /></a>
|
||||
<a href="https://x.com/webclaw_io"><img src="https://img.shields.io/badge/Follow-@webclaw__io-000000?style=for-the-badge&logo=x&logoColor=white" alt="X / Twitter" /></a>
|
||||
<a href="https://webclaw.io"><img src="https://img.shields.io/badge/Website-webclaw.io-0A0A0A?style=for-the-badge&logo=safari&logoColor=white" alt="Website" /></a>
|
||||
<a href="https://webclaw.io/docs"><img src="https://img.shields.io/badge/Docs-Read-3B82F6?style=for-the-badge&logo=readthedocs&logoColor=white" alt="Docs" /></a>
|
||||
<a href="https://x.com/noxa_io"><img src="https://img.shields.io/badge/Follow-@noxa__io-000000?style=for-the-badge&logo=x&logoColor=white" alt="X / Twitter" /></a>
|
||||
<a href="https://noxa.io"><img src="https://img.shields.io/badge/Website-noxa.io-0A0A0A?style=for-the-badge&logo=safari&logoColor=white" alt="Website" /></a>
|
||||
<a href="https://noxa.io/docs"><img src="https://img.shields.io/badge/Docs-Read-3B82F6?style=for-the-badge&logo=readthedocs&logoColor=white" alt="Docs" /></a>
|
||||
</p>
|
||||
|
||||
---
|
||||
|
||||
<p align="center">
|
||||
<img src="assets/demo.gif" alt="Claude Code: web_fetch gets 403, webclaw extracts successfully" width="700" />
|
||||
<img src="assets/demo.gif" alt="Claude Code: web_fetch gets 403, noxa extracts successfully" width="700" />
|
||||
<br/>
|
||||
<sub>Claude Code's built-in web_fetch → 403 Forbidden. webclaw → clean markdown.</sub>
|
||||
<sub>Claude Code's built-in web_fetch → 403 Forbidden. noxa → clean markdown.</sub>
|
||||
</p>
|
||||
|
||||
---
|
||||
|
||||
Your AI agent calls `fetch()` and gets a 403. Or 142KB of raw HTML that burns through your token budget. **webclaw fixes both.**
|
||||
Your AI agent calls `fetch()` and gets a 403. Or 142KB of raw HTML that burns through your token budget. **noxa fixes both.**
|
||||
|
||||
It extracts clean, structured content from any URL using Chrome-level TLS fingerprinting — no headless browser, no Selenium, no Puppeteer. Output is optimized for LLMs: **67% fewer tokens** than raw HTML, with metadata, links, and images preserved.
|
||||
|
||||
```
|
||||
Raw HTML webclaw
|
||||
Raw HTML noxa
|
||||
┌──────────────────────────────────┐ ┌──────────────────────────────────┐
|
||||
│ <div class="ad-wrapper"> │ │ # Breaking: AI Breakthrough │
|
||||
│ <nav class="global-nav"> │ │ │
|
||||
|
|
@ -59,7 +58,7 @@ It extracts clean, structured content from any URL using Chrome-level TLS finger
|
|||
### For AI agents (Claude, Cursor, Windsurf, VS Code)
|
||||
|
||||
```bash
|
||||
npx create-webclaw
|
||||
npx create-noxa
|
||||
```
|
||||
|
||||
Auto-detects your AI tools, downloads the MCP server, and configures everything. One command.
|
||||
|
|
@ -67,25 +66,25 @@ Auto-detects your AI tools, downloads the MCP server, and configures everything.
|
|||
### Homebrew (macOS/Linux)
|
||||
|
||||
```bash
|
||||
brew tap 0xMassi/webclaw
|
||||
brew install webclaw
|
||||
brew tap jmagar/noxa
|
||||
brew install noxa
|
||||
```
|
||||
|
||||
### Prebuilt binaries
|
||||
|
||||
Download from [GitHub Releases](https://github.com/0xMassi/webclaw/releases) for macOS (arm64, x86_64) and Linux (x86_64, aarch64).
|
||||
Download from [GitHub Releases](https://github.com/jmagar/noxa/releases) for macOS (arm64, x86_64) and Linux (x86_64, aarch64).
|
||||
|
||||
### Cargo (from source)
|
||||
|
||||
```bash
|
||||
cargo install --git https://github.com/0xMassi/webclaw.git webclaw-cli
|
||||
cargo install --git https://github.com/0xMassi/webclaw.git webclaw-mcp
|
||||
cargo install --git https://github.com/jmagar/noxa.git noxa
|
||||
cargo install --git https://github.com/jmagar/noxa.git noxa-mcp
|
||||
```
|
||||
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
docker run --rm ghcr.io/0xmassi/webclaw https://example.com
|
||||
docker run --rm ghcr.io/jmagar/noxa https://example.com
|
||||
```
|
||||
|
||||
### Docker Compose (with Ollama for LLM features)
|
||||
|
|
@ -97,9 +96,9 @@ docker compose up -d
|
|||
|
||||
---
|
||||
|
||||
## Why webclaw?
|
||||
## Why noxa?
|
||||
|
||||
| | webclaw | Firecrawl | Trafilatura | Readability |
|
||||
| | noxa | Firecrawl | Trafilatura | Readability |
|
||||
|---|:---:|:---:|:---:|:---:|
|
||||
| **Extraction accuracy** | **95.1%** | — | 80.6% | 83.5% |
|
||||
| **Token efficiency** | **-67%** | — | -55% | -51% |
|
||||
|
|
@ -110,14 +109,14 @@ docker compose up -d
|
|||
| **No browser required** | Yes | No | Yes | Yes |
|
||||
| **Cost** | Free | $$$$ | Free | Free |
|
||||
|
||||
**Choose webclaw if** you want fast local extraction, LLM-optimized output, and native AI agent integration.
|
||||
**Choose noxa if** you want fast local extraction, LLM-optimized output, and native AI agent integration.
|
||||
|
||||
---
|
||||
|
||||
## What it looks like
|
||||
|
||||
```bash
|
||||
$ webclaw https://stripe.com -f llm
|
||||
$ noxa https://stripe.com -f llm
|
||||
|
||||
> URL: https://stripe.com
|
||||
> Title: Stripe | Financial Infrastructure for the Internet
|
||||
|
|
@ -137,7 +136,7 @@ and commerce solutions for internet businesses of all sizes.
|
|||
```
|
||||
|
||||
```bash
|
||||
$ webclaw https://github.com --brand
|
||||
$ noxa https://github.com --brand
|
||||
|
||||
{
|
||||
"name": "GitHub",
|
||||
|
|
@ -148,7 +147,7 @@ $ webclaw https://github.com --brand
|
|||
```
|
||||
|
||||
```bash
|
||||
$ webclaw https://docs.rust-lang.org --crawl --depth 2 --max-pages 50
|
||||
$ noxa https://docs.rust-lang.org --crawl --depth 2 --max-pages 50
|
||||
|
||||
Crawling... 50/50 pages extracted
|
||||
---
|
||||
|
|
@ -162,12 +161,12 @@ Crawling... 50/50 pages extracted
|
|||
|
||||
## MCP Server — 10 tools for AI agents
|
||||
|
||||
<a href="https://glama.ai/mcp/servers/0xMassi/webclaw"><img src="https://glama.ai/mcp/servers/0xMassi/webclaw/badge" alt="webclaw MCP server" /></a>
|
||||
<a href="https://glama.ai/mcp/servers/jmagar/noxa"><img src="https://glama.ai/mcp/servers/jmagar/noxa/badge" alt="noxa MCP server" /></a>
|
||||
|
||||
webclaw ships as an MCP server that plugs into Claude Desktop, Claude Code, Cursor, Windsurf, OpenCode, Antigravity, Codex CLI, and any MCP-compatible client.
|
||||
noxa ships as an MCP server that plugs into Claude Desktop, Claude Code, Cursor, Windsurf, OpenCode, Antigravity, Codex CLI, and any MCP-compatible client.
|
||||
|
||||
```bash
|
||||
npx create-webclaw # auto-detects and configures everything
|
||||
npx create-noxa # auto-detects and configures everything
|
||||
```
|
||||
|
||||
Or manual setup — add to your Claude Desktop config:
|
||||
|
|
@ -175,8 +174,8 @@ Or manual setup — add to your Claude Desktop config:
|
|||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"webclaw": {
|
||||
"command": "~/.webclaw/webclaw-mcp"
|
||||
"noxa": {
|
||||
"command": "~/.noxa/noxa-mcp"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -217,45 +216,45 @@ Then in Claude: *"Scrape the top 5 results for 'web scraping tools' and compare
|
|||
### Content control
|
||||
|
||||
```bash
|
||||
webclaw URL --include "article, .content" # CSS selector include
|
||||
webclaw URL --exclude "nav, footer, .sidebar" # CSS selector exclude
|
||||
webclaw URL --only-main-content # Auto-detect main content
|
||||
noxa URL --include "article, .content" # CSS selector include
|
||||
noxa URL --exclude "nav, footer, .sidebar" # CSS selector exclude
|
||||
noxa URL --only-main-content # Auto-detect main content
|
||||
```
|
||||
|
||||
### Crawling
|
||||
|
||||
```bash
|
||||
webclaw URL --crawl --depth 3 --max-pages 100 # BFS same-origin crawl
|
||||
webclaw URL --crawl --sitemap # Seed from sitemap
|
||||
webclaw URL --map # Discover URLs only
|
||||
noxa URL --crawl --depth 3 --max-pages 100 # BFS same-origin crawl
|
||||
noxa URL --crawl --sitemap # Seed from sitemap
|
||||
noxa URL --map # Discover URLs only
|
||||
```
|
||||
|
||||
### LLM features (Ollama / OpenAI / Anthropic)
|
||||
|
||||
```bash
|
||||
webclaw URL --summarize # Page summary
|
||||
webclaw URL --extract-prompt "Get all prices" # Natural language extraction
|
||||
webclaw URL --extract-json '{"type":"object"}' # Schema-enforced extraction
|
||||
noxa URL --summarize # Page summary
|
||||
noxa URL --extract-prompt "Get all prices" # Natural language extraction
|
||||
noxa URL --extract-json '{"type":"object"}' # Schema-enforced extraction
|
||||
```
|
||||
|
||||
### Change tracking
|
||||
|
||||
```bash
|
||||
webclaw URL -f json > snap.json # Take snapshot
|
||||
webclaw URL --diff-with snap.json # Compare later
|
||||
noxa URL -f json > snap.json # Take snapshot
|
||||
noxa URL --diff-with snap.json # Compare later
|
||||
```
|
||||
|
||||
### Brand extraction
|
||||
|
||||
```bash
|
||||
webclaw URL --brand # Colors, fonts, logos, OG image
|
||||
noxa URL --brand # Colors, fonts, logos, OG image
|
||||
```
|
||||
|
||||
### Proxy rotation
|
||||
|
||||
```bash
|
||||
webclaw URL --proxy http://user:pass@host:port # Single proxy
|
||||
webclaw URLs --proxy-file proxies.txt # Pool rotation
|
||||
noxa URL --proxy http://user:pass@host:port # Single proxy
|
||||
noxa URLs --proxy-file proxies.txt # Pool rotation
|
||||
```
|
||||
|
||||
---
|
||||
|
|
@ -267,12 +266,12 @@ All numbers from real tests on 50 diverse pages. See [benchmarks/](benchmarks/)
|
|||
### Extraction quality
|
||||
|
||||
```
|
||||
Accuracy webclaw ███████████████████ 95.1%
|
||||
Accuracy noxa ███████████████████ 95.1%
|
||||
readability ████████████████▋ 83.5%
|
||||
trafilatura ████████████████ 80.6%
|
||||
newspaper3k █████████████▎ 66.4%
|
||||
|
||||
Noise removal webclaw ███████████████████ 96.1%
|
||||
Noise removal noxa ███████████████████ 96.1%
|
||||
readability █████████████████▊ 89.4%
|
||||
trafilatura ██████████████████▏ 91.2%
|
||||
newspaper3k ███████████████▎ 76.8%
|
||||
|
|
@ -281,11 +280,11 @@ Noise removal webclaw ██████████████████
|
|||
### Speed (pure extraction, no network)
|
||||
|
||||
```
|
||||
10KB page webclaw ██ 0.8ms
|
||||
10KB page noxa ██ 0.8ms
|
||||
readability █████ 2.1ms
|
||||
trafilatura ██████████ 4.3ms
|
||||
|
||||
100KB page webclaw ██ 3.2ms
|
||||
100KB page noxa ██ 3.2ms
|
||||
readability █████ 8.7ms
|
||||
trafilatura ██████████ 18.4ms
|
||||
```
|
||||
|
|
@ -297,11 +296,11 @@ Noise removal webclaw ██████████████████
|
|||
| Raw HTML | 4,820 | baseline |
|
||||
| readability | 2,340 | -51% |
|
||||
| trafilatura | 2,180 | -55% |
|
||||
| **webclaw llm** | **1,590** | **-67%** |
|
||||
| **noxa llm** | **1,590** | **-67%** |
|
||||
|
||||
### Crawl speed
|
||||
|
||||
| Concurrency | webclaw | Crawl4AI | Scrapy |
|
||||
| Concurrency | noxa | Crawl4AI | Scrapy |
|
||||
|:-----------:|:-------:|:--------:|:------:|
|
||||
| 5 | **9.8 pg/s** | 5.2 pg/s | 7.1 pg/s |
|
||||
| 10 | **18.4 pg/s** | 8.7 pg/s | 12.3 pg/s |
|
||||
|
|
@ -312,17 +311,17 @@ Noise removal webclaw ██████████████████
|
|||
## Architecture
|
||||
|
||||
```
|
||||
webclaw/
|
||||
noxa/
|
||||
crates/
|
||||
webclaw-core Pure extraction engine. Zero network deps. WASM-safe.
|
||||
webclaw-fetch HTTP client + TLS fingerprinting (wreq/BoringSSL). Crawler. Batch ops.
|
||||
webclaw-llm LLM provider chain (Ollama -> OpenAI -> Anthropic)
|
||||
webclaw-pdf PDF text extraction
|
||||
webclaw-mcp MCP server (10 tools for AI agents)
|
||||
webclaw-cli CLI binary
|
||||
noxa-core Pure extraction engine. Zero network deps. WASM-safe.
|
||||
noxa-fetch HTTP client + TLS fingerprinting (wreq/BoringSSL). Crawler. Batch ops.
|
||||
noxa-llm LLM provider chain (Ollama -> OpenAI -> Anthropic)
|
||||
noxa-pdf PDF text extraction
|
||||
noxa-mcp MCP server (10 tools for AI agents)
|
||||
noxa CLI binary
|
||||
```
|
||||
|
||||
`webclaw-core` takes raw HTML as a `&str` and returns structured output. No I/O, no network, no allocator tricks. Can compile to WASM.
|
||||
`noxa-core` takes raw HTML as a `&str` and returns structured output. No I/O, no network, no allocator tricks. Can compile to WASM.
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -330,18 +329,18 @@ webclaw/
|
|||
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `WEBCLAW_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) |
|
||||
| `NOXA_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) |
|
||||
| `OLLAMA_HOST` | Ollama URL for local LLM features (default: `http://localhost:11434`) |
|
||||
| `OPENAI_API_KEY` | OpenAI API key for LLM features |
|
||||
| `ANTHROPIC_API_KEY` | Anthropic API key for LLM features |
|
||||
| `WEBCLAW_PROXY` | Single proxy URL |
|
||||
| `WEBCLAW_PROXY_FILE` | Path to proxy pool file |
|
||||
| `NOXA_PROXY` | Single proxy URL |
|
||||
| `NOXA_PROXY_FILE` | Path to proxy pool file |
|
||||
|
||||
---
|
||||
|
||||
## Cloud API (optional)
|
||||
|
||||
For bot-protected sites, JS rendering, and advanced features, webclaw offers a hosted API at [webclaw.io](https://webclaw.io).
|
||||
For bot-protected sites, JS rendering, and advanced features, noxa offers a hosted API at [noxa.io](https://noxa.io).
|
||||
|
||||
The CLI and MCP server work locally first. Cloud is used as a fallback when:
|
||||
- A site has bot protection (Cloudflare, DataDome, WAF)
|
||||
|
|
@ -349,21 +348,21 @@ The CLI and MCP server work locally first. Cloud is used as a fallback when:
|
|||
- You use search or research tools
|
||||
|
||||
```bash
|
||||
export WEBCLAW_API_KEY=wc_your_key
|
||||
export NOXA_API_KEY=wc_your_key
|
||||
|
||||
# Automatic: tries local first, cloud on bot detection
|
||||
webclaw https://protected-site.com
|
||||
noxa https://protected-site.com
|
||||
|
||||
# Force cloud
|
||||
webclaw --cloud https://spa-site.com
|
||||
noxa --cloud https://spa-site.com
|
||||
```
|
||||
|
||||
### SDKs
|
||||
|
||||
```bash
|
||||
npm install @webclaw/sdk # TypeScript/JavaScript
|
||||
pip install webclaw # Python
|
||||
go get github.com/0xMassi/webclaw-go # Go
|
||||
npm install @noxa/sdk # TypeScript/JavaScript
|
||||
pip install noxa # Python
|
||||
go get github.com/jmagar/noxa-go # Go
|
||||
```
|
||||
|
||||
---
|
||||
|
|
@ -381,14 +380,13 @@ go get github.com/0xMassi/webclaw-go # Go
|
|||
|
||||
## Community
|
||||
|
||||
- [Discord](https://discord.gg/KDfd48EpnW) — questions, feedback, show what you built
|
||||
- [GitHub Issues](https://github.com/0xMassi/webclaw/issues) — bug reports and feature requests
|
||||
- [GitHub Issues](https://github.com/jmagar/noxa/issues) — bug reports and feature requests
|
||||
|
||||
## Contributing
|
||||
|
||||
We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
||||
|
||||
- [Good first issues](https://github.com/0xMassi/webclaw/issues?q=label%3A%22good+first+issue%22)
|
||||
- [Good first issues](https://github.com/jmagar/noxa/issues?q=label%3A%22good+first+issue%22)
|
||||
- [Architecture docs](CONTRIBUTING.md#architecture)
|
||||
|
||||
## Acknowledgments
|
||||
|
|
|
|||
96
SKILL.md
96
SKILL.md
|
|
@ -1,12 +1,12 @@
|
|||
---
|
||||
name: webclaw
|
||||
name: noxa
|
||||
description: Web extraction engine with antibot bypass. Scrape, crawl, extract, summarize, search, map, diff, monitor, research, and analyze any URL — including Cloudflare-protected sites. Use when you need reliable web content, the built-in web_fetch fails, or you need structured data extraction from web pages.
|
||||
homepage: https://webclaw.io
|
||||
homepage: https://noxa.io
|
||||
user-invocable: true
|
||||
metadata: {"openclaw":{"emoji":"🦀","requires":{"env":["WEBCLAW_API_KEY"]},"primaryEnv":"WEBCLAW_API_KEY","homepage":"https://webclaw.io","install":[{"id":"npx","kind":"node","bins":["webclaw-mcp"],"label":"npx create-webclaw"}]}}
|
||||
metadata: {"openclaw":{"emoji":"🦀","requires":{"env":["NOXA_API_KEY"]},"primaryEnv":"NOXA_API_KEY","homepage":"https://noxa.io","install":[{"id":"npx","kind":"node","bins":["noxa-mcp"],"label":"npx create-noxa"}]}}
|
||||
---
|
||||
|
||||
# webclaw
|
||||
# noxa
|
||||
|
||||
High-quality web extraction with automatic antibot bypass. Beats Firecrawl on extraction quality and handles Cloudflare, DataDome, and JS-rendered pages automatically.
|
||||
|
||||
|
|
@ -27,17 +27,17 @@ High-quality web extraction with automatic antibot bypass. Beats Firecrawl on ex
|
|||
|
||||
## API base
|
||||
|
||||
All requests go to `https://api.webclaw.io/v1/`.
|
||||
All requests go to `https://api.noxa.io/v1/`.
|
||||
|
||||
Authentication: `Authorization: Bearer $WEBCLAW_API_KEY`
|
||||
Authentication: `Authorization: Bearer $NOXA_API_KEY`
|
||||
|
||||
## Endpoints
|
||||
|
||||
### 1. Scrape — extract content from a single URL
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/scrape \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/scrape \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com",
|
||||
|
|
@ -96,8 +96,8 @@ Starts an async job. Poll for results.
|
|||
|
||||
**Start crawl:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/crawl \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/crawl \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://docs.example.com",
|
||||
|
|
@ -111,8 +111,8 @@ Response: `{ "job_id": "abc-123", "status": "running" }`
|
|||
|
||||
**Poll status:**
|
||||
```bash
|
||||
curl https://api.webclaw.io/v1/crawl/abc-123 \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl https://api.noxa.io/v1/crawl/abc-123 \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
Response when complete:
|
||||
|
|
@ -151,8 +151,8 @@ Response when complete:
|
|||
Fast URL discovery without full content extraction.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/map \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/map \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url": "https://example.com"}'
|
||||
```
|
||||
|
|
@ -173,8 +173,8 @@ Response:
|
|||
### 4. Batch — scrape multiple URLs in parallel
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/batch \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/batch \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"urls": [
|
||||
|
|
@ -207,8 +207,8 @@ Pull structured data from any page using a JSON schema or plain-text prompt.
|
|||
|
||||
**With JSON schema:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/extract \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/extract \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/pricing",
|
||||
|
|
@ -233,8 +233,8 @@ curl -X POST https://api.webclaw.io/v1/extract \
|
|||
|
||||
**With prompt:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/extract \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/extract \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/pricing",
|
||||
|
|
@ -258,8 +258,8 @@ Response:
|
|||
### 6. Summarize — get a quick summary of any page
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/summarize \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/summarize \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/long-article",
|
||||
|
|
@ -280,8 +280,8 @@ Response:
|
|||
Compare current page content against a previous snapshot.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/diff \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/diff \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com",
|
||||
|
|
@ -309,8 +309,8 @@ Response:
|
|||
Analyze a website's visual identity: colors, fonts, logo.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/brand \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/brand \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url": "https://example.com"}'
|
||||
```
|
||||
|
|
@ -336,8 +336,8 @@ Response:
|
|||
Search the web and optionally scrape each result page.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/search \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/search \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"query": "best rust web frameworks 2026",
|
||||
|
|
@ -390,8 +390,8 @@ Starts an async research job that searches, scrapes, and synthesizes information
|
|||
|
||||
**Start research:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/research \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/research \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"query": "How does Cloudflare Turnstile work and what are its known bypass methods?",
|
||||
|
|
@ -416,8 +416,8 @@ Response: `{ "id": "res-abc-123", "status": "running" }`
|
|||
|
||||
**Poll results:**
|
||||
```bash
|
||||
curl https://api.webclaw.io/v1/research/res-abc-123 \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl https://api.noxa.io/v1/research/res-abc-123 \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
Response when complete:
|
||||
|
|
@ -448,8 +448,8 @@ Response when complete:
|
|||
Use an AI agent to navigate and interact with a page to accomplish a specific goal. The agent can click, scroll, fill forms, and extract data across multiple steps.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/agent-scrape \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/agent-scrape \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/products",
|
||||
|
|
@ -488,8 +488,8 @@ Create persistent monitors that check a URL on a schedule and notify via webhook
|
|||
|
||||
**Create a monitor:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/watch \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/watch \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/pricing",
|
||||
|
|
@ -524,8 +524,8 @@ Response:
|
|||
|
||||
**List all monitors:**
|
||||
```bash
|
||||
curl https://api.webclaw.io/v1/watch \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl https://api.noxa.io/v1/watch \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
Response:
|
||||
|
|
@ -546,8 +546,8 @@ Response:
|
|||
|
||||
**Get a monitor with snapshots:**
|
||||
```bash
|
||||
curl https://api.webclaw.io/v1/watch/watch-abc-123 \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl https://api.noxa.io/v1/watch/watch-abc-123 \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
Response:
|
||||
|
|
@ -573,14 +573,14 @@ Response:
|
|||
|
||||
**Trigger an immediate check:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/watch/watch-abc-123/check \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl -X POST https://api.noxa.io/v1/watch/watch-abc-123/check \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
**Delete a monitor:**
|
||||
```bash
|
||||
curl -X DELETE https://api.webclaw.io/v1/watch/watch-abc-123 \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl -X DELETE https://api.noxa.io/v1/watch/watch-abc-123 \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
## Choosing the right format
|
||||
|
|
@ -608,7 +608,7 @@ curl -X DELETE https://api.webclaw.io/v1/watch/watch-abc-123 \
|
|||
|
||||
## Smart Fetch Architecture
|
||||
|
||||
The webclaw MCP server uses a **local-first** approach:
|
||||
The noxa MCP server uses a **local-first** approach:
|
||||
|
||||
1. **Local fetch** — fast, free, no API credits used (~80% of sites)
|
||||
2. **Cloud API fallback** — automatic when bot protection or JS rendering is detected
|
||||
|
|
@ -617,11 +617,11 @@ This means:
|
|||
- Most scrapes cost zero credits (local extraction)
|
||||
- Cloudflare, DataDome, AWS WAF sites automatically fall back to the cloud API
|
||||
- JS-rendered SPAs (React, Next.js, Vue) also fall back automatically
|
||||
- Set `WEBCLAW_API_KEY` to enable cloud fallback
|
||||
- Set `NOXA_API_KEY` to enable cloud fallback
|
||||
|
||||
## vs web_fetch
|
||||
|
||||
| | webclaw | web_fetch |
|
||||
| | noxa | web_fetch |
|
||||
|---|---------|-----------|
|
||||
| Cloudflare bypass | Automatic (cloud fallback) | Fails (403) |
|
||||
| JS-rendered pages | Automatic fallback | Readability only |
|
||||
|
|
@ -631,4 +631,4 @@ This means:
|
|||
| Caching | Built-in, configurable TTL | Per-session |
|
||||
| Rate limiting | Managed server-side | Client responsibility |
|
||||
|
||||
Use `web_fetch` for simple, fast lookups. Use webclaw when you need reliability, quality, or advanced features.
|
||||
Use `web_fetch` for simple, fast lookups. Use noxa when you need reliability, quality, or advanced features.
|
||||
|
|
|
|||
BIN
assets/demo.gif
BIN
assets/demo.gif
Binary file not shown.
|
Before Width: | Height: | Size: 523 KiB |
BIN
assets/demo.mp4
BIN
assets/demo.mp4
Binary file not shown.
|
|
@ -1,16 +1,16 @@
|
|||
# Benchmarks
|
||||
|
||||
Extraction quality and performance benchmarks comparing webclaw against popular alternatives.
|
||||
Extraction quality and performance benchmarks comparing noxa against popular alternatives.
|
||||
|
||||
## Quick Run
|
||||
|
||||
```bash
|
||||
# Run all benchmarks
|
||||
cargo run --release -p webclaw-bench
|
||||
cargo run --release -p noxa-bench
|
||||
|
||||
# Run specific benchmark
|
||||
cargo run --release -p webclaw-bench -- --filter quality
|
||||
cargo run --release -p webclaw-bench -- --filter speed
|
||||
cargo run --release -p noxa-bench -- --filter quality
|
||||
cargo run --release -p noxa-bench -- --filter speed
|
||||
```
|
||||
|
||||
## Extraction Quality
|
||||
|
|
@ -20,7 +20,7 @@ Each page scored on: content completeness, noise removal, link preservation, met
|
|||
|
||||
| Extractor | Accuracy | Noise Removal | Links | Metadata | Avg Score |
|
||||
|-----------|----------|---------------|-------|----------|-----------|
|
||||
| **webclaw** | **94.2%** | **96.1%** | **98.3%** | **91.7%** | **95.1%** |
|
||||
| **noxa** | **94.2%** | **96.1%** | **98.3%** | **91.7%** | **95.1%** |
|
||||
| mozilla/readability | 87.3% | 89.4% | 85.1% | 72.3% | 83.5% |
|
||||
| trafilatura | 82.1% | 91.2% | 68.4% | 80.5% | 80.6% |
|
||||
| newspaper3k | 71.4% | 76.8% | 52.3% | 65.2% | 66.4% |
|
||||
|
|
@ -32,7 +32,7 @@ Each page scored on: content completeness, noise removal, link preservation, met
|
|||
- **Links**: Percentage of meaningful content links preserved with correct text and href
|
||||
- **Metadata**: Correct extraction of title, author, date, description, and language
|
||||
|
||||
### Why webclaw scores higher
|
||||
### Why noxa scores higher
|
||||
|
||||
1. **Multi-signal scoring**: Combines text density, semantic HTML tags, link density penalty, and DOM depth analysis
|
||||
2. **Data island extraction**: Catches React/Next.js JSON payloads that DOM-only extractors miss
|
||||
|
|
@ -43,14 +43,14 @@ Each page scored on: content completeness, noise removal, link preservation, met
|
|||
|
||||
Single-page extraction time (parsing + extraction, no network). Measured on M4 Pro, averaged over 1000 runs.
|
||||
|
||||
| Page Size | webclaw | readability | trafilatura |
|
||||
| Page Size | noxa | readability | trafilatura |
|
||||
|-----------|---------|-------------|-------------|
|
||||
| Small (10KB) | **0.8ms** | 2.1ms | 4.3ms |
|
||||
| Medium (100KB) | **3.2ms** | 8.7ms | 18.4ms |
|
||||
| Large (500KB) | **12.1ms** | 34.2ms | 72.8ms |
|
||||
| Huge (2MB) | **41.3ms** | 112ms | 284ms |
|
||||
|
||||
### Why webclaw is faster
|
||||
### Why noxa is faster
|
||||
|
||||
1. **Rust**: No garbage collection, zero-cost abstractions, SIMD-optimized string operations
|
||||
2. **Single-pass scoring**: Content scoring happens during DOM traversal, not as a separate pass
|
||||
|
|
@ -63,9 +63,9 @@ Tokens used when feeding extraction output to Claude/GPT. Lower is better (same
|
|||
| Format | Tokens (avg) | vs Raw HTML |
|
||||
|--------|-------------|-------------|
|
||||
| Raw HTML | 4,820 | baseline |
|
||||
| webclaw markdown | 1,840 | **-62%** |
|
||||
| webclaw text | 1,620 | **-66%** |
|
||||
| **webclaw llm** | **1,590** | **-67%** |
|
||||
| noxa markdown | 1,840 | **-62%** |
|
||||
| noxa text | 1,620 | **-66%** |
|
||||
| **noxa llm** | **1,590** | **-67%** |
|
||||
| readability markdown | 2,340 | -51% |
|
||||
| trafilatura text | 2,180 | -55% |
|
||||
|
||||
|
|
@ -75,7 +75,7 @@ The `llm` format applies a 9-step optimization pipeline: image strip, emphasis s
|
|||
|
||||
Crawling speed with concurrent extraction. Target: example documentation site (~200 pages).
|
||||
|
||||
| Concurrency | webclaw | Crawl4AI | Scrapy |
|
||||
| Concurrency | noxa | Crawl4AI | Scrapy |
|
||||
|-------------|---------|----------|--------|
|
||||
| 1 | 2.1 pages/s | 1.4 pages/s | 1.8 pages/s |
|
||||
| 5 | **9.8 pages/s** | 5.2 pages/s | 7.1 pages/s |
|
||||
|
|
@ -86,7 +86,7 @@ Crawling speed with concurrent extraction. Target: example documentation site (~
|
|||
|
||||
Success rate against common anti-bot systems (100 attempts each, via Cloud API with antibot sidecar).
|
||||
|
||||
| Protection | webclaw | Firecrawl | Bright Data |
|
||||
| Protection | noxa | Firecrawl | Bright Data |
|
||||
|------------|---------|-----------|-------------|
|
||||
| Cloudflare Turnstile | **97%** | 62% | 94% |
|
||||
| DataDome | **91%** | 41% | 88% |
|
||||
|
|
@ -100,20 +100,20 @@ Note: Bot protection bypass requires the Cloud API with antibot sidecar. The ope
|
|||
|
||||
```bash
|
||||
# Clone the repo
|
||||
git clone https://github.com/0xMassi/webclaw.git
|
||||
cd webclaw
|
||||
git clone https://github.com/jmagar/noxa.git
|
||||
cd noxa
|
||||
|
||||
# Run quality benchmarks (downloads test pages on first run)
|
||||
cargo run --release -p webclaw-bench -- --filter quality
|
||||
cargo run --release -p noxa-bench -- --filter quality
|
||||
|
||||
# Run speed benchmarks
|
||||
cargo run --release -p webclaw-bench -- --filter speed
|
||||
cargo run --release -p noxa-bench -- --filter speed
|
||||
|
||||
# Run token efficiency benchmarks (requires tiktoken)
|
||||
cargo run --release -p webclaw-bench -- --filter tokens
|
||||
cargo run --release -p noxa-bench -- --filter tokens
|
||||
|
||||
# Full benchmark suite with HTML report
|
||||
cargo run --release -p webclaw-bench -- --report html
|
||||
cargo run --release -p noxa-bench -- --report html
|
||||
```
|
||||
|
||||
## Reproducing Results
|
||||
|
|
|
|||
|
|
@ -1,19 +1,19 @@
|
|||
[package]
|
||||
name = "webclaw-cli"
|
||||
name = "noxa-cli"
|
||||
description = "CLI for extracting web content into LLM-optimized formats"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[[bin]]
|
||||
name = "webclaw"
|
||||
name = "noxa"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
webclaw-core = { workspace = true }
|
||||
webclaw-fetch = { workspace = true }
|
||||
webclaw-llm = { workspace = true }
|
||||
webclaw-pdf = { workspace = true }
|
||||
noxa-core = { workspace = true }
|
||||
noxa-fetch = { workspace = true }
|
||||
noxa-llm = { workspace = true }
|
||||
noxa-pdf = { workspace = true }
|
||||
dotenvy = { workspace = true }
|
||||
rand = "0.8"
|
||||
serde_json = { workspace = true }
|
||||
|
|
@ -1,16 +1,16 @@
|
|||
/// Cloud API client for automatic fallback when local extraction fails.
|
||||
///
|
||||
/// When WEBCLAW_API_KEY is set (or --api-key is passed), the CLI can fall back
|
||||
/// to api.webclaw.io for bot-protected or JS-rendered sites. With --cloud flag,
|
||||
/// When NOXA_API_KEY is set (or --api-key is passed), the CLI can fall back
|
||||
/// to api.noxa.io for bot-protected or JS-rendered sites. With --cloud flag,
|
||||
/// all requests go through the cloud API directly.
|
||||
///
|
||||
/// NOTE: The canonical, full-featured cloud module lives in webclaw-mcp/src/cloud.rs
|
||||
/// NOTE: The canonical, full-featured cloud module lives in noxa-mcp/src/cloud.rs
|
||||
/// (smart_fetch, bot detection, JS rendering checks). This is the minimal subset
|
||||
/// needed by the CLI. Kept separate to avoid pulling in rmcp via webclaw-mcp.
|
||||
/// Adding webclaw-mcp as a dependency would pull in rmcp.
|
||||
/// needed by the CLI. Kept separate to avoid pulling in rmcp via noxa-mcp.
|
||||
/// Adding noxa-mcp as a dependency would pull in rmcp.
|
||||
use serde_json::{Value, json};
|
||||
|
||||
const API_BASE: &str = "https://api.webclaw.io/v1";
|
||||
const API_BASE: &str = "https://api.noxa.io/v1";
|
||||
|
||||
pub struct CloudClient {
|
||||
api_key: String,
|
||||
|
|
@ -18,11 +18,11 @@ pub struct CloudClient {
|
|||
}
|
||||
|
||||
impl CloudClient {
|
||||
/// Create from explicit key or WEBCLAW_API_KEY env var.
|
||||
/// Create from explicit key or NOXA_API_KEY env var.
|
||||
pub fn new(explicit_key: Option<&str>) -> Option<Self> {
|
||||
let key = explicit_key
|
||||
.map(String::from)
|
||||
.or_else(|| std::env::var("WEBCLAW_API_KEY").ok())
|
||||
.or_else(|| std::env::var("NOXA_API_KEY").ok())
|
||||
.filter(|k| !k.is_empty())?;
|
||||
|
||||
Some(Self {
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
#![allow(dead_code)]
|
||||
/// CLI entry point -- wires webclaw-core and webclaw-fetch into a single command.
|
||||
/// CLI entry point -- wires noxa-core and noxa-fetch into a single command.
|
||||
/// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
|
||||
mod cloud;
|
||||
|
||||
|
|
@ -11,16 +11,16 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
|||
|
||||
use clap::{Parser, ValueEnum};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
use webclaw_core::{
|
||||
use noxa_core::{
|
||||
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
|
||||
to_llm_text,
|
||||
};
|
||||
use webclaw_fetch::{
|
||||
use noxa_fetch::{
|
||||
BatchExtractResult, BrowserProfile, CrawlConfig, CrawlResult, Crawler, FetchClient,
|
||||
FetchConfig, FetchResult, PageResult, SitemapEntry,
|
||||
};
|
||||
use webclaw_llm::LlmProvider;
|
||||
use webclaw_pdf::PdfMode;
|
||||
use noxa_llm::LlmProvider;
|
||||
use noxa_pdf::PdfMode;
|
||||
|
||||
/// Known anti-bot challenge page titles (case-insensitive prefix match).
|
||||
const ANTIBOT_TITLES: &[&str] = &[
|
||||
|
|
@ -73,19 +73,19 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
|
|||
EmptyReason::Antibot => eprintln!(
|
||||
"\x1b[33mwarning:\x1b[0m Anti-bot protection detected on {url}\n\
|
||||
This site requires CAPTCHA solving or browser rendering.\n\
|
||||
Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
|
||||
Use the noxa Cloud API for automatic bypass: https://noxa.io/pricing"
|
||||
),
|
||||
EmptyReason::JsRequired => eprintln!(
|
||||
"\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
|
||||
This site requires JavaScript rendering (SPA).\n\
|
||||
Use the webclaw Cloud API for JS rendering: https://webclaw.io/pricing"
|
||||
Use the noxa Cloud API for JS rendering: https://noxa.io/pricing"
|
||||
),
|
||||
EmptyReason::None => {}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "webclaw", about = "Extract web content for LLMs", version)]
|
||||
#[command(name = "noxa", about = "Extract web content for LLMs", version)]
|
||||
struct Cli {
|
||||
/// URLs to fetch (multiple allowed)
|
||||
#[arg()]
|
||||
|
|
@ -104,11 +104,11 @@ struct Cli {
|
|||
browser: Browser,
|
||||
|
||||
/// Proxy URL (http://user:pass@host:port or socks5://host:port)
|
||||
#[arg(short, long, env = "WEBCLAW_PROXY")]
|
||||
#[arg(short, long, env = "NOXA_PROXY")]
|
||||
proxy: Option<String>,
|
||||
|
||||
/// File with proxies (host:port:user:pass, one per line). Rotates per request.
|
||||
#[arg(long, env = "WEBCLAW_PROXY_FILE")]
|
||||
#[arg(long, env = "NOXA_PROXY_FILE")]
|
||||
proxy_file: Option<String>,
|
||||
|
||||
/// Request timeout in seconds
|
||||
|
|
@ -177,7 +177,7 @@ struct Cli {
|
|||
|
||||
/// Webhook URL: POST a JSON payload when an operation completes.
|
||||
/// Works with crawl, batch, watch (on change), and single URL modes.
|
||||
#[arg(long, env = "WEBCLAW_WEBHOOK_URL")]
|
||||
#[arg(long, env = "NOXA_WEBHOOK_URL")]
|
||||
webhook: Option<String>,
|
||||
|
||||
/// Extract brand identity (colors, fonts, logo)
|
||||
|
|
@ -248,20 +248,20 @@ struct Cli {
|
|||
summarize: Option<usize>,
|
||||
|
||||
/// Force a specific LLM provider (ollama, openai, anthropic)
|
||||
#[arg(long, env = "WEBCLAW_LLM_PROVIDER")]
|
||||
#[arg(long, env = "NOXA_LLM_PROVIDER")]
|
||||
llm_provider: Option<String>,
|
||||
|
||||
/// Override the LLM model name
|
||||
#[arg(long, env = "WEBCLAW_LLM_MODEL")]
|
||||
#[arg(long, env = "NOXA_LLM_MODEL")]
|
||||
llm_model: Option<String>,
|
||||
|
||||
/// Override the LLM base URL (Ollama or OpenAI-compatible)
|
||||
#[arg(long, env = "WEBCLAW_LLM_BASE_URL")]
|
||||
#[arg(long, env = "NOXA_LLM_BASE_URL")]
|
||||
llm_base_url: Option<String>,
|
||||
|
||||
// -- Cloud API options --
|
||||
/// Webclaw Cloud API key for automatic fallback on bot-protected or JS-rendered sites
|
||||
#[arg(long, env = "WEBCLAW_API_KEY")]
|
||||
/// Noxa Cloud API key for automatic fallback on bot-protected or JS-rendered sites
|
||||
#[arg(long, env = "NOXA_API_KEY")]
|
||||
api_key: Option<String>,
|
||||
|
||||
/// Force all requests through the cloud API (skip local extraction)
|
||||
|
|
@ -330,9 +330,9 @@ impl From<Browser> for BrowserProfile {
|
|||
|
||||
fn init_logging(verbose: bool) {
|
||||
let filter = if verbose {
|
||||
EnvFilter::new("webclaw=debug")
|
||||
EnvFilter::new("noxa=debug")
|
||||
} else {
|
||||
EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new("warn"))
|
||||
EnvFilter::try_from_env("NOXA_LOG").unwrap_or_else(|_| EnvFilter::new("warn"))
|
||||
};
|
||||
|
||||
tracing_subscriber::fmt().with_env_filter(filter).init();
|
||||
|
|
@ -347,7 +347,7 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
|
|||
let (proxy, proxy_pool) = if cli.proxy.is_some() {
|
||||
(cli.proxy.clone(), Vec::new())
|
||||
} else if let Some(ref path) = cli.proxy_file {
|
||||
match webclaw_fetch::parse_proxy_file(path) {
|
||||
match noxa_fetch::parse_proxy_file(path) {
|
||||
Ok(pool) => (None, pool),
|
||||
Err(e) => {
|
||||
eprintln!("warning: {e}");
|
||||
|
|
@ -356,7 +356,7 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
|
|||
}
|
||||
} else if std::path::Path::new("proxies.txt").exists() {
|
||||
// Auto-load proxies.txt from working directory if present
|
||||
match webclaw_fetch::parse_proxy_file("proxies.txt") {
|
||||
match noxa_fetch::parse_proxy_file("proxies.txt") {
|
||||
Ok(pool) if !pool.is_empty() => {
|
||||
eprintln!("loaded {} proxies from proxies.txt", pool.len());
|
||||
(None, pool)
|
||||
|
|
@ -652,7 +652,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
// --cloud: skip local, go straight to cloud API
|
||||
if cli.cloud {
|
||||
let c =
|
||||
cloud_client.ok_or("--cloud requires WEBCLAW_API_KEY (set via env or --api-key)")?;
|
||||
cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?;
|
||||
let options = build_extraction_options(cli);
|
||||
let format_str = match cli.format {
|
||||
OutputFormat::Markdown => "markdown",
|
||||
|
|
@ -1349,7 +1349,7 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
|
|||
let client =
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
||||
|
||||
let entries = webclaw_fetch::sitemap::discover(&client, url)
|
||||
let entries = noxa_fetch::sitemap::discover(&client, url)
|
||||
.await
|
||||
.map_err(|e| format!("sitemap discovery failed: {e}"))?;
|
||||
|
||||
|
|
@ -1469,7 +1469,7 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
|||
let details = serde_json::to_string_pretty(payload).unwrap_or_default();
|
||||
serde_json::json!({
|
||||
"embeds": [{
|
||||
"title": format!("webclaw: {event}"),
|
||||
"title": format!("noxa: {event}"),
|
||||
"description": format!("```json\n{details}\n```"),
|
||||
"color": 5814783
|
||||
}]
|
||||
|
|
@ -1482,7 +1482,7 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
|||
.unwrap_or("notification");
|
||||
let details = serde_json::to_string_pretty(payload).unwrap_or_default();
|
||||
serde_json::json!({
|
||||
"text": format!("*webclaw: {event}*\n```{details}```")
|
||||
"text": format!("*noxa: {event}*\n```{details}```")
|
||||
})
|
||||
.to_string()
|
||||
} else {
|
||||
|
|
@ -1575,7 +1575,7 @@ async fn run_watch_single(
|
|||
}
|
||||
};
|
||||
|
||||
let diff = webclaw_core::diff::diff(&previous, &current);
|
||||
let diff = noxa_core::diff::diff(&previous, &current);
|
||||
|
||||
if diff.status == ChangeStatus::Same {
|
||||
eprintln!("[watch] No changes ({})", timestamp());
|
||||
|
|
@ -1687,7 +1687,7 @@ async fn run_watch_multi(
|
|||
match r.result {
|
||||
Ok(current) => {
|
||||
if let Some(previous) = snapshots.get(&r.url) {
|
||||
let diff = webclaw_core::diff::diff(previous, &current);
|
||||
let diff = noxa_core::diff::diff(previous, &current);
|
||||
if diff.status == ChangeStatus::Same {
|
||||
same_count += 1;
|
||||
} else {
|
||||
|
|
@ -1790,7 +1790,7 @@ async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
|
|||
// Extract current version (handles PDF detection for URLs)
|
||||
let new_result = fetch_and_extract(cli).await?.into_extraction()?;
|
||||
|
||||
let diff = webclaw_core::diff::diff(&old, &new_result);
|
||||
let diff = noxa_core::diff::diff(&old, &new_result);
|
||||
print_diff_output(&diff, &cli.format);
|
||||
|
||||
Ok(())
|
||||
|
|
@ -1799,7 +1799,7 @@ async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
|
|||
async fn run_brand(cli: &Cli) -> Result<(), String> {
|
||||
let result = fetch_html(cli).await?;
|
||||
let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await;
|
||||
let brand = webclaw_core::brand::extract_brand(
|
||||
let brand = noxa_core::brand::extract_brand(
|
||||
&enriched,
|
||||
Some(result.url.as_str()).filter(|s| !s.is_empty()),
|
||||
);
|
||||
|
|
@ -1815,7 +1815,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
|
|||
if let Some(ref name) = cli.llm_provider {
|
||||
match name.as_str() {
|
||||
"ollama" => {
|
||||
let provider = webclaw_llm::providers::ollama::OllamaProvider::new(
|
||||
let provider = noxa_llm::providers::ollama::OllamaProvider::new(
|
||||
cli.llm_base_url.clone(),
|
||||
cli.llm_model.clone(),
|
||||
);
|
||||
|
|
@ -1825,7 +1825,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
|
|||
Ok(Box::new(provider))
|
||||
}
|
||||
"openai" => {
|
||||
let provider = webclaw_llm::providers::openai::OpenAiProvider::new(
|
||||
let provider = noxa_llm::providers::openai::OpenAiProvider::new(
|
||||
None,
|
||||
cli.llm_base_url.clone(),
|
||||
cli.llm_model.clone(),
|
||||
|
|
@ -1834,7 +1834,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
|
|||
Ok(Box::new(provider))
|
||||
}
|
||||
"anthropic" => {
|
||||
let provider = webclaw_llm::providers::anthropic::AnthropicProvider::new(
|
||||
let provider = noxa_llm::providers::anthropic::AnthropicProvider::new(
|
||||
None,
|
||||
cli.llm_model.clone(),
|
||||
)
|
||||
|
|
@ -1846,7 +1846,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
|
|||
)),
|
||||
}
|
||||
} else {
|
||||
let chain = webclaw_llm::ProviderChain::default().await;
|
||||
let chain = noxa_llm::ProviderChain::default().await;
|
||||
if chain.is_empty() {
|
||||
return Err(
|
||||
"no LLM providers available -- start Ollama or set OPENAI_API_KEY / ANTHROPIC_API_KEY"
|
||||
|
|
@ -1876,7 +1876,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
|
|||
let schema: serde_json::Value =
|
||||
serde_json::from_str(&schema_str).map_err(|e| format!("invalid JSON schema: {e}"))?;
|
||||
|
||||
let extracted = webclaw_llm::extract::extract_json(
|
||||
let extracted = noxa_llm::extract::extract_json(
|
||||
&result.content.plain_text,
|
||||
&schema,
|
||||
provider.as_ref(),
|
||||
|
|
@ -1890,7 +1890,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
|
|||
serde_json::to_string_pretty(&extracted).expect("serialization failed")
|
||||
);
|
||||
} else if let Some(ref prompt) = cli.extract_prompt {
|
||||
let extracted = webclaw_llm::extract::extract_with_prompt(
|
||||
let extracted = noxa_llm::extract::extract_with_prompt(
|
||||
&result.content.plain_text,
|
||||
prompt,
|
||||
provider.as_ref(),
|
||||
|
|
@ -1904,7 +1904,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
|
|||
serde_json::to_string_pretty(&extracted).expect("serialization failed")
|
||||
);
|
||||
} else if let Some(sentences) = cli.summarize {
|
||||
let summary = webclaw_llm::summarize::summarize(
|
||||
let summary = noxa_llm::summarize::summarize(
|
||||
&result.content.plain_text,
|
||||
Some(sentences),
|
||||
provider.as_ref(),
|
||||
|
|
@ -1975,15 +1975,15 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
|
|||
|
||||
// Run the appropriate LLM operation
|
||||
let llm_result = if let Some(ref schema) = schema {
|
||||
webclaw_llm::extract::extract_json(text, schema, provider.as_ref(), model)
|
||||
noxa_llm::extract::extract_json(text, schema, provider.as_ref(), model)
|
||||
.await
|
||||
.map(LlmOutput::Json)
|
||||
} else if let Some(ref prompt) = cli.extract_prompt {
|
||||
webclaw_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
|
||||
noxa_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
|
||||
.await
|
||||
.map(LlmOutput::Json)
|
||||
} else if let Some(sentences) = cli.summarize {
|
||||
webclaw_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
|
||||
noxa_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
|
||||
.await
|
||||
.map(LlmOutput::Text)
|
||||
} else {
|
||||
|
|
@ -2080,7 +2080,7 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
|
|||
let api_key = cli
|
||||
.api_key
|
||||
.as_deref()
|
||||
.ok_or("--research requires WEBCLAW_API_KEY (set via env or --api-key)")?;
|
||||
.ok_or("--research requires NOXA_API_KEY (set via env or --api-key)")?;
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(600))
|
||||
|
|
@ -2099,7 +2099,7 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
|
|||
|
||||
// Start job
|
||||
let resp = client
|
||||
.post("https://api.webclaw.io/v1/research")
|
||||
.post("https://api.noxa.io/v1/research")
|
||||
.header("Authorization", format!("Bearer {api_key}"))
|
||||
.json(&body)
|
||||
.send()
|
||||
|
|
@ -2122,7 +2122,7 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
|
|||
tokio::time::sleep(std::time::Duration::from_secs(3)).await;
|
||||
|
||||
let status_resp = client
|
||||
.get(format!("https://api.webclaw.io/v1/research/{job_id}"))
|
||||
.get(format!("https://api.noxa.io/v1/research/{job_id}"))
|
||||
.header("Authorization", format!("Bearer {api_key}"))
|
||||
.send()
|
||||
.await
|
||||
|
|
@ -2448,7 +2448,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn write_to_file_creates_dirs() {
|
||||
let dir = std::env::temp_dir().join("webclaw_test_output_dir");
|
||||
let dir = std::env::temp_dir().join("noxa_test_output_dir");
|
||||
let _ = std::fs::remove_dir_all(&dir);
|
||||
write_to_file(&dir, "nested/deep/file.md", "hello").unwrap();
|
||||
let content = std::fs::read_to_string(dir.join("nested/deep/file.md")).unwrap();
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
[package]
|
||||
name = "webclaw-core"
|
||||
name = "noxa-core"
|
||||
description = "Pure HTML content extraction engine for LLMs"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
pub mod brand;
|
||||
pub(crate) mod data_island;
|
||||
/// webclaw-core: Pure HTML content extraction engine for LLMs.
|
||||
/// noxa-core: Pure HTML content extraction engine for LLMs.
|
||||
///
|
||||
/// Takes raw HTML + optional URL, returns structured content
|
||||
/// (metadata, markdown, plain text, links, images, code blocks).
|
||||
|
|
@ -1,13 +1,13 @@
|
|||
[package]
|
||||
name = "webclaw-fetch"
|
||||
name = "noxa-fetch"
|
||||
description = "HTTP client with browser TLS fingerprint impersonation via wreq"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
webclaw-core = { workspace = true }
|
||||
webclaw-pdf = { path = "../webclaw-pdf" }
|
||||
noxa-core = { workspace = true }
|
||||
noxa-pdf = { path = "../noxa-pdf" }
|
||||
serde = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
//! Browser fingerprint selection and rotation.
|
||||
//! Maps our BrowserProfile enum to webclaw-http client builder methods.
|
||||
//! Maps our BrowserProfile enum to noxa-http client builder methods.
|
||||
|
||||
/// Which browser identity to present at the TLS/HTTP layer.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
|
|
@ -11,7 +11,7 @@ pub enum BrowserProfile {
|
|||
Random,
|
||||
}
|
||||
|
||||
/// A browser variant for building webclaw-http clients.
|
||||
/// A browser variant for building noxa-http clients.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum BrowserVariant {
|
||||
Chrome,
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
/// HTTP client with browser TLS fingerprint impersonation.
|
||||
/// Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
|
||||
/// Supports single and batch operations with proxy rotation.
|
||||
/// Automatically detects PDF responses and extracts text via webclaw-pdf.
|
||||
/// Automatically detects PDF responses and extracts text via noxa-pdf.
|
||||
///
|
||||
/// Two proxy modes:
|
||||
/// - **Static**: single proxy (or none) baked into pre-built clients at construction.
|
||||
|
|
@ -15,7 +15,7 @@ use std::time::{Duration, Instant};
|
|||
use rand::seq::SliceRandom;
|
||||
use tokio::sync::Semaphore;
|
||||
use tracing::{debug, instrument, warn};
|
||||
use webclaw_pdf::PdfMode;
|
||||
use noxa_pdf::PdfMode;
|
||||
|
||||
use crate::browser::{self, BrowserProfile, BrowserVariant};
|
||||
use crate::error::FetchError;
|
||||
|
|
@ -75,11 +75,11 @@ pub struct BatchResult {
|
|||
#[derive(Debug)]
|
||||
pub struct BatchExtractResult {
|
||||
pub url: String,
|
||||
pub result: Result<webclaw_core::ExtractionResult, FetchError>,
|
||||
pub result: Result<noxa_core::ExtractionResult, FetchError>,
|
||||
}
|
||||
|
||||
/// Buffered response that owns its body. Provides the same sync API
|
||||
/// that webclaw-http::Response used to provide.
|
||||
/// that noxa-http::Response used to provide.
|
||||
struct Response {
|
||||
status: u16,
|
||||
url: String,
|
||||
|
|
@ -268,8 +268,8 @@ impl FetchClient {
|
|||
pub async fn fetch_and_extract(
|
||||
&self,
|
||||
url: &str,
|
||||
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
||||
self.fetch_and_extract_with_options(url, &webclaw_core::ExtractionOptions::default())
|
||||
) -> Result<noxa_core::ExtractionResult, FetchError> {
|
||||
self.fetch_and_extract_with_options(url, &noxa_core::ExtractionOptions::default())
|
||||
.await
|
||||
}
|
||||
|
||||
|
|
@ -278,8 +278,8 @@ impl FetchClient {
|
|||
pub async fn fetch_and_extract_with_options(
|
||||
&self,
|
||||
url: &str,
|
||||
options: &webclaw_core::ExtractionOptions,
|
||||
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
||||
options: &noxa_core::ExtractionOptions,
|
||||
) -> Result<noxa_core::ExtractionResult, FetchError> {
|
||||
// Reddit fallback: use their JSON API to get post + full comment tree.
|
||||
if crate::reddit::is_reddit_url(url) {
|
||||
let json_url = crate::reddit::json_url(url);
|
||||
|
|
@ -334,7 +334,7 @@ impl FetchClient {
|
|||
"PDF fetch complete"
|
||||
);
|
||||
|
||||
let pdf_result = webclaw_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
|
||||
let pdf_result = noxa_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
|
||||
Ok(pdf_to_extraction_result(&pdf_result, &final_url))
|
||||
} else if let Some(doc_type) =
|
||||
crate::document::is_document_content_type(&headers, &final_url)
|
||||
|
|
@ -369,7 +369,7 @@ impl FetchClient {
|
|||
debug!("linkedin extraction failed, falling back to standard");
|
||||
}
|
||||
|
||||
let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
|
||||
let extraction = noxa_core::extract_with_options(&html, Some(&final_url), options)?;
|
||||
|
||||
Ok(extraction)
|
||||
}
|
||||
|
|
@ -408,7 +408,7 @@ impl FetchClient {
|
|||
self.fetch_and_extract_batch_with_options(
|
||||
urls,
|
||||
concurrency,
|
||||
&webclaw_core::ExtractionOptions::default(),
|
||||
&noxa_core::ExtractionOptions::default(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
|
@ -418,7 +418,7 @@ impl FetchClient {
|
|||
self: &Arc<Self>,
|
||||
urls: &[&str],
|
||||
concurrency: usize,
|
||||
options: &webclaw_core::ExtractionOptions,
|
||||
options: &noxa_core::ExtractionOptions,
|
||||
) -> Vec<BatchExtractResult> {
|
||||
let semaphore = Arc::new(Semaphore::new(concurrency));
|
||||
let mut handles = Vec::with_capacity(urls.len());
|
||||
|
|
@ -572,16 +572,16 @@ fn extract_homepage(url: &str) -> Option<String> {
|
|||
.map(|u| format!("{}://{}/", u.scheme(), u.host_str().unwrap_or("")))
|
||||
}
|
||||
|
||||
/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
|
||||
/// Convert a noxa-pdf PdfResult into a noxa-core ExtractionResult.
|
||||
fn pdf_to_extraction_result(
|
||||
pdf: &webclaw_pdf::PdfResult,
|
||||
pdf: &noxa_pdf::PdfResult,
|
||||
url: &str,
|
||||
) -> webclaw_core::ExtractionResult {
|
||||
let markdown = webclaw_pdf::to_markdown(pdf);
|
||||
) -> noxa_core::ExtractionResult {
|
||||
let markdown = noxa_pdf::to_markdown(pdf);
|
||||
let word_count = markdown.split_whitespace().count();
|
||||
|
||||
webclaw_core::ExtractionResult {
|
||||
metadata: webclaw_core::Metadata {
|
||||
noxa_core::ExtractionResult {
|
||||
metadata: noxa_core::Metadata {
|
||||
title: pdf.metadata.title.clone(),
|
||||
description: pdf.metadata.subject.clone(),
|
||||
author: pdf.metadata.author.clone(),
|
||||
|
|
@ -593,7 +593,7 @@ fn pdf_to_extraction_result(
|
|||
favicon: None,
|
||||
word_count,
|
||||
},
|
||||
content: webclaw_core::Content {
|
||||
content: noxa_core::Content {
|
||||
markdown,
|
||||
plain_text: pdf.text.clone(),
|
||||
links: Vec::new(),
|
||||
|
|
@ -713,10 +713,10 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_pdf_to_extraction_result() {
|
||||
let pdf = webclaw_pdf::PdfResult {
|
||||
let pdf = noxa_pdf::PdfResult {
|
||||
text: "Hello from PDF.".into(),
|
||||
page_count: 2,
|
||||
metadata: webclaw_pdf::PdfMetadata {
|
||||
metadata: noxa_pdf::PdfMetadata {
|
||||
title: Some("My Doc".into()),
|
||||
author: Some("Author".into()),
|
||||
subject: Some("Testing".into()),
|
||||
|
|
@ -91,7 +91,7 @@ pub struct CrawlResult {
|
|||
pub struct PageResult {
|
||||
pub url: String,
|
||||
pub depth: usize,
|
||||
pub extraction: Option<webclaw_core::ExtractionResult>,
|
||||
pub extraction: Option<noxa_core::ExtractionResult>,
|
||||
pub error: Option<String>,
|
||||
#[serde(skip)]
|
||||
pub elapsed: Duration,
|
||||
|
|
@ -81,7 +81,7 @@ pub fn is_document_content_type(headers: &http::HeaderMap, url: &str) -> Option<
|
|||
pub fn extract_document(
|
||||
bytes: &[u8],
|
||||
doc_type: DocType,
|
||||
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
||||
) -> Result<noxa_core::ExtractionResult, FetchError> {
|
||||
debug!(
|
||||
doc_type = doc_type.label(),
|
||||
bytes = bytes.len(),
|
||||
|
|
@ -98,8 +98,8 @@ pub fn extract_document(
|
|||
let plain_text = strip_markdown_formatting(&markdown);
|
||||
let word_count = plain_text.split_whitespace().count();
|
||||
|
||||
Ok(webclaw_core::ExtractionResult {
|
||||
metadata: webclaw_core::Metadata {
|
||||
Ok(noxa_core::ExtractionResult {
|
||||
metadata: noxa_core::Metadata {
|
||||
title: None,
|
||||
description: None,
|
||||
author: None,
|
||||
|
|
@ -111,7 +111,7 @@ pub fn extract_document(
|
|||
favicon: None,
|
||||
word_count,
|
||||
},
|
||||
content: webclaw_core::Content {
|
||||
content: noxa_core::Content {
|
||||
markdown,
|
||||
plain_text,
|
||||
links: Vec::new(),
|
||||
|
|
@ -14,10 +14,10 @@ pub enum FetchError {
|
|||
BodyDecode(String),
|
||||
|
||||
#[error("extraction failed: {0}")]
|
||||
Extraction(#[from] webclaw_core::ExtractError),
|
||||
Extraction(#[from] noxa_core::ExtractError),
|
||||
|
||||
#[error("PDF extraction failed: {0}")]
|
||||
Pdf(#[from] webclaw_pdf::PdfError),
|
||||
Pdf(#[from] noxa_pdf::PdfError),
|
||||
|
||||
#[error("client build failed: {0}")]
|
||||
Build(String),
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
//! webclaw-fetch: HTTP client layer with browser TLS fingerprint impersonation.
|
||||
//! noxa-fetch: HTTP client layer with browser TLS fingerprint impersonation.
|
||||
//! Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
|
||||
//! Automatically detects PDF responses and delegates to webclaw-pdf.
|
||||
//! Automatically detects PDF responses and delegates to noxa-pdf.
|
||||
pub mod browser;
|
||||
pub mod client;
|
||||
pub mod crawler;
|
||||
|
|
@ -19,4 +19,4 @@ pub use error::FetchError;
|
|||
pub use http::HeaderMap;
|
||||
pub use proxy::{parse_proxy_file, parse_proxy_line};
|
||||
pub use sitemap::SitemapEntry;
|
||||
pub use webclaw_pdf::PdfMode;
|
||||
pub use noxa_pdf::PdfMode;
|
||||
|
|
@ -5,7 +5,7 @@
|
|||
/// Profile, etc. We parse these to reconstruct post + comments as markdown.
|
||||
use serde_json::Value;
|
||||
use tracing::debug;
|
||||
use webclaw_core::{Content, ExtractionResult, Metadata};
|
||||
use noxa_core::{Content, ExtractionResult, Metadata};
|
||||
|
||||
/// Check if a URL is a LinkedIn post/activity.
|
||||
pub fn is_linkedin_post(url: &str) -> bool {
|
||||
|
|
@ -5,7 +5,7 @@
|
|||
/// comment tree as structured JSON, which we convert to clean markdown.
|
||||
use serde::Deserialize;
|
||||
use tracing::debug;
|
||||
use webclaw_core::{Content, ExtractionResult, Metadata};
|
||||
use noxa_core::{Content, ExtractionResult, Metadata};
|
||||
|
||||
/// Check if a URL points to a Reddit post/comment page.
|
||||
pub fn is_reddit_url(url: &str) -> bool {
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
//! Browser TLS + HTTP/2 fingerprint profiles built on wreq (BoringSSL).
|
||||
//!
|
||||
//! Replaces the old webclaw-http/webclaw-tls patched rustls stack.
|
||||
//! Replaces the old noxa-http/noxa-tls patched rustls stack.
|
||||
//! Each profile configures TLS options (cipher suites, curves, extensions,
|
||||
//! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
|
||||
//! stream dependency, priorities) to match real browser fingerprints.
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "webclaw-llm"
|
||||
description = "LLM integration for webclaw — local-first hybrid architecture (Ollama -> OpenAI -> Anthropic)"
|
||||
name = "noxa-llm"
|
||||
description = "LLM integration for noxa — local-first hybrid architecture (Ollama -> OpenAI -> Anthropic)"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
/// webclaw-llm: LLM integration with local-first hybrid architecture.
|
||||
/// noxa-llm: LLM integration with local-first hybrid architecture.
|
||||
///
|
||||
/// Provider chain tries Ollama (local) first, falls back to OpenAI, then Anthropic.
|
||||
/// Provides schema-based extraction, prompt extraction, and summarization
|
||||
/// on top of webclaw-core's content pipeline.
|
||||
/// on top of noxa-core's content pipeline.
|
||||
pub mod chain;
|
||||
pub mod clean;
|
||||
pub mod error;
|
||||
|
|
@ -151,7 +151,7 @@ mod tests {
|
|||
|
||||
// Env var fallback tests mutate process-global state and race with parallel tests.
|
||||
// The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
|
||||
// cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
|
||||
// cargo test -p noxa-llm env_var -- --ignored --test-threads=1
|
||||
#[test]
|
||||
#[ignore = "mutates process env; run with --test-threads=1"]
|
||||
fn env_var_key_fallback() {
|
||||
|
|
@ -29,7 +29,7 @@ mod tests {
|
|||
#[test]
|
||||
fn none_override_with_no_env_returns_none() {
|
||||
assert_eq!(
|
||||
load_api_key(None, "WEBCLAW_TEST_NONEXISTENT_KEY_12345"),
|
||||
load_api_key(None, "NOXA_TEST_NONEXISTENT_KEY_12345"),
|
||||
None
|
||||
);
|
||||
}
|
||||
|
|
@ -140,7 +140,7 @@ mod tests {
|
|||
|
||||
// Env var fallback is a trivial `env::var().ok()` -- not worth the flakiness
|
||||
// of manipulating process-global state. Run in isolation if needed:
|
||||
// cargo test -p webclaw-llm env_var_fallback -- --ignored --test-threads=1
|
||||
// cargo test -p noxa-llm env_var_fallback -- --ignored --test-threads=1
|
||||
#[test]
|
||||
#[ignore = "mutates process env; run with --test-threads=1"]
|
||||
fn env_var_fallback() {
|
||||
|
|
@ -162,7 +162,7 @@ mod tests {
|
|||
|
||||
// Env var fallback tests mutate process-global state and race with parallel tests.
|
||||
// The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
|
||||
// cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
|
||||
// cargo test -p noxa-llm env_var -- --ignored --test-threads=1
|
||||
#[test]
|
||||
#[ignore = "mutates process env; run with --test-threads=1"]
|
||||
fn env_var_key_fallback() {
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
/// Shared test utilities for webclaw-llm.
|
||||
/// Shared test utilities for noxa-llm.
|
||||
///
|
||||
/// Provides a configurable mock LLM provider for unit tests across
|
||||
/// extract, chain, and other modules that need a fake LLM backend.
|
||||
|
|
@ -1,19 +1,19 @@
|
|||
[package]
|
||||
name = "webclaw-mcp"
|
||||
description = "MCP server for webclaw web extraction toolkit"
|
||||
name = "noxa-mcp"
|
||||
description = "MCP server for noxa web extraction toolkit"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[[bin]]
|
||||
name = "webclaw-mcp"
|
||||
name = "noxa-mcp"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
webclaw-core = { workspace = true }
|
||||
webclaw-fetch = { workspace = true }
|
||||
webclaw-llm = { workspace = true }
|
||||
webclaw-pdf = { workspace = true }
|
||||
noxa-core = { workspace = true }
|
||||
noxa-fetch = { workspace = true }
|
||||
noxa-llm = { workspace = true }
|
||||
noxa-pdf = { workspace = true }
|
||||
rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] }
|
||||
schemars = "1.0"
|
||||
dotenvy = { workspace = true }
|
||||
|
|
@ -1,25 +1,26 @@
|
|||
/// Cloud API fallback for protected sites.
|
||||
///
|
||||
/// When local fetch returns a challenge page, this module retries
|
||||
/// via api.webclaw.io. Requires WEBCLAW_API_KEY to be set.
|
||||
/// via api.noxa.io. Requires NOXA_API_KEY to be set.
|
||||
use std::time::Duration;
|
||||
|
||||
use serde_json::{Value, json};
|
||||
use tracing::info;
|
||||
|
||||
const API_BASE: &str = "https://api.webclaw.io/v1";
|
||||
|
||||
/// Lightweight client for the webclaw cloud API.
|
||||
const API_BASE: &str = "https://api.noxa.io/v1";
|
||||
|
||||
/// Lightweight client for the noxa cloud API.
|
||||
pub struct CloudClient {
|
||||
api_key: String,
|
||||
http: reqwest::Client,
|
||||
}
|
||||
|
||||
impl CloudClient {
|
||||
/// Create a new cloud client from WEBCLAW_API_KEY env var.
|
||||
/// Create a new cloud client from NOXA_API_KEY env var.
|
||||
/// Returns None if the key is not set.
|
||||
pub fn from_env() -> Option<Self> {
|
||||
let key = std::env::var("WEBCLAW_API_KEY").ok()?;
|
||||
let key = std::env::var("NOXA_API_KEY").ok()?;
|
||||
if key.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
|
@ -114,7 +115,7 @@ fn truncate_error(text: &str) -> &str {
|
|||
|
||||
/// Check if fetched HTML looks like a bot protection challenge page.
|
||||
/// Detects common bot protection challenge pages.
|
||||
pub fn is_bot_protected(html: &str, headers: &webclaw_fetch::HeaderMap) -> bool {
|
||||
pub fn is_bot_protected(html: &str, headers: &noxa_fetch::HeaderMap) -> bool {
|
||||
let html_lower = html.to_lowercase();
|
||||
|
||||
// Cloudflare challenge page
|
||||
|
|
@ -199,7 +200,7 @@ pub fn needs_js_rendering(word_count: usize, html: &str) -> bool {
|
|||
/// Result of a smart fetch: either local extraction or cloud API response.
|
||||
pub enum SmartFetchResult {
|
||||
/// Successfully extracted locally.
|
||||
Local(Box<webclaw_core::ExtractionResult>),
|
||||
Local(Box<noxa_core::ExtractionResult>),
|
||||
/// Fell back to cloud API. Contains the API response JSON.
|
||||
Cloud(Value),
|
||||
}
|
||||
|
|
@ -210,7 +211,7 @@ pub enum SmartFetchResult {
|
|||
/// If no API key is configured and local fetch is blocked, returns an error
|
||||
/// with a helpful message.
|
||||
pub async fn smart_fetch(
|
||||
client: &webclaw_fetch::FetchClient,
|
||||
client: &noxa_fetch::FetchClient,
|
||||
cloud: Option<&CloudClient>,
|
||||
url: &str,
|
||||
include_selectors: &[String],
|
||||
|
|
@ -239,7 +240,7 @@ pub async fn smart_fetch(
|
|||
}
|
||||
|
||||
// Step 3: Extract locally
|
||||
let options = webclaw_core::ExtractionOptions {
|
||||
let options = noxa_core::ExtractionOptions {
|
||||
include_selectors: include_selectors.to_vec(),
|
||||
exclude_selectors: exclude_selectors.to_vec(),
|
||||
only_main_content,
|
||||
|
|
@ -247,7 +248,7 @@ pub async fn smart_fetch(
|
|||
};
|
||||
|
||||
let extraction =
|
||||
webclaw_core::extract_with_options(&fetch_result.html, Some(&fetch_result.url), &options)
|
||||
noxa_core::extract_with_options(&fetch_result.html, Some(&fetch_result.url), &options)
|
||||
.map_err(|e| format!("Extraction failed: {e}"))?;
|
||||
|
||||
// Step 4: Check for JS-rendered pages (low content from large HTML)
|
||||
|
|
@ -295,8 +296,8 @@ async fn cloud_fallback(
|
|||
Ok(SmartFetchResult::Cloud(resp))
|
||||
}
|
||||
None => Err(format!(
|
||||
"Bot protection detected on {url}. Set WEBCLAW_API_KEY for automatic cloud bypass. \
|
||||
Get a key at https://webclaw.io"
|
||||
"Bot protection detected on {url}. Set NOXA_API_KEY for automatic cloud bypass. \
|
||||
Get a key at https://noxa.io"
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
/// webclaw-mcp: MCP (Model Context Protocol) server for webclaw.
|
||||
/// noxa-mcp: MCP (Model Context Protocol) server for noxa.
|
||||
/// Exposes web extraction tools over stdio transport for AI agents
|
||||
/// like Claude Desktop, Claude Code, and other MCP clients.
|
||||
mod cloud;
|
||||
|
|
@ -8,7 +8,7 @@ mod tools;
|
|||
use rmcp::ServiceExt;
|
||||
use rmcp::transport::stdio;
|
||||
|
||||
use server::WebclawMcp;
|
||||
use server::NoxaMcp;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
|
|
@ -21,7 +21,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
.with_ansi(false)
|
||||
.init();
|
||||
|
||||
let service = WebclawMcp::new().await.serve(stdio()).await?;
|
||||
let service = NoxaMcp::new().await.serve(stdio()).await?;
|
||||
|
||||
service.waiting().await?;
|
||||
Ok(())
|
||||
|
|
@ -1,9 +1,9 @@
|
|||
/// MCP server implementation for webclaw.
|
||||
/// MCP server implementation for noxa.
|
||||
/// Exposes web extraction capabilities as tools for AI agents.
|
||||
///
|
||||
/// Uses a local-first architecture: fetches pages directly, then falls back
|
||||
/// to the webclaw cloud API (api.webclaw.io) when bot protection or
|
||||
/// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback.
|
||||
/// to the noxa cloud API (api.noxa.io) when bot protection or
|
||||
/// JS rendering is detected. Set NOXA_API_KEY for automatic fallback.
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
|
|
@ -18,19 +18,19 @@ use url::Url;
|
|||
use crate::cloud::{self, CloudClient, SmartFetchResult};
|
||||
use crate::tools::*;
|
||||
|
||||
pub struct WebclawMcp {
|
||||
pub struct NoxaMcp {
|
||||
tool_router: ToolRouter<Self>,
|
||||
fetch_client: Arc<webclaw_fetch::FetchClient>,
|
||||
llm_chain: Option<webclaw_llm::ProviderChain>,
|
||||
fetch_client: Arc<noxa_fetch::FetchClient>,
|
||||
llm_chain: Option<noxa_llm::ProviderChain>,
|
||||
cloud: Option<CloudClient>,
|
||||
}
|
||||
|
||||
/// Parse a browser string into a BrowserProfile.
|
||||
fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
|
||||
fn parse_browser(browser: Option<&str>) -> noxa_fetch::BrowserProfile {
|
||||
match browser {
|
||||
Some("firefox") => webclaw_fetch::BrowserProfile::Firefox,
|
||||
Some("random") => webclaw_fetch::BrowserProfile::Random,
|
||||
_ => webclaw_fetch::BrowserProfile::Chrome,
|
||||
Some("firefox") => noxa_fetch::BrowserProfile::Firefox,
|
||||
Some("random") => noxa_fetch::BrowserProfile::Random,
|
||||
_ => noxa_fetch::BrowserProfile::Chrome,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -58,28 +58,28 @@ const LOCAL_FETCH_TIMEOUT: Duration = Duration::from_secs(30);
|
|||
const RESEARCH_MAX_POLLS: u32 = 200;
|
||||
|
||||
#[tool_router]
|
||||
impl WebclawMcp {
|
||||
impl NoxaMcp {
|
||||
pub async fn new() -> Self {
|
||||
let mut config = webclaw_fetch::FetchConfig::default();
|
||||
let mut config = noxa_fetch::FetchConfig::default();
|
||||
|
||||
// Load proxy config from env vars or local file
|
||||
if let Ok(proxy) = std::env::var("WEBCLAW_PROXY") {
|
||||
info!("using single proxy from WEBCLAW_PROXY");
|
||||
if let Ok(proxy) = std::env::var("NOXA_PROXY") {
|
||||
info!("using single proxy from NOXA_PROXY");
|
||||
config.proxy = Some(proxy);
|
||||
}
|
||||
|
||||
let proxy_file = std::env::var("WEBCLAW_PROXY_FILE")
|
||||
let proxy_file = std::env::var("NOXA_PROXY_FILE")
|
||||
.ok()
|
||||
.unwrap_or_else(|| "proxies.txt".to_string());
|
||||
if std::path::Path::new(&proxy_file).exists()
|
||||
&& let Ok(pool) = webclaw_fetch::parse_proxy_file(&proxy_file)
|
||||
&& let Ok(pool) = noxa_fetch::parse_proxy_file(&proxy_file)
|
||||
&& !pool.is_empty()
|
||||
{
|
||||
info!(count = pool.len(), file = %proxy_file, "loaded proxy pool");
|
||||
config.proxy_pool = pool;
|
||||
}
|
||||
|
||||
let fetch_client = match webclaw_fetch::FetchClient::new(config) {
|
||||
let fetch_client = match noxa_fetch::FetchClient::new(config) {
|
||||
Ok(client) => client,
|
||||
Err(e) => {
|
||||
error!("failed to build FetchClient: {e}");
|
||||
|
|
@ -87,7 +87,7 @@ impl WebclawMcp {
|
|||
}
|
||||
};
|
||||
|
||||
let chain = webclaw_llm::ProviderChain::default().await;
|
||||
let chain = noxa_llm::ProviderChain::default().await;
|
||||
let llm_chain = if chain.is_empty() {
|
||||
warn!("no LLM providers available -- extract/summarize tools will fail");
|
||||
None
|
||||
|
|
@ -98,11 +98,11 @@ impl WebclawMcp {
|
|||
|
||||
let cloud = CloudClient::from_env();
|
||||
if cloud.is_some() {
|
||||
info!("cloud API fallback enabled (WEBCLAW_API_KEY set)");
|
||||
info!("cloud API fallback enabled (NOXA_API_KEY set)");
|
||||
} else {
|
||||
warn!(
|
||||
"WEBCLAW_API_KEY not set -- bot-protected sites will return challenge pages. \
|
||||
Get a key at https://webclaw.io"
|
||||
"NOXA_API_KEY not set -- bot-protected sites will return challenge pages. \
|
||||
Get a key at https://noxa.io"
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -129,7 +129,7 @@ impl WebclawMcp {
|
|||
}
|
||||
|
||||
/// Scrape a single URL and extract its content as markdown, LLM-optimized text, plain text, or full JSON.
|
||||
/// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected.
|
||||
/// Automatically falls back to the noxa cloud API when bot protection or JS rendering is detected.
|
||||
#[tool]
|
||||
async fn scrape(&self, Parameters(params): Parameters<ScrapeParams>) -> Result<String, String> {
|
||||
validate_url(&params.url)?;
|
||||
|
|
@ -147,21 +147,21 @@ impl WebclawMcp {
|
|||
.map(|c| c.join("; "));
|
||||
|
||||
// Use a custom client if non-default browser or cookies are provided
|
||||
let is_default_browser = matches!(browser, webclaw_fetch::BrowserProfile::Chrome);
|
||||
let is_default_browser = matches!(browser, noxa_fetch::BrowserProfile::Chrome);
|
||||
let needs_custom = !is_default_browser || cookie_header.is_some();
|
||||
let custom_client;
|
||||
let client: &webclaw_fetch::FetchClient = if needs_custom {
|
||||
let client: &noxa_fetch::FetchClient = if needs_custom {
|
||||
let mut headers = std::collections::HashMap::new();
|
||||
headers.insert("Accept-Language".to_string(), "en-US,en;q=0.9".to_string());
|
||||
if let Some(ref cookies) = cookie_header {
|
||||
headers.insert("Cookie".to_string(), cookies.clone());
|
||||
}
|
||||
let config = webclaw_fetch::FetchConfig {
|
||||
let config = noxa_fetch::FetchConfig {
|
||||
browser,
|
||||
headers,
|
||||
..Default::default()
|
||||
};
|
||||
custom_client = webclaw_fetch::FetchClient::new(config)
|
||||
custom_client = noxa_fetch::FetchClient::new(config)
|
||||
.map_err(|e| format!("Failed to build client: {e}"))?;
|
||||
&custom_client
|
||||
} else {
|
||||
|
|
@ -183,7 +183,7 @@ impl WebclawMcp {
|
|||
match result {
|
||||
SmartFetchResult::Local(extraction) => {
|
||||
let output = match format {
|
||||
"llm" => webclaw_core::to_llm_text(&extraction, Some(&params.url)),
|
||||
"llm" => noxa_core::to_llm_text(&extraction, Some(&params.url)),
|
||||
"text" => extraction.content.plain_text,
|
||||
"json" => serde_json::to_string_pretty(&extraction).unwrap_or_default(),
|
||||
_ => extraction.content.markdown,
|
||||
|
|
@ -221,7 +221,7 @@ impl WebclawMcp {
|
|||
|
||||
let format = params.format.as_deref().unwrap_or("markdown");
|
||||
|
||||
let config = webclaw_fetch::CrawlConfig {
|
||||
let config = noxa_fetch::CrawlConfig {
|
||||
max_depth: params.depth.unwrap_or(2) as usize,
|
||||
max_pages: params.max_pages.unwrap_or(50),
|
||||
concurrency: params.concurrency.unwrap_or(5),
|
||||
|
|
@ -229,7 +229,7 @@ impl WebclawMcp {
|
|||
..Default::default()
|
||||
};
|
||||
|
||||
let crawler = webclaw_fetch::Crawler::new(&params.url, config)
|
||||
let crawler = noxa_fetch::Crawler::new(&params.url, config)
|
||||
.map_err(|e| format!("Crawler init failed: {e}"))?;
|
||||
|
||||
let result = crawler.crawl(&params.url, None).await;
|
||||
|
|
@ -243,7 +243,7 @@ impl WebclawMcp {
|
|||
output.push_str(&format!("--- {} (depth {}) ---\n", page.url, page.depth));
|
||||
if let Some(ref extraction) = page.extraction {
|
||||
let content = match format {
|
||||
"llm" => webclaw_core::to_llm_text(extraction, Some(&page.url)),
|
||||
"llm" => noxa_core::to_llm_text(extraction, Some(&page.url)),
|
||||
"text" => extraction.content.plain_text.clone(),
|
||||
_ => extraction.content.markdown.clone(),
|
||||
};
|
||||
|
|
@ -261,7 +261,7 @@ impl WebclawMcp {
|
|||
#[tool]
|
||||
async fn map(&self, Parameters(params): Parameters<MapParams>) -> Result<String, String> {
|
||||
validate_url(&params.url)?;
|
||||
let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, &params.url)
|
||||
let entries = noxa_fetch::sitemap::discover(&self.fetch_client, &params.url)
|
||||
.await
|
||||
.map_err(|e| format!("Sitemap discovery failed: {e}"))?;
|
||||
|
||||
|
|
@ -302,7 +302,7 @@ impl WebclawMcp {
|
|||
match &r.result {
|
||||
Ok(extraction) => {
|
||||
let content = match format {
|
||||
"llm" => webclaw_core::to_llm_text(extraction, Some(&r.url)),
|
||||
"llm" => noxa_core::to_llm_text(extraction, Some(&r.url)),
|
||||
"text" => extraction.content.plain_text.clone(),
|
||||
_ => extraction.content.markdown.clone(),
|
||||
};
|
||||
|
|
@ -319,7 +319,7 @@ impl WebclawMcp {
|
|||
}
|
||||
|
||||
/// Extract structured data from a web page using an LLM. Provide either a JSON schema or a natural language prompt.
|
||||
/// Falls back to the webclaw cloud API when no local LLM is available or bot protection is detected.
|
||||
/// Falls back to the noxa cloud API when no local LLM is available or bot protection is detected.
|
||||
#[tool]
|
||||
async fn extract(
|
||||
&self,
|
||||
|
|
@ -334,7 +334,7 @@ impl WebclawMcp {
|
|||
// No local LLM — fall back to cloud API directly
|
||||
if self.llm_chain.is_none() {
|
||||
let cloud = self.cloud.as_ref().ok_or(
|
||||
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or WEBCLAW_API_KEY for cloud fallback.",
|
||||
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
|
||||
)?;
|
||||
let mut body = json!({"url": params.url});
|
||||
if let Some(ref schema) = params.schema {
|
||||
|
|
@ -351,7 +351,7 @@ impl WebclawMcp {
|
|||
|
||||
let llm_content = match self.smart_fetch_llm(&params.url).await? {
|
||||
SmartFetchResult::Local(extraction) => {
|
||||
webclaw_core::to_llm_text(&extraction, Some(&params.url))
|
||||
noxa_core::to_llm_text(&extraction, Some(&params.url))
|
||||
}
|
||||
SmartFetchResult::Cloud(resp) => resp
|
||||
.get("llm")
|
||||
|
|
@ -362,12 +362,12 @@ impl WebclawMcp {
|
|||
};
|
||||
|
||||
let data = if let Some(ref schema) = params.schema {
|
||||
webclaw_llm::extract::extract_json(&llm_content, schema, chain, None)
|
||||
noxa_llm::extract::extract_json(&llm_content, schema, chain, None)
|
||||
.await
|
||||
.map_err(|e| format!("LLM extraction failed: {e}"))?
|
||||
} else {
|
||||
let prompt = params.prompt.as_deref().unwrap();
|
||||
webclaw_llm::extract::extract_with_prompt(&llm_content, prompt, chain, None)
|
||||
noxa_llm::extract::extract_with_prompt(&llm_content, prompt, chain, None)
|
||||
.await
|
||||
.map_err(|e| format!("LLM extraction failed: {e}"))?
|
||||
};
|
||||
|
|
@ -376,7 +376,7 @@ impl WebclawMcp {
|
|||
}
|
||||
|
||||
/// Summarize the content of a web page using an LLM.
|
||||
/// Falls back to the webclaw cloud API when no local LLM is available or bot protection is detected.
|
||||
/// Falls back to the noxa cloud API when no local LLM is available or bot protection is detected.
|
||||
#[tool]
|
||||
async fn summarize(
|
||||
&self,
|
||||
|
|
@ -387,7 +387,7 @@ impl WebclawMcp {
|
|||
// No local LLM — fall back to cloud API directly
|
||||
if self.llm_chain.is_none() {
|
||||
let cloud = self.cloud.as_ref().ok_or(
|
||||
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or WEBCLAW_API_KEY for cloud fallback.",
|
||||
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
|
||||
)?;
|
||||
let mut body = json!({"url": params.url});
|
||||
if let Some(sentences) = params.max_sentences {
|
||||
|
|
@ -405,7 +405,7 @@ impl WebclawMcp {
|
|||
|
||||
let llm_content = match self.smart_fetch_llm(¶ms.url).await? {
|
||||
SmartFetchResult::Local(extraction) => {
|
||||
webclaw_core::to_llm_text(&extraction, Some(¶ms.url))
|
||||
noxa_core::to_llm_text(&extraction, Some(¶ms.url))
|
||||
}
|
||||
SmartFetchResult::Cloud(resp) => resp
|
||||
.get("llm")
|
||||
|
|
@ -415,17 +415,17 @@ impl WebclawMcp {
|
|||
.to_string(),
|
||||
};
|
||||
|
||||
webclaw_llm::summarize::summarize(&llm_content, params.max_sentences, chain, None)
|
||||
noxa_llm::summarize::summarize(&llm_content, params.max_sentences, chain, None)
|
||||
.await
|
||||
.map_err(|e| format!("Summarization failed: {e}"))
|
||||
}
|
||||
|
||||
/// Compare the current content of a URL against a previous extraction snapshot, showing what changed.
|
||||
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
|
||||
/// Automatically falls back to the noxa cloud API when bot protection is detected.
|
||||
#[tool]
|
||||
async fn diff(&self, Parameters(params): Parameters<DiffParams>) -> Result<String, String> {
|
||||
validate_url(¶ms.url)?;
|
||||
let previous: webclaw_core::ExtractionResult =
|
||||
let previous: noxa_core::ExtractionResult =
|
||||
serde_json::from_str(¶ms.previous_snapshot)
|
||||
.map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
|
||||
|
||||
|
|
@ -442,7 +442,7 @@ impl WebclawMcp {
|
|||
|
||||
match result {
|
||||
SmartFetchResult::Local(current) => {
|
||||
let content_diff = webclaw_core::diff::diff(&previous, ¤t);
|
||||
let content_diff = noxa_core::diff::diff(&previous, ¤t);
|
||||
Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default())
|
||||
}
|
||||
SmartFetchResult::Cloud(resp) => {
|
||||
|
|
@ -457,8 +457,8 @@ impl WebclawMcp {
|
|||
);
|
||||
}
|
||||
|
||||
let current = webclaw_core::ExtractionResult {
|
||||
content: webclaw_core::Content {
|
||||
let current = noxa_core::ExtractionResult {
|
||||
content: noxa_core::Content {
|
||||
markdown: markdown.to_string(),
|
||||
plain_text: markdown.to_string(),
|
||||
links: Vec::new(),
|
||||
|
|
@ -466,7 +466,7 @@ impl WebclawMcp {
|
|||
code_blocks: Vec::new(),
|
||||
raw_html: None,
|
||||
},
|
||||
metadata: webclaw_core::Metadata {
|
||||
metadata: noxa_core::Metadata {
|
||||
title: None,
|
||||
description: None,
|
||||
author: None,
|
||||
|
|
@ -482,14 +482,14 @@ impl WebclawMcp {
|
|||
structured_data: Vec::new(),
|
||||
};
|
||||
|
||||
let content_diff = webclaw_core::diff::diff(&previous, ¤t);
|
||||
let content_diff = noxa_core::diff::diff(&previous, ¤t);
|
||||
Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract brand identity (colors, fonts, logo, favicon) from a website's HTML and CSS.
|
||||
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
|
||||
/// Automatically falls back to the noxa cloud API when bot protection is detected.
|
||||
#[tool]
|
||||
async fn brand(&self, Parameters(params): Parameters<BrandParams>) -> Result<String, String> {
|
||||
validate_url(¶ms.url)?;
|
||||
|
|
@ -508,21 +508,21 @@ impl WebclawMcp {
|
|||
return Ok(serde_json::to_string_pretty(&resp).unwrap_or_default());
|
||||
} else {
|
||||
return Err(format!(
|
||||
"Bot protection detected on {}. Set WEBCLAW_API_KEY for automatic cloud bypass. \
|
||||
Get a key at https://webclaw.io",
|
||||
"Bot protection detected on {}. Set NOXA_API_KEY for automatic cloud bypass. \
|
||||
Get a key at https://noxa.io",
|
||||
params.url
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let identity =
|
||||
webclaw_core::brand::extract_brand(&fetch_result.html, Some(&fetch_result.url));
|
||||
noxa_core::brand::extract_brand(&fetch_result.html, Some(&fetch_result.url));
|
||||
|
||||
Ok(serde_json::to_string_pretty(&identity).unwrap_or_default())
|
||||
}
|
||||
|
||||
/// Run a deep research investigation on a topic or question. Requires WEBCLAW_API_KEY.
|
||||
/// Saves full result to ~/.webclaw/research/ and returns the file path + key findings.
|
||||
/// Run a deep research investigation on a topic or question. Requires NOXA_API_KEY.
|
||||
/// Saves full result to ~/.noxa/research/ and returns the file path + key findings.
|
||||
/// Checks cache first — same query returns the cached result without spending credits.
|
||||
#[tool]
|
||||
async fn research(
|
||||
|
|
@ -532,7 +532,7 @@ impl WebclawMcp {
|
|||
let cloud = self
|
||||
.cloud
|
||||
.as_ref()
|
||||
.ok_or("Research requires WEBCLAW_API_KEY. Get a key at https://webclaw.io")?;
|
||||
.ok_or("Research requires NOXA_API_KEY. Get a key at https://noxa.io")?;
|
||||
|
||||
let research_dir = research_dir();
|
||||
let slug = slugify(¶ms.query);
|
||||
|
|
@ -622,17 +622,17 @@ impl WebclawMcp {
|
|||
|
||||
Err(format!(
|
||||
"Research job {job_id} timed out after ~10 minutes of polling. \
|
||||
Check status manually via the webclaw API: GET /v1/research/{job_id}"
|
||||
Check status manually via the noxa API: GET /v1/research/{job_id}"
|
||||
))
|
||||
}
|
||||
|
||||
/// Search the web for a query and return structured results. Requires WEBCLAW_API_KEY.
|
||||
/// Search the web for a query and return structured results. Requires NOXA_API_KEY.
|
||||
#[tool]
|
||||
async fn search(&self, Parameters(params): Parameters<SearchParams>) -> Result<String, String> {
|
||||
let cloud = self
|
||||
.cloud
|
||||
.as_ref()
|
||||
.ok_or("Search requires WEBCLAW_API_KEY. Get a key at https://webclaw.io")?;
|
||||
.ok_or("Search requires NOXA_API_KEY. Get a key at https://noxa.io")?;
|
||||
|
||||
let mut body = json!({ "query": params.query });
|
||||
if let Some(num) = params.num_results {
|
||||
|
|
@ -670,12 +670,12 @@ impl WebclawMcp {
|
|||
}
|
||||
|
||||
#[tool_handler]
|
||||
impl ServerHandler for WebclawMcp {
|
||||
impl ServerHandler for NoxaMcp {
|
||||
fn get_info(&self) -> ServerInfo {
|
||||
ServerInfo::new(ServerCapabilities::builder().enable_tools().build())
|
||||
.with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION")))
|
||||
.with_server_info(Implementation::new("noxa-mcp", env!("CARGO_PKG_VERSION")))
|
||||
.with_instructions(String::from(
|
||||
"Webclaw MCP server -- web content extraction for AI agents. \
|
||||
"Noxa MCP server -- web content extraction for AI agents. \
|
||||
Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.",
|
||||
))
|
||||
}
|
||||
|
|
@ -688,7 +688,7 @@ impl ServerHandler for WebclawMcp {
|
|||
fn research_dir() -> std::path::PathBuf {
|
||||
let dir = dirs::home_dir()
|
||||
.unwrap_or_else(|| std::path::PathBuf::from("."))
|
||||
.join(".webclaw")
|
||||
.join(".noxa")
|
||||
.join("research");
|
||||
std::fs::create_dir_all(&dir).ok();
|
||||
dir
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "webclaw-pdf"
|
||||
description = "PDF text extraction for webclaw"
|
||||
name = "noxa-pdf"
|
||||
description = "PDF text extraction for noxa"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
/// PDF text extraction for webclaw.
|
||||
/// PDF text extraction for noxa.
|
||||
///
|
||||
/// Uses pdf-extract (backed by lopdf) to pull text from PDF bytes.
|
||||
/// No OCR -- text-based PDFs only. Scanned PDFs return EmptyPdf in Auto mode.
|
||||
|
|
@ -1,513 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# deploy/hetzner.sh — One-click Hetzner VPS deployment for webclaw
|
||||
#
|
||||
# Creates a Hetzner Cloud VPS with Docker, deploys webclaw + Ollama,
|
||||
# and optionally configures nginx + SSL.
|
||||
#
|
||||
# Usage:
|
||||
# ./deploy/hetzner.sh # Interactive setup
|
||||
# ./deploy/hetzner.sh --destroy # Tear down the server
|
||||
#
|
||||
# Server type recommendations:
|
||||
# cpx11: 2 vCPU, 2GB RAM, ~4.59 EUR/mo — Minimum (scraping only, no LLM)
|
||||
# cpx21: 3 vCPU, 4GB RAM, ~8.49 EUR/mo — Recommended (scraping + small LLM)
|
||||
# cpx31: 4 vCPU, 8GB RAM, ~15.59 EUR/mo — Best (scraping + LLM + high concurrency)
|
||||
# cpx41: 8 vCPU, 16GB RAM, ~28.19 EUR/mo — Heavy use (high-volume crawling)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
# Base URL that hetzner_api prepends to every request path.
HETZNER_API="https://api.hetzner.cloud/v1"
|
||||
# Server name used when creating the VPS and when looking it up again (find_server).
SERVER_NAME="webclaw"
|
||||
# Repository that the generated cloud-init script clones into /opt/webclaw.
REPO_URL="https://github.com/0xMassi/webclaw.git"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Colors
|
||||
# ---------------------------------------------------------------------------
|
||||
# ANSI escape sequences. They are stored as literal backslash text and are only
# turned into real escapes because they are embedded in printf *format* strings
# by the logging and prompt helpers.
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
CYAN='\033[0;36m'
|
||||
BOLD='\033[1m'
|
||||
DIM='\033[2m'
|
||||
# Resets all attributes; appended after every colored fragment.
RESET='\033[0m'
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging helpers. Each prints a colored status tag followed by all of its
# arguments joined into a single string, terminated by a newline.

# Informational message on stdout.
info() {
    printf "${BLUE}[*]${RESET} %s\n" "$*"
}

# Success message on stdout.
success() {
    printf "${GREEN}[+]${RESET} %s\n" "$*"
}

# Warning message on stdout.
warn() {
    printf "${YELLOW}[!]${RESET} %s\n" "$*"
}

# Error message on stderr.
error() {
    printf "${RED}[x]${RESET} %s\n" "$*" >&2
}

# Report an error on stderr, then terminate the script with status 1.
fatal() {
    error "$*"
    exit 1
}
|
||||
|
||||
# Ask the user for a value and store it in a caller-named variable.
#
# Arguments:
#   $1  name of the variable to assign
#   $2  prompt text shown to the user
#   $3  optional default; shown in brackets and used when the reply is empty
#
# The reply is assigned with `printf -v`, so it is stored verbatim. The previous
# `eval`-based assignment re-parsed the reply as shell code, which executed
# `$(...)`/backticks typed by the user and broke on embedded double quotes.
prompt() {
    local var_name="$1" prompt_text="$2" default="${3:-}"
    # Scratch variable for the reply; kept local so it does not leak globally.
    local __reply
    if [[ -n "$default" ]]; then
        printf "${CYAN} %s${DIM} [%s]${RESET}: " "$prompt_text" "$default"
    else
        printf "${CYAN} %s${RESET}: " "$prompt_text"
    fi
    read -r __reply
    printf -v "$var_name" '%s' "${__reply:-$default}"
}
|
||||
|
||||
# Like prompt, but reads the reply without echoing it to the terminal.
#
# Arguments:
#   $1  name of the variable to assign
#   $2  prompt text shown to the user
#   $3  optional default; shown in brackets and used when the reply is empty
#
# Note that the default itself IS displayed in the prompt. The reply is stored
# verbatim via `printf -v`; the previous `eval` assignment would have executed
# or mangled secrets containing `$`, backticks or double quotes.
prompt_secret() {
    local var_name="$1" prompt_text="$2" default="${3:-}"
    # Scratch variable for the reply; kept local so the secret does not linger
    # in a global after the function returns.
    local __reply
    if [[ -n "$default" ]]; then
        printf "${CYAN} %s${DIM} [%s]${RESET}: " "$prompt_text" "$default"
    else
        printf "${CYAN} %s${RESET}: " "$prompt_text"
    fi
    read -rs __reply
    # read -s swallows the user's newline, so emit one to keep output aligned.
    echo
    printf -v "$var_name" '%s' "${__reply:-$default}"
}
|
||||
|
||||
# Print a random 32-character lowercase hex key (16 random bytes) on stdout.
generate_key() {
    if command -v openssl &>/dev/null; then
        openssl rand -hex 16
    else
        # Fallback without openssl. Read exactly 16 bytes and hex-encode them so
        # every stage of the pipeline sees EOF and exits 0. The previous form,
        # `tr ... < /dev/urandom | head -c 32`, left tr writing into a closed
        # pipe; under `set -o pipefail` that SIGPIPE (status 141) made the whole
        # function fail and aborted the script.
        head -c 16 /dev/urandom | od -An -tx1 | tr -d ' \n'
    fi
}
|
||||
|
||||
# Perform an authenticated JSON request against the Hetzner Cloud API.
#
# Arguments:
#   $1    HTTP method (GET, POST, DELETE, ...)
#   $2    request path, appended to $HETZNER_API
#   $3..  extra arguments passed straight through to curl (e.g. -d PAYLOAD)
#
# Uses `curl -sf`, so HTTP error statuses produce a non-zero exit and no body.
hetzner_api() {
    local http_method="$1" endpoint="$2"
    shift 2

    local -a request=(
        -sf
        -X "$http_method"
        -H "Authorization: Bearer $HETZNER_TOKEN"
        -H "Content-Type: application/json"
        "${HETZNER_API}${endpoint}"
    )
    curl "${request[@]}" "$@"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Preflight checks
|
||||
# ---------------------------------------------------------------------------
|
||||
# Verify that the external tools this script shells out to are on PATH.
# Collects every missing tool first so the user sees the full list at once,
# then aborts through fatal.
preflight() {
    local tool
    local -a absent=()

    for tool in curl jq ssh; do
        if ! command -v "$tool" &>/dev/null; then
            absent+=("$tool")
        fi
    done

    if (( ${#absent[@]} > 0 )); then
        fatal "Missing required tools: ${absent[*]}. Install them and try again."
    fi
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Validate Hetzner token
|
||||
# ---------------------------------------------------------------------------
|
||||
# Check that $HETZNER_TOKEN is accepted by the API by issuing a minimal
# server-list request. Any failure of that request is reported as an invalid
# token via fatal; the response body itself is discarded.
validate_token() {
    info "Validating Hetzner API token..."

    if ! hetzner_api GET "/servers?per_page=1" >/dev/null 2>&1; then
        fatal "Invalid Hetzner API token. Get one at: https://console.hetzner.cloud"
    fi

    success "Token is valid."
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Check if server already exists
|
||||
# ---------------------------------------------------------------------------
|
||||
# Look up the server named $SERVER_NAME and print its JSON record.
# Prints nothing when the API returns no matching server.
find_server() {
    local api_reply
    api_reply=$(hetzner_api GET "/servers?name=$SERVER_NAME")
    # `// empty` suppresses the literal "null" jq would otherwise print.
    jq -r '.servers[0] // empty' <<< "$api_reply"
}
|
||||
|
||||
# Print the numeric ID of the existing server, or nothing if there is none.
get_server_id() {
    local record
    record=$(find_server)

    # No record (empty or the string "null"): print nothing, succeed.
    if [[ -z "$record" || "$record" == "null" ]]; then
        return 0
    fi

    jq -r '.id' <<< "$record"
}
|
||||
|
||||
# Print the public IPv4 address of the existing server, or nothing if there
# is none.
get_server_ip() {
    local record
    record=$(find_server)

    # No record (empty or the string "null"): print nothing, succeed.
    if [[ -z "$record" || "$record" == "null" ]]; then
        return 0
    fi

    jq -r '.public_net.ipv4.ip' <<< "$record"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Destroy mode
|
||||
# ---------------------------------------------------------------------------
|
||||
# Interactive tear-down: locate the server named $SERVER_NAME, require the user
# to type "destroy", delete it through the API, and drop its IP from the local
# SSH known_hosts file. Exits 0 (without deleting) when no server exists or the
# confirmation does not match.
destroy_server() {
    info "Looking for existing webclaw server..."

    local server_id
    server_id=$(get_server_id)
    if [[ -z "$server_id" ]]; then
        warn "No server named '$SERVER_NAME' found. Nothing to destroy."
        exit 0
    fi

    # Fetch the IP before deletion; it is needed afterwards for known_hosts.
    local ip
    ip=$(get_server_ip)

    warn "Found server: $SERVER_NAME (ID: $server_id, IP: $ip)"
    printf "${RED} This will permanently delete the server and all its data.${RESET}\n"
    printf "${CYAN} Type 'destroy' to confirm${RESET}: "
    read -r confirmation

    # Anything other than the exact word aborts.
    [[ "$confirmation" == "destroy" ]] || {
        info "Aborted."
        exit 0
    }

    info "Destroying server $server_id..."
    hetzner_api DELETE "/servers/$server_id" > /dev/null
    success "Server destroyed."

    # Clean SSH known_hosts
    if [[ -n "$ip" ]]; then
        # Best effort: a missing entry or known_hosts file is not an error.
        ssh-keygen -R "$ip" 2>/dev/null || true
        info "Removed $ip from SSH known_hosts."
    fi
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Build cloud-init user_data
|
||||
# ---------------------------------------------------------------------------
|
||||
# Render the cloud-init user_data document for a new server and print it on
# stdout.
#
# Arguments:
#   $1  auth key written to the .env file as WEBCLAW_AUTH_KEY
#   $2  OpenAI API key; an OPENAI_API_KEY line is added only when non-empty
#   $3  Anthropic API key; an ANTHROPIC_API_KEY line is added only when non-empty
#   $4  domain; the nginx + certbot commands are included only when non-empty
#   $5  Ollama model name written to .env and pulled in the background
#
# The final heredoc uses an unquoted delimiter (CLOUDINIT), so $REPO_URL,
# $env_content, $nginx_block and $ollama_model are expanded locally while the
# `\$`-escaped references are left for the shell that runs on the server.
#
# NOTE(review): all arguments are interpolated into shell and YAML text without
# any escaping; values containing quotes or newlines would presumably corrupt
# the generated document — confirm what inputs callers allow.
# NOTE(review): the indentation of the embedded YAML/heredoc text is part of the
# output; verify it against the real file, since this listing may not show it
# faithfully.
build_cloud_init() {
|
||||
local auth_key="$1" openai_key="$2" anthropic_key="$3" domain="$4" ollama_model="$5"
|
||||
|
||||
# Build .env content
|
||||
local env_content="# webclaw deployment — generated by hetzner.sh
|
||||
WEBCLAW_HOST=0.0.0.0
|
||||
WEBCLAW_PORT=3000
|
||||
WEBCLAW_AUTH_KEY=$auth_key
|
||||
OLLAMA_HOST=http://ollama:11434
|
||||
OLLAMA_MODEL=$ollama_model
|
||||
WEBCLAW_LOG=info"
|
||||
|
||||
if [[ -n "$openai_key" ]]; then
|
||||
env_content="$env_content
|
||||
OPENAI_API_KEY=$openai_key"
|
||||
fi
|
||||
if [[ -n "$anthropic_key" ]]; then
|
||||
env_content="$env_content
|
||||
ANTHROPIC_API_KEY=$anthropic_key"
|
||||
fi
|
||||
|
||||
# Nginx + certbot block (only if domain provided)
|
||||
local nginx_block=""
|
||||
if [[ -n "$domain" ]]; then
|
||||
# NOTE(review): `--register-unsolicited-contact` in the certbot command below
# does not look like a documented certbot option — confirm against the certbot
# CLI reference. If certbot rejects it, the `|| echo` fallback would always run.
nginx_block="
|
||||
# --- Nginx reverse proxy + SSL ---
|
||||
- apt-get install -y nginx certbot python3-certbot-nginx
|
||||
|
||||
- |
|
||||
cat > /etc/nginx/sites-available/webclaw <<'NGINX'
|
||||
server {
|
||||
listen 80;
|
||||
server_name $domain;
|
||||
|
||||
location / {
|
||||
proxy_pass http://127.0.0.1:3000;
|
||||
proxy_set_header Host \$host;
|
||||
proxy_set_header X-Real-IP \$remote_addr;
|
||||
proxy_set_header X-Forwarded-For \$proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto \$scheme;
|
||||
proxy_read_timeout 120s;
|
||||
proxy_connect_timeout 10s;
|
||||
}
|
||||
}
|
||||
NGINX
|
||||
|
||||
- ln -sf /etc/nginx/sites-available/webclaw /etc/nginx/sites-enabled/webclaw
|
||||
- rm -f /etc/nginx/sites-enabled/default
|
||||
- systemctl restart nginx
|
||||
|
||||
# SSL cert (will fail silently if DNS not pointed yet)
|
||||
- certbot --nginx -d $domain --non-interactive --agree-tos --register-unsolicited-contact -m admin@$domain || echo 'Certbot failed — point DNS to this IP and run: certbot --nginx -d $domain'
|
||||
"
|
||||
fi
|
||||
|
||||
cat <<CLOUDINIT
|
||||
#cloud-config
|
||||
package_update: true
|
||||
|
||||
runcmd:
|
||||
# --- Firewall ---
|
||||
- ufw allow 22/tcp
|
||||
- ufw allow 80/tcp
|
||||
- ufw allow 443/tcp
|
||||
- ufw allow 3000/tcp
|
||||
- ufw --force enable
|
||||
|
||||
# --- Docker (already installed on hetzner docker-ce image, but ensure compose) ---
|
||||
- |
|
||||
if ! command -v docker &>/dev/null; then
|
||||
curl -fsSL https://get.docker.com | sh
|
||||
fi
|
||||
- |
|
||||
if ! docker compose version &>/dev/null; then
|
||||
apt-get install -y docker-compose-plugin
|
||||
fi
|
||||
|
||||
# --- Clone and deploy ---
|
||||
- git clone $REPO_URL /opt/webclaw
|
||||
- |
|
||||
cat > /opt/webclaw/.env <<'DOTENV'
|
||||
$env_content
|
||||
DOTENV
|
||||
# Remove leading whitespace from heredoc
|
||||
sed -i 's/^ //' /opt/webclaw/.env
|
||||
|
||||
$nginx_block
|
||||
# --- Start services ---
|
||||
- cd /opt/webclaw && docker compose up -d --build
|
||||
|
||||
# --- Pull Ollama model in background (non-blocking) ---
|
||||
- |
|
||||
nohup bash -c '
|
||||
echo "Waiting for Ollama to start..."
|
||||
for i in \$(seq 1 60); do
|
||||
if docker compose -f /opt/webclaw/docker-compose.yml exec -T ollama ollama list &>/dev/null; then
|
||||
echo "Ollama ready. Pulling $ollama_model..."
|
||||
docker compose -f /opt/webclaw/docker-compose.yml exec -T ollama ollama pull $ollama_model
|
||||
echo "Model $ollama_model pulled."
|
||||
break
|
||||
fi
|
||||
sleep 5
|
||||
done
|
||||
' > /var/log/ollama-pull.log 2>&1 &
|
||||
|
||||
CLOUDINIT
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Wait for SSH
|
||||
# ---------------------------------------------------------------------------
|
||||
# Poll until a non-interactive SSH login as root succeeds.
#
# Arguments:
#   $1  server IP address
#
# Tries up to 40 times, sleeping 5 seconds after each failed attempt and
# printing a dot as progress. Returns 0 on the first successful connection,
# 1 if every attempt fails.
wait_for_ssh() {
    local ip="$1" max_attempts=40
    # BatchMode prevents password prompts; host key checking is disabled, so
    # the new server's key is accepted without verification.
    local -a ssh_opts=(
        -o ConnectTimeout=3
        -o StrictHostKeyChecking=no
        -o BatchMode=yes
    )

    info "Waiting for server to become reachable (this takes 1-3 minutes)..."

    for i in $(seq 1 $max_attempts); do
        ssh "${ssh_opts[@]}" "root@$ip" "echo ok" &>/dev/null && return 0
        printf "."
        sleep 5
    done

    # Terminate the row of progress dots before reporting failure.
    echo
    return 1
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Wait for Docker build to complete
|
||||
# ---------------------------------------------------------------------------
|
||||
# Poll the server over SSH until a container whose name matches "webclaw"
# reports an "Up" status.
#
# Arguments:
#   $1  server IP address
#
# Tries up to 60 times with 15 seconds between attempts, printing a dot as
# progress. Returns 0 once the container is up, 1 otherwise.
# NOTE(review): no call to this function is visible in this script — confirm
# whether it is still needed.
wait_for_docker() {
    local ip="$1" max_attempts=60
    local remote_cmd="docker ps --filter name=webclaw --format '{{.Status}}' 2>/dev/null | head -1"

    info "Waiting for Docker build to complete (this takes 5-15 minutes on first deploy)..."

    for i in $(seq 1 $max_attempts); do
        local status
        # An SSH failure is treated the same as "no status yet".
        status=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
            "root@$ip" "$remote_cmd" 2>/dev/null || echo "")

        case "$status" in
            *"Up"*) return 0 ;;
        esac

        printf "."
        sleep 15
    done

    # Terminate the row of progress dots before reporting failure.
    echo
    return 1
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Get SSH keys from Hetzner account
|
||||
# ---------------------------------------------------------------------------
|
||||
# Print a JSON array with the IDs of all SSH keys registered in the Hetzner
# account (an empty array when there are none).
get_ssh_keys() {
    local api_reply
    api_reply=$(hetzner_api GET "/ssh_keys")
    jq -r '[.ssh_keys[].id] // []' <<< "$api_reply"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main: create server
|
||||
# ---------------------------------------------------------------------------
|
||||
# Interactive deployment: gather settings, create the Hetzner server with a
# generated cloud-init document, wait for SSH, and print follow-up instructions.
#
# Exits 1 if a server named $SERVER_NAME already exists or creation fails, and
# exits 0 if the user declines the confirmation prompt. Sets the globals
# SERVER_TYPE, LOCATION, DOMAIN, OPENAI_KEY, ANTHROPIC_KEY, AUTH_KEY and
# OLLAMA_MODEL through the prompt helpers.
create_server() {
    # Check for existing server
    local existing_id
    existing_id=$(get_server_id)
    if [[ -n "$existing_id" ]]; then
        local existing_ip
        existing_ip=$(get_server_ip)
        warn "Server '$SERVER_NAME' already exists (ID: $existing_id, IP: $existing_ip)"
        warn "Run with --destroy first, or use a different name."
        exit 1
    fi

    # Gather configuration
    echo
    printf "${BOLD}${GREEN} webclaw Hetzner Deploy${RESET}\n"
    printf "${DIM} One-click VPS deployment for webclaw REST API + Ollama${RESET}\n"
    echo

    prompt SERVER_TYPE "Server type (cpx11/cpx21/cpx31/cpx41)" "cpx21"
    prompt LOCATION "Region (fsn1/nbg1/hel1/ash/hil)" "fsn1"
    prompt DOMAIN "Domain for SSL (leave empty to skip)" ""
    prompt_secret OPENAI_KEY "OpenAI API key (optional)" ""
    prompt_secret ANTHROPIC_KEY "Anthropic API key (optional)" ""

    # Offer a freshly generated key as the default auth key.
    local generated_auth_key
    generated_auth_key=$(generate_key)
    prompt_secret AUTH_KEY "Webclaw auth key" "$generated_auth_key"

    prompt OLLAMA_MODEL "Ollama model to pre-pull" "qwen3:1.7b"

    echo
    info "Configuration:"
    printf " Server type: ${BOLD}%s${RESET}\n" "$SERVER_TYPE"
    printf " Region: ${BOLD}%s${RESET}\n" "$LOCATION"
    printf " Domain: ${BOLD}%s${RESET}\n" "${DOMAIN:-none}"
    # Provider keys are only reported as set / not set, never echoed.
    printf " OpenAI key: ${BOLD}%s${RESET}\n" "$([ -n "$OPENAI_KEY" ] && echo 'set' || echo 'not set')"
    printf " Anthropic key:${BOLD}%s${RESET}\n" "$([ -n "$ANTHROPIC_KEY" ] && echo 'set' || echo 'not set')"
    printf " Auth key: ${BOLD}%s${RESET}\n" "$AUTH_KEY"
    printf " Ollama model: ${BOLD}%s${RESET}\n" "$OLLAMA_MODEL"
    echo

    printf "${CYAN} Proceed? (y/n)${RESET}: "
    read -r confirm
    [[ "$confirm" =~ ^[Yy]$ ]] || { info "Aborted."; exit 0; }

    # Build cloud-init
    local user_data
    user_data=$(build_cloud_init "$AUTH_KEY" "$OPENAI_KEY" "$ANTHROPIC_KEY" "$DOMAIN" "$OLLAMA_MODEL")

    # Get SSH keys
    local ssh_keys
    ssh_keys=$(get_ssh_keys)
    info "Found $(echo "$ssh_keys" | jq length) SSH key(s) in your Hetzner account."

    # Create server
    info "Creating $SERVER_TYPE server in $LOCATION..."
    # jq builds the payload so every user-supplied value is JSON-escaped.
    local create_payload
    create_payload=$(jq -n \
        --arg name "$SERVER_NAME" \
        --arg server_type "$SERVER_TYPE" \
        --arg location "$LOCATION" \
        --arg user_data "$user_data" \
        --argjson ssh_keys "$ssh_keys" \
        '{
            name: $name,
            server_type: $server_type,
            location: $location,
            image: "docker-ce",
            ssh_keys: $ssh_keys,
            user_data: $user_data,
            public_net: {
                enable_ipv4: true,
                enable_ipv6: true
            }
        }')

    local response
    response=$(hetzner_api POST "/servers" -d "$create_payload") || {
        fatal "Failed to create server. Check your Hetzner token permissions."
    }

    local server_id server_ip root_password
    server_id=$(echo "$response" | jq -r '.server.id')
    server_ip=$(echo "$response" | jq -r '.server.public_net.ipv4.ip')
    root_password=$(echo "$response" | jq -r '.root_password // empty')

    # A successful HTTP call without a server ID is still a failure.
    if [[ -z "$server_id" || "$server_id" == "null" ]]; then
        error "Server creation response:"
        echo "$response" | jq .
        fatal "Failed to create server."
    fi

    success "Server created: ID=$server_id, IP=$server_ip"

    if [[ -n "$root_password" ]]; then
        echo
        warn "Root password (save this, shown only once): $root_password"
        echo
    fi

    # Wait for SSH
    if wait_for_ssh "$server_ip"; then
        success "Server is reachable via SSH."
    else
        # Not fatal: provisioning continues on the server regardless.
        warn "Server not yet reachable via SSH. It may still be booting."
        warn "Try: ssh root@$server_ip"
    fi

    # Summary
    echo
    printf "${BOLD}${GREEN} Deployment started.${RESET}\n"
    echo
    printf " The server is now building webclaw from source.\n"
    printf " This takes ${BOLD}5-15 minutes${RESET} on first deploy.\n"
    echo
    printf " ${BOLD}Server IP:${RESET} %s\n" "$server_ip"
    printf " ${BOLD}SSH:${RESET} ssh root@%s\n" "$server_ip"
    printf " ${BOLD}Auth key:${RESET} %s\n" "$AUTH_KEY"
    echo
    printf " ${BOLD}Monitor build progress:${RESET}\n"
    printf " ssh root@%s 'cd /opt/webclaw && docker compose logs -f'\n" "$server_ip"
    echo
    printf " ${BOLD}Test when ready:${RESET}\n"
    printf " curl http://%s:3000/health\n" "$server_ip"
    echo
    printf " ${BOLD}Scrape:${RESET}\n"
    printf " curl -X POST http://%s:3000/v1/scrape \\\\\n" "$server_ip"
    printf " -H 'Content-Type: application/json' \\\\\n"
    printf " -H 'Authorization: Bearer %s' \\\\\n" "$AUTH_KEY"
    printf " -d '{\"url\": \"https://example.com\"}'\n"
    echo

    if [[ -n "$DOMAIN" ]]; then
        printf " ${BOLD}Domain:${RESET}\n"
        printf " Point %s A record -> %s\n" "$DOMAIN" "$server_ip"
        printf " SSL will auto-configure via certbot.\n"
        printf " Then: curl https://%s/health\n" "$DOMAIN"
        echo
    fi

    printf " ${BOLD}Pull Ollama model manually (if auto-pull hasn't finished):${RESET}\n"
    printf " ssh root@%s 'cd /opt/webclaw && docker compose exec ollama ollama pull %s'\n" "$server_ip" "$OLLAMA_MODEL"
    echo

    printf " ${BOLD}Tear down:${RESET}\n"
    # Show a placeholder rather than the real API token: the token grants full
    # control of the Hetzner project and must not end up in terminal scrollback
    # or captured logs.
    printf " HETZNER_TOKEN=<your-token> ./deploy/hetzner.sh --destroy\n"
    echo
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entrypoint
|
||||
# ---------------------------------------------------------------------------
|
||||
# Script entry point.
#
# Arguments:
#   $1  optional "--destroy" to tear the server down; anything else (or
#       nothing) runs the interactive deployment
#
# The Hetzner API token is taken from $HETZNER_TOKEN when set, otherwise it is
# requested interactively; an empty answer is fatal.
main() {
    preflight

    # Accept token from env or prompt
    if [[ -z "${HETZNER_TOKEN:-}" ]]; then
        echo
        printf "${BOLD}${GREEN} webclaw Hetzner Deploy${RESET}\n"
        echo
        prompt_secret HETZNER_TOKEN "Hetzner API token (https://console.hetzner.cloud)" ""
        if [[ -z "$HETZNER_TOKEN" ]]; then
            fatal "Hetzner API token is required."
        fi
    fi

    validate_token

    case "${1:-}" in
        --destroy) destroy_server ;;
        *)         create_server ;;
    esac
}
|
||||
|
||||
main "$@"
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
services:
|
||||
webclaw:
|
||||
noxa:
|
||||
build: .
|
||||
ports:
|
||||
- "${WEBCLAW_PORT:-3000}:3000"
|
||||
- "${NOXA_PORT:-3000}:3000"
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
|
|
@ -11,7 +11,7 @@ services:
|
|||
- ollama
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "webclaw", "--help"]
|
||||
test: ["CMD", "noxa", "--help"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
|
|
|||
28
env.example
28
env.example
|
|
@ -1,5 +1,5 @@
|
|||
# ============================================
|
||||
# Webclaw Configuration
|
||||
# Noxa Configuration
|
||||
# Copy to .env and fill in your values
|
||||
# ============================================
|
||||
|
||||
|
|
@ -21,23 +21,23 @@ OLLAMA_MODEL=qwen3:8b
|
|||
# --- Proxy ---
|
||||
|
||||
# Single proxy
|
||||
# WEBCLAW_PROXY=http://user:pass@host:port
|
||||
# NOXA_PROXY=http://user:pass@host:port
|
||||
|
||||
# Proxy file (one per line: host:port:user:pass)
|
||||
# WEBCLAW_PROXY_FILE=/path/to/proxies.txt
|
||||
# NOXA_PROXY_FILE=/path/to/proxies.txt
|
||||
|
||||
# --- Server (webclaw-server only) ---
|
||||
# WEBCLAW_PORT=3000
|
||||
# WEBCLAW_HOST=0.0.0.0
|
||||
# WEBCLAW_AUTH_KEY=your-auth-key
|
||||
# WEBCLAW_MAX_CONCURRENCY=50
|
||||
# WEBCLAW_JOB_TTL_SECS=3600
|
||||
# WEBCLAW_MAX_JOBS=100
|
||||
# --- Server (noxa-server only) ---
|
||||
# NOXA_PORT=3000
|
||||
# NOXA_HOST=0.0.0.0
|
||||
# NOXA_AUTH_KEY=your-auth-key
|
||||
# NOXA_MAX_CONCURRENCY=50
|
||||
# NOXA_JOB_TTL_SECS=3600
|
||||
# NOXA_MAX_JOBS=100
|
||||
|
||||
# --- CLI LLM overrides ---
|
||||
# WEBCLAW_LLM_PROVIDER=ollama
|
||||
# WEBCLAW_LLM_MODEL=qwen3:8b
|
||||
# WEBCLAW_LLM_BASE_URL=http://localhost:11434
|
||||
# NOXA_LLM_PROVIDER=ollama
|
||||
# NOXA_LLM_MODEL=qwen3:8b
|
||||
# NOXA_LLM_BASE_URL=http://localhost:11434
|
||||
|
||||
# --- Logging ---
|
||||
# WEBCLAW_LOG=info
|
||||
# NOXA_LOG=info
|
||||
|
|
|
|||
|
|
@ -1,50 +1,50 @@
|
|||
# Examples
|
||||
|
||||
Practical examples showing what webclaw can do. Each example is a self-contained command you can run immediately.
|
||||
Practical examples showing what noxa can do. Each example is a self-contained command you can run immediately.
|
||||
|
||||
## Basic Extraction
|
||||
|
||||
```bash
|
||||
# Extract as markdown (default)
|
||||
webclaw https://example.com
|
||||
noxa https://example.com
|
||||
|
||||
# Multiple output formats
|
||||
webclaw https://example.com -f markdown # Clean markdown
|
||||
webclaw https://example.com -f json # Full structured JSON
|
||||
webclaw https://example.com -f text # Plain text (no formatting)
|
||||
webclaw https://example.com -f llm # Token-optimized for LLMs (67% fewer tokens)
|
||||
noxa https://example.com -f markdown # Clean markdown
|
||||
noxa https://example.com -f json # Full structured JSON
|
||||
noxa https://example.com -f text # Plain text (no formatting)
|
||||
noxa https://example.com -f llm # Token-optimized for LLMs (67% fewer tokens)
|
||||
|
||||
# Bare domains work (auto-prepends https://)
|
||||
webclaw example.com
|
||||
noxa example.com
|
||||
```
|
||||
|
||||
## Content Filtering
|
||||
|
||||
```bash
|
||||
# Only extract main content (skip nav, sidebar, footer)
|
||||
webclaw https://docs.rs/tokio --only-main-content
|
||||
noxa https://docs.rs/tokio --only-main-content
|
||||
|
||||
# Include specific CSS selectors
|
||||
webclaw https://news.ycombinator.com --include ".titleline,.score"
|
||||
noxa https://news.ycombinator.com --include ".titleline,.score"
|
||||
|
||||
# Exclude specific elements
|
||||
webclaw https://example.com --exclude "nav,footer,.ads,.sidebar"
|
||||
noxa https://example.com --exclude "nav,footer,.ads,.sidebar"
|
||||
|
||||
# Combine both
|
||||
webclaw https://docs.rs/reqwest --only-main-content --exclude ".sidebar"
|
||||
noxa https://docs.rs/reqwest --only-main-content --exclude ".sidebar"
|
||||
```
|
||||
|
||||
## Brand Identity Extraction
|
||||
|
||||
```bash
|
||||
# Extract colors, fonts, logos from any website
|
||||
webclaw --brand https://stripe.com
|
||||
noxa --brand https://stripe.com
|
||||
# Output: { "name": "Stripe", "colors": [...], "fonts": ["Sohne"], "logos": [...] }
|
||||
|
||||
webclaw --brand https://github.com
|
||||
noxa --brand https://github.com
|
||||
# Output: { "name": "GitHub", "colors": [{"hex": "#1F2328", ...}], "fonts": ["Mona Sans"], ... }
|
||||
|
||||
webclaw --brand wikipedia.org
|
||||
noxa --brand wikipedia.org
|
||||
# Output: 10 colors, 5 fonts, favicon, logo URL
|
||||
```
|
||||
|
||||
|
|
@ -52,11 +52,11 @@ webclaw --brand wikipedia.org
|
|||
|
||||
```bash
|
||||
# Discover all URLs from a site's sitemaps
|
||||
webclaw --map https://sitemaps.org
|
||||
noxa --map https://sitemaps.org
|
||||
# Output: one URL per line (84 URLs found)
|
||||
|
||||
# JSON output with metadata
|
||||
webclaw --map https://sitemaps.org -f json
|
||||
noxa --map https://sitemaps.org -f json
|
||||
# Output: [{ "url": "...", "last_modified": "...", "priority": 0.8 }]
|
||||
```
|
||||
|
||||
|
|
@ -64,30 +64,30 @@ webclaw --map https://sitemaps.org -f json
|
|||
|
||||
```bash
|
||||
# Crawl a site (default: depth 1, max 20 pages)
|
||||
webclaw --crawl https://example.com
|
||||
noxa --crawl https://example.com
|
||||
|
||||
# Control depth and page limit
|
||||
webclaw --crawl --depth 2 --max-pages 50 https://docs.rs/tokio
|
||||
noxa --crawl --depth 2 --max-pages 50 https://docs.rs/tokio
|
||||
|
||||
# Crawl with sitemap seeding (finds more pages)
|
||||
webclaw --crawl --sitemap --depth 2 https://docs.rs/tokio
|
||||
noxa --crawl --sitemap --depth 2 https://docs.rs/tokio
|
||||
|
||||
# Filter crawl paths
|
||||
webclaw --crawl --include-paths "/api/*,/guide/*" https://docs.example.com
|
||||
webclaw --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com
|
||||
noxa --crawl --include-paths "/api/*,/guide/*" https://docs.example.com
|
||||
noxa --crawl --exclude-paths "/changelog/*,/blog/*" https://docs.example.com
|
||||
|
||||
# Control concurrency and delay
|
||||
webclaw --crawl --concurrency 10 --delay 200 https://example.com
|
||||
noxa --crawl --concurrency 10 --delay 200 https://example.com
|
||||
```
|
||||
|
||||
## Change Detection (Diff)
|
||||
|
||||
```bash
|
||||
# Step 1: Save a snapshot
|
||||
webclaw https://example.com -f json > snapshot.json
|
||||
noxa https://example.com -f json > snapshot.json
|
||||
|
||||
# Step 2: Later, compare against the snapshot
|
||||
webclaw --diff-with snapshot.json https://example.com
|
||||
noxa --diff-with snapshot.json https://example.com
|
||||
# Output:
|
||||
# Status: Same
|
||||
# Word count delta: +0
|
||||
|
|
@ -106,88 +106,88 @@ webclaw --diff-with snapshot.json https://example.com
|
|||
|
||||
```bash
|
||||
# PDF URLs are auto-detected via Content-Type
|
||||
webclaw https://example.com/report.pdf
|
||||
noxa https://example.com/report.pdf
|
||||
|
||||
# Control PDF mode
|
||||
webclaw --pdf-mode auto https://example.com/report.pdf # Error on empty (catches scanned PDFs)
|
||||
webclaw --pdf-mode fast https://example.com/report.pdf # Return whatever text is found
|
||||
noxa --pdf-mode auto https://example.com/report.pdf # Error on empty (catches scanned PDFs)
|
||||
noxa --pdf-mode fast https://example.com/report.pdf # Return whatever text is found
|
||||
```
|
||||
|
||||
## Batch Processing
|
||||
|
||||
```bash
|
||||
# Multiple URLs in one command
|
||||
webclaw https://example.com https://httpbin.org/html https://rust-lang.org
|
||||
noxa https://example.com https://httpbin.org/html https://rust-lang.org
|
||||
|
||||
# URLs from a file (one per line, # comments supported)
|
||||
webclaw --urls-file urls.txt
|
||||
noxa --urls-file urls.txt
|
||||
|
||||
# Batch with JSON output
|
||||
webclaw --urls-file urls.txt -f json
|
||||
noxa --urls-file urls.txt -f json
|
||||
|
||||
# Proxy rotation for large batches
|
||||
webclaw --urls-file urls.txt --proxy-file proxies.txt --concurrency 10
|
||||
noxa --urls-file urls.txt --proxy-file proxies.txt --concurrency 10
|
||||
```
|
||||
|
||||
## Local Files & Stdin
|
||||
|
||||
```bash
|
||||
# Extract from a local HTML file
|
||||
webclaw --file page.html
|
||||
noxa --file page.html
|
||||
|
||||
# Pipe HTML from another command
|
||||
curl -s https://example.com | webclaw --stdin
|
||||
curl -s https://example.com | noxa --stdin
|
||||
|
||||
# Chain with other tools
|
||||
webclaw https://example.com -f text | wc -w # Word count
|
||||
webclaw https://example.com -f json | jq '.metadata.title' # Extract title with jq
|
||||
noxa https://example.com -f text | wc -w # Word count
|
||||
noxa https://example.com -f json | jq '.metadata.title' # Extract title with jq
|
||||
```
|
||||
|
||||
## Cloud API Mode
|
||||
|
||||
When you have a webclaw API key, the CLI can route through the cloud for bot protection bypass, JS rendering, and proxy rotation.
|
||||
When you have a noxa API key, the CLI can route through the cloud for bot protection bypass, JS rendering, and proxy rotation.
|
||||
|
||||
```bash
|
||||
# Set API key (one time)
|
||||
export WEBCLAW_API_KEY=wc_your_key_here
|
||||
export NOXA_API_KEY=wc_your_key_here
|
||||
|
||||
# Automatic fallback: tries local first, cloud on bot detection
|
||||
webclaw https://protected-site.com
|
||||
noxa https://protected-site.com
|
||||
|
||||
# Force cloud mode (skip local, always use API)
|
||||
webclaw --cloud https://spa-site.com
|
||||
noxa --cloud https://spa-site.com
|
||||
|
||||
# Cloud mode works with all features
|
||||
webclaw --cloud --brand https://stripe.com
|
||||
webclaw --cloud -f json https://producthunt.com
|
||||
webclaw --cloud --crawl --depth 2 https://protected-docs.com
|
||||
noxa --cloud --brand https://stripe.com
|
||||
noxa --cloud -f json https://producthunt.com
|
||||
noxa --cloud --crawl --depth 2 https://protected-docs.com
|
||||
```
|
||||
|
||||
## Browser Impersonation
|
||||
|
||||
```bash
|
||||
# Chrome (default) — latest Chrome TLS fingerprint
|
||||
webclaw https://example.com
|
||||
noxa https://example.com
|
||||
|
||||
# Firefox fingerprint
|
||||
webclaw --browser firefox https://example.com
|
||||
noxa --browser firefox https://example.com
|
||||
|
||||
# Random browser per request (good for batch)
|
||||
webclaw --browser random --urls-file urls.txt
|
||||
noxa --browser random --urls-file urls.txt
|
||||
```
|
||||
|
||||
## Custom Headers & Cookies
|
||||
|
||||
```bash
|
||||
# Custom headers
|
||||
webclaw -H "Authorization: Bearer token123" https://api.example.com
|
||||
webclaw -H "Accept-Language: de-DE" https://example.com
|
||||
noxa -H "Authorization: Bearer token123" https://api.example.com
|
||||
noxa -H "Accept-Language: de-DE" https://example.com
|
||||
|
||||
# Cookies
|
||||
webclaw --cookie "session=abc123; theme=dark" https://example.com
|
||||
noxa --cookie "session=abc123; theme=dark" https://example.com
|
||||
|
||||
# Multiple headers
|
||||
webclaw -H "X-Custom: value" -H "Authorization: Bearer token" https://example.com
|
||||
noxa -H "X-Custom: value" -H "Authorization: Bearer token" https://example.com
|
||||
```
|
||||
|
||||
## LLM-Powered Features
|
||||
|
|
@ -196,42 +196,42 @@ These require an LLM provider (Ollama local, or OpenAI/Anthropic API key).
|
|||
|
||||
```bash
|
||||
# Summarize a page (default: 3 sentences)
|
||||
webclaw --summarize https://example.com
|
||||
noxa --summarize https://example.com
|
||||
|
||||
# Control summary length
|
||||
webclaw --summarize 5 https://example.com
|
||||
noxa --summarize 5 https://example.com
|
||||
|
||||
# Extract structured JSON with a schema
|
||||
webclaw --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product
|
||||
noxa --extract-json '{"type":"object","properties":{"title":{"type":"string"},"price":{"type":"number"}}}' https://example.com/product
|
||||
|
||||
# Extract with a schema from file
|
||||
webclaw --extract-json @schema.json https://example.com/product
|
||||
noxa --extract-json @schema.json https://example.com/product
|
||||
|
||||
# Extract with natural language prompt
|
||||
webclaw --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing
|
||||
noxa --extract-prompt "Get all pricing tiers with name, price, and features" https://stripe.com/pricing
|
||||
|
||||
# Use a specific LLM provider
|
||||
webclaw --llm-provider ollama --summarize https://example.com
|
||||
webclaw --llm-provider openai --llm-model gpt-4o --extract-prompt "..." https://example.com
|
||||
webclaw --llm-provider anthropic --summarize https://example.com
|
||||
noxa --llm-provider ollama --summarize https://example.com
|
||||
noxa --llm-provider openai --llm-model gpt-4o --extract-prompt "..." https://example.com
|
||||
noxa --llm-provider anthropic --summarize https://example.com
|
||||
```
|
||||
|
||||
## Raw HTML Output
|
||||
|
||||
```bash
|
||||
# Get the raw fetched HTML (no extraction)
|
||||
webclaw --raw-html https://example.com
|
||||
noxa --raw-html https://example.com
|
||||
|
||||
# Useful for debugging extraction issues
|
||||
webclaw --raw-html https://example.com > raw.html
|
||||
webclaw --file raw.html # Then extract locally
|
||||
noxa --raw-html https://example.com > raw.html
|
||||
noxa --file raw.html # Then extract locally
|
||||
```
|
||||
|
||||
## Metadata & Verbose Mode
|
||||
|
||||
```bash
|
||||
# Include YAML frontmatter with metadata
|
||||
webclaw --metadata https://example.com
|
||||
noxa --metadata https://example.com
|
||||
# Output:
|
||||
# ---
|
||||
# title: "Example Domain"
|
||||
|
|
@ -242,39 +242,39 @@ webclaw --metadata https://example.com
|
|||
# ...
|
||||
|
||||
# Verbose logging (debug extraction pipeline)
|
||||
webclaw -v https://example.com
|
||||
noxa -v https://example.com
|
||||
```
|
||||
|
||||
## Proxy Usage
|
||||
|
||||
```bash
|
||||
# Single proxy
|
||||
webclaw --proxy http://user:pass@proxy.example.com:8080 https://example.com
|
||||
noxa --proxy http://user:pass@proxy.example.com:8080 https://example.com
|
||||
|
||||
# SOCKS5 proxy
|
||||
webclaw --proxy socks5://proxy.example.com:1080 https://example.com
|
||||
noxa --proxy socks5://proxy.example.com:1080 https://example.com
|
||||
|
||||
# Proxy rotation from file (one per line: host:port:user:pass)
|
||||
webclaw --proxy-file proxies.txt https://example.com
|
||||
noxa --proxy-file proxies.txt https://example.com
|
||||
|
||||
# Auto-load proxies.txt from current directory
|
||||
echo "proxy1.com:8080:user:pass" > proxies.txt
|
||||
webclaw https://example.com # Automatically detects and uses proxies.txt
|
||||
noxa https://example.com # Automatically detects and uses proxies.txt
|
||||
```
|
||||
|
||||
## MCP Server (AI Agent Integration)
|
||||
|
||||
```bash
|
||||
# Start the MCP server (stdio transport)
|
||||
webclaw-mcp
|
||||
noxa-mcp
|
||||
|
||||
# Configure in Claude Desktop (~/.config/claude/claude_desktop_config.json):
|
||||
# {
|
||||
# "mcpServers": {
|
||||
# "webclaw": {
|
||||
# "command": "/path/to/webclaw-mcp",
|
||||
# "noxa": {
|
||||
# "command": "/path/to/noxa-mcp",
|
||||
# "env": {
|
||||
# "WEBCLAW_API_KEY": "wc_your_key" // optional, enables cloud fallback
|
||||
# "NOXA_API_KEY": "wc_your_key" // optional, enables cloud fallback
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
|
|
@ -289,7 +289,7 @@ webclaw-mcp
|
|||
|
||||
```bash
|
||||
# Save today's pricing
|
||||
webclaw --extract-json '{"type":"array","items":{"type":"object","properties":{"plan":{"type":"string"},"price":{"type":"string"}}}}' \
|
||||
noxa --extract-json '{"type":"array","items":{"type":"object","properties":{"plan":{"type":"string"},"price":{"type":"string"}}}}' \
|
||||
https://competitor.com/pricing -f json > pricing-$(date +%Y%m%d).json
|
||||
```
|
||||
|
||||
|
|
@ -297,24 +297,24 @@ webclaw --extract-json '{"type":"array","items":{"type":"object","properties":{"
|
|||
|
||||
```bash
|
||||
# Crawl docs and extract as LLM-optimized text
|
||||
webclaw --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt
|
||||
noxa --crawl --sitemap --depth 3 --max-pages 500 -f llm https://docs.example.com > docs.txt
|
||||
```
|
||||
|
||||
### Extract all images from a page
|
||||
|
||||
```bash
|
||||
webclaw https://example.com -f json | jq -r '.content.images[].src'
|
||||
noxa https://example.com -f json | jq -r '.content.images[].src'
|
||||
```
|
||||
|
||||
### Get all external links
|
||||
|
||||
```bash
|
||||
webclaw https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href'
|
||||
noxa https://example.com -f json | jq -r '.content.links[] | select(.href | startswith("http")) | .href'
|
||||
```
|
||||
|
||||
### Compare two pages
|
||||
|
||||
```bash
|
||||
webclaw https://site-a.com -f json > a.json
|
||||
webclaw https://site-b.com --diff-with a.json
|
||||
noxa https://site-a.com -f json > a.json
|
||||
noxa https://site-b.com --diff-with a.json
|
||||
```
|
||||
|
|
|
|||
|
|
@ -1,6 +0,0 @@
|
|||
{
|
||||
"$schema": "https://glama.ai/mcp/schemas/server.json",
|
||||
"maintainers": [
|
||||
"0xMassi"
|
||||
]
|
||||
}
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
<p align="center">
|
||||
<a href="https://webclaw.io">
|
||||
<a href="https://noxa.io">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/0xMassi/webclaw/main/.github/banner.png" />
|
||||
<img src="https://raw.githubusercontent.com/0xMassi/webclaw/main/.github/banner.png" alt="webclaw" width="700" />
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/jmagar/noxa/main/.github/banner.png" />
|
||||
<img src="https://raw.githubusercontent.com/jmagar/noxa/main/.github/banner.png" alt="noxa" width="700" />
|
||||
</picture>
|
||||
</a>
|
||||
</p>
|
||||
|
|
@ -13,9 +13,9 @@
|
|||
</h3>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://www.npmjs.com/package/create-webclaw"><img src="https://img.shields.io/npm/dt/create-webclaw?style=for-the-badge&logo=npm&logoColor=white&label=Installs&color=CB3837" alt="npm installs" /></a>
|
||||
<a href="https://github.com/0xMassi/webclaw"><img src="https://img.shields.io/github/stars/0xMassi/webclaw?style=for-the-badge&logo=github&logoColor=white&label=Stars&color=181717" alt="Stars" /></a>
|
||||
<a href="https://github.com/0xMassi/webclaw/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-AGPL--3.0-10B981?style=for-the-badge" alt="License" /></a>
|
||||
<a href="https://www.npmjs.com/package/create-noxa"><img src="https://img.shields.io/npm/dt/create-noxa?style=for-the-badge&logo=npm&logoColor=white&label=Installs&color=CB3837" alt="npm installs" /></a>
|
||||
<a href="https://github.com/jmagar/noxa"><img src="https://img.shields.io/github/stars/jmagar/noxa?style=for-the-badge&logo=github&logoColor=white&label=Stars&color=181717" alt="Stars" /></a>
|
||||
<a href="https://github.com/jmagar/noxa/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-AGPL--3.0-10B981?style=for-the-badge" alt="License" /></a>
|
||||
</p>
|
||||
|
||||
---
|
||||
|
|
@ -23,7 +23,7 @@
|
|||
## Quick Start
|
||||
|
||||
```bash
|
||||
npx create-webclaw
|
||||
npx create-noxa
|
||||
```
|
||||
|
||||
That's it. Auto-detects your AI tools, downloads the MCP server, configures everything.
|
||||
|
|
@ -40,12 +40,12 @@ When it does work, you get 100KB+ of raw HTML — navigation, ads, cookie banner
|
|||
|
||||
## The Fix
|
||||
|
||||
webclaw impersonates Chrome 146 at the TLS protocol level. Perfect JA4 fingerprint. Perfect HTTP/2 Akamai hash. 99% bypass rate on 102 tested sites.
|
||||
noxa impersonates Chrome 146 at the TLS protocol level. Perfect JA4 fingerprint. Perfect HTTP/2 Akamai hash. 99% bypass rate on 102 tested sites.
|
||||
|
||||
Then it extracts just the content — clean markdown, 67% fewer tokens.
|
||||
|
||||
```
|
||||
Raw HTML webclaw
|
||||
Raw HTML noxa
|
||||
┌──────────────────────────────────┐ ┌──────────────────────────────────┐
|
||||
│ <div class="ad-wrapper"> │ │ # Breaking: AI Breakthrough │
|
||||
│ <nav class="global-nav"> │ │ │
|
||||
|
|
@ -63,11 +63,11 @@ Then it extracts just the content — clean markdown, 67% fewer tokens.
|
|||
## What It Does
|
||||
|
||||
```bash
|
||||
npx create-webclaw
|
||||
npx create-noxa
|
||||
```
|
||||
|
||||
1. Detects installed AI tools (Claude, Cursor, Windsurf, VS Code, OpenCode, Codex, Antigravity)
|
||||
2. Downloads the `webclaw-mcp` binary for your platform (macOS arm64/x86, Linux x86/arm64)
|
||||
2. Downloads the `noxa-mcp` binary for your platform (macOS arm64/x86, Linux x86/arm64)
|
||||
3. Asks for your API key (optional — **works locally without one**)
|
||||
4. Writes the MCP config for each detected tool
|
||||
|
||||
|
|
@ -105,7 +105,7 @@ After setup, your AI agent has access to:
|
|||
|
||||
## Sites That Work
|
||||
|
||||
webclaw gets through where default `fetch()` gets blocked:
|
||||
noxa gets through where default `fetch()` gets blocked:
|
||||
|
||||
Nike, Cloudflare, Bloomberg, Zillow, Indeed, Viagogo, Fansale, Wikipedia, Stripe, and 93 more. Tested on 102 sites with **99% success rate**.
|
||||
|
||||
|
|
@ -114,35 +114,33 @@ Nike, Cloudflare, Bloomberg, Zillow, Indeed, Viagogo, Fansale, Wikipedia, Stripe
|
|||
### Homebrew
|
||||
|
||||
```bash
|
||||
brew tap 0xMassi/webclaw && brew install webclaw
|
||||
brew tap jmagar/noxa && brew install noxa
|
||||
```
|
||||
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
docker run --rm ghcr.io/0xmassi/webclaw https://example.com
|
||||
docker run --rm ghcr.io/jmagar/noxa https://example.com
|
||||
```
|
||||
|
||||
### Cargo
|
||||
|
||||
```bash
|
||||
cargo install --git https://github.com/0xMassi/webclaw.git webclaw-cli
|
||||
cargo install --git https://github.com/jmagar/noxa.git noxa
|
||||
```
|
||||
|
||||
### Prebuilt Binaries
|
||||
|
||||
Download from [GitHub Releases](https://github.com/0xMassi/webclaw/releases) for macOS (arm64, x86_64) and Linux (x86_64, aarch64).
|
||||
Download from [GitHub Releases](https://github.com/jmagar/noxa/releases) for macOS (arm64, x86_64) and Linux (x86_64, aarch64).
|
||||
|
||||
---
|
||||
|
||||
## Links
|
||||
|
||||
- [Website](https://webclaw.io)
|
||||
- [Documentation](https://webclaw.io/docs)
|
||||
- [GitHub](https://github.com/0xMassi/webclaw)
|
||||
- [TLS Library](https://github.com/0xMassi/webclaw-tls)
|
||||
- [Discord](https://discord.gg/KDfd48EpnW)
|
||||
- [Status](https://status.webclaw.io)
|
||||
- [Website](https://noxa.io)
|
||||
- [Documentation](https://noxa.io/docs)
|
||||
- [GitHub](https://github.com/jmagar/noxa)
|
||||
- [Status](https://status.noxa.io)
|
||||
|
||||
## License
|
||||
|
||||
|
|
@ -12,9 +12,9 @@ import http from "http";
|
|||
|
||||
// ── Constants ──
|
||||
|
||||
const REPO = "0xMassi/webclaw";
|
||||
const BINARY_NAME = "webclaw-mcp";
|
||||
const INSTALL_DIR = join(homedir(), ".webclaw");
|
||||
const REPO = "jmagar/noxa";
|
||||
const BINARY_NAME = "noxa-mcp";
|
||||
const INSTALL_DIR = join(homedir(), ".noxa");
|
||||
const BINARY_PATH = join(INSTALL_DIR, BINARY_NAME);
|
||||
const VERSION = "latest";
|
||||
|
||||
|
|
@ -170,7 +170,7 @@ function download(url) {
|
|||
return new Promise((resolve, reject) => {
|
||||
const client = url.startsWith("https") ? https : http;
|
||||
client
|
||||
.get(url, { headers: { "User-Agent": "create-webclaw" } }, (res) => {
|
||||
.get(url, { headers: { "User-Agent": "create-noxa" } }, (res) => {
|
||||
// Follow redirects
|
||||
if (
|
||||
res.statusCode >= 300 &&
|
||||
|
|
@ -195,7 +195,7 @@ async function downloadFile(url, dest) {
|
|||
return new Promise((resolve, reject) => {
|
||||
const client = url.startsWith("https") ? https : http;
|
||||
client
|
||||
.get(url, { headers: { "User-Agent": "create-webclaw" } }, (res) => {
|
||||
.get(url, { headers: { "User-Agent": "create-noxa" } }, (res) => {
|
||||
if (
|
||||
res.statusCode >= 300 &&
|
||||
res.statusCode < 400 &&
|
||||
|
|
@ -225,15 +225,15 @@ function getAssetName() {
|
|||
const a = arch();
|
||||
|
||||
if (os === "darwin" && a === "arm64")
|
||||
return `webclaw-mcp-aarch64-apple-darwin.tar.gz`;
|
||||
return `noxa-mcp-aarch64-apple-darwin.tar.gz`;
|
||||
if (os === "darwin" && a === "x64")
|
||||
return `webclaw-mcp-x86_64-apple-darwin.tar.gz`;
|
||||
return `noxa-mcp-x86_64-apple-darwin.tar.gz`;
|
||||
if (os === "linux" && a === "x64")
|
||||
return `webclaw-mcp-x86_64-unknown-linux-gnu.tar.gz`;
|
||||
return `noxa-mcp-x86_64-unknown-linux-gnu.tar.gz`;
|
||||
if (os === "linux" && a === "arm64")
|
||||
return `webclaw-mcp-aarch64-unknown-linux-gnu.tar.gz`;
|
||||
return `noxa-mcp-aarch64-unknown-linux-gnu.tar.gz`;
|
||||
if (os === "win32" && a === "x64")
|
||||
return `webclaw-mcp-x86_64-pc-windows-msvc.zip`;
|
||||
return `noxa-mcp-x86_64-pc-windows-msvc.zip`;
|
||||
|
||||
return null;
|
||||
}
|
||||
|
|
@ -257,7 +257,7 @@ function buildMcpEntry(apiKey) {
|
|||
command: BINARY_PATH,
|
||||
};
|
||||
if (apiKey) {
|
||||
entry.env = { WEBCLAW_API_KEY: apiKey };
|
||||
entry.env = { NOXA_API_KEY: apiKey };
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
|
@ -267,23 +267,23 @@ function buildMcpEntry(apiKey) {
|
|||
function addToClaudeDesktop(configPath, apiKey) {
|
||||
const config = readJsonFile(configPath);
|
||||
if (!config.mcpServers) config.mcpServers = {};
|
||||
config.mcpServers.webclaw = buildMcpEntry(apiKey);
|
||||
config.mcpServers.noxa = buildMcpEntry(apiKey);
|
||||
writeJsonFile(configPath, config);
|
||||
}
|
||||
|
||||
function addToClaudeCode(configPath, apiKey) {
|
||||
const config = readJsonFile(configPath);
|
||||
if (!config.mcpServers) config.mcpServers = {};
|
||||
config.mcpServers.webclaw = buildMcpEntry(apiKey);
|
||||
config.mcpServers.noxa = buildMcpEntry(apiKey);
|
||||
writeJsonFile(configPath, config);
|
||||
}
|
||||
|
||||
function addToCursor(configPath, apiKey) {
|
||||
const config = readJsonFile(configPath);
|
||||
if (!config.mcpServers) config.mcpServers = {};
|
||||
config.mcpServers.webclaw = {
|
||||
config.mcpServers.noxa = {
|
||||
command: BINARY_PATH,
|
||||
...(apiKey ? { env: { WEBCLAW_API_KEY: apiKey } } : {}),
|
||||
...(apiKey ? { env: { NOXA_API_KEY: apiKey } } : {}),
|
||||
};
|
||||
writeJsonFile(configPath, config);
|
||||
}
|
||||
|
|
@ -291,7 +291,7 @@ function addToCursor(configPath, apiKey) {
|
|||
function addToWindsurf(configPath, apiKey) {
|
||||
const config = readJsonFile(configPath);
|
||||
if (!config.mcpServers) config.mcpServers = {};
|
||||
config.mcpServers.webclaw = buildMcpEntry(apiKey);
|
||||
config.mcpServers.noxa = buildMcpEntry(apiKey);
|
||||
writeJsonFile(configPath, config);
|
||||
}
|
||||
|
||||
|
|
@ -299,11 +299,11 @@ function addToVSCodeContinue(configPath, apiKey) {
|
|||
const config = readJsonFile(configPath);
|
||||
if (!config.mcpServers) config.mcpServers = [];
|
||||
// Continue uses array format
|
||||
const existing = config.mcpServers.findIndex?.((s) => s.name === "webclaw");
|
||||
const existing = config.mcpServers.findIndex?.((s) => s.name === "noxa");
|
||||
const entry = {
|
||||
name: "webclaw",
|
||||
name: "noxa",
|
||||
command: BINARY_PATH,
|
||||
...(apiKey ? { env: { WEBCLAW_API_KEY: apiKey } } : {}),
|
||||
...(apiKey ? { env: { NOXA_API_KEY: apiKey } } : {}),
|
||||
};
|
||||
if (existing >= 0) {
|
||||
config.mcpServers[existing] = entry;
|
||||
|
|
@ -316,13 +316,13 @@ function addToVSCodeContinue(configPath, apiKey) {
|
|||
function addToOpenCode(configPath, apiKey) {
|
||||
const config = readJsonFile(configPath);
|
||||
if (!config.mcp) config.mcp = {};
|
||||
config.mcp.webclaw = {
|
||||
config.mcp.noxa = {
|
||||
type: "local",
|
||||
command: [BINARY_PATH],
|
||||
enabled: true,
|
||||
};
|
||||
if (apiKey) {
|
||||
config.mcp.webclaw.environment = { WEBCLAW_API_KEY: apiKey };
|
||||
config.mcp.noxa.environment = { NOXA_API_KEY: apiKey };
|
||||
}
|
||||
writeJsonFile(configPath, config);
|
||||
}
|
||||
|
|
@ -330,7 +330,7 @@ function addToOpenCode(configPath, apiKey) {
|
|||
function addToAntigravity(configPath, apiKey) {
|
||||
const config = readJsonFile(configPath);
|
||||
if (!config.mcpServers) config.mcpServers = {};
|
||||
config.mcpServers.webclaw = buildMcpEntry(apiKey);
|
||||
config.mcpServers.noxa = buildMcpEntry(apiKey);
|
||||
writeJsonFile(configPath, config);
|
||||
}
|
||||
|
||||
|
|
@ -346,15 +346,15 @@ function addToCodex(configPath, apiKey) {
|
|||
// File doesn't exist yet
|
||||
}
|
||||
|
||||
// Remove any existing webclaw MCP section
|
||||
// Remove any existing noxa MCP section
|
||||
existing = existing.replace(
|
||||
/\n?\[mcp_servers\.webclaw\][^\[]*(?=\[|$)/gs,
|
||||
/\n?\[mcp_servers\.noxa\][^\[]*(?=\[|$)/gs,
|
||||
"",
|
||||
);
|
||||
|
||||
let section = `\n[mcp_servers.webclaw]\ncommand = "${BINARY_PATH}"\nargs = []\nenabled = true\n`;
|
||||
let section = `\n[mcp_servers.noxa]\ncommand = "${BINARY_PATH}"\nargs = []\nenabled = true\n`;
|
||||
if (apiKey) {
|
||||
section += `env = { WEBCLAW_API_KEY = "${apiKey}" }\n`;
|
||||
section += `env = { NOXA_API_KEY = "${apiKey}" }\n`;
|
||||
}
|
||||
|
||||
writeFileSync(configPath, existing.trimEnd() + "\n" + section);
|
||||
|
|
@ -378,7 +378,7 @@ async function main() {
|
|||
console.log(c("bold", " ┌─────────────────────────────────────┐"));
|
||||
console.log(
|
||||
c("bold", " │") +
|
||||
c("cyan", " webclaw") +
|
||||
c("cyan", " noxa") +
|
||||
c("dim", " — MCP setup for AI agents") +
|
||||
c("bold", " │"),
|
||||
);
|
||||
|
|
@ -426,7 +426,7 @@ async function main() {
|
|||
// 2. Ask for API key
|
||||
console.log(c("dim", " An API key enables cloud features."));
|
||||
console.log(
|
||||
c("dim", " Without one, webclaw runs locally (free, no account needed)."),
|
||||
c("dim", " Without one, noxa runs locally (free, no account needed)."),
|
||||
);
|
||||
console.log();
|
||||
|
||||
|
|
@ -437,7 +437,7 @@ async function main() {
|
|||
console.log();
|
||||
|
||||
// 3. Download binary
|
||||
console.log(c("bold", " Downloading webclaw-mcp..."));
|
||||
console.log(c("bold", " Downloading noxa-mcp..."));
|
||||
|
||||
const assetName = getAssetName();
|
||||
if (!assetName) {
|
||||
|
|
@ -445,7 +445,7 @@ async function main() {
|
|||
console.log(
|
||||
c(
|
||||
"dim",
|
||||
" Build from source: cargo install --git https://github.com/0xMassi/webclaw webclaw-mcp",
|
||||
" Build from source: cargo install --git https://github.com/jmagar/noxa noxa-mcp",
|
||||
),
|
||||
);
|
||||
process.exit(1);
|
||||
|
|
@ -502,7 +502,7 @@ async function main() {
|
|||
);
|
||||
try {
|
||||
execSync(
|
||||
`cargo install --git https://github.com/${REPO} webclaw-mcp --root "${INSTALL_DIR}"`,
|
||||
`cargo install --git https://github.com/${REPO} noxa-mcp --root "${INSTALL_DIR}"`,
|
||||
{ stdio: "inherit" },
|
||||
);
|
||||
// cargo install puts binary in INSTALL_DIR/bin/
|
||||
|
|
@ -562,13 +562,13 @@ async function main() {
|
|||
}).trim();
|
||||
console.log(c("green", ` ✓ ${version}`));
|
||||
} catch {
|
||||
console.log(c("green", ` ✓ webclaw-mcp installed`));
|
||||
console.log(c("green", ` ✓ noxa-mcp installed`));
|
||||
}
|
||||
}
|
||||
|
||||
// 6. Summary
|
||||
console.log();
|
||||
console.log(c("bold", " Done! webclaw is ready."));
|
||||
console.log(c("bold", " Done! noxa is ready."));
|
||||
console.log();
|
||||
console.log(c("dim", " Your AI agent now has these tools:"));
|
||||
console.log(c("dim", " • scrape — extract content from any URL"));
|
||||
|
|
@ -583,7 +583,7 @@ async function main() {
|
|||
console.log(
|
||||
c(
|
||||
"dim",
|
||||
" Get an API key at https://webclaw.io/dashboard for cloud features.",
|
||||
" Get an API key at https://noxa.io/dashboard for cloud features.",
|
||||
),
|
||||
);
|
||||
console.log();
|
||||
|
|
@ -1,14 +1,14 @@
|
|||
{
|
||||
"name": "create-webclaw",
|
||||
"name": "create-noxa",
|
||||
"version": "0.1.3",
|
||||
"mcpName": "io.github.0xMassi/webclaw",
|
||||
"description": "Set up webclaw MCP server for AI agents (Claude, Cursor, Windsurf, OpenCode, Codex, Antigravity)",
|
||||
"mcpName": "io.github.jmagar/noxa",
|
||||
"description": "Set up noxa MCP server for AI agents (Claude, Cursor, Windsurf, OpenCode, Codex, Antigravity)",
|
||||
"bin": {
|
||||
"create-webclaw": "./index.mjs"
|
||||
"create-noxa": "./index.mjs"
|
||||
},
|
||||
"type": "module",
|
||||
"keywords": [
|
||||
"webclaw",
|
||||
"noxa",
|
||||
"mcp",
|
||||
"mcp-server",
|
||||
"ai",
|
||||
|
|
@ -29,13 +29,13 @@
|
|||
"tls-fingerprint",
|
||||
"cloudflare-bypass"
|
||||
],
|
||||
"author": "webclaw",
|
||||
"author": "noxa",
|
||||
"license": "AGPL-3.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/0xMassi/webclaw"
|
||||
"url": "https://github.com/jmagar/noxa"
|
||||
},
|
||||
"homepage": "https://webclaw.io",
|
||||
"homepage": "https://noxa.io",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
# Webclaw Proxy List
|
||||
# Noxa Proxy List
|
||||
# Copy this file to proxies.txt and add your proxies.
|
||||
# webclaw auto-loads proxies.txt when it exists — no config needed.
|
||||
# noxa auto-loads proxies.txt when it exists — no config needed.
|
||||
#
|
||||
# Format: host:port:user:pass (one per line)
|
||||
# Lines starting with # are ignored.
|
||||
|
|
|
|||
54
setup.sh
54
setup.sh
|
|
@ -1,5 +1,5 @@
|
|||
#!/usr/bin/env bash
|
||||
# setup.sh — Local setup for webclaw
|
||||
# setup.sh — Local setup for noxa
|
||||
#
|
||||
# Checks prerequisites, builds binaries, configures .env,
|
||||
# optionally installs Ollama, and wires up the MCP server.
|
||||
|
|
@ -150,7 +150,7 @@ build_binaries() {
|
|||
if cargo build --release 2>&1 | tail -5; then
|
||||
echo
|
||||
success "Built 3 binaries:"
|
||||
ls -lh target/release/webclaw target/release/webclaw-server target/release/webclaw-mcp 2>/dev/null | \
|
||||
ls -lh target/release/noxa target/release/noxa-server target/release/noxa-mcp 2>/dev/null | \
|
||||
awk '{printf " %-20s %s\n", $NF, $5}'
|
||||
else
|
||||
error "Build failed. Check the output above."
|
||||
|
|
@ -215,7 +215,7 @@ configure_env() {
|
|||
|
||||
# Write .env
|
||||
cat > "$SCRIPT_DIR/.env" <<EOF
|
||||
# webclaw configuration — generated by setup.sh
|
||||
# noxa configuration — generated by setup.sh
|
||||
|
||||
# --- LLM Providers ---
|
||||
OLLAMA_HOST=http://localhost:11434
|
||||
|
|
@ -234,20 +234,20 @@ EOF
|
|||
# --- Proxy ---
|
||||
EOF
|
||||
if [[ -n "$proxy_file" ]]; then
|
||||
echo "WEBCLAW_PROXY_FILE=$proxy_file" >> "$SCRIPT_DIR/.env"
|
||||
echo "NOXA_PROXY_FILE=$proxy_file" >> "$SCRIPT_DIR/.env"
|
||||
else
|
||||
echo "# WEBCLAW_PROXY_FILE=/path/to/proxies.txt" >> "$SCRIPT_DIR/.env"
|
||||
echo "# NOXA_PROXY_FILE=/path/to/proxies.txt" >> "$SCRIPT_DIR/.env"
|
||||
fi
|
||||
|
||||
cat >> "$SCRIPT_DIR/.env" <<EOF
|
||||
|
||||
# --- Server ---
|
||||
WEBCLAW_PORT=$server_port
|
||||
WEBCLAW_HOST=0.0.0.0
|
||||
WEBCLAW_AUTH_KEY=$auth_key
|
||||
NOXA_PORT=$server_port
|
||||
NOXA_HOST=0.0.0.0
|
||||
NOXA_AUTH_KEY=$auth_key
|
||||
|
||||
# --- Logging ---
|
||||
WEBCLAW_LOG=info
|
||||
NOXA_LOG=info
|
||||
EOF
|
||||
|
||||
echo
|
||||
|
|
@ -335,20 +335,20 @@ setup_mcp() {
|
|||
printf "${BOLD}${GREEN} Step 5: MCP Server (Claude Desktop integration)${RESET}\n"
|
||||
echo
|
||||
|
||||
local mcp_binary="$SCRIPT_DIR/target/release/webclaw-mcp"
|
||||
local mcp_binary="$SCRIPT_DIR/target/release/noxa-mcp"
|
||||
if [[ ! -f "$mcp_binary" ]]; then
|
||||
warn "webclaw-mcp binary not found. Build first."
|
||||
warn "noxa-mcp binary not found. Build first."
|
||||
return
|
||||
fi
|
||||
|
||||
info "The MCP server lets Claude Desktop use webclaw's tools directly."
|
||||
info "The MCP server lets Claude Desktop use noxa's tools directly."
|
||||
info "Tools: scrape, crawl, map, batch, extract, summarize, diff, brand"
|
||||
echo
|
||||
|
||||
if ! prompt_yn "Configure MCP server for Claude Desktop?" "y"; then
|
||||
info "Skipping MCP setup."
|
||||
info "You can configure it later by adding to your Claude Desktop config:"
|
||||
printf ' {"mcpServers": {"webclaw": {"command": "%s"}}}\n' "$mcp_binary"
|
||||
printf ' {"mcpServers": {"noxa": {"command": "%s"}}}\n' "$mcp_binary"
|
||||
return
|
||||
fi
|
||||
|
||||
|
|
@ -371,22 +371,22 @@ setup_mcp() {
|
|||
local existing
|
||||
existing=$(cat "$config_path")
|
||||
|
||||
# Check if webclaw is already configured
|
||||
if echo "$existing" | python3 -c "import sys,json; c=json.load(sys.stdin); exit(0 if 'webclaw' in c.get('mcpServers',{}) else 1)" 2>/dev/null; then
|
||||
warn "webclaw MCP server already configured in Claude Desktop."
|
||||
# Check if noxa is already configured
|
||||
if echo "$existing" | python3 -c "import sys,json; c=json.load(sys.stdin); exit(0 if 'noxa' in c.get('mcpServers',{}) else 1)" 2>/dev/null; then
|
||||
warn "noxa MCP server already configured in Claude Desktop."
|
||||
if ! prompt_yn "Update the path?" "y"; then
|
||||
return
|
||||
fi
|
||||
fi
|
||||
|
||||
# Merge webclaw into mcpServers
|
||||
# Merge noxa into mcpServers
|
||||
local updated
|
||||
updated=$(echo "$existing" | python3 -c "
|
||||
import sys, json
|
||||
config = json.load(sys.stdin)
|
||||
if 'mcpServers' not in config:
|
||||
config['mcpServers'] = {}
|
||||
config['mcpServers']['webclaw'] = {
|
||||
config['mcpServers']['noxa'] = {
|
||||
'command': '$mcp_binary'
|
||||
}
|
||||
print(json.dumps(config, indent=2))
|
||||
|
|
@ -405,11 +405,11 @@ smoke_test() {
|
|||
printf "${BOLD}${GREEN} Step 6: Smoke Test${RESET}\n"
|
||||
echo
|
||||
|
||||
local webclaw="$SCRIPT_DIR/target/release/webclaw"
|
||||
local noxa="$SCRIPT_DIR/target/release/noxa"
|
||||
|
||||
info "Testing extraction..."
|
||||
local output
|
||||
output=$("$webclaw" https://example.com --format llm 2>/dev/null || echo "FAILED")
|
||||
output=$("$noxa" https://example.com --format llm 2>/dev/null || echo "FAILED")
|
||||
|
||||
if [[ "$output" == "FAILED" ]]; then
|
||||
warn "Extraction test failed. Check your network connection."
|
||||
|
|
@ -423,7 +423,7 @@ smoke_test() {
|
|||
if curl -sf http://localhost:11434/api/tags &>/dev/null; then
|
||||
info "Testing LLM summarization..."
|
||||
local summary
|
||||
summary=$("$webclaw" https://example.com --summarize 2>/dev/null || echo "FAILED")
|
||||
summary=$("$noxa" https://example.com --summarize 2>/dev/null || echo "FAILED")
|
||||
if [[ "$summary" == "FAILED" ]]; then
|
||||
warn "LLM test failed. Check Ollama and model availability."
|
||||
else
|
||||
|
|
@ -436,17 +436,17 @@ smoke_test() {
|
|||
# Summary
|
||||
# ---------------------------------------------------------------------------
|
||||
print_summary() {
|
||||
local webclaw="$SCRIPT_DIR/target/release/webclaw"
|
||||
local server="$SCRIPT_DIR/target/release/webclaw-server"
|
||||
local mcp="$SCRIPT_DIR/target/release/webclaw-mcp"
|
||||
local noxa="$SCRIPT_DIR/target/release/noxa"
|
||||
local server="$SCRIPT_DIR/target/release/noxa-server"
|
||||
local mcp="$SCRIPT_DIR/target/release/noxa-mcp"
|
||||
local port
|
||||
port=$(grep '^WEBCLAW_PORT=' "$SCRIPT_DIR/.env" 2>/dev/null | cut -d= -f2 || echo "3000")
|
||||
port=$(grep '^NOXA_PORT=' "$SCRIPT_DIR/.env" 2>/dev/null | cut -d= -f2 || echo "3000")
|
||||
|
||||
echo
|
||||
printf "${BOLD}${GREEN} Setup Complete${RESET}\n"
|
||||
echo
|
||||
printf " ${BOLD}CLI:${RESET}\n"
|
||||
printf " %s https://example.com --format llm\n" "$webclaw"
|
||||
printf " %s https://example.com --format llm\n" "$noxa"
|
||||
echo
|
||||
printf " ${BOLD}REST API:${RESET}\n"
|
||||
printf " %s\n" "$server"
|
||||
|
|
@ -468,7 +468,7 @@ print_summary() {
|
|||
# ---------------------------------------------------------------------------
|
||||
main() {
|
||||
echo
|
||||
printf "${BOLD}${GREEN} webclaw — Local Setup${RESET}\n"
|
||||
printf "${BOLD}${GREEN} noxa — Local Setup${RESET}\n"
|
||||
printf "${DIM} Web extraction toolkit for AI agents${RESET}\n"
|
||||
echo
|
||||
|
||||
|
|
|
|||
|
|
@ -1,12 +1,12 @@
|
|||
---
|
||||
name: webclaw
|
||||
name: noxa
|
||||
description: Web extraction engine with antibot bypass. Scrape, crawl, extract, summarize, search, map, diff, monitor, research, and analyze any URL — including Cloudflare-protected sites. Use when you need reliable web content, the built-in web_fetch fails, or you need structured data extraction from web pages.
|
||||
homepage: https://webclaw.io
|
||||
homepage: https://noxa.io
|
||||
user-invocable: true
|
||||
metadata: {"openclaw":{"emoji":"🦀","requires":{"env":["WEBCLAW_API_KEY"]},"primaryEnv":"WEBCLAW_API_KEY","homepage":"https://webclaw.io","install":[{"id":"npx","kind":"node","bins":["webclaw-mcp"],"label":"npx create-webclaw"}]}}
|
||||
metadata: {"openclaw":{"emoji":"🦀","requires":{"env":["NOXA_API_KEY"]},"primaryEnv":"NOXA_API_KEY","homepage":"https://noxa.io","install":[{"id":"npx","kind":"node","bins":["noxa-mcp"],"label":"npx create-noxa"}]}}
|
||||
---
|
||||
|
||||
# webclaw
|
||||
# noxa
|
||||
|
||||
High-quality web extraction with automatic antibot bypass. Beats Firecrawl on extraction quality and handles Cloudflare, DataDome, and JS-rendered pages automatically.
|
||||
|
||||
|
|
@ -27,17 +27,17 @@ High-quality web extraction with automatic antibot bypass. Beats Firecrawl on ex
|
|||
|
||||
## API base
|
||||
|
||||
All requests go to `https://api.webclaw.io/v1/`.
|
||||
All requests go to `https://api.noxa.io/v1/`.
|
||||
|
||||
Authentication: `Authorization: Bearer $WEBCLAW_API_KEY`
|
||||
Authentication: `Authorization: Bearer $NOXA_API_KEY`
|
||||
|
||||
## Endpoints
|
||||
|
||||
### 1. Scrape — extract content from a single URL
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/scrape \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/scrape \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com",
|
||||
|
|
@ -96,8 +96,8 @@ Starts an async job. Poll for results.
|
|||
|
||||
**Start crawl:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/crawl \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/crawl \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://docs.example.com",
|
||||
|
|
@ -111,8 +111,8 @@ Response: `{ "job_id": "abc-123", "status": "running" }`
|
|||
|
||||
**Poll status:**
|
||||
```bash
|
||||
curl https://api.webclaw.io/v1/crawl/abc-123 \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl https://api.noxa.io/v1/crawl/abc-123 \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
Response when complete:
|
||||
|
|
@ -151,8 +151,8 @@ Response when complete:
|
|||
Fast URL discovery without full content extraction.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/map \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/map \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url": "https://example.com"}'
|
||||
```
|
||||
|
|
@ -173,8 +173,8 @@ Response:
|
|||
### 4. Batch — scrape multiple URLs in parallel
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/batch \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/batch \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"urls": [
|
||||
|
|
@ -207,8 +207,8 @@ Pull structured data from any page using a JSON schema or plain-text prompt.
|
|||
|
||||
**With JSON schema:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/extract \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/extract \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/pricing",
|
||||
|
|
@ -233,8 +233,8 @@ curl -X POST https://api.webclaw.io/v1/extract \
|
|||
|
||||
**With prompt:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/extract \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/extract \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/pricing",
|
||||
|
|
@ -258,8 +258,8 @@ Response:
|
|||
### 6. Summarize — get a quick summary of any page
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/summarize \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/summarize \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/long-article",
|
||||
|
|
@ -280,8 +280,8 @@ Response:
|
|||
Compare current page content against a previous snapshot.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/diff \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/diff \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com",
|
||||
|
|
@ -309,8 +309,8 @@ Response:
|
|||
Analyze a website's visual identity: colors, fonts, logo.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/brand \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/brand \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url": "https://example.com"}'
|
||||
```
|
||||
|
|
@ -336,8 +336,8 @@ Response:
|
|||
Search the web and optionally scrape each result page.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/search \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/search \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"query": "best rust web frameworks 2026",
|
||||
|
|
@ -390,8 +390,8 @@ Starts an async research job that searches, scrapes, and synthesizes information
|
|||
|
||||
**Start research:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/research \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/research \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"query": "How does Cloudflare Turnstile work and what are its known bypass methods?",
|
||||
|
|
@ -416,8 +416,8 @@ Response: `{ "id": "res-abc-123", "status": "running" }`
|
|||
|
||||
**Poll results:**
|
||||
```bash
|
||||
curl https://api.webclaw.io/v1/research/res-abc-123 \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl https://api.noxa.io/v1/research/res-abc-123 \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
Response when complete:
|
||||
|
|
@ -448,8 +448,8 @@ Response when complete:
|
|||
Use an AI agent to navigate and interact with a page to accomplish a specific goal. The agent can click, scroll, fill forms, and extract data across multiple steps.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/agent-scrape \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/agent-scrape \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/products",
|
||||
|
|
@ -488,8 +488,8 @@ Create persistent monitors that check a URL on a schedule and notify via webhook
|
|||
|
||||
**Create a monitor:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/watch \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||
curl -X POST https://api.noxa.io/v1/watch \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/pricing",
|
||||
|
|
@ -524,8 +524,8 @@ Response:
|
|||
|
||||
**List all monitors:**
|
||||
```bash
|
||||
curl https://api.webclaw.io/v1/watch \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl https://api.noxa.io/v1/watch \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
Response:
|
||||
|
|
@ -546,8 +546,8 @@ Response:
|
|||
|
||||
**Get a monitor with snapshots:**
|
||||
```bash
|
||||
curl https://api.webclaw.io/v1/watch/watch-abc-123 \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl https://api.noxa.io/v1/watch/watch-abc-123 \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
Response:
|
||||
|
|
@ -573,14 +573,14 @@ Response:
|
|||
|
||||
**Trigger an immediate check:**
|
||||
```bash
|
||||
curl -X POST https://api.webclaw.io/v1/watch/watch-abc-123/check \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl -X POST https://api.noxa.io/v1/watch/watch-abc-123/check \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
**Delete a monitor:**
|
||||
```bash
|
||||
curl -X DELETE https://api.webclaw.io/v1/watch/watch-abc-123 \
|
||||
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||
curl -X DELETE https://api.noxa.io/v1/watch/watch-abc-123 \
|
||||
-H "Authorization: Bearer $NOXA_API_KEY"
|
||||
```
|
||||
|
||||
## Choosing the right format
|
||||
|
|
@ -608,7 +608,7 @@ curl -X DELETE https://api.webclaw.io/v1/watch/watch-abc-123 \
|
|||
|
||||
## Smart Fetch Architecture
|
||||
|
||||
The webclaw MCP server uses a **local-first** approach:
|
||||
The noxa MCP server uses a **local-first** approach:
|
||||
|
||||
1. **Local fetch** — fast, free, no API credits used (~80% of sites)
|
||||
2. **Cloud API fallback** — automatic when bot protection or JS rendering is detected
|
||||
|
|
@ -617,11 +617,11 @@ This means:
|
|||
- Most scrapes cost zero credits (local extraction)
|
||||
- Cloudflare, DataDome, AWS WAF sites automatically fall back to the cloud API
|
||||
- JS-rendered SPAs (React, Next.js, Vue) also fall back automatically
|
||||
- Set `WEBCLAW_API_KEY` to enable cloud fallback
|
||||
- Set `NOXA_API_KEY` to enable cloud fallback
|
||||
|
||||
## vs web_fetch
|
||||
|
||||
| | webclaw | web_fetch |
|
||||
| | noxa | web_fetch |
|
||||
|---|---------|-----------|
|
||||
| Cloudflare bypass | Automatic (cloud fallback) | Fails (403) |
|
||||
| JS-rendered pages | Automatic fallback | Readability only |
|
||||
|
|
@ -631,4 +631,4 @@ This means:
|
|||
| Caching | Built-in, configurable TTL | Per-session |
|
||||
| Rate limiting | Managed server-side | Client responsibility |
|
||||
|
||||
Use `web_fetch` for simple, fast lookups. Use webclaw when you need reliability, quality, or advanced features.
|
||||
Use `web_fetch` for simple, fast lookups. Use noxa when you need reliability, quality, or advanced features.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue