From a8eb6b3cfaf566c289d8e4dd26aa7c2711fb9ca6 Mon Sep 17 00:00:00 2001 From: Valerio Date: Sat, 20 Jun 2026 14:43:55 +0200 Subject: [PATCH 1/2] fix(deploy): write WEBCLAW_API_KEY in generated .env, not WEBCLAW_AUTH_KEY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit setup.sh and deploy/hetzner.sh emitted WEBCLAW_AUTH_KEY into the server's .env, but webclaw-server reads WEBCLAW_API_KEY (env = "WEBCLAW_API_KEY"). The generated key was silently ignored — and since hetzner.sh binds 0.0.0.0, the server refused to start at all (it rejects a public bind without WEBCLAW_API_KEY). Fix both .env writers, plus the hetzner help line that told users to grep the wrong name and the env.example sample. Co-Authored-By: Claude Opus 4.8 (1M context) --- deploy/hetzner.sh | 4 ++-- env.example | 2 +- setup.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deploy/hetzner.sh b/deploy/hetzner.sh index 826c9b5..5deff22 100755 --- a/deploy/hetzner.sh +++ b/deploy/hetzner.sh @@ -196,7 +196,7 @@ build_cloud_init() { local env_content="# webclaw deployment — generated by hetzner.sh WEBCLAW_HOST=0.0.0.0 WEBCLAW_PORT=3000 -WEBCLAW_AUTH_KEY=$auth_key +WEBCLAW_API_KEY=$auth_key OLLAMA_HOST=http://ollama:11434 OLLAMA_MODEL=$ollama_model WEBCLAW_LOG=info" @@ -469,7 +469,7 @@ create_server() { printf " ${BOLD}SSH:${RESET} ssh root@%s\n" "$server_ip" printf " ${BOLD}Auth key:${RESET} %s\n" "$(mask_secret "$AUTH_KEY")" printf " ${DIM}(full key stored in /opt/webclaw/.env on the server:\n" - printf " ssh root@%s 'grep WEBCLAW_AUTH_KEY /opt/webclaw/.env')${RESET}\n" "$server_ip" + printf " ssh root@%s 'grep WEBCLAW_API_KEY /opt/webclaw/.env')${RESET}\n" "$server_ip" echo printf " ${BOLD}Monitor build progress:${RESET}\n" printf " ssh root@%s 'cd /opt/webclaw && docker compose logs -f'\n" "$server_ip" diff --git a/env.example b/env.example index cfe9c61..58e3182 100644 --- a/env.example +++ b/env.example @@ -29,7 +29,7 @@ 
OLLAMA_MODEL=qwen3:8b # --- Server (webclaw-server only) --- # WEBCLAW_PORT=3000 # WEBCLAW_HOST=0.0.0.0 -# WEBCLAW_AUTH_KEY=your-auth-key +# WEBCLAW_API_KEY=your-auth-key # WEBCLAW_MAX_CONCURRENCY=50 # WEBCLAW_JOB_TTL_SECS=3600 # WEBCLAW_MAX_JOBS=100 diff --git a/setup.sh b/setup.sh index 1b3c0fb..f059ba3 100755 --- a/setup.sh +++ b/setup.sh @@ -244,7 +244,7 @@ EOF # --- Server --- WEBCLAW_PORT=$server_port WEBCLAW_HOST=0.0.0.0 -WEBCLAW_AUTH_KEY=$auth_key +WEBCLAW_API_KEY=$auth_key # --- Logging --- WEBCLAW_LOG=info From e9abc8f459d91f9ea234bc7bf786074ccc1e065f Mon Sep 17 00:00:00 2001 From: Valerio Date: Sat, 20 Jun 2026 14:44:03 +0200 Subject: [PATCH 2/2] docs(claude-md): correct LLM chain, fetch modules, extractor count - LLM provider chain is Ollama -> OpenAI -> Gemini -> Anthropic; Gemini was added ahead of Anthropic (Google Cloud credits preferred) but the docs still listed Ollama -> OpenAI -> Anthropic. - Document the top-level webclaw-fetch verticals reddit.rs / linkedin.rs (distinct from extractors/ and webclaw-core parsers) and progress.rs. - Bump extractor count ~28 -> ~30 and call out the shared helpers (og.rs, github_common.rs, jsonld_product.rs, ecommerce_product.rs). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 387c2dd..959e49c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -16,7 +16,7 @@ webclaw/ # + PDF content-type detection # + document parsing (DOCX, XLSX, CSV) # + layered URL discovery (map) + Serper web search (BYO key) - webclaw-llm/ # LLM provider chain (Ollama -> OpenAI -> Anthropic) + webclaw-llm/ # LLM provider chain (Ollama -> OpenAI -> Gemini -> Anthropic) # + JSON schema extraction, prompt extraction, summarization webclaw-pdf/ # PDF text extraction via pdf-extract webclaw-mcp/ # MCP server (Model Context Protocol) for AI agents @@ -49,7 +49,9 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R - `fetcher.rs` — the public `Fetcher` trait (`Send + Sync`). Vertical extractors take `&dyn Fetcher`, not `&FetchClient`. - `browser.rs` — `BrowserProfile`/`BrowserVariant` enums only (Chrome, ChromeMacos, Firefox, Safari, SafariIos26, Edge). No version numbers live here. - `tls.rs` — the real fingerprint builder: per-variant wreq `Emulation` (cipher/sigalg/curve lists, TLS extension order, HTTP/2 SETTINGS, header wire-order). Browser versions are set HERE: Chrome 145, Firefox 135, Edge 145, Safari 18.3.1, Safari iOS 26. SafariIos26 composes on top of `wreq_util::Profile::SafariIos26`. SSRF-safe redirect policy lives here too. -- `extractors/` — ~28 vertical site extractors (Amazon, eBay, GitHub, Instagram, LinkedIn, Reddit, YouTube, npm, PyPI, HuggingFace, ...); `extractors/mod.rs` is the dispatch table. All reach the network through `&dyn Fetcher`. `extractors/og.rs` is the shared single-pass Open Graph (`og:*`) meta parser the verticals use (`raw()` vs `unescaped()`). 
+- `extractors/` — ~30 vertical site extractors (Amazon, eBay, GitHub, Instagram, LinkedIn, Reddit, YouTube, npm, PyPI, HuggingFace, Etsy, Shopify, WooCommerce, Trustpilot, arXiv, Hacker News, StackOverflow, ...); `extractors/mod.rs` is the dispatch table. All reach the network through `&dyn Fetcher`. Shared helpers (not verticals themselves): `extractors/og.rs` (single-pass Open Graph `og:*` parser, `raw()` vs `unescaped()`), `extractors/github_common.rs` (shared GitHub API fetch + status handling), `extractors/jsonld_product.rs` / `ecommerce_product.rs` (shared JSON-LD product walker reused by the e-commerce verticals). +- `reddit.rs` / `linkedin.rs` — top-level fetch-side verticals (distinct from `extractors/` and from `webclaw-core`'s parsers): `reddit.rs` rewrites Reddit hosts to `old.reddit.com` (the `*.json` API is blocked) so `webclaw-core::reddit` can parse server-rendered HTML; `linkedin.rs` reconstructs post + comments from the SPA's HTML-escaped JSON in `<code>` tags (the `included` typed-entity array). +- `progress.rs` — wraps a slow fetch future in `tokio::select!` against an interval, emitting a periodic `# webclaw: still fetching (Ns)` line to STDERR. 
- `crawler.rs` — BFS same-origin crawler with configurable depth/concurrency/delay - `sitemap.rs` — Sitemap discovery and parsing (sitemap.xml, robots.txt; gzip `.xml.gz` supported via `decode_sitemap_body`, sitemap-index recursion) - `map.rs` — layered URL discovery (`discover_urls` / `MapOptions`): sitemaps first, then a bounded same-origin crawl fallback when the sitemap is thin, harvesting links from fetched pages + the unfetched frontier (deduped against the sitemap set) @@ -61,7 +63,7 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R - `url_security.rs` — SSRF guards + SSRF-safe redirect policy ### LLM Modules (`webclaw-llm`) -- Provider chain: Ollama (local-first) -> OpenAI -> Anthropic +- Provider chain (`chain.rs`): Ollama (local-first, always added; availability checked at call time) -> OpenAI -> Gemini -> Anthropic. Gemini sits ahead of Anthropic so Google Cloud credits are preferred; Anthropic is the last-resort fallback. Each provider lives in `providers/` (`ollama.rs`, `openai.rs`, `gemini.rs`, `anthropic.rs`). - JSON schema extraction, prompt-based extraction, summarization ### PDF Modules (`webclaw-pdf`)