From a8eb6b3cfaf566c289d8e4dd26aa7c2711fb9ca6 Mon Sep 17 00:00:00 2001
From: Valerio <massimianivalerio1@gmail.com>
Date: Sat, 20 Jun 2026 14:43:55 +0200
Subject: [PATCH 1/2] fix(deploy): write WEBCLAW_API_KEY in generated .env, not
 WEBCLAW_AUTH_KEY
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

setup.sh and deploy/hetzner.sh emitted WEBCLAW_AUTH_KEY into the server's
.env, but webclaw-server reads WEBCLAW_API_KEY (env = "WEBCLAW_API_KEY").
The generated key was silently ignored — and since hetzner.sh binds
0.0.0.0, the server refused to start at all (it rejects a public bind
without WEBCLAW_API_KEY). Fix both .env writers, the hetzner help line
that told users to grep for the wrong name, and the env.example sample.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 deploy/hetzner.sh | 4 ++--
 env.example       | 2 +-
 setup.sh          | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/deploy/hetzner.sh b/deploy/hetzner.sh
index 826c9b5..5deff22 100755
--- a/deploy/hetzner.sh
+++ b/deploy/hetzner.sh
@@ -196,7 +196,7 @@ build_cloud_init() {
     local env_content="# webclaw deployment — generated by hetzner.sh
 WEBCLAW_HOST=0.0.0.0
 WEBCLAW_PORT=3000
-WEBCLAW_AUTH_KEY=$auth_key
+WEBCLAW_API_KEY=$auth_key
 OLLAMA_HOST=http://ollama:11434
 OLLAMA_MODEL=$ollama_model
 WEBCLAW_LOG=info"
@@ -469,7 +469,7 @@ create_server() {
     printf "  ${BOLD}SSH:${RESET}          ssh root@%s\n" "$server_ip"
     printf "  ${BOLD}Auth key:${RESET}     %s\n" "$(mask_secret "$AUTH_KEY")"
     printf "  ${DIM}(full key stored in /opt/webclaw/.env on the server:\n"
-    printf "   ssh root@%s 'grep WEBCLAW_AUTH_KEY /opt/webclaw/.env')${RESET}\n" "$server_ip"
+    printf "   ssh root@%s 'grep WEBCLAW_API_KEY /opt/webclaw/.env')${RESET}\n" "$server_ip"
     echo
     printf "  ${BOLD}Monitor build progress:${RESET}\n"
     printf "    ssh root@%s 'cd /opt/webclaw && docker compose logs -f'\n" "$server_ip"
diff --git a/env.example b/env.example
index cfe9c61..58e3182 100644
--- a/env.example
+++ b/env.example
@@ -29,7 +29,7 @@ OLLAMA_MODEL=qwen3:8b
 # --- Server (webclaw-server only) ---
 # WEBCLAW_PORT=3000
 # WEBCLAW_HOST=0.0.0.0
-# WEBCLAW_AUTH_KEY=your-auth-key
+# WEBCLAW_API_KEY=your-auth-key
 # WEBCLAW_MAX_CONCURRENCY=50
 # WEBCLAW_JOB_TTL_SECS=3600
 # WEBCLAW_MAX_JOBS=100
diff --git a/setup.sh b/setup.sh
index 1b3c0fb..f059ba3 100755
--- a/setup.sh
+++ b/setup.sh
@@ -244,7 +244,7 @@ EOF
 # --- Server ---
 WEBCLAW_PORT=$server_port
 WEBCLAW_HOST=0.0.0.0
-WEBCLAW_AUTH_KEY=$auth_key
+WEBCLAW_API_KEY=$auth_key
 
 # --- Logging ---
 WEBCLAW_LOG=info

From e9abc8f459d91f9ea234bc7bf786074ccc1e065f Mon Sep 17 00:00:00 2001
From: Valerio <massimianivalerio1@gmail.com>
Date: Sat, 20 Jun 2026 14:44:03 +0200
Subject: [PATCH 2/2] docs(claude-md): correct LLM chain, fetch modules,
 extractor count

- LLM provider chain is Ollama -> OpenAI -> Gemini -> Anthropic; Gemini
  was added ahead of Anthropic (Google Cloud credits preferred) but the
  docs still listed Ollama -> OpenAI -> Anthropic.
- Document the top-level webclaw-fetch verticals reddit.rs / linkedin.rs
  (distinct from extractors/ and webclaw-core parsers). Also document
  progress.rs (periodic "still fetching" notice on stderr).
- Bump extractor count ~28 -> ~30 and call out the shared helpers
  (og.rs, github_common.rs, jsonld_product.rs, ecommerce_product.rs).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 387c2dd..959e49c 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -16,7 +16,7 @@ webclaw/
                       # + PDF content-type detection
                       # + document parsing (DOCX, XLSX, CSV)
                       # + layered URL discovery (map) + Serper web search (BYO key)
-    webclaw-llm/      # LLM provider chain (Ollama -> OpenAI -> Anthropic)
+    webclaw-llm/      # LLM provider chain (Ollama -> OpenAI -> Gemini -> Anthropic)
                       # + JSON schema extraction, prompt extraction, summarization
     webclaw-pdf/      # PDF text extraction via pdf-extract
     webclaw-mcp/      # MCP server (Model Context Protocol) for AI agents
@@ -49,7 +49,9 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R
 - `fetcher.rs` — the public `Fetcher` trait (`Send + Sync`). Vertical extractors take `&dyn Fetcher`, not `&FetchClient`.
 - `browser.rs` — `BrowserProfile`/`BrowserVariant` enums only (Chrome, ChromeMacos, Firefox, Safari, SafariIos26, Edge). No version numbers live here.
 - `tls.rs` — the real fingerprint builder: per-variant wreq `Emulation` (cipher/sigalg/curve lists, TLS extension order, HTTP/2 SETTINGS, header wire-order). Browser versions are set HERE: Chrome 145, Firefox 135, Edge 145, Safari 18.3.1, Safari iOS 26. SafariIos26 composes on top of `wreq_util::Profile::SafariIos26`. SSRF-safe redirect policy lives here too.
-- `extractors/` — ~28 vertical site extractors (Amazon, eBay, GitHub, Instagram, LinkedIn, Reddit, YouTube, npm, PyPI, HuggingFace, ...); `extractors/mod.rs` is the dispatch table. All reach the network through `&dyn Fetcher`. `extractors/og.rs` is the shared single-pass Open Graph (`og:*`) meta parser the verticals use (`raw()` vs `unescaped()`).
+- `extractors/` — ~30 vertical site extractors (Amazon, eBay, GitHub, Instagram, LinkedIn, Reddit, YouTube, npm, PyPI, HuggingFace, Etsy, Shopify, WooCommerce, Trustpilot, arXiv, Hacker News, Stack Overflow, ...); `extractors/mod.rs` is the dispatch table. All reach the network through `&dyn Fetcher`. Shared helpers (not verticals themselves): `extractors/og.rs` (single-pass Open Graph `og:*` parser, `raw()` vs `unescaped()`), `extractors/github_common.rs` (shared GitHub API fetch + status handling), `extractors/jsonld_product.rs` / `extractors/ecommerce_product.rs` (shared JSON-LD product walker reused by the e-commerce verticals).
+- `reddit.rs` / `linkedin.rs` — top-level fetch-side verticals (distinct from `extractors/` and from `webclaw-core`'s parsers): `reddit.rs` rewrites Reddit hosts to `old.reddit.com` (the `*.json` API is blocked) so `webclaw-core::reddit` can parse server-rendered HTML; `linkedin.rs` reconstructs post + comments from the SPA's HTML-escaped JSON in `<code>` tags (the `included` typed-entity array).
+- `progress.rs` — wraps a slow fetch future in `tokio::select!` against an interval, emitting a periodic `# webclaw: still fetching <URL> (Ns)` line to STDERR.
 - `crawler.rs` — BFS same-origin crawler with configurable depth/concurrency/delay
 - `sitemap.rs` — Sitemap discovery and parsing (sitemap.xml, robots.txt; gzip `.xml.gz` supported via `decode_sitemap_body`, sitemap-index recursion)
 - `map.rs` — layered URL discovery (`discover_urls` / `MapOptions`): sitemaps first, then a bounded same-origin crawl fallback when the sitemap is thin, harvesting links from fetched pages + the unfetched frontier (deduped against the sitemap set)
@@ -61,7 +63,7 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R
 - `url_security.rs` — SSRF guards + SSRF-safe redirect policy
 
 ### LLM Modules (`webclaw-llm`)
-- Provider chain: Ollama (local-first) -> OpenAI -> Anthropic
+- Provider chain (`chain.rs`): Ollama (local-first, always added; availability checked at call time) -> OpenAI -> Gemini -> Anthropic. Gemini sits ahead of Anthropic so Google Cloud credits are preferred; Anthropic is the last-resort fallback. Each provider lives in `providers/` (`ollama.rs`, `openai.rs`, `gemini.rs`, `anthropic.rs`).
 - JSON schema extraction, prompt-based extraction, summarization
 
 ### PDF Modules (`webclaw-pdf`)