Compare commits

..

No commits in common. "main" and "v0.6.3" have entirely different histories.
main ... v0.6.3

67 changed files with 848 additions and 6153 deletions

BIN
.github/banner.png vendored

Binary file not shown.

Before

Width:  |  Height:  |  Size: 48 KiB

After

Width:  |  Height:  |  Size: 44 KiB

Before After
Before After

View file

@ -14,7 +14,7 @@ jobs:
name: Test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
- run: cargo test --workspace
@ -23,7 +23,7 @@ jobs:
name: Lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
with:
components: clippy, rustfmt
@ -35,7 +35,7 @@ jobs:
name: WASM
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
with:
targets: wasm32-unknown-unknown
@ -50,7 +50,7 @@ jobs:
name: Docs
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
- run: cargo doc --no-deps --workspace

View file

@ -14,7 +14,7 @@ jobs:
name: Update webclaw-tls dependencies
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
with:
token: ${{ secrets.SYNC_PAT }}

View file

@ -3,15 +3,6 @@ name: Release
on:
push:
tags: ["v*"]
# Manual re-publish of the Docker image for an existing release, without
# rebuilding binaries or cutting a new version. Runs only the docker (+
# homebrew) jobs against the given tag's already-published release assets.
workflow_dispatch:
inputs:
tag:
description: "Existing release tag to (re)build + push the Docker image for, e.g. v0.6.9"
required: true
type: string
permissions:
contents: read
@ -21,9 +12,6 @@ env:
jobs:
build:
# Binaries are only built when a tag is pushed. A manual dispatch reuses
# the existing release's binaries, so it skips this job entirely.
if: github.event_name == 'push'
permissions:
contents: read
name: Build ${{ matrix.target }}
@ -44,7 +32,7 @@ jobs:
os: windows-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
with:
@ -110,20 +98,19 @@ jobs:
fi
- name: Upload artifact
uses: actions/upload-artifact@v5
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.target }}
path: ${{ env.ASSET }}
release:
name: Release
if: github.event_name == 'push'
needs: build
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- uses: actions/download-artifact@v5
- uses: actions/download-artifact@v4
with:
path: artifacts
@ -150,16 +137,12 @@ jobs:
docker:
name: Docker
needs: release
# Runs after a successful release on tag push, or standalone via
# workflow_dispatch to (re)publish an existing tag's image. `always()` lets
# it run even though `release` is skipped on a manual dispatch.
if: ${{ always() && (github.event_name == 'workflow_dispatch' || needs.release.result == 'success') }}
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: docker/setup-qemu-action@v3
with:
@ -173,48 +156,49 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
# The pushed tag, or the workflow_dispatch input for a manual re-publish.
- name: Resolve tag
id: tag
run: echo "tag=${{ github.event.inputs.tag || github.ref_name }}" >> "$GITHUB_OUTPUT"
# Download pre-built binaries into TARGETARCH-named dirs (amd64/arm64) so
# a single multi-platform build picks the matching binary per platform.
# Download pre-built binaries for both architectures
- name: Download release binaries
run: |
tag="${{ steps.tag.outputs.tag }}"
declare -A arch=( [x86_64-unknown-linux-gnu]=amd64 [aarch64-unknown-linux-gnu]=arm64 )
tag="${GITHUB_REF#refs/tags/}"
for target in x86_64-unknown-linux-gnu aarch64-unknown-linux-gnu; do
dir="webclaw-${tag}-${target}"
curl -sSL "https://github.com/0xMassi/webclaw/releases/download/${tag}/${dir}.tar.gz" -o "${target}.tar.gz"
tar xzf "${target}.tar.gz"
a="${arch[$target]}"
mkdir -p "binaries-${a}"
cp "${dir}/webclaw" "${dir}/webclaw-mcp" "${dir}/webclaw-server" "binaries-${a}/"
chmod +x "binaries-${a}"/*
mkdir -p "binaries-${target}"
cp "${dir}/webclaw" "binaries-${target}/webclaw"
cp "${dir}/webclaw-mcp" "binaries-${target}/webclaw-mcp"
cp "${dir}/webclaw-server" "binaries-${target}/webclaw-server"
chmod +x "binaries-${target}"/*
done
ls -laR binaries-*/
# One atomic multi-platform build + push. buildx assembles a single
# manifest list and pushes it in one shot, so there is no separate
# `imagetools create` step to race GHCR's read-after-write (that is what
# failed before: "v0.6.9-arm64: not found"). Provenance/SBOM attestations
# are disabled so each platform entry stays a plain image manifest.
# Build per-arch images with plain docker build (no buildx manifest nesting)
- name: Build and push
run: |
tag="${{ steps.tag.outputs.tag }}"
docker buildx build -f Dockerfile.ci \
--platform linux/amd64,linux/arm64 \
--provenance=false --sbom=false \
-t "ghcr.io/0xmassi/webclaw:${tag}" \
-t ghcr.io/0xmassi/webclaw:latest \
--push .
tag="${GITHUB_REF#refs/tags/}"
# amd64
docker build -f Dockerfile.ci --build-arg BINARY_DIR=binaries-x86_64-unknown-linux-gnu \
--platform linux/amd64 -t ghcr.io/0xmassi/webclaw:${tag}-amd64 --push .
# arm64
docker build -f Dockerfile.ci --build-arg BINARY_DIR=binaries-aarch64-unknown-linux-gnu \
--platform linux/arm64 -t ghcr.io/0xmassi/webclaw:${tag}-arm64 --push .
# Multi-arch manifest
docker manifest create ghcr.io/0xmassi/webclaw:${tag} \
ghcr.io/0xmassi/webclaw:${tag}-amd64 \
ghcr.io/0xmassi/webclaw:${tag}-arm64
docker manifest push ghcr.io/0xmassi/webclaw:${tag}
docker manifest create ghcr.io/0xmassi/webclaw:latest \
ghcr.io/0xmassi/webclaw:${tag}-amd64 \
ghcr.io/0xmassi/webclaw:${tag}-arm64
docker manifest push ghcr.io/0xmassi/webclaw:latest
homebrew:
name: Update Homebrew
needs: [release, docker]
# Runs once Docker succeeds, on both tag push and manual re-publish.
if: ${{ always() && needs.docker.result == 'success' }}
runs-on: ubuntu-latest
permissions:
contents: read
@ -223,7 +207,7 @@ jobs:
env:
COMMITTER_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }}
run: |
tag="${{ github.event.inputs.tag || github.ref_name }}"
tag="${GITHUB_REF#refs/tags/}"
base="https://github.com/0xMassi/webclaw/releases/download/${tag}"
# Download all tarballs (Linux + macOS) and compute SHAs

View file

@ -1,7 +0,0 @@
{
"mcpServers": {
"webclaw": {
"command": "~/.webclaw/webclaw-mcp"
}
}
}

View file

@ -3,81 +3,6 @@
All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/).
## [Unreleased]
## [0.6.13] - 2026-06-17
### Performance
- Faster content extraction with byte-identical output. The markdown noise filter no longer recompiles its CSS selectors on every element; the vertical extractors share a single Open Graph meta parse instead of re-scanning the page per field; the JavaScript sandbox is skipped entirely when a page has no JS-assigned data (and reuses the already-parsed document instead of re-parsing); and the HTTP client now tunes its connection pool (connect timeout, idle-pool reuse, keep-alive) for better connection reuse across requests.
## [0.6.12] - 2026-06-17
### Added
- **Standalone web search** using your own [Serper.dev](https://serper.dev) key — no hosted webclaw account needed. Available across the CLI (`webclaw search "query" --num 5 --scrape`, key via `--serper-key` or `SERPER_API_KEY`), the MCP `search` tool (local-first when `SERPER_API_KEY` is set, hosted API otherwise), and the self-hosted REST server (`POST /v1/search`, enabled when started with `SERPER_API_KEY`). With `--scrape`, the top result pages are fetched and extracted to markdown.
- **Layered URL discovery for `--map`**: when a site has no sitemap or only a thin one, map now falls back to a bounded same-origin crawl and harvests links from every fetched page plus the unfetched frontier, returning far more URLs. Adds gzipped-sitemap (`.xml.gz`) support, deeper sitemap-index recursion, more fallback paths, and `--map-pages` / `--no-map-crawl` / `--map-limit` controls. Crawler logs now go to stderr so `--map --format json` stays machine-parseable.
### Fixed
- MCP tools now accept boolean arguments whether the client sends them as JSON booleans or as the strings `"true"`/`"false"` (case-insensitive). Some MCP clients (e.g. Claude Desktop) send booleans as strings, which previously failed the call with a deserialization error. Affects `scrape` (only_main_content), `crawl` (use_sitemap), `research` (deep), and `search` (scrape). This completes the earlier numeric-parameter fix.
## [0.6.11] - 2026-06-16
### Added
- New **Google Gemini** provider in the LLM provider chain. Set `GEMINI_API_KEY` (and optionally `GEMINI_MODEL`, default `gemini-2.5-flash`) to enable it; the chain tries Ollama → OpenAI → Gemini → Anthropic and uses the first available provider.
### Fixed
- The Anthropic provider's default model pointed at a retired model id that now returns `404`, which could fail extraction/summarization when falling back to Anthropic. It now defaults to a current model and is overridable via `ANTHROPIC_MODEL`.
## [0.6.10] - 2026-06-15
### Fixed
- MCP tools that take numeric arguments now accept those values whether the client sends them as numbers or as numeric strings. Some MCP clients (e.g. Claude Desktop) send `"5"` instead of `5`, which previously failed the call with a deserialization error. Affects `crawl` (depth, max_pages, concurrency), `batch` (concurrency), `search` (num_results), and `summarize` (max_sentences).
## [0.6.9] - 2026-06-10
### Fixed
- The multi-arch Docker image (linux/amd64 + linux/arm64) now publishes reliably on each release. The build moved to Buildx so registry pushes no longer fail intermittently, and the Homebrew formula update that depends on it is no longer skipped.
## [0.6.8] - 2026-06-10
### Fixed
- Pages with multibyte text (accented or CJK characters) no longer panic or get mangled during extraction. API-endpoint discovery now cuts oversized scripts on a character boundary instead of crashing mid-character, and structured-data parsing preserves non-ASCII string values instead of turning them into mojibake.
- LLM error messages from a provider no longer panic when the error body contains multibyte characters near the truncation point.
- LLM provider requests now have explicit connect and overall timeouts, so a stalled or unreachable provider fails fast instead of hanging.
- Batch extraction in the MCP server no longer aborts the whole batch when a single URL fails to resolve; bad URLs are reported as individual per-URL errors and the rest still run.
- CLI crawl and batch runs now wait for the completion webhook to actually send before exiting, replacing a fixed delay that could cut the request off or waste time.
- Homepage warm-up requests now include the port for hosts on a non-default port, so those sites are warmed correctly.
---
## [0.6.7] — 2026-06-09
### Changed
- Updated the HTTP/TLS engine (wreq 6.0.0-rc.29, wreq-util 3.0.0-rc.12). This pulls in upstream robustness fixes: no more panic on responses with non-UTF8 header values, a fix for short reads when decoding large compressed bodies, and the TCP nodelay setting is restored. Browser TLS fingerprints are unchanged.
---
## [0.6.6] — 2026-06-09
### Added
- Slow fetches now print a progress line to stderr every 10 seconds (`# webclaw: still fetching <url> (Ns)`) so a long request no longer looks like the CLI hung. Fast fetches stay silent and stdout is untouched.
- New `--url-encoded` flag plus a warning when a URL looks like the shell split it on `&` or `?`. The warning suggests quoting the URL; pass `--url-encoded` to silence it when the URL is intentional.
---
## [0.6.5] — 2026-06-04
### Changed
- Reddit threads extract reliably again. The old anonymous JSON endpoint is no longer available, so webclaw now reads old.reddit.com directly without an API key or JavaScript. You get the post plus the full nested comment tree, with authors, scores, timestamps, and reply nesting preserved. Comment text keeps its links and code blocks, hidden scores are reported as unknown rather than zero, and deleted comments stay in place so their replies aren't lost.
---
## [0.6.4] — 2026-05-19
### Added
- API surface discovery: a new module extracts the API endpoints embedded in a page's inline scripts and linked JavaScript bundles. It surfaces relative REST paths, absolute URLs, GraphQL operations, and WebSocket endpoints that a sitemap alone cannot reveal. A built-in noise filter drops schema.org and json-schema.org references, bare framework paths, and other non-API matches so the result stays focused on the real surface.
---
## [0.6.3] — 2026-05-19
### Fixed

View file

@ -15,7 +15,6 @@ webclaw/
# + proxy pool rotation (per-request)
# + PDF content-type detection
# + document parsing (DOCX, XLSX, CSV)
# + layered URL discovery (map) + Serper web search (BYO key)
webclaw-llm/ # LLM provider chain (Ollama -> OpenAI -> Anthropic)
# + JSON schema extraction, prompt extraction, summarization
webclaw-pdf/ # PDF text extraction via pdf-extract
@ -31,34 +30,25 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R
- `extractor.rs` — Readability-style scoring: text density, semantic tags, link density penalty
- `noise.rs` — Shared noise filter: tags, ARIA roles, class/ID patterns. Tailwind-safe.
- `data_island.rs` — JSON data island extraction for React SPAs, Next.js, Contentful CMS
- `structured_data.rs` — JSON-LD, Next.js `__NEXT_DATA__`, and SvelteKit data-island extraction
- `js_eval.rs` — QuickJS sandbox (rquickjs) that runs inline `<script>` tags to recover JS-assigned blobs (`window.__PRELOADED_STATE__`, Next.js `self.__next_f`) the static path can't see. Behind the default `quickjs` feature, gated `cfg(not(target_arch = "wasm32"))` — rquickjs links a C lib and won't build for wasm. Never ungate it (see Hard Rules). Runtime-gated for speed: the VM is skipped entirely when the page has no JS-candidate markers (`has_js_candidate_data`), and it reuses the already-parsed document instead of re-parsing.
- `endpoints.rs` — API surface discovery: REST paths, GraphQL, and WebSocket endpoints mined from inline scripts + JS bundle text (regex over string literals, DoS-bounded). Pure: caller passes raw text.
- `markdown.rs` — HTML to markdown with URL resolution, asset collection
- `llm/` — directory module (`mod` + `body`/`cleanup`/`images`/`links`/`metadata`): 9-step LLM optimization pipeline (image strip, emphasis strip, link dedup, stat merge, whitespace collapse)
- `llm.rs` — 9-step LLM optimization pipeline (image strip, emphasis strip, link dedup, stat merge, whitespace collapse)
- `domain.rs` — Domain detection from URL patterns + DOM heuristics
- `metadata.rs` — OG, Twitter Card, standard meta tag extraction
- `types.rs` — Core data structures (ExtractionResult, Metadata, Content, plus ExtractionOptions for include/exclude CSS selectors — applied in `extractor.rs`; there is no `filter.rs`)
- `types.rs` — Core data structures (ExtractionResult, Metadata, Content)
- `filter.rs` — CSS selector include/exclude filtering (ExtractionOptions)
- `diff.rs` — Content change tracking engine (snapshot diffing)
- `brand.rs` — Brand identity extraction from DOM structure and CSS
- `reddit.rs` — old.reddit.com thread vertical extractor (parses server-rendered HTML directly; no JS/API key). Test fixtures under `testdata/reddit/*.html` are `exclude`d from the published crate (Cargo.toml).
- `youtube.rs` — `ytInitialPlayerResponse` parser, structured markdown for `youtube.com/watch` URLs (title, channel, views, published, duration, description). Produces the legacy markdown shape — for transcripts and a structured `YoutubeData` block see the production server's `youtube_transcript.rs` short-circuit (yt-dlp via proxy pool).
### Fetch Modules (`webclaw-fetch`)
- `client.rs` — `FetchClient` with wreq BoringSSL TLS impersonation; also implements batch (`BatchResult`/`BatchExtractResult` — there is no `batch.rs`). Implements the public `Fetcher` trait so callers (incl. server adapters) can swap implementations.
- `fetcher.rs` — the public `Fetcher` trait (`Send + Sync`). Vertical extractors take `&dyn Fetcher`, not `&FetchClient`.
- `browser.rs` — `BrowserProfile`/`BrowserVariant` enums only (Chrome, ChromeMacos, Firefox, Safari, SafariIos26, Edge). No version numbers live here.
- `tls.rs` — the real fingerprint builder: per-variant wreq `Emulation` (cipher/sigalg/curve lists, TLS extension order, HTTP/2 SETTINGS, header wire-order). Browser versions are set HERE: Chrome 145, Firefox 135, Edge 145, Safari 18.3.1, Safari iOS 26. SafariIos26 composes on top of `wreq_util::Profile::SafariIos26`. SSRF-safe redirect policy lives here too.
- `extractors/` — ~28 vertical site extractors (Amazon, eBay, GitHub, Instagram, LinkedIn, Reddit, YouTube, npm, PyPI, HuggingFace, ...); `extractors/mod.rs` is the dispatch table. All reach the network through `&dyn Fetcher`. `extractors/og.rs` is the shared single-pass Open Graph (`og:*`) meta parser the verticals use (`raw()` vs `unescaped()`).
- `client.rs` — FetchClient with wreq BoringSSL TLS impersonation; implements the public `Fetcher` trait so callers (including server adapters) can swap in alternative implementations
- `browser.rs` — Browser profiles: Chrome (142/136/133/131), Firefox (144/135/133/128)
- `crawler.rs` — BFS same-origin crawler with configurable depth/concurrency/delay
- `sitemap.rs` — Sitemap discovery and parsing (sitemap.xml, robots.txt; gzip `.xml.gz` supported via `decode_sitemap_body`, sitemap-index recursion)
- `map.rs` — layered URL discovery (`discover_urls` / `MapOptions`): sitemaps first, then a bounded same-origin crawl fallback when the sitemap is thin, harvesting links from fetched pages + the unfetched frontier (deduped against the sitemap set)
- `search.rs` — web search via Serper.dev with the caller's own key (`search` / `SearchOptions` / `SearchResult`; pure `parse_serper_organic`). Plain wreq client (JSON API, no fingerprinting); optional bounded concurrent fetch+extract of result pages. Powers the CLI `search` subcommand, the MCP `search` tool, and the OSS server `POST /v1/search`.
- `sitemap.rs` — Sitemap discovery and parsing (sitemap.xml, robots.txt)
- `batch.rs` — Multi-URL concurrent extraction
- `proxy.rs` — Proxy pool with per-request rotation
- `document.rs` — Document parsing: DOCX, XLSX, CSV auto-detection and extraction
- `cloud.rs` — `CloudClient` for hosted antibot escalation, exposed via `Fetcher::cloud()`
- `locale.rs` — Accept-Language by TLD (`accept_language_for_tld` / `_for_url`)
- `url_security.rs` — SSRF guards + SSRF-safe redirect policy
- `search.rs` — Web search via Serper.dev with parallel result scraping
### LLM Modules (`webclaw-llm`)
- Provider chain: Ollama (local-first) -> OpenAI -> Anthropic
@ -69,31 +59,26 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R
### MCP Server (`webclaw-mcp`)
- Model Context Protocol server over stdio transport
- 12 tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search, list_extractors, vertical_scrape. `search` is local-first via the caller's `SERPER_API_KEY` (falls back to the hosted API when unset); `research` uses the hosted deep-research API. The rest run locally.
- 8 tools: scrape, crawl, map, batch, extract, summarize, diff, brand
- Works with Claude Desktop, Claude Code, and any MCP client
- Uses `rmcp` crate (official Rust MCP SDK)
### REST API Server (`webclaw-server`)
- Axum 0.8, stateless, no database, no job queue
- 10 POST routes (incl. `POST /v1/scrape/{vertical}` and `POST /v1/search`) +
`GET /v1/extractors` + `GET /health`. JSON shapes mirror api.webclaw.io
where the capability exists in OSS. The vertical surface
(`routes/structured.rs`) mirrors the MCP `list_extractors` /
`vertical_scrape` tools. `POST /v1/search` is gated on `SERPER_API_KEY`
(returns 501 when unset).
- 8 POST routes + /health, JSON shapes mirror api.webclaw.io where the
capability exists in OSS
- Constant-time bearer-token auth via `subtle::ConstantTimeEq` when
`--api-key` / `WEBCLAW_API_KEY` is set; otherwise open mode
- Hard caps: crawl ≤ 500 pages, batch ≤ 100 URLs, 20 concurrent
- Does NOT include: anti-bot bypass, JS rendering, async jobs,
multi-tenant auth, billing, proxy rotation, research/watch/
multi-tenant auth, billing, proxy rotation, search/research/watch/
agent-scrape. Those live behind api.webclaw.io and are closed-source.
(Web search IS available here as a bring-your-own-Serper-key path.)
## Hard Rules
- **Core has ZERO network dependencies** — takes `&str` HTML, returns structured output. Keep it WASM-compatible. The `quickjs` feature (default ON) pulls in rquickjs, which links a C lib and can't target wasm32; it's gated `cfg(not(target_arch = "wasm32"))` in `lib.rs`. CI compiles webclaw-core for wasm32 both with AND without default features — never ungate that.
- **webclaw-fetch pins wreq exactly**: `wreq = "=6.0.0-rc.29"` + `wreq-util = "=3.0.0-rc.12"` (BoringSSL). The `=` pin is deliberate — these are release candidates with no semver stability between rc.N builds. No `[patch.crates-io]` forks needed; wreq handles TLS internally.
- **No build flags in `.cargo/config.toml`** (it is comments-only) — don't add any locally. BUT CI (`.github/workflows/ci.yml`, `deps.yml`) DOES export `RUSTFLAGS: "--cfg reqwest_unstable"` for the wreq path; don't remove it from CI.
- **Core has ZERO network dependencies** — takes `&str` HTML, returns structured output. Keep it WASM-compatible.
- **webclaw-fetch uses wreq 6.x** (BoringSSL). No `[patch.crates-io]` forks needed; wreq handles TLS internally.
- **No special RUSTFLAGS** — `.cargo/config.toml` is currently empty of build flags. Don't add any.
- **webclaw-llm uses plain reqwest**. LLM APIs don't need TLS fingerprinting, so no wreq dep.
- **Vertical extractors take `&dyn Fetcher`**, not `&FetchClient`. This lets the production server plug in a `ProductionFetcher` that adds domain_hints routing and antibot escalation on top of the same wreq client.
- **qwen3 thinking tags** (`<think>`) are stripped at both provider and consumer levels.
@ -101,28 +86,12 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R
## Build & Test
```bash
cargo build --release # All three binaries (webclaw, webclaw-mcp, webclaw-server)
cargo build --release # Both binaries
cargo test --workspace # All tests
cargo test -p webclaw-core # Core only
cargo test -p webclaw-llm # LLM only
```
CI (`.github/workflows/ci.yml`, with `RUSTFLAGS=--cfg reqwest_unstable`) runs four jobs — match them locally before pushing:
- `cargo test --workspace`
- `cargo fmt --check --all` + `cargo clippy --all -- -D warnings` (warnings fail CI)
- `cargo check --target wasm32-unknown-unknown -p webclaw-core` **with and without** `--no-default-features` (guards the WASM-safe rule)
- `cargo doc --no-deps --workspace`
## Repo Layout & Packaging
Workspace is version **0.6.13**, edition **2024**, license **AGPL-3.0** (matters for the public-OSS scrubbing rules). No crate declares `rust-version`, so MSRV is implicit — edition 2024 floors it at Rust 1.85+; CI pins `dtolnay/rust-toolchain@stable`.
Artifacts outside `crates/` that need separate attention:
- `packages/create-webclaw/` — `npx create-webclaw` Node scaffolder that installs/configures the MCP server for AI agents (Claude, Cursor, Windsurf, ...). Versioned independently (own `package.json`) — bump it separately when MCP setup changes.
- `smithery.yaml` + `glama.json` — MCP-registry manifests (Smithery stdio config spawning `webclaw-mcp` with optional `WEBCLAW_API_KEY`; Glama). Update when the MCP launch command or env changes.
- `examples/` — runnable demos (cloudflare-diagnostics, firecrawl-compatible-api, html-to-markdown-rag, mcp-web-scraping, proxy-backed-crawling).
- `Dockerfile` / `Dockerfile.ci` / `docker-compose.yml`, `benchmarks/` (`/benchmark` skill), `SKILL.md` + `skill/` (Claude Code skill).
## CLI
```bash
@ -138,18 +107,12 @@ webclaw https://example.com --only-main-content
webclaw url1 url2 url3 --proxy-file proxies.txt
webclaw --urls-file urls.txt --concurrency 10
# URL discovery (--map): sitemaps first, bounded crawl fallback when the sitemap is thin
# Sitemap discovery
webclaw https://docs.example.com --map
webclaw https://news.ycombinator.com --map --map-pages 150 --map-limit 500
webclaw https://docs.example.com --map --no-map-crawl # sitemap-only (no crawl fallback)
# Crawling (with sitemap seeding)
webclaw https://docs.example.com --crawl --depth 2 --max-pages 50 --sitemap
# Web search via Serper.dev (bring your own key: --serper-key or SERPER_API_KEY)
webclaw search "rust async runtime" --num 5
webclaw search "best web scraper" --scrape -f json # also fetch + extract result pages
# Change tracking
webclaw https://example.com -f json > snap.json
webclaw https://example.com --diff-with snap.json
@ -177,8 +140,8 @@ cat page.html | webclaw --stdin
- Scoring minimum: 50 chars text length
- Semantic bonus: +50 for `<article>`/`<main>`, +25 for content class/ID
- Link density (generic divs): >50% = 0.1x score, >30% = 0.5x. Semantic nodes (article/main/role=main) get a milder curve: >70% = 0.3x, >50% = 0.5x (`extractor.rs`)
- Data island fallback triggers when DOM word count < 500 (`SPARSE_THRESHOLD` in `data_island.rs`)
- Link density: >50% = 0.1x score, >30% = 0.5x
- Data island fallback triggers when DOM word count < 30
- Eyebrow text max: 80 chars
## MCP Setup

View file

@ -91,16 +91,18 @@ Body is optional but encouraged for non-trivial changes.
```
webclaw (this repo)
└── crates/
├── webclaw-core/ # Pure extraction engine (HTML → markdown/json/text)
├── webclaw-fetch/ # HTTP client (wreq/BoringSSL) + crawler + sitemap + batch
├── webclaw-llm/ # LLM provider chain (Ollama → OpenAI → Anthropic)
├── webclaw-pdf/ # PDF text extraction
├── webclaw-cli/ # CLI binary
└── webclaw-mcp/ # MCP server binary
├── crates/
│ ├── webclaw-core/ # Pure extraction engine (HTML → markdown/json/text)
│ ├── webclaw-fetch/ # HTTP client + crawler + sitemap + batch
│ ├── webclaw-llm/ # LLM provider chain (Ollama → OpenAI → Anthropic)
│ ├── webclaw-pdf/ # PDF text extraction
│ ├── webclaw-cli/ # CLI binary
│ └── webclaw-mcp/ # MCP server binary
└── [patch.crates-io] # Points to webclaw-tls for TLS fingerprinting
```
TLS fingerprinting is handled in-process by [wreq](https://crates.io/crates/wreq) (BoringSSL), so `webclaw-fetch` impersonates real browser TLS directly. There are no `[patch.crates-io]` forks or external TLS dependencies.
TLS fingerprinting lives in a separate repo: [webclaw-tls](https://github.com/0xMassi/webclaw-tls). The `[patch.crates-io]` section in `Cargo.toml` overrides rustls, h2, hyper, hyper-util, and reqwest with our patched forks for browser-grade JA4 + HTTP/2 Akamai fingerprinting.
## Crate Boundaries
@ -109,7 +111,7 @@ Changes that cross crate boundaries need extra care:
| Crate | Network? | Key constraint |
|-------|----------|----------------|
| webclaw-core | No | Zero network deps, WASM-safe |
| webclaw-fetch | Yes (wreq) | Browser TLS impersonation via wreq (BoringSSL); no patched deps |
| webclaw-fetch | Yes (webclaw-http) | Uses [webclaw-tls](https://github.com/0xMassi/webclaw-tls) for TLS fingerprinting |
| webclaw-llm | Yes (reqwest) | Plain reqwest — LLM APIs don't need TLS fingerprinting |
| webclaw-pdf | No | Minimal, wraps pdf-extract |
| webclaw-cli | Yes | Depends on all above |

222
Cargo.lock generated
View file

@ -28,6 +28,18 @@ dependencies = [
"cpufeatures",
]
[[package]]
name = "ahash"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [
"cfg-if",
"once_cell",
"version_check",
"zerocopy",
]
[[package]]
name = "aho-corasick"
version = "1.1.4"
@ -52,12 +64,6 @@ dependencies = [
"alloc-no-stdlib",
]
[[package]]
name = "allocator-api2"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
[[package]]
name = "android_system_properties"
version = "0.1.5"
@ -266,9 +272,9 @@ dependencies = [
[[package]]
name = "bitflags"
version = "2.13.0"
version = "2.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8"
checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
[[package]]
name = "block-buffer"
@ -279,6 +285,31 @@ dependencies = [
"generic-array",
]
[[package]]
name = "boring-sys2"
version = "5.0.0-alpha.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "455d79965f5155dcc88a7abce112c3590883889131b799beda10bf9a813ed669"
dependencies = [
"bindgen",
"cmake",
"fs_extra",
"fslock",
]
[[package]]
name = "boring2"
version = "5.0.0-alpha.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "183ccc3854411c035410dcdbffafca62084f3a6c33f013c77e83c025d2a08a28"
dependencies = [
"bitflags",
"boring-sys2",
"foreign-types",
"libc",
"openssl-macros",
]
[[package]]
name = "brotli"
version = "8.0.2"
@ -300,31 +331,6 @@ dependencies = [
"alloc-stdlib",
]
[[package]]
name = "btls"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c5e60b8c8d282c86360cab651ded04ab0335a7b5390c8d34145cbeab8cacf5f"
dependencies = [
"bitflags",
"btls-sys",
"foreign-types",
"libc",
"openssl-macros",
]
[[package]]
name = "btls-sys"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b1b8638a2e1c38a5ae4efa90ae57e643baec35a30d03fc5b399b893adc4954b"
dependencies = [
"bindgen",
"cmake",
"fs_extra",
"fslock",
]
[[package]]
name = "bumpalo"
version = "3.20.2"
@ -859,12 +865,6 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "foldhash"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
[[package]]
name = "foreign-types"
version = "0.5.0"
@ -1089,13 +1089,19 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
[[package]]
name = "hashbrown"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
[[package]]
name = "hashbrown"
version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
"foldhash 0.1.5",
"foldhash",
]
[[package]]
@ -1104,17 +1110,6 @@ version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
[[package]]
name = "hashbrown"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash 0.2.0",
]
[[package]]
name = "heck"
version = "0.5.0"
@ -1177,9 +1172,9 @@ dependencies = [
[[package]]
name = "http2"
version = "0.5.17"
version = "0.5.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "569ef7a780e853c4e1768f58a3c8168193b82cdcbab66638a0b1c6583ec5995e"
checksum = "c45c6490693ee8a8d0d95fdbdf76fead9fb87548f7894137259a7c6d22821948"
dependencies = [
"atomic-waker",
"bytes",
@ -1188,6 +1183,7 @@ dependencies = [
"futures-sink",
"http",
"indexmap",
"parking_lot",
"slab",
"smallvec",
"tokio",
@ -1499,9 +1495,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
[[package]]
name = "libc"
version = "0.2.186"
version = "0.2.183"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d"
[[package]]
name = "libloading"
@ -1567,15 +1563,6 @@ dependencies = [
"weezl",
]
[[package]]
name = "lru"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a860605968fce16869fd239cf4237a82f3ac470723415db603b0e8b6c8d4fb9"
dependencies = [
"hashbrown 0.17.1",
]
[[package]]
name = "lru-slab"
version = "0.1.2"
@ -2388,6 +2375,17 @@ dependencies = [
"syn",
]
[[package]]
name = "schnellru"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "356285bbf17bea63d9e52e96bd18f039672ac92b55b8cb997d6162a2a37d1649"
dependencies = [
"ahash",
"cfg-if",
"hashbrown 0.13.2",
]
[[package]]
name = "scopeguard"
version = "1.2.0"
@ -2781,9 +2779,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.52.3"
version = "1.50.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe"
checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d"
dependencies = [
"bytes",
"libc",
@ -2797,20 +2795,20 @@ dependencies = [
]
[[package]]
name = "tokio-btls"
version = "0.5.6"
name = "tokio-boring2"
version = "5.0.0-alpha.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e1fd638ec35427faf3b8f412e0fdd6fae76591d79dba40f38fa667d22bc44dd"
checksum = "0f81df1210d791f31d72d840de8fbd80b9c3cb324956523048b1413e2bd55756"
dependencies = [
"btls",
"boring2",
"tokio",
]
[[package]]
name = "tokio-macros"
version = "2.7.0"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c"
dependencies = [
"proc-macro2",
"quote",
@ -3221,7 +3219,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
version = "0.6.13"
version = "0.6.3"
dependencies = [
"clap",
"dotenvy",
@ -3242,7 +3240,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
version = "0.6.13"
version = "0.6.3"
dependencies = [
"ego-tree",
"once_cell",
@ -3260,13 +3258,11 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
version = "0.6.13"
version = "0.6.3"
dependencies = [
"async-trait",
"bytes",
"calamine",
"flate2",
"futures-util",
"http",
"quick-xml 0.37.5",
"rand 0.8.5",
@ -3288,7 +3284,7 @@ dependencies = [
[[package]]
name = "webclaw-llm"
version = "0.6.13"
version = "0.6.3"
dependencies = [
"async-trait",
"reqwest",
@ -3301,7 +3297,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
version = "0.6.13"
version = "0.6.3"
dependencies = [
"dirs",
"dotenvy",
@ -3321,7 +3317,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
version = "0.6.13"
version = "0.6.3"
dependencies = [
"pdf-extract",
"thiserror",
@ -3330,7 +3326,7 @@ dependencies = [
[[package]]
name = "webclaw-server"
version = "0.6.13"
version = "0.6.3"
dependencies = [
"anyhow",
"axum",
@ -3351,9 +3347,9 @@ dependencies = [
[[package]]
name = "webpki-root-certs"
version = "1.0.7"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
dependencies = [
"rustls-pki-types",
]
@ -3700,14 +3696,17 @@ dependencies = [
[[package]]
name = "wreq"
version = "6.0.0-rc.29"
version = "6.0.0-rc.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f0eba5f5814a94e5f1a99156f187133464e525b66bdbc69a9627d46530af2e1"
checksum = "f79937f6c4df65b3f6f78715b9de2977afe9ee3b3436483c7949a24511e25935"
dependencies = [
"btls",
"btls-sys",
"ahash",
"boring2",
"brotli",
"bytes",
"cookie",
"flate2",
"futures-channel",
"futures-util",
"http",
"http-body",
@ -3716,64 +3715,29 @@ dependencies = [
"httparse",
"ipnet",
"libc",
"lru",
"percent-encoding",
"pin-project-lite",
"schnellru",
"smallvec",
"socket2",
"sync_wrapper",
"tokio",
"tokio-btls",
"tokio-util",
"tokio-boring2",
"tower",
"tower-http",
"url",
"webpki-root-certs",
"wreq-proto",
"wreq-rt",
]
[[package]]
name = "wreq-proto"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a43942f024bb303f1042c9aa3c87fa1d9149f507c65db6e5220a11ccdb207387"
dependencies = [
"bytes",
"futures-channel",
"futures-util",
"http",
"http-body",
"http2",
"httparse",
"pin-project-lite",
"smallvec",
"tokio",
"tokio-util",
"want",
]
[[package]]
name = "wreq-rt"
version = "0.2.2-rc.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e9bce67a3fa3dd3f1503f066d86661c9caf399a763d3bd184da7afaf886c8b"
dependencies = [
"pin-project-lite",
"tokio",
"wreq-proto",
"webpki-root-certs",
"zstd",
]
[[package]]
name = "wreq-util"
version = "3.0.0-rc.12"
version = "3.0.0-rc.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baa5d2ab72139256916ca352a3d05c53d74e1dd360052eb5ba7691033c417c65"
checksum = "6c6bbe24d28beb9ceb58b514bd6a613c759d3b706f768b9d2950d5d35b543c04"
dependencies = [
"brotli",
"flate2",
"typed-builder",
"wreq",
"zstd",
]
[[package]]

View file

@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"]
[workspace.package]
version = "0.6.13"
version = "0.6.3"
edition = "2024"
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"

View file

@ -59,9 +59,9 @@ RUN touch crates/*/src/*.rs \
# ---------------------------------------------------------------------------
FROM ubuntu:24.04
# CA bundle from distroless (ships it, multi-arch, gcr.io) instead of
# apt-installing from ports.ubuntu.com (unreachable for arm64 on CI runners).
COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Copy all three binaries
COPY --from=builder /build/target/release/webclaw /usr/local/bin/webclaw

View file

@ -1,21 +1,18 @@
# Slim runtime image — uses pre-built binaries from the release.
# The full Dockerfile (multi-stage Rust build) is for local development.
# CI uses this to avoid 60+ min QEMU cross-compilation.
ARG BINARY_DIR=binaries
FROM ubuntu:24.04
# CA bundle copied from a reliable multi-arch image instead of apt-installing
# from ports.ubuntu.com — Canonical's arm64 ports mirror is unreachable from
# CI runners and breaks the multi-arch release build. No build-time network.
COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# TARGETARCH (amd64 / arm64) is provided automatically by buildx for each
# target platform, so one multi-platform build copies the matching binaries.
# The release workflow stages them in binaries-amd64 / binaries-arm64.
ARG TARGETARCH
COPY binaries-${TARGETARCH}/webclaw /usr/local/bin/webclaw
COPY binaries-${TARGETARCH}/webclaw-mcp /usr/local/bin/webclaw-mcp
COPY binaries-${TARGETARCH}/webclaw-server /usr/local/bin/webclaw-server
ARG BINARY_DIR
COPY ${BINARY_DIR}/webclaw /usr/local/bin/webclaw
COPY ${BINARY_DIR}/webclaw-mcp /usr/local/bin/webclaw-mcp
COPY ${BINARY_DIR}/webclaw-server /usr/local/bin/webclaw-server
# Default REST API port when running `webclaw-server` inside the container.
EXPOSE 3000
@ -27,9 +24,8 @@ ENV WEBCLAW_HOST=0.0.0.0
# Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other
# commands directly so this image can be used as a FROM base with custom CMD.
# `--chmod` sets the bit at copy time so the build needs no in-container `RUN`
# (and thus no QEMU emulation for the arm64 platform).
COPY --chmod=755 docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["docker-entrypoint.sh"]
CMD ["webclaw", "--help"]

View file

@ -77,7 +77,7 @@ brew install webclaw
### Prebuilt binaries
Download macOS, Linux, and Windows binaries from [GitHub Releases](https://github.com/0xMassi/webclaw/releases).
Download macOS and Linux binaries from [GitHub Releases](https://github.com/0xMassi/webclaw/releases).
### Docker
@ -142,7 +142,7 @@ webclaw https://docs.rust-lang.org --crawl --depth 2 --max-pages 50
- [HTML to Markdown for RAG](examples/html-to-markdown-rag/)
- [Firecrawl-compatible API](examples/firecrawl-compatible-api/)
- [MCP web scraping](examples/mcp-web-scraping/)
- [Proxy-backed crawling with ColdProxy](examples/proxy-backed-crawling/)
- [Proxy-backed crawling](examples/proxy-backed-crawling/)
- [Cloudflare diagnostics](examples/cloudflare-diagnostics/)
### Extract brand assets
@ -385,48 +385,9 @@ Please remove secrets, cookies, private tokens, and customer data from logs befo
---
## Infrastructure Partner
## Studio Partner
<table>
<tr>
<td align="center">
<a href="https://coldproxy.com/">
<img src="./assets/sponsors/coldproxy-banner.png" alt="ColdProxy" width="720" />
</a>
</td>
</tr>
<tr>
<td>
<strong>ColdProxy</strong> supports webclaw as an Infrastructure Partner, providing residential IPv4,
residential IPv6, and datacenter IPv6 proxy infrastructure across 195+ countries for public data
collection, regional testing, monitoring, and web scraping workflows. Explore
<a href="https://coldproxy.com/">ColdProxy</a>'s latest plans and available offers directly on the website.
See the <a href="examples/proxy-backed-crawling/#using-coldproxy">proxy-backed crawling guide</a>
for a hands-on walkthrough of wiring ColdProxy into webclaw.
</td>
</tr>
</table>
---
## Studio Partners
<table>
<tr>
<td width="340" align="center">
<a href="https://go.nodemaven.com/webclaw">
<img src="./assets/sponsors/nodemaven-banner.png" alt="NodeMaven" width="300" />
</a>
</td>
<td>
<strong>NodeMaven</strong> is the most reliable proxy provider with the highest-quality IPs on the market.
Best solution for automation, web scraping, SEO research, and social media management: 99.9% uptime,
sticky sessions up to 7 days, IP filtering (all proxies under a 97% fraud score), no KYC, and cashback up
to 10% on traffic. Use <code>WEBCLAW35</code> for 35% off Mobile and Residential proxies, or
<code>WEBCLAW40</code> for 40% off ISP (Static) proxies at
<a href="https://go.nodemaven.com/webclaw">NodeMaven</a>.
</td>
</tr>
<tr>
<td width="340" align="center">
<a href="https://quantumproxies.net/?utm_source=webclaw&utm_medium=github&utm_campaign=sponsor">
@ -452,31 +413,6 @@ Please remove secrets, cookies, private tokens, and customer data from logs befo
<a href="https://proxy-seller.com/?partner=KXMQNNLIGHXR4B">proxy-seller.com</a>.
</td>
</tr>
<tr>
<td width="340" align="center">
<a href="https://www.rapidproxy.io/?ref=webclaw">
<img src="./assets/sponsors/rapidproxy-banner.png" alt="RapidProxy" width="300" />
</a>
</td>
<td>
<strong>RapidProxy</strong> delivers fast, reliable proxy infrastructure for large-scale data collection.
With 90M+ residential IPs, smart rotation, high concurrency, AI-powered CAPTCHA bypass, and non-expiring traffic, it helps keep scraping workflows stable at scale.
Use code <code>webclaw</code> for 10% off, or
<a href="https://www.rapidproxy.io/?ref=webclaw">Try it free</a>.
</td>
</tr>
<tr>
<td width="340" align="center">
<a href="https://mangoproxy.com/?utm_source=github&utm_medium=partner&utm_campaign=0xmassi">
<img src="./assets/sponsors/mangoproxy-banner.png" alt="MangoProxy" width="300" />
</a>
</td>
<td>
<strong>MangoProxy</strong> provides residential, ISP, datacenter, and mobile proxies across 200+ locations, backed by a 90M+ IP pool with HTTP and SOCKS5 support and high stability for web scraping and data collection at scale.
Use code <code>0XMASSI</code> for 8% off ISP (Static) proxies at
<a href="https://mangoproxy.com/?utm_source=github&utm_medium=partner&utm_campaign=0xmassi">mangoproxy.com</a>.
</td>
</tr>
</table>
---

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 757 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 371 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 73 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 413 KiB

View file

@ -166,14 +166,6 @@ struct Cli {
#[arg(long)]
urls_file: Option<String>,
/// Assert that the URL has been handled for shell escaping. Suppresses
/// the URL-truncation stderr warning. Use when the URL is intentionally
/// passed with an empty/keyless query (e.g. legacy CGI) or when a
/// trailing `&` is genuinely part of the URL. The URL is fetched as-is
/// (no extra normalization beyond the standard scheme prepend).
#[arg(long)]
url_encoded: bool,
/// Output format (markdown, json, text, llm, html)
#[arg(short, long, default_value = "markdown")]
format: OutputFormat,
@ -313,18 +305,6 @@ struct Cli {
#[arg(long)]
map: bool,
/// Max pages for --map's crawl fallback when the sitemap is thin [default: 150]
#[arg(long)]
map_pages: Option<usize>,
/// Disable --map's crawl fallback (sitemap-only discovery)
#[arg(long)]
no_map_crawl: bool,
/// Cap the number of URLs --map returns (default: uncapped)
#[arg(long)]
map_limit: Option<usize>,
// -- LLM options --
/// Extract structured JSON using LLM (pass a JSON schema string or @file)
#[arg(long)]
@ -422,43 +402,6 @@ enum Commands {
#[arg(long)]
raw: bool,
},
/// Web search via Serper.dev using YOUR OWN API key.
///
/// Returns Google organic results (title, link, snippet). With
/// `--scrape`, each result page is fetched and extracted to markdown.
/// Get a free key at serper.dev, then pass `--serper-key` or set
/// `SERPER_API_KEY`.
///
/// Example: `webclaw search "rust async runtime" --num 5 --scrape`.
Search {
/// Search query.
query: String,
/// Serper.dev API key. Falls back to the `SERPER_API_KEY` env var.
#[arg(long, env = "SERPER_API_KEY")]
serper_key: Option<String>,
/// Number of results to return (1-10).
#[arg(long, default_value = "5")]
num: usize,
/// Country code for localization (e.g. "us", "gb", "it").
#[arg(long)]
country: Option<String>,
/// Language code for localization (e.g. "en", "it").
#[arg(long)]
lang: Option<String>,
/// Fetch + extract each result page and include its markdown.
#[arg(long)]
scrape: bool,
/// Output format: `markdown` (human-readable, default) or `json`.
#[arg(short, long, default_value = "markdown")]
format: OutputFormat,
},
}
#[derive(Clone, ValueEnum)]
@ -520,13 +463,7 @@ fn init_logging(verbose: bool) {
EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new(default))
};
// Logs go to stderr, never stdout: stdout carries the actual result
// (markdown / JSON / URL list). A stray WARN on stdout corrupts
// machine-readable output — e.g. `--map --format json` piped to a parser.
tracing_subscriber::fmt()
.with_env_filter(filter)
.with_writer(std::io::stderr)
.init();
tracing_subscriber::fmt().with_env_filter(filter).init();
}
/// Build FetchConfig from CLI flags.
@ -654,31 +591,6 @@ fn normalize_url(url: &str) -> String {
}
}
/// M14: detect URLs that look truncated by the shell (e.g. an unquoted URL
/// that the shell split on `&` or `?`).
///
/// Returns `true` when either:
/// - the URL ends with `&` (a trailing param separator suggests the next
///   param was lopped off), or
/// - the URL has a query component with no `=` in it (a query with bare keys
///   is rare; usually a real query has at least one `=`).
///
/// The fragment is removed *before* looking for the query, because the query
/// ends at the first `#` and a fragment may legally contain `?`. Without that
/// ordering, URLs such as `/page#what?` or `/#/route?tab` would be reported
/// as truncated even though they carry no query at all.
///
/// Informational only — caller decides whether to warn / abort. This is a
/// heuristic; legitimate URLs with bare-key queries will trigger a false
/// positive (suppressible via `--url-encoded`).
fn looks_truncated(url: &str) -> bool {
    let trimmed = url.trim();
    if trimmed.ends_with('&') {
        return true;
    }

    // Drop everything from the first `#` on, so only the real query (if any)
    // is inspected.
    let without_fragment = trimmed
        .split_once('#')
        .map_or(trimmed, |(head, _fragment)| head);

    match without_fragment.split_once('?') {
        // A query exists: suspicious only when it holds no `key=value` pair.
        Some((_before, query)) => !query.contains('='),
        None => false,
    }
}
/// Derive a filename from a URL for `--output-dir`.
///
/// Strips the scheme/host, maps the path to a filesystem path, and appends
@ -914,14 +826,6 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
.urls
.first()
.ok_or("no input provided -- pass a URL, --file, or --stdin")?;
// M14: warn when the URL looks like the shell split it on `&` or `?`.
// Informational only — fetch still proceeds. Suppressed by --url-encoded,
// which asserts the caller has handled escaping intentionally.
if !cli.url_encoded && looks_truncated(raw_url) {
eprintln!(
"# webclaw: warning: URL looks truncated (ends with '&' or '?'); did the shell split it? Quote the URL or use --url-encoded."
);
}
let url = normalize_url(raw_url);
let url = url.as_str();
@ -955,11 +859,8 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli);
// M13: wrap with periodic stderr progress emitter. Fast fetches see
// zero emissions (timer never fires in <10s); slow fetches get a
// line every 10s of elapsed time so the CLI doesn't appear hung.
let fetch_fut = client.fetch_and_extract_with_options(url, &options);
let result = webclaw_fetch::with_progress(url, fetch_fut)
let result = client
.fetch_and_extract_with_options(url, &options)
.await
.map_err(|e| format!("fetch error: {e}"))?;
@ -1603,7 +1504,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
// Fire webhook on crawl complete
if let Some(ref webhook_url) = cli.webhook {
let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect();
let handle = fire_webhook(
fire_webhook(
webhook_url,
&serde_json::json!({
"event": "crawl_complete",
@ -1614,8 +1515,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
"urls": urls,
}),
);
// Wait for the webhook to finish so the process doesn't exit mid-send.
let _ = handle.await;
// Brief pause so the async webhook has time to fire
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
if result.errors > 0 {
@ -1628,73 +1529,6 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
}
}
/// Run a Serper.dev web search with the caller's own API key and print the
/// results to stdout.
///
/// `serper_key` arrives already resolved (flag or `SERPER_API_KEY`, handled by
/// the caller). With `scrape` set, the search layer also fetches and extracts
/// each result page through the `FetchClient` built here, and the extracted
/// content is printed under the corresponding result.
///
/// Output is pretty-printed JSON (`{"query", "results"}`) for
/// `OutputFormat::Json`, otherwise a numbered human-readable list. An empty
/// result set in list mode only writes a note to stderr.
///
/// Errors are returned as display strings prefixed with the failing stage
/// (`client error`, `search error`, `JSON encode failed`).
#[allow(clippy::too_many_arguments)]
async fn run_search(
    serper_key: &str,
    query: &str,
    num: usize,
    country: Option<&str>,
    lang: Option<&str>,
    scrape: bool,
    format: &OutputFormat,
) -> Result<(), String> {
    // The default fetch config suffices: localization is Serper's job (gl/hl)
    // and scraping result pages only needs a standard browser profile.
    let base_client = webclaw_fetch::FetchClient::new(webclaw_fetch::FetchConfig::default())
        .map_err(|e| format!("client error: {e}"))?;

    // When WEBCLAW_API_KEY is configured, attach the cloud fallback so pages
    // behind bot protection can still escalate.
    let client = match webclaw_fetch::cloud::CloudClient::from_env() {
        Some(cloud) => base_client.with_cloud(cloud),
        None => base_client,
    };

    let search_options = webclaw_fetch::SearchOptions {
        num_results: num,
        country: country.map(str::to_string),
        lang: lang.map(str::to_string),
        scrape,
    };

    let hits = webclaw_fetch::search(&client, serper_key, query, &search_options)
        .await
        .map_err(|e| format!("search error: {e}"))?;

    // Machine-readable mode: emit one JSON document and stop.
    if let OutputFormat::Json = format {
        let document = serde_json::json!({ "query": query, "results": hits });
        let rendered = serde_json::to_string_pretty(&document)
            .map_err(|e| format!("JSON encode failed: {e}"))?;
        println!("{rendered}");
        return Ok(());
    }

    if hits.is_empty() {
        eprintln!("no results for \"{query}\"");
        return Ok(());
    }

    for hit in &hits {
        println!("{}. {}", hit.position, hit.title);
        println!(" {}", hit.link);
        if !hit.snippet.is_empty() {
            println!(" {}", hit.snippet);
        }
        // Scraped page content, when present, follows after a blank line.
        if let Some(page) = &hit.content {
            println!();
            println!("{page}");
        }
        println!();
    }

    Ok(())
}
async fn run_map(cli: &Cli) -> Result<(), String> {
let url = cli
.urls
@ -1706,22 +1540,12 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
// Layered discovery: sitemaps first, bounded crawl fallback when thin.
let mut opts = webclaw_fetch::MapOptions::default();
if let Some(pages) = cli.map_pages {
opts.max_crawl_pages = pages;
}
if cli.no_map_crawl {
opts.crawl_fallback = false;
}
if let Some(limit) = cli.map_limit {
opts.max_urls = Some(limit);
}
let entries = webclaw_fetch::discover_urls(&client, url, &opts).await;
let entries = webclaw_fetch::sitemap::discover(&client, url)
.await
.map_err(|e| format!("sitemap discovery failed: {e}"))?;
if entries.is_empty() {
eprintln!("no URLs found for {url}");
eprintln!("no sitemap URLs found for {url}");
} else {
eprintln!("discovered {} URLs", entries.len());
}
@ -1790,7 +1614,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
// Fire webhook on batch complete
if let Some(ref webhook_url) = cli.webhook {
let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect();
let handle = fire_webhook(
fire_webhook(
webhook_url,
&serde_json::json!({
"event": "batch_complete",
@ -1800,7 +1624,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
"urls": urls,
}),
);
let _ = handle.await;
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
if errors > 0 {
@ -1874,12 +1698,9 @@ async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) {
}
}
/// Fire a webhook POST with a JSON payload. Spawns the send on a background task
/// and returns its `JoinHandle` so callers that need delivery (e.g. one-shot
/// crawl/batch runs that exit immediately after) can `.await` it; long-running
/// loops can drop the handle and let it run fire-and-forget. Errors are logged
/// to stderr. Auto-detects Discord and Slack webhook URLs and wraps the payload.
fn fire_webhook(url: &str, payload: &serde_json::Value) -> tokio::task::JoinHandle<()> {
/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr.
/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly.
fn fire_webhook(url: &str, payload: &serde_json::Value) {
let url = url.to_string();
let is_discord = url.contains("discord.com/api/webhooks");
let is_slack = url.contains("hooks.slack.com");
@ -1941,7 +1762,7 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) -> tokio::task::JoinHand
},
Err(e) => eprintln!("[webhook] client error: {e}"),
}
})
});
}
async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
@ -2453,7 +2274,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
if let Some(ref webhook_url) = cli.webhook {
let handle = fire_webhook(
fire_webhook(
webhook_url,
&serde_json::json!({
"event": "batch_llm_complete",
@ -2462,7 +2283,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
"errors": errors,
}),
);
let _ = handle.await;
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
if errors > 0 {
@ -2721,40 +2542,6 @@ async fn main() {
}
return;
}
Commands::Search {
query,
serper_key,
num,
country,
lang,
scrape,
format,
} => {
let key = match serper_key {
Some(k) if !k.trim().is_empty() => k.clone(),
_ => {
eprintln!(
"error: search requires a Serper.dev API key: pass --serper-key or set SERPER_API_KEY (get one free at serper.dev)"
);
process::exit(1);
}
};
if let Err(e) = run_search(
&key,
query,
*num,
country.as_deref(),
lang.as_deref(),
*scrape,
format,
)
.await
{
eprintln!("error: {e}");
process::exit(1);
}
return;
}
}
}
@ -3092,61 +2879,6 @@ mod tests {
let _ = std::fs::remove_dir_all(&dir);
}
// M14: URL truncation heuristic tests.
#[test]
fn looks_truncated_fires_on_trailing_ampersand() {
    // A dangling `&` is the classic shell-split shape: `?a=1&` lost its `b=2`.
    let dangling = [
        "https://example.com/?a=1&",
        "https://example.com/path?key=val&",
    ];
    for url in dangling {
        assert!(looks_truncated(url));
    }
}
#[test]
fn looks_truncated_fires_on_query_with_no_equals() {
    let bare_queries = [
        // `?foo` without `=`: the shell most likely swallowed the `=value`.
        "https://example.com/?foo",
        // A lone `?` (empty query) looks like the whole pair went missing.
        "https://example.com/?",
        // A trailing fragment must not hide the bare query.
        "https://example.com/?foo#section",
    ];
    for url in bare_queries {
        assert!(looks_truncated(url));
    }
}
#[test]
fn looks_truncated_silent_on_clean_url() {
    let clean = [
        // No query at all.
        "https://example.com/",
        "https://example.com/path/to/page",
        // Queries carrying at least one `=`.
        "https://example.com/?a=1",
        "https://example.com/?a=1&b=2",
        "https://example.com/?a=1&b=2&c=hello%20world",
        // A hash anchor with no query.
        "https://example.com/page#section",
    ];
    for url in clean {
        assert!(!looks_truncated(url));
    }
}
#[test]
fn looks_truncated_silent_with_url_encoded_assertion_modeled_via_skip() {
    // The call site only warns when `!cli.url_encoded && looks_truncated(..)`.
    // This closure models that gate so both flag states can be checked against
    // a URL that the heuristic does flag.
    let url = "https://example.com/?a=1&";
    let would_warn = |url_encoded_flag: bool| !url_encoded_flag && looks_truncated(url);

    // Flag set: the warning branch is never entered.
    assert!(
        !would_warn(true),
        "--url-encoded must suppress the warning even on URL ending with &"
    );
    // Sanity check: flag unset, the same URL does warn.
    assert!(
        would_warn(false),
        "without --url-encoded, the warning should fire on URL ending with &"
    );
}
#[test]
fn research_slug_truncation_is_char_safe() {
// Multibyte query: byte-slicing at 50 would panic mid-codepoint.

View file

@ -4,10 +4,6 @@ description = "Pure HTML content extraction engine for LLMs"
version.workspace = true
edition.workspace = true
license.workspace = true
# Reddit regression fixtures are real old.reddit.com pages read at test time;
# they're large and only needed to run the test suite from the repo, so keep
# them out of the published crate.
exclude = ["testdata/reddit/*.html"]
[features]
default = ["quickjs"]

View file

@ -1,533 +0,0 @@
//! API/endpoint surface discovery from HTML + JS bundle text.
//!
//! Pure and zero-network: callers fetch the page and its `<script src>`
//! bundles, then hand the raw text here. We surface API paths, absolute
//! API URLs, GraphQL and WebSocket endpoints that live in inline scripts
//! and bundles — the surface a sitemap/`map` can never see.
//!
//! Heuristic by design: regex over string literals, not JS dataflow.
//! High-signal patterns only; bounded for DoS safety.
use once_cell::sync::Lazy;
use regex::Regex;
use scraper::{Html, Selector};
use std::collections::BTreeSet;
use url::Url;
/// Hard caps so a hostile/huge bundle set can't blow up CPU or memory.
///
/// `MAX_SCAN_BYTES` (8 MiB) is the starting value of the scan budget that
/// `extract_endpoints` keeps while scanning.
const MAX_SCAN_BYTES: usize = 8 * 1024 * 1024;
/// Upper bound on collected endpoints; once reached, `extract_endpoints`
/// refuses further matches.
const MAX_ENDPOINTS: usize = 2000;
/// Cap on `<script src>` URLs returned for the caller to fetch.
const MAX_SCRIPT_SRCS: usize = 40;
/// Category attached to each discovered endpoint.
///
/// Serialized with snake_case variant names (`relative_path`, `absolute_url`,
/// `graph_ql`, `web_socket`). The derived ordering follows declaration order.
/// Which matches receive which kind is decided by the scanning code, not here.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum EndpointKind {
/// A path with no scheme or host.
RelativePath,
/// A full URL including scheme and host.
AbsoluteUrl,
/// A GraphQL endpoint.
GraphQl,
/// A WebSocket endpoint.
WebSocket,
}
/// One endpoint candidate found in an inline script or a JS bundle.
#[derive(Debug, Clone, serde::Serialize)]
pub struct DiscoveredEndpoint {
/// The matched text: a relative path or an absolute URL.
pub value: String,
/// Category of the match.
pub kind: EndpointKind,
/// Relative paths are always first-party. An absolute URL is first-party
/// when its host equals, or is a subdomain of, the page's registrable domain.
pub first_party: bool,
/// `"inline"` or the bundle URL the match came from.
pub source: String,
}
/// Aggregate result of an endpoint-discovery pass over a page and its bundles.
#[derive(Debug, Default, serde::Serialize)]
pub struct EndpointReport {
/// Endpoints collected; the same value may appear once per source it was
/// seen in, since de-duplication is on the `(value, source)` pair.
pub endpoints: Vec<DiscoveredEndpoint>,
/// Distinct hosts seen across absolute URLs (first- and third-party).
pub hosts: Vec<String>,
/// Number of bundles scanned (NOTE(review): the code that sets this is
/// outside this view — confirm whether skipped bundles are counted).
pub bundles_scanned: usize,
/// True if a cap was hit and results may be incomplete.
pub truncated: bool,
}
// Quoted relative path that looks API-ish. Bounded quantifiers; the `regex`
// crate is linear-time (RE2) so this cannot catastrophically backtrack.
//
// Shape: an opening quote (`"`, `'` or a backtick), a path starting with `/`
// that contains one of the API markers (`api`, `graphql`, `gql`, `/v<digit>`,
// `/rest`, `/gateway`, `/internal`, `/discovery`), then a closing quote.
// Capture group 1 is the path without the quotes.
static RE_REL_PATH: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r#"["'`](/[A-Za-z0-9_\-./]{0,200}?(?:api|graphql|gql|/v[0-9]|/rest|/gateway|/internal|/discovery)[A-Za-z0-9_\-./]{0,200})["'`]"#,
)
.expect("RE_REL_PATH")
});
// Absolute `http://` / `https://` URL: a host of 1-253 letters, digits, dots
// or hyphens, followed by an optional path of up to 400 characters. `:`, `?`
// and `#` are outside both character classes, so ports, query strings and
// fragments are never part of a match.
static RE_ABS_URL: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"https?://[A-Za-z0-9.\-]{1,253}(?:/[A-Za-z0-9_\-./%]{0,400})?"#)
.expect("RE_ABS_URL")
});
// WebSocket URL (`ws://` / `wss://`): same host shape as RE_ABS_URL, with an
// optional path of up to 256 characters.
static RE_WS: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"wss?://[A-Za-z0-9.\-]{1,253}(?:/[A-Za-z0-9_\-./%]{0,256})?"#).expect("RE_WS")
});
// CSS selector matching every `<script>` element; parsed once on first use.
static SCRIPT_SEL: Lazy<Selector> = Lazy::new(|| Selector::parse("script").expect("script sel"));
/// Common multi-label public suffixes so `ticketmaster.co.uk` resolves to
/// `ticketmaster.co.uk` (not `co.uk`). Not a full PSL — pragmatic v1.
const SUFFIX2: &[&str] = &[
    "co.uk", "org.uk", "gov.uk", "ac.uk", "me.uk", "com.au", "net.au", "org.au", "co.jp", "co.nz",
    "co.za", "com.br", "com.mx", "com.sg", "co.in", "co.kr", "com.tr", "com.cn",
];

/// Reduce `host` to its registrable domain, lowercased and without trailing
/// dots.
///
/// - Single-label hosts (`localhost`) are returned unchanged.
/// - IPv4 literals are returned unchanged: an address has no registrable
///   domain, and keeping only its last two octets (`192.168.1.1` -> `1.1`)
///   would produce a meaningless value for first-party comparison.
/// - Hosts under a known two-label suffix from `SUFFIX2` keep three labels
///   (`www.ticketmaster.co.uk` -> `ticketmaster.co.uk`).
/// - Everything else keeps the last two labels (`api.example.com` ->
///   `example.com`).
fn registrable_domain(host: &str) -> String {
    let host = host.trim_end_matches('.').to_ascii_lowercase();

    // Dotted-quad addresses must stay whole rather than be split on `.`.
    if host.parse::<std::net::Ipv4Addr>().is_ok() {
        return host;
    }

    let labels: Vec<&str> = host.split('.').collect();
    if labels.len() < 2 {
        return host;
    }

    let last2 = labels[labels.len() - 2..].join(".");
    if labels.len() >= 3 && SUFFIX2.contains(&last2.as_str()) {
        labels[labels.len() - 3..].join(".")
    } else {
        last2
    }
}
/// True when `candidate_host` (compared case-insensitively) is exactly
/// `base_reg` or a subdomain of it, i.e. ends with `.<base_reg>`.
/// `base_reg` is compared as given, so callers pass it already lowercased.
fn is_first_party(candidate_host: &str, base_reg: &str) -> bool {
    let host = candidate_host.to_ascii_lowercase();
    match host.strip_suffix(base_reg) {
        // Nothing left over: exact match. Otherwise the remainder must end on
        // a label boundary so that `notexample.com` does not match `example.com`.
        Some(prefix) => prefix.is_empty() || prefix.ends_with('.'),
        None => false,
    }
}
/// Registrable domains that are spec/schema/example noise, never real API
/// surface (minified JSON-Schema/`schema.org` refs show up constantly).
///
/// Matched against the registrable domain of each absolute URL's host, so
/// subdomains of these entries are dropped as well.
const NOISE_HOSTS: &[&str] = &[
"schema.org",
"json-schema.org",
"w3.org",
"example.com",
"example.org",
"example.net",
"localhost",
];
/// Decide whether `host` is worth reporting.
///
/// After trailing dots are dropped, the host must have at least two labels,
/// none of them empty, and its last label (the TLD) must be two or more ASCII
/// letters. This filters minifier garbage such as `http://f` / `http://n`,
/// UUID-ish single labels, and anything with a numeric final label.
fn is_valid_host(host: &str) -> bool {
    let trimmed = host.trim_end_matches('.');

    // No dot at all means a single label: reject.
    let Some((_, tld)) = trimmed.rsplit_once('.') else {
        return false;
    };

    // Empty labels come from a leading dot or consecutive dots.
    if trimmed.split('.').any(str::is_empty) {
        return false;
    }

    tld.len() >= 2 && tld.bytes().all(|b| b.is_ascii_alphabetic())
}
/// Report whether a relative path is only a bare prefix rather than an
/// endpoint. Trailing slashes are ignored; what remains is noise when it is
/// shorter than four bytes (`/`, `/v1`) or exactly `/api` or `/rest`.
/// `/graphql`, `/gql` and `/api/x` are kept.
fn is_noise_path(p: &str) -> bool {
    let stem = p.trim_end_matches('/');
    if stem.len() < 4 {
        return true;
    }
    stem == "/api" || stem == "/rest"
}
/// Collect the `src` of every `<script>` element in `html` as an absolute
/// URL, in document order.
///
/// A `src` that is not already absolute is resolved against `base_url`; it is
/// skipped when that fails or when `base_url` itself does not parse. Only
/// `http` / `https` results are kept, duplicates are dropped, and at most
/// `MAX_SCRIPT_SRCS` entries are returned. Inline scripts carry no `src` and
/// are handled by [`extract_endpoints`] instead.
pub fn script_srcs(html: &str, base_url: &str) -> Vec<String> {
    let base = Url::parse(base_url).ok();
    let document = Html::parse_document(html);
    let mut emitted = BTreeSet::new();
    let mut srcs = Vec::new();

    for script in document.select(&SCRIPT_SEL) {
        if srcs.len() >= MAX_SCRIPT_SRCS {
            break;
        }
        let Some(raw) = script.value().attr("src") else {
            continue;
        };

        // Try the attribute as an absolute URL first, then relative to the page.
        let absolute = Url::parse(raw)
            .ok()
            .or_else(|| base.as_ref().and_then(|b| b.join(raw).ok()));
        let Some(absolute) = absolute else {
            continue;
        };

        if !matches!(absolute.scheme(), "http" | "https") {
            continue;
        }
        let href = absolute.to_string();
        if emitted.insert(href.clone()) {
            srcs.push(href);
        }
    }

    srcs
}
/// Extract endpoints from inline HTML scripts plus pre-fetched JS bundles.
/// `bundles` is `(bundle_url, bundle_text)`.
pub fn extract_endpoints(
html: &str,
base_url: &str,
bundles: &[(String, String)],
) -> EndpointReport {
let base_reg = Url::parse(base_url)
.ok()
.and_then(|u| u.host_str().map(registrable_domain))
.unwrap_or_default();
let mut endpoints: Vec<DiscoveredEndpoint> = Vec::new();
let mut seen: BTreeSet<(String, String)> = BTreeSet::new();
let mut hosts: BTreeSet<String> = BTreeSet::new();
let mut budget = MAX_SCAN_BYTES;
let mut truncated = false;
let push = |value: String,
kind: EndpointKind,
source: &str,
endpoints: &mut Vec<DiscoveredEndpoint>,
seen: &mut BTreeSet<(String, String)>,
hosts: &mut BTreeSet<String>|
-> bool {
if endpoints.len() >= MAX_ENDPOINTS {
return false;
}
let first_party = match Url::parse(&value) {
Ok(u) => {
let Some(h) = u.host_str() else {
return true;
};
if !is_valid_host(h) {
return true; // minifier garbage host
}
if NOISE_HOSTS.contains(&registrable_domain(h).as_str()) {
return true; // schema.org / json-schema.org / example.*
}
// Absolute URL with no real path is an origin/site link,
// not an API endpoint (drops the page's own URL too).
let path = u.path();
if path.is_empty() || path == "/" {
return true;
}
hosts.insert(h.to_ascii_lowercase());
is_first_party(h, &base_reg)
}
// Relative path: same origin as the page by definition.
Err(_) => {
if is_noise_path(&value) {
return true; // bare /api, /, ultra-short
}
true
}
};
if seen.insert((value.clone(), source.to_string())) {
endpoints.push(DiscoveredEndpoint {
value,
kind,
first_party,
source: source.to_string(),
});
}
true
};
let scan = |text: &str,
source: &str,
endpoints: &mut Vec<DiscoveredEndpoint>,
seen: &mut BTreeSet<(String, String)>,
hosts: &mut BTreeSet<String>,
budget: &mut usize,
truncated: &mut bool| {
if *budget == 0 {
return;
}
let slice = if text.len() > *budget {
*truncated = true;
// Snap the cut to a UTF-8 char boundary so non-ASCII content
// (multibyte codepoints straddling the budget) can't panic.
let mut cut = (*budget).min(text.len());
while cut > 0 && !text.is_char_boundary(cut) {
cut -= 1;
}
&text[..cut]
} else {
text
};
*budget -= slice.len();
for c in RE_REL_PATH.captures_iter(slice) {
if let Some(m) = c.get(1) {
let v = m.as_str().to_string();
let kind = if v.contains("graphql") || v.contains("/gql") {
EndpointKind::GraphQl
} else {
EndpointKind::RelativePath
};
if !push(v, kind, source, endpoints, seen, hosts) {
*truncated = true;
return;
}
}
}
for m in RE_WS.find_iter(slice) {
if !push(
m.as_str().to_string(),
EndpointKind::WebSocket,
source,
endpoints,
seen,
hosts,
) {
*truncated = true;
return;
}
}
for m in RE_ABS_URL.find_iter(slice) {
let v = m.as_str().to_string();
// Skip obvious static assets — we want API surface, not CDN files.
let lower = v.to_ascii_lowercase();
if lower.ends_with(".js")
|| lower.ends_with(".css")
|| lower.ends_with(".png")
|| lower.ends_with(".jpg")
|| lower.ends_with(".svg")
|| lower.ends_with(".woff2")
{
// still record the host for visibility
if let Some(h) = Url::parse(&v)
.ok()
.and_then(|u| u.host_str().map(str::to_string))
{
hosts.insert(h.to_ascii_lowercase());
}
continue;
}
let kind = if lower.contains("graphql") || lower.contains("/gql") {
EndpointKind::GraphQl
} else {
EndpointKind::AbsoluteUrl
};
if !push(v, kind, source, endpoints, seen, hosts) {
*truncated = true;
return;
}
}
};
// Inline scripts.
let doc = Html::parse_document(html);
let mut inline = String::new();
for el in doc.select(&SCRIPT_SEL) {
if el.value().attr("src").is_none() {
inline.push_str(&el.text().collect::<String>());
inline.push('\n');
}
}
scan(
&inline,
"inline",
&mut endpoints,
&mut seen,
&mut hosts,
&mut budget,
&mut truncated,
);
// Bundles.
let mut bundles_scanned = 0usize;
for (src, text) in bundles {
if budget == 0 {
truncated = true;
break;
}
bundles_scanned += 1;
scan(
text,
src,
&mut endpoints,
&mut seen,
&mut hosts,
&mut budget,
&mut truncated,
);
}
endpoints.sort_by(|a, b| (a.kind, &a.value, &a.source).cmp(&(b.kind, &b.value, &b.source)));
EndpointReport {
endpoints,
hosts: hosts.into_iter().collect(),
bundles_scanned,
truncated,
}
}
// Unit tests for host/path classification, `<script src>` resolution and
// the endpoint scanner, including its caps and noise filters.
#[cfg(test)]
mod tests {
use super::*;
// Two-label public suffixes (`co.uk`) keep three labels; single-label
// hosts are returned unchanged.
#[test]
fn registrable_domain_handles_cc_tlds() {
assert_eq!(
registrable_domain("www.ticketmaster.co.uk"),
"ticketmaster.co.uk"
);
assert_eq!(
registrable_domain("api.ticketmaster.com"),
"ticketmaster.com"
);
assert_eq!(
registrable_domain("pubapi.ticketmaster.co.uk"),
"ticketmaster.co.uk"
);
assert_eq!(registrable_domain("localhost"), "localhost");
}
// Relative `src` resolves against the page URL; inline scripts and
// non-http(s) schemes are not returned.
#[test]
fn script_srcs_resolves_and_filters() {
let html = r#"<html><head>
<script src="/_next/static/chunks/main-abc.js"></script>
<script src="https://cdn.example.net/lib.js"></script>
<script>var inline = 1;</script>
<script src="data:text/javascript,1"></script>
</head></html>"#;
let srcs = script_srcs(html, "https://www.ticketmaster.co.uk/");
assert!(srcs.contains(
&"https://www.ticketmaster.co.uk/_next/static/chunks/main-abc.js".to_string()
));
assert!(srcs.contains(&"https://cdn.example.net/lib.js".to_string()));
assert_eq!(srcs.len(), 2, "inline + data: ignored");
}
// End-to-end: inline relative paths plus a bundle holding GraphQL, REST,
// WebSocket and static-asset URLs.
#[test]
fn extracts_inline_and_bundle_endpoints_with_classification() {
let html = r#"<html><body>
<script>
var cfg = { search: "/api/search/events", suggest: "/api/search/search-suggest" };
fetch("/api/venue/info");
</script>
<script src="/app.js"></script>
</body></html>"#;
let bundles = vec![(
"https://www.ticketmaster.co.uk/app.js".to_string(),
r#"
const GQL = "https://pubapi.ticketmaster.co.uk/graphql";
axios.post("https://services.ticketmaster.co.uk/discovery/v2/events");
new WebSocket("wss://live.ticketmaster.co.uk/socket");
const ga = "https://www.googletagservices.com/tag/js/gpt.js";
const img = "https://cdn.tmol.co/hero.png";
"#
.to_string(),
)];
let r = extract_endpoints(html, "https://www.ticketmaster.co.uk/", &bundles);
let vals: Vec<&str> = r.endpoints.iter().map(|e| e.value.as_str()).collect();
assert!(vals.contains(&"/api/search/events"));
assert!(vals.contains(&"/api/search/search-suggest"));
assert!(vals.contains(&"/api/venue/info"));
assert!(vals.contains(&"https://pubapi.ticketmaster.co.uk/graphql"));
assert!(vals.contains(&"https://services.ticketmaster.co.uk/discovery/v2/events"));
assert!(vals.contains(&"wss://live.ticketmaster.co.uk/socket"));
// static .js asset is not an endpoint, but its host is recorded
assert!(!vals.contains(&"https://www.googletagservices.com/tag/js/gpt.js"));
assert!(r.hosts.iter().any(|h| h == "www.googletagservices.com"));
let gql = r
.endpoints
.iter()
.find(|e| e.value.contains("graphql"))
.unwrap();
assert_eq!(gql.kind, EndpointKind::GraphQl);
assert!(
gql.first_party,
"pubapi.ticketmaster.co.uk is first-party to .co.uk"
);
// Despite the variable name, this looks up a relative (same-origin) path.
let third = r
.endpoints
.iter()
.find(|e| e.value.starts_with("/api/venue"));
assert!(third.unwrap().first_party, "relative path is same-origin");
assert_eq!(r.bundles_scanned, 1);
}
// An absolute URL on an unrelated registrable domain is not first-party.
#[test]
fn third_party_absolute_is_flagged_not_first_party() {
let bundles = vec![(
"b".to_string(),
r#"x="https://api.stripe.com/v1/charges""#.to_string(),
)];
let r = extract_endpoints("<html></html>", "https://www.ticketmaster.co.uk/", &bundles);
let e = r
.endpoints
.iter()
.find(|e| e.value.contains("stripe"))
.unwrap();
assert!(!e.first_party);
}
#[test]
fn caps_bound_pathological_input() {
// A huge blob of fake endpoints must not exceed MAX_ENDPOINTS and
// must return promptly (regex crate is linear-time).
let mut big = String::new();
for i in 0..50_000 {
big.push_str(&format!("\"/api/v1/item/{i}\" "));
}
let bundles = vec![("big".to_string(), big)];
let r = extract_endpoints("<html></html>", "https://x.com/", &bundles);
assert!(r.endpoints.len() <= MAX_ENDPOINTS);
assert!(r.truncated);
}
// Empty HTML, an unparseable base URL and no bundles must yield an empty,
// non-truncated report.
#[test]
fn empty_inputs_are_safe() {
let r = extract_endpoints("", "not a url", &[]);
assert!(r.endpoints.is_empty());
assert_eq!(r.bundles_scanned, 0);
assert!(!r.truncated);
}
// Noise filters: bare prefixes, garbage hosts, schema hosts and the page's
// own origin are dropped while real endpoints survive.
#[test]
fn v1_1_noise_is_filtered() {
let bundles = vec![(
"b.js".to_string(),
r#"
"/api/search/events";
"/api"; "/api/";
"http://f"; "http://n/x";
"https://schema.org/Thing";
"http://json-schema.org/draft-07/schema";
"https://www.ticketmaster.co.uk/";
"https://pubapi.ticketmaster.co.uk/discovery/v2/events";
"wss://live.ticketmaster.co.uk/socket";
"#
.to_string(),
)];
let r = extract_endpoints("<html></html>", "https://www.ticketmaster.co.uk/", &bundles);
let vals: std::collections::BTreeSet<&str> =
r.endpoints.iter().map(|e| e.value.as_str()).collect();
assert!(vals.contains("/api/search/events"));
assert!(vals.contains("https://pubapi.ticketmaster.co.uk/discovery/v2/events"));
assert!(vals.contains("wss://live.ticketmaster.co.uk/socket"));
for junk in [
"/api",
"/api/",
"http://f",
"http://n/x",
"https://schema.org/Thing",
"http://json-schema.org/draft-07/schema",
"https://www.ticketmaster.co.uk/",
] {
assert!(!vals.contains(junk), "noise leaked: {junk}");
}
assert!(
!r.hosts
.iter()
.any(|h| h == "f" || h == "n" || h == "schema.org")
);
assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk"));
}
#[test]
fn scan_truncation_at_non_ascii_boundary_does_not_panic() {
// A bundle just over the scan budget, padded with a multibyte char
// ('é' is 2 bytes) so the cut lands mid-codepoint. The old
// `&text[..budget]` slice panicked here; the boundary snap must not.
let pad = "é".repeat(MAX_SCAN_BYTES); // ~2× budget in bytes
let bundle = format!("{pad} fetch(\"/api/x\")");
let bundles = vec![("big.js".to_string(), bundle)];
let r = extract_endpoints("<html></html>", "https://example.com/", &bundles);
assert!(r.truncated, "oversized bundle should mark truncated");
}
}

View file

@ -16,29 +16,6 @@ static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").
/// Matches anything shaped like an HTML tag: `<`, one or more non-`>`
/// characters, then `>`. Compiled once on first use.
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
/// 250 ms time limit.
/// NOTE(review): presumably the cap on JS evaluation in the QuickJS sandbox;
/// the use site is outside this view, confirm there.
const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);
/// Substrings used as a cheap pre-filter before running the QuickJS scan.
///
/// The scan only ever surfaces `globalThis.__*` object/array properties, and
/// the seeded `__next_f` only emits when non-empty. An inline script can
/// populate such a global through any alias of the global object
/// (`window.`, `globalThis.`, `self.__…`), through a top-level `var __…`
/// declaration, or through the `__NEXT_DATA__` / `__NUXT__` /
/// `application/json` payload conventions. When none of these substrings
/// occurs in the HTML, running the VM is very unlikely to return any blob,
/// so the caller skips it.
///
/// This is a heuristic, not a proof: exotic assignments (for example an
/// implicit global written as a bare `__x = …`) are not covered. The list is
/// deliberately broad: any marker may also appear in non-script HTML, which
/// only makes us skip *less* often, never more.
const JS_CANDIDATE_MARKERS: [&str; 7] = [
    "window.",
    "globalThis.",
    // Covers `self.__next_f` as well as any other `self.__*` assignment.
    "self.__",
    "var __",
    "__NEXT_DATA__",
    "__NUXT__",
    "application/json",
];
/// Cheap pre-check for the QuickJS scan: reports whether `html` contains at
/// least one of the `JS_CANDIDATE_MARKERS` substrings. Callers use a `false`
/// result to skip the VM entirely.
pub fn has_js_candidate_data(html: &str) -> bool {
    for marker in JS_CANDIDATE_MARKERS.iter() {
        if html.contains(*marker) {
            return true;
        }
    }
    false
}
/// A blob of data extracted from JS execution.
pub struct JsDataBlob {
pub name: String,
@ -47,17 +24,9 @@ pub struct JsDataBlob {
}
/// Parse `html`, then run its inline `<script>` tags in the QuickJS sandbox
/// and collect the `window.__*` data they produce.
///
/// This is the string-input convenience entry point. Callers that already
/// hold a parsed [`Html`] should call [`extract_js_data_from_doc`] directly
/// so the document is not parsed twice.
pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
    extract_js_data_from_doc(&Html::parse_document(html))
}
/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data,
/// reusing an already-parsed [`Html`] document instead of re-parsing the HTML.
pub fn extract_js_data_from_doc(doc: &Html) -> Vec<JsDataBlob> {
let scripts: Vec<String> = doc
.select(&SCRIPT_SELECTOR)
.filter(|el| {

View file

@ -7,7 +7,6 @@ pub(crate) mod data_island;
/// Zero network dependencies — WASM-compatible by design.
pub mod diff;
pub mod domain;
pub mod endpoints;
pub mod error;
pub mod extractor;
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
@ -17,7 +16,6 @@ pub mod markdown;
pub mod metadata;
#[allow(dead_code)]
pub(crate) mod noise;
pub mod reddit;
pub mod structured_data;
pub mod types;
pub mod youtube;
@ -95,24 +93,6 @@ fn extract_with_options_inner(
return Err(ExtractError::NoContent);
}
// Reddit fast path: parse old.reddit.com HTML directly.
// The fetch layer rewrites all Reddit hosts to old.reddit.com before
// calling extract, so we always get stable server-rendered HTML here.
if let Some(u) = url
&& reddit::is_reddit_url(u)
{
if let Some(result) = reddit::try_extract(html, u) {
return Ok(result);
}
// A recognised comment thread that we couldn't parse (Reddit markup
// change, or a block/challenge page) — don't fall through to generic
// extraction, which would emit Reddit nav/sidebar chrome. Listings
// and profiles (no `/comments/`) intentionally fall through below.
if u.contains("/comments/") {
return Err(ExtractError::NoContent);
}
}
// YouTube fast path: if the URL is a YouTube video page, try extracting
// structured metadata from ytInitialPlayerResponse before DOM scoring.
// This gives LLMs a clean, structured view of video metadata.
@ -222,8 +202,8 @@ fn extract_with_options_inner(
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
// static JSON data island extraction above with runtime-evaluated data.
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
if js_eval::has_js_candidate_data(html) {
let blobs = js_eval::extract_js_data_from_doc(&doc);
{
let blobs = js_eval::extract_js_data(html);
if !blobs.is_empty() {
let js_text = js_eval::extract_readable_text(&blobs);
if !js_text.is_empty() {

View file

@ -13,8 +13,6 @@ use crate::noise;
use crate::types::{CodeBlock, Image, Link};
/// Pre-compiled selector for `<code>` elements.
static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());
/// Pre-compiled selector for `<img>` elements that carry an `alt` attribute.
static IMG_ALT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("img[alt]").unwrap());
/// Pre-compiled selector for `<a>` elements that carry an `href` attribute.
static A_HREF_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("a[href]").unwrap());
/// Maximum recursion depth for DOM traversal.
/// Express.co.uk live blogs and similar pages can nest 1000+ levels deep,
@ -855,7 +853,7 @@ fn collect_assets_from_noise(
assets: &mut ConvertedAssets,
) {
// Collect images with alt text
for img in element.select(&IMG_ALT_SELECTOR) {
for img in element.select(&Selector::parse("img[alt]").unwrap()) {
let alt = img.value().attr("alt").unwrap_or("").to_string();
let src = img
.value()
@ -868,7 +866,7 @@ fn collect_assets_from_noise(
}
// Collect links
for link in element.select(&A_HREF_SELECTOR) {
for link in element.select(&Selector::parse("a[href]").unwrap()) {
let href = link
.value()
.attr("href")

View file

@ -1,968 +0,0 @@
//! Reddit thread extractor — parses old.reddit.com HTML directly.
//!
//! old.reddit.com serves fully server-rendered HTML with stable class names
//! and data attributes. No JS, no API key, no `.json` trick needed.
use scraper::{ElementRef, Html, Selector};
use serde::Serialize;
use crate::{Content, DomainData, DomainType, ExtractionResult, Metadata};
// ─── Public types ──────────────────────────────────────────────────────────────
/// The submission at the top of a thread, as read from the
/// `#siteTable .thing.link` element of an old.reddit.com page.
#[derive(Serialize)]
pub struct RedditPost {
/// `data-fullname` with the `t3_` prefix removed.
pub id: Option<String>,
/// Trimmed text of the `.title a.title` link; never empty.
pub title: String,
/// `data-author`, or `"[deleted]"` when the attribute is absent.
pub author: String,
/// `data-subreddit`, when present.
pub subreddit: Option<String>,
/// `data-score`; `0` when missing or not an integer.
pub score: i64,
/// Self-text rendered to markdown; `None` when absent or empty.
pub body: Option<String>,
/// `data-comments-count`; `0` when missing or not an integer.
pub num_comments: usize,
/// `https://old.reddit.com` followed by `data-permalink`.
pub permalink: String,
/// `data-url` for link posts; always `None` for self posts.
pub url: Option<String>,
/// True when the element has the `self` class or a `self.*` `data-domain`.
pub is_self: bool,
/// Trimmed text of `.linkflairlabel`, when non-empty.
pub flair: Option<String>,
/// `datetime` attribute of the first `time[datetime]` inside the post,
/// copied verbatim.
/// NOTE(review): the name suggests UTC; the value is not parsed or
/// normalised here.
pub created_utc: Option<String>,
}
/// One comment in a thread, together with its nested replies.
#[derive(Serialize)]
pub struct RedditComment {
/// `data-fullname` with the `t1_` prefix removed.
pub id: Option<String>,
/// `data-author`, or `"[deleted]"` when the attribute is absent or empty.
pub author: String,
/// Comment body rendered to markdown. `"[removed]"` for a deleted comment
/// without body markup; empty when a live comment has no body markup.
pub body: String,
/// `None` when Reddit hides the score (fresh comments). Distinct from
/// `Some(0)`, which is a real net-zero score.
pub score: Option<i64>,
/// Nesting level: `0` for a top-level comment, +1 per reply level.
pub depth: usize,
/// True when the author equals the post author and neither is deleted.
pub is_op: bool,
/// `datetime` attribute of the first `time[datetime]` inside the comment's
/// `.entry`, copied verbatim.
pub created_utc: Option<String>,
/// Direct replies, in document order.
pub replies: Vec<RedditComment>,
}
/// A parsed comment thread: the submission plus its top-level comments.
#[derive(Serialize)]
pub struct RedditThread {
/// URL the thread was extracted from; serialized under the key `url`.
#[serde(rename = "url")]
pub source_url: String,
/// The submission, or `None` when it could not be parsed from the page.
pub post: Option<RedditPost>,
/// Top-level comments; replies hang off each comment's `replies`.
pub comments: Vec<RedditComment>,
}
// ─── Public API ────────────────────────────────────────────────────────────────
/// Returns `true` when the host of `url` is one of the reddit.com front-ends
/// this extractor understands (`reddit.com` and its `www`, `old`, `np` and
/// `new` subdomains).
///
/// Hostnames are case-insensitive, so the comparison ignores ASCII case:
/// `https://WWW.Reddit.com/...` is recognised like its lowercase form.
pub fn is_reddit_url(url: &str) -> bool {
    const REDDIT_HOSTS: [&str; 5] = [
        "reddit.com",
        "www.reddit.com",
        "old.reddit.com",
        "np.reddit.com",
        "new.reddit.com",
    ];
    let host = host_of(url);
    REDDIT_HOSTS
        .iter()
        .any(|known| host.eq_ignore_ascii_case(known))
}
/// Parse an old.reddit.com comment-thread page into a [`RedditThread`].
///
/// Only URLs containing `/comments/` are considered. Returns `None` for any
/// other URL, and also when neither a post nor a single comment could be
/// found in the markup.
pub fn try_extract_thread(html: &str, url: &str) -> Option<RedditThread> {
    let is_thread_url = url.contains("/comments/");
    if !is_thread_url {
        return None;
    }
    let document = Html::parse_document(html);
    let post = parse_post(&document);
    // The post author's name lets comments be tagged as written by the OP.
    let op_name = match &post {
        Some(p) => p.author.as_str(),
        None => "",
    };
    let comments = parse_comments(&document, op_name);
    if post.is_some() || !comments.is_empty() {
        Some(RedditThread {
            source_url: url.to_string(),
            post,
            comments,
        })
    } else {
        None
    }
}
/// Fast-path entry point used by `webclaw-core`'s extraction pipeline:
/// parses the thread and converts it into a generic [`ExtractionResult`].
/// Yields `None` whenever [`try_extract_thread`] does.
pub fn try_extract(html: &str, url: &str) -> Option<ExtractionResult> {
    try_extract_thread(html, url).map(|thread| to_extraction_result(&thread))
}
// ─── ExtractionResult builder ──────────────────────────────────────────────────
/// Convert a parsed thread into the crate-wide [`ExtractionResult`] shape.
///
/// The markdown rendering is the main content; plain text is derived from
/// it. The word count is the number of whitespace-separated tokens in the
/// markdown. Title, author and site name come from the post when there is
/// one, and the result is tagged as `DomainType::Social`.
fn to_extraction_result(thread: &RedditThread) -> ExtractionResult {
    let markdown = to_markdown(thread);
    let plain = plain_text(&markdown);
    let word_count = markdown.split_whitespace().count();

    let (title, author, site_name) = match thread.post.as_ref() {
        Some(post) => (
            Some(post.title.clone()),
            Some(post.author.clone()),
            post.subreddit.clone(),
        ),
        None => (None, None, None),
    };

    let metadata = Metadata {
        title,
        description: None,
        author,
        published_date: None,
        language: Some("en".to_string()),
        url: Some(thread.source_url.clone()),
        site_name,
        image: None,
        favicon: None,
        word_count,
    };
    let content = Content {
        markdown,
        plain_text: plain,
        links: Vec::new(),
        images: Vec::new(),
        code_blocks: Vec::new(),
        raw_html: None,
    };

    ExtractionResult {
        metadata,
        content,
        domain_data: Some(DomainData {
            domain_type: DomainType::Social,
        }),
        structured_data: Vec::new(),
    }
}
// ─── Markdown rendering ────────────────────────────────────────────────────────
pub fn to_markdown(thread: &RedditThread) -> String {
let mut out = String::new();
if let Some(p) = &thread.post {
out.push_str(&format!("# {}\n\n", p.title));
let pts = pt_label(Some(p.score));
let cmt = match p.num_comments {
0 => String::new(),
1 => " · 1 comment".to_string(),
n => format!(" · {n} comments"),
};
let sub = p.subreddit.as_deref().unwrap_or("?");
out.push_str(&format!("**u/{}** · r/{sub} · {pts}{cmt}\n\n", p.author));
if let Some(ref body) = p.body
&& !body.is_empty()
{
out.push_str(body);
out.push_str("\n\n");
}
if let Some(ref link) = p.url
&& !p.is_self
{
out.push_str(&format!("[Link]({link})\n\n"));
}
out.push_str("---\n\n");
}
if !thread.comments.is_empty() {
out.push_str("## Comments\n\n");
for c in &thread.comments {
render_comment(c, &mut out);
}
}
collapse_blank_lines(out.trim_end())
}
/// Append one comment and, recursively, all of its replies to `out`.
///
/// Depth is shown with blockquote markers (one `> ` per level) instead of
/// leading spaces: four or more spaces of indentation would make CommonMark
/// treat ordinary text and ``` fences as indented code, corrupting every
/// comment at depth two or deeper. Empty body lines get the bare `>` prefix
/// so the quote is not interrupted.
fn render_comment(c: &RedditComment, out: &mut String) {
    let quote_prefix = "> ".repeat(c.depth);
    let empty_prefix = ">".repeat(c.depth);
    let op_tag = if c.is_op { " [OP]" } else { "" };

    out.push_str(&format!(
        "{quote_prefix}**u/{}{op_tag}** · {}\n",
        c.author,
        pt_label(c.score)
    ));

    for line in c.body.lines() {
        let prefix = if line.is_empty() {
            &empty_prefix
        } else {
            &quote_prefix
        };
        out.push_str(prefix);
        out.push_str(line);
        out.push('\n');
    }
    out.push('\n');

    c.replies
        .iter()
        .for_each(|reply| render_comment(reply, out));
}
/// Human-readable score label: `"score hidden"` for `None`, singular
/// `"pt"` for exactly 1 or -1, plural `"pts"` for every other value.
fn pt_label(n: Option<i64>) -> String {
    let Some(points) = n else {
        return "score hidden".to_string();
    };
    let unit = if points == 1 || points == -1 {
        "pt"
    } else {
        "pts"
    };
    format!("{points} {unit}")
}
/// Limit every run of consecutive `\n` characters to at most two, i.e. a
/// single blank line, so blockquote prefixes and `<pre>` spacing do not
/// leave large vertical gaps. All other characters pass through unchanged.
fn collapse_blank_lines(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    let mut run = 0usize;
    for ch in s.chars() {
        run = if ch == '\n' { run + 1 } else { 0 };
        // Non-newlines reset the run to 0 and are therefore always kept.
        if run <= 2 {
            collapsed.push(ch);
        }
    }
    collapsed
}
/// Derive a plain-text rendering from the thread markdown, line by line.
///
/// For each line: leading whitespace is dropped, one leading blockquote
/// marker (`> ` or `>`) is removed, an ATX heading marker is removed, and
/// the emphasis/code markers `**`, `~~`, `*` and `` ` `` are deleted.
///
/// Only a real heading marker is stripped: one to six `#` followed by a
/// space (or nothing). Text that merely starts with `#`, such as `#rustlang`
/// or `#1 reason`, keeps its `#` instead of being eaten.
fn plain_text(md: &str) -> String {
    md.lines()
        .map(|line| {
            // Strip a single leading blockquote marker. Greedy char-class
            // stripping would eat legitimate ">"-prefixed quotes.
            let line = line.trim_start();
            let line = line
                .strip_prefix("> ")
                .or_else(|| line.strip_prefix('>'))
                .unwrap_or(line);
            // `#` is ASCII, so slicing at the hash count is always on a
            // char boundary.
            let hashes = line.bytes().take_while(|&b| b == b'#').count();
            let after = &line[hashes..];
            let is_heading =
                (1..=6).contains(&hashes) && (after.is_empty() || after.starts_with(' '));
            let line = if is_heading { after } else { line };
            line.trim_start()
                .replace("**", "")
                .replace("~~", "")
                .replace(['*', '`'], "")
        })
        .collect::<Vec<_>>()
        .join("\n")
}
// ─── HTML parsing ──────────────────────────────────────────────────────────────
/// Build a [`RedditPost`] from the first `#siteTable .thing.link` element.
///
/// Most fields come from `data-*` attributes on that element; title, flair,
/// self-text and timestamp are read from its descendants. Returns `None`
/// when the page has no such element or the title link is missing or empty.
fn parse_post(doc: &Html) -> Option<RedditPost> {
    let post_sel = Selector::parse("#siteTable .thing.link").ok()?;
    let post_el = doc.select(&post_sel).next()?;
    let attrs = post_el.value();

    // The title is the only mandatory piece read from descendant markup.
    let title_sel = Selector::parse(".title a.title").ok()?;
    let title = post_el
        .select(&title_sel)
        .next()
        .map(|link| link.text().collect::<String>().trim().to_string())
        .filter(|text| !text.is_empty())?;

    let id = attrs
        .attr("data-fullname")
        .map(|full| full.trim_start_matches("t3_").to_string());
    let author = attrs.attr("data-author").unwrap_or("[deleted]").to_string();
    let subreddit = attrs.attr("data-subreddit").map(str::to_string);
    let score = attrs
        .attr("data-score")
        .and_then(|raw| raw.parse::<i64>().ok())
        .unwrap_or_default();
    let num_comments = attrs
        .attr("data-comments-count")
        .and_then(|raw| raw.parse::<usize>().ok())
        .unwrap_or_default();
    let permalink = format!(
        "https://old.reddit.com{}",
        attrs.attr("data-permalink").unwrap_or("")
    );

    // Self-posts carry the `self` class and a `self.<sub>` domain; their
    // data-url points back at the permalink rather than an external site,
    // so it is not reported as a link.
    let is_self = attrs.has_class("self", scraper::CaseSensitivity::AsciiCaseInsensitive)
        || attrs
            .attr("data-domain")
            .is_some_and(|domain| domain.starts_with("self."));
    let url = if is_self {
        None
    } else {
        attrs.attr("data-url").map(str::to_string)
    };

    let flair_sel = Selector::parse(".linkflairlabel").ok();
    let flair = flair_sel
        .as_ref()
        .and_then(|sel| post_el.select(sel).next())
        .map(|label| label.text().collect::<String>().trim().to_string())
        .filter(|text| !text.is_empty());

    // Self-text body: thing > .entry > .expando > .usertext-body [> .md]
    let body = direct_child(post_el, "entry")
        .and_then(|entry| find_class(entry, "expando"))
        .and_then(|expando| find_class(expando, "usertext-body"))
        .and_then(|usertext| find_class(usertext, "md"))
        .map(md_to_markdown)
        .filter(|text| !text.is_empty());

    let time_sel = Selector::parse("time[datetime]").ok();
    let created_utc = time_sel
        .as_ref()
        .and_then(|sel| post_el.select(sel).next())
        .and_then(|time| time.value().attr("datetime"))
        .map(str::to_string);

    Some(RedditPost {
        id,
        title,
        author,
        subreddit,
        score,
        body,
        num_comments,
        permalink,
        url,
        is_self,
        flair,
        created_utc,
    })
}
// ─── Comment parsing ───────────────────────────────────────────────────────────
//
// old.reddit.com nests comments structurally, not via a depth attribute:
//
// .commentarea
// .sitetable.nestedlisting
// .comment.thing ← root comment
// .entry → form → .usertext-body → .md ← its own body
// .child
// .sitetable.listing
// .comment.thing ← reply (recurse)
//
// `data-depth`/`data-replies` are absent or always "0" in the logged-out
// HTML, so we walk the tree by recursing into each comment's `.child`.
/// Collect the top-level comments (each carrying its reply tree).
///
/// The root listing is `.sitetable.nestedlisting` inside `.commentarea`
/// (`commentarea` is a class on old.reddit, not an id). If that is missing,
/// the first `.sitetable.nestedlisting` anywhere in the document is used,
/// which covers comment-permalink pages. With no listing at all the result
/// is empty. `op` is the post author's name, used to flag OP comments.
fn parse_comments(doc: &Html, op: &str) -> Vec<RedditComment> {
    let first_match = |css: &str| {
        Selector::parse(css)
            .ok()
            .and_then(|sel| doc.select(&sel).next())
    };
    first_match(".commentarea .sitetable.nestedlisting")
        .or_else(|| first_match(".sitetable.nestedlisting"))
        .map(|root| walk_comment_level(root, op, 0))
        .unwrap_or_default()
}
/// Parse every direct child of `listing` that carries both the `comment`
/// and `thing` classes, in document order. Children that are not elements,
/// lack either class, or are rejected by `parse_one_comment` are skipped.
/// `depth` is the nesting level assigned to the comments found here.
fn walk_comment_level(listing: ElementRef, op: &str, depth: usize) -> Vec<RedditComment> {
    use scraper::CaseSensitivity::AsciiCaseInsensitive as Ci;

    let mut comments = Vec::new();
    for node in listing.children() {
        let Some(el) = ElementRef::wrap(node) else {
            continue;
        };
        let classes = el.value();
        if !(classes.has_class("comment", Ci) && classes.has_class("thing", Ci)) {
            continue;
        }
        if let Some(parsed) = parse_one_comment(el, op, depth) {
            comments.push(parsed);
        }
    }
    comments
}
/// Parse a single `.comment.thing` element, recursing into its replies.
///
/// Returns `None` for "load more comments" placeholders. `op` is the post
/// author's name and `depth` the nesting level of this comment; replies are
/// parsed with `depth + 1`.
fn parse_one_comment(c: ElementRef, op: &str, depth: usize) -> Option<RedditComment> {
    use scraper::CaseSensitivity::AsciiCaseInsensitive as Ci;

    let attrs = c.value();

    // "load more comments" placeholders are `.thing` with type=morechildren.
    // They carry a t1_ fullname but no real content, so they are skipped.
    let is_placeholder =
        attrs.attr("data-type") == Some("morechildren") || attrs.has_class("morechildren", Ci);
    if is_placeholder {
        return None;
    }

    let is_deleted = attrs.has_class("deleted", Ci);
    let id = attrs
        .attr("data-fullname")
        .map(|full| full.trim_start_matches("t1_").to_string());
    let author = match attrs.attr("data-author") {
        Some(name) if !name.is_empty() => name.to_string(),
        _ => "[deleted]".to_string(),
    };

    // Own body lives in `.entry > form > .usertext-body > .md`. `.child`
    // (nested replies) is a sibling of `.entry`, so searching inside
    // `.entry` never reaches a reply's body.
    let entry = direct_child(c, "entry");
    let rendered_body = entry
        .and_then(|e| find_class(e, "usertext-body"))
        .and_then(|usertext| find_class(usertext, "md"))
        .map(md_to_markdown)
        .filter(|text| !text.is_empty());
    let body = match rendered_body {
        Some(text) => text,
        None if is_deleted => "[removed]".to_string(),
        None => String::new(),
    };

    // `comment_score` yields None for hidden scores (no `.score.unvoted`
    // span), which stays distinct from a genuine 0.
    let score = entry.and_then(comment_score);

    let time_sel = Selector::parse("time[datetime]").ok();
    let created_utc = entry
        .zip(time_sel.as_ref())
        .and_then(|(e, sel)| e.select(sel).next())
        .and_then(|time| time.value().attr("datetime"))
        .map(str::to_string);

    let is_op = !is_deleted && author != "[deleted]" && author == op;

    // Replies: `.comment > .child > .sitetable > .comment`.
    let reply_listing =
        direct_child(c, "child").and_then(|child| direct_child(child, "sitetable"));
    let replies = match reply_listing {
        Some(nested) => walk_comment_level(nested, op, depth + 1),
        None => Vec::new(),
    };

    Some(RedditComment {
        id,
        author,
        body,
        score,
        depth,
        is_op,
        created_utc,
        replies,
    })
}
/// Score of a comment, read from the first `span.score.unvoted` inside its
/// `.entry`.
///
/// The span's `title` attribute is tried first because it holds the exact
/// integer; if it is absent or unparsable, the span's visible text is
/// parsed instead. `None` means there is no such span (Reddit hides the
/// score) or neither source yields a number.
fn comment_score(entry: ElementRef) -> Option<i64> {
    let score_sel = Selector::parse("span.score.unvoted").ok()?;
    let score_span = entry.select(&score_sel).next()?;

    let exact = score_span
        .value()
        .attr("title")
        .and_then(|raw| raw.trim().parse::<i64>().ok());
    if exact.is_some() {
        return exact;
    }

    let shown: String = score_span.text().collect();
    parse_score(&shown)
}
// ─── DOM helpers ───────────────────────────────────────────────────────────────
/// Find the first immediate child element of `el` that has `class` in its
/// class list (ASCII case-insensitive). Deeper descendants are not searched.
fn direct_child<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
    for node in el.children() {
        if let Some(candidate) = ElementRef::wrap(node) {
            let matches = candidate
                .value()
                .has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive);
            if matches {
                return Some(candidate);
            }
        }
    }
    None
}
/// Depth-first, document-order search below `el` for the first element
/// whose class list contains `class` (ASCII case-insensitive). `el` itself
/// is never a candidate.
fn find_class<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
    for node in el.children() {
        let Some(child) = ElementRef::wrap(node) else {
            continue;
        };
        if child
            .value()
            .has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive)
        {
            return Some(child);
        }
        // Not a match itself: look inside before moving to the next sibling.
        if let Some(found) = find_class(child, class) {
            return Some(found);
        }
    }
    None
}
/// Parse the leading integer of a score label such as `"12 points"`.
///
/// Only the first whitespace-separated word is considered. A typographic
/// minus sign (U+2212) is normalised to an ASCII hyphen first so negative
/// scores parse. Returns `None` for empty text or a non-numeric first word.
fn parse_score(text: &str) -> Option<i64> {
    let first_word = text.split_whitespace().next()?;
    // Written as an escape so the character survives editors and encodings
    // that drop or mangle non-ASCII source characters.
    // NOTE(review): assumes the character being replaced is U+2212 MINUS
    // SIGN; confirm against the markup Reddit actually serves.
    first_word.replace('\u{2212}', "-").parse().ok()
}
// ─── .md div → markdown ────────────────────────────────────────────────────────
/// Render the children of a Reddit `.md` container to markdown and trim
/// surrounding whitespace from the result.
fn md_to_markdown(el: ElementRef) -> String {
    let mut rendered = String::new();
    render_children(el, &mut rendered);
    String::from(rendered.trim())
}
/// Append the markdown for every child of `el` to `out`: text nodes are
/// copied verbatim, element nodes go through `render_node`, and any other
/// node kind (comments, etc.) is ignored.
fn render_children(el: ElementRef, out: &mut String) {
    use scraper::node::Node;

    for node in el.children() {
        if let Node::Text(text) = node.value() {
            out.push_str(text.as_ref());
        } else if let Some(child_el) = ElementRef::wrap(node) {
            render_node(child_el, out);
        }
    }
}
/// Append `el`'s trimmed text wrapped in `marker` on both sides (e.g. `**`
/// for bold). Nothing is written when the text is empty, so empty emphasis
/// elements do not leave stray markers behind.
fn push_wrapped_text(el: ElementRef, marker: &str, out: &mut String) {
    let text: String = el.text().collect();
    let text = text.trim();
    if !text.is_empty() {
        out.push_str(marker);
        out.push_str(text);
        out.push_str(marker);
    }
}

/// Append the markdown rendering of one element to `out`.
///
/// Block elements (`p`, `div`, `pre`, `blockquote`, headings, lists, `hr`)
/// end with a blank line; inline elements are emitted in place. Emphasis,
/// code, headings and `sup` use the element's flattened text, so markup
/// nested inside them is not preserved. Unknown elements are treated as
/// transparent containers and only their children are rendered.
///
/// Links keep their destination: absolute http(s) hrefs are used as-is,
/// protocol-relative hrefs (`//host/path`) get an `https:` scheme, and
/// root-relative hrefs are resolved against `https://old.reddit.com`. Any
/// other href (`javascript:`, `#fragment`, `mailto:`, ...) is dropped and
/// only the link text is emitted.
fn render_node(el: ElementRef, out: &mut String) {
    match el.value().name() {
        "p" | "div" => {
            let mut inner = String::new();
            render_children(el, &mut inner);
            let t = inner.trim();
            if !t.is_empty() {
                out.push_str(t);
                out.push_str("\n\n");
            }
        }
        "br" => out.push('\n'),
        "strong" | "b" => push_wrapped_text(el, "**", out),
        "em" | "i" => push_wrapped_text(el, "*", out),
        "del" | "s" | "strike" => push_wrapped_text(el, "~~", out),
        "code" => {
            let t: String = el.text().collect();
            out.push('`');
            out.push_str(t.trim());
            out.push('`');
        }
        "pre" => {
            let t: String = el.text().collect();
            out.push_str("```\n");
            out.push_str(t.trim_end_matches('\n'));
            out.push_str("\n```\n\n");
        }
        "a" => {
            let text: String = el.text().collect();
            let text = text.trim();
            if !text.is_empty() {
                let href = el.value().attr("href").unwrap_or("");
                if href.starts_with("http://") || href.starts_with("https://") {
                    out.push_str(&format!("[{text}]({href})"));
                } else if href.starts_with("//") {
                    // Protocol-relative: names another host. This must be
                    // checked before the single-slash case, which would
                    // otherwise glue it onto old.reddit.com as a path.
                    out.push_str(&format!("[{text}](https:{href})"));
                } else if href.starts_with('/') {
                    // Root-relative reddit link (/r/, /user/, /wiki/, ...).
                    out.push_str(&format!("[{text}](https://old.reddit.com{href})"));
                } else {
                    out.push_str(text);
                }
            }
        }
        "blockquote" => {
            let mut inner = String::new();
            render_children(el, &mut inner);
            let trimmed = inner.trim();
            for line in trimmed.lines() {
                // Blank lines get a bare `>` so the quote stays one block.
                out.push('>');
                if !line.is_empty() {
                    out.push(' ');
                    out.push_str(line);
                }
                out.push('\n');
            }
            out.push('\n');
        }
        "ul" => render_list(el, false, 0, out),
        "ol" => render_list(el, true, 0, out),
        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
            // The heading level is the digit in the tag name.
            let level = el
                .value()
                .name()
                .chars()
                .nth(1)
                .and_then(|c| c.to_digit(10))
                .unwrap_or(2) as usize;
            let t: String = el.text().collect();
            let t = t.trim();
            if !t.is_empty() {
                out.push_str(&"#".repeat(level));
                out.push(' ');
                out.push_str(t);
                out.push_str("\n\n");
            }
        }
        "hr" => out.push_str("---\n\n"),
        "sup" => {
            let t: String = el.text().collect();
            out.push_str(t.trim());
        }
        // Unknown / generic containers: recurse
        _ => render_children(el, out),
    }
}
/// Render a `<ul>`/`<ol>` as a markdown list, one line per `<li>`.
///
/// * `ordered` - number the items (`1. `, `2. `, ...) instead of using `- `.
/// * `indent` - number of leading spaces for this list's items; top-level
///   callers pass `0`.
///
/// A nested list is indented by the width of its parent item's marker: two
/// columns under `- `, three or more under `1. `. That is the indentation
/// CommonMark needs to treat it as a child of the item rather than as a
/// sibling list, and it keeps child items on their own lines instead of
/// glued to the parent text. A blank line is emitted after a top-level list
/// only.
fn render_list(list: ElementRef, ordered: bool, indent: usize, out: &mut String) {
    use scraper::node::Node;
    let pad = " ".repeat(indent);
    let mut n = 0;
    for li in list
        .children()
        .filter_map(ElementRef::wrap)
        .filter(|c| c.value().name() == "li")
    {
        n += 1;
        // Inline content of this <li>, excluding nested lists (rendered after).
        let mut inline = String::new();
        for child in li.children() {
            match child.value() {
                Node::Text(t) => inline.push_str(t.as_ref()),
                Node::Element(e) if e.name() == "ul" || e.name() == "ol" => {}
                Node::Element(_) => {
                    if let Some(c) = ElementRef::wrap(child) {
                        render_node(c, &mut inline);
                    }
                }
                _ => {}
            }
        }
        let marker = if ordered {
            format!("{n}. ")
        } else {
            "- ".to_string()
        };
        out.push_str(&format!("{pad}{marker}{}\n", inline.trim()));
        // Children line up with the first character after this item's marker.
        let child_indent = indent + marker.len();
        for child in li.children().filter_map(ElementRef::wrap) {
            match child.value().name() {
                "ul" => render_list(child, false, child_indent, out),
                "ol" => render_list(child, true, child_indent, out),
                _ => {}
            }
        }
    }
    if indent == 0 {
        out.push('\n');
    }
}
// ─── URL helpers ───────────────────────────────────────────────────────────────
/// Return the authority part of `url`: the text after the first `://` (or the
/// whole input when there is no scheme), cut at the first `/`, `?` or `#`.
///
/// Userinfo and port, if present, are left in the returned slice.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    // The first piece is the scheme, or the whole input when no "://" exists.
    pieces.next();
    let after_scheme = match pieces.next() {
        Some(rest) => rest,
        None => url,
    };
    let end = after_scheme
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(after_scheme.len());
    &after_scheme[..end]
}
// ─── Tests ─────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn is_reddit_url_recognises_variants() {
    // www., old. and bare reddit.com hosts must all be recognised.
    let reddit_urls = [
        "https://www.reddit.com/r/rust/comments/abc/x/",
        "https://old.reddit.com/r/rust/comments/abc/x/",
        "https://reddit.com/r/rust/comments/abc/x/",
    ];
    for candidate in reddit_urls {
        assert!(is_reddit_url(candidate));
    }
    // An unrelated host must be rejected.
    assert!(!is_reddit_url("https://example.com"));
}
#[test]
fn try_extract_thread_returns_none_for_listing_url() {
    // A subreddit listing URL with an empty page must not yield a thread.
    let listing = try_extract_thread(
        "<html><body></body></html>",
        "https://old.reddit.com/r/rust/",
    );
    assert!(listing.is_none());
}
#[test]
fn md_to_markdown_basic() {
    // A paragraph with <strong> must keep its text and gain ** emphasis.
    let doc =
        Html::parse_fragment(r#"<div class="md"><p>Hello <strong>world</strong>!</p></div>"#);
    let md_selector = Selector::parse(".md").unwrap();
    let root = doc.select(&md_selector).next().unwrap();
    let rendered = md_to_markdown(root);
    for needle in ["**world**", "Hello"] {
        assert!(rendered.contains(needle));
    }
}
#[test]
fn md_to_markdown_blockquote_and_code() {
    // Blockquotes become "> " lines; <pre><code> becomes a fenced block.
    let doc = Html::parse_fragment(
        r#"<div class="md"><blockquote><p>Quoted</p></blockquote><pre><code>fn main() {}</code></pre></div>"#,
    );
    let md_selector = Selector::parse(".md").unwrap();
    let root = doc.select(&md_selector).next().unwrap();
    let rendered = md_to_markdown(root);
    for needle in ["> Quoted", "```", "fn main()"] {
        assert!(rendered.contains(needle));
    }
}
#[test]
fn md_to_markdown_link_preserves_href() {
    let md_selector = Selector::parse(".md").unwrap();

    // Absolute hrefs are emitted unchanged.
    let absolute_doc = Html::parse_fragment(
        r#"<div class="md"><p>see <a href="https://example.com/x">this</a></p></div>"#,
    );
    let absolute_root = absolute_doc.select(&md_selector).next().unwrap();
    let absolute_md = md_to_markdown(absolute_root);
    assert!(absolute_md.contains("[this](https://example.com/x)"));

    // Root-relative reddit links resolve against old.reddit.com.
    let relative_doc = Html::parse_fragment(
        r#"<div class="md"><p><a href="/r/rust/wiki/faq">faq</a></p></div>"#,
    );
    let relative_root = relative_doc.select(&md_selector).next().unwrap();
    let relative_md = md_to_markdown(relative_root);
    assert!(relative_md.contains("[faq](https://old.reddit.com/r/rust/wiki/faq)"));

    // A javascript: href degrades to bare link text.
    let script_doc = Html::parse_fragment(
        r#"<div class="md"><p><a href="javascript:void(0)">x</a></p></div>"#,
    );
    let script_root = script_doc.select(&md_selector).next().unwrap();
    let script_md = md_to_markdown(script_root);
    assert!(script_md.contains('x') && !script_md.contains("javascript"));
}
// ── Regression tests against REAL old.reddit.com HTML ──────────────────
//
// These fixtures are genuine pages fetched from old.reddit.com (see
// testdata/reddit/). They are the ground truth — synthetic HTML is too
// easy to write to match wrong assumptions, which is exactly how the
// first version of this parser shipped silently broken.
/// Load a saved HTML fixture from `testdata/reddit/{name}`.
///
/// The path is relative to the process working directory. Panics with the
/// offending path and the I/O error if the file cannot be read, so a missing
/// fixture is identifiable from the test output.
fn fixture(name: &str) -> String {
    let path = format!("testdata/reddit/{name}");
    std::fs::read_to_string(&path)
        .unwrap_or_else(|err| panic!("failed to read fixture {path}: {err}"))
}
/// Count every comment in the forest: each node plus all of its nested replies.
fn total_comments(cs: &[RedditComment]) -> usize {
    let mut count = 0;
    for comment in cs {
        count += 1 + total_comments(&comment.replies);
    }
    count
}
/// Flatten the comment forest into `out` in pre-order (each parent is pushed
/// before its replies).
fn collect<'a>(cs: &'a [RedditComment], out: &mut Vec<&'a RedditComment>) {
    cs.iter().for_each(|comment| {
        out.push(comment);
        collect(&comment.replies, out);
    });
}
#[test]
fn real_link_post_metadata() {
    // Fixture: an external-link post (blog.geekuni.com) with 34 comments.
    let page = fixture("pandas_34comments.html");
    let thread = try_extract_thread(
        &page,
        "https://old.reddit.com/r/programming/comments/abc123/t/",
    )
    .expect("should parse");
    let post = thread.post.expect("post");

    assert_eq!(post.author, "Horror-Willingness74");
    assert_eq!(post.subreddit.as_deref(), Some("programming"));
    assert_eq!(post.score, 43);
    assert_eq!(post.num_comments, 34, "data-comments-count");
    assert!(!post.is_self, "external blog link, not a self post");
    let external = post.url.as_deref();
    assert_eq!(
        external,
        Some("https://blog.geekuni.com/2026/06/why-learn-pandas.html")
    );
    assert!(post.title.contains("Pandas"));
}
#[test]
fn real_self_post_metadata() {
    // Fixture: a text (self) post on r/rust. It has the `self.rust` domain,
    // a self-text body, and no external url.
    let page = fixture("rust_selfpost_36comments.html");
    let thread = try_extract_thread(&page, "https://old.reddit.com/r/rust/comments/abc123/t/")
        .expect("should parse");
    let post = thread.post.expect("post");

    assert!(post.is_self, "self.rust domain → self post");
    assert_eq!(post.url, None, "self posts carry no external url");
    assert_eq!(post.subreddit.as_deref(), Some("rust"));
    let body_has_text = post
        .body
        .as_deref()
        .is_some_and(|text| text.contains("IT project manager"));
    assert!(
        body_has_text,
        "self-text body should be extracted: {:?}",
        post.body
    );
}
#[test]
fn real_comment_bodies_and_scores() {
    // Regression for the original bug: every comment body came back empty
    // because .usertext-body sits inside a <form>, not directly under .entry.
    let page = fixture("ebpf_6comments.html");
    let thread = try_extract_thread(
        &page,
        "https://old.reddit.com/r/programming/comments/abc123/t/",
    )
    .expect("should parse");
    let comments = &thread.comments;

    // 6 comments total: 5 top-level + 1 nested reply (admalledd under ejrh).
    assert_eq!(comments.len(), 5, "5 top-level comments");
    assert_eq!(total_comments(comments), 6, "6 comments incl. nested");

    let by_teerre = comments
        .iter()
        .find(|c| c.author == "teerre")
        .expect("teerre");
    assert!(
        by_teerre.body.contains("Very cool blog"),
        "body must be populated, got {:?}",
        by_teerre.body
    );
    // The score is the .score.unvoted title (the real value), not the
    // ±1 likes/dislikes siblings.
    assert_eq!(
        by_teerre.score,
        Some(10),
        "unvoted score, not dislikes(9)/likes(11)"
    );

    let any_empty_body = comments.iter().any(|c| c.body.is_empty());
    assert!(!any_empty_body, "no comment body should be empty");
}
#[test]
fn real_nested_comment_tree() {
    /// Height of the comment forest: 0 when empty, otherwise 1 + the deepest
    /// reply chain.
    fn depth_of(cs: &[RedditComment]) -> usize {
        let mut deepest = 0;
        for c in cs {
            deepest = deepest.max(1 + depth_of(&c.replies));
        }
        deepest
    }

    // The pandas fixture has structurally nested replies (.child > .sitetable >
    // .comment). data-depth/data-replies are absent in logged-out HTML.
    let page = fixture("pandas_34comments.html");
    let thread = try_extract_thread(
        &page,
        "https://old.reddit.com/r/programming/comments/abc123/t/",
    )
    .expect("should parse");
    let comments = &thread.comments;

    // 34 rendered comments with content + 1 [deleted] node that old.reddit
    // still shows because it has live replies = 35 nodes in the tree.
    assert_eq!(
        total_comments(comments),
        35,
        "all comments incl. nested + deleted"
    );

    let has_replies = comments.iter().any(|c| !c.replies.is_empty());
    assert!(has_replies, "at least one comment must have replies");

    let max_depth = depth_of(comments);
    assert!(max_depth >= 2, "tree should be more than one level deep");

    // The first reply found under any top-level comment sits at depth 1.
    let first_reply_depth = comments
        .iter()
        .find_map(|c| c.replies.first())
        .map(|r| r.depth);
    assert_eq!(first_reply_depth, Some(1));
}
#[test]
fn real_morechildren_stubs_skipped() {
    /// Walk the whole tree and fail on any node that looks like a leaked
    /// stub: empty body, "[deleted]" author, but carrying an id.
    fn assert_no_ghosts(cs: &[RedditComment]) {
        cs.iter().for_each(|c| {
            let ghost = c.body.is_empty() && c.author == "[deleted]" && c.id.is_some();
            assert!(!ghost, "morechildren stub leaked as comment: {:?}", c.id);
            assert_no_ghosts(&c.replies);
        });
    }

    // AskReddit deep thread: 259 .thing[data-fullname=t1_] markers, but some
    // are "load more comments" stubs (data-type=morechildren) with no
    // author/body. They must not appear as ghost comments.
    let page = fixture("askreddit_deep_morechildren.html");
    let thread = try_extract_thread(
        &page,
        "https://old.reddit.com/r/AskReddit/comments/abc123/t/",
    )
    .expect("should parse");
    assert_no_ghosts(&thread.comments);
}
#[test]
fn real_hidden_score_is_none_not_zero() {
    // The AskReddit fixture has fresh comments with `.score-hidden` (no
    // .score.unvoted span). Their score must be None, which is distinct from
    // a genuine 0-score comment.
    let page = fixture("askreddit_deep_morechildren.html");
    let thread = try_extract_thread(
        &page,
        "https://old.reddit.com/r/AskReddit/comments/abc123/t/",
    )
    .expect("should parse");

    let mut flat = Vec::new();
    collect(&thread.comments, &mut flat);
    let has_hidden = flat.iter().any(|c| c.score.is_none());
    assert!(has_hidden, "some fresh comments have hidden scores → None");
}
#[test]
fn real_deleted_comment_preserves_subtree() {
    // The pandas fixture has a [deleted] comment that still has visible
    // replies. The structural walk must keep it so its children aren't
    // orphaned.
    let page = fixture("pandas_34comments.html");
    let thread = try_extract_thread(
        &page,
        "https://old.reddit.com/r/programming/comments/abc123/t/",
    )
    .expect("should parse");

    let mut flat = Vec::new();
    collect(&thread.comments, &mut flat);
    let deleted = flat
        .into_iter()
        .filter(|c| c.author == "[deleted]")
        .collect::<Vec<_>>();

    assert!(!deleted.is_empty(), "should keep deleted comments");
    let keeps_subtree = deleted.iter().any(|c| !c.replies.is_empty());
    assert!(
        keeps_subtree,
        "a deleted comment with replies must retain its subtree"
    );
    // No deleted comment may be flagged as the original poster.
    assert!(deleted.iter().all(|c| !c.is_op));
}
#[test]
fn real_markdown_is_commonmark_clean() {
    // Guards the markdown bugs the verification workflow found: no
    // whitespace-only "blank" lines, and ``` fences never indented 4+ spaces
    // (which would turn them into literal indented code blocks).
    let page = fixture("elixir_60comments.html");
    let extraction = try_extract(
        &page,
        "https://old.reddit.com/r/programming/comments/abc123/t/",
    )
    .expect("should extract");
    let markdown = &extraction.content.markdown;

    assert!(markdown.starts_with("# "));
    assert!(markdown.contains("## Comments"));

    for line in markdown.lines() {
        // A line that starts with a space but holds nothing else.
        let whitespace_only = line.starts_with(' ') && line.trim().is_empty();
        assert!(!whitespace_only, "whitespace-only line: {line:?}");

        // Look past any blockquote markers / spaces to spot a fence.
        let is_fence = line.trim_start_matches(['>', ' ']).starts_with("```");
        if is_fence {
            let indent = line.chars().take_while(|c| *c == ' ').count();
            assert!(indent < 4, "code fence indented {indent} spaces: {line:?}");
        }
    }

    assert!(extraction.metadata.word_count > 20);
}
}

View file

@ -178,12 +178,7 @@ pub fn extract_sveltekit(html: &str) -> Vec<Value> {
/// Preserves already-quoted keys and string values.
fn js_literal_to_json(input: &str) -> String {
let bytes = input.as_bytes();
// Accumulate raw bytes, not `byte as char`. The input is valid UTF-8 and we
// only ever copy its bytes verbatim or insert ASCII quotes, so the result is
// guaranteed valid UTF-8 — copying byte-by-byte preserves multibyte
// codepoints (e.g. accented/CJK string values) instead of mangling them
// into Latin-1 mojibake.
let mut out: Vec<u8> = Vec::with_capacity(input.len() + input.len() / 10);
let mut out = String::with_capacity(input.len() + input.len() / 10);
let mut i = 0;
let len = bytes.len();
@ -192,14 +187,14 @@ fn js_literal_to_json(input: &str) -> String {
// Skip through strings
if b == b'"' {
out.push(b'"');
out.push('"');
i += 1;
while i < len {
let c = bytes[i];
out.push(c);
out.push(c as char);
i += 1;
if c == b'\\' && i < len {
out.push(bytes[i]);
out.push(bytes[i] as char);
i += 1;
} else if c == b'"' {
break;
@ -210,11 +205,11 @@ fn js_literal_to_json(input: &str) -> String {
// After { or , — look for unquoted key followed by :
if (b == b'{' || b == b',' || b == b'[') && i + 1 < len {
out.push(b);
out.push(b as char);
i += 1;
// Skip whitespace
while i < len && bytes[i].is_ascii_whitespace() {
out.push(bytes[i]);
out.push(bytes[i] as char);
i += 1;
}
// Check if next is an unquoted identifier (key)
@ -223,30 +218,29 @@ fn js_literal_to_json(input: &str) -> String {
while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
i += 1;
}
let key = &bytes[key_start..i];
let key = &input[key_start..i];
// Skip whitespace after key
while i < len && bytes[i].is_ascii_whitespace() {
i += 1;
}
// If followed by :, it's an unquoted key — quote it
if i < len && bytes[i] == b':' {
out.push(b'"');
out.extend_from_slice(key);
out.push(b'"');
out.push('"');
out.push_str(key);
out.push('"');
} else {
// Not a key — might be a bare value like true/false/null
out.extend_from_slice(key);
out.push_str(key);
}
}
continue;
}
out.push(b);
out.push(b as char);
i += 1;
}
// Safe: we only copied bytes from valid-UTF-8 `input` plus ASCII quotes.
String::from_utf8(out).unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
out
}
/// Replace raw newlines/tabs inside JSON string values with escape sequences.
@ -446,17 +440,4 @@ newline"}"#;
assert_eq!(parsed["text"], "line1\nline2");
assert_eq!(parsed["raw"], "has\nnewline");
}
#[test]
fn js_literal_to_json_preserves_multibyte_utf8() {
    // Unquoted ASCII keys with accented, CJK and emoji string values. A
    // `byte as char` conversion would turn the multibyte values into Latin-1
    // mojibake; they must round-trip intact.
    let input = r#"{name:"déjà vu", city:"東京", emoji:"🌱"}"#;
    let parsed: Value = serde_json::from_str(&js_literal_to_json(input)).unwrap();
    let expectations = [("name", "déjà vu"), ("city", "東京"), ("emoji", "🌱")];
    for (key, expected) in expectations {
        assert_eq!(parsed[key], expected);
    }
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -13,17 +13,10 @@ thiserror = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true }
async-trait = "0.1"
# Pinned to exact pre-release versions: wreq/wreq-util are release candidates
# with no semver stability between rc.N builds. An exact pin keeps `cargo build`,
# `cargo install` (which ignores Cargo.lock), and the release workflow all on the
# version that compiles.
wreq = { version = "=6.0.0-rc.29", features = ["cookies", "gzip", "brotli", "zstd", "deflate", "stream"] }
wreq-util = "=3.0.0-rc.12"
wreq = { version = "6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] }
wreq-util = "3.0.0-rc.10"
http = "1"
bytes = "1"
# Stream adapter for `wreq::Response::bytes_stream()` (wreq 6.0.0-rc.29 dropped
# `Response::chunk()`); used to buffer bodies under the running size ceiling.
futures-util = "0.3"
url = "2"
rand = "0.8"
quick-xml = { version = "0.37", features = ["serde"] }
@ -32,7 +25,6 @@ reqwest = { version = "0.12", default-features = false, features = ["json", "rus
serde_json.workspace = true
calamine = "0.34"
zip = "2"
flate2 = "1"
[dev-dependencies]
tempfile = "3"

View file

@ -12,7 +12,6 @@ use std::hash::{Hash, Hasher};
use std::sync::Arc;
use std::time::{Duration, Instant};
use futures_util::StreamExt;
use rand::seq::SliceRandom;
use tokio::sync::Semaphore;
use tracing::{debug, instrument, warn};
@ -119,7 +118,7 @@ impl Response {
/// negotiated), so a tiny compressed payload that inflates to
/// gigabytes is aborted as soon as the accumulated size crosses the
/// cap — it never gets fully buffered in memory.
async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
async fn from_wreq(mut resp: wreq::Response) -> Result<Self, FetchError> {
if let Some(len) = resp.content_length()
&& len > MAX_BODY_BYTES
{
@ -131,13 +130,12 @@ impl Response {
let url = resp.uri().to_string();
let headers = resp.headers().clone();
// wreq 6.0.0-rc.29 dropped `Response::chunk()`. Stream post-decompression
// bytes via `bytes_stream()` and keep enforcing the running ceiling so a
// compression bomb is aborted before it is fully buffered in memory.
let mut buf = bytes::BytesMut::new();
let mut stream = resp.bytes_stream();
while let Some(chunk) = stream.next().await {
let chunk = chunk.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
while let Some(chunk) = resp
.chunk()
.await
.map_err(|e| FetchError::BodyDecode(e.to_string()))?
{
check_body_ceiling(buf.len(), chunk.len())?;
buf.extend_from_slice(&chunk);
}
@ -162,6 +160,9 @@ impl Response {
fn body(&self) -> &[u8] {
&self.body
}
/// `true` when the stored status code is in the 2xx range (200–299).
fn is_success(&self) -> bool {
    matches!(self.status, 200..=299)
}
fn text(&self) -> std::borrow::Cow<'_, str> {
String::from_utf8_lossy(&self.body)
@ -170,13 +171,6 @@ impl Response {
fn into_text(self) -> String {
String::from_utf8_lossy(&self.body).into_owned()
}
/// Consume the response and return the raw, undecoded body bytes.
/// Used by [`FetchClient::fetch_raw`] for binary payloads (e.g. gzipped
/// sitemaps) that must not be run through lossy UTF-8 decoding.
fn into_body(self) -> bytes::Bytes {
self.body
}
}
/// Internal representation of the client pool strategy.
@ -305,15 +299,32 @@ impl FetchClient {
/// when you need literal no-rescue behavior (e.g. inside the rescue
/// logic itself to avoid recursion).
pub async fn fetch_smart(&self, url: &str) -> Result<FetchResult, FetchError> {
// Reddit: fetch old.reddit.com for stable server-rendered HTML.
// The JSON API is blocked; old.reddit.com works without JS or auth.
let owned;
let url = if crate::reddit::is_reddit_url(url) {
owned = crate::reddit::to_old_reddit_url(url);
owned.as_str()
} else {
url
};
// Reddit: the HTML page shows a verification interstitial for most
// client IPs, but appending `.json` returns the post + comment tree
// publicly. `parse_reddit_json` in downstream code knows how to read
// the result; here we just do the URL swap at the fetch layer.
if crate::reddit::is_reddit_url(url) && !url.ends_with(".json") {
let json_url = crate::reddit::json_url(url);
// Reddit's public .json API serves JSON to identifiable bot
// User-Agents and blocks browser UAs with a verification wall.
// Override our Chrome-profile UA for this specific call.
let ua = concat!(
"Webclaw/",
env!("CARGO_PKG_VERSION"),
" (+https://webclaw.io)"
);
if let Ok(resp) = self
.fetch_with_headers(&json_url, &[("user-agent", ua)])
.await
&& resp.status == 200
{
let first = resp.html.trim_start().as_bytes().first().copied();
if matches!(first, Some(b'{') | Some(b'[')) {
return Ok(resp);
}
}
// If the .json fetch failed or returned HTML, fall through.
}
let resp = self.fetch(url).await?;
@ -465,27 +476,6 @@ impl FetchClient {
Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
}
/// Fetch a URL and hand back the response body as raw bytes.
///
/// In contrast to [`fetch`](Self::fetch), the body is **not** passed through
/// `String::from_utf8_lossy`, so binary payloads are returned intact. This
/// matters for gzipped sitemaps (`.xml.gz`): they are served with
/// `Content-Type: application/gzip` and *no* `Content-Encoding`, so wreq does
/// not inflate them and a lossy string conversion would corrupt the gzip
/// stream. Callers check for the gzip magic (`0x1f 0x8b`) and gunzip before
/// parsing.
///
/// The URL is validated with `validate_public_http_url` before any request
/// is made. There is no retry wrapper: callers (sitemap discovery) already
/// tolerate per-URL failures by skipping.
///
/// Returns `(status, body)`.
pub async fn fetch_raw(&self, url: &str) -> Result<(u16, bytes::Bytes), FetchError> {
    let validated = crate::url_security::validate_public_http_url(url).await?;
    let target = validated.as_str();
    let wire = self.pick_client(target).get(target).send().await?;
    let buffered = Response::from_wreq(wire).await?;
    // Read the status before `into_body` consumes the response.
    let status = buffered.status();
    Ok((status, buffered.into_body()))
}
/// Fetch a URL then extract structured content.
#[instrument(skip(self), fields(url = %url))]
pub async fn fetch_and_extract(
@ -506,16 +496,23 @@ impl FetchClient {
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
let url = parsed_url.as_str();
// Reddit: rewrite to old.reddit.com for stable server-rendered HTML.
// webclaw-core's Reddit fast path then parses the thread structure.
let reddit_owned;
let url = if crate::reddit::is_reddit_url(url) {
reddit_owned = crate::reddit::to_old_reddit_url(url);
debug!("reddit: rewriting to {reddit_owned}");
reddit_owned.as_str()
} else {
url
};
// Reddit fallback: use their JSON API to get post + full comment tree.
if crate::reddit::is_reddit_url(url) {
let json_url = crate::reddit::json_url(url);
let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
debug!("reddit detected, fetching {json_url}");
let client = self.pick_client(url);
let resp = client.get(json_url.as_str()).send().await?;
let response = Response::from_wreq(resp).await?;
if response.is_success() {
let bytes = response.body();
match crate::reddit::parse_reddit_json(bytes, url) {
Ok(result) => return Ok(result),
Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
}
}
}
let start = Instant::now();
let client = self.pick_client(url);
@ -829,17 +826,11 @@ fn is_challenge_html(html: &str) -> bool {
false
}
/// Extract the homepage URL (scheme + host[:port]) from a full URL.
/// Extract the homepage URL (scheme + host) from a full URL.
fn extract_homepage(url: &str) -> Option<String> {
url::Url::parse(url).ok().map(|u| {
let host = u.host_str().unwrap_or("");
// `port()` is `Some` only for a non-default port; include it so a
// host like example.com:8443 is warmed on the right port.
match u.port() {
Some(port) => format!("{}://{}:{}/", u.scheme(), host, port),
None => format!("{}://{}/", u.scheme(), host),
}
})
url::Url::parse(url)
.ok()
.map(|u| format!("{}://{}/", u.scheme(), u.host_str().unwrap_or("")))
}
/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.

View file

@ -810,18 +810,13 @@ mod tests {
// --- CloudClient construction ------------------------------------------
// `WEBCLAW_API_KEY` is process-global; cargo runs tests in parallel
// threads. Without serialization, a test that sets the var can race a
// test asserting it is absent. This lock makes the env-mutating
// CloudClient tests mutually exclusive (poison-tolerant: a panicking
// test must not wedge the others).
static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
#[test]
fn cloud_client_explicit_key_wins_over_env() {
let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
// SAFETY: env mutation is serialized by ENV_LOCK; set_var/remove_var
// are unsafe on the 2024 toolchain. Explicit key must beat the env.
// SAFETY: this test mutates process env. Serial tests only.
// Set env to something, pass an explicit key, explicit should win.
// (We don't actually *call* the API, just check the struct stored
// the right key.)
// rustc std::env::set_var is unsafe in newer toolchains.
unsafe {
std::env::set_var("WEBCLAW_API_KEY", "from-env");
}
@ -834,9 +829,6 @@ mod tests {
#[test]
fn cloud_client_none_when_empty() {
let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
// SAFETY: env mutation serialized by ENV_LOCK. Clearing the var
// (incl. any ambient runner value) is what makes this deterministic.
unsafe {
std::env::remove_var("WEBCLAW_API_KEY");
}

View file

@ -528,7 +528,7 @@ impl Crawler {
}
/// Canonical origin string for comparing same-origin: "scheme://host[:port]".
pub(crate) fn origin_key(url: &Url) -> String {
fn origin_key(url: &Url) -> String {
let port_suffix = match url.port() {
Some(p) => format!(":{p}"),
None => String::new(),
@ -563,7 +563,7 @@ fn root_domain(url: &Url) -> String {
/// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
/// lowercase scheme + host. Preserves query params and path case.
pub(crate) fn normalize(url: &Url) -> String {
fn normalize(url: &Url) -> String {
let scheme = url.scheme();
let host = url.host_str().unwrap_or("").to_ascii_lowercase();
let port_suffix = match url.port() {

View file

@ -33,7 +33,6 @@ use serde_json::{Value, json};
use url::Url;
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -116,25 +115,23 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro
/// without carrying webclaw_fetch types.
pub fn parse(html: &str, url: &str, asin: &str) -> Value {
let jsonld = find_product_jsonld(html);
// Single scan for the og:* fallbacks read below.
let og_meta = parse_og(html);
// Three-tier title: JSON-LD `name` > Amazon's `#productTitle` span
// (only present on real static HTML) > cloud-synthesized og:title.
let title = jsonld
.as_ref()
.and_then(|v| get_text(v, "name"))
.or_else(|| dom_title(html))
.or_else(|| og_meta.unescaped("title"));
.or_else(|| og(html, "title"));
let image = jsonld
.as_ref()
.and_then(get_first_image)
.or_else(|| dom_image(html))
.or_else(|| og_meta.unescaped("image"));
.or_else(|| og(html, "image"));
let brand = jsonld.as_ref().and_then(get_brand);
let description = jsonld
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| og_meta.unescaped("description"));
.or_else(|| og(html, "description"));
let aggregate_rating = jsonld.as_ref().and_then(get_aggregate_rating);
let offer = jsonld.as_ref().and_then(first_offer);
@ -339,6 +336,31 @@ fn dom_image(html: &str) -> Option<String> {
.map(|m| m.as_str().to_string())
}
/// Look up `<meta property="og:{prop}" content="...">` and return its
/// content with the common HTML entities decoded.
///
/// Cloud-synthesized HTML ships these tags even when JSON-LD and the
/// Amazon DOM ids are both absent, so this is the last fallback for
/// `title`, `image` and `description`. The first tag whose property suffix
/// equals `prop` (compared case-sensitively) wins.
fn og(html: &str, prop: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let pattern = RE.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    pattern
        .captures_iter(html)
        .find(|caps| caps.get(1).is_some_and(|name| name.as_str() == prop))
        .and_then(|caps| caps.get(2))
        .map(|content| html_unescape(content.as_str()))
}
/// Undo the synthesize_html attribute escaping for the few entities it
/// emits (`&quot;`, `&lt;`, `&gt;`, `&amp;`). Keeps us off a heavier
/// HTML-entity dep.
///
/// `&amp;` is decoded last on purpose. Decoding it first would turn an
/// escaped entity such as `&amp;lt;` into `&lt;` and then into `<`, i.e.
/// decode the text twice.
fn html_unescape(s: &str) -> String {
    s.replace("&quot;", "\"")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&amp;", "&")
}
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
FetchError::Build(e.to_string())
}
@ -455,7 +477,7 @@ mod tests {
fn og_unescape_handles_quot_entity() {
let html = r#"<meta property="og:title" content="Apple &quot;M2 Pro&quot; Laptop">"#;
assert_eq!(
parse_og(html).unescaped("title").as_deref(),
og(html, "title").as_deref(),
Some(r#"Apple "M2 Pro" Laptop"#)
);
}

View file

@ -15,7 +15,6 @@ use serde_json::{Value, json};
use url::Url;
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -66,21 +65,19 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro
pub fn parse(html: &str, url: &str, item_id: &str) -> Value {
let jsonld = find_product_jsonld(html);
// Single scan for the three og:* fields read as fallbacks below.
let og_meta = parse_og(html);
let title = jsonld
.as_ref()
.and_then(|v| get_text(v, "name"))
.or_else(|| og_meta.raw("title"));
.or_else(|| og(html, "title"));
let image = jsonld
.as_ref()
.and_then(get_first_image)
.or_else(|| og_meta.raw("image"));
.or_else(|| og(html, "image"));
let brand = jsonld.as_ref().and_then(get_brand);
let description = jsonld
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| og_meta.raw("description"));
.or_else(|| og(html, "description"));
let offer = jsonld.as_ref().and_then(first_offer);
// eBay's AggregateOffer uses lowPrice/highPrice. Offer uses price.
@ -271,6 +268,19 @@ fn get_aggregate_rating(v: &Value) -> Option<Value> {
}))
}
/// Return the raw `content` of the first `<meta property="og:{prop}" ...>`
/// tag in `html`, or `None` when no such tag is found. The property suffix
/// is compared to `prop` case-sensitively.
fn og(html: &str, prop: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let pattern = RE.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    pattern
        .captures_iter(html)
        .find(|caps| caps.get(1).is_some_and(|name| name.as_str() == prop))
        .and_then(|caps| caps.get(2))
        .map(|content| content.as_str().to_string())
}
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
FetchError::Build(e.to_string())
}

View file

@ -42,7 +42,6 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::{og, parse_og};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -143,17 +142,15 @@ fn build_jsonld_payload(product: &Value, html: &str, url: &str) -> Value {
/// Build a minimal payload from OG / product meta tags. Used when a
/// page has no Product JSON-LD at all.
fn build_og_payload(html: &str, url: &str) -> Value {
// Single scan for the three og:* fields this fallback reads.
let og_meta = parse_og(html);
let offers = build_og_offer(html).map(|o| vec![o]).unwrap_or_default();
let image = og_meta.raw("image");
let image = og(html, "image");
let images: Vec<Value> = image.map(|i| vec![Value::String(i)]).unwrap_or_default();
json!({
"url": url,
"data_source": "og_fallback",
"name": og_meta.raw("title"),
"description": og_meta.raw("description"),
"name": og(html, "title"),
"description": og(html, "description"),
"brand": meta_property(html, "product:brand"),
"sku": None::<String>,
"mpn": None::<String>,
@ -371,6 +368,20 @@ fn build_og_offer(html: &str) -> Option<Value> {
}))
}
/// Pull the value of `<meta property="og:{prop}" content="...">`.
///
/// Returns the raw content of the first matching tag, or `None` when the
/// page has no such tag. The property suffix is compared to `prop`
/// case-sensitively.
fn og(html: &str, prop: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let pattern = RE.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    pattern
        .captures_iter(html)
        .find(|caps| caps.get(1).is_some_and(|name| name.as_str() == prop))
        .and_then(|caps| caps.get(2))
        .map(|content| content.as_str().to_string())
}
/// Pull the value of any `<meta property="..." content="...">` tag.
/// Needed for namespaced OG variants like `product:price:amount` that
/// the simple `og:*` matcher above doesn't cover.

View file

@ -26,7 +26,6 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -75,26 +74,19 @@ pub fn parse(html: &str, url: &str, listing_id: &str) -> Value {
let jsonld = find_product_jsonld(html);
let slug_title = humanise_slug(parse_slug(url).as_deref());
// Single scan for the three og:* fields used as fallbacks below.
let og_meta = parse_og(html);
let title = jsonld
.as_ref()
.and_then(|v| get_text(v, "name"))
.or_else(|| og_meta.raw("title").filter(|t| !is_generic_title(t)))
.or_else(|| og(html, "title").filter(|t| !is_generic_title(t)))
.or(slug_title);
let description = jsonld
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| {
og_meta
.raw("description")
.filter(|d| !is_generic_description(d))
});
.or_else(|| og(html, "description").filter(|d| !is_generic_description(d)));
let image = jsonld
.as_ref()
.and_then(get_first_image)
.or_else(|| og_meta.raw("image"));
.or_else(|| og(html, "image"));
let brand = jsonld.as_ref().and_then(get_brand);
// Etsy listings often ship either a single Offer or an
@ -367,6 +359,19 @@ fn strip_schema_prefix(s: String) -> String {
.replace("https://schema.org/", "")
}
/// Return the raw `content` of the first `<meta property="og:{prop}" ...>`
/// tag in `html`, or `None` when no such tag is found. The property suffix
/// is compared to `prop` case-sensitively.
fn og(html: &str, prop: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let pattern = RE.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    pattern
        .captures_iter(html)
        .find(|caps| caps.get(1).is_some_and(|name| name.as_str() == prop))
        .and_then(|caps| caps.get(2))
        .map(|content| content.as_str().to_string())
}
/// Etsy links the owning shop with a canonical anchor like
/// `<a href="/shop/ShopName" ...>`. Grab the first one after the
/// breadcrumb boundary.

View file

@ -33,7 +33,6 @@ pub mod instagram_post;
pub mod instagram_profile;
pub mod linkedin_post;
pub mod npm;
pub(crate) mod og;
pub mod pypi;
pub mod reddit;
pub mod shopify_collection;

View file

@ -1,79 +0,0 @@
//! Shared Open Graph (`og:*`) meta-tag parsing for the HTML vertical
//! extractors.
//!
//! Several site extractors read a handful of `og:*` properties (title,
//! description, image, ...) from the page `<head>`. Each used to carry a
//! verbatim copy of the same regex + scan helper. This module centralises
//! that logic and adds [`parse_og`], which collects every `og:*` pair in a
//! single `captures_iter` pass so an extractor that needs multiple fields
//! scans the document once instead of once per field.
//!
//! Values are stored raw. Callers that need HTML entity decoding apply
//! [`html_unescape`] themselves — some extractors intentionally keep the
//! raw value, so decoding is opt-in per call site to preserve output.
use std::collections::HashMap;
use std::sync::OnceLock;
use regex::Regex;
/// Lazily compiled, process-wide regex for `og:*` meta tags.
///
/// Matches `<meta property="og:<name>" content="<value>">`, case-insensitive.
/// Capture 1 is the property suffix (after `og:`), capture 2 is the content.
fn og_regex() -> &'static Regex {
    static RE: OnceLock<Regex> = OnceLock::new();
    let compile = || {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    };
    RE.get_or_init(compile)
}
/// Return the raw content of the first `og:<prop>` meta tag, if present.
///
/// Each call scans the document once. Extractors that read several
/// properties should prefer [`parse_og`], which collects them all in one scan.
pub(crate) fn og(html: &str, prop: &str) -> Option<String> {
    og_regex()
        .captures_iter(html)
        .find(|caps| caps.get(1).is_some_and(|name| name.as_str() == prop))
        .and_then(|caps| caps.get(2))
        .map(|content| content.as_str().to_string())
}
/// Collect every `og:*` meta tag of `html` into a `suffix -> content` map
/// using a single `captures_iter` pass.
///
/// When a property occurs more than once, the first occurrence is kept,
/// which matches what [`og`] returns for the same property. Values are
/// stored raw (not entity-decoded); read them through [`OgMeta::raw`] or
/// [`OgMeta::unescaped`].
pub(crate) fn parse_og(html: &str) -> OgMeta {
    let mut first_seen: HashMap<String, String> = HashMap::new();
    for caps in og_regex().captures_iter(html) {
        let (Some(name), Some(content)) = (caps.get(1), caps.get(2)) else {
            continue;
        };
        let key = name.as_str();
        // Later duplicates must not overwrite the first value.
        if !first_seen.contains_key(key) {
            first_seen.insert(key.to_string(), content.as_str().to_string());
        }
    }
    OgMeta(first_seen)
}
/// The `og:*` properties harvested from one document scan, keyed by the
/// property suffix (the part after `og:`).
pub(crate) struct OgMeta(HashMap<String, String>);

impl OgMeta {
    /// Content of `og:<prop>` exactly as it was written in the HTML, or
    /// `None` if the property was not present.
    pub(crate) fn raw(&self, prop: &str) -> Option<String> {
        let stored = self.0.get(prop)?;
        Some(stored.clone())
    }

    /// Content of `og:<prop>` after passing it through [`html_unescape`],
    /// or `None` if the property was not present.
    pub(crate) fn unescaped(&self, prop: &str) -> Option<String> {
        let stored = self.0.get(prop)?;
        Some(html_unescape(stored))
    }
}
/// Decode the small set of HTML entities that show up in `og:*` content:
/// `&quot;`, `&lt;`, `&gt;` and `&amp;`.
///
/// `&amp;` is decoded last on purpose. Decoding it earlier would turn an
/// escaped entity such as `&amp;lt;` into `&lt;` and then into `<`, i.e.
/// decode the input twice. Unknown entities are left untouched.
pub(crate) fn html_unescape(s: &str) -> String {
    s.replace("&quot;", "\"")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        // Must stay last so its output is never re-interpreted as an entity.
        .replace("&amp;", "&")
}

View file

@ -1,10 +1,12 @@
//! Reddit structured extractor — parses old.reddit.com HTML.
//! Reddit structured extractor — returns the full post + comment tree
//! as typed JSON via Reddit's `.json` API.
//!
//! Fetches old.reddit.com (stable server-rendered HTML, no JS required)
//! and delegates parsing to `webclaw_core::reddit`. Returns a typed JSON
//! value with `{ url, post, comments }` structure.
//! The same trick the markdown extractor in `crate::reddit` uses:
//! appending `.json` to any post URL returns the data the new SPA
//! frontend would load client-side. Zero antibot, zero JS rendering.
use serde_json::Value;
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::error::FetchError;
@ -22,27 +24,182 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
};
pub fn matches(url: &str) -> bool {
webclaw_core::reddit::is_reddit_url(url) && url.contains("/comments/")
let host = host_of(url);
let is_reddit_host = matches!(
host,
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
);
is_reddit_host && url.contains("/comments/")
}
pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchError> {
let fetch_url = crate::reddit::to_old_reddit_url(url);
let resp = client.fetch(&fetch_url).await?;
let json_url = build_json_url(url);
let resp = client.fetch(&json_url).await?;
if resp.status != 200 {
return Err(FetchError::Build(format!(
"reddit: unexpected status {}",
"reddit api returned status {}",
resp.status
)));
}
let thread = webclaw_core::reddit::try_extract_thread(&resp.html, url).ok_or_else(|| {
FetchError::BodyDecode(
"reddit: page structure not recognised — is this a thread URL?".into(),
)
})?;
let listings: Vec<Listing> = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?;
serde_json::to_value(&thread)
.map_err(|e| FetchError::BodyDecode(format!("reddit: serialisation error: {e}")))
if listings.is_empty() {
return Err(FetchError::BodyDecode("reddit response empty".into()));
}
// First listing = the post (single t3 child).
let post = listings
.first()
.and_then(|l| l.data.children.first())
.filter(|t| t.kind == "t3")
.map(|t| post_json(&t.data))
.unwrap_or(Value::Null);
// Second listing = the comment tree.
let comments: Vec<Value> = listings
.get(1)
.map(|l| l.data.children.iter().filter_map(comment_json).collect())
.unwrap_or_default();
Ok(json!({
"url": url,
"post": post,
"comments": comments,
}))
}
// ---------------------------------------------------------------------------
// JSON shapers
// ---------------------------------------------------------------------------
/// Shape the fields of a post (`t3`) thing into the JSON object returned to
/// callers. Fields that were missing from the response serialise as `null`.
/// A relative `permalink` is turned into an absolute `https://www.reddit.com`
/// URL.
fn post_json(d: &ThingData) -> Value {
    let absolute_permalink = d
        .permalink
        .as_ref()
        .map(|p| format!("https://www.reddit.com{p}"));

    json!({
        "id": d.id,
        "title": d.title,
        "author": d.author,
        "subreddit": d.subreddit_name_prefixed,
        "permalink": absolute_permalink,
        "url": d.url_overridden_by_dest,
        "is_self": d.is_self,
        "selftext": d.selftext,
        "score": d.score,
        "upvote_ratio": d.upvote_ratio,
        "num_comments": d.num_comments,
        "created_utc": d.created_utc,
        "link_flair_text": d.link_flair_text,
        "over_18": d.over_18,
        "spoiler": d.spoiler,
        "stickied": d.stickied,
        "locked": d.locked,
    })
}
/// Convert one comment thing and, recursively, its reply tree into JSON.
///
/// Only things of kind `t1` are rendered; any other kind (for example the
/// trailing `more` placeholder Reddit injects at depth limits) yields
/// `None` and is therefore dropped by callers that use `filter_map`.
fn comment_json(thing: &Thing) -> Option<Value> {
    match thing.kind.as_str() {
        "t1" => {}
        _ => return None,
    }

    let d = &thing.data;

    // `replies` is either a nested listing or something else (absent / the
    // string form); anything that is not a listing means "no replies".
    let replies: Vec<Value> = if let Some(Replies::Listing(nested)) = &d.replies {
        nested
            .data
            .children
            .iter()
            .filter_map(comment_json)
            .collect()
    } else {
        Vec::new()
    };

    let absolute_permalink = d
        .permalink
        .as_ref()
        .map(|p| format!("https://www.reddit.com{p}"));

    Some(json!({
        "id": d.id,
        "author": d.author,
        "body": d.body,
        "score": d.score,
        "created_utc": d.created_utc,
        "is_submitter": d.is_submitter,
        "stickied": d.stickied,
        "depth": d.depth,
        "permalink": absolute_permalink,
        "replies": replies,
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Extract the authority part of `url`: the text after the first `://`
/// (or the whole input when there is no scheme separator) up to, but not
/// including, the first `/`. No lower-casing and no port / userinfo
/// stripping is performed.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    let _scheme = pieces.next();
    let after_scheme = pieces.next().unwrap_or(url);

    match after_scheme.find('/') {
        Some(slash) => &after_scheme[..slash],
        None => after_scheme,
    }
}
/// Build the Reddit JSON URL for a post URL.
///
/// The query string and the fragment are dropped, trailing slashes are
/// trimmed, and `.json?raw_json=1` is appended. Dropping the fragment
/// matters: if it were kept, the `.json` suffix would land inside the
/// fragment and the request would target the HTML page instead of the
/// JSON endpoint.
///
/// We keep the original host (`www.reddit.com` or `old.reddit.com` as the
/// caller gave us). Routing through `old.reddit.com` unconditionally looks
/// appealing but that host has stricter UA-based blocking than
/// `www.reddit.com`, while the main host accepts our Chrome-fingerprinted
/// client fine.
fn build_json_url(url: &str) -> String {
    // Cut at whichever of `?` / `#` comes first; both end the path.
    let path_end = url
        .find(|c: char| matches!(c, '?' | '#'))
        .unwrap_or(url.len());
    let clean = url[..path_end].trim_end_matches('/');
    format!("{clean}.json?raw_json=1")
}
// ---------------------------------------------------------------------------
// Reddit JSON types — only fields we render. Everything else is dropped.
// ---------------------------------------------------------------------------
/// One listing object of the response. The response body is deserialised as
/// a `Vec<Listing>`: the first listing is read for the post, the second for
/// the comment tree.
#[derive(Deserialize)]
struct Listing {
data: ListingData,
}
/// Payload of a [`Listing`]: the things it contains, in response order.
#[derive(Deserialize)]
struct ListingData {
children: Vec<Thing>,
}
/// A single typed item inside a listing.
#[derive(Deserialize)]
struct Thing {
// Type tag of the item. This module renders `"t3"` (post) and `"t1"`
// (comment); every other kind is ignored.
kind: String,
data: ThingData,
}
/// Union of the post and comment fields this module renders. Every field is
/// optional so one struct can back both kinds of thing.
///
/// The section markers below are a rough grouping only: `id` and `stickied`
/// are also read when rendering comments, and `author`, `score`,
/// `created_utc` and `permalink` are also read when rendering posts.
#[derive(Deserialize, Default)]
struct ThingData {
// post (t3)
id: Option<String>,
title: Option<String>,
selftext: Option<String>,
subreddit_name_prefixed: Option<String>,
url_overridden_by_dest: Option<String>,
is_self: Option<bool>,
upvote_ratio: Option<f64>,
num_comments: Option<i64>,
over_18: Option<bool>,
spoiler: Option<bool>,
stickied: Option<bool>,
locked: Option<bool>,
link_flair_text: Option<String>,
// comment (t1)
author: Option<String>,
body: Option<String>,
score: Option<i64>,
created_utc: Option<f64>,
is_submitter: Option<bool>,
depth: Option<i64>,
// Site-relative path; the JSON shapers prefix it with
// `https://www.reddit.com`.
permalink: Option<String>,
// recursive
replies: Option<Replies>,
}
/// The `replies` field of a comment, which can take two JSON shapes.
///
/// Being `untagged`, serde tries the variants in declaration order: first a
/// nested [`Listing`], then a plain string. Only the `Listing` variant is
/// ever read; the string variant exists so deserialisation does not fail
/// (presumably Reddit sends `""` for "no replies" — TODO confirm).
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
Listing(Listing),
// The payload is never inspected, hence the lint allowance.
#[allow(dead_code)]
Empty(String),
}
#[cfg(test)]
@ -50,17 +207,28 @@ mod tests {
use super::*;
#[test]
fn matches_thread_urls() {
fn matches_reddit_post_urls() {
assert!(matches(
"https://www.reddit.com/r/rust/comments/abc123/some_title/"
));
assert!(matches(
"https://reddit.com/r/rust/comments/abc123/some_title"
));
assert!(matches("https://old.reddit.com/r/rust/comments/abc123/x/"));
assert!(matches("https://reddit.com/r/rust/comments/abc/x"));
}
#[test]
fn rejects_listing_and_non_reddit() {
fn rejects_non_post_reddit_urls() {
assert!(!matches("https://www.reddit.com/r/rust"));
assert!(!matches("https://example.com/r/rust/comments/abc/x"));
assert!(!matches("https://www.reddit.com/user/foo"));
assert!(!matches("https://example.com/r/rust/comments/x"));
}
#[test]
fn json_url_appends_suffix_and_drops_query() {
assert_eq!(
build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"),
"https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
);
}
}

View file

@ -28,7 +28,6 @@ use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -182,27 +181,24 @@ async fn html_fallback(
pub fn parse_html(html: &str, url: &str, api_url: &str, slug: &str) -> Value {
let article = find_article_jsonld(html);
// Single scan for the four og:* fields read as fallbacks below.
let og_meta = parse_og(html);
let title = article
.as_ref()
.and_then(|v| get_text(v, "headline"))
.or_else(|| og_meta.raw("title"));
.or_else(|| og(html, "title"));
let description = article
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| og_meta.raw("description"));
.or_else(|| og(html, "description"));
let cover_image = article
.as_ref()
.and_then(get_first_image)
.or_else(|| og_meta.raw("image"));
.or_else(|| og(html, "image"));
let post_date = article
.as_ref()
.and_then(|v| get_text(v, "datePublished"))
.or_else(|| meta_property(html, "article:published_time"));
let updated_at = article.as_ref().and_then(|v| get_text(v, "dateModified"));
let publication_name = og_meta.raw("site_name");
let publication_name = og(html, "site_name");
let authors = article.as_ref().map(extract_authors).unwrap_or_default();
json!({
@ -306,6 +302,19 @@ fn handle_from_author_url(u: &str) -> Option<String> {
// HTML tag helpers
// ---------------------------------------------------------------------------
/// Return the raw `content` of the first `<meta property="og:<prop>" ...>`
/// tag in `html`, or `None` if there is none.
///
/// The cached pattern requires `property` to precede `content` within the
/// tag and a non-empty content value. The comparison against `prop` is
/// exact.
fn og(html: &str, prop: &str) -> Option<String> {
    static OG_META: OnceLock<Regex> = OnceLock::new();
    let pattern = OG_META.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    pattern
        .captures_iter(html)
        .find(|caps| caps.get(1).is_some_and(|name| name.as_str() == prop))
        .and_then(|caps| caps.get(2).map(|value| value.as_str().to_string()))
}
/// Pull `<meta property="article:published_time" content="...">` and
/// similar structured meta tags.
fn meta_property(html: &str, prop: &str) -> Option<String> {

View file

@ -32,7 +32,6 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -88,17 +87,11 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {
// The aiSummary block: not typed (no `@type`), detect by key.
let ai_block = find_ai_summary_block(&blocks);
// Single scan of the page's og:* meta tags; title + description feed
// the regex fallbacks below.
let og_meta = parse_og(html);
let og_title = og_meta.unescaped("title");
let og_description = og_meta.unescaped("description");
// Business name: Dataset > metadata.title regex > URL domain.
let business_name = dataset
.as_ref()
.and_then(|d| get_string(d, "name"))
.or_else(|| parse_name_from_og_title(og_title.as_deref()))
.or_else(|| parse_name_from_og_title(html))
.or_else(|| Some(domain.clone()));
// Rating distribution from the csvw:Table columns. Each column has
@ -112,8 +105,8 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {
// Page-title / page-description fallbacks. OG title format:
// "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
let (rating_label, rating_from_og) = parse_rating_from_og_title(og_title.as_deref());
let total_from_desc = parse_review_count_from_og_description(og_description.as_deref());
let (rating_label, rating_from_og) = parse_rating_from_og_title(html);
let total_from_desc = parse_review_count_from_og_description(html);
// Recent reviews carried by the aiSummary block.
let recent_reviews: Vec<Value> = ai_block
@ -343,21 +336,20 @@ fn compute_rating_stats(distribution: &Value) -> (Option<String>, Option<i64>) {
/// Regex out the business name from the standard Trustpilot OG title
/// shape: `"{name} is rated \"{label}\" with {rating} / 5 on Trustpilot"`.
/// `title` is the (entity-decoded) `og:title` content.
fn parse_name_from_og_title(title: Option<&str>) -> Option<String> {
let title = title?;
fn parse_name_from_og_title(html: &str) -> Option<String> {
let title = og(html, "title")?;
// "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| Regex::new(r"^(.+?)\s+is rated\b").unwrap());
re.captures(title)
re.captures(&title)
.and_then(|c| c.get(1))
.map(|m| m.as_str().to_string())
}
/// Pull the rating label (e.g. "Bad", "Excellent") and numeric value
/// from the (entity-decoded) `og:title` content.
fn parse_rating_from_og_title(title: Option<&str>) -> (Option<String>, Option<String>) {
let Some(title) = title else {
/// from the OG title.
fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
let Some(title) = og(html, "title") else {
return (None, None);
};
static RE: OnceLock<Regex> = OnceLock::new();
@ -365,7 +357,7 @@ fn parse_rating_from_og_title(title: Option<&str>) -> (Option<String>, Option<St
let re = RE.get_or_init(|| {
Regex::new(r#"is rated\s*[\\"]+([^"\\]+)[\\"]+\s*with\s*([\d.]+)\s*/\s*5"#).unwrap()
});
let Some(caps) = re.captures(title) else {
let Some(caps) = re.captures(&title) else {
return (None, None);
};
(
@ -374,13 +366,13 @@ fn parse_rating_from_og_title(title: Option<&str>) -> (Option<String>, Option<St
)
}
/// Parse "hear what 226 customers have already said" from the
/// (entity-decoded) `og:description` content.
fn parse_review_count_from_og_description(desc: Option<&str>) -> Option<i64> {
let desc = desc?;
/// Parse "hear what 226 customers have already said" from the OG
/// description tag.
fn parse_review_count_from_og_description(html: &str) -> Option<i64> {
let desc = og(html, "description")?;
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| Regex::new(r"(\d[\d,]*)\s+customers").unwrap());
re.captures(desc)?
re.captures(&desc)?
.get(1)?
.as_str()
.replace(',', "")
@ -388,6 +380,29 @@ fn parse_review_count_from_og_description(desc: Option<&str>) -> Option<i64> {
.ok()
}
/// Return the content of the first `<meta property="og:<prop>" ...>` tag in
/// `html`, passed through [`html_unescape`], or `None` if the property is
/// not present.
///
/// The cached pattern requires `property` to precede `content` within the
/// tag and a non-empty content value. The comparison against `prop` is
/// exact.
fn og(html: &str, prop: &str) -> Option<String> {
    static OG_META: OnceLock<Regex> = OnceLock::new();
    let pattern = OG_META.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    pattern
        .captures_iter(html)
        .find(|caps| caps.get(1).is_some_and(|name| name.as_str() == prop))
        .and_then(|caps| caps.get(2))
        .map(|value| html_unescape(value.as_str()))
}
/// Minimal HTML entity unescaping for the four entities handled here:
/// `&quot;`, `&lt;`, `&gt;` and `&amp;` (the ones the synthesize_html
/// escaper might produce). Keeps us off a heavier dep.
///
/// `&amp;` is decoded last on purpose. Decoding it earlier would turn an
/// escaped entity such as `&amp;lt;` into `&lt;` and then into `<`, i.e.
/// decode the input twice. Unknown entities are left untouched.
fn html_unescape(s: &str) -> String {
    s.replace("&quot;", "\"")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        // Must stay last so its output is never re-interpreted as an entity.
        .replace("&amp;", "&")
}
/// Fetch `key` from a JSON value and return it as an owned string. Yields
/// `None` when the key is missing or its value is not a JSON string.
fn get_string(v: &Value, key: &str) -> Option<String> {
    let text = v.get(key)?.as_str()?;
    Some(text.to_owned())
}
@ -473,12 +488,8 @@ mod tests {
#[test]
fn parse_og_title_extracts_name_and_rating() {
let html = r#"<meta property="og:title" content="Anthropic is rated &quot;Bad&quot; with 1.5 / 5 on Trustpilot">"#;
let title = parse_og(html).unescaped("title");
assert_eq!(
parse_name_from_og_title(title.as_deref()),
Some("Anthropic".into())
);
let (label, rating) = parse_rating_from_og_title(title.as_deref());
assert_eq!(parse_name_from_og_title(html), Some("Anthropic".into()));
let (label, rating) = parse_rating_from_og_title(html);
assert_eq!(label.as_deref(), Some("Bad"));
assert_eq!(rating.as_deref(), Some("1.5"));
}
@ -486,11 +497,7 @@ mod tests {
#[test]
fn parse_review_count_from_og_description_picks_number() {
let html = r#"<meta property="og:description" content="Do you agree? Voice your opinion today and hear what 226 customers have already said.">"#;
let desc = parse_og(html).unescaped("description");
assert_eq!(
parse_review_count_from_og_description(desc.as_deref()),
Some(226)
);
assert_eq!(parse_review_count_from_og_description(html), Some(226));
}
#[test]

View file

@ -25,7 +25,6 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -144,11 +143,9 @@ fn build_player_payload(
// ---------------------------------------------------------------------------
fn build_og_fallback(html: &str, url: &str, canonical: &str, video_id: &str) -> Value {
// Single scan for the three og:* fields read below.
let og_meta = parse_og(html);
let title = og_meta.raw("title");
let description = og_meta.raw("description");
let thumbnail = og_meta.raw("image");
let title = og(html, "title");
let description = og(html, "description");
let thumbnail = og(html, "image");
// YouTube sets `<meta name="channel_name" ...>` on some pages but
// OG-only pages reliably carry `og:video:tag` and the channel in
// `<link itemprop="name">`. We keep this lean: just what's stable.
@ -251,6 +248,19 @@ fn extract_player_response(html: &str) -> Option<Value> {
// Meta-tag helpers (for OG fallback)
// ---------------------------------------------------------------------------
/// Return the raw `content` of the first `<meta property="og:<prop>" ...>`
/// tag in `html`, or `None` if there is none.
///
/// The cached pattern requires `property` to precede `content` within the
/// tag and a non-empty content value. The comparison against `prop` is
/// exact.
fn og(html: &str, prop: &str) -> Option<String> {
    static OG_META: OnceLock<Regex> = OnceLock::new();
    let pattern = OG_META.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    pattern
        .captures_iter(html)
        .find(|caps| caps.get(1).is_some_and(|name| name.as_str() == prop))
        .and_then(|caps| caps.get(2).map(|value| value.as_str().to_string()))
}
fn meta_name(html: &str, name: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {

View file

@ -11,11 +11,8 @@ pub mod extractors;
pub mod fetcher;
pub mod linkedin;
pub mod locale;
pub mod map;
pub mod progress;
pub mod proxy;
pub mod reddit;
pub mod search;
pub mod sitemap;
pub mod tls;
pub mod url_security;
@ -27,9 +24,6 @@ pub use error::FetchError;
pub use fetcher::Fetcher;
pub use http::HeaderMap;
pub use locale::{accept_language_for_tld, accept_language_for_url};
pub use map::{MapOptions, discover_urls};
pub use progress::{PROGRESS_INTERVAL, with_progress};
pub use proxy::{parse_proxy_file, parse_proxy_line};
pub use search::{SearchOptions, SearchResult, parse_serper_organic, search};
pub use sitemap::SitemapEntry;
pub use webclaw_pdf::PdfMode;

View file

@ -1,326 +0,0 @@
//! Layered URL discovery for the `map` command.
//!
//! `sitemap::discover` only finds URLs a site explicitly advertises in its
//! `sitemap.xml`. Plenty of sites have no sitemap (news.ycombinator.com), a
//! stale one, or a thin one that lists a handful of section roots. For those,
//! a sitemap-only map returns almost nothing.
//!
//! This module adds a second layer: when the sitemap yields fewer than a
//! threshold of URLs, run a *bounded* same-origin crawl and harvest every URL
//! it touches — fetched pages, the visited set, **and** the remaining frontier
//! (links queued but never fetched because the page cap was hit). That last
//! bucket is the gold: a 150-page crawl of a link-dense site surfaces several
//! thousand frontier URLs, turning a useless map into a real one.
//!
//! Strategy (layered, sitemap-first):
//! 1. Sitemaps via [`sitemap::discover`] — authoritative, carries metadata
//! (lastmod / priority / changefreq).
//! 2. If sitemaps are thin (`< min_sitemap_urls`) and the fallback is enabled,
//! a bounded crawl fills in the rest. Crawl-discovered URLs carry no
//! metadata (`None` everywhere) since they come from link harvesting, not a
//! sitemap.
//!
//! Sitemap entries always come first in the returned vec; crawl-discovered
//! URLs are appended, deduplicated against the sitemap set using the *same*
//! normalization the crawler uses ([`crawler::normalize`]) so map output stays
//! internally consistent.
use std::collections::HashSet;
use std::time::Duration;
use url::Url;
use crate::client::{FetchClient, FetchConfig};
use crate::crawler::{self, CrawlConfig, Crawler};
use crate::sitemap::{self, SitemapEntry};
/// Tuning knobs for [`discover_urls`].
///
/// Start from [`MapOptions::default`] and override individual fields as
/// needed.
#[derive(Debug, Clone)]
pub struct MapOptions {
/// Hard cap on pages the fallback crawl will fetch. The crawl surfaces far
/// more URLs than this via the unfetched frontier, so a small number still
/// yields a large map while keeping the crawl fast and polite.
pub max_crawl_pages: usize,
/// How deep the fallback crawl follows links (1 = links off the seed only).
pub crawl_depth: usize,
/// Sitemap-URL count below which the crawl fallback kicks in. A site with a
/// rich sitemap (≥ this many URLs) skips the crawl entirely.
pub min_sitemap_urls: usize,
/// Master switch for the crawl fallback. When `false`, behaves exactly like
/// the old sitemap-only `discover`.
pub crawl_fallback: bool,
/// Optional cap on URLs returned. `None` (default) = uncapped: return every
/// URL discovered (the crawl is already bounded by `max_crawl_pages`, so the
/// uncapped set is the links harvested from the fetched pages). Set `Some(n)`
/// to truncate.
pub max_urls: Option<usize>,
}
/// Defaults: crawl fallback enabled, up to 150 pages at depth 2, triggered
/// when the sitemap lists fewer than 200 URLs, with no cap on the result.
impl Default for MapOptions {
fn default() -> Self {
Self {
max_crawl_pages: 150,
crawl_depth: 2,
min_sitemap_urls: 200,
crawl_fallback: true,
max_urls: None,
}
}
}
/// Discover URLs for a site using the layered strategy described in the module
/// docs: sitemaps first, then a bounded crawl fallback when the sitemap is
/// thin.
///
/// * `client` — used for the sitemap layer. NOTE(review): the fallback crawl
///   does not reuse it; the crawler is built from `FetchConfig::default()`.
/// * `base_url` — site root; also the crawl seed and the same-origin anchor.
/// * `opts` — thresholds and caps, see [`MapOptions`].
///
/// Sitemap entries come first, crawl-discovered URLs are appended after
/// dedup. `opts.max_urls`, when set, is applied on every path — including
/// when the crawl is skipped or cannot start — so the result never exceeds
/// the requested cap.
///
/// Never errors — sitemap and crawl failures are swallowed and simply yield
/// fewer URLs (an empty vec in the worst case), matching `sitemap::discover`'s
/// "absence is not an error" contract.
pub async fn discover_urls(
    client: &FetchClient,
    base_url: &str,
    opts: &MapOptions,
) -> Vec<SitemapEntry> {
    // Layer 1: sitemaps.
    let mut entries = sitemap::discover(client, base_url)
        .await
        .unwrap_or_default();

    // Layer 2: bounded crawl fallback, only when the sitemap is thin.
    if opts.crawl_fallback && entries.len() < opts.min_sitemap_urls {
        extend_with_crawl(&mut entries, base_url, opts).await;
    }

    // Uncapped by default; only truncate if the caller set an explicit limit
    // (sitemap entries added first keep priority).
    if let Some(cap) = opts.max_urls {
        entries.truncate(cap);
    }
    entries
}

/// Run the bounded same-origin crawl and append every newly discovered URL to
/// `entries`. Leaves `entries` untouched when `base_url` does not parse or the
/// crawler cannot be constructed.
async fn extend_with_crawl(entries: &mut Vec<SitemapEntry>, base_url: &str, opts: &MapOptions) {
    let Some(base_origin) = Url::parse(base_url).ok().map(|u| crawler::origin_key(&u)) else {
        // Unparseable base URL — nothing sensible to crawl against.
        return;
    };

    let config = CrawlConfig {
        fetch: FetchConfig::default(),
        max_depth: opts.crawl_depth,
        max_pages: opts.max_crawl_pages,
        // Politeness + scope: same-origin only (crawler default), modest delay.
        delay: Duration::from_millis(50),
        ..CrawlConfig::default()
    };
    let Ok(bounded_crawl) = Crawler::new(base_url, config) else {
        return;
    };
    let result = bounded_crawl.crawl(base_url, None).await;

    // Richest source first: every link harvested from each fetched page. A
    // directory/index page holds hundreds of same-origin links, and this set is
    // NOT bound by the crawler's internal frontier cap. Then the URLs the crawl
    // itself touched (fetched, visited, queued-but-unfetched frontier).
    let mut discovered: Vec<String> = Vec::new();
    for p in &result.pages {
        discovered.push(p.url.clone());
        if let Some(ex) = p.extraction.as_ref() {
            let page_base = Url::parse(&p.url).ok();
            for link in &ex.content.links {
                // Resolve relative/protocol-relative hrefs against the page URL
                // so the same-origin filter and dedup see absolute URLs.
                let abs = match &page_base {
                    Some(b) => b.join(&link.href).ok(),
                    None => Url::parse(&link.href).ok(),
                };
                if let Some(u) = abs {
                    discovered.push(u.to_string());
                }
            }
        }
    }
    discovered.extend(result.visited);
    discovered.extend(result.remaining_frontier.into_iter().map(|(url, _)| url));

    // Normalized URLs already emitted by the sitemap layer, for cross-layer
    // dedup.
    let mut seen: HashSet<String> = entries.iter().filter_map(normalize_str).collect();
    append_crawled(entries, &mut seen, discovered, &base_origin);
}
/// Parse `raw` and return it in the crawler's canonical form
/// ([`crawler::normalize`]). Yields `None` when `raw` is not a valid URL.
fn normalize_url(raw: &str) -> Option<String> {
    let parsed = Url::parse(raw).ok()?;
    Some(crawler::normalize(&parsed))
}
/// Canonical form of a [`SitemapEntry`]'s URL, used as its key in the dedup
/// set. Yields `None` when the entry's URL does not parse.
fn normalize_str(entry: &SitemapEntry) -> Option<String> {
    let raw = entry.url.as_str();
    normalize_url(raw)
}
/// Merge crawl-discovered URLs into `entries`.
///
/// A candidate is skipped when it does not parse, when its origin key differs
/// from `base_origin`, or when its normalized form is already in `seen`.
/// Survivors are pushed in input order, stored in normalized form and with
/// all metadata fields set to `None`; `seen` is updated as they are added.
///
/// Kept separate from [`discover_urls`] so the union / dedup / same-origin
/// logic can be unit-tested without touching the network.
fn append_crawled(
    entries: &mut Vec<SitemapEntry>,
    seen: &mut HashSet<String>,
    discovered: impl IntoIterator<Item = String>,
    base_origin: &str,
) {
    let fresh = discovered
        .into_iter()
        // Unparseable candidates are dropped silently.
        .filter_map(|candidate| Url::parse(&candidate).ok())
        // Same-origin filter: keep only URLs sharing the seed's origin key.
        .filter(|parsed| crawler::origin_key(parsed) == base_origin)
        .map(|parsed| crawler::normalize(&parsed))
        // `insert` returns false for URLs we have already emitted.
        .filter(|canonical| seen.insert(canonical.clone()))
        .map(|canonical| SitemapEntry {
            url: canonical,
            last_modified: None,
            priority: None,
            change_freq: None,
        });
    entries.extend(fresh);
}
#[cfg(test)]
mod tests {
    //! Network-free tests for the union / dedup / same-origin logic in
    //! `append_crawled`, plus a guard on the `MapOptions` defaults.
    use super::*;

    /// Build a metadata-free sitemap entry for `url`.
    fn entry(url: &str) -> SitemapEntry {
        SitemapEntry {
            url: url.to_string(),
            last_modified: None,
            priority: None,
            change_freq: None,
        }
    }

    /// Origin key of `url`, as the crawler computes it.
    fn origin_of(url: &str) -> String {
        crawler::origin_key(&Url::parse(url).unwrap())
    }

    #[test]
    fn append_adds_new_same_origin_urls() {
        let mut entries = vec![entry("https://example.com/")];
        let mut seen: HashSet<String> = entries.iter().filter_map(normalize_str).collect();
        append_crawled(
            &mut entries,
            &mut seen,
            vec![
                "https://example.com/about".to_string(),
                "https://example.com/contact".to_string(),
            ],
            &origin_of("https://example.com"),
        );
        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
        assert_eq!(
            urls,
            vec![
                "https://example.com/",
                "https://example.com/about",
                "https://example.com/contact",
            ]
        );
    }

    #[test]
    fn append_dedups_against_sitemap_and_self() {
        let mut entries = vec![entry("https://example.com/about")];
        let mut seen: HashSet<String> = entries.iter().filter_map(normalize_str).collect();
        append_crawled(
            &mut entries,
            &mut seen,
            vec![
                // Same as sitemap entry (trailing slash normalizes away).
                "https://example.com/about/".to_string(),
                // Fragment + duplicate -> only one new entry survives.
                "https://example.com/new#frag".to_string(),
                "https://example.com/new".to_string(),
            ],
            &origin_of("https://example.com"),
        );
        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
        assert_eq!(
            urls,
            vec!["https://example.com/about", "https://example.com/new"]
        );
    }

    #[test]
    fn append_filters_off_origin() {
        let mut entries = Vec::new();
        let mut seen = HashSet::new();
        append_crawled(
            &mut entries,
            &mut seen,
            vec![
                "https://example.com/keep".to_string(),
                "https://evil.com/drop".to_string(),
                "https://sub.example.com/drop".to_string(), // different origin
                // Parses fine, but is expected to be dropped by the origin
                // comparison (non-http scheme).
                "ftp://example.com/drop".to_string(),
            ],
            &origin_of("https://example.com"),
        );
        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
        assert_eq!(urls, vec!["https://example.com/keep"]);
    }

    #[test]
    fn append_treats_www_as_same_origin() {
        // origin_key strips a leading `www.`, so www and apex collapse.
        let mut entries = Vec::new();
        let mut seen = HashSet::new();
        append_crawled(
            &mut entries,
            &mut seen,
            vec!["https://www.example.com/page".to_string()],
            &origin_of("https://example.com"),
        );
        assert_eq!(entries.len(), 1);
    }

    #[test]
    fn crawl_urls_carry_no_metadata() {
        let mut entries = Vec::new();
        let mut seen = HashSet::new();
        append_crawled(
            &mut entries,
            &mut seen,
            vec!["https://example.com/x".to_string()],
            &origin_of("https://example.com"),
        );
        assert_eq!(entries.len(), 1);
        assert!(entries[0].last_modified.is_none());
        assert!(entries[0].priority.is_none());
        assert!(entries[0].change_freq.is_none());
    }

    #[test]
    fn map_options_defaults() {
        let o = MapOptions::default();
        assert_eq!(o.max_crawl_pages, 150);
        assert_eq!(o.crawl_depth, 2);
        assert_eq!(o.min_sitemap_urls, 200);
        assert!(o.crawl_fallback);
        // The result is uncapped unless the caller opts in.
        assert!(o.max_urls.is_none());
    }
}

View file

@ -1,293 +0,0 @@
//! Periodic stderr progress line emitter for slow fetches (M13).
//!
//! Wraps any async fetch future with a `tokio::select!` against a
//! `tokio::time::interval`. Every `PROGRESS_INTERVAL` (default 10s) of
//! elapsed time, emits one line to STDERR of the form:
//!
//! ```text
//! # webclaw: still fetching <URL> (Ns)
//! ```
//!
//! Fetches completing in under `PROGRESS_INTERVAL` emit zero lines (the
//! timer never fires). Stdout is untouched.
//!
//! The URL is truncated to at most 80 chars (head + `...` + tail) so
//! pathological query strings don't blow up the stderr line. Truncation
//! is char-boundary safe (operates on `chars`, not bytes).
use std::future::Future;
use std::time::Duration;
use tokio::time::{Instant, MissedTickBehavior, interval};
/// Default progress emission interval. The first tick fires at +10s
/// elapsed; subsequent ticks at +20s, +30s, etc.
pub const PROGRESS_INTERVAL: Duration = Duration::from_secs(10);
/// Maximum URL length in the progress line. Longer URLs are truncated
/// `head...tail` style.
const MAX_URL_LEN: usize = 80;
/// Await `future` while emitting a "still fetching" line for `url` to STDERR
/// every [`PROGRESS_INTERVAL`]. The future's output is returned unchanged.
///
/// This is the production entry point: it delegates to
/// [`with_progress_writer`] with the default interval and an `eprintln!`
/// sink.
pub async fn with_progress<F, T>(url: &str, future: F) -> T
where
    F: Future<Output = T>,
{
    let to_stderr = |s: String| eprintln!("{s}");
    with_progress_writer(url, future, PROGRESS_INTERVAL, to_stderr).await
}
/// Test-friendly variant of [`with_progress`]: caller supplies the tick
/// interval (so tests can use a 50ms period instead of 10s) and a
/// writer closure (so tests can capture emitted lines without touching
/// real stderr).
///
/// Production code uses [`with_progress`] which delegates here with
/// [`PROGRESS_INTERVAL`] and an `eprintln!` writer.
///
/// * `url` — only used to build the progress line; it is never fetched here.
/// * `future` — the work being awaited; its output is returned unchanged.
/// * `period` — time between progress lines. NOTE(review): tokio's
///   `interval` is documented to panic on a zero period — callers must pass
///   a non-zero duration.
/// * `writer` — receives one formatted line per elapsed period.
///
/// The loop only ends when `future` completes; a future that finishes
/// before the first period elapses produces no lines.
pub async fn with_progress_writer<F, T, W>(
url: &str,
future: F,
period: Duration,
mut writer: W,
) -> T
where
F: Future<Output = T>,
W: FnMut(String),
{
// Reference point for the elapsed time reported in each line.
let start = Instant::now();
let mut ticker = interval(period);
// First tick of `tokio::time::interval(period)` fires *immediately*
// (at construction time). We don't want a t=0 emit — consume that
// first tick before entering the select loop. Subsequent ticks fire
// at `start + period`, `start + 2*period`, ...
// `Skip` drops ticks that were missed instead of replaying them later.
ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
ticker.tick().await;
// Pin the future so it can be polled by reference across loop iterations.
tokio::pin!(future);
loop {
tokio::select! {
// Bias toward the future — if both are ready (rare), prefer
// returning the result over emitting a final tick.
biased;
result = &mut future => {
return result;
}
_ = ticker.tick() => {
let elapsed = start.elapsed();
writer(format_progress_line(url, elapsed));
}
}
}
}
/// Render one progress line of the form
/// `# webclaw: still fetching <URL> (Ns)`.
///
/// The URL is shortened with [`truncate_url`] to at most [`MAX_URL_LEN`]
/// chars. `N` is the elapsed time in whole seconds; fractional seconds are
/// truncated, not rounded.
pub(crate) fn format_progress_line(url: &str, elapsed: Duration) -> String {
    let secs = elapsed.as_secs();
    let truncated = truncate_url(url, MAX_URL_LEN);
    format!("# webclaw: still fetching {truncated} ({secs}s)")
}
/// Truncate `url` to at most `max` chars, using `head...tail` shape
/// when truncation is needed. Char-boundary safe (operates on `chars`).
///
/// URLs that already fit are returned unchanged. Otherwise 3 chars are
/// reserved for the `...` separator, the tail keeps up to 17 chars from the
/// end of the URL and the head gets the rest of the budget. When `max` is
/// too small to hold the separator plus any content (`max <= 3`), the URL is
/// simply cut to its first `max` chars.
///
/// The result never has more than `max` chars.
pub(crate) fn truncate_url(url: &str, max: usize) -> String {
    const ELLIPSIS_LEN: usize = 3;
    const TAIL_CHARS: usize = 17;

    let total_chars = url.chars().count();
    if total_chars <= max {
        return url.to_string();
    }

    // No room for "..." plus content: a hard cut still honours `max`.
    if max <= ELLIPSIS_LEN {
        return url.chars().take(max).collect();
    }

    // Shrink the tail first when the budget is tight so head + "..." + tail
    // always adds up to exactly `max` chars.
    let avail = max - ELLIPSIS_LEN;
    let tail_chars = TAIL_CHARS.min(avail);
    let head_chars = avail - tail_chars;

    let head: String = url.chars().take(head_chars).collect();
    // `total_chars > max >= tail_chars`, so the subtraction cannot underflow.
    let tail: String = url.chars().skip(total_chars - tail_chars).collect();
    format!("{head}...{tail}")
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::{Arc, Mutex};
/// Collect emitted lines into a `Vec<String>` via a captured writer.
fn capture() -> (Arc<Mutex<Vec<String>>>, impl FnMut(String)) {
let sink: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
let sink_clone = Arc::clone(&sink);
let writer = move |s: String| {
sink_clone.lock().unwrap().push(s);
};
(sink, writer)
}
#[tokio::test]
async fn test_progress_emits_after_interval_elapsed() {
let (sink, writer) = capture();
// 250ms future, 50ms interval — expect ~4-5 ticks before resolution.
let fut = tokio::time::sleep(Duration::from_millis(250));
with_progress_writer(
"https://example.com/slow",
async {
fut.await;
42_i32
},
Duration::from_millis(50),
writer,
)
.await;
let lines = sink.lock().unwrap();
assert!(
!lines.is_empty(),
"expected >=1 progress line; got {} ({:?})",
lines.len(),
*lines
);
for line in lines.iter() {
assert!(
line.starts_with("# webclaw: still fetching"),
"line shape wrong: {line:?}"
);
assert!(
line.contains("https://example.com/slow"),
"url missing from line: {line:?}"
);
}
}
#[tokio::test]
async fn test_progress_silent_on_fast_future() {
let (sink, writer) = capture();
// 10ms future, 1s interval — zero ticks expected.
let result = with_progress_writer(
"https://example.com/fast",
async {
tokio::time::sleep(Duration::from_millis(10)).await;
"done"
},
Duration::from_secs(1),
writer,
)
.await;
assert_eq!(result, "done");
let lines = sink.lock().unwrap();
assert_eq!(
lines.len(),
0,
"expected 0 progress lines on fast future; got {:?}",
*lines
);
}
#[tokio::test]
async fn test_progress_line_includes_url() {
let (sink, writer) = capture();
let target_url = "https://news.ycombinator.com/item?id=12345";
with_progress_writer(
target_url,
async {
tokio::time::sleep(Duration::from_millis(150)).await;
},
Duration::from_millis(50),
writer,
)
.await;
let lines = sink.lock().unwrap();
assert!(!lines.is_empty(), "expected progress lines");
assert!(
lines.iter().all(|l| l.contains(target_url)),
"every line should contain the URL: {:?}",
*lines
);
}
#[tokio::test]
async fn test_progress_returns_inner_result_ok() {
let (_sink, writer) = capture();
let r: Result<i32, String> = with_progress_writer(
"https://example.com/",
async { Ok::<i32, String>(7) },
Duration::from_secs(1),
writer,
)
.await;
assert_eq!(r, Ok(7));
}
#[tokio::test]
async fn test_progress_propagates_error() {
let (_sink, writer) = capture();
let r: Result<i32, String> = with_progress_writer(
"https://example.com/",
async { Err::<i32, String>("boom".to_string()) },
Duration::from_secs(1),
writer,
)
.await;
assert_eq!(r, Err("boom".to_string()));
}
#[test]
fn test_truncate_url_short_passthrough() {
let url = "https://example.com/";
assert_eq!(truncate_url(url, 80), url);
}
#[test]
fn test_truncate_url_long_head_dots_tail() {
let url = "https://www.example.com/very/long/path/segments/with/lots/of/text/and/then?q=some_long_query_string_value_here&other=more&another=thing";
let truncated = truncate_url(url, 80);
assert!(
truncated.chars().count() <= 80,
"truncated length {} > 80: {truncated:?}",
truncated.chars().count()
);
assert!(
truncated.contains("..."),
"expected '...' marker in truncated url: {truncated:?}"
);
assert!(
truncated.starts_with("https://www.example.com/"),
"truncated should start with the URL head: {truncated:?}"
);
}
#[test]
fn test_truncate_url_unicode_safe() {
    // Cyrillic URL longer than 80 chars — must not panic on a
    // mid-codepoint split, and the limit must be counted in chars
    // rather than bytes.
    let url =
        "https://example.com/путь/к/очень/длинной/странице/с/большим/количеством/кириллицы/тут";
    assert!(
        url.chars().count() > 80,
        "fixture must exceed the limit to exercise truncation"
    );
    let truncated = truncate_url(url, 80);
    // The previous `is_char_boundary(len())` check holds for every `String`
    // and so verified nothing; assert on the actual shape instead.
    assert_eq!(
        truncated.chars().count(),
        80,
        "truncated length should be exactly the limit: {truncated:?}"
    );
    assert!(
        truncated.starts_with("https://example.com/путь/"),
        "head should be preserved intact: {truncated:?}"
    );
    assert!(
        truncated.ends_with("/тут"),
        "tail should be preserved intact: {truncated:?}"
    );
    assert!(
        truncated.contains("..."),
        "expected '...' marker in truncated url: {truncated:?}"
    );
}
#[test]
fn test_format_progress_line_shape() {
let line = format_progress_line("https://example.com/", Duration::from_secs(10));
assert_eq!(line, "# webclaw: still fetching https://example.com/ (10s)");
}
#[test]
fn test_format_progress_line_seconds_only() {
// Sub-second elapsed rounds to 0s, not fractions. (In practice
// the first tick fires at +PROGRESS_INTERVAL so this is mostly
// a defensive shape assertion.)
let line = format_progress_line("https://x/", Duration::from_millis(9_500));
assert!(
line.ends_with("(9s)"),
"line should end with `(9s)`: {line:?}"
);
}
}

View file

@ -1,56 +1,172 @@
//! Reddit URL helpers for the fetch layer.
//!
//! The JSON API (`*.json`) is blocked. We rewrite all Reddit hosts to
//! `old.reddit.com`, which serves stable server-rendered HTML that
//! `webclaw-core::reddit` parses directly.
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
///
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
/// comment tree as structured JSON, which we convert to clean markdown.
use serde::Deserialize;
use tracing::debug;
use webclaw_core::{Content, ExtractionResult, Metadata};
/// Check if a URL points to a Reddit post/comment page.
pub fn is_reddit_url(url: &str) -> bool {
webclaw_core::reddit::is_reddit_url(url)
let host = url
.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("");
matches!(
host,
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
)
}
/// Rewrite any Reddit host to old.reddit.com, preserving path and query.
pub fn to_old_reddit_url(url: &str) -> String {
let Some(scheme_end) = url.find("://") else {
return url.to_string();
};
let after = &url[scheme_end + 3..];
let host_end = after.find(['/', '?', '#']).unwrap_or(after.len());
let scheme = &url[..scheme_end + 3];
let rest = &after[host_end..];
format!("{scheme}old.reddit.com{rest}")
/// Build the `.json` URL from a Reddit page URL.
pub fn json_url(url: &str) -> String {
let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
format!("{clean}.json")
}
#[cfg(test)]
mod tests {
use super::*;
/// Convert Reddit JSON API response into an ExtractionResult.
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
let listings: Vec<Listing> =
serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
#[test]
fn rewrites_www_to_old() {
assert_eq!(
to_old_reddit_url("https://www.reddit.com/r/rust/comments/abc/x/"),
"https://old.reddit.com/r/rust/comments/abc/x/"
);
let mut markdown = String::new();
let mut title = None;
let mut author = None;
let mut subreddit = None;
// First listing = the post itself
if let Some(post_listing) = listings.first() {
for child in &post_listing.data.children {
if child.kind == "t3" {
let d = &child.data;
title = d.title.clone();
author = d.author.clone();
subreddit = d.subreddit_name_prefixed.clone();
if let Some(ref t) = title {
markdown.push_str(&format!("# {t}\n\n"));
}
if let (Some(a), Some(sr)) = (&author, &subreddit) {
markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
}
if let Some(ref body) = d.selftext
&& !body.is_empty()
{
markdown.push_str(body);
markdown.push_str("\n\n");
}
if let Some(ref url_field) = d.url_overridden_by_dest
&& !url_field.is_empty()
{
markdown.push_str(&format!("[Link]({url_field})\n\n"));
}
markdown.push_str("---\n\n");
}
}
}
#[test]
fn rewrites_bare_to_old() {
assert_eq!(
to_old_reddit_url("https://reddit.com/r/rust/"),
"https://old.reddit.com/r/rust/"
);
// Second listing = comment tree
if let Some(comment_listing) = listings.get(1) {
markdown.push_str("## Comments\n\n");
for child in &comment_listing.data.children {
render_comment(child, 0, &mut markdown);
}
}
#[test]
fn preserves_old_reddit_unchanged() {
let url = "https://old.reddit.com/r/rust/comments/abc/x/?context=3";
assert_eq!(to_old_reddit_url(url), url);
}
let word_count = markdown.split_whitespace().count();
debug!(word_count, "reddit json extracted");
#[test]
fn preserves_query_and_hash() {
assert_eq!(
to_old_reddit_url("https://www.reddit.com/r/rust/?sort=top#anchor"),
"https://old.reddit.com/r/rust/?sort=top#anchor"
);
Ok(ExtractionResult {
metadata: Metadata {
title,
description: None,
author,
published_date: None,
language: Some("en".into()),
url: Some(url.to_string()),
site_name: subreddit,
image: None,
favicon: None,
word_count,
},
content: Content {
markdown,
plain_text: String::new(),
links: vec![],
images: vec![],
code_blocks: vec![],
raw_html: None,
},
domain_data: None,
structured_data: vec![],
})
}
fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
if thing.kind != "t1" {
return;
}
let d = &thing.data;
let indent = " ".repeat(depth);
let author = d.author.as_deref().unwrap_or("[deleted]");
let body = d.body.as_deref().unwrap_or("[removed]");
let score = d.score.unwrap_or(0);
out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
for line in body.lines() {
out.push_str(&format!("{indent} {line}\n"));
}
out.push('\n');
// Recurse into replies
if let Some(Replies::Listing(listing)) = &d.replies {
for child in &listing.data.children {
render_comment(child, depth + 1, out);
}
}
}
// --- Reddit JSON types (minimal) ---
#[derive(Deserialize)]
struct Listing {
data: ListingData,
}
#[derive(Deserialize)]
struct ListingData {
children: Vec<Thing>,
}
#[derive(Deserialize)]
struct Thing {
kind: String,
data: ThingData,
}
#[derive(Deserialize)]
struct ThingData {
// Post fields (t3)
title: Option<String>,
selftext: Option<String>,
subreddit_name_prefixed: Option<String>,
url_overridden_by_dest: Option<String>,
// Comment fields (t1)
author: Option<String>,
body: Option<String>,
score: Option<i64>,
replies: Option<Replies>,
}
/// Reddit replies can be either a nested Listing or an empty string.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
Listing(Listing),
#[allow(dead_code)]
Empty(String),
}

View file

@ -1,322 +0,0 @@
//! Web search via Serper.dev (Google results) with optional content scraping.
//!
//! This is the self-hosted search path: the caller supplies their own
//! Serper.dev API key (free tier at serper.dev). The CLI, MCP server, and
//! OSS REST server all route through [`search`] so search works without the
//! hosted webclaw API.
//!
//! Serper returns a plain JSON API, so we hit it with a vanilla wreq client
//! (10s timeout) — no browser TLS fingerprinting needed. When `scrape` is
//! set, the top results are fetched through the caller's [`FetchClient`]
//! (which *does* carry the fingerprinting) and extracted to markdown.
use std::sync::Arc;
use std::time::Duration;
use serde::{Deserialize, Serialize};
use serde_json::{Value, json};
use tokio::sync::Semaphore;
use tracing::warn;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Serper.dev search endpoint.
const SERPER_URL: &str = "https://google.serper.dev/search";
/// Bound on the number of result pages scraped concurrently when
/// `scrape` is enabled. Keeps the fan-out from overwhelming the proxy
/// pool / remote hosts on a large result set.
const SCRAPE_CONCURRENCY: usize = 5;
/// Options controlling a search request.
#[derive(Debug, Clone)]
pub struct SearchOptions {
/// Number of organic results to request (clamped to `1..=10`).
pub num_results: usize,
/// Country code for localization (Serper `gl`, e.g. `"us"`, `"gb"`).
pub country: Option<String>,
/// Language code for localization (Serper `hl`, e.g. `"en"`, `"it"`).
pub lang: Option<String>,
/// When true, fetch + extract the result pages and fill in `content`.
pub scrape: bool,
}
impl Default for SearchOptions {
fn default() -> Self {
Self {
num_results: 5,
country: None,
lang: None,
scrape: false,
}
}
}
/// A single organic search result. When `scrape` was requested and the
/// fetch succeeded, `content` holds the extracted markdown; otherwise it
/// is `None` (a per-result fetch failure never fails the whole search).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
pub title: String,
pub link: String,
pub snippet: String,
pub position: usize,
#[serde(skip_serializing_if = "Option::is_none")]
pub content: Option<String>,
}
/// Query Serper.dev and return its organic results, optionally with the
/// result pages scraped to markdown.
///
/// # Arguments
/// * `client` — the caller's [`FetchClient`]; only touched when
///   `opts.scrape` is set, to fetch the result pages.
/// * `serper_key` — the caller's Serper.dev API key.
/// * `query` — the search query.
/// * `opts` — result count (clamped to `1..=10`), localization, and the
///   scrape switch.
///
/// # Returns
/// The organic results in the order Serper returned them. When scraping is
/// enabled and at least one result came back, every returned result is
/// handed to `scrape_results`, which fills in `content` where the fetch and
/// extraction succeed.
///
/// # Errors
/// Propagates any error from the Serper request itself.
pub async fn search(
    client: &FetchClient,
    serper_key: &str,
    query: &str,
    opts: &SearchOptions,
) -> Result<Vec<SearchResult>, FetchError> {
    let wanted = opts.num_results.clamp(1, 10);
    let (gl, hl) = (opts.country.as_deref(), opts.lang.as_deref());

    let raw = call_serper(serper_key, query, wanted, gl, hl).await?;
    let mut hits = parse_serper_organic(&raw);

    let should_scrape = opts.scrape && !hits.is_empty();
    if should_scrape {
        scrape_results(client, &mut hits).await;
    }

    Ok(hits)
}
/// POST the query to Serper.dev and return the raw JSON response.
///
/// Builds a plain wreq client (no browser emulation — Serper is a JSON
/// API, not a bot-protected page). Non-2xx responses are surfaced as a
/// [`FetchError::Build`] carrying the status and body so the caller can
/// show Serper's own error (bad key, quota exceeded, etc.).
async fn call_serper(
api_key: &str,
query: &str,
num: usize,
country: Option<&str>,
lang: Option<&str>,
) -> Result<Value, FetchError> {
let http = wreq::Client::builder()
.timeout(Duration::from_secs(10))
.build()
.map_err(|e| FetchError::Build(format!("failed to build serper client: {e}")))?;
let mut body = json!({ "q": query, "num": num });
if let Some(gl) = country {
body["gl"] = json!(gl);
}
if let Some(hl) = lang {
body["hl"] = json!(hl);
}
// Serialize ourselves rather than `.json()` — the wreq `json` feature
// is not enabled in this crate and isn't worth pulling in for one call.
let payload = serde_json::to_vec(&body)
.map_err(|e| FetchError::Build(format!("serper request encode error: {e}")))?;
let resp = http
.post(SERPER_URL)
.header("X-API-KEY", api_key)
.header("Content-Type", "application/json")
.body(payload)
.send()
.await?;
let status = resp.status();
if !status.is_success() {
let code = status.as_u16();
let text = resp.text().await.unwrap_or_default();
return Err(FetchError::Build(format!("serper returned {code}: {text}")));
}
let text = resp
.text()
.await
.map_err(|e| FetchError::BodyDecode(format!("serper response read error: {e}")))?;
serde_json::from_str::<Value>(&text)
.map_err(|e| FetchError::BodyDecode(format!("serper response parse error: {e}")))
}
/// Parse the `organic` array of a Serper response into [`SearchResult`]s.
///
/// Pure (no network), so it is unit-tested against a fixture. Entries
/// missing `title` or `link` are skipped; `snippet` defaults to empty.
/// `position` is 1-based over the kept entries.
pub fn parse_serper_organic(response: &Value) -> Vec<SearchResult> {
let Some(organic) = response.get("organic").and_then(|v| v.as_array()) else {
return Vec::new();
};
organic
.iter()
.filter_map(|item| {
let title = item.get("title")?.as_str()?.to_string();
let link = item.get("link")?.as_str()?.to_string();
let snippet = item
.get("snippet")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
Some(SearchResult {
title,
link,
snippet,
// Filled in after collection so it tracks kept entries,
// not the raw array index (which may include skips).
position: 0,
content: None,
})
})
.enumerate()
.map(|(i, mut r)| {
r.position = i + 1;
r
})
.collect()
}
/// Fetch every result's page, extract it, and store the markdown in
/// `content`.
///
/// At most [`SCRAPE_CONCURRENCY`] fetches are in flight at a time. Failures
/// are isolated per result: if the fetch or the extraction fails, a warning
/// is logged and that result's `content` is set to `None`; the other results
/// are unaffected.
async fn scrape_results(client: &FetchClient, results: &mut [SearchResult]) {
    let limiter = Arc::new(Semaphore::new(SCRAPE_CONCURRENCY));

    // Collect owned links first so the per-result futures don't borrow
    // `results`. That keeps the future captures free of the slice's
    // lifetime, which is what lets this compile inside the MCP `#[tool]`
    // macro's stricter `Send`/lifetime bounds.
    let links: Vec<String> = results.iter().map(|r| r.link.clone()).collect();

    let jobs = links.into_iter().map(|link| {
        let limiter = limiter.clone();
        async move {
            // If the semaphore is closed (shutdown race), skip rather than panic.
            let Ok(_permit) = limiter.acquire().await else {
                return None;
            };

            let fetched = match client.fetch(&link).await {
                Ok(page) => page,
                Err(e) => {
                    warn!(url = %link, error = %e, "search: fetch failed");
                    return None;
                }
            };

            match webclaw_core::extract(&fetched.html, Some(&fetched.url)) {
                Ok(extraction) => Some(extraction.content.markdown),
                Err(e) => {
                    warn!(url = %link, error = %e, "search: extraction failed");
                    None
                }
            }
        }
    });

    // `join_all` drives every scrape future concurrently and returns
    // results in input order; the semaphore caps how many fetches run at
    // once. Result set is tiny (≤10), so the all-at-once poll is fine.
    let scraped = futures_util::future::join_all(jobs).await;
    for (result, markdown) in results.iter_mut().zip(scraped) {
        result.content = markdown;
    }
}
#[cfg(test)]
mod tests {
use super::*;
fn fixture() -> Value {
json!({
"searchParameters": { "q": "rust async", "type": "search" },
"organic": [
{
"title": "Async Rust",
"link": "https://example.com/async",
"snippet": "Learn async in Rust.",
"position": 1
},
{
// snippet missing on purpose -> defaults to ""
"title": "Tokio",
"link": "https://tokio.rs"
},
{
// no link -> skipped, must not shift positions of the rest
"title": "No Link Here"
}
]
})
}
#[test]
fn parses_organic_results() {
let results = parse_serper_organic(&fixture());
assert_eq!(results.len(), 2);
assert_eq!(results[0].title, "Async Rust");
assert_eq!(results[0].link, "https://example.com/async");
assert_eq!(results[0].snippet, "Learn async in Rust.");
assert_eq!(results[0].position, 1);
assert!(results[0].content.is_none());
// Missing snippet -> empty string, and position is 1-based over
// kept entries (the link-less entry is dropped, not counted).
assert_eq!(results[1].title, "Tokio");
assert_eq!(results[1].snippet, "");
assert_eq!(results[1].position, 2);
}
#[test]
fn missing_organic_key_yields_empty() {
assert!(parse_serper_organic(&json!({})).is_empty());
assert!(parse_serper_organic(&json!({ "organic": "not-an-array" })).is_empty());
}
#[test]
fn search_result_serializes_without_null_content() {
let r = SearchResult {
title: "T".into(),
link: "https://e.com".into(),
snippet: "s".into(),
position: 1,
content: None,
};
let v = serde_json::to_value(&r).unwrap();
assert!(v.get("content").is_none(), "None content should be skipped");
let r2 = SearchResult {
content: Some("# md".into()),
..r
};
let v2 = serde_json::to_value(&r2).unwrap();
assert_eq!(v2.get("content").and_then(|c| c.as_str()), Some("# md"));
}
#[test]
fn default_options() {
let o = SearchOptions::default();
assert_eq!(o.num_results, 5);
assert!(!o.scrape);
assert!(o.country.is_none());
assert!(o.lang.is_none());
}
}

View file

@ -18,20 +18,12 @@ use crate::error::FetchError;
/// Maximum depth when recursively fetching sitemap index files.
/// Prevents infinite loops from circular sitemap references.
///
/// Raised 3→5: large sites (gov.uk, news publishers) nest sitemap indexes
/// more than three levels deep — a top index → per-section index →
/// per-month index → urlset is already four hops. Three cut those off.
const MAX_RECURSION_DEPTH: usize = 5;
const MAX_RECURSION_DEPTH: usize = 3;
/// Common sitemap paths to try when robots.txt doesn't list any.
const FALLBACK_SITEMAP_PATHS: &[&str] = &[
"/sitemap.xml",
"/sitemap_index.xml",
"/sitemap-index.xml",
"/sitemap1.xml",
"/sitemaps.xml",
"/sitemap/index.xml",
"/wp-sitemap.xml",
"/sitemap/sitemap-index.xml",
];
@ -113,12 +105,10 @@ async fn fetch_sitemaps(
for sitemap_url in urls {
debug!(url = %sitemap_url, depth, "fetching sitemap");
// Fetch raw bytes so gzipped sitemaps survive intact. `fetch` runs
// the body through `from_utf8_lossy`, which corrupts binary gzip.
let body = match client.fetch_raw(sitemap_url).await {
Ok((200, body)) => body,
Ok((status, _)) => {
debug!(url = %sitemap_url, status, "sitemap not found");
let xml = match client.fetch(sitemap_url).await {
Ok(result) if result.status == 200 => result.html,
Ok(result) => {
debug!(url = %sitemap_url, status = result.status, "sitemap not found");
continue;
}
Err(e) => {
@ -127,14 +117,6 @@ async fn fetch_sitemaps(
}
};
let xml = match decode_sitemap_body(&body) {
Some(xml) => xml,
None => {
debug!(url = %sitemap_url, "failed to decode sitemap body, skipping");
continue;
}
};
match detect_sitemap_type(&xml) {
SitemapType::UrlSet => {
let parsed = parse_urlset(&xml);
@ -165,33 +147,6 @@ async fn fetch_sitemaps(
}
}
/// Decode a raw sitemap body into a UTF-8 XML string.
///
/// Sitemaps are commonly served gzipped (`.xml.gz`) with
/// `Content-Type: application/gzip` and *no* `Content-Encoding`, so the HTTP
/// layer never inflates them. We detect the gzip magic bytes (`0x1f 0x8b`)
/// and gunzip in-process; otherwise the body is treated as plain XML.
///
/// The body comes from a remote server, so inflation is capped at 50 MiB —
/// the sitemap protocol's limit for an uncompressed sitemap. Without a cap a
/// few kilobytes of crafted gzip could expand to gigabytes in memory.
///
/// Returns `None` if a gzip stream fails to inflate, inflates past the cap,
/// or does not inflate to valid UTF-8. Plain (non-gzip) bodies always succeed
/// via lossy UTF-8 decode.
pub(crate) fn decode_sitemap_body(body: &[u8]) -> Option<String> {
    use std::io::Read;

    /// Largest inflated sitemap we are willing to hold in memory.
    const MAX_INFLATED_BYTES: u64 = 50 * 1024 * 1024;

    if !body.starts_with(&[0x1f, 0x8b]) {
        return Some(String::from_utf8_lossy(body).into_owned());
    }

    // Read one byte past the cap so "exactly at the limit" can be told
    // apart from "over the limit".
    let mut limited = flate2::read::GzDecoder::new(body).take(MAX_INFLATED_BYTES + 1);
    let mut inflated = Vec::new();
    if let Err(e) = limited.read_to_end(&mut inflated) {
        warn!(error = %e, "failed to gunzip sitemap body");
        return None;
    }
    if inflated.len() as u64 > MAX_INFLATED_BYTES {
        warn!(
            limit = MAX_INFLATED_BYTES,
            "gunzipped sitemap body exceeds size limit"
        );
        return None;
    }

    // Stay strict on encoding, as before: inflated data that is not valid
    // UTF-8 is rejected rather than lossily decoded.
    match String::from_utf8(inflated) {
        Ok(xml) => Some(xml),
        Err(e) => {
            warn!(error = %e, "failed to gunzip sitemap body");
            None
        }
    }
}
// ---------------------------------------------------------------------------
// Pure parsing functions (no I/O, fully testable)
// ---------------------------------------------------------------------------
@ -714,47 +669,5 @@ mod tests {
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml"));
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml"));
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml"));
// Paths added for robustness (item 3).
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap-index.xml"));
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap1.xml"));
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemaps.xml"));
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/index.xml"));
}
#[test]
fn decode_plain_xml_body() {
let xml = r#"<?xml version="1.0"?><urlset></urlset>"#;
let got = decode_sitemap_body(xml.as_bytes()).expect("plain body decodes");
assert_eq!(got, xml);
}
#[test]
fn decode_gzipped_body() {
use std::io::Write;
let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/gz-page</loc></url>
</urlset>"#;
// Gzip-compress the XML, then confirm decode_sitemap_body inflates it
// and the parser finds the URL.
let mut encoder = flate2::write::GzEncoder::new(Vec::new(), flate2::Compression::default());
encoder.write_all(xml.as_bytes()).unwrap();
let gz = encoder.finish().unwrap();
assert_eq!(&gz[..2], &[0x1f, 0x8b], "gzip magic present");
let decoded = decode_sitemap_body(&gz).expect("gzip body inflates");
let entries = parse_urlset(&decoded);
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].url, "https://example.com/gz-page");
}
#[test]
fn decode_corrupt_gzip_returns_none() {
// Starts with gzip magic but the rest is garbage -> inflate fails.
let bad = [0x1f, 0x8b, 0x08, 0x00, 0xde, 0xad, 0xbe, 0xef];
assert!(decode_sitemap_body(&bad).is_none());
}
}

View file

@ -10,24 +10,15 @@ use std::{borrow::Cow, io, time::Duration};
use wreq::http2::{
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
};
use wreq::tls::compress::CertificateCompressor;
use wreq::tls::{AlpnProtocol, AlpsProtocol, ExtensionType, TlsOptions, TlsVersion};
use wreq::{Client, Emulation, Group, IntoEmulation};
use wreq_util::emulate::compress::{BrotliCompressor, ZlibCompressor};
use wreq::tls::{
AlpnProtocol, AlpsProtocol, CertificateCompressionAlgorithm, ExtensionType, TlsOptions,
TlsVersion,
};
use wreq::{Client, Emulation};
use crate::browser::BrowserVariant;
use crate::error::FetchError;
// Certificate-compression advertisement per profile. wreq 6.0.0-rc.29 replaced
// the `CertificateCompressionAlgorithm` enum argument with `&dyn
// CertificateCompressor` trait objects; wreq-util ships the concrete zlib/brotli
// implementations. The advertised set (and order) is a TLS fingerprint signal,
// so these mirror the previous enum lists exactly.
static CHROME_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] = &[&BrotliCompressor];
static FIREFOX_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] =
&[&ZlibCompressor, &BrotliCompressor];
static SAFARI_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] = &[&ZlibCompressor];
#[derive(Clone, Default)]
struct PublicDnsResolver;
@ -128,14 +119,14 @@ fn chrome_extensions() -> Vec<ExtensionType> {
ExtensionType::PSK_KEY_EXCHANGE_MODES, // 45
ExtensionType::EC_POINT_FORMATS, // 11
ExtensionType::CERT_COMPRESSION, // 27
ExtensionType::APPLICATION_SETTINGS, // 17613 (new codepoint, matches alps_use_new_codepoint)
ExtensionType::SUPPORTED_VERSIONS, // 43
ExtensionType::SIGNATURE_ALGORITHMS, // 13
ExtensionType::SERVER_NAME, // 0
ExtensionType::APPLICATION_SETTINGS_NEW, // 17613 (new codepoint, matches alps_use_new_codepoint)
ExtensionType::SUPPORTED_VERSIONS, // 43
ExtensionType::SIGNATURE_ALGORITHMS, // 13
ExtensionType::SERVER_NAME, // 0
ExtensionType::APPLICATION_LAYER_PROTOCOL_NEGOTIATION, // 16
ExtensionType::ENCRYPTED_CLIENT_HELLO, // 65037
ExtensionType::RENEGOTIATE, // 65281
ExtensionType::EXTENDED_MASTER_SECRET, // 23
ExtensionType::ENCRYPTED_CLIENT_HELLO, // 65037
ExtensionType::RENEGOTIATE, // 65281
ExtensionType::EXTENDED_MASTER_SECRET, // 23
]
}
@ -296,7 +287,7 @@ fn chrome_tls() -> TlsOptions {
.alps_protocols([AlpsProtocol::HTTP3, AlpsProtocol::HTTP2])
.alps_use_new_codepoint(true)
.aes_hw_override(true)
.certificate_compressors(CHROME_CERT_COMPRESSORS)
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI])
.build()
}
@ -313,7 +304,10 @@ fn firefox_tls() -> TlsOptions {
.pre_shared_key(true)
.enable_ocsp_stapling(true)
.enable_signed_cert_timestamps(true)
.certificate_compressors(FIREFOX_CERT_COMPRESSORS)
.certificate_compression_algorithms(&[
CertificateCompressionAlgorithm::ZLIB,
CertificateCompressionAlgorithm::BROTLI,
])
.build()
}
@ -330,7 +324,7 @@ fn safari_tls() -> TlsOptions {
.pre_shared_key(false)
.enable_ocsp_stapling(true)
.enable_signed_cert_timestamps(true)
.certificate_compressors(SAFARI_CERT_COMPRESSORS)
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::ZLIB])
.build()
}
@ -351,23 +345,21 @@ fn safari_tls() -> TlsOptions {
/// `priority: u=0, i`, zstd), replace with the real iOS 26 set.
/// 4. `accept-language` preserved from config.extra_headers for locale.
fn safari_ios_emulation() -> wreq::Emulation {
// wreq 6.0.0-rc.29 exposes the `Emulation` fields directly (no `*_mut()`
// accessors) and wreq-util 3.0.0-rc.12 renamed the enum to `Profile` with
// `IntoEmulation::into_emulation` replacing `EmulationFactory::emulation`.
let mut em = wreq_util::Profile::SafariIos26.into_emulation();
use wreq::EmulationFactory;
let mut em = wreq_util::Emulation::SafariIos26.emulation();
if let Some(tls) = em.tls_options.as_mut() {
if let Some(tls) = em.tls_options_mut().as_mut() {
tls.extension_permutation = Some(Cow::Owned(safari_ios_extensions()));
}
// Only override the priority flag — keep wreq-util's SETTINGS, WINDOW_UPDATE,
// and pseudo-order intact. Replacing the whole Http2Options resets SETTINGS
// to defaults, which sends only INITIAL_WINDOW_SIZE and fails DataDome.
if let Some(h2) = em.http2_options.as_mut() {
if let Some(h2) = em.http2_options_mut().as_mut() {
h2.headers_stream_dependency = Some(StreamDependency::new(StreamId::zero(), 255, true));
}
let hm = &mut em.headers;
let hm = em.headers_mut();
hm.clear();
for (k, v) in SAFARI_IOS_HEADERS {
if let (Ok(n), Ok(val)) = (
@ -516,12 +508,12 @@ pub fn build_client(
.tls_options(tls)
.http2_options(h2)
.headers(build_headers(headers))
.build(Group::default())
.build()
}
};
// Append extra headers after profile defaults.
let hm = &mut emulation.headers;
let hm = emulation.headers_mut();
for (k, v) in extra_headers {
if let (Ok(n), Ok(val)) = (
http::header::HeaderName::from_bytes(k.as_bytes()),
@ -538,11 +530,7 @@ pub fn build_client(
max_redirects as usize,
))
.cookie_store(true)
.timeout(timeout)
.connect_timeout(Duration::from_secs(5))
.pool_idle_timeout(Duration::from_secs(90))
.pool_max_idle_per_host(8)
.tcp_keepalive(Duration::from_secs(60));
.timeout(timeout);
if let Some(proxy_url) = proxy {
let proxy = wreq::Proxy::all(proxy_url).map_err(|_| {

View file

@ -1,5 +1,5 @@
/// Provider chain — tries providers in order until one succeeds.
/// Default order: Ollama (local, free) -> OpenAI -> Gemini -> Anthropic.
/// Default order: Ollama (local, free) -> OpenAI -> Anthropic.
/// Only includes providers that are actually configured/available.
use async_trait::async_trait;
use tracing::{debug, warn};
@ -7,8 +7,7 @@ use tracing::{debug, warn};
use crate::error::LlmError;
use crate::provider::{CompletionRequest, LlmProvider};
use crate::providers::{
anthropic::AnthropicProvider, gemini::GeminiProvider, ollama::OllamaProvider,
openai::OpenAiProvider,
anthropic::AnthropicProvider, ollama::OllamaProvider, openai::OpenAiProvider,
};
pub struct ProviderChain {
@ -16,11 +15,9 @@ pub struct ProviderChain {
}
impl ProviderChain {
/// Build the default chain: Ollama -> OpenAI -> Gemini -> Anthropic.
/// Build the default chain: Ollama -> OpenAI -> Anthropic.
/// Ollama is always added (availability checked at call time).
/// Cloud providers are only added if their API keys are configured.
/// Gemini sits ahead of Anthropic so Google Cloud credits are preferred,
/// with Anthropic as the last-resort fallback.
pub async fn default() -> Self {
let mut providers: Vec<Box<dyn LlmProvider>> = Vec::new();
@ -37,11 +34,6 @@ impl ProviderChain {
providers.push(Box::new(openai));
}
if let Some(gemini) = GeminiProvider::new(None, None, None) {
debug!("gemini configured, adding to chain");
providers.push(Box::new(gemini));
}
if let Some(anthropic) = AnthropicProvider::with_base_url(None, None, None) {
debug!("anthropic configured, adding to chain");
providers.push(Box::new(anthropic));

View file

@ -1,6 +1,6 @@
/// webclaw-llm: LLM integration with local-first hybrid architecture.
///
/// Provider chain tries Ollama (local) first, falls back to OpenAI, then Gemini, then Anthropic.
/// Provider chain tries Ollama (local) first, falls back to OpenAI, then Anthropic.
/// Provides schema-based extraction, prompt extraction, and summarization
/// on top of webclaw-core's content pipeline.
pub mod chain;

View file

@ -1,8 +1,6 @@
/// Anthropic provider — Claude models via api.anthropic.com.
/// Anthropic's API differs from OpenAI: system message is a top-level param,
/// not part of the messages array.
use std::time::Duration;
use async_trait::async_trait;
use serde_json::json;
@ -37,20 +35,14 @@ impl AnthropicProvider {
let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?;
Some(Self {
client: reqwest::Client::builder()
.timeout(Duration::from_secs(120))
.connect_timeout(Duration::from_secs(10))
.build()
.unwrap_or_else(|_| reqwest::Client::new()),
client: reqwest::Client::new(),
key,
base_url: base_url
.or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok())
.unwrap_or_else(|| DEFAULT_ANTHROPIC_BASE_URL.into())
.trim_end_matches('/')
.to_string(),
default_model: model
.or_else(|| std::env::var("ANTHROPIC_MODEL").ok())
.unwrap_or_else(|| "claude-sonnet-4-6".into()),
default_model: model.unwrap_or_else(|| "claude-sonnet-4-20250514".into()),
})
}
@ -116,7 +108,11 @@ impl LlmProvider for AnthropicProvider {
if !resp.status().is_success() {
let status = resp.status();
let text = resp.text().await.unwrap_or_default();
let safe_text = text.chars().take(500).collect::<String>();
let safe_text = if text.len() > 500 {
&text[..500]
} else {
&text
};
return Err(LlmError::ProviderError(format!(
"anthropic returned {status}: {safe_text}"
)));
@ -160,7 +156,7 @@ mod tests {
let provider =
AnthropicProvider::new(Some("sk-ant-test".into()), None).expect("should construct");
assert_eq!(provider.name(), "anthropic");
assert_eq!(provider.default_model, "claude-sonnet-4-6");
assert_eq!(provider.default_model, "claude-sonnet-4-20250514");
assert_eq!(provider.key, "sk-ant-test");
assert_eq!(provider.base_url, "https://api.anthropic.com/v1");
assert_eq!(
@ -180,7 +176,7 @@ mod tests {
#[test]
fn default_model_accessor() {
let provider = AnthropicProvider::new(Some("sk-ant-test".into()), None).unwrap();
assert_eq!(provider.default_model(), "claude-sonnet-4-6");
assert_eq!(provider.default_model(), "claude-sonnet-4-20250514");
}
#[test]

View file

@ -1,363 +0,0 @@
/// Google Gemini provider — Gemini models via the Generative Language API.
/// Gemini's request shape differs from OpenAI/Anthropic: the system message is a
/// top-level `systemInstruction`, conversation turns live in `contents` (with the
/// assistant role renamed to `model`), and generation knobs sit under
/// `generationConfig`. API-key auth is sent as an `x-goog-api-key` header.
use std::time::Duration;
use async_trait::async_trait;
use serde_json::json;
use crate::clean::strip_thinking_tags;
use crate::error::LlmError;
use crate::provider::{CompletionRequest, LlmProvider};
use super::load_api_key;
const DEFAULT_GEMINI_BASE_URL: &str = "https://generativelanguage.googleapis.com/v1beta";
/// Default model. Gemini 2.5 Flash/Pro are "thinking" models: internal reasoning
/// tokens count against `maxOutputTokens`, so the output budget must comfortably
/// exceed the visible response (see `request_body`) or the model returns
/// `finishReason=MAX_TOKENS` with no text. Set `GEMINI_MODEL` to a non-thinking
/// model (e.g. `gemini-2.0-flash`) to avoid the reasoning overhead entirely.
const DEFAULT_GEMINI_MODEL: &str = "gemini-2.5-flash";
/// Reports whether `model` can be interpolated into the request URL path as-is.
///
/// A name is accepted only when it is non-empty and made up entirely of ASCII
/// letters, ASCII digits, `-`, `.` and `_` (e.g. `gemini-2.5-flash`,
/// `gemini-2.0-flash-001`). Everything else (`/`, `:`, `?`, `#`, whitespace,
/// non-ASCII text) is refused, because the model name is placed in the URL
/// path and such characters could point the request at another path/method.
fn is_safe_model_name(model: &str) -> bool {
    if model.is_empty() {
        return false;
    }
    for ch in model.chars() {
        let allowed = match ch {
            'a'..='z' | 'A'..='Z' | '0'..='9' => true,
            '-' | '.' | '_' => true,
            _ => false,
        };
        if !allowed {
            return false;
        }
    }
    true
}
/// Client for Gemini's `generateContent` endpoint, authenticated with an API key.
pub struct GeminiProvider {
/// HTTP client reused for every request made by this provider.
client: reqwest::Client,
/// API key, sent in the `x-goog-api-key` request header.
key: String,
/// API root that request URLs are built from; stored without trailing slashes.
base_url: String,
/// Model used when a request leaves its `model` field empty.
default_model: String,
}
impl GeminiProvider {
    /// Builds a provider from explicit overrides, falling back to the
    /// environment and then to built-in defaults.
    ///
    /// * `key_override` — API key; resolved by `load_api_key` together with
    ///   the `GEMINI_API_KEY` env var.
    /// * `base_url` — API root; falls back to `GEMINI_BASE_URL`, then
    ///   `DEFAULT_GEMINI_BASE_URL`. Trailing slashes are removed.
    /// * `model` — default model; falls back to `GEMINI_MODEL`, then
    ///   `DEFAULT_GEMINI_MODEL`.
    ///
    /// Blank (empty or whitespace-only) base-URL/model values are treated as
    /// unset so they fall through to the next source instead of producing a
    /// relative request URL or a model name that is always rejected.
    ///
    /// Returns `None` if no API key is available (param or `GEMINI_API_KEY` env).
    pub fn new(
        key_override: Option<String>,
        base_url: Option<String>,
        model: Option<String>,
    ) -> Option<Self> {
        let key = load_api_key(key_override, "GEMINI_API_KEY")?;

        // A variable that is present but left blank (e.g. `GEMINI_MODEL=` in
        // an env template) must not shadow the default.
        let non_blank = |value: Option<String>| value.filter(|v| !v.trim().is_empty());

        let base_url = non_blank(base_url)
            .or_else(|| non_blank(std::env::var("GEMINI_BASE_URL").ok()))
            .unwrap_or_else(|| DEFAULT_GEMINI_BASE_URL.into())
            .trim_end_matches('/')
            .to_string();
        let default_model = non_blank(model)
            .or_else(|| non_blank(std::env::var("GEMINI_MODEL").ok()))
            .unwrap_or_else(|| DEFAULT_GEMINI_MODEL.into());

        // Bounded timeouts; if the builder fails, fall back to a default
        // client rather than failing construction.
        let client = reqwest::Client::builder()
            .timeout(Duration::from_secs(120))
            .connect_timeout(Duration::from_secs(10))
            .build()
            .unwrap_or_else(|_| reqwest::Client::new());

        Some(Self {
            client,
            key,
            base_url,
            default_model,
        })
    }

    /// Model used when a request does not name one.
    pub fn default_model(&self) -> &str {
        &self.default_model
    }

    /// Build the `generateContent` body from a generic completion request.
    /// System messages become `systemInstruction`; every other turn becomes an
    /// entry in `contents` (assistant → `model`, anything else → `user`);
    /// `json_mode` constrains the model to valid JSON via `responseMimeType`.
    fn request_body(&self, request: &CompletionRequest) -> serde_json::Value {
        // Single pass over the messages; relative order inside each list is
        // preserved.
        let mut contents: Vec<serde_json::Value> = Vec::new();
        let mut system_parts: Vec<serde_json::Value> = Vec::new();
        for m in request.messages.iter() {
            if m.role == "system" {
                system_parts.push(json!({ "text": m.content }));
            } else {
                let role = if m.role == "assistant" {
                    "model"
                } else {
                    "user"
                };
                contents.push(json!({ "role": role, "parts": [{ "text": m.content }] }));
            }
        }

        // `maxOutputTokens` is a ceiling, not a reservation — you're billed per
        // token actually produced — so default generously. Gemini 2.5 "thinking"
        // models spend part of this budget on internal reasoning; too low a cap
        // makes them return `finishReason=MAX_TOKENS` with no visible text.
        let mut generation_config = json!({
            "maxOutputTokens": request.max_tokens.unwrap_or(8192),
        });
        if let Some(temp) = request.temperature {
            generation_config["temperature"] = json!(temp);
        }
        if request.json_mode {
            generation_config["responseMimeType"] = json!("application/json");
        }

        let mut body = json!({
            "contents": contents,
            "generationConfig": generation_config,
        });
        // Gemini rejects an empty `systemInstruction`, so only attach it when a
        // system message is actually present.
        if !system_parts.is_empty() {
            body["systemInstruction"] = json!({ "parts": system_parts });
        }
        body
    }
}
#[async_trait]
impl LlmProvider for GeminiProvider {
    /// Sends one `generateContent` call and returns the candidate's text.
    ///
    /// Uses `request.model` when it is non-empty, otherwise the provider's
    /// default model. Returns `LlmError::ProviderError` when the model name is
    /// not a plain identifier, when the HTTP status is not a success, or when
    /// the first candidate carries no text.
    async fn complete(&self, request: &CompletionRequest) -> Result<String, LlmError> {
        let model: &str = if request.model.is_empty() {
            &self.default_model
        } else {
            &request.model
        };

        // Gemini takes the model in the URL path (OpenAI/Anthropic take it in
        // the body), so a caller-supplied name that is not a plain identifier
        // is refused up front to rule out path/query injection.
        if !is_safe_model_name(model) {
            return Err(LlmError::ProviderError(format!(
                "invalid gemini model name: {model:?}"
            )));
        }

        let payload = self.request_body(request);
        // The key travels in a header, never in the URL, so it cannot end up
        // in request logs, proxies, or referrer headers.
        let endpoint = format!("{}/models/{model}:generateContent", self.base_url);
        let resp = self
            .client
            .post(&endpoint)
            .header("x-goog-api-key", &self.key)
            .header("content-type", "application/json")
            .json(&payload)
            .send()
            .await?;

        let status = resp.status();
        if !status.is_success() {
            // Keep at most 500 characters of the error body in the message.
            let raw = resp.text().await.unwrap_or_default();
            let safe_text: String = raw.chars().take(500).collect();
            return Err(LlmError::ProviderError(format!(
                "gemini returned {status}: {safe_text}"
            )));
        }

        // Size-capped read to defend against adversarial payloads.
        let json = super::response_json_capped(resp).await?;

        // Shape: {"candidates":[{"content":{"parts":[{"text":"..."}]}}]}.
        // A candidate may hold several text parts; join them in order.
        let mut text = String::new();
        if let Some(parts) = json["candidates"][0]["content"]["parts"].as_array() {
            for part in parts {
                if let Some(chunk) = part["text"].as_str() {
                    text.push_str(chunk);
                }
            }
        }

        if !text.is_empty() {
            return Ok(strip_thinking_tags(&text));
        }

        // Nothing usable came back. Report Gemini's finishReason (or the
        // prompt-level block reason) so MAX_TOKENS and SAFETY outcomes are
        // visible in logs/telemetry instead of looking like a parse failure.
        // The chain falls through to the next provider on any Err.
        let reason = json["candidates"][0]["finishReason"]
            .as_str()
            .or_else(|| json["promptFeedback"]["blockReason"].as_str())
            .unwrap_or("unknown");
        Err(LlmError::ProviderError(format!(
            "gemini returned no text (finishReason={reason})"
        )))
    }

    /// Availability is decided locally: true whenever the stored key is non-empty.
    async fn is_available(&self) -> bool {
        !self.key.is_empty()
    }

    /// Identifier this provider reports for itself.
    fn name(&self) -> &str {
        "gemini"
    }
}
#[cfg(test)]
mod tests {
// Unit tests for provider construction, model-name validation, and the
// `generateContent` request body. None of them calls `complete`, so no
// request is ever sent.
use super::*;
use crate::provider::Message;
// Provider built from an explicit test key with no base-URL or model override.
fn provider() -> GeminiProvider {
GeminiProvider::new(Some("test-key".into()), None, None).expect("should construct")
}
// Shorthand for a chat message with the given role and content.
fn msg(role: &str, content: &str) -> Message {
Message {
role: role.into(),
content: content.into(),
}
}
// Request with an empty model, no temperature, and no max_tokens, so each
// test only varies the messages and `json_mode`.
fn request(messages: Vec<Message>, json_mode: bool) -> CompletionRequest {
CompletionRequest {
model: String::new(),
messages,
temperature: None,
max_tokens: None,
json_mode,
}
}
// An explicitly supplied empty key must not yield a provider.
#[test]
fn empty_key_returns_none() {
assert!(GeminiProvider::new(Some(String::new()), None, None).is_none());
}
#[test]
fn model_name_validation_blocks_path_injection() {
// Real model identifiers pass.
assert!(is_safe_model_name("gemini-2.5-flash"));
assert!(is_safe_model_name("gemini-2.0-flash-001"));
assert!(is_safe_model_name("gemini-1.5-pro-002"));
// Anything that could alter the request path/method is rejected.
assert!(!is_safe_model_name(""));
assert!(!is_safe_model_name(
"gemini-2.5-flash:streamGenerateContent"
));
assert!(!is_safe_model_name("../../models/x"));
assert!(!is_safe_model_name("model?alt=sse"));
assert!(!is_safe_model_name("a b"));
}
// With only a key supplied, name, model, and base URL take their defaults.
#[test]
fn explicit_key_constructs_with_defaults() {
let p = provider();
assert_eq!(p.name(), "gemini");
assert_eq!(p.key, "test-key");
assert_eq!(p.default_model, DEFAULT_GEMINI_MODEL);
assert_eq!(p.default_model(), DEFAULT_GEMINI_MODEL);
assert_eq!(p.base_url, DEFAULT_GEMINI_BASE_URL);
}
// Explicit base URL and model overrides are honoured; the trailing slash on
// the base URL is dropped.
#[test]
fn custom_base_url_trims_trailing_slash_and_model() {
let p = GeminiProvider::new(
Some("test-key".into()),
Some("https://example.test/v1beta/".into()),
Some("gemini-2.5-pro".into()),
)
.unwrap();
assert_eq!(p.base_url, "https://example.test/v1beta");
assert_eq!(p.default_model, "gemini-2.5-pro");
}
#[test]
fn maps_user_and_assistant_roles_into_contents() {
let p = provider();
let body = p.request_body(&request(
vec![msg("user", "hello"), msg("assistant", "hi there")],
false,
));
let contents = body["contents"].as_array().unwrap();
assert_eq!(contents.len(), 2);
assert_eq!(contents[0]["role"], "user");
assert_eq!(contents[0]["parts"][0]["text"], "hello");
// assistant must be renamed to Gemini's "model" role.
assert_eq!(contents[1]["role"], "model");
assert_eq!(contents[1]["parts"][0]["text"], "hi there");
// No system message -> no systemInstruction key at all.
assert!(body.get("systemInstruction").is_none());
}
// A system message must leave `contents` and appear under `systemInstruction`.
#[test]
fn system_message_becomes_system_instruction_not_contents() {
let p = provider();
let body = p.request_body(&request(
vec![msg("system", "be terse"), msg("user", "hello")],
false,
));
let contents = body["contents"].as_array().unwrap();
assert_eq!(contents.len(), 1, "system message lifted out of contents");
assert_eq!(contents[0]["role"], "user");
assert_eq!(body["systemInstruction"]["parts"][0]["text"], "be terse");
}
// `responseMimeType` is present only when `json_mode` is on.
#[test]
fn json_mode_toggles_response_mime_type() {
let p = provider();
let on = p.request_body(&request(vec![msg("user", "x")], true));
assert_eq!(
on["generationConfig"]["responseMimeType"],
"application/json"
);
let off = p.request_body(&request(vec![msg("user", "x")], false));
assert!(off["generationConfig"].get("responseMimeType").is_none());
}
// Checks the 8192 default for `maxOutputTokens` and that explicit
// `max_tokens` / `temperature` values are passed through.
#[test]
fn max_output_tokens_default_and_temperature_override() {
let p = provider();
let default_body = p.request_body(&request(vec![msg("user", "x")], false));
assert_eq!(default_body["generationConfig"]["maxOutputTokens"], 8192);
// No temperature set -> key omitted.
assert!(
default_body["generationConfig"]
.get("temperature")
.is_none()
);
let mut req = request(vec![msg("user", "x")], false);
req.max_tokens = Some(256);
req.temperature = Some(0.5); // 0.5 is exact in both f32 and f64
let body = p.request_body(&req);
assert_eq!(body["generationConfig"]["maxOutputTokens"], 256);
assert_eq!(body["generationConfig"]["temperature"], 0.5);
}
// Env var fallback tests mutate process-global state and race with parallel
// tests. Run in isolation if needed:
// cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
#[test]
#[ignore = "mutates process env; run with --test-threads=1"]
fn env_var_key_fallback() {
unsafe { std::env::set_var("GEMINI_API_KEY", "gemini-env-key") };
let p = GeminiProvider::new(None, None, None).expect("should construct from env");
assert_eq!(p.key, "gemini-env-key");
unsafe { std::env::remove_var("GEMINI_API_KEY") };
}
// With neither an explicit key nor `GEMINI_API_KEY`, construction yields `None`.
#[test]
#[ignore = "mutates process env; run with --test-threads=1"]
fn no_key_returns_none() {
unsafe { std::env::remove_var("GEMINI_API_KEY") };
assert!(GeminiProvider::new(None, None, None).is_none());
}
}

View file

@ -1,5 +1,4 @@
pub mod anthropic;
pub mod gemini;
pub mod ollama;
pub mod openai;

View file

@ -1,7 +1,5 @@
/// Ollama provider — talks to a local Ollama instance (default localhost:11434).
/// First choice in the provider chain: free, private, fast on Apple Silicon.
use std::time::Duration;
use async_trait::async_trait;
use serde_json::json;
@ -26,11 +24,7 @@ impl OllamaProvider {
.unwrap_or_else(|| "qwen3:8b".into());
Self {
client: reqwest::Client::builder()
.timeout(Duration::from_secs(120))
.connect_timeout(Duration::from_secs(10))
.build()
.unwrap_or_else(|_| reqwest::Client::new()),
client: reqwest::Client::new(),
base_url,
default_model,
}
@ -76,7 +70,11 @@ impl LlmProvider for OllamaProvider {
if !resp.status().is_success() {
let status = resp.status();
let text = resp.text().await.unwrap_or_default();
let safe_text = text.chars().take(500).collect::<String>();
let safe_text = if text.len() > 500 {
&text[..500]
} else {
&text
};
return Err(LlmError::ProviderError(format!(
"ollama returned {status}: {safe_text}"
)));
@ -100,8 +98,7 @@ impl LlmProvider for OllamaProvider {
async fn is_available(&self) -> bool {
let url = format!("{}/api/tags", self.base_url);
let req = self.client.get(&url).timeout(Duration::from_secs(10));
matches!(req.send().await, Ok(r) if r.status().is_success())
matches!(self.client.get(&url).send().await, Ok(r) if r.status().is_success())
}
fn name(&self) -> &str {

View file

@ -1,6 +1,4 @@
/// OpenAI provider — works with api.openai.com and any OpenAI-compatible endpoint.
use std::time::Duration;
use async_trait::async_trait;
use serde_json::json;
@ -71,11 +69,7 @@ impl OpenAiProvider {
let key = load_api_key(key_override, "OPENAI_API_KEY")?;
Some(Self {
client: reqwest::Client::builder()
.timeout(Duration::from_secs(120))
.connect_timeout(Duration::from_secs(10))
.build()
.unwrap_or_else(|_| reqwest::Client::new()),
client: reqwest::Client::new(),
key,
base_url: base_url
.or_else(|| std::env::var("OPENAI_BASE_URL").ok())
@ -138,7 +132,11 @@ impl LlmProvider for OpenAiProvider {
if !resp.status().is_success() {
let status = resp.status();
let text = resp.text().await.unwrap_or_default();
let safe_text = text.chars().take(500).collect::<String>();
let safe_text = if text.len() > 500 {
&text[..500]
} else {
&text
};
return Err(LlmError::ProviderError(format!(
"openai returned {status}: {safe_text}"
)));

View file

@ -323,10 +323,9 @@ impl WebclawMcp {
if params.urls.len() > 100 {
return Err("batch is limited to 100 URLs per request".into());
}
// No up-front DNS pre-validation: it aborted the whole batch on a
// single unresolvable URL. The fetch layer applies the same SSRF
// guard (validate_public_http_url) per URL, so bad entries surface
// as individual per-URL errors below instead of failing the batch.
for u in &params.urls {
validate_url(u).await?;
}
let format = params.format.as_deref().unwrap_or("markdown");
let concurrency = params.concurrency.unwrap_or(5);
@ -668,55 +667,13 @@ impl WebclawMcp {
))
}
/// Search the web for a query and return structured results.
///
/// Resolves the backend in priority order:
/// 1. `SERPER_API_KEY` set → local Serper.dev search with the user's
/// own key (no hosted API needed). Supports `country`, `lang`, and
/// `scrape` (fetch + extract each result page).
/// 2. else `WEBCLAW_API_KEY` set → the hosted webclaw search API.
/// 3. else → an error explaining both options.
/// Search the web for a query and return structured results. Requires WEBCLAW_API_KEY.
#[tool]
async fn search(&self, Parameters(params): Parameters<SearchParams>) -> Result<String, String> {
// Local path: user's own Serper key. Preferred when present so the
// tool works without the hosted API and without spending credits.
if let Ok(serper_key) = std::env::var("SERPER_API_KEY")
&& !serper_key.trim().is_empty()
{
let opts = webclaw_fetch::SearchOptions {
num_results: params.num_results.unwrap_or(5) as usize,
country: params.country.clone(),
lang: params.lang.clone(),
scrape: params.scrape.unwrap_or(false),
};
let results = webclaw_fetch::search(
self.fetch_client.as_ref(),
&serper_key,
&params.query,
&opts,
)
.await
.map_err(|e| format!("search error: {e}"))?;
let mut output = format!("Found {} results:\n\n", results.len());
for r in &results {
output.push_str(&format!("{}. {}\n {}\n", r.position, r.title, r.link));
if !r.snippet.is_empty() {
output.push_str(&format!(" {}\n", r.snippet));
}
if let Some(ref content) = r.content {
output.push_str(&format!("\n{content}\n"));
}
output.push('\n');
}
return Ok(output);
}
// Hosted path: the webclaw cloud API.
let cloud = self.cloud.as_ref().ok_or(
"Search requires a search backend: set SERPER_API_KEY for local search \
(get one free at serper.dev), or WEBCLAW_API_KEY for the hosted API.",
)?;
let cloud = self
.cloud
.as_ref()
.ok_or("Search requires WEBCLAW_API_KEY. Get a key at https://webclaw.io")?;
let mut body = json!({ "query": params.query });
if let Some(num) = params.num_results {

View file

@ -4,89 +4,6 @@
use schemars::JsonSchema;
use serde::Deserialize;
// ── Coercion helpers ────────────────────────────────────────────────────────
//
// MCP clients (Claude Desktop, VS Code extension, etc.) sometimes pass numeric
// parameters as JSON strings (e.g. `"depth": "3"` instead of `"depth": 3`).
// serde's default u32/usize deserialisers reject strings with:
//
// "invalid type: string \"3\", expected u32"
//
// These helpers accept both forms transparently so callers never see that
// error regardless of which representation their client sends. The same
// problem hits booleans: clients send `"true"`/`"false"` as JSON strings,
// which serde's default bool deserialiser rejects — `deser_opt_bool_or_str`
// covers that case.
fn deser_opt_u32_or_str<'de, D>(d: D) -> Result<Option<u32>, D::Error>
where
D: serde::Deserializer<'de>,
{
#[derive(serde::Deserialize)]
#[serde(untagged)]
enum NumOrStr {
Num(u32),
Str(String),
}
match Option::<NumOrStr>::deserialize(d)? {
None => Ok(None),
Some(NumOrStr::Num(n)) => Ok(Some(n)),
Some(NumOrStr::Str(s)) => {
s.trim().parse::<u32>().map(Some).map_err(|_| {
serde::de::Error::custom(format!("expected a u32, got string \"{s}\""))
})
}
}
}
fn deser_opt_usize_or_str<'de, D>(d: D) -> Result<Option<usize>, D::Error>
where
D: serde::Deserializer<'de>,
{
#[derive(serde::Deserialize)]
#[serde(untagged)]
enum NumOrStr {
Num(usize),
Str(String),
}
match Option::<NumOrStr>::deserialize(d)? {
None => Ok(None),
Some(NumOrStr::Num(n)) => Ok(Some(n)),
Some(NumOrStr::Str(s)) => {
s.trim().parse::<usize>().map(Some).map_err(|_| {
serde::de::Error::custom(format!("expected a usize, got string \"{s}\""))
})
}
}
}
fn deser_opt_bool_or_str<'de, D>(d: D) -> Result<Option<bool>, D::Error>
where
D: serde::Deserializer<'de>,
{
#[derive(serde::Deserialize)]
#[serde(untagged)]
enum BoolOrStr {
Bool(bool),
Str(String),
}
match Option::<BoolOrStr>::deserialize(d)? {
None => Ok(None),
Some(BoolOrStr::Bool(b)) => Ok(Some(b)),
// Accept "true"/"false" case-insensitively (trimmed). Reject anything
// else with a clear message rather than silently coercing it.
Some(BoolOrStr::Str(s)) => match s.trim().to_ascii_lowercase().as_str() {
"true" => Ok(Some(true)),
"false" => Ok(Some(false)),
_ => Err(serde::de::Error::custom(format!(
"expected a bool, got string \"{s}\""
))),
},
}
}
// ── Parameter structs ───────────────────────────────────────────────────────
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ScrapeParams {
/// URL to scrape
@ -98,7 +15,6 @@ pub struct ScrapeParams {
/// CSS selectors to exclude from output
pub exclude_selectors: Option<Vec<String>>,
/// If true, extract only the main content (article/main element)
#[serde(default, deserialize_with = "deser_opt_bool_or_str")]
pub only_main_content: Option<bool>,
/// Browser profile: "chrome" (default), "firefox", or "random"
pub browser: Option<String>,
@ -111,16 +27,12 @@ pub struct CrawlParams {
/// Seed URL to start crawling from
pub url: String,
/// Maximum link depth to follow (default: 2)
#[serde(default, deserialize_with = "deser_opt_u32_or_str")]
pub depth: Option<u32>,
/// Maximum number of pages to crawl (default: 50)
#[serde(default, deserialize_with = "deser_opt_usize_or_str")]
pub max_pages: Option<usize>,
/// Number of concurrent requests (default: 5)
#[serde(default, deserialize_with = "deser_opt_usize_or_str")]
pub concurrency: Option<usize>,
/// Seed the frontier from sitemap discovery before crawling
#[serde(default, deserialize_with = "deser_opt_bool_or_str")]
pub use_sitemap: Option<bool>,
/// Output format for each page: "markdown" (default), "llm", "text"
pub format: Option<String>,
@ -139,7 +51,6 @@ pub struct BatchParams {
/// Output format: "markdown" (default), "llm", "text"
pub format: Option<String>,
/// Number of concurrent requests (default: 5)
#[serde(default, deserialize_with = "deser_opt_usize_or_str")]
pub concurrency: Option<usize>,
}
@ -158,7 +69,6 @@ pub struct SummarizeParams {
/// URL to fetch and summarize
pub url: String,
/// Number of sentences in the summary (default: 3)
#[serde(default, deserialize_with = "deser_opt_usize_or_str")]
pub max_sentences: Option<usize>,
}
@ -181,7 +91,6 @@ pub struct ResearchParams {
/// Research query or question to investigate
pub query: String,
/// Enable deep research mode for more thorough investigation (default: false)
#[serde(default, deserialize_with = "deser_opt_bool_or_str")]
pub deep: Option<bool>,
/// Topic hint to guide research focus (e.g. "technology", "finance", "science")
pub topic: Option<String>,
@ -191,19 +100,8 @@ pub struct ResearchParams {
pub struct SearchParams {
/// Search query
pub query: String,
/// Number of results to return (default: 5, max: 10)
#[serde(default, deserialize_with = "deser_opt_u32_or_str")]
/// Number of results to return (default: 10)
pub num_results: Option<u32>,
/// Country code for localization (e.g. "us", "gb", "it").
/// Only used by the local Serper path (SERPER_API_KEY).
pub country: Option<String>,
/// Language code for localization (e.g. "en", "it").
/// Only used by the local Serper path (SERPER_API_KEY).
pub lang: Option<String>,
/// When true, fetch + extract each result page and include its
/// markdown. Only used by the local Serper path (SERPER_API_KEY).
#[serde(default, deserialize_with = "deser_opt_bool_or_str")]
pub scrape: Option<bool>,
}
/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
@ -222,292 +120,3 @@ pub struct VerticalParams {
/// so rmcp can generate a schema and parse the (empty) JSON-RPC params.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ListExtractorsParams {}
#[cfg(test)]
mod tests {
// Deserialization tests for the string-coercing parameter fields. Each
// field is exercised the same four ways: value given as a JSON string,
// value given in its native JSON type, field absent (must be `None`), and
// an unparseable string (must be an error).
use super::*;
// ── CrawlParams.depth (u32) ──────────────────────────────────────────────
#[test]
fn crawl_depth_from_numeric_string() {
let v: CrawlParams =
serde_json::from_str(r#"{"url":"https://x.com","depth":"3"}"#).unwrap();
assert_eq!(v.depth, Some(3));
}
#[test]
fn crawl_depth_from_number() {
let v: CrawlParams = serde_json::from_str(r#"{"url":"https://x.com","depth":3}"#).unwrap();
assert_eq!(v.depth, Some(3));
}
#[test]
fn crawl_depth_absent_is_none() {
let v: CrawlParams = serde_json::from_str(r#"{"url":"https://x.com"}"#).unwrap();
assert_eq!(v.depth, None);
}
#[test]
fn crawl_depth_non_numeric_string_errors() {
let e = serde_json::from_str::<CrawlParams>(r#"{"url":"https://x.com","depth":"abc"}"#);
assert!(e.is_err(), "expected Err, got {e:?}");
}
// ── CrawlParams.max_pages (usize) ────────────────────────────────────────
#[test]
fn crawl_max_pages_from_numeric_string() {
let v: CrawlParams =
serde_json::from_str(r#"{"url":"https://x.com","max_pages":"50"}"#).unwrap();
assert_eq!(v.max_pages, Some(50));
}
#[test]
fn crawl_max_pages_from_number() {
let v: CrawlParams =
serde_json::from_str(r#"{"url":"https://x.com","max_pages":50}"#).unwrap();
assert_eq!(v.max_pages, Some(50));
}
#[test]
fn crawl_max_pages_absent_is_none() {
let v: CrawlParams = serde_json::from_str(r#"{"url":"https://x.com"}"#).unwrap();
assert_eq!(v.max_pages, None);
}
#[test]
fn crawl_max_pages_non_numeric_string_errors() {
let e = serde_json::from_str::<CrawlParams>(r#"{"url":"https://x.com","max_pages":"abc"}"#);
assert!(e.is_err(), "expected Err, got {e:?}");
}
// ── CrawlParams.concurrency (usize) ──────────────────────────────────────
#[test]
fn crawl_concurrency_from_numeric_string() {
let v: CrawlParams =
serde_json::from_str(r#"{"url":"https://x.com","concurrency":"5"}"#).unwrap();
assert_eq!(v.concurrency, Some(5));
}
#[test]
fn crawl_concurrency_from_number() {
let v: CrawlParams =
serde_json::from_str(r#"{"url":"https://x.com","concurrency":5}"#).unwrap();
assert_eq!(v.concurrency, Some(5));
}
#[test]
fn crawl_concurrency_absent_is_none() {
let v: CrawlParams = serde_json::from_str(r#"{"url":"https://x.com"}"#).unwrap();
assert_eq!(v.concurrency, None);
}
#[test]
fn crawl_concurrency_non_numeric_string_errors() {
let e =
serde_json::from_str::<CrawlParams>(r#"{"url":"https://x.com","concurrency":"abc"}"#);
assert!(e.is_err(), "expected Err, got {e:?}");
}
// ── BatchParams.concurrency (usize) ──────────────────────────────────────
#[test]
fn batch_concurrency_from_numeric_string() {
let v: BatchParams =
serde_json::from_str(r#"{"urls":["https://x.com"],"concurrency":"5"}"#).unwrap();
assert_eq!(v.concurrency, Some(5));
}
#[test]
fn batch_concurrency_from_number() {
let v: BatchParams =
serde_json::from_str(r#"{"urls":["https://x.com"],"concurrency":5}"#).unwrap();
assert_eq!(v.concurrency, Some(5));
}
#[test]
fn batch_concurrency_absent_is_none() {
let v: BatchParams = serde_json::from_str(r#"{"urls":["https://x.com"]}"#).unwrap();
assert_eq!(v.concurrency, None);
}
#[test]
fn batch_concurrency_non_numeric_string_errors() {
let e = serde_json::from_str::<BatchParams>(
r#"{"urls":["https://x.com"],"concurrency":"abc"}"#,
);
assert!(e.is_err(), "expected Err, got {e:?}");
}
// ── SearchParams.num_results (u32) ───────────────────────────────────────
#[test]
fn search_num_results_from_numeric_string() {
let v: SearchParams =
serde_json::from_str(r#"{"query":"rust","num_results":"10"}"#).unwrap();
assert_eq!(v.num_results, Some(10));
}
#[test]
fn search_num_results_from_number() {
let v: SearchParams = serde_json::from_str(r#"{"query":"rust","num_results":10}"#).unwrap();
assert_eq!(v.num_results, Some(10));
}
#[test]
fn search_num_results_absent_is_none() {
let v: SearchParams = serde_json::from_str(r#"{"query":"rust"}"#).unwrap();
assert_eq!(v.num_results, None);
}
#[test]
fn search_num_results_non_numeric_string_errors() {
let e = serde_json::from_str::<SearchParams>(r#"{"query":"rust","num_results":"abc"}"#);
assert!(e.is_err(), "expected Err, got {e:?}");
}
// ── SummarizeParams.max_sentences (usize) ────────────────────────────────
#[test]
fn summarize_max_sentences_from_numeric_string() {
let v: SummarizeParams =
serde_json::from_str(r#"{"url":"https://x.com","max_sentences":"3"}"#).unwrap();
assert_eq!(v.max_sentences, Some(3));
}
#[test]
fn summarize_max_sentences_from_number() {
let v: SummarizeParams =
serde_json::from_str(r#"{"url":"https://x.com","max_sentences":3}"#).unwrap();
assert_eq!(v.max_sentences, Some(3));
}
#[test]
fn summarize_max_sentences_absent_is_none() {
let v: SummarizeParams = serde_json::from_str(r#"{"url":"https://x.com"}"#).unwrap();
assert_eq!(v.max_sentences, None);
}
#[test]
fn summarize_max_sentences_non_numeric_string_errors() {
let e = serde_json::from_str::<SummarizeParams>(
r#"{"url":"https://x.com","max_sentences":"abc"}"#,
);
assert!(e.is_err(), "expected Err, got {e:?}");
}
// ── Boolean param string-coercion (issue #62) ───────────────────────────
// Only "true"/"false" strings are accepted; "yes", "nope", "1" and "maybe"
// below must all be rejected.
// ScrapeParams.only_main_content
#[test]
fn scrape_only_main_content_from_bool() {
let v: ScrapeParams =
serde_json::from_str(r#"{"url":"https://x.com","only_main_content":true}"#).unwrap();
assert_eq!(v.only_main_content, Some(true));
}
#[test]
fn scrape_only_main_content_from_string() {
let t: ScrapeParams =
serde_json::from_str(r#"{"url":"https://x.com","only_main_content":"true"}"#).unwrap();
assert_eq!(t.only_main_content, Some(true));
let f: ScrapeParams =
serde_json::from_str(r#"{"url":"https://x.com","only_main_content":"false"}"#).unwrap();
assert_eq!(f.only_main_content, Some(false));
}
#[test]
fn scrape_only_main_content_absent_is_none() {
let v: ScrapeParams = serde_json::from_str(r#"{"url":"https://x.com"}"#).unwrap();
assert_eq!(v.only_main_content, None);
}
#[test]
fn scrape_only_main_content_non_bool_string_errors() {
let e = serde_json::from_str::<ScrapeParams>(
r#"{"url":"https://x.com","only_main_content":"yes"}"#,
);
assert!(e.is_err(), "expected Err, got {e:?}");
}
// CrawlParams.use_sitemap
#[test]
fn crawl_use_sitemap_from_bool() {
let v: CrawlParams =
serde_json::from_str(r#"{"url":"https://x.com","use_sitemap":false}"#).unwrap();
assert_eq!(v.use_sitemap, Some(false));
}
#[test]
fn crawl_use_sitemap_from_string() {
let v: CrawlParams =
serde_json::from_str(r#"{"url":"https://x.com","use_sitemap":"true"}"#).unwrap();
assert_eq!(v.use_sitemap, Some(true));
}
#[test]
fn crawl_use_sitemap_absent_is_none() {
let v: CrawlParams = serde_json::from_str(r#"{"url":"https://x.com"}"#).unwrap();
assert_eq!(v.use_sitemap, None);
}
#[test]
fn crawl_use_sitemap_non_bool_string_errors() {
let e =
serde_json::from_str::<CrawlParams>(r#"{"url":"https://x.com","use_sitemap":"nope"}"#);
assert!(e.is_err(), "expected Err, got {e:?}");
}
// ResearchParams.deep
#[test]
fn research_deep_from_bool() {
let v: ResearchParams = serde_json::from_str(r#"{"query":"rust","deep":true}"#).unwrap();
assert_eq!(v.deep, Some(true));
}
#[test]
fn research_deep_from_string() {
let v: ResearchParams = serde_json::from_str(r#"{"query":"rust","deep":"true"}"#).unwrap();
assert_eq!(v.deep, Some(true));
}
#[test]
fn research_deep_absent_is_none() {
let v: ResearchParams = serde_json::from_str(r#"{"query":"rust"}"#).unwrap();
assert_eq!(v.deep, None);
}
#[test]
fn research_deep_non_bool_string_errors() {
// Numeric-looking strings are NOT accepted for bools (avoids ambiguity).
let e = serde_json::from_str::<ResearchParams>(r#"{"query":"rust","deep":"1"}"#);
assert!(e.is_err(), "expected Err, got {e:?}");
}
// SearchParams.scrape
#[test]
fn search_scrape_from_bool() {
let v: SearchParams = serde_json::from_str(r#"{"query":"rust","scrape":true}"#).unwrap();
assert_eq!(v.scrape, Some(true));
}
// Mixed-case "True" must be accepted: matching is case-insensitive.
#[test]
fn search_scrape_from_string_case_insensitive() {
let v: SearchParams = serde_json::from_str(r#"{"query":"rust","scrape":"True"}"#).unwrap();
assert_eq!(v.scrape, Some(true));
}
#[test]
fn search_scrape_absent_is_none() {
let v: SearchParams = serde_json::from_str(r#"{"query":"rust"}"#).unwrap();
assert_eq!(v.scrape, None);
}
#[test]
fn search_scrape_non_bool_string_errors() {
let e = serde_json::from_str::<SearchParams>(r#"{"query":"rust","scrape":"maybe"}"#);
assert!(e.is_err(), "expected Err, got {e:?}");
}
}

View file

@ -38,24 +38,16 @@ pub enum ApiError {
#[error("internal: {0}")]
Internal(String),
#[error("{0}")]
NotImplemented(String),
}
impl ApiError {
pub fn bad_request(msg: impl Into<String>) -> Self {
Self::BadRequest(msg.into())
}
#[allow(dead_code)]
pub fn internal(msg: impl Into<String>) -> Self {
Self::Internal(msg.into())
}
/// 501 — a capability the operator hasn't configured (e.g. search
/// without `SERPER_API_KEY`). Distinct from `BadRequest` (client's
/// fault) and `Internal` (our fault): it's a deployment-config gap.
pub fn not_implemented(msg: impl Into<String>) -> Self {
Self::NotImplemented(msg.into())
}
fn status(&self) -> StatusCode {
match self {
@ -65,7 +57,6 @@ impl ApiError {
Self::Fetch(_) => StatusCode::BAD_GATEWAY,
Self::Extract(_) | Self::Llm(_) => StatusCode::UNPROCESSABLE_ENTITY,
Self::Internal(_) => StatusCode::INTERNAL_SERVER_ERROR,
Self::NotImplemented(_) => StatusCode::NOT_IMPLEMENTED,
}
}
}

View file

@ -94,7 +94,6 @@ async fn main() -> anyhow::Result<()> {
)
.route("/crawl", post(routes::crawl::crawl))
.route("/map", post(routes::map::map))
.route("/search", post(routes::search::search))
.route("/batch", post(routes::batch::batch))
.route("/extract", post(routes::extract::extract))
.route("/extractors", get(routes::structured::list_extractors))

View file

@ -6,11 +6,6 @@
//! (anti-bot bypass with stealth Chrome, JS rendering at scale,
//! per-user auth, billing, async job queues, agent loops) are
//! intentionally not implemented here. Use api.webclaw.io for those.
//!
//! `POST /v1/search` is supported when the operator supplies their own
//! Serper.dev API key via the `SERPER_API_KEY` env var (free key at
//! serper.dev). Without it, the route returns 501. This is the
//! bring-your-own-key path — no hosted webclaw account required.
pub mod batch;
pub mod brand;
@ -20,6 +15,5 @@ pub mod extract;
pub mod health;
pub mod map;
pub mod scrape;
pub mod search;
pub mod structured;
pub mod summarize;

View file

@ -1,68 +0,0 @@
//! POST /v1/search — web search via Serper.dev using the operator's own key.
//!
//! Enabled only when the server is started with `SERPER_API_KEY` set
//! (get a free key at serper.dev). Without it, this route returns 501 so
//! self-hosters know the capability exists but isn't configured.
//!
//! With `scrape: true`, each result page is fetched + extracted to
//! markdown via the shared [`webclaw_fetch::FetchClient`]. A per-result
//! fetch failure leaves that result's `content` null; it never fails the
//! whole search.
use axum::{Json, extract::State};
use serde::Deserialize;
use serde_json::{Value, json};
use crate::{error::ApiError, state::AppState};
/// JSON request body accepted by the `search` handler (`POST /v1/search`).
///
/// Only `query` must be present; the remaining fields are either optional
/// or fall back to a serde default when omitted.
#[derive(Debug, Deserialize)]
pub struct SearchRequest {
/// Search query text. The handler rejects the request with a bad-request
/// error when this is empty or whitespace-only.
pub query: String,
/// Max results to return. Defaults to 5 (see `default_num_results`) when
/// the field is omitted.
/// NOTE(review): the 1..=10 clamp is not applied in this file; presumably
/// it is enforced inside `webclaw_fetch::search` — confirm.
#[serde(default = "default_num_results")]
pub num_results: usize,
/// Country code for localization (e.g. "us", "gb", "it").
pub country: Option<String>,
/// Language code for localization (e.g. "en", "it").
pub lang: Option<String>,
/// When true, fetch + extract each result page and include its markdown.
/// Defaults to `false` when omitted.
#[serde(default)]
pub scrape: bool,
}
/// Serde default for `SearchRequest::num_results`: five results when the
/// client omits the field.
fn default_num_results() -> usize {
    const DEFAULT_NUM_RESULTS: usize = 5;
    DEFAULT_NUM_RESULTS
}
pub async fn search(
State(state): State<AppState>,
Json(req): Json<SearchRequest>,
) -> Result<Json<Value>, ApiError> {
if req.query.trim().is_empty() {
return Err(ApiError::bad_request("`query` is required"));
}
let serper_key = state.serper_api_key().ok_or_else(|| {
ApiError::not_implemented(
"search is not configured: start the server with SERPER_API_KEY set \
(get a free key at serper.dev)",
)
})?;
let opts = webclaw_fetch::SearchOptions {
num_results: req.num_results,
country: req.country.clone(),
lang: req.lang.clone(),
scrape: req.scrape,
};
let results = webclaw_fetch::search(state.fetch(), serper_key, &req.query, &opts)
.await
.map_err(|e| ApiError::internal(format!("search failed: {e}")))?;
Ok(Json(json!({
"query": req.query,
"count": results.len(),
"results": results,
})))
}

View file

@ -36,9 +36,6 @@ struct Inner {
pub fetch: Arc<FetchClient>,
/// Inbound bearer-auth token for this server's own `/v1/*` surface.
pub api_key: Option<String>,
/// Operator's own Serper.dev API key, read from `SERPER_API_KEY`.
/// Enables `/v1/search`. Unset = `/v1/search` returns 501.
pub serper_api_key: Option<String>,
}
impl AppState {
@ -69,20 +66,10 @@ impl AppState {
fetch = fetch.with_cloud(cloud);
}
// Operator's own Serper.dev key enables /v1/search. Empty/unset
// leaves search returning 501 with a setup hint.
let serper_api_key = std::env::var("SERPER_API_KEY")
.ok()
.filter(|k| !k.trim().is_empty());
if serper_api_key.is_some() {
info!("search enabled — using SERPER_API_KEY for /v1/search");
}
Ok(Self {
inner: Arc::new(Inner {
fetch: Arc::new(fetch),
api_key: inbound_api_key,
serper_api_key,
}),
})
}
@ -94,11 +81,6 @@ impl AppState {
/// Borrows the inbound bearer-auth token stored on the shared state, or
/// `None` when no token is set.
pub fn api_key(&self) -> Option<&str> {
    let stored = &self.inner.api_key;
    stored.as_ref().map(String::as_str)
}
/// Borrows the operator's Serper.dev key used by `/v1/search`, or `None`
/// when no key is configured.
pub fn serper_api_key(&self) -> Option<&str> {
    let stored = &self.inner.serper_api_key;
    stored.as_ref().map(String::as_str)
}
}
/// Resolve the outbound cloud key. Prefers `WEBCLAW_CLOUD_API_KEY`;

View file

@ -1,68 +1,6 @@
# Proxy-Backed Crawling
Use proxy rotation when you need to distribute a crawl across a proxy pool. webclaw supports a single proxy or a proxy file, and accepts any standard HTTP/HTTPS or SOCKS5 proxy URL.
## Using ColdProxy
[ColdProxy](https://coldproxy.com/) is webclaw's infrastructure partner, providing residential IPv4, residential IPv6, and datacenter IPv6 proxies across 195+ countries. Pass a single ColdProxy endpoint as a full proxy URL with `--proxy` / `WEBCLAW_PROXY`, or list several endpoints (one per line, in `host:port:user:pass` form) in a `--proxy-file` pool.
### 1. Get your endpoint
Sign in to your [ColdProxy dashboard](https://coldproxy.com/) and copy your proxy host, port, and credentials. Assemble them into a standard proxy URL:
```text
http://USERNAME:PASSWORD@HOST:PORT
```
### 2. One ColdProxy endpoint
```bash
export WEBCLAW_PROXY="http://USERNAME:PASSWORD@HOST:PORT"
webclaw https://example.com --format markdown
```
Or pass it inline:
```bash
webclaw https://example.com \
--proxy "http://USERNAME:PASSWORD@HOST:PORT" \
--format markdown
```
### 3. Rotate a ColdProxy pool
List one ColdProxy endpoint per line in `coldproxy.txt`. Pool files use `host:port:user:pass` (one entry per line; lines starting with `#` are ignored). Mix product types and regions to match your workload:
```text
# residential IPv4
HOST:PORT:USERNAME:PASSWORD
# residential IPv6
HOST:PORT:USERNAME:PASSWORD
# datacenter IPv6
HOST:PORT:USERNAME:PASSWORD
```
webclaw rotates across the pool per request:
```bash
webclaw https://docs.example.com \
--crawl \
--depth 2 \
--max-pages 200 \
--concurrency 10 \
--delay 200 \
--proxy-file coldproxy.txt \
--format markdown
```
### 4. Target a country
ColdProxy offers access across 195+ countries. Use the country-specific endpoint from your ColdProxy dashboard for each region you want to collect from (for example, a France residential endpoint for fr-localized pages). Add one endpoint per country to your pool file to spread a single crawl across regions.
### Choosing a product
- **Residential IPv4 / IPv6** — suitable for region-specific testing, localized content validation, public data collection, market monitoring, and regional QA.
- **Datacenter IPv6** — fastest and most cost-effective; best for high-volume crawling of tolerant endpoints.
Use proxy rotation when you need to distribute a crawl across a proxy pool. webclaw supports a single proxy or a proxy file.
## Single Proxy
@ -82,12 +20,12 @@ webclaw https://example.com \
## Proxy Pool
Create `proxies.txt` with one proxy per line in `host:port:user:pass` format (lines starting with `#` are ignored):
Create `proxies.txt` with one proxy per line:
```text
proxy-1.example.com:8080:user:pass
proxy-2.example.com:8080:user:pass
proxy-3.example.com:8080:user:pass
http://user:pass@proxy-1.example.com:8080
http://user:pass@proxy-2.example.com:8080
http://user:pass@proxy-3.example.com:8080
```
Run a crawl with controlled concurrency: