mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-09 22:35:12 +02:00
Compare commits
18 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d0d7b835f2 | ||
|
|
6519ac2a8b | ||
|
|
14ded4b99e | ||
|
|
72a451cfb6 | ||
|
|
17fce81a95 | ||
|
|
84a0f9774d | ||
|
|
519dfb7864 | ||
|
|
985a90b083 | ||
|
|
a1abf625a0 | ||
|
|
9a63c1a3ca | ||
|
|
58d274ffe9 | ||
|
|
f6000cba52 | ||
|
|
217bfe088b | ||
|
|
3b7d11328e | ||
|
|
363e17d362 | ||
|
|
8fe8bcb479 | ||
|
|
51260ae4e3 | ||
|
|
fe567a6af1 |
32 changed files with 3716 additions and 552 deletions
BIN
.github/banner.png
vendored
BIN
.github/banner.png
vendored
Binary file not shown.
|
Before Width: | Height: | Size: 44 KiB After Width: | Height: | Size: 48 KiB |
8
.github/workflows/ci.yml
vendored
8
.github/workflows/ci.yml
vendored
|
|
@ -14,7 +14,7 @@ jobs:
|
|||
name: Test
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v5
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- run: cargo test --workspace
|
||||
|
|
@ -23,7 +23,7 @@ jobs:
|
|||
name: Lint
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v5
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
with:
|
||||
components: clippy, rustfmt
|
||||
|
|
@ -35,7 +35,7 @@ jobs:
|
|||
name: WASM
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v5
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
with:
|
||||
targets: wasm32-unknown-unknown
|
||||
|
|
@ -50,7 +50,7 @@ jobs:
|
|||
name: Docs
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v5
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- run: cargo doc --no-deps --workspace
|
||||
|
|
|
|||
2
.github/workflows/deps.yml
vendored
2
.github/workflows/deps.yml
vendored
|
|
@ -14,7 +14,7 @@ jobs:
|
|||
name: Update webclaw-tls dependencies
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v5
|
||||
with:
|
||||
token: ${{ secrets.SYNC_PAT }}
|
||||
|
||||
|
|
|
|||
8
.github/workflows/release.yml
vendored
8
.github/workflows/release.yml
vendored
|
|
@ -32,7 +32,7 @@ jobs:
|
|||
os: windows-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v5
|
||||
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
with:
|
||||
|
|
@ -98,7 +98,7 @@ jobs:
|
|||
fi
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
uses: actions/upload-artifact@v5
|
||||
with:
|
||||
name: ${{ matrix.target }}
|
||||
path: ${{ env.ASSET }}
|
||||
|
|
@ -110,7 +110,7 @@ jobs:
|
|||
permissions:
|
||||
contents: write
|
||||
steps:
|
||||
- uses: actions/download-artifact@v4
|
||||
- uses: actions/download-artifact@v5
|
||||
with:
|
||||
path: artifacts
|
||||
|
||||
|
|
@ -142,7 +142,7 @@ jobs:
|
|||
contents: read
|
||||
packages: write
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v5
|
||||
|
||||
- uses: docker/setup-qemu-action@v3
|
||||
with:
|
||||
|
|
|
|||
29
CHANGELOG.md
29
CHANGELOG.md
|
|
@ -3,6 +3,35 @@
|
|||
All notable changes to webclaw are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.6.7] — 2026-06-09
|
||||
|
||||
### Changed
|
||||
- Updated the HTTP/TLS engine (wreq 6.0.0-rc.29, wreq-util 3.0.0-rc.12). This pulls in upstream robustness fixes: no more panic on responses with non-UTF8 header values, a fix for short reads when decoding large compressed bodies, and the TCP nodelay setting is restored. Browser TLS fingerprints are unchanged.
|
||||
|
||||
---
|
||||
|
||||
## [0.6.6] — 2026-06-09
|
||||
|
||||
### Added
|
||||
- Slow fetches now print a progress line to stderr every 10 seconds (`# webclaw: still fetching <url> (Ns)`) so a long request no longer looks like the CLI hung. Fast fetches stay silent and stdout is untouched.
|
||||
- New `--url-encoded` flag plus a warning when a URL looks like the shell split it on `&` or `?`. The warning suggests quoting the URL; pass `--url-encoded` to silence it when the URL is intentional.
|
||||
|
||||
---
|
||||
|
||||
## [0.6.5] — 2026-06-04
|
||||
|
||||
### Changed
|
||||
- Reddit threads extract reliably again. The old anonymous JSON endpoint is no longer available, so webclaw now reads old.reddit.com directly without an API key or JavaScript. You get the post plus the full nested comment tree, with authors, scores, timestamps, and reply nesting preserved. Comment text keeps its links and code blocks, hidden scores are reported as unknown rather than zero, and deleted comments stay in place so their replies aren't lost.
|
||||
|
||||
---
|
||||
|
||||
## [0.6.4] — 2026-05-19
|
||||
|
||||
### Added
|
||||
- API surface discovery: a new module extracts the API endpoints embedded in a page's inline scripts and linked JavaScript bundles. It surfaces relative REST paths, absolute URLs, GraphQL operations, and WebSocket endpoints that a sitemap alone cannot reveal. A built-in noise filter drops schema.org and json-schema.org references, bare framework paths, and other non-API matches so the result stays focused on the real surface.
|
||||
|
||||
---
|
||||
|
||||
## [0.6.3] — 2026-05-19
|
||||
|
||||
### Fixed
|
||||
|
|
|
|||
|
|
@ -91,18 +91,16 @@ Body is optional but encouraged for non-trivial changes.
|
|||
|
||||
```
|
||||
webclaw (this repo)
|
||||
├── crates/
|
||||
│ ├── webclaw-core/ # Pure extraction engine (HTML → markdown/json/text)
|
||||
│ ├── webclaw-fetch/ # HTTP client + crawler + sitemap + batch
|
||||
│ ├── webclaw-llm/ # LLM provider chain (Ollama → OpenAI → Anthropic)
|
||||
│ ├── webclaw-pdf/ # PDF text extraction
|
||||
│ ├── webclaw-cli/ # CLI binary
|
||||
│ └── webclaw-mcp/ # MCP server binary
|
||||
│
|
||||
└── [patch.crates-io] # Points to webclaw-tls for TLS fingerprinting
|
||||
└── crates/
|
||||
├── webclaw-core/ # Pure extraction engine (HTML → markdown/json/text)
|
||||
├── webclaw-fetch/ # HTTP client (wreq/BoringSSL) + crawler + sitemap + batch
|
||||
├── webclaw-llm/ # LLM provider chain (Ollama → OpenAI → Anthropic)
|
||||
├── webclaw-pdf/ # PDF text extraction
|
||||
├── webclaw-cli/ # CLI binary
|
||||
└── webclaw-mcp/ # MCP server binary
|
||||
```
|
||||
|
||||
TLS fingerprinting lives in a separate repo: [webclaw-tls](https://github.com/0xMassi/webclaw-tls). The `[patch.crates-io]` section in `Cargo.toml` overrides rustls, h2, hyper, hyper-util, and reqwest with our patched forks for browser-grade JA4 + HTTP/2 Akamai fingerprinting.
|
||||
TLS fingerprinting is handled in-process by [wreq](https://crates.io/crates/wreq) (BoringSSL), so `webclaw-fetch` impersonates real browser TLS directly. There are no `[patch.crates-io]` forks or external TLS dependencies.
|
||||
|
||||
## Crate Boundaries
|
||||
|
||||
|
|
@ -111,7 +109,7 @@ Changes that cross crate boundaries need extra care:
|
|||
| Crate | Network? | Key constraint |
|
||||
|-------|----------|----------------|
|
||||
| webclaw-core | No | Zero network deps, WASM-safe |
|
||||
| webclaw-fetch | Yes (webclaw-http) | Uses [webclaw-tls](https://github.com/0xMassi/webclaw-tls) for TLS fingerprinting |
|
||||
| webclaw-fetch | Yes (wreq) | Browser TLS impersonation via wreq (BoringSSL); no patched deps |
|
||||
| webclaw-llm | Yes (reqwest) | Plain reqwest — LLM APIs don't need TLS fingerprinting |
|
||||
| webclaw-pdf | No | Minimal, wraps pdf-extract |
|
||||
| webclaw-cli | Yes | Depends on all above |
|
||||
|
|
|
|||
221
Cargo.lock
generated
221
Cargo.lock
generated
|
|
@ -28,18 +28,6 @@ dependencies = [
|
|||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.8.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.4"
|
||||
|
|
@ -64,6 +52,12 @@ dependencies = [
|
|||
"alloc-no-stdlib",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "allocator-api2"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
|
|
@ -272,9 +266,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.11.0"
|
||||
version = "2.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
|
||||
checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8"
|
||||
|
||||
[[package]]
|
||||
name = "block-buffer"
|
||||
|
|
@ -285,31 +279,6 @@ dependencies = [
|
|||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "boring-sys2"
|
||||
version = "5.0.0-alpha.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "455d79965f5155dcc88a7abce112c3590883889131b799beda10bf9a813ed669"
|
||||
dependencies = [
|
||||
"bindgen",
|
||||
"cmake",
|
||||
"fs_extra",
|
||||
"fslock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "boring2"
|
||||
version = "5.0.0-alpha.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "183ccc3854411c035410dcdbffafca62084f3a6c33f013c77e83c025d2a08a28"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"boring-sys2",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"openssl-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "brotli"
|
||||
version = "8.0.2"
|
||||
|
|
@ -331,6 +300,31 @@ dependencies = [
|
|||
"alloc-stdlib",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "btls"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2c5e60b8c8d282c86360cab651ded04ab0335a7b5390c8d34145cbeab8cacf5f"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"btls-sys",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"openssl-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "btls-sys"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b1b8638a2e1c38a5ae4efa90ae57e643baec35a30d03fc5b399b893adc4954b"
|
||||
dependencies = [
|
||||
"bindgen",
|
||||
"cmake",
|
||||
"fs_extra",
|
||||
"fslock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.20.2"
|
||||
|
|
@ -865,6 +859,12 @@ version = "0.1.5"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||
|
||||
[[package]]
|
||||
name = "foldhash"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types"
|
||||
version = "0.5.0"
|
||||
|
|
@ -1089,19 +1089,13 @@ version = "0.3.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.13.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.15.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
|
||||
dependencies = [
|
||||
"foldhash",
|
||||
"foldhash 0.1.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1110,6 +1104,17 @@ version = "0.16.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.17.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
|
||||
dependencies = [
|
||||
"allocator-api2",
|
||||
"equivalent",
|
||||
"foldhash 0.2.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
|
|
@ -1172,9 +1177,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "http2"
|
||||
version = "0.5.15"
|
||||
version = "0.5.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c45c6490693ee8a8d0d95fdbdf76fead9fb87548f7894137259a7c6d22821948"
|
||||
checksum = "569ef7a780e853c4e1768f58a3c8168193b82cdcbab66638a0b1c6583ec5995e"
|
||||
dependencies = [
|
||||
"atomic-waker",
|
||||
"bytes",
|
||||
|
|
@ -1183,7 +1188,6 @@ dependencies = [
|
|||
"futures-sink",
|
||||
"http",
|
||||
"indexmap",
|
||||
"parking_lot",
|
||||
"slab",
|
||||
"smallvec",
|
||||
"tokio",
|
||||
|
|
@ -1495,9 +1499,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
|
|||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.183"
|
||||
version = "0.2.186"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d"
|
||||
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
|
||||
|
||||
[[package]]
|
||||
name = "libloading"
|
||||
|
|
@ -1563,6 +1567,15 @@ dependencies = [
|
|||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.18.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a860605968fce16869fd239cf4237a82f3ac470723415db603b0e8b6c8d4fb9"
|
||||
dependencies = [
|
||||
"hashbrown 0.17.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru-slab"
|
||||
version = "0.1.2"
|
||||
|
|
@ -2375,17 +2388,6 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schnellru"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "356285bbf17bea63d9e52e96bd18f039672ac92b55b8cb997d6162a2a37d1649"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"cfg-if",
|
||||
"hashbrown 0.13.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.2.0"
|
||||
|
|
@ -2779,9 +2781,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
|||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.50.0"
|
||||
version = "1.52.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d"
|
||||
checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"libc",
|
||||
|
|
@ -2795,20 +2797,20 @@ dependencies = [
|
|||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-boring2"
|
||||
version = "5.0.0-alpha.13"
|
||||
name = "tokio-btls"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0f81df1210d791f31d72d840de8fbd80b9c3cb324956523048b1413e2bd55756"
|
||||
checksum = "2e1fd638ec35427faf3b8f412e0fdd6fae76591d79dba40f38fa667d22bc44dd"
|
||||
dependencies = [
|
||||
"boring2",
|
||||
"btls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-macros"
|
||||
version = "2.6.1"
|
||||
version = "2.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c"
|
||||
checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
|
@ -3219,7 +3221,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.6.3"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -3240,7 +3242,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.6.3"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -3258,11 +3260,12 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.6.3"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"calamine",
|
||||
"futures-util",
|
||||
"http",
|
||||
"quick-xml 0.37.5",
|
||||
"rand 0.8.5",
|
||||
|
|
@ -3284,7 +3287,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.6.3"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -3297,7 +3300,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.6.3"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
|
|
@ -3317,7 +3320,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.6.3"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
@ -3326,7 +3329,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-server"
|
||||
version = "0.6.3"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
|
|
@ -3347,9 +3350,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webpki-root-certs"
|
||||
version = "1.0.6"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
|
||||
checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
|
||||
dependencies = [
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
|
@ -3696,17 +3699,14 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "wreq"
|
||||
version = "6.0.0-rc.28"
|
||||
version = "6.0.0-rc.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f79937f6c4df65b3f6f78715b9de2977afe9ee3b3436483c7949a24511e25935"
|
||||
checksum = "3f0eba5f5814a94e5f1a99156f187133464e525b66bdbc69a9627d46530af2e1"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"boring2",
|
||||
"brotli",
|
||||
"btls",
|
||||
"btls-sys",
|
||||
"bytes",
|
||||
"cookie",
|
||||
"flate2",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"http",
|
||||
"http-body",
|
||||
|
|
@ -3715,29 +3715,64 @@ dependencies = [
|
|||
"httparse",
|
||||
"ipnet",
|
||||
"libc",
|
||||
"lru",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"schnellru",
|
||||
"smallvec",
|
||||
"socket2",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-boring2",
|
||||
"tokio-btls",
|
||||
"tokio-util",
|
||||
"tower",
|
||||
"tower-http",
|
||||
"url",
|
||||
"want",
|
||||
"webpki-root-certs",
|
||||
"zstd",
|
||||
"wreq-proto",
|
||||
"wreq-rt",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wreq-proto"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a43942f024bb303f1042c9aa3c87fa1d9149f507c65db6e5220a11ccdb207387"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"http",
|
||||
"http-body",
|
||||
"http2",
|
||||
"httparse",
|
||||
"pin-project-lite",
|
||||
"smallvec",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"want",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wreq-rt"
|
||||
version = "0.2.2-rc.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99e9bce67a3fa3dd3f1503f066d86661c9caf399a763d3bd184da7afaf886c8b"
|
||||
dependencies = [
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"wreq-proto",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wreq-util"
|
||||
version = "3.0.0-rc.10"
|
||||
version = "3.0.0-rc.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c6bbe24d28beb9ceb58b514bd6a613c759d3b706f768b9d2950d5d35b543c04"
|
||||
checksum = "baa5d2ab72139256916ca352a3d05c53d74e1dd360052eb5ba7691033c417c65"
|
||||
dependencies = [
|
||||
"brotli",
|
||||
"flate2",
|
||||
"typed-builder",
|
||||
"wreq",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.6.3"
|
||||
version = "0.6.7"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -59,9 +59,9 @@ RUN touch crates/*/src/*.rs \
|
|||
# ---------------------------------------------------------------------------
|
||||
FROM ubuntu:24.04
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
# CA bundle from distroless (ships it, multi-arch, gcr.io) instead of
|
||||
# apt-installing from ports.ubuntu.com (unreachable for arm64 on CI runners).
|
||||
COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
||||
|
||||
# Copy all three binaries
|
||||
COPY --from=builder /build/target/release/webclaw /usr/local/bin/webclaw
|
||||
|
|
|
|||
|
|
@ -5,9 +5,10 @@ ARG BINARY_DIR=binaries
|
|||
|
||||
FROM ubuntu:24.04
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
# CA bundle copied from a reliable multi-arch image instead of apt-installing
|
||||
# from ports.ubuntu.com — Canonical's arm64 ports mirror is unreachable from
|
||||
# CI runners and breaks the multi-arch release build. No build-time network.
|
||||
COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
||||
|
||||
ARG BINARY_DIR
|
||||
COPY ${BINARY_DIR}/webclaw /usr/local/bin/webclaw
|
||||
|
|
|
|||
37
README.md
37
README.md
|
|
@ -385,7 +385,29 @@ Please remove secrets, cookies, private tokens, and customer data from logs befo
|
|||
|
||||
---
|
||||
|
||||
## Studio Partner
|
||||
## Infrastructure Partner
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td align="center">
|
||||
<a href="https://coldproxy.com/">
|
||||
<img src="./assets/sponsors/coldproxy-banner.png" alt="ColdProxy" width="720" />
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<strong>ColdProxy</strong> supports webclaw as an Infrastructure Partner, providing residential IPv4,
|
||||
residential IPv6, and datacenter IPv6 proxy infrastructure across 195+ countries for public data
|
||||
collection, regional testing, monitoring, and web scraping workflows. Explore
|
||||
<a href="https://coldproxy.com/">ColdProxy</a>'s latest plans and available offers directly on the website.
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
---
|
||||
|
||||
## Studio Partners
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
|
|
@ -413,6 +435,19 @@ Please remove secrets, cookies, private tokens, and customer data from logs befo
|
|||
<a href="https://proxy-seller.com/?partner=KXMQNNLIGHXR4B">proxy-seller.com</a>.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td width="340" align="center">
|
||||
<a href="https://www.rapidproxy.io/?ref=webclaw">
|
||||
<img src="./assets/sponsors/rapidproxy-banner.png" alt="RapidProxy" width="300" />
|
||||
</a>
|
||||
</td>
|
||||
<td>
|
||||
<strong>RapidProxy</strong> delivers fast, reliable proxy infrastructure for large-scale data collection.
|
||||
With 90M+ residential IPs, smart rotation, high concurrency, AI-powered CAPTCHA bypass, and non-expiring traffic, it helps keep scraping workflows stable at scale.
|
||||
Use code <code>webclaw</code> for 10% off, or
|
||||
<a href="https://www.rapidproxy.io/?ref=webclaw">Try it free</a>.
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
---
|
||||
|
|
|
|||
BIN
assets/sponsors/coldproxy-banner.png
Normal file
BIN
assets/sponsors/coldproxy-banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.3 MiB |
BIN
assets/sponsors/coldproxy-logo.png
Normal file
BIN
assets/sponsors/coldproxy-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 757 KiB |
BIN
assets/sponsors/rapidproxy-banner.png
Normal file
BIN
assets/sponsors/rapidproxy-banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 413 KiB |
|
|
@ -166,6 +166,14 @@ struct Cli {
|
|||
#[arg(long)]
|
||||
urls_file: Option<String>,
|
||||
|
||||
/// Assert that the URL has been handled for shell escaping. Suppresses
|
||||
/// the URL-truncation stderr warning. Use when the URL is intentionally
|
||||
/// passed with an empty/keyless query (e.g. legacy CGI) or when a
|
||||
/// trailing `&` is genuinely part of the URL. The URL is fetched as-is
|
||||
/// (no extra normalization beyond the standard scheme prepend).
|
||||
#[arg(long)]
|
||||
url_encoded: bool,
|
||||
|
||||
/// Output format (markdown, json, text, llm, html)
|
||||
#[arg(short, long, default_value = "markdown")]
|
||||
format: OutputFormat,
|
||||
|
|
@ -591,6 +599,31 @@ fn normalize_url(url: &str) -> String {
|
|||
}
|
||||
}
|
||||
|
||||
/// M14: detect URLs that look truncated by the shell (e.g. an unquoted URL
/// that the shell split on `&` or `?`).
///
/// Returns `true` when either of these holds for the whitespace-trimmed URL:
/// - it ends with `&` (a trailing param separator suggests the next param
///   was lopped off), OR
/// - it has a query component (`?...`, up to the fragment) that contains no
///   `=` (a query with only bare keys is rare; a real query usually has at
///   least one `=`).
///
/// The fragment is removed *before* the query is located, because everything
/// after the first `#` belongs to the fragment: a `?` that appears only
/// inside the fragment (`/page#section?x`, hash-routed SPAs) is not a query
/// and must not trigger the heuristic.
///
/// Informational only — the caller decides whether to warn or abort. This is
/// a heuristic; legitimate URLs with bare-key queries will produce a false
/// positive (suppressible via `--url-encoded`).
fn looks_truncated(url: &str) -> bool {
    let trimmed = url.trim();

    // A dangling separator is a truncation signal wherever it sits.
    if trimmed.ends_with('&') {
        return true;
    }

    // Strip the fragment first so a `?` inside it is never read as the start
    // of a query, and so `?foo#section` is still checked as the query `foo`.
    let without_fragment = trimmed.split_once('#').map_or(trimmed, |(head, _)| head);

    match without_fragment.split_once('?') {
        // Query present: suspicious only when it carries no `key=value` pair.
        Some((_, query)) => !query.contains('='),
        // No query component at all: nothing to judge.
        None => false,
    }
}
|
||||
|
||||
/// Derive a filename from a URL for `--output-dir`.
|
||||
///
|
||||
/// Strips the scheme/host, maps the path to a filesystem path, and appends
|
||||
|
|
@ -826,6 +859,14 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
.urls
|
||||
.first()
|
||||
.ok_or("no input provided -- pass a URL, --file, or --stdin")?;
|
||||
// M14: warn when the URL looks like the shell split it on `&` or `?`.
|
||||
// Informational only — fetch still proceeds. Suppressed by --url-encoded,
|
||||
// which asserts the caller has handled escaping intentionally.
|
||||
if !cli.url_encoded && looks_truncated(raw_url) {
|
||||
eprintln!(
|
||||
"# webclaw: warning: URL looks truncated (ends with '&' or '?'); did the shell split it? Quote the URL or use --url-encoded."
|
||||
);
|
||||
}
|
||||
let url = normalize_url(raw_url);
|
||||
let url = url.as_str();
|
||||
|
||||
|
|
@ -859,8 +900,11 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
let client =
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
||||
let options = build_extraction_options(cli);
|
||||
let result = client
|
||||
.fetch_and_extract_with_options(url, &options)
|
||||
// M13: wrap with periodic stderr progress emitter. Fast fetches see
|
||||
// zero emissions (timer never fires in <10s); slow fetches get a
|
||||
// line every 10s of elapsed time so the CLI doesn't appear hung.
|
||||
let fetch_fut = client.fetch_and_extract_with_options(url, &options);
|
||||
let result = webclaw_fetch::with_progress(url, fetch_fut)
|
||||
.await
|
||||
.map_err(|e| format!("fetch error: {e}"))?;
|
||||
|
||||
|
|
@ -2879,6 +2923,61 @@ mod tests {
|
|||
let _ = std::fs::remove_dir_all(&dir);
|
||||
}
|
||||
|
||||
// M14: URL truncation heuristic tests.

/// A URL that ends in `&` must be reported as truncated.
#[test]
fn looks_truncated_fires_on_trailing_ampersand() {
    // Most common shell-split shape: the separator survived but the
    // parameter that followed it (e.g. `b=2`) did not.
    let dangling_separator = [
        "https://example.com/?a=1&",
        "https://example.com/path?key=val&",
    ];

    for url in dangling_separator {
        assert!(looks_truncated(url));
    }
}
|
||||
|
||||
/// A query component without any `=` must be reported as truncated.
#[test]
fn looks_truncated_fires_on_query_with_no_equals() {
    let keyless_queries = [
        // `?foo` with no `=`: looks like the `=value` part went missing.
        "https://example.com/?foo",
        // Bare `?` (empty query): looks like the whole pair went missing.
        "https://example.com/?",
        // A trailing fragment must not hide the keyless query before it.
        "https://example.com/?foo#section",
    ];

    for url in keyless_queries {
        assert!(looks_truncated(url));
    }
}
|
||||
|
||||
/// Ordinary URLs must not be reported as truncated.
#[test]
fn looks_truncated_silent_on_clean_url() {
    let clean = [
        // No query at all.
        "https://example.com/",
        "https://example.com/path/to/page",
        // Queries carrying at least one `=`.
        "https://example.com/?a=1",
        "https://example.com/?a=1&b=2",
        "https://example.com/?a=1&b=2&c=hello%20world",
        // Hash anchor without a query.
        "https://example.com/page#section",
    ];

    for url in clean {
        assert!(!looks_truncated(url));
    }
}
|
||||
|
||||
/// Models the `--url-encoded` gate that guards the truncation warning.
///
/// The flag is applied at the call site (`!cli.url_encoded && looks_truncated(..)`),
/// not inside `looks_truncated`, so this test re-creates that boolean
/// expression and checks both positions of the flag against a URL the
/// heuristic flags.
#[test]
fn looks_truncated_silent_with_url_encoded_assertion_modeled_via_skip() {
    let url = "https://example.com/?a=1&";

    // Same shape as the call-site condition, parameterized on the flag.
    let should_warn = |url_encoded_flag: bool| !url_encoded_flag && looks_truncated(url);

    // Flag set: the warning branch is never entered.
    assert!(
        !should_warn(true),
        "--url-encoded must suppress the warning even on URL ending with &"
    );

    // Sanity: flag unset, the same URL does warn.
    assert!(
        should_warn(false),
        "without --url-encoded, the warning should fire on URL ending with &"
    );
}
|
||||
|
||||
#[test]
|
||||
fn research_slug_truncation_is_char_safe() {
|
||||
// Multibyte query: byte-slicing at 50 would panic mid-codepoint.
|
||||
|
|
|
|||
|
|
@ -4,6 +4,10 @@ description = "Pure HTML content extraction engine for LLMs"
|
|||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
# Reddit regression fixtures are real old.reddit.com pages read at test time;
|
||||
# they're large and only needed to run the test suite from the repo, so keep
|
||||
# them out of the published crate.
|
||||
exclude = ["testdata/reddit/*.html"]
|
||||
|
||||
[features]
|
||||
default = ["quickjs"]
|
||||
|
|
|
|||
515
crates/webclaw-core/src/endpoints.rs
Normal file
515
crates/webclaw-core/src/endpoints.rs
Normal file
|
|
@ -0,0 +1,515 @@
|
|||
//! API/endpoint surface discovery from HTML + JS bundle text.
|
||||
//!
|
||||
//! Pure and zero-network: callers fetch the page and its `<script src>`
|
||||
//! bundles, then hand the raw text here. We surface API paths, absolute
|
||||
//! API URLs, GraphQL and WebSocket endpoints that live in inline scripts
|
||||
//! and bundles — the surface a sitemap/`map` can never see.
|
||||
//!
|
||||
//! Heuristic by design: regex over string literals, not JS dataflow.
|
||||
//! High-signal patterns only; bounded for DoS safety.
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use scraper::{Html, Selector};
|
||||
use std::collections::BTreeSet;
|
||||
use url::Url;
|
||||
|
||||
/// Total bytes of script text (inline + bundles) scanned per
/// `extract_endpoints` call: 8 MiB. Bounds CPU on hostile/huge bundle sets.
const MAX_SCAN_BYTES: usize = 8 << 20;
/// Maximum number of endpoints collected before the report is marked
/// truncated. Bounds memory.
const MAX_ENDPOINTS: usize = 2_000;
/// Cap on `<script src>` URLs returned for the caller to fetch.
const MAX_SCRIPT_SRCS: usize = 40;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum EndpointKind {
|
||||
RelativePath,
|
||||
AbsoluteUrl,
|
||||
GraphQl,
|
||||
WebSocket,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize)]
|
||||
pub struct DiscoveredEndpoint {
|
||||
pub value: String,
|
||||
pub kind: EndpointKind,
|
||||
pub first_party: bool,
|
||||
/// `"inline"` or the bundle URL the match came from.
|
||||
pub source: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, serde::Serialize)]
|
||||
pub struct EndpointReport {
|
||||
pub endpoints: Vec<DiscoveredEndpoint>,
|
||||
/// Distinct hosts seen across absolute URLs (first- and third-party).
|
||||
pub hosts: Vec<String>,
|
||||
pub bundles_scanned: usize,
|
||||
/// True if a cap was hit and results may be incomplete.
|
||||
pub truncated: bool,
|
||||
}
|
||||
|
||||
// Quoted relative path that looks API-ish. Bounded quantifiers; the `regex`
|
||||
// crate is linear-time (RE2) so this cannot catastrophically backtrack.
|
||||
static RE_REL_PATH: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(
|
||||
r#"["'`](/[A-Za-z0-9_\-./]{0,200}?(?:api|graphql|gql|/v[0-9]|/rest|/gateway|/internal|/discovery)[A-Za-z0-9_\-./]{0,200})["'`]"#,
|
||||
)
|
||||
.expect("RE_REL_PATH")
|
||||
});
|
||||
|
||||
static RE_ABS_URL: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(r#"https?://[A-Za-z0-9.\-]{1,253}(?:/[A-Za-z0-9_\-./%]{0,400})?"#)
|
||||
.expect("RE_ABS_URL")
|
||||
});
|
||||
|
||||
static RE_WS: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(r#"wss?://[A-Za-z0-9.\-]{1,253}(?:/[A-Za-z0-9_\-./%]{0,256})?"#).expect("RE_WS")
|
||||
});
|
||||
|
||||
static SCRIPT_SEL: Lazy<Selector> = Lazy::new(|| Selector::parse("script").expect("script sel"));
|
||||
|
||||
/// Common multi-label public suffixes so `ticketmaster.co.uk` resolves to
/// `ticketmaster.co.uk` (not `co.uk`). Not a full PSL — pragmatic v1.
const SUFFIX2: &[&str] = &[
    "co.uk", "org.uk", "gov.uk", "ac.uk", "me.uk", "com.au", "net.au", "org.au", "co.jp", "co.nz",
    "co.za", "com.br", "com.mx", "com.sg", "co.in", "co.kr", "com.tr", "com.cn",
];

/// Reduce `host` to its registrable domain.
///
/// The host is lowercased and trailing dots are dropped. The result is the
/// last two labels, or the last three when the last two form a suffix listed
/// in [`SUFFIX2`] and a third label exists. Single-label hosts are returned
/// as-is (after normalisation).
fn registrable_domain(host: &str) -> String {
    let normalized = host.trim_end_matches('.').to_ascii_lowercase();

    // Walk the labels right-to-left: at most the last three are needed.
    let mut tail = normalized.rsplit('.');
    let (Some(tld), Some(second)) = (tail.next(), tail.next()) else {
        return normalized;
    };

    let pair = format!("{second}.{tld}");
    match tail.next() {
        Some(third) if SUFFIX2.contains(&pair.as_str()) => format!("{third}.{pair}"),
        _ => pair,
    }
}
|
||||
|
||||
/// Whether `candidate_host` is `base_reg` itself or one of its subdomains.
///
/// `base_reg` is expected to be an already-normalised registrable domain
/// (lowercase, no trailing dot). The candidate is lowercased and stripped of
/// trailing dots before comparison, so a fully-qualified `api.example.com.`
/// still matches `example.com`.
///
/// An empty `base_reg` (the page URL could not be parsed) never matches:
/// without a known origin nothing can be called first-party.
fn is_first_party(candidate_host: &str, base_reg: &str) -> bool {
    if base_reg.is_empty() {
        return false;
    }
    let host = candidate_host.trim_end_matches('.').to_ascii_lowercase();
    match host.strip_suffix(base_reg) {
        // Exact match.
        Some("") => true,
        // Subdomain: the remainder must end on a label boundary, so
        // `notexample.com` is not treated as part of `example.com`.
        Some(prefix) => prefix.ends_with('.'),
        None => false,
    }
}
|
||||
|
||||
/// Registrable domains that only ever show up as spec/schema/example
/// references (minified JSON-Schema and `schema.org` URLs are everywhere in
/// bundles) and are therefore never reported as API surface.
const NOISE_HOSTS: &[&str] = &[
    // Vocabulary / specification hosts.
    "schema.org",
    "json-schema.org",
    "w3.org",
    // Reserved example domains.
    "example.com",
    "example.org",
    "example.net",
    // Local development.
    "localhost",
];
|
||||
|
||||
/// Whether `host` is worth reporting: at least two non-empty labels, the
/// last of which (the TLD) is two or more ASCII letters.
///
/// Trailing dots are ignored. This rejects minifier garbage such as
/// `http://f` / `http://n`, UUID-ish single labels, and hosts whose last
/// label is not purely alphabetic.
fn is_valid_host(host: &str) -> bool {
    let trimmed = host.trim_end_matches('.');

    let mut label_count = 0usize;
    let mut tld = "";
    for label in trimmed.split('.') {
        if label.is_empty() {
            return false;
        }
        label_count += 1;
        tld = label;
    }

    label_count >= 2 && tld.len() >= 2 && tld.chars().all(|c| c.is_ascii_alphabetic())
}
|
||||
|
||||
/// Whether a relative path is too bare to count as an endpoint.
///
/// After dropping trailing slashes, anything shorter than four bytes (`/`,
/// `/v1`, the empty string) and the bare prefixes `/api` and `/rest` are
/// noise. `/graphql`, `/gql` and `/api/x` are kept.
fn is_noise_path(p: &str) -> bool {
    match p.trim_end_matches('/') {
        "/api" | "/rest" => true,
        trimmed => trimmed.len() < 4,
    }
}
|
||||
|
||||
/// Resolved absolute `<script src>` URLs (http/https only), deduped, capped.
|
||||
/// Inline scripts have no `src` and are scanned via [`extract_endpoints`].
|
||||
pub fn script_srcs(html: &str, base_url: &str) -> Vec<String> {
|
||||
let base = Url::parse(base_url).ok();
|
||||
let doc = Html::parse_document(html);
|
||||
let mut seen = BTreeSet::new();
|
||||
let mut out = Vec::new();
|
||||
for el in doc.select(&SCRIPT_SEL) {
|
||||
if out.len() >= MAX_SCRIPT_SRCS {
|
||||
break;
|
||||
}
|
||||
let Some(src) = el.value().attr("src") else {
|
||||
continue;
|
||||
};
|
||||
let resolved = match Url::parse(src) {
|
||||
Ok(u) => Some(u),
|
||||
Err(_) => base.as_ref().and_then(|b| b.join(src).ok()),
|
||||
};
|
||||
let Some(u) = resolved else {
|
||||
continue;
|
||||
};
|
||||
if (u.scheme() == "http" || u.scheme() == "https") && seen.insert(u.to_string()) {
|
||||
out.push(u.to_string());
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// Extract endpoints from inline HTML scripts plus pre-fetched JS bundles.
|
||||
/// `bundles` is `(bundle_url, bundle_text)`.
|
||||
pub fn extract_endpoints(
|
||||
html: &str,
|
||||
base_url: &str,
|
||||
bundles: &[(String, String)],
|
||||
) -> EndpointReport {
|
||||
let base_reg = Url::parse(base_url)
|
||||
.ok()
|
||||
.and_then(|u| u.host_str().map(registrable_domain))
|
||||
.unwrap_or_default();
|
||||
|
||||
let mut endpoints: Vec<DiscoveredEndpoint> = Vec::new();
|
||||
let mut seen: BTreeSet<(String, String)> = BTreeSet::new();
|
||||
let mut hosts: BTreeSet<String> = BTreeSet::new();
|
||||
let mut budget = MAX_SCAN_BYTES;
|
||||
let mut truncated = false;
|
||||
|
||||
let push = |value: String,
|
||||
kind: EndpointKind,
|
||||
source: &str,
|
||||
endpoints: &mut Vec<DiscoveredEndpoint>,
|
||||
seen: &mut BTreeSet<(String, String)>,
|
||||
hosts: &mut BTreeSet<String>|
|
||||
-> bool {
|
||||
if endpoints.len() >= MAX_ENDPOINTS {
|
||||
return false;
|
||||
}
|
||||
let first_party = match Url::parse(&value) {
|
||||
Ok(u) => {
|
||||
let Some(h) = u.host_str() else {
|
||||
return true;
|
||||
};
|
||||
if !is_valid_host(h) {
|
||||
return true; // minifier garbage host
|
||||
}
|
||||
if NOISE_HOSTS.contains(®istrable_domain(h).as_str()) {
|
||||
return true; // schema.org / json-schema.org / example.*
|
||||
}
|
||||
// Absolute URL with no real path is an origin/site link,
|
||||
// not an API endpoint (drops the page's own URL too).
|
||||
let path = u.path();
|
||||
if path.is_empty() || path == "/" {
|
||||
return true;
|
||||
}
|
||||
hosts.insert(h.to_ascii_lowercase());
|
||||
is_first_party(h, &base_reg)
|
||||
}
|
||||
// Relative path: same origin as the page by definition.
|
||||
Err(_) => {
|
||||
if is_noise_path(&value) {
|
||||
return true; // bare /api, /, ultra-short
|
||||
}
|
||||
true
|
||||
}
|
||||
};
|
||||
if seen.insert((value.clone(), source.to_string())) {
|
||||
endpoints.push(DiscoveredEndpoint {
|
||||
value,
|
||||
kind,
|
||||
first_party,
|
||||
source: source.to_string(),
|
||||
});
|
||||
}
|
||||
true
|
||||
};
|
||||
|
||||
let scan = |text: &str,
|
||||
source: &str,
|
||||
endpoints: &mut Vec<DiscoveredEndpoint>,
|
||||
seen: &mut BTreeSet<(String, String)>,
|
||||
hosts: &mut BTreeSet<String>,
|
||||
budget: &mut usize,
|
||||
truncated: &mut bool| {
|
||||
if *budget == 0 {
|
||||
return;
|
||||
}
|
||||
let slice = if text.len() > *budget {
|
||||
*truncated = true;
|
||||
&text[..*budget]
|
||||
} else {
|
||||
text
|
||||
};
|
||||
*budget -= slice.len();
|
||||
|
||||
for c in RE_REL_PATH.captures_iter(slice) {
|
||||
if let Some(m) = c.get(1) {
|
||||
let v = m.as_str().to_string();
|
||||
let kind = if v.contains("graphql") || v.contains("/gql") {
|
||||
EndpointKind::GraphQl
|
||||
} else {
|
||||
EndpointKind::RelativePath
|
||||
};
|
||||
if !push(v, kind, source, endpoints, seen, hosts) {
|
||||
*truncated = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
for m in RE_WS.find_iter(slice) {
|
||||
if !push(
|
||||
m.as_str().to_string(),
|
||||
EndpointKind::WebSocket,
|
||||
source,
|
||||
endpoints,
|
||||
seen,
|
||||
hosts,
|
||||
) {
|
||||
*truncated = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
for m in RE_ABS_URL.find_iter(slice) {
|
||||
let v = m.as_str().to_string();
|
||||
// Skip obvious static assets — we want API surface, not CDN files.
|
||||
let lower = v.to_ascii_lowercase();
|
||||
if lower.ends_with(".js")
|
||||
|| lower.ends_with(".css")
|
||||
|| lower.ends_with(".png")
|
||||
|| lower.ends_with(".jpg")
|
||||
|| lower.ends_with(".svg")
|
||||
|| lower.ends_with(".woff2")
|
||||
{
|
||||
// still record the host for visibility
|
||||
if let Some(h) = Url::parse(&v)
|
||||
.ok()
|
||||
.and_then(|u| u.host_str().map(str::to_string))
|
||||
{
|
||||
hosts.insert(h.to_ascii_lowercase());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
let kind = if lower.contains("graphql") || lower.contains("/gql") {
|
||||
EndpointKind::GraphQl
|
||||
} else {
|
||||
EndpointKind::AbsoluteUrl
|
||||
};
|
||||
if !push(v, kind, source, endpoints, seen, hosts) {
|
||||
*truncated = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Inline scripts.
|
||||
let doc = Html::parse_document(html);
|
||||
let mut inline = String::new();
|
||||
for el in doc.select(&SCRIPT_SEL) {
|
||||
if el.value().attr("src").is_none() {
|
||||
inline.push_str(&el.text().collect::<String>());
|
||||
inline.push('\n');
|
||||
}
|
||||
}
|
||||
scan(
|
||||
&inline,
|
||||
"inline",
|
||||
&mut endpoints,
|
||||
&mut seen,
|
||||
&mut hosts,
|
||||
&mut budget,
|
||||
&mut truncated,
|
||||
);
|
||||
|
||||
// Bundles.
|
||||
let mut bundles_scanned = 0usize;
|
||||
for (src, text) in bundles {
|
||||
if budget == 0 {
|
||||
truncated = true;
|
||||
break;
|
||||
}
|
||||
bundles_scanned += 1;
|
||||
scan(
|
||||
text,
|
||||
src,
|
||||
&mut endpoints,
|
||||
&mut seen,
|
||||
&mut hosts,
|
||||
&mut budget,
|
||||
&mut truncated,
|
||||
);
|
||||
}
|
||||
|
||||
endpoints.sort_by(|a, b| (a.kind, &a.value, &a.source).cmp(&(b.kind, &b.value, &b.source)));
|
||||
|
||||
EndpointReport {
|
||||
endpoints,
|
||||
hosts: hosts.into_iter().collect(),
|
||||
bundles_scanned,
|
||||
truncated,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Page URL used as the first-party origin throughout these tests.
    const PAGE: &str = "https://www.ticketmaster.co.uk/";

    /// Run extraction over an empty page with a single bundle.
    fn report_for_bundle(name: &str, text: &str) -> EndpointReport {
        let bundles = vec![(name.to_string(), text.to_string())];
        extract_endpoints("<html></html>", PAGE, &bundles)
    }

    #[test]
    fn registrable_domain_handles_cc_tlds() {
        let cases = [
            ("www.ticketmaster.co.uk", "ticketmaster.co.uk"),
            ("api.ticketmaster.com", "ticketmaster.com"),
            ("pubapi.ticketmaster.co.uk", "ticketmaster.co.uk"),
            ("localhost", "localhost"),
        ];
        for (host, expected) in cases {
            assert_eq!(registrable_domain(host), expected);
        }
    }

    #[test]
    fn script_srcs_resolves_and_filters() {
        let html = r#"<html><head>
<script src="/_next/static/chunks/main-abc.js"></script>
<script src="https://cdn.example.net/lib.js"></script>
<script>var inline = 1;</script>
<script src="data:text/javascript,1"></script>
</head></html>"#;
        let srcs = script_srcs(html, PAGE);

        let relative = "https://www.ticketmaster.co.uk/_next/static/chunks/main-abc.js".to_string();
        let absolute = "https://cdn.example.net/lib.js".to_string();
        assert!(srcs.contains(&relative));
        assert!(srcs.contains(&absolute));
        assert_eq!(srcs.len(), 2, "inline + data: ignored");
    }

    #[test]
    fn extracts_inline_and_bundle_endpoints_with_classification() {
        let html = r#"<html><body>
<script>
var cfg = { search: "/api/search/events", suggest: "/api/search/search-suggest" };
fetch("/api/venue/info");
</script>
<script src="/app.js"></script>
</body></html>"#;
        let bundle_text = r#"
const GQL = "https://pubapi.ticketmaster.co.uk/graphql";
axios.post("https://services.ticketmaster.co.uk/discovery/v2/events");
new WebSocket("wss://live.ticketmaster.co.uk/socket");
const ga = "https://www.googletagservices.com/tag/js/gpt.js";
const img = "https://cdn.tmol.co/hero.png";
"#;
        let bundles = vec![(
            "https://www.ticketmaster.co.uk/app.js".to_string(),
            bundle_text.to_string(),
        )];

        let r = extract_endpoints(html, PAGE, &bundles);
        let vals: Vec<&str> = r.endpoints.iter().map(|e| e.value.as_str()).collect();

        for expected in [
            "/api/search/events",
            "/api/search/search-suggest",
            "/api/venue/info",
            "https://pubapi.ticketmaster.co.uk/graphql",
            "https://services.ticketmaster.co.uk/discovery/v2/events",
            "wss://live.ticketmaster.co.uk/socket",
        ] {
            assert!(vals.contains(&expected));
        }

        // static .js asset is not an endpoint, but its host is recorded
        assert!(!vals.contains(&"https://www.googletagservices.com/tag/js/gpt.js"));
        assert!(r.hosts.iter().any(|h| h == "www.googletagservices.com"));

        let gql = r
            .endpoints
            .iter()
            .find(|e| e.value.contains("graphql"))
            .unwrap();
        assert_eq!(gql.kind, EndpointKind::GraphQl);
        assert!(
            gql.first_party,
            "pubapi.ticketmaster.co.uk is first-party to .co.uk"
        );

        let venue = r
            .endpoints
            .iter()
            .find(|e| e.value.starts_with("/api/venue"));
        assert!(venue.unwrap().first_party, "relative path is same-origin");
        assert_eq!(r.bundles_scanned, 1);
    }

    #[test]
    fn third_party_absolute_is_flagged_not_first_party() {
        let r = report_for_bundle("b", r#"x="https://api.stripe.com/v1/charges""#);
        let stripe = r
            .endpoints
            .iter()
            .find(|e| e.value.contains("stripe"))
            .unwrap();
        assert!(!stripe.first_party);
    }

    #[test]
    fn caps_bound_pathological_input() {
        // A huge blob of fake endpoints must not exceed MAX_ENDPOINTS and
        // must return promptly (regex crate is linear-time).
        let big: String = (0..50_000)
            .map(|i| format!("\"/api/v1/item/{i}\" "))
            .collect();
        let bundles = vec![("big".to_string(), big)];
        let r = extract_endpoints("<html></html>", "https://x.com/", &bundles);
        assert!(r.endpoints.len() <= MAX_ENDPOINTS);
        assert!(r.truncated);
    }

    #[test]
    fn empty_inputs_are_safe() {
        let r = extract_endpoints("", "not a url", &[]);
        assert!(r.endpoints.is_empty());
        assert_eq!(r.bundles_scanned, 0);
        assert!(!r.truncated);
    }

    #[test]
    fn v1_1_noise_is_filtered() {
        let r = report_for_bundle(
            "b.js",
            r#"
"/api/search/events";
"/api"; "/api/";
"http://f"; "http://n/x";
"https://schema.org/Thing";
"http://json-schema.org/draft-07/schema";
"https://www.ticketmaster.co.uk/";
"https://pubapi.ticketmaster.co.uk/discovery/v2/events";
"wss://live.ticketmaster.co.uk/socket";
"#,
        );
        let vals: std::collections::BTreeSet<&str> =
            r.endpoints.iter().map(|e| e.value.as_str()).collect();

        for kept in [
            "/api/search/events",
            "https://pubapi.ticketmaster.co.uk/discovery/v2/events",
            "wss://live.ticketmaster.co.uk/socket",
        ] {
            assert!(vals.contains(kept));
        }
        for junk in [
            "/api",
            "/api/",
            "http://f",
            "http://n/x",
            "https://schema.org/Thing",
            "http://json-schema.org/draft-07/schema",
            "https://www.ticketmaster.co.uk/",
        ] {
            assert!(!vals.contains(junk), "noise leaked: {junk}");
        }

        assert!(
            !r.hosts
                .iter()
                .any(|h| h == "f" || h == "n" || h == "schema.org")
        );
        assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk"));
    }
}
|
||||
|
|
@ -7,6 +7,7 @@ pub(crate) mod data_island;
|
|||
/// Zero network dependencies — WASM-compatible by design.
|
||||
pub mod diff;
|
||||
pub mod domain;
|
||||
pub mod endpoints;
|
||||
pub mod error;
|
||||
pub mod extractor;
|
||||
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||
|
|
@ -16,6 +17,7 @@ pub mod markdown;
|
|||
pub mod metadata;
|
||||
#[allow(dead_code)]
|
||||
pub(crate) mod noise;
|
||||
pub mod reddit;
|
||||
pub mod structured_data;
|
||||
pub mod types;
|
||||
pub mod youtube;
|
||||
|
|
@ -93,6 +95,24 @@ fn extract_with_options_inner(
|
|||
return Err(ExtractError::NoContent);
|
||||
}
|
||||
|
||||
// Reddit fast path: parse old.reddit.com HTML directly.
|
||||
// The fetch layer rewrites all Reddit hosts to old.reddit.com before
|
||||
// calling extract, so we always get stable server-rendered HTML here.
|
||||
if let Some(u) = url
|
||||
&& reddit::is_reddit_url(u)
|
||||
{
|
||||
if let Some(result) = reddit::try_extract(html, u) {
|
||||
return Ok(result);
|
||||
}
|
||||
// A recognised comment thread that we couldn't parse (Reddit markup
|
||||
// change, or a block/challenge page) — don't fall through to generic
|
||||
// extraction, which would emit Reddit nav/sidebar chrome. Listings
|
||||
// and profiles (no `/comments/`) intentionally fall through below.
|
||||
if u.contains("/comments/") {
|
||||
return Err(ExtractError::NoContent);
|
||||
}
|
||||
}
|
||||
|
||||
// YouTube fast path: if the URL is a YouTube video page, try extracting
|
||||
// structured metadata from ytInitialPlayerResponse before DOM scoring.
|
||||
// This gives LLMs a clean, structured view of video metadata.
|
||||
|
|
|
|||
968
crates/webclaw-core/src/reddit.rs
Normal file
968
crates/webclaw-core/src/reddit.rs
Normal file
|
|
@ -0,0 +1,968 @@
|
|||
//! Reddit thread extractor — parses old.reddit.com HTML directly.
|
||||
//!
|
||||
//! old.reddit.com serves fully server-rendered HTML with stable class names
|
||||
//! and data attributes. No JS, no API key, no `.json` trick needed.
|
||||
|
||||
use scraper::{ElementRef, Html, Selector};
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::{Content, DomainData, DomainType, ExtractionResult, Metadata};
|
||||
|
||||
// ─── Public types ──────────────────────────────────────────────────────────────
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct RedditPost {
|
||||
pub id: Option<String>,
|
||||
pub title: String,
|
||||
pub author: String,
|
||||
pub subreddit: Option<String>,
|
||||
pub score: i64,
|
||||
pub body: Option<String>,
|
||||
pub num_comments: usize,
|
||||
pub permalink: String,
|
||||
pub url: Option<String>,
|
||||
pub is_self: bool,
|
||||
pub flair: Option<String>,
|
||||
pub created_utc: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct RedditComment {
|
||||
pub id: Option<String>,
|
||||
pub author: String,
|
||||
pub body: String,
|
||||
/// `None` when Reddit hides the score (fresh comments). Distinct from
|
||||
/// `Some(0)`, which is a real net-zero score.
|
||||
pub score: Option<i64>,
|
||||
pub depth: usize,
|
||||
pub is_op: bool,
|
||||
pub created_utc: Option<String>,
|
||||
pub replies: Vec<RedditComment>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct RedditThread {
|
||||
#[serde(rename = "url")]
|
||||
pub source_url: String,
|
||||
pub post: Option<RedditPost>,
|
||||
pub comments: Vec<RedditComment>,
|
||||
}
|
||||
|
||||
// ─── Public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
pub fn is_reddit_url(url: &str) -> bool {
|
||||
matches!(
|
||||
host_of(url),
|
||||
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
|
||||
)
|
||||
}
|
||||
|
||||
/// Try to parse a Reddit thread from old.reddit.com HTML.
|
||||
/// Returns `None` if the page doesn't have recognisable Reddit structure.
|
||||
pub fn try_extract_thread(html: &str, url: &str) -> Option<RedditThread> {
|
||||
if !url.contains("/comments/") {
|
||||
return None;
|
||||
}
|
||||
let doc = Html::parse_document(html);
|
||||
let post = parse_post(&doc);
|
||||
let op = post.as_ref().map(|p| p.author.as_str()).unwrap_or("");
|
||||
let comments = parse_comments(&doc, op);
|
||||
|
||||
if post.is_none() && comments.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(RedditThread {
|
||||
source_url: url.to_string(),
|
||||
post,
|
||||
comments,
|
||||
})
|
||||
}
|
||||
|
||||
/// Entry point for `webclaw-core`'s extraction fast path.
|
||||
pub fn try_extract(html: &str, url: &str) -> Option<ExtractionResult> {
|
||||
let thread = try_extract_thread(html, url)?;
|
||||
Some(to_extraction_result(&thread))
|
||||
}
|
||||
|
||||
// ─── ExtractionResult builder ──────────────────────────────────────────────────
|
||||
|
||||
fn to_extraction_result(thread: &RedditThread) -> ExtractionResult {
|
||||
let md = to_markdown(thread);
|
||||
let plain = plain_text(&md);
|
||||
let wc = md.split_whitespace().count();
|
||||
|
||||
let (title, author, site_name) = thread
|
||||
.post
|
||||
.as_ref()
|
||||
.map(|p| {
|
||||
(
|
||||
Some(p.title.clone()),
|
||||
Some(p.author.clone()),
|
||||
p.subreddit.clone(),
|
||||
)
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
ExtractionResult {
|
||||
metadata: Metadata {
|
||||
title,
|
||||
description: None,
|
||||
author,
|
||||
published_date: None,
|
||||
language: Some("en".to_string()),
|
||||
url: Some(thread.source_url.clone()),
|
||||
site_name,
|
||||
image: None,
|
||||
favicon: None,
|
||||
word_count: wc,
|
||||
},
|
||||
content: Content {
|
||||
markdown: md,
|
||||
plain_text: plain,
|
||||
links: vec![],
|
||||
images: vec![],
|
||||
code_blocks: vec![],
|
||||
raw_html: None,
|
||||
},
|
||||
domain_data: Some(DomainData {
|
||||
domain_type: DomainType::Social,
|
||||
}),
|
||||
structured_data: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Markdown rendering ────────────────────────────────────────────────────────
|
||||
|
||||
pub fn to_markdown(thread: &RedditThread) -> String {
|
||||
let mut out = String::new();
|
||||
|
||||
if let Some(p) = &thread.post {
|
||||
out.push_str(&format!("# {}\n\n", p.title));
|
||||
|
||||
let pts = pt_label(Some(p.score));
|
||||
let cmt = match p.num_comments {
|
||||
0 => String::new(),
|
||||
1 => " · 1 comment".to_string(),
|
||||
n => format!(" · {n} comments"),
|
||||
};
|
||||
let sub = p.subreddit.as_deref().unwrap_or("?");
|
||||
out.push_str(&format!("**u/{}** · r/{sub} · {pts}{cmt}\n\n", p.author));
|
||||
|
||||
if let Some(ref body) = p.body
|
||||
&& !body.is_empty()
|
||||
{
|
||||
out.push_str(body);
|
||||
out.push_str("\n\n");
|
||||
}
|
||||
if let Some(ref link) = p.url
|
||||
&& !p.is_self
|
||||
{
|
||||
out.push_str(&format!("[Link]({link})\n\n"));
|
||||
}
|
||||
out.push_str("---\n\n");
|
||||
}
|
||||
|
||||
if !thread.comments.is_empty() {
|
||||
out.push_str("## Comments\n\n");
|
||||
for c in &thread.comments {
|
||||
render_comment(c, &mut out);
|
||||
}
|
||||
}
|
||||
|
||||
collapse_blank_lines(out.trim_end())
|
||||
}
|
||||
|
||||
/// Render one comment + its replies. Nesting is expressed with blockquote
|
||||
/// depth (`> ` per level) rather than leading spaces: space-indentation of
|
||||
/// 4+ would turn ordinary text and ``` fences into CommonMark indented code
|
||||
/// blocks, corrupting any comment at depth ≥ 2.
|
||||
fn render_comment(c: &RedditComment, out: &mut String) {
|
||||
let q = "> ".repeat(c.depth);
|
||||
let blank = ">".repeat(c.depth);
|
||||
let author = if c.is_op {
|
||||
format!("**u/{} [OP]**", c.author)
|
||||
} else {
|
||||
format!("**u/{}**", c.author)
|
||||
};
|
||||
out.push_str(&format!("{q}{author} · {}\n", pt_label(c.score)));
|
||||
for line in c.body.lines() {
|
||||
if line.is_empty() {
|
||||
out.push_str(&blank);
|
||||
out.push('\n');
|
||||
} else {
|
||||
out.push_str(&q);
|
||||
out.push_str(line);
|
||||
out.push('\n');
|
||||
}
|
||||
}
|
||||
out.push('\n');
|
||||
for reply in &c.replies {
|
||||
render_comment(reply, out);
|
||||
}
|
||||
}
|
||||
|
||||
/// Human-readable score label: `"score hidden"` for `None`, singular `pt`
/// for exactly ±1, plural `pts` for everything else (including 0).
fn pt_label(n: Option<i64>) -> String {
    let Some(score) = n else {
        return "score hidden".to_string();
    };
    let unit = if matches!(score, 1 | -1) { "pt" } else { "pts" };
    format!("{score} {unit}")
}
|
||||
|
||||
/// Cap every run of consecutive `\n` at two, i.e. at most one blank line
/// between blocks, so blockquote prefixes and `<pre>` spacing don't leave
/// large gaps. All other characters pass through untouched.
fn collapse_blank_lines(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    let mut run = 0usize;
    for ch in s.chars() {
        // Length of the current newline run, including `ch`.
        run = if ch == '\n' { run + 1 } else { 0 };
        if run <= 2 {
            collapsed.push(ch);
        }
    }
    collapsed
}
|
||||
|
||||
/// Derive plain text from rendered markdown, line by line.
///
/// Per line: leading whitespace is dropped, then a single leading
/// blockquote marker (`> ` or `>`), then any leading `#` heading markers;
/// finally `**`, `~~`, `*` and backticks are removed. Only one blockquote
/// level is stripped on purpose — greedy stripping ate legitimate
/// `>`-prefixed quoted content.
fn plain_text(md: &str) -> String {
    let mut cleaned: Vec<String> = Vec::new();
    for raw in md.lines() {
        let line = raw.trim_start();
        let line = match line.strip_prefix("> ") {
            Some(rest) => rest,
            None => line.strip_prefix('>').unwrap_or(line),
        };
        let line = line.trim_start_matches('#').trim_start();

        // Paired markers first, then any remaining single markers.
        let text: String = line
            .replace("**", "")
            .replace("~~", "")
            .chars()
            .filter(|c| !matches!(c, '*' | '`'))
            .collect();
        cleaned.push(text);
    }
    cleaned.join("\n")
}
|
||||
|
||||
// ─── HTML parsing ──────────────────────────────────────────────────────────────
|
||||
|
||||
fn parse_post(doc: &Html) -> Option<RedditPost> {
|
||||
let sel = Selector::parse("#siteTable .thing.link").ok()?;
|
||||
let thing = doc.select(&sel).next()?;
|
||||
let v = thing.value();
|
||||
|
||||
let id = v
|
||||
.attr("data-fullname")
|
||||
.map(|s| s.trim_start_matches("t3_").to_string());
|
||||
let author = v.attr("data-author").unwrap_or("[deleted]").to_string();
|
||||
let subreddit = v.attr("data-subreddit").map(str::to_string);
|
||||
let score: i64 = v
|
||||
.attr("data-score")
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(0);
|
||||
let num_comments: usize = v
|
||||
.attr("data-comments-count")
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(0);
|
||||
let permalink_path = v.attr("data-permalink").unwrap_or("");
|
||||
let permalink = format!("https://old.reddit.com{permalink_path}");
|
||||
// Self-posts carry the `self` class and a `self.<sub>` domain; their
|
||||
// data-url points back at the permalink rather than an external site.
|
||||
let is_self = v.has_class("self", scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||
|| v.attr("data-domain")
|
||||
.is_some_and(|d| d.starts_with("self."));
|
||||
let link_url = v.attr("data-url").map(str::to_string);
|
||||
let url = if is_self { None } else { link_url };
|
||||
|
||||
// Title
|
||||
let sel_title = Selector::parse(".title a.title").ok()?;
|
||||
let title = thing
|
||||
.select(&sel_title)
|
||||
.next()
|
||||
.map(|el| el.text().collect::<String>().trim().to_string())
|
||||
.filter(|s| !s.is_empty())?;
|
||||
|
||||
// Flair
|
||||
let flair = Selector::parse(".linkflairlabel")
|
||||
.ok()
|
||||
.and_then(|s| thing.select(&s).next())
|
||||
.map(|el| el.text().collect::<String>().trim().to_string())
|
||||
.filter(|s| !s.is_empty());
|
||||
|
||||
// Self-text body: thing > .entry > .expando > .usertext-body [> .md]
|
||||
let body = direct_child(thing, "entry")
|
||||
.and_then(|entry| find_class(entry, "expando"))
|
||||
.and_then(|expando| find_class(expando, "usertext-body"))
|
||||
.and_then(|ut| find_class(ut, "md"))
|
||||
.map(md_to_markdown)
|
||||
.filter(|s| !s.is_empty());
|
||||
|
||||
// Datetime
|
||||
let created_utc = Selector::parse("time[datetime]")
|
||||
.ok()
|
||||
.and_then(|s| thing.select(&s).next())
|
||||
.and_then(|t| t.value().attr("datetime"))
|
||||
.map(str::to_string);
|
||||
|
||||
Some(RedditPost {
|
||||
id,
|
||||
title,
|
||||
author,
|
||||
subreddit,
|
||||
score,
|
||||
body,
|
||||
num_comments,
|
||||
permalink,
|
||||
url,
|
||||
is_self,
|
||||
flair,
|
||||
created_utc,
|
||||
})
|
||||
}
|
||||
|
||||
// ─── Comment parsing ───────────────────────────────────────────────────────────
|
||||
//
|
||||
// old.reddit.com nests comments structurally, not via a depth attribute:
|
||||
//
|
||||
// .commentarea
|
||||
// .sitetable.nestedlisting
|
||||
// .comment.thing ← root comment
|
||||
// .entry → form → .usertext-body → .md ← its own body
|
||||
// .child
|
||||
// .sitetable.listing
|
||||
// .comment.thing ← reply (recurse)
|
||||
//
|
||||
// `data-depth`/`data-replies` are absent or always "0" in the logged-out
|
||||
// HTML, so we walk the tree by recursing into each comment's `.child`.
|
||||
|
||||
fn parse_comments(doc: &Html, op: &str) -> Vec<RedditComment> {
|
||||
// Root listing is `.sitetable.nestedlisting` inside `.commentarea`
|
||||
// (note: `commentarea` is a class on old.reddit, not an id). Fall back
|
||||
// to the first `.nestedlisting` anywhere for comment-permalink pages.
|
||||
let listing = Selector::parse(".commentarea .sitetable.nestedlisting")
|
||||
.ok()
|
||||
.and_then(|s| doc.select(&s).next())
|
||||
.or_else(|| {
|
||||
Selector::parse(".sitetable.nestedlisting")
|
||||
.ok()
|
||||
.and_then(|s| doc.select(&s).next())
|
||||
});
|
||||
|
||||
match listing {
|
||||
Some(l) => walk_comment_level(l, op, 0),
|
||||
None => vec![],
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse the direct-child `.comment.thing` elements of a comment listing.
|
||||
fn walk_comment_level(listing: ElementRef, op: &str, depth: usize) -> Vec<RedditComment> {
|
||||
listing
|
||||
.children()
|
||||
.filter_map(ElementRef::wrap)
|
||||
.filter(|c| {
|
||||
let val = c.value();
|
||||
val.has_class("comment", scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||
&& val.has_class("thing", scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||
})
|
||||
.filter_map(|c| parse_one_comment(c, op, depth))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn parse_one_comment(c: ElementRef, op: &str, depth: usize) -> Option<RedditComment> {
|
||||
let v = c.value();
|
||||
|
||||
// "load more comments" placeholders are `.thing` with type=morechildren.
|
||||
// They carry a t1_ fullname but no real content — skip them.
|
||||
if v.attr("data-type") == Some("morechildren")
|
||||
|| v.has_class(
|
||||
"morechildren",
|
||||
scraper::CaseSensitivity::AsciiCaseInsensitive,
|
||||
)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let is_deleted = v.has_class("deleted", scraper::CaseSensitivity::AsciiCaseInsensitive);
|
||||
let id = v
|
||||
.attr("data-fullname")
|
||||
.map(|s| s.trim_start_matches("t1_").to_string());
|
||||
let author = v
|
||||
.attr("data-author")
|
||||
.filter(|a| !a.is_empty())
|
||||
.unwrap_or("[deleted]")
|
||||
.to_string();
|
||||
|
||||
// Own body lives in `.entry > form > .usertext-body > .md`. `.child`
|
||||
// (nested replies) is a sibling of `.entry`, so descending within
|
||||
// `.entry` never crosses into a reply's body.
|
||||
let entry = direct_child(c, "entry");
|
||||
let body = entry
|
||||
.and_then(|e| find_class(e, "usertext-body"))
|
||||
.and_then(|ut| find_class(ut, "md"))
|
||||
.map(md_to_markdown)
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or_else(|| {
|
||||
if is_deleted {
|
||||
"[removed]".into()
|
||||
} else {
|
||||
String::new()
|
||||
}
|
||||
});
|
||||
|
||||
// Displayed score is `.score.unvoted`, whose `title` holds the exact
|
||||
// integer (the sibling likes/dislikes spans are ±1). Hidden-score
|
||||
// comments have no `.score.unvoted` span, so `comment_score` returns
|
||||
// None — kept distinct from a genuine 0.
|
||||
let score = entry.and_then(comment_score);
|
||||
|
||||
let created_utc = entry
|
||||
.zip(Selector::parse("time[datetime]").ok())
|
||||
.and_then(|(e, s)| e.select(&s).next())
|
||||
.and_then(|t| t.value().attr("datetime"))
|
||||
.map(str::to_string);
|
||||
|
||||
let is_op = !is_deleted && author != "[deleted]" && author == op;
|
||||
|
||||
// Replies: `.comment > .child > .sitetable > .comment`.
|
||||
let replies = direct_child(c, "child")
|
||||
.and_then(|child| direct_child(child, "sitetable"))
|
||||
.map(|st| walk_comment_level(st, op, depth + 1))
|
||||
.unwrap_or_default();
|
||||
|
||||
Some(RedditComment {
|
||||
id,
|
||||
author,
|
||||
body,
|
||||
score,
|
||||
depth,
|
||||
is_op,
|
||||
created_utc,
|
||||
replies,
|
||||
})
|
||||
}
|
||||
|
||||
/// Read a comment's score from the `.score.unvoted` span inside `.entry`.
|
||||
/// Prefers the `title` attribute (exact integer); falls back to the text.
|
||||
/// Returns `None` when Reddit hides the score (no `.score.unvoted` span).
|
||||
fn comment_score(entry: ElementRef) -> Option<i64> {
|
||||
let sel = Selector::parse("span.score.unvoted").ok()?;
|
||||
let span = entry.select(&sel).next()?;
|
||||
span.value()
|
||||
.attr("title")
|
||||
.and_then(|t| t.trim().parse().ok())
|
||||
.or_else(|| parse_score(&span.text().collect::<String>()))
|
||||
}
|
||||
|
||||
// ─── DOM helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
/// First direct child element whose class list includes `class`.
|
||||
fn direct_child<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
|
||||
el.children().filter_map(ElementRef::wrap).find(|c| {
|
||||
c.value()
|
||||
.has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||
})
|
||||
}
|
||||
|
||||
/// First descendant (any depth) whose class list includes `class`.
|
||||
fn find_class<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
|
||||
el.children().filter_map(ElementRef::wrap).find_map(|c| {
|
||||
if c.value()
|
||||
.has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||
{
|
||||
Some(c)
|
||||
} else {
|
||||
find_class(c, class)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Parse the leading whitespace-delimited word of `text` as a signed integer.
///
/// A Unicode minus sign (U+2212) is accepted in place of ASCII `-`. Returns
/// `None` when `text` has no words or the first word is not an integer.
fn parse_score(text: &str) -> Option<i64> {
    let first_word = text.split_whitespace().next()?;
    // Normalise the typographic minus so `str::parse` recognises the sign.
    let ascii_signed = first_word.replace('−', "-");
    ascii_signed.parse().ok()
}
|
||||
|
||||
// ─── .md div → markdown ────────────────────────────────────────────────────────
|
||||
|
||||
fn md_to_markdown(el: ElementRef) -> String {
|
||||
let mut out = String::new();
|
||||
render_children(el, &mut out);
|
||||
out.trim().to_string()
|
||||
}
|
||||
|
||||
fn render_children(el: ElementRef, out: &mut String) {
|
||||
use scraper::node::Node;
|
||||
for child in el.children() {
|
||||
match child.value() {
|
||||
Node::Text(t) => out.push_str(t.as_ref()),
|
||||
Node::Element(_) => {
|
||||
if let Some(c) = ElementRef::wrap(child) {
|
||||
render_node(c, out);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn render_node(el: ElementRef, out: &mut String) {
|
||||
match el.value().name() {
|
||||
"p" | "div" => {
|
||||
let mut inner = String::new();
|
||||
render_children(el, &mut inner);
|
||||
let t = inner.trim();
|
||||
if !t.is_empty() {
|
||||
out.push_str(t);
|
||||
out.push_str("\n\n");
|
||||
}
|
||||
}
|
||||
"br" => out.push('\n'),
|
||||
"strong" | "b" => {
|
||||
let t: String = el.text().collect();
|
||||
let t = t.trim();
|
||||
if !t.is_empty() {
|
||||
out.push_str(&format!("**{t}**"));
|
||||
}
|
||||
}
|
||||
"em" | "i" => {
|
||||
let t: String = el.text().collect();
|
||||
let t = t.trim();
|
||||
if !t.is_empty() {
|
||||
out.push_str(&format!("*{t}*"));
|
||||
}
|
||||
}
|
||||
"del" | "s" | "strike" => {
|
||||
let t: String = el.text().collect();
|
||||
let t = t.trim();
|
||||
if !t.is_empty() {
|
||||
out.push_str(&format!("~~{t}~~"));
|
||||
}
|
||||
}
|
||||
"code" => {
|
||||
let t: String = el.text().collect();
|
||||
out.push('`');
|
||||
out.push_str(t.trim());
|
||||
out.push('`');
|
||||
}
|
||||
"pre" => {
|
||||
let t: String = el.text().collect();
|
||||
out.push_str("```\n");
|
||||
out.push_str(t.trim_end_matches('\n'));
|
||||
out.push_str("\n```\n\n");
|
||||
}
|
||||
"a" => {
|
||||
let text: String = el.text().collect();
|
||||
let text = text.trim();
|
||||
if !text.is_empty() {
|
||||
// Preserve the destination as a markdown link. Resolve
|
||||
// root-relative reddit hrefs (/r/, /user/, /wiki/, ...) and
|
||||
// drop non-navigational ones (javascript:, #fragment, mailto:).
|
||||
let href = el.value().attr("href").unwrap_or("");
|
||||
if href.starts_with("http://") || href.starts_with("https://") {
|
||||
out.push_str(&format!("[{text}]({href})"));
|
||||
} else if href.starts_with('/') {
|
||||
out.push_str(&format!("[{text}](https://old.reddit.com{href})"));
|
||||
} else {
|
||||
out.push_str(text);
|
||||
}
|
||||
}
|
||||
}
|
||||
"blockquote" => {
|
||||
let mut inner = String::new();
|
||||
render_children(el, &mut inner);
|
||||
let trimmed = inner.trim();
|
||||
for line in trimmed.lines() {
|
||||
out.push('>');
|
||||
if !line.is_empty() {
|
||||
out.push(' ');
|
||||
out.push_str(line);
|
||||
}
|
||||
out.push('\n');
|
||||
}
|
||||
out.push('\n');
|
||||
}
|
||||
"ul" => render_list(el, false, 0, out),
|
||||
"ol" => render_list(el, true, 0, out),
|
||||
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
|
||||
let level = el
|
||||
.value()
|
||||
.name()
|
||||
.chars()
|
||||
.nth(1)
|
||||
.and_then(|c| c.to_digit(10))
|
||||
.unwrap_or(2) as usize;
|
||||
let t: String = el.text().collect();
|
||||
let t = t.trim();
|
||||
if !t.is_empty() {
|
||||
out.push_str(&"#".repeat(level));
|
||||
out.push(' ');
|
||||
out.push_str(t);
|
||||
out.push_str("\n\n");
|
||||
}
|
||||
}
|
||||
"hr" => out.push_str("---\n\n"),
|
||||
"sup" => {
|
||||
let t: String = el.text().collect();
|
||||
out.push_str(t.trim());
|
||||
}
|
||||
// Unknown / generic containers: recurse
|
||||
_ => render_children(el, out),
|
||||
}
|
||||
}
|
||||
|
||||
/// Render a `<ul>`/`<ol>`, indenting nested lists by two spaces per level so
|
||||
/// child items keep their own line instead of being glued to the parent.
|
||||
fn render_list(list: ElementRef, ordered: bool, indent: usize, out: &mut String) {
|
||||
use scraper::node::Node;
|
||||
let pad = " ".repeat(indent);
|
||||
let mut n = 0;
|
||||
for li in list
|
||||
.children()
|
||||
.filter_map(ElementRef::wrap)
|
||||
.filter(|c| c.value().name() == "li")
|
||||
{
|
||||
n += 1;
|
||||
// Inline content of this <li>, excluding nested lists (rendered after).
|
||||
let mut inline = String::new();
|
||||
for child in li.children() {
|
||||
match child.value() {
|
||||
Node::Text(t) => inline.push_str(t.as_ref()),
|
||||
Node::Element(e) if e.name() == "ul" || e.name() == "ol" => {}
|
||||
Node::Element(_) => {
|
||||
if let Some(c) = ElementRef::wrap(child) {
|
||||
render_node(c, &mut inline);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let marker = if ordered {
|
||||
format!("{n}. ")
|
||||
} else {
|
||||
"- ".to_string()
|
||||
};
|
||||
out.push_str(&format!("{pad}{marker}{}\n", inline.trim()));
|
||||
|
||||
for child in li.children().filter_map(ElementRef::wrap) {
|
||||
match child.value().name() {
|
||||
"ul" => render_list(child, false, indent + 1, out),
|
||||
"ol" => render_list(child, true, indent + 1, out),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
if indent == 0 {
|
||||
out.push('\n');
|
||||
}
|
||||
}
|
||||
|
||||
// ─── URL helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
/// Return the authority part of `url`: the text after the first `://` (or the
/// whole input when there is no `://`), cut at the first `/`, `?` or `#`.
///
/// No further normalisation is done, so a port or userinfo, if present, is
/// part of the returned slice.
fn host_of(url: &str) -> &str {
    let after_scheme = match url.split("://").nth(1) {
        Some(rest) => rest,
        None => url,
    };
    let end = after_scheme
        .find(['/', '?', '#'])
        .unwrap_or(after_scheme.len());
    &after_scheme[..end]
}
|
||||
|
||||
// ─── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn is_reddit_url_recognises_variants() {
        assert!(is_reddit_url(
            "https://www.reddit.com/r/rust/comments/abc/x/"
        ));
        assert!(is_reddit_url(
            "https://old.reddit.com/r/rust/comments/abc/x/"
        ));
        assert!(is_reddit_url("https://reddit.com/r/rust/comments/abc/x/"));
        assert!(!is_reddit_url("https://example.com"));
    }

    #[test]
    fn try_extract_thread_returns_none_for_listing_url() {
        let html = "<html><body></body></html>";
        assert!(try_extract_thread(html, "https://old.reddit.com/r/rust/").is_none());
    }

    #[test]
    fn md_to_markdown_basic() {
        let html =
            Html::parse_fragment(r#"<div class="md"><p>Hello <strong>world</strong>!</p></div>"#);
        let sel = Selector::parse(".md").unwrap();
        let el = html.select(&sel).next().unwrap();
        let md = md_to_markdown(el);
        assert!(md.contains("**world**"));
        assert!(md.contains("Hello"));
    }

    #[test]
    fn md_to_markdown_blockquote_and_code() {
        let html = Html::parse_fragment(
            r#"<div class="md"><blockquote><p>Quoted</p></blockquote><pre><code>fn main() {}</code></pre></div>"#,
        );
        let sel = Selector::parse(".md").unwrap();
        let el = html.select(&sel).next().unwrap();
        let md = md_to_markdown(el);
        assert!(md.contains("> Quoted"));
        assert!(md.contains("```"));
        assert!(md.contains("fn main()"));
    }

    #[test]
    fn md_to_markdown_link_preserves_href() {
        let abs = Html::parse_fragment(
            r#"<div class="md"><p>see <a href="https://example.com/x">this</a></p></div>"#,
        );
        let sel = Selector::parse(".md").unwrap();
        let el = abs.select(&sel).next().unwrap();
        assert!(md_to_markdown(el).contains("[this](https://example.com/x)"));

        // Root-relative reddit links resolve against old.reddit.com.
        let rel = Html::parse_fragment(
            r#"<div class="md"><p><a href="/r/rust/wiki/faq">faq</a></p></div>"#,
        );
        let el = rel.select(&sel).next().unwrap();
        assert!(md_to_markdown(el).contains("[faq](https://old.reddit.com/r/rust/wiki/faq)"));

        // javascript: / fragment hrefs degrade to bare text.
        let js = Html::parse_fragment(
            r#"<div class="md"><p><a href="javascript:void(0)">x</a></p></div>"#,
        );
        let el = js.select(&sel).next().unwrap();
        let out = md_to_markdown(el);
        assert!(out.contains('x') && !out.contains("javascript"));
    }

    // ── Regression tests against REAL old.reddit.com HTML ──────────────────
    //
    // These fixtures are genuine pages fetched from old.reddit.com (see
    // testdata/reddit/). They are the ground truth — synthetic HTML is too
    // easy to write to match wrong assumptions, which is exactly how the
    // first version of this parser shipped silently broken.

    /// Load a fixture from `testdata/reddit/` under the crate root.
    ///
    /// The path is anchored at `CARGO_MANIFEST_DIR` (set by Cargo when it
    /// runs tests) so the tests do not depend on the working directory; when
    /// the variable is absent the current directory is used instead. A
    /// missing fixture panics with the full path that was tried.
    fn fixture(name: &str) -> String {
        let root = std::env::var("CARGO_MANIFEST_DIR").unwrap_or_else(|_| ".".to_string());
        let path = std::path::Path::new(&root)
            .join("testdata")
            .join("reddit")
            .join(name);
        std::fs::read_to_string(&path)
            .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", path.display()))
    }

    /// Number of comments in the forest, counting every nested reply.
    fn total_comments(cs: &[RedditComment]) -> usize {
        cs.len() + cs.iter().map(|c| total_comments(&c.replies)).sum::<usize>()
    }

    /// Flatten the comment forest into `out`, parents before their replies.
    fn collect<'a>(cs: &'a [RedditComment], out: &mut Vec<&'a RedditComment>) {
        for c in cs {
            out.push(c);
            collect(&c.replies, out);
        }
    }

    #[test]
    fn real_link_post_metadata() {
        // pandas: external-link post (blog.geekuni.com), 34 comments.
        let html = fixture("pandas_34comments.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/programming/comments/abc123/t/",
        )
        .expect("should parse");
        let p = t.post.expect("post");
        assert_eq!(p.author, "Horror-Willingness74");
        assert_eq!(p.subreddit.as_deref(), Some("programming"));
        assert_eq!(p.score, 43);
        assert_eq!(p.num_comments, 34, "data-comments-count");
        assert!(!p.is_self, "external blog link, not a self post");
        assert_eq!(
            p.url.as_deref(),
            Some("https://blog.geekuni.com/2026/06/why-learn-pandas.html")
        );
        assert!(p.title.contains("Pandas"));
    }

    #[test]
    fn real_self_post_metadata() {
        // A self-post (text) on r/rust: `self.rust` domain, self-text body,
        // no external url.
        let html = fixture("rust_selfpost_36comments.html");
        let t = try_extract_thread(&html, "https://old.reddit.com/r/rust/comments/abc123/t/")
            .expect("should parse");
        let p = t.post.expect("post");
        assert!(p.is_self, "self.rust domain → self post");
        assert_eq!(p.url, None, "self posts carry no external url");
        assert_eq!(p.subreddit.as_deref(), Some("rust"));
        assert!(
            p.body
                .as_deref()
                .unwrap_or("")
                .contains("IT project manager"),
            "self-text body should be extracted: {:?}",
            p.body
        );
    }

    #[test]
    fn real_comment_bodies_and_scores() {
        // The original bug: every comment body came back empty because
        // .usertext-body sits inside a <form>, not directly under .entry.
        let html = fixture("ebpf_6comments.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/programming/comments/abc123/t/",
        )
        .expect("should parse");
        // 6 comments total: 5 top-level + 1 nested reply (admalledd under ejrh).
        assert_eq!(t.comments.len(), 5, "5 top-level comments");
        assert_eq!(total_comments(&t.comments), 6, "6 comments incl. nested");
        let teerre = t
            .comments
            .iter()
            .find(|c| c.author == "teerre")
            .expect("teerre");
        assert!(
            teerre.body.contains("Very cool blog"),
            "body must be populated, got {:?}",
            teerre.body
        );
        // Score comes from .score.unvoted title (the real value), not the
        // ±1 likes/dislikes siblings.
        assert_eq!(
            teerre.score,
            Some(10),
            "unvoted score, not dislikes(9)/likes(11)"
        );
        assert!(
            t.comments.iter().all(|c| !c.body.is_empty()),
            "no comment body should be empty"
        );
    }

    #[test]
    fn real_nested_comment_tree() {
        // pandas has structurally-nested replies (.child > .sitetable >
        // .comment). data-depth/data-replies are absent in logged-out HTML.
        let html = fixture("pandas_34comments.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/programming/comments/abc123/t/",
        )
        .expect("should parse");
        // 34 rendered comments with content + 1 [deleted] node that old.reddit
        // still shows because it has live replies = 35 nodes in the tree.
        assert_eq!(
            total_comments(&t.comments),
            35,
            "all comments incl. nested + deleted"
        );
        let nested = t.comments.iter().any(|c| !c.replies.is_empty());
        assert!(nested, "at least one comment must have replies");
        let max_depth = {
            fn d(cs: &[RedditComment]) -> usize {
                cs.iter().map(|c| 1 + d(&c.replies)).max().unwrap_or(0)
            }
            d(&t.comments)
        };
        assert!(max_depth >= 2, "tree should be more than one level deep");
        let a_reply = t.comments.iter().find_map(|c| c.replies.first());
        assert_eq!(a_reply.map(|r| r.depth), Some(1));
    }

    #[test]
    fn real_morechildren_stubs_skipped() {
        // AskReddit deep thread: 259 .thing[data-fullname=t1_] markers, but
        // some are "load more comments" stubs (data-type=morechildren) with
        // no author/body. They must not appear as ghost comments.
        let html = fixture("askreddit_deep_morechildren.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/AskReddit/comments/abc123/t/",
        )
        .expect("should parse");
        fn check(cs: &[RedditComment]) {
            for c in cs {
                let ghost = c.body.is_empty() && c.author == "[deleted]" && c.id.is_some();
                assert!(!ghost, "morechildren stub leaked as comment: {:?}", c.id);
                check(&c.replies);
            }
        }
        check(&t.comments);
    }

    #[test]
    fn real_hidden_score_is_none_not_zero() {
        // AskReddit has fresh comments with `.score-hidden` (no .score.unvoted
        // span). These must be None, distinct from a genuine 0-score comment.
        let html = fixture("askreddit_deep_morechildren.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/AskReddit/comments/abc123/t/",
        )
        .expect("should parse");
        let mut all = Vec::new();
        collect(&t.comments, &mut all);
        assert!(
            all.iter().any(|c| c.score.is_none()),
            "some fresh comments have hidden scores → None"
        );
    }

    #[test]
    fn real_deleted_comment_preserves_subtree() {
        // pandas has a [deleted] comment that still has visible replies. The
        // structural walk must keep it so its children aren't orphaned.
        let html = fixture("pandas_34comments.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/programming/comments/abc123/t/",
        )
        .expect("should parse");
        let mut all = Vec::new();
        collect(&t.comments, &mut all);
        let deleted: Vec<_> = all.iter().filter(|c| c.author == "[deleted]").collect();
        assert!(!deleted.is_empty(), "should keep deleted comments");
        assert!(
            deleted.iter().any(|c| !c.replies.is_empty()),
            "a deleted comment with replies must retain its subtree"
        );
        assert!(deleted.iter().all(|c| !c.is_op));
    }

    #[test]
    fn real_markdown_is_commonmark_clean() {
        // Guards the markdown bugs the verification workflow found: no
        // whitespace-only "blank" lines, and ``` fences never indented 4+
        // spaces (which would turn them into literal indented code blocks).
        let html = fixture("elixir_60comments.html");
        let result = try_extract(
            &html,
            "https://old.reddit.com/r/programming/comments/abc123/t/",
        )
        .expect("should extract");
        let md = &result.content.markdown;
        assert!(md.starts_with("# "));
        assert!(md.contains("## Comments"));
        for line in md.lines() {
            assert!(
                !(line.starts_with(' ') && line.trim().is_empty()),
                "whitespace-only line: {line:?}"
            );
            let trimmed = line.trim_start_matches(['>', ' ']);
            if trimmed.starts_with("```") {
                let indent = line.len() - line.trim_start_matches(' ').len();
                assert!(indent < 4, "code fence indented {indent} spaces: {line:?}");
            }
        }
        assert!(result.metadata.word_count > 20);
    }
}
|
||||
596
crates/webclaw-core/testdata/reddit/askreddit_deep_morechildren.html
vendored
Normal file
596
crates/webclaw-core/testdata/reddit/askreddit_deep_morechildren.html
vendored
Normal file
File diff suppressed because one or more lines are too long
82
crates/webclaw-core/testdata/reddit/ebpf_6comments.html
vendored
Normal file
82
crates/webclaw-core/testdata/reddit/ebpf_6comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
312
crates/webclaw-core/testdata/reddit/elixir_60comments.html
vendored
Normal file
312
crates/webclaw-core/testdata/reddit/elixir_60comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
227
crates/webclaw-core/testdata/reddit/pandas_34comments.html
vendored
Normal file
227
crates/webclaw-core/testdata/reddit/pandas_34comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
234
crates/webclaw-core/testdata/reddit/rust_selfpost_36comments.html
vendored
Normal file
234
crates/webclaw-core/testdata/reddit/rust_selfpost_36comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -13,10 +13,17 @@ thiserror = { workspace = true }
|
|||
tracing = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
async-trait = "0.1"
|
||||
wreq = { version = "6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] }
|
||||
wreq-util = "3.0.0-rc.10"
|
||||
# Pinned to exact pre-release versions: wreq/wreq-util are release candidates
|
||||
# with no semver stability between rc.N builds. An exact pin keeps `cargo build`,
|
||||
# `cargo install` (which ignores Cargo.lock), and the release workflow all on the
|
||||
# version that compiles.
|
||||
wreq = { version = "=6.0.0-rc.29", features = ["cookies", "gzip", "brotli", "zstd", "deflate", "stream"] }
|
||||
wreq-util = "=3.0.0-rc.12"
|
||||
http = "1"
|
||||
bytes = "1"
|
||||
# Stream adapter for `wreq::Response::bytes_stream()` (wreq 6.0.0-rc.29 dropped
|
||||
# `Response::chunk()`); used to buffer bodies under the running size ceiling.
|
||||
futures-util = "0.3"
|
||||
url = "2"
|
||||
rand = "0.8"
|
||||
quick-xml = { version = "0.37", features = ["serde"] }
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ use std::hash::{Hash, Hasher};
|
|||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use futures_util::StreamExt;
|
||||
use rand::seq::SliceRandom;
|
||||
use tokio::sync::Semaphore;
|
||||
use tracing::{debug, instrument, warn};
|
||||
|
|
@ -118,7 +119,7 @@ impl Response {
|
|||
/// negotiated), so a tiny compressed payload that inflates to
|
||||
/// gigabytes is aborted as soon as the accumulated size crosses the
|
||||
/// cap — it never gets fully buffered in memory.
|
||||
async fn from_wreq(mut resp: wreq::Response) -> Result<Self, FetchError> {
|
||||
async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
|
||||
if let Some(len) = resp.content_length()
|
||||
&& len > MAX_BODY_BYTES
|
||||
{
|
||||
|
|
@ -130,12 +131,13 @@ impl Response {
|
|||
let url = resp.uri().to_string();
|
||||
let headers = resp.headers().clone();
|
||||
|
||||
// wreq 6.0.0-rc.29 dropped `Response::chunk()`. Stream post-decompression
|
||||
// bytes via `bytes_stream()` and keep enforcing the running ceiling so a
|
||||
// compression bomb is aborted before it is fully buffered in memory.
|
||||
let mut buf = bytes::BytesMut::new();
|
||||
while let Some(chunk) = resp
|
||||
.chunk()
|
||||
.await
|
||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?
|
||||
{
|
||||
let mut stream = resp.bytes_stream();
|
||||
while let Some(chunk) = stream.next().await {
|
||||
let chunk = chunk.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||
check_body_ceiling(buf.len(), chunk.len())?;
|
||||
buf.extend_from_slice(&chunk);
|
||||
}
|
||||
|
|
@ -160,9 +162,6 @@ impl Response {
|
|||
fn body(&self) -> &[u8] {
|
||||
&self.body
|
||||
}
|
||||
fn is_success(&self) -> bool {
|
||||
(200..300).contains(&self.status)
|
||||
}
|
||||
|
||||
fn text(&self) -> std::borrow::Cow<'_, str> {
|
||||
String::from_utf8_lossy(&self.body)
|
||||
|
|
@ -299,32 +298,15 @@ impl FetchClient {
|
|||
/// when you need literal no-rescue behavior (e.g. inside the rescue
|
||||
/// logic itself to avoid recursion).
|
||||
pub async fn fetch_smart(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||
// Reddit: the HTML page shows a verification interstitial for most
|
||||
// client IPs, but appending `.json` returns the post + comment tree
|
||||
// publicly. `parse_reddit_json` in downstream code knows how to read
|
||||
// the result; here we just do the URL swap at the fetch layer.
|
||||
if crate::reddit::is_reddit_url(url) && !url.ends_with(".json") {
|
||||
let json_url = crate::reddit::json_url(url);
|
||||
// Reddit's public .json API serves JSON to identifiable bot
|
||||
// User-Agents and blocks browser UAs with a verification wall.
|
||||
// Override our Chrome-profile UA for this specific call.
|
||||
let ua = concat!(
|
||||
"Webclaw/",
|
||||
env!("CARGO_PKG_VERSION"),
|
||||
" (+https://webclaw.io)"
|
||||
);
|
||||
if let Ok(resp) = self
|
||||
.fetch_with_headers(&json_url, &[("user-agent", ua)])
|
||||
.await
|
||||
&& resp.status == 200
|
||||
{
|
||||
let first = resp.html.trim_start().as_bytes().first().copied();
|
||||
if matches!(first, Some(b'{') | Some(b'[')) {
|
||||
return Ok(resp);
|
||||
}
|
||||
}
|
||||
// If the .json fetch failed or returned HTML, fall through.
|
||||
}
|
||||
// Reddit: fetch old.reddit.com for stable server-rendered HTML.
|
||||
// The JSON API is blocked; old.reddit.com works without JS or auth.
|
||||
let owned;
|
||||
let url = if crate::reddit::is_reddit_url(url) {
|
||||
owned = crate::reddit::to_old_reddit_url(url);
|
||||
owned.as_str()
|
||||
} else {
|
||||
url
|
||||
};
|
||||
|
||||
let resp = self.fetch(url).await?;
|
||||
|
||||
|
|
@ -496,23 +478,16 @@ impl FetchClient {
|
|||
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
|
||||
let url = parsed_url.as_str();
|
||||
|
||||
// Reddit fallback: use their JSON API to get post + full comment tree.
|
||||
if crate::reddit::is_reddit_url(url) {
|
||||
let json_url = crate::reddit::json_url(url);
|
||||
let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
|
||||
debug!("reddit detected, fetching {json_url}");
|
||||
|
||||
let client = self.pick_client(url);
|
||||
let resp = client.get(json_url.as_str()).send().await?;
|
||||
let response = Response::from_wreq(resp).await?;
|
||||
if response.is_success() {
|
||||
let bytes = response.body();
|
||||
match crate::reddit::parse_reddit_json(bytes, url) {
|
||||
Ok(result) => return Ok(result),
|
||||
Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
|
||||
}
|
||||
}
|
||||
}
|
||||
// Reddit: rewrite to old.reddit.com for stable server-rendered HTML.
|
||||
// webclaw-core's Reddit fast path then parses the thread structure.
|
||||
let reddit_owned;
|
||||
let url = if crate::reddit::is_reddit_url(url) {
|
||||
reddit_owned = crate::reddit::to_old_reddit_url(url);
|
||||
debug!("reddit: rewriting to {reddit_owned}");
|
||||
reddit_owned.as_str()
|
||||
} else {
|
||||
url
|
||||
};
|
||||
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
|
|
|
|||
|
|
@ -810,13 +810,18 @@ mod tests {
|
|||
|
||||
// --- CloudClient construction ------------------------------------------
|
||||
|
||||
// `WEBCLAW_API_KEY` is process-global; cargo runs tests in parallel
|
||||
// threads. Without serialization, a test that sets the var can race a
|
||||
// test asserting it is absent. This lock makes the env-mutating
|
||||
// CloudClient tests mutually exclusive (poison-tolerant: a panicking
|
||||
// test must not wedge the others).
|
||||
static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
|
||||
|
||||
#[test]
|
||||
fn cloud_client_explicit_key_wins_over_env() {
|
||||
// SAFETY: this test mutates process env. Serial tests only.
|
||||
// Set env to something, pass an explicit key, explicit should win.
|
||||
// (We don't actually *call* the API, just check the struct stored
|
||||
// the right key.)
|
||||
// rustc std::env::set_var is unsafe in newer toolchains.
|
||||
let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
|
||||
// SAFETY: env mutation is serialized by ENV_LOCK; set_var/remove_var
|
||||
// are unsafe on the 2024 toolchain. Explicit key must beat the env.
|
||||
unsafe {
|
||||
std::env::set_var("WEBCLAW_API_KEY", "from-env");
|
||||
}
|
||||
|
|
@ -829,6 +834,9 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn cloud_client_none_when_empty() {
|
||||
let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
|
||||
// SAFETY: env mutation serialized by ENV_LOCK. Clearing the var
|
||||
// (incl. any ambient runner value) is what makes this deterministic.
|
||||
unsafe {
|
||||
std::env::remove_var("WEBCLAW_API_KEY");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,12 +1,10 @@
|
|||
//! Reddit structured extractor — returns the full post + comment tree
|
||||
//! as typed JSON via Reddit's `.json` API.
|
||||
//! Reddit structured extractor — parses old.reddit.com HTML.
|
||||
//!
|
||||
//! The same trick the markdown extractor in `crate::reddit` uses:
|
||||
//! appending `.json` to any post URL returns the data the new SPA
|
||||
//! frontend would load client-side. Zero antibot, zero JS rendering.
|
||||
//! Fetches old.reddit.com (stable server-rendered HTML, no JS required)
|
||||
//! and delegates parsing to `webclaw_core::reddit`. Returns a typed JSON
|
||||
//! value with `{ url, post, comments }` structure.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use serde_json::Value;
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::error::FetchError;
|
||||
|
|
@ -24,182 +22,27 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
|
|||
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
let is_reddit_host = matches!(
|
||||
host,
|
||||
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
|
||||
);
|
||||
is_reddit_host && url.contains("/comments/")
|
||||
webclaw_core::reddit::is_reddit_url(url) && url.contains("/comments/")
|
||||
}
|
||||
|
||||
pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchError> {
|
||||
let json_url = build_json_url(url);
|
||||
let resp = client.fetch(&json_url).await?;
|
||||
let fetch_url = crate::reddit::to_old_reddit_url(url);
|
||||
let resp = client.fetch(&fetch_url).await?;
|
||||
if resp.status != 200 {
|
||||
return Err(FetchError::Build(format!(
|
||||
"reddit api returned status {}",
|
||||
"reddit: unexpected status {}",
|
||||
resp.status
|
||||
)));
|
||||
}
|
||||
|
||||
let listings: Vec<Listing> = serde_json::from_str(&resp.html)
|
||||
.map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?;
|
||||
let thread = webclaw_core::reddit::try_extract_thread(&resp.html, url).ok_or_else(|| {
|
||||
FetchError::BodyDecode(
|
||||
"reddit: page structure not recognised — is this a thread URL?".into(),
|
||||
)
|
||||
})?;
|
||||
|
||||
if listings.is_empty() {
|
||||
return Err(FetchError::BodyDecode("reddit response empty".into()));
|
||||
}
|
||||
|
||||
// First listing = the post (single t3 child).
|
||||
let post = listings
|
||||
.first()
|
||||
.and_then(|l| l.data.children.first())
|
||||
.filter(|t| t.kind == "t3")
|
||||
.map(|t| post_json(&t.data))
|
||||
.unwrap_or(Value::Null);
|
||||
|
||||
// Second listing = the comment tree.
|
||||
let comments: Vec<Value> = listings
|
||||
.get(1)
|
||||
.map(|l| l.data.children.iter().filter_map(comment_json).collect())
|
||||
.unwrap_or_default();
|
||||
|
||||
Ok(json!({
|
||||
"url": url,
|
||||
"post": post,
|
||||
"comments": comments,
|
||||
}))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// JSON shapers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn post_json(d: &ThingData) -> Value {
|
||||
json!({
|
||||
"id": d.id,
|
||||
"title": d.title,
|
||||
"author": d.author,
|
||||
"subreddit": d.subreddit_name_prefixed,
|
||||
"permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
|
||||
"url": d.url_overridden_by_dest,
|
||||
"is_self": d.is_self,
|
||||
"selftext": d.selftext,
|
||||
"score": d.score,
|
||||
"upvote_ratio": d.upvote_ratio,
|
||||
"num_comments": d.num_comments,
|
||||
"created_utc": d.created_utc,
|
||||
"link_flair_text": d.link_flair_text,
|
||||
"over_18": d.over_18,
|
||||
"spoiler": d.spoiler,
|
||||
"stickied": d.stickied,
|
||||
"locked": d.locked,
|
||||
})
|
||||
}
|
||||
|
||||
/// Render a single comment + its reply tree. Returns `None` for non-t1
|
||||
/// kinds (the trailing `more` placeholder Reddit injects at depth limits).
|
||||
fn comment_json(thing: &Thing) -> Option<Value> {
|
||||
if thing.kind != "t1" {
|
||||
return None;
|
||||
}
|
||||
let d = &thing.data;
|
||||
let replies: Vec<Value> = match &d.replies {
|
||||
Some(Replies::Listing(l)) => l.data.children.iter().filter_map(comment_json).collect(),
|
||||
_ => Vec::new(),
|
||||
};
|
||||
Some(json!({
|
||||
"id": d.id,
|
||||
"author": d.author,
|
||||
"body": d.body,
|
||||
"score": d.score,
|
||||
"created_utc": d.created_utc,
|
||||
"is_submitter": d.is_submitter,
|
||||
"stickied": d.stickied,
|
||||
"depth": d.depth,
|
||||
"permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
|
||||
"replies": replies,
|
||||
}))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// URL helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn host_of(url: &str) -> &str {
|
||||
url.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("")
|
||||
}
|
||||
|
||||
/// Build the Reddit JSON URL. We keep the original host (`www.reddit.com`
|
||||
/// or `old.reddit.com` as the caller gave us). Routing through
|
||||
/// `old.reddit.com` unconditionally looks appealing but that host has
|
||||
/// stricter UA-based blocking than `www.reddit.com`, while the main
|
||||
/// host accepts our Chrome-fingerprinted client fine.
|
||||
fn build_json_url(url: &str) -> String {
|
||||
let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
|
||||
format!("{clean}.json?raw_json=1")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reddit JSON types — only fields we render. Everything else is dropped.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Listing {
|
||||
data: ListingData,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ListingData {
|
||||
children: Vec<Thing>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Thing {
|
||||
kind: String,
|
||||
data: ThingData,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Default)]
|
||||
struct ThingData {
|
||||
// post (t3)
|
||||
id: Option<String>,
|
||||
title: Option<String>,
|
||||
selftext: Option<String>,
|
||||
subreddit_name_prefixed: Option<String>,
|
||||
url_overridden_by_dest: Option<String>,
|
||||
is_self: Option<bool>,
|
||||
upvote_ratio: Option<f64>,
|
||||
num_comments: Option<i64>,
|
||||
over_18: Option<bool>,
|
||||
spoiler: Option<bool>,
|
||||
stickied: Option<bool>,
|
||||
locked: Option<bool>,
|
||||
link_flair_text: Option<String>,
|
||||
|
||||
// comment (t1)
|
||||
author: Option<String>,
|
||||
body: Option<String>,
|
||||
score: Option<i64>,
|
||||
created_utc: Option<f64>,
|
||||
is_submitter: Option<bool>,
|
||||
depth: Option<i64>,
|
||||
permalink: Option<String>,
|
||||
|
||||
// recursive
|
||||
replies: Option<Replies>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum Replies {
|
||||
Listing(Listing),
|
||||
#[allow(dead_code)]
|
||||
Empty(String),
|
||||
serde_json::to_value(&thread)
|
||||
.map_err(|e| FetchError::BodyDecode(format!("reddit: serialisation error: {e}")))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
@ -207,28 +50,17 @@ mod tests {
|
|||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn matches_reddit_post_urls() {
|
||||
fn matches_thread_urls() {
|
||||
assert!(matches(
|
||||
"https://www.reddit.com/r/rust/comments/abc123/some_title/"
|
||||
));
|
||||
assert!(matches(
|
||||
"https://reddit.com/r/rust/comments/abc123/some_title"
|
||||
));
|
||||
assert!(matches("https://old.reddit.com/r/rust/comments/abc123/x/"));
|
||||
assert!(matches("https://reddit.com/r/rust/comments/abc/x"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rejects_non_post_reddit_urls() {
|
||||
fn rejects_listing_and_non_reddit() {
|
||||
assert!(!matches("https://www.reddit.com/r/rust"));
|
||||
assert!(!matches("https://www.reddit.com/user/foo"));
|
||||
assert!(!matches("https://example.com/r/rust/comments/x"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn json_url_appends_suffix_and_drops_query() {
|
||||
assert_eq!(
|
||||
build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"),
|
||||
"https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
|
||||
);
|
||||
assert!(!matches("https://example.com/r/rust/comments/abc/x"));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ pub mod extractors;
|
|||
pub mod fetcher;
|
||||
pub mod linkedin;
|
||||
pub mod locale;
|
||||
pub mod progress;
|
||||
pub mod proxy;
|
||||
pub mod reddit;
|
||||
pub mod sitemap;
|
||||
|
|
@ -24,6 +25,7 @@ pub use error::FetchError;
|
|||
pub use fetcher::Fetcher;
|
||||
pub use http::HeaderMap;
|
||||
pub use locale::{accept_language_for_tld, accept_language_for_url};
|
||||
pub use progress::{PROGRESS_INTERVAL, with_progress};
|
||||
pub use proxy::{parse_proxy_file, parse_proxy_line};
|
||||
pub use sitemap::SitemapEntry;
|
||||
pub use webclaw_pdf::PdfMode;
|
||||
|
|
|
|||
293
crates/webclaw-fetch/src/progress.rs
Normal file
293
crates/webclaw-fetch/src/progress.rs
Normal file
|
|
@ -0,0 +1,293 @@
|
|||
//! Periodic stderr progress line emitter for slow fetches (M13).
|
||||
//!
|
||||
//! Wraps any async fetch future with a `tokio::select!` against a
|
||||
//! `tokio::time::interval`. Every `PROGRESS_INTERVAL` (default 10s) of
|
||||
//! elapsed time, emits one line to STDERR of the form:
|
||||
//!
|
||||
//! ```text
|
||||
//! # webclaw: still fetching <URL> (Ns)
|
||||
//! ```
|
||||
//!
|
||||
//! Fetches completing in under `PROGRESS_INTERVAL` emit zero lines (the
|
||||
//! timer never fires). Stdout is untouched.
|
||||
//!
|
||||
//! The URL is truncated to at most 80 chars (head + `...` + tail) so
|
||||
//! pathological query strings don't blow up the stderr line. Truncation
|
||||
//! is char-boundary safe (operates on `chars`, not bytes).
|
||||
|
||||
use std::future::Future;
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio::time::{Instant, MissedTickBehavior, interval};
|
||||
|
||||
/// Default progress emission interval. The first tick fires at +10s
|
||||
/// elapsed; subsequent ticks at +20s, +30s, etc.
|
||||
pub const PROGRESS_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
/// Maximum URL length in the progress line. Longer URLs are truncated
|
||||
/// `head...tail` style.
|
||||
const MAX_URL_LEN: usize = 80;
|
||||
|
||||
/// Wrap a fetch future with the default 10s progress emitter. Writes
|
||||
/// progress lines to STDERR via `eprintln!`. Returns the inner future's
|
||||
/// result unchanged.
|
||||
pub async fn with_progress<F, T>(url: &str, future: F) -> T
|
||||
where
|
||||
F: Future<Output = T>,
|
||||
{
|
||||
with_progress_writer(url, future, PROGRESS_INTERVAL, |s| eprintln!("{s}")).await
|
||||
}
|
||||
|
||||
/// Test-friendly variant of [`with_progress`]: caller supplies the tick
|
||||
/// interval (so tests can use a 50ms period instead of 10s) and a
|
||||
/// writer closure (so tests can capture emitted lines without touching
|
||||
/// real stderr).
|
||||
///
|
||||
/// Production code uses [`with_progress`] which delegates here with
|
||||
/// [`PROGRESS_INTERVAL`] and an `eprintln!` writer.
|
||||
pub async fn with_progress_writer<F, T, W>(
|
||||
url: &str,
|
||||
future: F,
|
||||
period: Duration,
|
||||
mut writer: W,
|
||||
) -> T
|
||||
where
|
||||
F: Future<Output = T>,
|
||||
W: FnMut(String),
|
||||
{
|
||||
let start = Instant::now();
|
||||
let mut ticker = interval(period);
|
||||
// First tick of `tokio::time::interval(period)` fires *immediately*
|
||||
// (at construction time). We don't want a t=0 emit — consume that
|
||||
// first tick before entering the select loop. Subsequent ticks fire
|
||||
// at `start + period`, `start + 2*period`, ...
|
||||
ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
|
||||
ticker.tick().await;
|
||||
|
||||
tokio::pin!(future);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Bias toward the future — if both are ready (rare), prefer
|
||||
// returning the result over emitting a final tick.
|
||||
biased;
|
||||
result = &mut future => {
|
||||
return result;
|
||||
}
|
||||
_ = ticker.tick() => {
|
||||
let elapsed = start.elapsed();
|
||||
writer(format_progress_line(url, elapsed));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the progress line: `# webclaw: still fetching <URL> (Ns)`.
|
||||
/// URL is truncated via [`truncate_url`] to [`MAX_URL_LEN`] chars.
|
||||
/// Elapsed is rounded to whole seconds (10, 20, 30, ...).
|
||||
pub(crate) fn format_progress_line(url: &str, elapsed: Duration) -> String {
|
||||
let truncated = truncate_url(url, MAX_URL_LEN);
|
||||
let secs = elapsed.as_secs();
|
||||
format!("# webclaw: still fetching {truncated} ({secs}s)")
|
||||
}
|
||||
|
||||
/// Truncate `url` to at most `max` chars, using a `head...tail` shape when
/// truncation is needed. Char-boundary safe: all counting and slicing is
/// done on `chars`, never on bytes.
///
/// * If `url` already fits in `max` chars it is returned unchanged.
/// * Otherwise 3 chars are reserved for `...`, up to 17 chars are kept from
///   the end of the URL (the query-side tail), and whatever budget remains
///   is taken from the start (the scheme/host/path-side head).
/// * If `max` is too small to hold the full 17-char tail, the tail shrinks
///   to fit; if `max` cannot even hold the `...` marker, the URL is simply
///   cut to its first `max` chars. The result therefore never exceeds `max`
///   chars.
pub(crate) fn truncate_url(url: &str, max: usize) -> String {
    const ELLIPSIS: &str = "...";
    const TAIL_CHARS: usize = 17;

    let total_chars = url.chars().count();
    if total_chars <= max {
        return url.to_string();
    }

    // Not enough room for the marker itself: fall back to a plain hard cut
    // so the `max` bound still holds.
    let Some(avail) = max.checked_sub(ELLIPSIS.len()) else {
        return url.chars().take(max).collect();
    };

    // Prefer a fixed-size tail; the head gets whatever budget is left. The
    // tail is capped at `avail` so tiny `max` values cannot overflow the
    // requested length.
    let tail_chars = TAIL_CHARS.min(avail);
    let head_chars = avail - tail_chars;

    let head: String = url.chars().take(head_chars).collect();
    // `total_chars > max >= tail_chars`, so this subtraction cannot underflow.
    let tail: String = url.chars().skip(total_chars - tail_chars).collect();

    format!("{head}{ELLIPSIS}{tail}")
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
/// Collect emitted lines into a `Vec<String>` via a captured writer.
|
||||
fn capture() -> (Arc<Mutex<Vec<String>>>, impl FnMut(String)) {
|
||||
let sink: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
|
||||
let sink_clone = Arc::clone(&sink);
|
||||
let writer = move |s: String| {
|
||||
sink_clone.lock().unwrap().push(s);
|
||||
};
|
||||
(sink, writer)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_progress_emits_after_interval_elapsed() {
|
||||
let (sink, writer) = capture();
|
||||
// 250ms future, 50ms interval — expect ~4-5 ticks before resolution.
|
||||
let fut = tokio::time::sleep(Duration::from_millis(250));
|
||||
with_progress_writer(
|
||||
"https://example.com/slow",
|
||||
async {
|
||||
fut.await;
|
||||
42_i32
|
||||
},
|
||||
Duration::from_millis(50),
|
||||
writer,
|
||||
)
|
||||
.await;
|
||||
let lines = sink.lock().unwrap();
|
||||
assert!(
|
||||
!lines.is_empty(),
|
||||
"expected >=1 progress line; got {} ({:?})",
|
||||
lines.len(),
|
||||
*lines
|
||||
);
|
||||
for line in lines.iter() {
|
||||
assert!(
|
||||
line.starts_with("# webclaw: still fetching"),
|
||||
"line shape wrong: {line:?}"
|
||||
);
|
||||
assert!(
|
||||
line.contains("https://example.com/slow"),
|
||||
"url missing from line: {line:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_progress_silent_on_fast_future() {
|
||||
let (sink, writer) = capture();
|
||||
// 10ms future, 1s interval — zero ticks expected.
|
||||
let result = with_progress_writer(
|
||||
"https://example.com/fast",
|
||||
async {
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
"done"
|
||||
},
|
||||
Duration::from_secs(1),
|
||||
writer,
|
||||
)
|
||||
.await;
|
||||
assert_eq!(result, "done");
|
||||
let lines = sink.lock().unwrap();
|
||||
assert_eq!(
|
||||
lines.len(),
|
||||
0,
|
||||
"expected 0 progress lines on fast future; got {:?}",
|
||||
*lines
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_progress_line_includes_url() {
|
||||
let (sink, writer) = capture();
|
||||
let target_url = "https://news.ycombinator.com/item?id=12345";
|
||||
with_progress_writer(
|
||||
target_url,
|
||||
async {
|
||||
tokio::time::sleep(Duration::from_millis(150)).await;
|
||||
},
|
||||
Duration::from_millis(50),
|
||||
writer,
|
||||
)
|
||||
.await;
|
||||
let lines = sink.lock().unwrap();
|
||||
assert!(!lines.is_empty(), "expected progress lines");
|
||||
assert!(
|
||||
lines.iter().all(|l| l.contains(target_url)),
|
||||
"every line should contain the URL: {:?}",
|
||||
*lines
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_progress_returns_inner_result_ok() {
|
||||
let (_sink, writer) = capture();
|
||||
let r: Result<i32, String> = with_progress_writer(
|
||||
"https://example.com/",
|
||||
async { Ok::<i32, String>(7) },
|
||||
Duration::from_secs(1),
|
||||
writer,
|
||||
)
|
||||
.await;
|
||||
assert_eq!(r, Ok(7));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_progress_propagates_error() {
|
||||
let (_sink, writer) = capture();
|
||||
let r: Result<i32, String> = with_progress_writer(
|
||||
"https://example.com/",
|
||||
async { Err::<i32, String>("boom".to_string()) },
|
||||
Duration::from_secs(1),
|
||||
writer,
|
||||
)
|
||||
.await;
|
||||
assert_eq!(r, Err("boom".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_truncate_url_short_passthrough() {
|
||||
let url = "https://example.com/";
|
||||
assert_eq!(truncate_url(url, 80), url);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_truncate_url_long_head_dots_tail() {
|
||||
let url = "https://www.example.com/very/long/path/segments/with/lots/of/text/and/then?q=some_long_query_string_value_here&other=more&another=thing";
|
||||
let truncated = truncate_url(url, 80);
|
||||
assert!(
|
||||
truncated.chars().count() <= 80,
|
||||
"truncated length {} > 80: {truncated:?}",
|
||||
truncated.chars().count()
|
||||
);
|
||||
assert!(
|
||||
truncated.contains("..."),
|
||||
"expected '...' marker in truncated url: {truncated:?}"
|
||||
);
|
||||
assert!(
|
||||
truncated.starts_with("https://www.example.com/"),
|
||||
"truncated should start with the URL head: {truncated:?}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_truncate_url_unicode_safe() {
|
||||
// Cyrillic URL longer than 80 chars — must not panic on a
|
||||
// mid-codepoint split.
|
||||
let url =
|
||||
"https://example.com/путь/к/очень/длинной/странице/с/большим/количеством/кириллицы/тут";
|
||||
let truncated = truncate_url(url, 80);
|
||||
assert!(truncated.is_char_boundary(truncated.len()));
|
||||
// Roundtrip through chars to confirm valid UTF-8 throughout.
|
||||
let _: String = truncated.chars().collect();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_format_progress_line_shape() {
|
||||
let line = format_progress_line("https://example.com/", Duration::from_secs(10));
|
||||
assert_eq!(line, "# webclaw: still fetching https://example.com/ (10s)");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_format_progress_line_seconds_only() {
|
||||
// Fractional seconds are truncated, not rounded: 9.5s renders as `(9s)`. (In practice
|
||||
// the first tick fires at +PROGRESS_INTERVAL so this is mostly
|
||||
// a defensive shape assertion.)
|
||||
let line = format_progress_line("https://x/", Duration::from_millis(9_500));
|
||||
assert!(
|
||||
line.ends_with("(9s)"),
|
||||
"line should end with `(9s)`: {line:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,172 +1,56 @@
|
|||
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
|
||||
///
|
||||
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
|
||||
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
|
||||
/// comment tree as structured JSON, which we convert to clean markdown.
|
||||
use serde::Deserialize;
|
||||
use tracing::debug;
|
||||
use webclaw_core::{Content, ExtractionResult, Metadata};
|
||||
//! Reddit URL helpers for the fetch layer.
|
||||
//!
|
||||
//! The JSON API (`*.json`) is blocked. We rewrite all Reddit hosts to
|
||||
//! `old.reddit.com`, which serves stable server-rendered HTML that
|
||||
//! `webclaw-core::reddit` parses directly.
|
||||
|
||||
/// Check if a URL points to a Reddit post/comment page.
|
||||
pub fn is_reddit_url(url: &str) -> bool {
|
||||
let host = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("");
|
||||
matches!(
|
||||
host,
|
||||
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
|
||||
)
|
||||
webclaw_core::reddit::is_reddit_url(url)
|
||||
}
|
||||
|
||||
/// Build the `.json` URL from a Reddit page URL.
|
||||
pub fn json_url(url: &str) -> String {
|
||||
let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
|
||||
format!("{clean}.json")
|
||||
/// Rewrite any Reddit host to old.reddit.com, preserving path and query.
|
||||
pub fn to_old_reddit_url(url: &str) -> String {
|
||||
let Some(scheme_end) = url.find("://") else {
|
||||
return url.to_string();
|
||||
};
|
||||
let after = &url[scheme_end + 3..];
|
||||
let host_end = after.find(['/', '?', '#']).unwrap_or(after.len());
|
||||
let scheme = &url[..scheme_end + 3];
|
||||
let rest = &after[host_end..];
|
||||
format!("{scheme}old.reddit.com{rest}")
|
||||
}
|
||||
|
||||
/// Convert Reddit JSON API response into an ExtractionResult.
|
||||
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
|
||||
let listings: Vec<Listing> =
|
||||
serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
let mut markdown = String::new();
|
||||
let mut title = None;
|
||||
let mut author = None;
|
||||
let mut subreddit = None;
|
||||
|
||||
// First listing = the post itself
|
||||
if let Some(post_listing) = listings.first() {
|
||||
for child in &post_listing.data.children {
|
||||
if child.kind == "t3" {
|
||||
let d = &child.data;
|
||||
title = d.title.clone();
|
||||
author = d.author.clone();
|
||||
subreddit = d.subreddit_name_prefixed.clone();
|
||||
|
||||
if let Some(ref t) = title {
|
||||
markdown.push_str(&format!("# {t}\n\n"));
|
||||
}
|
||||
if let (Some(a), Some(sr)) = (&author, &subreddit) {
|
||||
markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
|
||||
}
|
||||
if let Some(ref body) = d.selftext
|
||||
&& !body.is_empty()
|
||||
{
|
||||
markdown.push_str(body);
|
||||
markdown.push_str("\n\n");
|
||||
}
|
||||
if let Some(ref url_field) = d.url_overridden_by_dest
|
||||
&& !url_field.is_empty()
|
||||
{
|
||||
markdown.push_str(&format!("[Link]({url_field})\n\n"));
|
||||
}
|
||||
markdown.push_str("---\n\n");
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn rewrites_www_to_old() {
|
||||
assert_eq!(
|
||||
to_old_reddit_url("https://www.reddit.com/r/rust/comments/abc/x/"),
|
||||
"https://old.reddit.com/r/rust/comments/abc/x/"
|
||||
);
|
||||
}
|
||||
|
||||
// Second listing = comment tree
|
||||
if let Some(comment_listing) = listings.get(1) {
|
||||
markdown.push_str("## Comments\n\n");
|
||||
for child in &comment_listing.data.children {
|
||||
render_comment(child, 0, &mut markdown);
|
||||
}
|
||||
#[test]
|
||||
fn rewrites_bare_to_old() {
|
||||
assert_eq!(
|
||||
to_old_reddit_url("https://reddit.com/r/rust/"),
|
||||
"https://old.reddit.com/r/rust/"
|
||||
);
|
||||
}
|
||||
|
||||
let word_count = markdown.split_whitespace().count();
|
||||
debug!(word_count, "reddit json extracted");
|
||||
|
||||
Ok(ExtractionResult {
|
||||
metadata: Metadata {
|
||||
title,
|
||||
description: None,
|
||||
author,
|
||||
published_date: None,
|
||||
language: Some("en".into()),
|
||||
url: Some(url.to_string()),
|
||||
site_name: subreddit,
|
||||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
},
|
||||
content: Content {
|
||||
markdown,
|
||||
plain_text: String::new(),
|
||||
links: vec![],
|
||||
images: vec![],
|
||||
code_blocks: vec![],
|
||||
raw_html: None,
|
||||
},
|
||||
domain_data: None,
|
||||
structured_data: vec![],
|
||||
})
|
||||
}
|
||||
|
||||
fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
|
||||
if thing.kind != "t1" {
|
||||
return;
|
||||
#[test]
|
||||
fn preserves_old_reddit_unchanged() {
|
||||
let url = "https://old.reddit.com/r/rust/comments/abc/x/?context=3";
|
||||
assert_eq!(to_old_reddit_url(url), url);
|
||||
}
|
||||
let d = &thing.data;
|
||||
let indent = " ".repeat(depth);
|
||||
let author = d.author.as_deref().unwrap_or("[deleted]");
|
||||
let body = d.body.as_deref().unwrap_or("[removed]");
|
||||
let score = d.score.unwrap_or(0);
|
||||
|
||||
out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
|
||||
for line in body.lines() {
|
||||
out.push_str(&format!("{indent} {line}\n"));
|
||||
}
|
||||
out.push('\n');
|
||||
|
||||
// Recurse into replies
|
||||
if let Some(Replies::Listing(listing)) = &d.replies {
|
||||
for child in &listing.data.children {
|
||||
render_comment(child, depth + 1, out);
|
||||
}
|
||||
#[test]
|
||||
fn preserves_query_and_hash() {
|
||||
assert_eq!(
|
||||
to_old_reddit_url("https://www.reddit.com/r/rust/?sort=top#anchor"),
|
||||
"https://old.reddit.com/r/rust/?sort=top#anchor"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// --- Reddit JSON types (minimal) ---
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Listing {
|
||||
data: ListingData,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ListingData {
|
||||
children: Vec<Thing>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Thing {
|
||||
kind: String,
|
||||
data: ThingData,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ThingData {
|
||||
// Post fields (t3)
|
||||
title: Option<String>,
|
||||
selftext: Option<String>,
|
||||
subreddit_name_prefixed: Option<String>,
|
||||
url_overridden_by_dest: Option<String>,
|
||||
// Comment fields (t1)
|
||||
author: Option<String>,
|
||||
body: Option<String>,
|
||||
score: Option<i64>,
|
||||
replies: Option<Replies>,
|
||||
}
|
||||
|
||||
/// Reddit replies can be either a nested Listing or an empty string.
|
||||
#[derive(Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum Replies {
|
||||
Listing(Listing),
|
||||
#[allow(dead_code)]
|
||||
Empty(String),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,15 +10,24 @@ use std::{borrow::Cow, io, time::Duration};
|
|||
use wreq::http2::{
|
||||
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
|
||||
};
|
||||
use wreq::tls::{
|
||||
AlpnProtocol, AlpsProtocol, CertificateCompressionAlgorithm, ExtensionType, TlsOptions,
|
||||
TlsVersion,
|
||||
};
|
||||
use wreq::{Client, Emulation};
|
||||
use wreq::tls::compress::CertificateCompressor;
|
||||
use wreq::tls::{AlpnProtocol, AlpsProtocol, ExtensionType, TlsOptions, TlsVersion};
|
||||
use wreq::{Client, Emulation, Group, IntoEmulation};
|
||||
use wreq_util::emulate::compress::{BrotliCompressor, ZlibCompressor};
|
||||
|
||||
use crate::browser::BrowserVariant;
|
||||
use crate::error::FetchError;
|
||||
|
||||
// Certificate-compression advertisement per profile. wreq 6.0.0-rc.29 replaced
|
||||
// the `CertificateCompressionAlgorithm` enum argument with `&dyn
|
||||
// CertificateCompressor` trait objects; wreq-util ships the concrete zlib/brotli
|
||||
// implementations. The advertised set (and order) is a TLS fingerprint signal,
|
||||
// so these mirror the previous enum lists exactly.
|
||||
static CHROME_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] = &[&BrotliCompressor];
|
||||
static FIREFOX_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] =
|
||||
&[&ZlibCompressor, &BrotliCompressor];
|
||||
static SAFARI_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] = &[&ZlibCompressor];
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
struct PublicDnsResolver;
|
||||
|
||||
|
|
@ -119,14 +128,14 @@ fn chrome_extensions() -> Vec<ExtensionType> {
|
|||
ExtensionType::PSK_KEY_EXCHANGE_MODES, // 45
|
||||
ExtensionType::EC_POINT_FORMATS, // 11
|
||||
ExtensionType::CERT_COMPRESSION, // 27
|
||||
ExtensionType::APPLICATION_SETTINGS_NEW, // 17613 (new codepoint, matches alps_use_new_codepoint)
|
||||
ExtensionType::SUPPORTED_VERSIONS, // 43
|
||||
ExtensionType::SIGNATURE_ALGORITHMS, // 13
|
||||
ExtensionType::SERVER_NAME, // 0
|
||||
ExtensionType::APPLICATION_SETTINGS, // 17613 (new codepoint, matches alps_use_new_codepoint)
|
||||
ExtensionType::SUPPORTED_VERSIONS, // 43
|
||||
ExtensionType::SIGNATURE_ALGORITHMS, // 13
|
||||
ExtensionType::SERVER_NAME, // 0
|
||||
ExtensionType::APPLICATION_LAYER_PROTOCOL_NEGOTIATION, // 16
|
||||
ExtensionType::ENCRYPTED_CLIENT_HELLO, // 65037
|
||||
ExtensionType::RENEGOTIATE, // 65281
|
||||
ExtensionType::EXTENDED_MASTER_SECRET, // 23
|
||||
ExtensionType::ENCRYPTED_CLIENT_HELLO, // 65037
|
||||
ExtensionType::RENEGOTIATE, // 65281
|
||||
ExtensionType::EXTENDED_MASTER_SECRET, // 23
|
||||
]
|
||||
}
|
||||
|
||||
|
|
@ -287,7 +296,7 @@ fn chrome_tls() -> TlsOptions {
|
|||
.alps_protocols([AlpsProtocol::HTTP3, AlpsProtocol::HTTP2])
|
||||
.alps_use_new_codepoint(true)
|
||||
.aes_hw_override(true)
|
||||
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI])
|
||||
.certificate_compressors(CHROME_CERT_COMPRESSORS)
|
||||
.build()
|
||||
}
|
||||
|
||||
|
|
@ -304,10 +313,7 @@ fn firefox_tls() -> TlsOptions {
|
|||
.pre_shared_key(true)
|
||||
.enable_ocsp_stapling(true)
|
||||
.enable_signed_cert_timestamps(true)
|
||||
.certificate_compression_algorithms(&[
|
||||
CertificateCompressionAlgorithm::ZLIB,
|
||||
CertificateCompressionAlgorithm::BROTLI,
|
||||
])
|
||||
.certificate_compressors(FIREFOX_CERT_COMPRESSORS)
|
||||
.build()
|
||||
}
|
||||
|
||||
|
|
@ -324,7 +330,7 @@ fn safari_tls() -> TlsOptions {
|
|||
.pre_shared_key(false)
|
||||
.enable_ocsp_stapling(true)
|
||||
.enable_signed_cert_timestamps(true)
|
||||
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::ZLIB])
|
||||
.certificate_compressors(SAFARI_CERT_COMPRESSORS)
|
||||
.build()
|
||||
}
|
||||
|
||||
|
|
@ -345,21 +351,23 @@ fn safari_tls() -> TlsOptions {
|
|||
/// `priority: u=0, i`, zstd), replace with the real iOS 26 set.
|
||||
/// 4. `accept-language` preserved from config.extra_headers for locale.
|
||||
fn safari_ios_emulation() -> wreq::Emulation {
|
||||
use wreq::EmulationFactory;
|
||||
let mut em = wreq_util::Emulation::SafariIos26.emulation();
|
||||
// wreq 6.0.0-rc.29 exposes the `Emulation` fields directly (no `*_mut()`
|
||||
// accessors) and wreq-util 3.0.0-rc.12 renamed the enum to `Profile` with
|
||||
// `IntoEmulation::into_emulation` replacing `EmulationFactory::emulation`.
|
||||
let mut em = wreq_util::Profile::SafariIos26.into_emulation();
|
||||
|
||||
if let Some(tls) = em.tls_options_mut().as_mut() {
|
||||
if let Some(tls) = em.tls_options.as_mut() {
|
||||
tls.extension_permutation = Some(Cow::Owned(safari_ios_extensions()));
|
||||
}
|
||||
|
||||
// Only override the priority flag — keep wreq-util's SETTINGS, WINDOW_UPDATE,
|
||||
// and pseudo-order intact. Replacing the whole Http2Options resets SETTINGS
|
||||
// to defaults, which sends only INITIAL_WINDOW_SIZE and fails DataDome.
|
||||
if let Some(h2) = em.http2_options_mut().as_mut() {
|
||||
if let Some(h2) = em.http2_options.as_mut() {
|
||||
h2.headers_stream_dependency = Some(StreamDependency::new(StreamId::zero(), 255, true));
|
||||
}
|
||||
|
||||
let hm = em.headers_mut();
|
||||
let hm = &mut em.headers;
|
||||
hm.clear();
|
||||
for (k, v) in SAFARI_IOS_HEADERS {
|
||||
if let (Ok(n), Ok(val)) = (
|
||||
|
|
@ -508,12 +516,12 @@ pub fn build_client(
|
|||
.tls_options(tls)
|
||||
.http2_options(h2)
|
||||
.headers(build_headers(headers))
|
||||
.build()
|
||||
.build(Group::default())
|
||||
}
|
||||
};
|
||||
|
||||
// Append extra headers after profile defaults.
|
||||
let hm = emulation.headers_mut();
|
||||
let hm = &mut emulation.headers;
|
||||
for (k, v) in extra_headers {
|
||||
if let (Ok(n), Ok(val)) = (
|
||||
http::header::HeaderName::from_bytes(k.as_bytes()),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue