mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
Reddit blocked unauthenticated `.json` access, so the previous extractor returned block pages or timed out on every thread. Switch to parsing old.reddit.com's server-rendered HTML, which needs no API key or JS.

Fetch layer:
- Rewrite every Reddit host to old.reddit.com before fetching; drop all `.json` URL handling and the JSON response parser.

Extraction (webclaw-core::reddit):
- New HTML parser producing a typed post + nested comment tree.
- Comments nest structurally (.comment > .child > .sitetable > .comment); old.reddit omits a usable depth attribute, so the tree is walked recursively. Bodies live in .entry > form > .usertext-body > .md.
- Post metadata: title, author, subreddit, score, comment count (data-comments-count), self-vs-link (self class / self.* domain), flair, self-text body.
- Comment scores read the .score.unvoted title (the displayed value, not the ±1 vote-state siblings); hidden scores are None, not 0.
- Deleted comments are kept in place so their replies aren't orphaned; "load more comments" stubs are skipped.

Markdown output:
- Reply nesting via blockquote depth (avoids 4-space indentation turning text and code fences into broken indented-code blocks).
- Links keep their target as [text](url); root-relative reddit links resolve against old.reddit.com. Nested lists indent correctly.
- A recognised but unparseable /comments/ page returns no content rather than falling through to generic extraction of Reddit chrome.

Tests: regression suite runs against real old.reddit.com fixtures (testdata/reddit/), the ground truth that surfaced the parsing and markdown bugs synthetic HTML had hidden. Fixtures are excluded from the published crate.
35 lines
1.1 KiB
TOML
35 lines
1.1 KiB
TOML
[package]
name = "webclaw-core"
description = "Pure HTML content extraction engine for LLMs"
# version, edition, and license are inherited from the workspace root
# manifest rather than being set per crate.
version.workspace = true
edition.workspace = true
license.workspace = true
# Reddit regression fixtures are real old.reddit.com pages read at test time;
# they're large and only needed to run the test suite from the repo, so keep
# them out of the published crate.
exclude = ["testdata/reddit/*.html"]

[features]
# `quickjs` is enabled unless a consumer opts out with
# `default-features = false`.
default = ["quickjs"]
# Turns on the optional `rquickjs` dependency. The plain "rquickjs" form
# (rather than "dep:rquickjs") keeps the implicit `rquickjs` feature
# available to anyone already enabling it by that name.
quickjs = ["rquickjs"]

[dependencies]
# Version requirements inherited from the workspace root manifest.
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
# Version requirements declared by this crate.
scraper = "0.22"
ego-tree = "0.10"
url = { version = "2", features = ["serde"] }
regex = "1"
once_cell = "1"
similar = "2"

# rquickjs links a C library and cannot build for wasm32. Gating it per
# target keeps the `quickjs` feature usable on native while leaving the
# crate WASM-safe even with default features enabled.
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
# Optional: only pulled in when the `quickjs` feature is enabled.
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }

# Used only when building tests, examples, and benchmarks; version comes
# from the workspace root manifest.
[dev-dependencies]
tokio = { workspace = true }