2026-03-23 18:31:11 +01:00
|
|
|
[package]
name = "webclaw-core"
# version, edition and license are inherited from the workspace root manifest.
version.workspace = true
edition.workspace = true
license.workspace = true
description = "Pure HTML content extraction engine for LLMs"
|
feat(reddit): parse old.reddit.com HTML instead of the dead .json API
Reddit blocked unauthenticated `.json` access, so the previous extractor
returned block pages or timed out on every thread. Switch to parsing
old.reddit.com's server-rendered HTML, which needs no API key or JS.
Fetch layer:
- Rewrite every Reddit host to old.reddit.com before fetching; drop all
`.json` URL handling and the JSON response parser.
Extraction (webclaw-core::reddit):
- New HTML parser producing a typed post + nested comment tree.
- Comments nest structurally (.comment > .child > .sitetable > .comment);
old.reddit omits a usable depth attribute, so the tree is walked
recursively. Bodies live in .entry > form > .usertext-body > .md.
- Post metadata: title, author, subreddit, score, comment count
(data-comments-count), self-vs-link (self class / self.* domain),
flair, self-text body.
- Comment scores read the .score.unvoted title (the displayed value, not
the ±1 vote-state siblings); hidden scores are None, not 0.
- Deleted comments are kept in place so their replies aren't orphaned;
"load more comments" stubs are skipped.
Markdown output:
- Reply nesting via blockquote depth (avoids 4-space indentation turning
text and code fences into broken indented-code blocks).
- Links keep their target as [text](url); root-relative reddit links
resolve against old.reddit.com. Nested lists indent correctly.
- A recognised but unparseable /comments/ page returns no content rather
than falling through to generic extraction of Reddit chrome.
Tests: regression suite runs against real old.reddit.com fixtures
(testdata/reddit/), the ground truth that surfaced the parsing and
markdown bugs synthetic HTML had hidden. Fixtures are excluded from the
published crate.
2026-06-04 16:16:08 +02:00
|
|
|
# Reddit regression fixtures are real old.reddit.com pages read at test time;
# they're large and only needed to run the test suite from the repo, so keep
# them out of the published crate.
exclude = ["testdata/reddit/*.html"]
|
2026-03-23 18:31:11 +01:00
|
|
|
|
2026-03-26 10:28:16 +01:00
|
|
|
[features]
# `quickjs` is enabled by default; build with `--no-default-features` to
# leave it out.
default = ["quickjs"]
# Turns on the optional `rquickjs` dependency.
quickjs = ["rquickjs"]
|
|
|
|
|
|
2026-03-23 18:31:11 +01:00
|
|
|
[dependencies]
# Inherited from the workspace root manifest.
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
# Versioned locally in this crate.
scraper = "0.22"
ego-tree = "0.10"
url = { version = "2", features = ["serde"] }
regex = "1"
once_cell = "1"
similar = "2"
|
2026-05-19 17:03:52 +02:00
|
|
|
|
|
|
|
|
# rquickjs links a C library and cannot build for wasm32. Gating it per
# target keeps the `quickjs` feature usable on native while leaving the
# crate WASM-safe even with default features enabled.
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }
|
2026-03-23 18:31:11 +01:00
|
|
|
|
|
|
|
|
[dev-dependencies]
# Only built for tests, examples and benchmarks; inherited from the workspace
# root manifest.
tokio = { workspace = true }
|