mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-16 23:45:13 +02:00
feat(core): schema-aware JSON-LD parser + --prefer-structured + --articles-from-jsonld
JSON-LD is consistently the cleanest source on major outlets (Reuters,
BBC, Le Monde, N1, Pitchfork). Webclaw already emitted a raw Structured
Data block at the bottom of -f llm output; this iteration teaches it to
parse the JSON-LD by schema and surface it usefully.
New schema-aware parser at crates/webclaw-core/src/jsonld.rs classifies
items by @type into: ItemList, LiveBlogPosting, NewsArticle, Review,
WebPageOrChrome, Unknown. CollectionPage with mainEntity ItemList is
auto-lifted (Reuters CollectionPage shape).
Two new CLI flags:
--prefer-structured: surfaces the schema-aware block at the TOP of the
output, before prose. For -f llm emits a Markdown summary block; for
-f json emits a {structured, extracted} envelope. Bypasses the default
DROP list for WebPage/chrome types when explicitly requested.
--articles-from-jsonld: when the page contains ItemList or
LiveBlogPosting, output ONLY a JSON array of articles
({position, title, url, published}). When no such schema is present,
emit a stderr hint and fall through to default extraction (no error).
Default behavior (neither flag set) is byte-identical to iter-3 on all
default-flag probes (regression sentinel passed): Cyrillic p14 is still
7735 B, M1 caps p18/p19/p20 are deterministic, M2 hub p40/p41 are
byte-identical, and M3 registry p44/p45/p46 still fast-fail with exit 67.
14 new tests in webclaw-core covering schema-variant parsing, parse
error handling, fall-through behavior, flag combinations, and the
default-byte-identical sentinel. Workspace tests 657 -> 671.
This commit is contained in:
parent
e28b22adf7
commit
66974366d7
5 changed files with 911 additions and 36 deletions
|
|
@ -11,8 +11,9 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
|||
use clap::{Parser, Subcommand, ValueEnum};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
use webclaw_core::{
|
||||
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
|
||||
to_llm_text,
|
||||
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, JsonLdSchema, LlmTextOptions,
|
||||
Metadata, classify_jsonld_all, extract_with_options, primary_schema, to_llm_text,
|
||||
to_llm_text_with_options,
|
||||
};
|
||||
use webclaw_fetch::{
|
||||
BatchExtractResult, BrowserProfile, CrawlConfig, CrawlResult, Crawler, FetchClient,
|
||||
|
|
@ -188,6 +189,24 @@ struct Cli {
|
|||
#[arg(long)]
|
||||
prefer_articles: bool,
|
||||
|
||||
/// Surface the schema-aware JSON-LD block (when present) at the TOP of
|
||||
/// the output, before prose. Bypasses the default-drop list for
|
||||
/// WebPage/SiteNavigationElement when explicitly requested. Affects
|
||||
/// `-f llm` / `-f text` (adds a Markdown block) and `-f json` (adds a
|
||||
/// `structured` field to the output object).
|
||||
#[arg(long)]
|
||||
prefer_structured: bool,
|
||||
|
||||
/// When the page contains an ItemList or LiveBlogPosting in its JSON-LD,
|
||||
/// emit ONLY the article list as a JSON array of
|
||||
/// {position, title, url, published}. The `-f` flag is OVERRIDDEN in this
|
||||
/// mode: stdout is always a JSON array. When the page has no
|
||||
/// ItemList/LiveBlogPosting, emits a one-line stderr hint and falls
|
||||
/// through to default extraction (does NOT error). Combined with
|
||||
/// --prefer-structured, this flag wins.
|
||||
#[arg(long)]
|
||||
articles_from_jsonld: bool,
|
||||
|
||||
/// Browser to impersonate
|
||||
#[arg(short, long, default_value = "chrome")]
|
||||
browser: Browser,
|
||||
|
|
@ -769,7 +788,28 @@ fn format_output_with_mode(
|
|||
mode: &OutputMode,
|
||||
max_output_bytes: u64,
|
||||
) -> String {
|
||||
let body = render_body(result, format, show_metadata, mode);
|
||||
format_output_with_mode_and_structured(
|
||||
result,
|
||||
format,
|
||||
show_metadata,
|
||||
mode,
|
||||
max_output_bytes,
|
||||
false,
|
||||
)
|
||||
}
|
||||
|
||||
/// M4 extension: same as `format_output_with_mode` but with an extra
|
||||
/// `prefer_structured` flag. When false this is byte-identical to the
|
||||
/// legacy formatter — sentinel-critical for p01-p15.
|
||||
fn format_output_with_mode_and_structured(
|
||||
result: &ExtractionResult,
|
||||
format: &OutputFormat,
|
||||
show_metadata: bool,
|
||||
mode: &OutputMode,
|
||||
max_output_bytes: u64,
|
||||
prefer_structured: bool,
|
||||
) -> String {
|
||||
let body = render_body(result, format, show_metadata, mode, prefer_structured);
|
||||
apply_byte_cap(&body, format, max_output_bytes)
|
||||
}
|
||||
|
||||
|
|
@ -778,6 +818,7 @@ fn render_body(
|
|||
format: &OutputFormat,
|
||||
show_metadata: bool,
|
||||
mode: &OutputMode,
|
||||
prefer_structured: bool,
|
||||
) -> String {
|
||||
match mode {
|
||||
OutputMode::Summary => match format {
|
||||
|
|
@ -805,10 +846,26 @@ fn render_body(
|
|||
out
|
||||
}
|
||||
OutputFormat::Json => {
|
||||
serde_json::to_string_pretty(result).expect("serialization failed")
|
||||
if prefer_structured {
|
||||
let schemas = classify_jsonld_all(&result.structured_data);
|
||||
let structured = primary_schema(&schemas);
|
||||
let envelope = serde_json::json!({
|
||||
"structured": structured,
|
||||
"extracted": result,
|
||||
});
|
||||
serde_json::to_string_pretty(&envelope).expect("serialization failed")
|
||||
} else {
|
||||
serde_json::to_string_pretty(result).expect("serialization failed")
|
||||
}
|
||||
}
|
||||
OutputFormat::Text => result.content.plain_text.clone(),
|
||||
OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
|
||||
OutputFormat::Llm => to_llm_text_with_options(
|
||||
result,
|
||||
result.metadata.url.as_deref(),
|
||||
&LlmTextOptions {
|
||||
prefer_structured,
|
||||
},
|
||||
),
|
||||
OutputFormat::Html => raw_html_or_markdown(result).to_string(),
|
||||
},
|
||||
}
|
||||
|
|
@ -1129,17 +1186,6 @@ fn format_frontmatter(meta: &Metadata) -> String {
|
|||
lines.join("\n")
|
||||
}
|
||||
|
||||
fn print_output_with_mode(
|
||||
result: &ExtractionResult,
|
||||
format: &OutputFormat,
|
||||
show_metadata: bool,
|
||||
mode: &OutputMode,
|
||||
max_output_bytes: u64,
|
||||
) {
|
||||
let out = format_output_with_mode(result, format, show_metadata, mode, max_output_bytes);
|
||||
println!("{out}");
|
||||
}
|
||||
|
||||
/// Apply iter-2 M2's hub-page detector. When a hub is detected:
|
||||
/// - emit a single stderr hint line (always — informational only),
|
||||
/// - if `prefer_articles` is on, override the OutputMode to `Summary`
|
||||
|
|
@ -1152,6 +1198,51 @@ fn print_output_with_mode(
|
|||
/// Designed to be additive — `prefer_articles=false` callers keep their
|
||||
/// existing stdout bytes byte-identical; the hint goes to stderr so it
|
||||
/// doesn't affect the sentinel byte-counting on p01-p15.
|
||||
/// M4: If the page has an ItemList or LiveBlogPosting JSON-LD record, return
|
||||
/// a JSON array of articles (one entry per item). Returns None when the page
|
||||
/// has no such schema, in which case the caller should fall through to
|
||||
/// default extraction and emit a stderr hint.
|
||||
///
|
||||
/// Output shape per element: `{position, title, url, published}`. Null fields
|
||||
/// for the values that don't appear on this page.
|
||||
fn try_articles_from_jsonld(result: &ExtractionResult) -> Option<String> {
|
||||
let schemas = classify_jsonld_all(&result.structured_data);
|
||||
let primary = primary_schema(&schemas)?;
|
||||
match primary {
|
||||
JsonLdSchema::ItemList { items, .. } => {
|
||||
let arr: Vec<serde_json::Value> = items
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, it)| {
|
||||
serde_json::json!({
|
||||
"position": it.position.unwrap_or(idx as u64 + 1),
|
||||
"title": it.title,
|
||||
"url": it.url,
|
||||
"published": it.published,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
Some(serde_json::to_string_pretty(&arr).unwrap_or_else(|_| "[]".to_string()))
|
||||
}
|
||||
JsonLdSchema::LiveBlogPosting { updates, .. } => {
|
||||
let arr: Vec<serde_json::Value> = updates
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, u)| {
|
||||
serde_json::json!({
|
||||
"position": idx as u64 + 1,
|
||||
"title": u.headline,
|
||||
"url": u.url,
|
||||
"published": u.published,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
Some(serde_json::to_string_pretty(&arr).unwrap_or_else(|_| "[]".to_string()))
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn apply_hub_detection(
|
||||
result: &ExtractionResult,
|
||||
requested_mode: &OutputMode,
|
||||
|
|
@ -2803,6 +2894,21 @@ async fn main() {
|
|||
// Single-page extraction (handles both HTML and PDF via content-type detection)
|
||||
match fetch_and_extract(&cli).await {
|
||||
Ok(FetchOutput::Local(result)) => {
|
||||
// M4: --articles-from-jsonld short-circuits with a JSON array of
|
||||
// articles when the page has an ItemList or LiveBlogPosting.
|
||||
// When neither is present, emit a stderr hint and fall through to
|
||||
// the default extraction path (the --mode flag still applies).
|
||||
if cli.articles_from_jsonld {
|
||||
if let Some(json_array) = try_articles_from_jsonld(&result) {
|
||||
println!("{json_array}");
|
||||
return;
|
||||
}
|
||||
eprintln!(
|
||||
"# hint: --articles-from-jsonld found no ItemList or LiveBlogPosting on this URL; falling through to default extraction"
|
||||
);
|
||||
// Fall through.
|
||||
}
|
||||
|
||||
let effective_mode = apply_hub_detection(&result, &cli.mode, cli.prefer_articles);
|
||||
if let Some(ref dir) = cli.output_dir {
|
||||
let url = cli
|
||||
|
|
@ -2812,25 +2918,28 @@ async fn main() {
|
|||
.unwrap_or_default();
|
||||
let custom_name = entries.first().and_then(|(_, name)| name.clone());
|
||||
let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format));
|
||||
let content = format_output_with_mode(
|
||||
let content = format_output_with_mode_and_structured(
|
||||
&result,
|
||||
&cli.format,
|
||||
cli.metadata,
|
||||
&effective_mode,
|
||||
cli.max_output_bytes,
|
||||
cli.prefer_structured,
|
||||
);
|
||||
if let Err(e) = write_to_file(dir, &filename, &content) {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
} else {
|
||||
print_output_with_mode(
|
||||
let content = format_output_with_mode_and_structured(
|
||||
&result,
|
||||
&cli.format,
|
||||
cli.metadata,
|
||||
&effective_mode,
|
||||
cli.max_output_bytes,
|
||||
cli.prefer_structured,
|
||||
);
|
||||
println!("{content}");
|
||||
}
|
||||
}
|
||||
Ok(FetchOutput::Cloud(resp)) => {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue