mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-09 22:35:12 +02:00
feat(core): schema-aware JSON-LD parser + --prefer-structured + --articles-from-jsonld
JSON-LD is consistently the cleanest source on major outlets (Reuters,
BBC, Le Monde, N1, Pitchfork). Webclaw already emitted a raw Structured
Data block at the bottom of -f llm output; this iter teaches it to
parse the JSON-LD by schema and surface it usefully.
New schema-aware parser at crates/webclaw-core/src/jsonld.rs classifies
items by @type into: ItemList, LiveBlogPosting, NewsArticle, Review,
WebPageOrChrome, Unknown. CollectionPage with mainEntity ItemList is
auto-lifted (Reuters CollectionPage shape).
Two new CLI flags:
--prefer-structured: surfaces the schema-aware block at the TOP of the
output, before prose. For -f llm emits a Markdown summary block; for
-f json emits a {structured, extracted} envelope. Bypasses the default
DROP list for WebPage/chrome types when explicitly requested.
--articles-from-jsonld: when the page contains ItemList or
LiveBlogPosting, output ONLY a JSON array of articles
({position, title, url, published}). When no such schema is present,
emit a stderr hint and fall through to default extraction (no error).
Default behavior (neither flag set) byte-identical to iter-3 on all
default-flag probes (regression sentinel passed): Cyrillic p14 still
7735 B, M1 caps p18/p19/p20 deterministic, M2 hub p40/p41 byte-identical,
M3 registry p44/p45/p46 still fast-fail with exit 67.
14 new tests in webclaw-core covering schema-variant parsing, parse
error handling, fall-through behavior, flag combinations, and the
default-byte-identical sentinel. Workspace tests 657 -> 671.
This commit is contained in:
parent
e28b22adf7
commit
66974366d7
5 changed files with 911 additions and 36 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -27,3 +27,4 @@ _build-release.bat
|
|||
_build-release.log
|
||||
improve-loop-CONTINUE.md
|
||||
iter-*-smoke/
|
||||
_local/
|
||||
|
|
|
|||
|
|
@ -11,8 +11,9 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
|||
use clap::{Parser, Subcommand, ValueEnum};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
use webclaw_core::{
|
||||
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
|
||||
to_llm_text,
|
||||
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, JsonLdSchema, LlmTextOptions,
|
||||
Metadata, classify_jsonld_all, extract_with_options, primary_schema, to_llm_text,
|
||||
to_llm_text_with_options,
|
||||
};
|
||||
use webclaw_fetch::{
|
||||
BatchExtractResult, BrowserProfile, CrawlConfig, CrawlResult, Crawler, FetchClient,
|
||||
|
|
@ -188,6 +189,24 @@ struct Cli {
|
|||
#[arg(long)]
|
||||
prefer_articles: bool,
|
||||
|
||||
/// Surface the schema-aware JSON-LD block (when present) at the TOP of
|
||||
/// the output, before prose. Bypasses the default-drop list for
|
||||
/// WebPage/SiteNavigationElement when explicitly requested. Affects
|
||||
/// `-f llm` / `-f text` (adds a Markdown block) and `-f json` (wraps the
|
||||
/// output in a `{structured, extracted}` envelope).
|
||||
#[arg(long)]
|
||||
prefer_structured: bool,
|
||||
|
||||
/// When the page contains an ItemList or LiveBlogPosting in its JSON-LD,
|
||||
/// emit ONLY the article list as a JSON array of
|
||||
/// {position, title, url, published}. The `-f` flag is OVERRIDDEN in this
|
||||
/// mode: stdout is always a JSON array. When the page has no
|
||||
/// ItemList/LiveBlogPosting, emits a one-line stderr hint and falls
|
||||
/// through to default extraction (does NOT error). Combined with
|
||||
/// --prefer-structured, this flag wins.
|
||||
#[arg(long)]
|
||||
articles_from_jsonld: bool,
|
||||
|
||||
/// Browser to impersonate
|
||||
#[arg(short, long, default_value = "chrome")]
|
||||
browser: Browser,
|
||||
|
|
@ -769,7 +788,28 @@ fn format_output_with_mode(
|
|||
mode: &OutputMode,
|
||||
max_output_bytes: u64,
|
||||
) -> String {
|
||||
let body = render_body(result, format, show_metadata, mode);
|
||||
format_output_with_mode_and_structured(
|
||||
result,
|
||||
format,
|
||||
show_metadata,
|
||||
mode,
|
||||
max_output_bytes,
|
||||
false,
|
||||
)
|
||||
}
|
||||
|
||||
/// M4 extension: same as `format_output_with_mode` but with an extra
|
||||
/// `prefer_structured` flag. When false this is byte-identical to the
|
||||
/// legacy formatter — sentinel-critical for p01-p15.
|
||||
fn format_output_with_mode_and_structured(
|
||||
result: &ExtractionResult,
|
||||
format: &OutputFormat,
|
||||
show_metadata: bool,
|
||||
mode: &OutputMode,
|
||||
max_output_bytes: u64,
|
||||
prefer_structured: bool,
|
||||
) -> String {
|
||||
let body = render_body(result, format, show_metadata, mode, prefer_structured);
|
||||
apply_byte_cap(&body, format, max_output_bytes)
|
||||
}
|
||||
|
||||
|
|
@ -778,6 +818,7 @@ fn render_body(
|
|||
format: &OutputFormat,
|
||||
show_metadata: bool,
|
||||
mode: &OutputMode,
|
||||
prefer_structured: bool,
|
||||
) -> String {
|
||||
match mode {
|
||||
OutputMode::Summary => match format {
|
||||
|
|
@ -805,10 +846,26 @@ fn render_body(
|
|||
out
|
||||
}
|
||||
OutputFormat::Json => {
|
||||
serde_json::to_string_pretty(result).expect("serialization failed")
|
||||
if prefer_structured {
|
||||
let schemas = classify_jsonld_all(&result.structured_data);
|
||||
let structured = primary_schema(&schemas);
|
||||
let envelope = serde_json::json!({
|
||||
"structured": structured,
|
||||
"extracted": result,
|
||||
});
|
||||
serde_json::to_string_pretty(&envelope).expect("serialization failed")
|
||||
} else {
|
||||
serde_json::to_string_pretty(result).expect("serialization failed")
|
||||
}
|
||||
}
|
||||
OutputFormat::Text => result.content.plain_text.clone(),
|
||||
OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
|
||||
OutputFormat::Llm => to_llm_text_with_options(
|
||||
result,
|
||||
result.metadata.url.as_deref(),
|
||||
&LlmTextOptions {
|
||||
prefer_structured,
|
||||
},
|
||||
),
|
||||
OutputFormat::Html => raw_html_or_markdown(result).to_string(),
|
||||
},
|
||||
}
|
||||
|
|
@ -1129,17 +1186,6 @@ fn format_frontmatter(meta: &Metadata) -> String {
|
|||
lines.join("\n")
|
||||
}
|
||||
|
||||
fn print_output_with_mode(
|
||||
result: &ExtractionResult,
|
||||
format: &OutputFormat,
|
||||
show_metadata: bool,
|
||||
mode: &OutputMode,
|
||||
max_output_bytes: u64,
|
||||
) {
|
||||
let out = format_output_with_mode(result, format, show_metadata, mode, max_output_bytes);
|
||||
println!("{out}");
|
||||
}
|
||||
|
||||
/// Apply iter-2 M2's hub-page detector. When a hub is detected:
|
||||
/// - emit a single stderr hint line (always — informational only),
|
||||
/// - if `prefer_articles` is on, override the OutputMode to `Summary`
|
||||
|
|
@ -1152,6 +1198,51 @@ fn print_output_with_mode(
|
|||
/// Designed to be additive — `prefer_articles=false` callers keep their
|
||||
/// existing stdout bytes byte-identical; the hint goes to stderr so it
|
||||
/// doesn't affect the sentinel byte-counting on p01-p15.
|
||||
/// M4: If the page has an ItemList or LiveBlogPosting JSON-LD record, return
|
||||
/// a JSON array of articles (one entry per item). Returns None when the page
|
||||
/// has no such schema, in which case the caller should fall through to
|
||||
/// default extraction and emit a stderr hint.
|
||||
///
|
||||
/// Output shape per element: `{position, title, url, published}`. Null fields
|
||||
/// for the values that don't appear on this page.
|
||||
fn try_articles_from_jsonld(result: &ExtractionResult) -> Option<String> {
|
||||
let schemas = classify_jsonld_all(&result.structured_data);
|
||||
let primary = primary_schema(&schemas)?;
|
||||
match primary {
|
||||
JsonLdSchema::ItemList { items, .. } => {
|
||||
let arr: Vec<serde_json::Value> = items
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, it)| {
|
||||
serde_json::json!({
|
||||
"position": it.position.unwrap_or(idx as u64 + 1),
|
||||
"title": it.title,
|
||||
"url": it.url,
|
||||
"published": it.published,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
Some(serde_json::to_string_pretty(&arr).unwrap_or_else(|_| "[]".to_string()))
|
||||
}
|
||||
JsonLdSchema::LiveBlogPosting { updates, .. } => {
|
||||
let arr: Vec<serde_json::Value> = updates
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, u)| {
|
||||
serde_json::json!({
|
||||
"position": idx as u64 + 1,
|
||||
"title": u.headline,
|
||||
"url": u.url,
|
||||
"published": u.published,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
Some(serde_json::to_string_pretty(&arr).unwrap_or_else(|_| "[]".to_string()))
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn apply_hub_detection(
|
||||
result: &ExtractionResult,
|
||||
requested_mode: &OutputMode,
|
||||
|
|
@ -2803,6 +2894,21 @@ async fn main() {
|
|||
// Single-page extraction (handles both HTML and PDF via content-type detection)
|
||||
match fetch_and_extract(&cli).await {
|
||||
Ok(FetchOutput::Local(result)) => {
|
||||
// M4: --articles-from-jsonld short-circuits with a JSON array of
|
||||
// articles when the page has an ItemList or LiveBlogPosting.
|
||||
// When neither is present, emit a stderr hint and fall through to
|
||||
// the default extraction path (the --mode flag still applies).
|
||||
if cli.articles_from_jsonld {
|
||||
if let Some(json_array) = try_articles_from_jsonld(&result) {
|
||||
println!("{json_array}");
|
||||
return;
|
||||
}
|
||||
eprintln!(
|
||||
"# hint: --articles-from-jsonld found no ItemList or LiveBlogPosting on this URL; falling through to default extraction"
|
||||
);
|
||||
// Fall through.
|
||||
}
|
||||
|
||||
let effective_mode = apply_hub_detection(&result, &cli.mode, cli.prefer_articles);
|
||||
if let Some(ref dir) = cli.output_dir {
|
||||
let url = cli
|
||||
|
|
@ -2812,25 +2918,28 @@ async fn main() {
|
|||
.unwrap_or_default();
|
||||
let custom_name = entries.first().and_then(|(_, name)| name.clone());
|
||||
let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format));
|
||||
let content = format_output_with_mode(
|
||||
let content = format_output_with_mode_and_structured(
|
||||
&result,
|
||||
&cli.format,
|
||||
cli.metadata,
|
||||
&effective_mode,
|
||||
cli.max_output_bytes,
|
||||
cli.prefer_structured,
|
||||
);
|
||||
if let Err(e) = write_to_file(dir, &filename, &content) {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
} else {
|
||||
print_output_with_mode(
|
||||
let content = format_output_with_mode_and_structured(
|
||||
&result,
|
||||
&cli.format,
|
||||
cli.metadata,
|
||||
&effective_mode,
|
||||
cli.max_output_bytes,
|
||||
cli.prefer_structured,
|
||||
);
|
||||
println!("{content}");
|
||||
}
|
||||
}
|
||||
Ok(FetchOutput::Cloud(resp)) => {
|
||||
|
|
|
|||
521
crates/webclaw-core/src/jsonld.rs
Normal file
521
crates/webclaw-core/src/jsonld.rs
Normal file
|
|
@ -0,0 +1,521 @@
|
|||
//! Schema-aware JSON-LD classification.
|
||||
//!
|
||||
//! The existing `structured_data::extract_json_ld` returns raw parsed
|
||||
//! `serde_json::Value`s. This module classifies them into the typed
|
||||
//! `JsonLdSchema` enum that the M4 CLI flags (`--prefer-structured`,
|
||||
//! `--articles-from-jsonld`) route on.
|
||||
//!
|
||||
//! Design: a thin classifier on top of the existing parser. We do NOT
|
||||
//! re-implement JSON-LD parsing — we accept the same `Vec<Value>` that
|
||||
//! `ExtractionResult.structured_data` already carries, and produce a
|
||||
//! typed view useful for downstream formatting.
|
||||
use serde::Serialize;
|
||||
use serde_json::Value;
|
||||
|
||||
/// Article reference extracted from one `itemListElement` entry of an
/// ItemList. Every field is optional because list entries in the wild
/// frequently omit some of them.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct ArticleRef {
    /// Display title, taken from `name` or, failing that, `headline`.
    pub title: Option<String>,
    /// Target URL, when the entry provides one.
    pub url: Option<String>,
    /// Raw `datePublished` string; it is not parsed or validated here.
    pub published: Option<String>,
    /// Explicit `position` of the entry, when present.
    pub position: Option<u64>,
}
|
||||
|
||||
/// One update from a `LiveBlogPosting.liveBlogUpdate` array.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct LiveUpdate {
    /// The update's `headline`, when present and non-blank.
    pub headline: Option<String>,
    /// The update's `url`, when present and non-blank.
    pub url: Option<String>,
    /// Raw `datePublished` string; it is not parsed or validated here.
    pub published: Option<String>,
}
|
||||
|
||||
/// Classified JSON-LD record. Mirrors the schema.org types that webclaw
/// callers care about most: ItemList (Reuters category pages, Pitchfork
/// index), LiveBlogPosting (Le Monde live updates), NewsArticle / Article
/// (most outlets), Review (Pitchfork album reviews), and chrome types
/// (WebPage, WebSite, SiteNavigationElement, BreadcrumbList, CollectionPage)
/// that downstream formatters usually drop.
///
/// Serialized as an internally tagged object: the variant name is written
/// under the `schema` key next to the variant's own fields.
#[derive(Debug, Clone, PartialEq, Serialize)]
#[serde(tag = "schema", rename_all = "PascalCase")]
pub enum JsonLdSchema {
    /// `@type=ItemList` — possibly nested inside `CollectionPage.mainEntity`.
    ItemList {
        /// Entries parsed from `itemListElement`, in source order.
        items: Vec<ArticleRef>,
        /// The declared `numberOfItems`, which may differ from `items.len()`.
        number_of_items: Option<u64>,
    },
    /// `@type=LiveBlogPosting` — Le Monde / Guardian live coverage.
    LiveBlogPosting {
        /// Headline of the live blog as a whole.
        headline: Option<String>,
        /// Entries parsed from `liveBlogUpdate`, in source order.
        updates: Vec<LiveUpdate>,
    },
    /// `@type=NewsArticle` / `Article` / `BlogPosting` /
    /// `ReportageNewsArticle`.
    NewsArticle {
        headline: Option<String>,
        /// `articleBody`, falling back to `description`.
        body: Option<String>,
        date_published: Option<String>,
        /// Author name(s); multiple authors are joined with ", ".
        author: Option<String>,
    },
    /// `@type=Review` — Pitchfork album reviews.
    Review {
        /// `headline`, falling back to `name`.
        headline: Option<String>,
        /// `reviewBody`, falling back to `description`.
        review_body: Option<String>,
        /// `name` of the `itemReviewed` object.
        rated_item: Option<String>,
        /// Author name(s); multiple authors are joined with ", ".
        author: Option<String>,
        date_published: Option<String>,
    },
    /// Chrome types: WebPage, WebSite, SiteNavigationElement, BreadcrumbList,
    /// CollectionPage. Formatters typically drop these unless explicitly
    /// asked to surface. `raw_type` keeps the original `@type` spelling.
    WebPageOrChrome { raw_type: String },
    /// Recognised schema.org type we don't have a typed variant for yet.
    /// The raw value is preserved so callers can still emit it.
    Unknown {
        raw_type: String,
        raw: Box<serde_json::Value>,
    },
}
|
||||
|
||||
impl JsonLdSchema {
|
||||
/// Convenience: is this a content-bearing schema (vs WebPage chrome)?
|
||||
pub fn is_content(&self) -> bool {
|
||||
!matches!(self, JsonLdSchema::WebPageOrChrome { .. })
|
||||
}
|
||||
|
||||
/// Convenience: short stable string for the schema kind, used by probe.py.
|
||||
pub fn kind(&self) -> &'static str {
|
||||
match self {
|
||||
JsonLdSchema::ItemList { .. } => "ItemList",
|
||||
JsonLdSchema::LiveBlogPosting { .. } => "LiveBlogPosting",
|
||||
JsonLdSchema::NewsArticle { .. } => "NewsArticle",
|
||||
JsonLdSchema::Review { .. } => "Review",
|
||||
JsonLdSchema::WebPageOrChrome { .. } => "WebPageOrChrome",
|
||||
JsonLdSchema::Unknown { .. } => "Unknown",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Classify a single JSON-LD value. Descends into `mainEntity` once
|
||||
/// (Reuters `CollectionPage.mainEntity` → ItemList).
|
||||
pub fn classify_value(v: &Value) -> Option<JsonLdSchema> {
|
||||
let obj = v.as_object()?;
|
||||
let raw_type = type_string(obj.get("@type"))?;
|
||||
let lower = raw_type.to_ascii_lowercase();
|
||||
|
||||
match lower.as_str() {
|
||||
"itemlist" => Some(parse_itemlist(obj)),
|
||||
"liveblogposting" => Some(parse_liveblog(obj)),
|
||||
"newsarticle" | "article" | "blogposting" | "reportagenewsarticle" => {
|
||||
Some(parse_news_article(obj))
|
||||
}
|
||||
"review" => Some(parse_review(obj)),
|
||||
// Chrome / navigation types — explicit list.
|
||||
"webpage" | "website" | "sitenavigationelement" | "breadcrumblist"
|
||||
| "collectionpage" => {
|
||||
// CollectionPage may wrap an ItemList in mainEntity. If so, lift it.
|
||||
if let Some(main) = obj.get("mainEntity") {
|
||||
if let Some(inner) = classify_value(main) {
|
||||
return Some(inner);
|
||||
}
|
||||
}
|
||||
Some(JsonLdSchema::WebPageOrChrome { raw_type })
|
||||
}
|
||||
_ => Some(JsonLdSchema::Unknown {
|
||||
raw_type,
|
||||
raw: Box::new(v.clone()),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Classify a `Vec<Value>` (matches `ExtractionResult.structured_data`'s shape).
|
||||
/// Returns one `JsonLdSchema` per input value.
|
||||
pub fn classify_all(values: &[Value]) -> Vec<JsonLdSchema> {
|
||||
values.iter().filter_map(classify_value).collect()
|
||||
}
|
||||
|
||||
/// Find the FIRST schema among the classified items that is a content-bearing
|
||||
/// type useful for routing. Priority: ItemList > LiveBlogPosting > Review >
|
||||
/// NewsArticle > Unknown > WebPageOrChrome.
|
||||
pub fn primary_schema(schemas: &[JsonLdSchema]) -> Option<&JsonLdSchema> {
|
||||
let priority = |s: &JsonLdSchema| -> u8 {
|
||||
match s {
|
||||
JsonLdSchema::ItemList { .. } => 0,
|
||||
JsonLdSchema::LiveBlogPosting { .. } => 1,
|
||||
JsonLdSchema::Review { .. } => 2,
|
||||
JsonLdSchema::NewsArticle { .. } => 3,
|
||||
JsonLdSchema::Unknown { .. } => 4,
|
||||
JsonLdSchema::WebPageOrChrome { .. } => 5,
|
||||
}
|
||||
};
|
||||
schemas.iter().min_by_key(|s| priority(s))
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
fn type_string(v: Option<&Value>) -> Option<String> {
|
||||
match v? {
|
||||
Value::String(s) => Some(s.clone()),
|
||||
Value::Array(a) => a
|
||||
.iter()
|
||||
.find_map(|x| x.as_str().map(str::to_string)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Read `key` from `obj` as a trimmed string.
///
/// Returns `None` when the key is missing, the value is not a string, or the
/// string is empty after trimming.
fn str_field(obj: &serde_json::Map<String, Value>, key: &str) -> Option<String> {
    let trimmed = obj.get(key)?.as_str()?.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
|
||||
|
||||
fn u64_field(obj: &serde_json::Map<String, Value>, key: &str) -> Option<u64> {
|
||||
obj.get(key).and_then(|v| v.as_u64())
|
||||
}
|
||||
|
||||
fn author_string(v: Option<&Value>) -> Option<String> {
|
||||
match v? {
|
||||
Value::String(s) => Some(s.clone()),
|
||||
Value::Object(o) => o.get("name").and_then(|n| n.as_str()).map(str::to_string),
|
||||
Value::Array(a) => {
|
||||
let names: Vec<String> = a
|
||||
.iter()
|
||||
.filter_map(|x| match x {
|
||||
Value::String(s) => Some(s.clone()),
|
||||
Value::Object(o) => o.get("name").and_then(|n| n.as_str()).map(str::to_string),
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
if names.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(names.join(", "))
|
||||
}
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn item_reviewed_string(v: Option<&Value>) -> Option<String> {
|
||||
let v = v?;
|
||||
let obj = v.as_object()?;
|
||||
obj.get("name").and_then(|n| n.as_str()).map(str::to_string)
|
||||
}
|
||||
|
||||
fn parse_itemlist(obj: &serde_json::Map<String, Value>) -> JsonLdSchema {
|
||||
let mut items = Vec::new();
|
||||
if let Some(arr) = obj.get("itemListElement").and_then(|v| v.as_array()) {
|
||||
for entry in arr {
|
||||
let Some(e) = entry.as_object() else { continue };
|
||||
// Two shapes seen in the wild:
|
||||
// (1) ListItem with {position, url, name}.
|
||||
// (2) ListItem with {position, item: {url, name, datePublished}}.
|
||||
let inner_obj = e.get("item").and_then(|v| v.as_object()).unwrap_or(e);
|
||||
|
||||
let position = u64_field(e, "position").or_else(|| u64_field(inner_obj, "position"));
|
||||
let url = str_field(inner_obj, "url").or_else(|| str_field(e, "url"));
|
||||
let title = str_field(inner_obj, "name")
|
||||
.or_else(|| str_field(e, "name"))
|
||||
.or_else(|| str_field(inner_obj, "headline"))
|
||||
.or_else(|| str_field(e, "headline"));
|
||||
let published = str_field(inner_obj, "datePublished")
|
||||
.or_else(|| str_field(e, "datePublished"));
|
||||
|
||||
items.push(ArticleRef {
|
||||
title,
|
||||
url,
|
||||
published,
|
||||
position,
|
||||
});
|
||||
}
|
||||
}
|
||||
let number_of_items = u64_field(obj, "numberOfItems");
|
||||
JsonLdSchema::ItemList {
|
||||
items,
|
||||
number_of_items,
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_liveblog(obj: &serde_json::Map<String, Value>) -> JsonLdSchema {
|
||||
let headline = str_field(obj, "headline");
|
||||
let mut updates = Vec::new();
|
||||
if let Some(arr) = obj.get("liveBlogUpdate").and_then(|v| v.as_array()) {
|
||||
for entry in arr {
|
||||
let Some(e) = entry.as_object() else { continue };
|
||||
updates.push(LiveUpdate {
|
||||
headline: str_field(e, "headline"),
|
||||
url: str_field(e, "url"),
|
||||
published: str_field(e, "datePublished"),
|
||||
});
|
||||
}
|
||||
}
|
||||
JsonLdSchema::LiveBlogPosting { headline, updates }
|
||||
}
|
||||
|
||||
fn parse_news_article(obj: &serde_json::Map<String, Value>) -> JsonLdSchema {
|
||||
let headline = str_field(obj, "headline");
|
||||
// articleBody is the canonical field; some sites use description.
|
||||
let body = str_field(obj, "articleBody").or_else(|| str_field(obj, "description"));
|
||||
let date_published = str_field(obj, "datePublished");
|
||||
let author = author_string(obj.get("author"));
|
||||
JsonLdSchema::NewsArticle {
|
||||
headline,
|
||||
body,
|
||||
date_published,
|
||||
author,
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_review(obj: &serde_json::Map<String, Value>) -> JsonLdSchema {
|
||||
let headline = str_field(obj, "headline").or_else(|| str_field(obj, "name"));
|
||||
let review_body = str_field(obj, "reviewBody").or_else(|| str_field(obj, "description"));
|
||||
let rated_item = item_reviewed_string(obj.get("itemReviewed"));
|
||||
let author = author_string(obj.get("author"));
|
||||
let date_published = str_field(obj, "datePublished");
|
||||
JsonLdSchema::Review {
|
||||
headline,
|
||||
review_body,
|
||||
rated_item,
|
||||
author,
|
||||
date_published,
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Tests
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Test 1: a flat ItemList with three `itemListElement` entries.
    #[test]
    fn test_jsonld_parse_itemlist() {
        let v = json!({
            "@context": "https://schema.org",
            "@type": "ItemList",
            "numberOfItems": 3,
            "itemListElement": [
                {"@type": "ListItem", "position": 1, "url": "https://a.example/1", "name": "First"},
                {"@type": "ListItem", "position": 2, "url": "https://a.example/2", "name": "Second"},
                {"@type": "ListItem", "position": 3, "url": "https://a.example/3", "name": "Third"},
            ]
        });
        let schema = classify_value(&v).expect("classify");
        let JsonLdSchema::ItemList { items, number_of_items } = &schema else {
            panic!("expected ItemList, got {schema:?}");
        };
        assert_eq!(*number_of_items, Some(3));
        assert_eq!(items.len(), 3);
        assert_eq!(items[0].position, Some(1));
        assert_eq!(items[0].url.as_deref(), Some("https://a.example/1"));
        assert_eq!(items[0].title.as_deref(), Some("First"));
        assert_eq!(items[2].position, Some(3));
    }

    /// Test 2: a LiveBlogPosting with two `liveBlogUpdate` entries.
    #[test]
    fn test_jsonld_parse_liveblog() {
        let v = json!({
            "@type": "LiveBlogPosting",
            "headline": "Election Night Live",
            "liveBlogUpdate": [
                {"headline": "Polls closing", "url": "https://x/1", "datePublished": "2026-05-23T19:00:00Z"},
                {"headline": "First results", "url": "https://x/2", "datePublished": "2026-05-23T19:15:00Z"},
            ]
        });
        let schema = classify_value(&v).expect("classify");
        let JsonLdSchema::LiveBlogPosting { headline, updates } = &schema else {
            panic!("expected LiveBlogPosting, got {schema:?}");
        };
        assert_eq!(headline.as_deref(), Some("Election Night Live"));
        assert_eq!(updates.len(), 2);
        assert_eq!(updates[0].headline.as_deref(), Some("Polls closing"));
        assert_eq!(updates[1].url.as_deref(), Some("https://x/2"));
    }

    /// Test 3: a NewsArticle carrying `articleBody` and an object author.
    #[test]
    fn test_jsonld_parse_newsarticle() {
        let v = json!({
            "@type": "NewsArticle",
            "headline": "Big Story",
            "articleBody": "Lorem ipsum dolor sit amet.",
            "datePublished": "2026-05-23",
            "author": {"@type": "Person", "name": "Jane Doe"},
        });
        let schema = classify_value(&v).expect("classify");
        let JsonLdSchema::NewsArticle { headline, body, date_published, author } = &schema
        else {
            panic!("expected NewsArticle, got {schema:?}");
        };
        assert_eq!(headline.as_deref(), Some("Big Story"));
        assert_eq!(body.as_deref(), Some("Lorem ipsum dolor sit amet."));
        assert_eq!(date_published.as_deref(), Some("2026-05-23"));
        assert_eq!(author.as_deref(), Some("Jane Doe"));
    }

    /// Test 4: a Review with `reviewBody`, `itemReviewed` and an author array.
    #[test]
    fn test_jsonld_parse_review() {
        let v = json!({
            "@type": "Review",
            "headline": "Images of Life",
            "reviewBody": "A bountiful, baroque, eccentric record.",
            "itemReviewed": {"@type": "MusicRecording", "name": "Images of Life"},
            "author": [{"@type": "Person", "name": "Critic A"}],
            "datePublished": "2026-05-23",
        });
        let schema = classify_value(&v).expect("classify");
        let JsonLdSchema::Review { headline, review_body, rated_item, author, date_published } =
            &schema
        else {
            panic!("expected Review, got {schema:?}");
        };
        assert_eq!(headline.as_deref(), Some("Images of Life"));
        assert_eq!(review_body.as_deref(), Some("A bountiful, baroque, eccentric record."));
        assert_eq!(rated_item.as_deref(), Some("Images of Life"));
        assert_eq!(author.as_deref(), Some("Critic A"));
        assert_eq!(date_published.as_deref(), Some("2026-05-23"));
    }

    /// Test 5: an `@type` without a typed variant (Recipe) maps to Unknown
    /// instead of failing.
    #[test]
    fn test_jsonld_parse_unknown_type() {
        let v = json!({
            "@type": "Recipe",
            "name": "Banana Bread",
            "recipeYield": "1 loaf",
        });
        let schema = classify_value(&v).expect("classify");
        let JsonLdSchema::Unknown { raw_type, .. } = &schema else {
            panic!("expected Unknown, got {schema:?}");
        };
        assert_eq!(raw_type, "Recipe");
    }

    /// Test 6: SiteNavigationElement is classified as WebPageOrChrome and
    /// keeps its original type name.
    #[test]
    fn test_jsonld_parse_webpage_dropped() {
        let v = json!({
            "@type": "SiteNavigationElement",
            "name": "Main nav",
        });
        let schema = classify_value(&v).expect("classify");
        assert!(matches!(schema, JsonLdSchema::WebPageOrChrome { .. }));
        if let JsonLdSchema::WebPageOrChrome { raw_type } = schema {
            assert_eq!(raw_type, "SiteNavigationElement");
        }
    }

    /// Test 7: values without a usable `@type` yield `None` and never panic.
    /// Truncated JSON is the parser's responsibility (already handled in
    /// structured_data.rs); the classifier only ever sees valid `Value`s.
    #[test]
    fn test_jsonld_parse_malformed_no_crash() {
        // Empty object — no @type.
        let empty_object = json!({});
        assert!(classify_value(&empty_object).is_none());

        // Bare string — not an object at all.
        let bare_string = json!("garbage");
        assert!(classify_value(&bare_string).is_none());

        // @type is neither a string nor an array.
        let numeric_type = json!({"@type": 42});
        assert!(classify_value(&numeric_type).is_none());

        // Array of mixed garbage: classify_value on the array itself would
        // return None (not an object), but classify_all extracts the one
        // Article among its elements.
        let mixed = json!([1, "two", {"@type": "Article", "headline": "ok"}]);
        let all = classify_all(mixed.as_array().unwrap());
        assert_eq!(all.len(), 1);
        assert_eq!(all[0].kind(), "NewsArticle");
    }

    /// Test 8: a CollectionPage whose `mainEntity` is an ItemList is lifted
    /// to that ItemList. This is the Reuters shape phase A confirmed.
    #[test]
    fn test_jsonld_collectionpage_lifts_mainentity_itemlist() {
        let v = json!({
            "@type": "CollectionPage",
            "mainEntity": {
                "@type": "ItemList",
                "numberOfItems": 2,
                "itemListElement": [
                    {"@type": "ListItem", "position": 1, "url": "https://r.example/1"},
                    {"@type": "ListItem", "position": 2, "url": "https://r.example/2"},
                ]
            }
        });
        let schema = classify_value(&v).expect("classify");
        let JsonLdSchema::ItemList { items, number_of_items } = &schema else {
            panic!("expected lifted ItemList, got {schema:?}");
        };
        assert_eq!(items.len(), 2);
        assert_eq!(*number_of_items, Some(2));
    }

    /// Test 9: primary_schema prefers ItemList over NewsArticle and WebPage
    /// regardless of input order.
    #[test]
    fn test_primary_schema_picks_itemlist_first() {
        let schemas = vec![
            JsonLdSchema::WebPageOrChrome { raw_type: "WebPage".into() },
            JsonLdSchema::NewsArticle {
                headline: Some("x".into()),
                body: None,
                date_published: None,
                author: None,
            },
            JsonLdSchema::ItemList {
                items: vec![],
                number_of_items: None,
            },
        ];
        let chosen = primary_schema(&schemas).expect("primary");
        assert!(matches!(chosen, JsonLdSchema::ItemList { .. }));
    }

    /// Test 10: a ListItem that wraps its article in a nested `item` object
    /// (alternate shape).
    #[test]
    fn test_jsonld_itemlist_with_nested_item_shape() {
        let v = json!({
            "@type": "ItemList",
            "itemListElement": [
                {
                    "@type": "ListItem",
                    "position": 1,
                    "item": {
                        "@type": "NewsArticle",
                        "url": "https://x/1",
                        "name": "Wrapped Title",
                        "datePublished": "2026-05-23",
                    }
                },
            ]
        });
        let schema = classify_value(&v).expect("classify");
        let JsonLdSchema::ItemList { items, .. } = &schema else {
            panic!("expected ItemList, got {schema:?}");
        };
        assert_eq!(items.len(), 1);
        assert_eq!(items[0].url.as_deref(), Some("https://x/1"));
        assert_eq!(items[0].title.as_deref(), Some("Wrapped Title"));
        assert_eq!(items[0].published.as_deref(), Some("2026-05-23"));
        assert_eq!(items[0].position, Some(1));
    }
}
|
||||
|
|
@ -12,6 +12,7 @@ pub mod error;
|
|||
pub mod extractor;
|
||||
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||
pub mod js_eval;
|
||||
pub mod jsonld;
|
||||
pub mod llm;
|
||||
pub mod markdown;
|
||||
pub mod metadata;
|
||||
|
|
@ -25,9 +26,14 @@ pub use brand::BrandIdentity;
|
|||
pub use diff::{ChangeStatus, ContentDiff, MetadataChange};
|
||||
pub use domain::DomainType;
|
||||
pub use error::ExtractError;
|
||||
pub use jsonld::{
|
||||
classify_all as classify_jsonld_all, classify_value as classify_jsonld_value, primary_schema,
|
||||
ArticleRef, JsonLdSchema, LiveUpdate,
|
||||
};
|
||||
pub use llm::{
|
||||
classify_hub, to_json_summary, to_json_toc, to_llm_summary, to_llm_text, to_llm_toc,
|
||||
truncate_json_with_wrapper, truncate_with_footer, HubClassification,
|
||||
classify_hub, to_json_summary, to_json_toc, to_llm_summary, to_llm_text,
|
||||
to_llm_text_with_options, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer,
|
||||
HubClassification, LlmTextOptions,
|
||||
};
|
||||
pub use types::{
|
||||
CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
|
||||
|
|
|
|||
|
|
@ -18,8 +18,27 @@ pub use output_size::{
|
|||
truncate_with_footer,
|
||||
};
|
||||
|
||||
use crate::jsonld::{classify_all, primary_schema, JsonLdSchema};
|
||||
use crate::types::ExtractionResult;
|
||||
|
||||
/// Upper bound, in bytes, on the pretty-printed JSON emitted in the legacy
/// `## Structured Data` block at the bottom of `to_llm_text` output (16 KiB).
///
/// A serialized payload larger than this is omitted entirely rather than
/// truncated. The schema-aware block that `--prefer-structured` places at the
/// top of the output is NOT governed by this limit; it applies its own
/// per-variant size discipline (see `render_structured_block`).
const STRUCTURED_DATA_MAX_BYTES: usize = 16_384;
|
||||
|
||||
/// Optional structured-data behaviours layered on top of `to_llm_text`.
///
/// `LlmTextOptions::default()` leaves every switch off, which makes
/// `to_llm_text_with_options` produce exactly what `to_llm_text` produces —
/// callers that never set an M4 flag see no byte change.
#[derive(Clone, Debug, Default)]
pub struct LlmTextOptions {
    /// Hoist the schema-aware structured-data block to the TOP of the output
    /// (after the metadata header, before the prose) and drop the legacy raw
    /// JSON `## Structured Data` block that is otherwise appended at the end.
    pub prefer_structured: bool,
}
|
||||
|
||||
/// Produce a token-optimized text representation of extracted content.
|
||||
///
|
||||
/// The output has three sections:
|
||||
|
|
@ -27,11 +46,35 @@ use crate::types::ExtractionResult;
|
|||
/// 2. Cleaned body (no images, no bold/italic, links as plain text)
|
||||
/// 3. Deduplicated links section at the end
|
||||
pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
||||
to_llm_text_with_options(result, url, &LlmTextOptions::default())
|
||||
}
|
||||
|
||||
/// Same as `to_llm_text`, but with additional structured-data behaviours
|
||||
/// controlled by `LlmTextOptions`. Used by the M4 `--prefer-structured` CLI
|
||||
/// flag.
|
||||
pub fn to_llm_text_with_options(
|
||||
result: &ExtractionResult,
|
||||
url: Option<&str>,
|
||||
opts: &LlmTextOptions,
|
||||
) -> String {
|
||||
let mut out = String::new();
|
||||
|
||||
// -- 1. Metadata header --
|
||||
metadata::build_metadata_header(&mut out, result, url);
|
||||
|
||||
// -- 1b. Schema-aware structured data BEFORE the prose, if requested --
|
||||
// Phase A confirmed that on Pitchfork review pages the existing raw-JSON
|
||||
// block surfaces at byte ~50000 of a 58KB output; this hoists it.
|
||||
if opts.prefer_structured {
|
||||
let schemas = classify_all(&result.structured_data);
|
||||
if let Some(block) = render_structured_block(&schemas) {
|
||||
if !out.is_empty() {
|
||||
out.push('\n');
|
||||
}
|
||||
out.push_str(&block);
|
||||
}
|
||||
}
|
||||
|
||||
// -- 2. Process body --
|
||||
let processed = body::process_body(&result.content.markdown);
|
||||
|
||||
|
|
@ -59,28 +102,140 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
|||
// hydration blobs (Next.js pageProps full of ad-targeting flags, build
|
||||
// IDs, schedule paths) explode to hundreds of KB and drown the LLM in
|
||||
// noise — drop them rather than ship them.
|
||||
let mut useful: Vec<_> = result
|
||||
.structured_data
|
||||
.iter()
|
||||
.filter(|v| is_useful_structured_data(v))
|
||||
.cloned()
|
||||
.collect();
|
||||
for value in &mut useful {
|
||||
scrub_body_fields(value, 0);
|
||||
}
|
||||
if !useful.is_empty() {
|
||||
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
|
||||
const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
|
||||
if serialized.len() <= STRUCTURED_DATA_MAX_BYTES {
|
||||
out.push_str("\n\n## Structured Data\n\n```json\n");
|
||||
out.push_str(&serialized);
|
||||
out.push_str("\n```");
|
||||
//
|
||||
// When `prefer_structured` is set the schema-aware block already
|
||||
// carries this information at the top, so we drop the legacy raw block
|
||||
// to avoid duplication.
|
||||
if !opts.prefer_structured {
|
||||
let mut useful: Vec<_> = result
|
||||
.structured_data
|
||||
.iter()
|
||||
.filter(|v| is_useful_structured_data(v))
|
||||
.cloned()
|
||||
.collect();
|
||||
for value in &mut useful {
|
||||
scrub_body_fields(value, 0);
|
||||
}
|
||||
if !useful.is_empty() {
|
||||
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
|
||||
if serialized.len() <= STRUCTURED_DATA_MAX_BYTES {
|
||||
out.push_str("\n\n## Structured Data\n\n```json\n");
|
||||
out.push_str(&serialized);
|
||||
out.push_str("\n```");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out.trim().to_string()
|
||||
}
|
||||
|
||||
/// Render a schema-aware Markdown block summarising the page's JSON-LD.
|
||||
/// Returns `None` when no content-bearing schema is present.
|
||||
///
|
||||
/// Format:
|
||||
/// ```text
|
||||
/// ## Structured data
|
||||
///
|
||||
/// schema: ItemList (20 items)
|
||||
/// 1. <name or url> — <url>
|
||||
/// 2. ...
|
||||
/// ```
|
||||
fn render_structured_block(schemas: &[JsonLdSchema]) -> Option<String> {
|
||||
let primary = primary_schema(schemas)?;
|
||||
let mut buf = String::new();
|
||||
buf.push_str("\n## Structured data\n\n");
|
||||
match primary {
|
||||
JsonLdSchema::ItemList { items, number_of_items } => {
|
||||
let n = number_of_items.unwrap_or(items.len() as u64);
|
||||
buf.push_str(&format!("schema: ItemList ({n} items)\n"));
|
||||
for (i, it) in items.iter().enumerate() {
|
||||
let pos = it.position.unwrap_or(i as u64 + 1);
|
||||
let label = it.title.clone().unwrap_or_else(|| {
|
||||
it.url.clone().unwrap_or_else(|| "(no url)".to_string())
|
||||
});
|
||||
let url = it.url.as_deref().unwrap_or("");
|
||||
if url.is_empty() {
|
||||
buf.push_str(&format!("{pos}. {label}\n"));
|
||||
} else {
|
||||
buf.push_str(&format!("{pos}. {label} — {url}\n"));
|
||||
}
|
||||
}
|
||||
}
|
||||
JsonLdSchema::LiveBlogPosting { headline, updates } => {
|
||||
buf.push_str("schema: LiveBlogPosting");
|
||||
if let Some(h) = headline {
|
||||
buf.push_str(&format!(" — {h}"));
|
||||
}
|
||||
buf.push('\n');
|
||||
buf.push_str(&format!("updates: {}\n", updates.len()));
|
||||
for u in updates {
|
||||
let label = u.headline.clone().unwrap_or_else(|| {
|
||||
u.url.clone().unwrap_or_else(|| "(no url)".into())
|
||||
});
|
||||
let ts = u.published.as_deref().unwrap_or("");
|
||||
if ts.is_empty() {
|
||||
buf.push_str(&format!("- {label}\n"));
|
||||
} else {
|
||||
buf.push_str(&format!("- [{ts}] {label}\n"));
|
||||
}
|
||||
}
|
||||
}
|
||||
JsonLdSchema::NewsArticle { headline, body, date_published, author } => {
|
||||
buf.push_str("schema: NewsArticle\n");
|
||||
if let Some(h) = headline {
|
||||
buf.push_str(&format!("headline: {h}\n"));
|
||||
}
|
||||
if let Some(a) = author {
|
||||
buf.push_str(&format!("author: {a}\n"));
|
||||
}
|
||||
if let Some(d) = date_published {
|
||||
buf.push_str(&format!("published: {d}\n"));
|
||||
}
|
||||
if let Some(b) = body {
|
||||
buf.push_str("\n");
|
||||
buf.push_str(b);
|
||||
buf.push('\n');
|
||||
}
|
||||
}
|
||||
JsonLdSchema::Review { headline, review_body, rated_item, author, date_published } => {
|
||||
buf.push_str("schema: Review\n");
|
||||
if let Some(h) = headline {
|
||||
buf.push_str(&format!("headline: {h}\n"));
|
||||
}
|
||||
if let Some(item) = rated_item {
|
||||
buf.push_str(&format!("rated: {item}\n"));
|
||||
}
|
||||
if let Some(a) = author {
|
||||
buf.push_str(&format!("author: {a}\n"));
|
||||
}
|
||||
if let Some(d) = date_published {
|
||||
buf.push_str(&format!("published: {d}\n"));
|
||||
}
|
||||
if let Some(b) = review_body {
|
||||
buf.push('\n');
|
||||
buf.push_str(b);
|
||||
buf.push('\n');
|
||||
}
|
||||
}
|
||||
JsonLdSchema::WebPageOrChrome { raw_type } => {
|
||||
// Surface the WebPage block even though normal output drops it —
|
||||
// user explicitly asked via --prefer-structured.
|
||||
buf.push_str(&format!("schema: {raw_type}\n"));
|
||||
buf.push_str("(navigation/chrome record; no content fields)\n");
|
||||
}
|
||||
JsonLdSchema::Unknown { raw_type, raw } => {
|
||||
buf.push_str(&format!("schema: {raw_type} (unrecognised)\n"));
|
||||
let pretty = serde_json::to_string_pretty(raw).unwrap_or_default();
|
||||
if pretty.len() <= 4096 {
|
||||
buf.push_str("\n```json\n");
|
||||
buf.push_str(&pretty);
|
||||
buf.push_str("\n```\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(buf)
|
||||
}
|
||||
|
||||
/// Decide whether a structured-data value carries content worth emitting.
|
||||
///
|
||||
/// Schema.org records with a recognizable content `@type` (Article, NewsArticle,
|
||||
|
|
@ -976,4 +1131,87 @@ mod tests {
|
|||
"shallow articleBody must still be scrubbed"
|
||||
);
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// M4: --prefer-structured / --articles-from-jsonld integration tests
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
/// Default options (no flags) produce byte-identical output to legacy
/// `to_llm_text`. This is the sentinel for "additive change" — every
/// p01-p20 probe relies on this.
///
/// `to_llm_text` is itself a thin wrapper over `to_llm_text_with_options`
/// with default options, so the equality check alone cannot catch a
/// regression in the default path. The second assertion pins one observable
/// property of that path: the schema-aware block is never emitted.
#[test]
fn to_llm_text_with_options_default_is_legacy_identical() {
    let r = make_result_with_structured(vec![serde_json::json!({
        "@type": "Article",
        "headline": "Hello",
    })]);
    let legacy = to_llm_text(&r, None);
    let with_opts = to_llm_text_with_options(&r, None, &LlmTextOptions::default());
    assert_eq!(legacy, with_opts, "default opts must be byte-identical");
    // The lowercase-"data" heading belongs to the schema-aware block, which
    // is only rendered when `prefer_structured` is set.
    assert!(
        !legacy.contains("## Structured data"),
        "default path must not emit the schema-aware block:\n{legacy}"
    );
}
|
||||
|
||||
/// `prefer_structured` hoists the schema-aware block above the prose body
/// (it follows the metadata header) and removes the legacy
/// `## Structured Data` block from the bottom of the output.
#[test]
fn prefer_structured_places_block_above_body_and_drops_legacy() {
    let review_body =
        "A long-form review body that would normally be far down the page.".repeat(20);
    let mut result = make_result_with_structured(vec![serde_json::json!({
        "@type": "Review",
        "headline": "Album X",
        "reviewBody": review_body,
        "datePublished": "2026-05-23",
    })]);
    result.content.markdown = "## Body Section\n\nLong prose body here.\n".repeat(20);

    let opts = LlmTextOptions { prefer_structured: true };
    let out = to_llm_text_with_options(&result, None, &opts);

    // The schema-aware heading must precede the first prose heading.
    let block_at = out
        .find("## Structured data")
        .expect("schema-aware block must be present");
    let prose_at = out
        .find("Body Section")
        .expect("prose body must be present");
    assert!(
        block_at < prose_at,
        "schema-aware block must come BEFORE prose body (struct@{block_at}, body@{prose_at})"
    );

    // The legacy heading differs only in case ("Data"); it must be gone.
    let legacy_present = out.contains("## Structured Data");
    assert!(
        !legacy_present,
        "legacy uppercase 'Structured Data' block must be dropped when prefer_structured is set"
    );
}
|
||||
|
||||
/// For an ItemList page, `prefer_structured` renders a count header plus one
/// numbered `<name> — <url>` line per list element.
#[test]
fn prefer_structured_itemlist_renders_items() {
    let item_list = serde_json::json!({
        "@type": "ItemList",
        "numberOfItems": 2,
        "itemListElement": [
            {"@type": "ListItem", "position": 1, "url": "https://x/1", "name": "First"},
            {"@type": "ListItem", "position": 2, "url": "https://x/2", "name": "Second"},
        ]
    });
    let result = make_result_with_structured(vec![item_list]);
    let opts = LlmTextOptions { prefer_structured: true };
    let out = to_llm_text_with_options(&result, None, &opts);

    let has_header = out.contains("schema: ItemList (2 items)");
    let has_first = out.contains("1. First — https://x/1");
    let has_second = out.contains("2. Second — https://x/2");
    assert!(has_header, "missing header in:\n{out}");
    assert!(has_first, "missing item 1 in:\n{out}");
    assert!(has_second, "missing item 2 in:\n{out}");
}
|
||||
|
||||
/// A WebPage (chrome) record is normally dropped, but `prefer_structured`
/// overrides that filter: the block is emitted and labelled with its schema
/// type.
#[test]
fn prefer_structured_surfaces_webpage_chrome() {
    let web_page = serde_json::json!({
        "@type": "WebPage",
        "name": "Hub Page",
    });
    let result = make_result_with_structured(vec![web_page]);
    let opts = LlmTextOptions { prefer_structured: true };
    let out = to_llm_text_with_options(&result, None, &opts);

    let has_heading = out.contains("## Structured data");
    let has_label = out.contains("schema: WebPage");
    assert!(has_heading, "missing header in:\n{out}");
    assert!(has_label, "missing WebPage schema label in:\n{out}");
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue