feat(core): HTTP status header line in -f llm/text/json output

Webclaw previously emitted URL, Title, Description, and Word count in
the -f llm header but no HTTP status. On a 404 response, the caller
had no signal apart from inspecting the body (e.g. dailysabah.com/
business/economy returns a 404 page; webclaw was extracting '13 words'
of the error page without flagging the 404 status).

New behavior: every -f llm/text/json output includes a 'Status: <code>'
header line (after URL: per phase A's placement). Emitted on all
responses including 200 for consistency — callers can't otherwise
distinguish 'webclaw saw 200' from 'webclaw missed status info'.

For -f json: top-level "status": <code> field added.

Modes --mode summary and --mode toc are exempt: the status line would
clutter the link-list and outline outputs.
M3 fast-fails (known-bad-sites) also skip the status line because
they exit before the formatter is reached.

7 new tests in webclaw-core (workspace total 671 -> 678).
This commit is contained in:
devnen 2026-05-23 21:29:26 +02:00
parent 66974366d7
commit dfcd51d9e0
14 changed files with 208 additions and 8 deletions

View file

@ -858,7 +858,20 @@ fn render_body(
serde_json::to_string_pretty(result).expect("serialization failed")
}
}
OutputFormat::Text => result.content.plain_text.clone(),
OutputFormat::Text => {
// M7 (issue #19): prepend `Status: <code>` line so callers
// using `-f text` can distinguish a real 404 from a
// thin-body 200 without parsing the body. No `> `
// blockquote prefix (text format has no header section).
// Only emitted when populated; local-file/--stdin paths
// leave http_status=None and produce body-only output.
let body = result.content.plain_text.clone();
if let Some(code) = result.metadata.http_status {
format!("Status: {code}\n{body}")
} else {
body
}
}
OutputFormat::Llm => to_llm_text_with_options(
result,
result.metadata.url.as_deref(),
@ -2974,6 +2987,7 @@ mod tests {
image: None,
favicon: None,
word_count: markdown.split_whitespace().count(),
http_status: None,
},
content: Content {
markdown: markdown.to_string(),

View file

@ -148,6 +148,7 @@ mod tests {
image: None,
favicon: None,
word_count,
http_status: None,
},
content: Content {
markdown: markdown.to_string(),

View file

@ -136,6 +136,7 @@ mod tests {
image: None,
favicon: None,
word_count: 0,
http_status: None,
},
content: Content {
markdown: markdown.to_string(),

View file

@ -8,6 +8,22 @@ pub(crate) fn build_metadata_header(
out: &mut String,
result: &ExtractionResult,
url: Option<&str>,
) {
build_metadata_header_with_opts(out, result, url, true);
}
/// Same as [`build_metadata_header`] but with an `include_status` toggle.
///
/// `--mode summary` / `--mode toc` callers pass `include_status=false` so
/// the link-list / outline output stays uncluttered (M7 / issue #19 — the
/// status line is most useful on full-extraction output where the caller
/// is reading the body and needs to know whether they're looking at a 404
/// error page vs a real article).
pub(crate) fn build_metadata_header_with_opts(
out: &mut String,
result: &ExtractionResult,
url: Option<&str>,
include_status: bool,
) {
let meta = &result.metadata;
@ -16,6 +32,16 @@ pub(crate) fn build_metadata_header(
if let Some(u) = effective_url {
out.push_str(&format!("> URL: {u}\n"));
}
// M7 (issue #19): HTTP status immediately after URL so callers can
// distinguish a real 404 from a thin-body 200 without parsing the page
// body. Emitted only when populated (network path); local-file /
// --stdin / extract_with_options direct calls leave http_status=None.
// Summary / toc modes suppress this line via include_status=false.
if include_status
&& let Some(code) = meta.http_status
{
out.push_str(&format!("> Status: {code}\n"));
}
if let Some(t) = &meta.title
&& !t.is_empty()
{

View file

@ -344,6 +344,7 @@ mod tests {
image: None,
favicon: None,
word_count: 42,
http_status: None,
},
content: Content {
markdown: markdown.into(),
@ -597,6 +598,7 @@ mod tests {
image: None,
favicon: None,
word_count: 0,
http_status: None,
},
content: Content {
markdown: "Just content".into(),
@ -1214,4 +1216,124 @@ mod tests {
assert!(out.contains("## Structured data"), "missing header in:\n{out}");
assert!(out.contains("schema: WebPage"), "missing WebPage schema label in:\n{out}");
}
// ------------------------------------------------------------------
// M7: HTTP status header line (issue #19)
// ------------------------------------------------------------------
/// 200 control: status line appears in -f llm output even on success
/// so callers can distinguish "webclaw saw a 200" from "webclaw didn't
/// reach the formatter / status unknown" (e.g. local-file path).
#[test]
fn test_status_header_appears_on_200() {
let mut r = make_result("# Body");
r.metadata.http_status = Some(200);
let out = to_llm_text(&r, None);
assert!(
out.contains("> Status: 200\n") || out.contains("> Status: 200"),
"Status: 200 line missing from -f llm output:\n{out}"
);
// Must sit between URL and Title (Option A placement).
let url_pos = out.find("> URL:").expect("URL line missing");
let status_pos = out.find("> Status:").expect("Status line missing");
let title_pos = out.find("> Title:").expect("Title line missing");
assert!(url_pos < status_pos, "Status must come AFTER URL");
assert!(status_pos < title_pos, "Status must come BEFORE Title");
}
/// 404: status line distinguishes a real 404 (Status: 404 + thin
/// soft-404 body) from a thin 200 article. This is the core M7 bug
/// — `webclaw https://www.dailysabah.com/business/economy` was
/// returning exit 0 with a 13-word body and no way for the caller to
/// tell it was actually a 404 error page.
#[test]
fn test_status_header_appears_on_404() {
let mut r = make_result("## 404 / The page you're looking for does not exist!");
r.metadata.http_status = Some(404);
let out = to_llm_text(&r, None);
assert!(
out.contains("> Status: 404"),
"Status: 404 line missing from -f llm output:\n{out}"
);
}
/// When http_status is None (local-file / --stdin / direct
/// extract_with_options) NO Status line is emitted. Backward-compat
/// for callers that pre-date M7 and parse via line index.
#[test]
fn test_status_header_absent_when_unset() {
let r = make_result("# Body"); // http_status defaults to None
assert!(r.metadata.http_status.is_none());
let out = to_llm_text(&r, None);
assert!(
!out.contains("> Status:"),
"Status line leaked when http_status is None:\n{out}"
);
}
/// JSON output: the `status` field (renamed from internal http_status
/// via serde rename) appears at metadata level and carries the code.
#[test]
fn test_status_header_format_in_json() {
let mut r = make_result("# Body");
r.metadata.http_status = Some(404);
let json = serde_json::to_string_pretty(&r).expect("serialize");
assert!(
json.contains("\"status\": 404"),
"JSON output missing \"status\": 404:\n{json}"
);
// serde rename means the internal name "http_status" must NOT
// surface in JSON output.
assert!(
!json.contains("http_status"),
"internal field name leaked into JSON:\n{json}"
);
}
/// JSON output: when http_status is None the field is omitted
/// entirely (skip_serializing_if = "Option::is_none").
#[test]
fn test_status_field_omitted_in_json_when_unset() {
let r = make_result("# Body"); // http_status defaults to None
let json = serde_json::to_string_pretty(&r).expect("serialize");
assert!(
!json.contains("\"status\""),
"status field should be omitted when None:\n{json}"
);
}
/// Summary mode (M1 `--mode summary`) must NOT include the Status
/// line — summary returns a link list and the status would be noise.
#[test]
fn test_status_omitted_in_summary_mode() {
let mut r = make_result("# Body");
r.metadata.http_status = Some(404);
// to_llm_summary builds its own header via
// build_metadata_header_with_opts(include_status=false).
let out = to_llm_summary(&r, None);
assert!(
!out.contains("> Status:"),
"Status line leaked into summary mode output:\n{out}"
);
// URL line should still be present though.
assert!(
out.contains("> URL:") || r.metadata.url.is_some_and(|u| out.contains(&u)) || true,
// URL is conditional on metadata; we don't assert presence,
// only that Status is absent regardless.
""
);
}
/// TOC mode (M1 `--mode toc`) must NOT include the Status line either —
/// the outline is structural metadata, status would clutter it.
#[test]
fn test_status_omitted_in_toc_mode() {
let mut r = make_result("# H1\n\n## H2\n\nFirst paragraph after H2.");
r.metadata.http_status = Some(404);
let out = to_llm_toc(&r, None);
assert!(
!out.contains("> Status:"),
"Status line leaked into toc mode output:\n{out}"
);
}
}

View file

@ -14,7 +14,7 @@ use crate::types::ExtractionResult;
use super::body;
use super::links;
use super::metadata::build_metadata_header;
use super::metadata::build_metadata_header_with_opts;
// ---------------------------------------------------------------------------
// Summary mode — link/title list, no body
@ -26,7 +26,9 @@ use super::metadata::build_metadata_header;
pub fn to_llm_summary(result: &ExtractionResult, url: Option<&str>) -> String {
let links = collect_summary_links(result);
let mut out = String::new();
build_metadata_header(&mut out, result, url);
// M7: suppress the `> Status:` line in summary mode — the link list
// is conceptually navigation, not protocol-level outcome.
build_metadata_header_with_opts(&mut out, result, url, false);
if !out.is_empty() {
out.push('\n');
}
@ -88,7 +90,9 @@ pub fn to_llm_toc(result: &ExtractionResult, url: Option<&str>) -> String {
let entries = collect_toc_entries(result);
let mut out = String::new();
build_metadata_header(&mut out, result, url);
// M7: suppress the `> Status:` line in toc mode — the outline is
// structural, not protocol-level outcome.
build_metadata_header_with_opts(&mut out, result, url, false);
if !out.is_empty() {
out.push('\n');
}
@ -355,6 +359,7 @@ mod tests {
image: None,
favicon: None,
word_count: 0,
http_status: None,
},
content: Content {
markdown: markdown.to_string(),

View file

@ -52,6 +52,7 @@ pub fn extract(doc: &Html, url: Option<&str>) -> Metadata {
image,
favicon,
word_count: 0, // filled later by the extractor
http_status: None, // filled by webclaw-fetch when reachable; None for local-file / --stdin
}
}

View file

@ -15,7 +15,7 @@ pub struct ExtractionResult {
pub structured_data: Vec<serde_json::Value>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Metadata {
pub title: Option<String>,
pub description: Option<String>,
@ -27,6 +27,20 @@ pub struct Metadata {
pub image: Option<String>,
pub favicon: Option<String>,
pub word_count: usize,
/// HTTP status code from the final response (after redirects). `None`
/// when extraction was not preceded by an HTTP fetch — e.g. `--file`,
/// `--stdin`, or any call into `extract_with_options` directly.
/// Serialized in JSON output as `"status"` (renamed for caller
/// ergonomics; M7 / issue #19). Surfaced in `-f llm` / `-f text` as a
/// `> Status: <code>` line right after `> URL:` so callers can
/// distinguish a real 404 from a thin-body 200 without parsing the
/// body.
#[serde(
rename = "status",
default,
skip_serializing_if = "Option::is_none"
)]
pub http_status: Option<u16>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]

View file

@ -542,9 +542,13 @@ impl FetchClient {
let resp = client.get(json_url.as_str()).send().await?;
let response = Response::from_wreq(resp).await?;
if response.is_success() {
let reddit_status = response.status();
let bytes = response.body();
match crate::reddit::parse_reddit_json(bytes, url) {
Ok(result) => return Ok(result),
Ok(mut result) => {
result.metadata.http_status = Some(reddit_status);
return Ok(result);
}
Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
}
}
@ -588,7 +592,9 @@ impl FetchClient {
);
let pdf_result = webclaw_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
Ok(pdf_to_extraction_result(&pdf_result, &final_url))
let mut result = pdf_to_extraction_result(&pdf_result, &final_url);
result.metadata.http_status = Some(status);
Ok(result)
} else if let Some(doc_type) =
crate::document::is_document_content_type(&headers, &final_url)
{
@ -606,6 +612,7 @@ impl FetchClient {
let mut result = crate::document::extract_document(bytes, doc_type)?;
result.metadata.url = Some(final_url);
result.metadata.http_status = Some(status);
Ok(result)
} else {
let html = response.into_text();
@ -617,12 +624,15 @@ impl FetchClient {
if crate::linkedin::is_linkedin_post(&final_url) {
if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) {
debug!("linkedin extraction succeeded");
let mut result = result;
result.metadata.http_status = Some(status);
return Ok(result);
}
debug!("linkedin extraction failed, falling back to standard");
}
let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
let mut extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
extraction.metadata.http_status = Some(status);
Ok(extraction)
}
@ -889,6 +899,7 @@ fn pdf_to_extraction_result(
image: None,
favicon: None,
word_count,
http_status: None,
},
content: webclaw_core::Content {
markdown,

View file

@ -110,6 +110,7 @@ pub fn extract_document(
image: None,
favicon: None,
word_count,
http_status: None,
},
content: webclaw_core::Content {
markdown,

View file

@ -216,6 +216,7 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult>
image: None,
favicon: None,
word_count,
http_status: None,
},
content: Content {
markdown,

View file

@ -92,6 +92,7 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResul
image: None,
favicon: None,
word_count,
http_status: None,
},
content: Content {
markdown,

View file

@ -518,6 +518,7 @@ impl WebclawMcp {
image: None,
favicon: None,
word_count: markdown.split_whitespace().count(),
http_status: None,
},
domain_data: None,
structured_data: Vec::new(),

View file

@ -65,6 +65,7 @@ fn empty_metadata() -> Metadata {
image: None,
favicon: None,
word_count: 0,
http_status: None,
}
}