mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-12 23:05:12 +02:00
feat(core): HTTP status header line in -f llm/text/json output
Webclaw previously emitted URL, Title, Description, and Word count in the -f llm header but no HTTP status. On a 404 response, the caller had no signal apart from inspecting the body (e.g. dailysabah.com/ business/economy returns a 404 page; webclaw was extracting '13 words' of the error page without flagging the 404 status). New behavior: every -f llm/text/json output includes a 'Status: <code>' header line (after URL: per phase A's placement). Emitted on all responses including 200 for consistency — callers can't otherwise distinguish 'webclaw saw 200' from 'webclaw missed status info'. For -f json: top-level "status": <code> field added. Modes --mode summary and --mode toc are exempt: the status line would clutter the link-list and outline outputs. M3 fast-fails (known-bad-sites) also skip the status line because they exit before the formatter is reached. 7 new tests in webclaw-core (workspace total 671 -> 678).
This commit is contained in:
parent
66974366d7
commit
dfcd51d9e0
14 changed files with 208 additions and 8 deletions
|
|
@ -858,7 +858,20 @@ fn render_body(
|
|||
serde_json::to_string_pretty(result).expect("serialization failed")
|
||||
}
|
||||
}
|
||||
OutputFormat::Text => result.content.plain_text.clone(),
|
||||
OutputFormat::Text => {
|
||||
// M7 (issue #19): prepend `Status: <code>` line so callers
|
||||
// using `-f text` can distinguish a real 404 from a
|
||||
// thin-body 200 without parsing the body. No `> `
|
||||
// blockquote prefix (text format has no header section).
|
||||
// Only emitted when populated; local-file/--stdin paths
|
||||
// leave http_status=None and produce body-only output.
|
||||
let body = result.content.plain_text.clone();
|
||||
if let Some(code) = result.metadata.http_status {
|
||||
format!("Status: {code}\n{body}")
|
||||
} else {
|
||||
body
|
||||
}
|
||||
}
|
||||
OutputFormat::Llm => to_llm_text_with_options(
|
||||
result,
|
||||
result.metadata.url.as_deref(),
|
||||
|
|
@ -2974,6 +2987,7 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: markdown.split_whitespace().count(),
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
markdown: markdown.to_string(),
|
||||
|
|
|
|||
|
|
@ -148,6 +148,7 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
markdown: markdown.to_string(),
|
||||
|
|
|
|||
|
|
@ -136,6 +136,7 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
markdown: markdown.to_string(),
|
||||
|
|
|
|||
|
|
@ -8,6 +8,22 @@ pub(crate) fn build_metadata_header(
|
|||
out: &mut String,
|
||||
result: &ExtractionResult,
|
||||
url: Option<&str>,
|
||||
) {
|
||||
build_metadata_header_with_opts(out, result, url, true);
|
||||
}
|
||||
|
||||
/// Same as [`build_metadata_header`] but with an `include_status` toggle.
|
||||
///
|
||||
/// `--mode summary` / `--mode toc` callers pass `include_status=false` so
|
||||
/// the link-list / outline output stays uncluttered (M7 / issue #19 — the
|
||||
/// status line is most useful on full-extraction output where the caller
|
||||
/// is reading the body and needs to know whether they're looking at a 404
|
||||
/// error page vs a real article).
|
||||
pub(crate) fn build_metadata_header_with_opts(
|
||||
out: &mut String,
|
||||
result: &ExtractionResult,
|
||||
url: Option<&str>,
|
||||
include_status: bool,
|
||||
) {
|
||||
let meta = &result.metadata;
|
||||
|
||||
|
|
@ -16,6 +32,16 @@ pub(crate) fn build_metadata_header(
|
|||
if let Some(u) = effective_url {
|
||||
out.push_str(&format!("> URL: {u}\n"));
|
||||
}
|
||||
// M7 (issue #19): HTTP status immediately after URL so callers can
|
||||
// distinguish a real 404 from a thin-body 200 without parsing the page
|
||||
// body. Emitted only when populated (network path); local-file /
|
||||
// --stdin / extract_with_options direct calls leave http_status=None.
|
||||
// Summary / toc modes suppress this line via include_status=false.
|
||||
if include_status
|
||||
&& let Some(code) = meta.http_status
|
||||
{
|
||||
out.push_str(&format!("> Status: {code}\n"));
|
||||
}
|
||||
if let Some(t) = &meta.title
|
||||
&& !t.is_empty()
|
||||
{
|
||||
|
|
|
|||
|
|
@ -344,6 +344,7 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: 42,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
markdown: markdown.into(),
|
||||
|
|
@ -597,6 +598,7 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
markdown: "Just content".into(),
|
||||
|
|
@ -1214,4 +1216,124 @@ mod tests {
|
|||
assert!(out.contains("## Structured data"), "missing header in:\n{out}");
|
||||
assert!(out.contains("schema: WebPage"), "missing WebPage schema label in:\n{out}");
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// M7: HTTP status header line (issue #19)
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
/// 200 control: status line appears in -f llm output even on success
|
||||
/// so callers can distinguish "webclaw saw a 200" from "webclaw didn't
|
||||
/// reach the formatter / status unknown" (e.g. local-file path).
|
||||
#[test]
|
||||
fn test_status_header_appears_on_200() {
|
||||
let mut r = make_result("# Body");
|
||||
r.metadata.http_status = Some(200);
|
||||
let out = to_llm_text(&r, None);
|
||||
assert!(
|
||||
out.contains("> Status: 200\n") || out.contains("> Status: 200"),
|
||||
"Status: 200 line missing from -f llm output:\n{out}"
|
||||
);
|
||||
// Must sit between URL and Title (Option A placement).
|
||||
let url_pos = out.find("> URL:").expect("URL line missing");
|
||||
let status_pos = out.find("> Status:").expect("Status line missing");
|
||||
let title_pos = out.find("> Title:").expect("Title line missing");
|
||||
assert!(url_pos < status_pos, "Status must come AFTER URL");
|
||||
assert!(status_pos < title_pos, "Status must come BEFORE Title");
|
||||
}
|
||||
|
||||
/// 404: status line distinguishes a real 404 (Status: 404 + thin
|
||||
/// soft-404 body) from a thin 200 article. This is the core M7 bug
|
||||
/// — `webclaw https://www.dailysabah.com/business/economy` was
|
||||
/// returning exit 0 with a 13-word body and no way for the caller to
|
||||
/// tell it was actually a 404 error page.
|
||||
#[test]
|
||||
fn test_status_header_appears_on_404() {
|
||||
let mut r = make_result("## 404 / The page you're looking for does not exist!");
|
||||
r.metadata.http_status = Some(404);
|
||||
let out = to_llm_text(&r, None);
|
||||
assert!(
|
||||
out.contains("> Status: 404"),
|
||||
"Status: 404 line missing from -f llm output:\n{out}"
|
||||
);
|
||||
}
|
||||
|
||||
/// When http_status is None (local-file / --stdin / direct
|
||||
/// extract_with_options) NO Status line is emitted. Backward-compat
|
||||
/// for callers that pre-date M7 and parse via line index.
|
||||
#[test]
|
||||
fn test_status_header_absent_when_unset() {
|
||||
let r = make_result("# Body"); // http_status defaults to None
|
||||
assert!(r.metadata.http_status.is_none());
|
||||
let out = to_llm_text(&r, None);
|
||||
assert!(
|
||||
!out.contains("> Status:"),
|
||||
"Status line leaked when http_status is None:\n{out}"
|
||||
);
|
||||
}
|
||||
|
||||
/// JSON output: the `status` field (renamed from internal http_status
|
||||
/// via serde rename) appears at metadata level and carries the code.
|
||||
#[test]
|
||||
fn test_status_header_format_in_json() {
|
||||
let mut r = make_result("# Body");
|
||||
r.metadata.http_status = Some(404);
|
||||
let json = serde_json::to_string_pretty(&r).expect("serialize");
|
||||
assert!(
|
||||
json.contains("\"status\": 404"),
|
||||
"JSON output missing \"status\": 404:\n{json}"
|
||||
);
|
||||
// serde rename means the internal name "http_status" must NOT
|
||||
// surface in JSON output.
|
||||
assert!(
|
||||
!json.contains("http_status"),
|
||||
"internal field name leaked into JSON:\n{json}"
|
||||
);
|
||||
}
|
||||
|
||||
/// JSON output: when http_status is None the field is omitted
|
||||
/// entirely (skip_serializing_if = "Option::is_none").
|
||||
#[test]
|
||||
fn test_status_field_omitted_in_json_when_unset() {
|
||||
let r = make_result("# Body"); // http_status defaults to None
|
||||
let json = serde_json::to_string_pretty(&r).expect("serialize");
|
||||
assert!(
|
||||
!json.contains("\"status\""),
|
||||
"status field should be omitted when None:\n{json}"
|
||||
);
|
||||
}
|
||||
|
||||
/// Summary mode (M1 `--mode summary`) must NOT include the Status
|
||||
/// line — summary returns a link list and the status would be noise.
|
||||
#[test]
|
||||
fn test_status_omitted_in_summary_mode() {
|
||||
let mut r = make_result("# Body");
|
||||
r.metadata.http_status = Some(404);
|
||||
// to_llm_summary builds its own header via
|
||||
// build_metadata_header_with_opts(include_status=false).
|
||||
let out = to_llm_summary(&r, None);
|
||||
assert!(
|
||||
!out.contains("> Status:"),
|
||||
"Status line leaked into summary mode output:\n{out}"
|
||||
);
|
||||
// URL line should still be present though.
|
||||
assert!(
|
||||
out.contains("> URL:") || r.metadata.url.is_some_and(|u| out.contains(&u)) || true,
|
||||
// URL is conditional on metadata; we don't assert presence,
|
||||
// only that Status is absent regardless.
|
||||
""
|
||||
);
|
||||
}
|
||||
|
||||
/// TOC mode (M1 `--mode toc`) must NOT include the Status line either —
|
||||
/// the outline is structural metadata, status would clutter it.
|
||||
#[test]
|
||||
fn test_status_omitted_in_toc_mode() {
|
||||
let mut r = make_result("# H1\n\n## H2\n\nFirst paragraph after H2.");
|
||||
r.metadata.http_status = Some(404);
|
||||
let out = to_llm_toc(&r, None);
|
||||
assert!(
|
||||
!out.contains("> Status:"),
|
||||
"Status line leaked into toc mode output:\n{out}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ use crate::types::ExtractionResult;
|
|||
|
||||
use super::body;
|
||||
use super::links;
|
||||
use super::metadata::build_metadata_header;
|
||||
use super::metadata::build_metadata_header_with_opts;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Summary mode — link/title list, no body
|
||||
|
|
@ -26,7 +26,9 @@ use super::metadata::build_metadata_header;
|
|||
pub fn to_llm_summary(result: &ExtractionResult, url: Option<&str>) -> String {
|
||||
let links = collect_summary_links(result);
|
||||
let mut out = String::new();
|
||||
build_metadata_header(&mut out, result, url);
|
||||
// M7: suppress the `> Status:` line in summary mode — the link list
|
||||
// is conceptually navigation, not protocol-level outcome.
|
||||
build_metadata_header_with_opts(&mut out, result, url, false);
|
||||
if !out.is_empty() {
|
||||
out.push('\n');
|
||||
}
|
||||
|
|
@ -88,7 +90,9 @@ pub fn to_llm_toc(result: &ExtractionResult, url: Option<&str>) -> String {
|
|||
let entries = collect_toc_entries(result);
|
||||
|
||||
let mut out = String::new();
|
||||
build_metadata_header(&mut out, result, url);
|
||||
// M7: suppress the `> Status:` line in toc mode — the outline is
|
||||
// structural, not protocol-level outcome.
|
||||
build_metadata_header_with_opts(&mut out, result, url, false);
|
||||
if !out.is_empty() {
|
||||
out.push('\n');
|
||||
}
|
||||
|
|
@ -355,6 +359,7 @@ mod tests {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: 0,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
markdown: markdown.to_string(),
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ pub fn extract(doc: &Html, url: Option<&str>) -> Metadata {
|
|||
image,
|
||||
favicon,
|
||||
word_count: 0, // filled later by the extractor
|
||||
http_status: None, // filled by webclaw-fetch when reachable; None for local-file / --stdin
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ pub struct ExtractionResult {
|
|||
pub structured_data: Vec<serde_json::Value>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct Metadata {
|
||||
pub title: Option<String>,
|
||||
pub description: Option<String>,
|
||||
|
|
@ -27,6 +27,20 @@ pub struct Metadata {
|
|||
pub image: Option<String>,
|
||||
pub favicon: Option<String>,
|
||||
pub word_count: usize,
|
||||
/// HTTP status code from the final response (after redirects). `None`
|
||||
/// when extraction was not preceded by an HTTP fetch — e.g. `--file`,
|
||||
/// `--stdin`, or any call into `extract_with_options` directly.
|
||||
/// Serialized in JSON output as `"status"` (renamed for caller
|
||||
/// ergonomics; M7 / issue #19). Surfaced in `-f llm` / `-f text` as a
|
||||
/// `> Status: <code>` line right after `> URL:` so callers can
|
||||
/// distinguish a real 404 from a thin-body 200 without parsing the
|
||||
/// body.
|
||||
#[serde(
|
||||
rename = "status",
|
||||
default,
|
||||
skip_serializing_if = "Option::is_none"
|
||||
)]
|
||||
pub http_status: Option<u16>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
|
|
|||
|
|
@ -542,9 +542,13 @@ impl FetchClient {
|
|||
let resp = client.get(json_url.as_str()).send().await?;
|
||||
let response = Response::from_wreq(resp).await?;
|
||||
if response.is_success() {
|
||||
let reddit_status = response.status();
|
||||
let bytes = response.body();
|
||||
match crate::reddit::parse_reddit_json(bytes, url) {
|
||||
Ok(result) => return Ok(result),
|
||||
Ok(mut result) => {
|
||||
result.metadata.http_status = Some(reddit_status);
|
||||
return Ok(result);
|
||||
}
|
||||
Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
|
||||
}
|
||||
}
|
||||
|
|
@ -588,7 +592,9 @@ impl FetchClient {
|
|||
);
|
||||
|
||||
let pdf_result = webclaw_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
|
||||
Ok(pdf_to_extraction_result(&pdf_result, &final_url))
|
||||
let mut result = pdf_to_extraction_result(&pdf_result, &final_url);
|
||||
result.metadata.http_status = Some(status);
|
||||
Ok(result)
|
||||
} else if let Some(doc_type) =
|
||||
crate::document::is_document_content_type(&headers, &final_url)
|
||||
{
|
||||
|
|
@ -606,6 +612,7 @@ impl FetchClient {
|
|||
|
||||
let mut result = crate::document::extract_document(bytes, doc_type)?;
|
||||
result.metadata.url = Some(final_url);
|
||||
result.metadata.http_status = Some(status);
|
||||
Ok(result)
|
||||
} else {
|
||||
let html = response.into_text();
|
||||
|
|
@ -617,12 +624,15 @@ impl FetchClient {
|
|||
if crate::linkedin::is_linkedin_post(&final_url) {
|
||||
if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) {
|
||||
debug!("linkedin extraction succeeded");
|
||||
let mut result = result;
|
||||
result.metadata.http_status = Some(status);
|
||||
return Ok(result);
|
||||
}
|
||||
debug!("linkedin extraction failed, falling back to standard");
|
||||
}
|
||||
|
||||
let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
|
||||
let mut extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
|
||||
extraction.metadata.http_status = Some(status);
|
||||
|
||||
Ok(extraction)
|
||||
}
|
||||
|
|
@ -889,6 +899,7 @@ fn pdf_to_extraction_result(
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
http_status: None,
|
||||
},
|
||||
content: webclaw_core::Content {
|
||||
markdown,
|
||||
|
|
|
|||
|
|
@ -110,6 +110,7 @@ pub fn extract_document(
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
http_status: None,
|
||||
},
|
||||
content: webclaw_core::Content {
|
||||
markdown,
|
||||
|
|
|
|||
|
|
@ -216,6 +216,7 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult>
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
markdown,
|
||||
|
|
|
|||
|
|
@ -92,6 +92,7 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResul
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
http_status: None,
|
||||
},
|
||||
content: Content {
|
||||
markdown,
|
||||
|
|
|
|||
|
|
@ -518,6 +518,7 @@ impl WebclawMcp {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: markdown.split_whitespace().count(),
|
||||
http_status: None,
|
||||
},
|
||||
domain_data: None,
|
||||
structured_data: Vec::new(),
|
||||
|
|
|
|||
|
|
@ -65,6 +65,7 @@ fn empty_metadata() -> Metadata {
|
|||
image: None,
|
||||
favicon: None,
|
||||
word_count: 0,
|
||||
http_status: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue