diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 6855fca..bc20249 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -858,7 +858,20 @@ fn render_body( serde_json::to_string_pretty(result).expect("serialization failed") } } - OutputFormat::Text => result.content.plain_text.clone(), + OutputFormat::Text => { + // M7 (issue #19): prepend a `Status: {code}` line so callers + // using `-f text` can distinguish a real 404 from a + // thin-body 200 without parsing the body. No `> ` + // blockquote prefix (text format has no header section). + // Only emitted when populated; local-file/--stdin paths + // leave http_status=None and produce body-only output. + // Borrow the body so it is copied exactly once on either path. + let body = &result.content.plain_text; + match result.metadata.http_status { + Some(code) => format!("Status: {code}\n{body}"), + None => body.clone(), + } + } OutputFormat::Llm => to_llm_text_with_options( result, result.metadata.url.as_deref(), @@ -2974,6 +2987,7 @@ mod tests { image: None, favicon: None, word_count: markdown.split_whitespace().count(), + http_status: None, }, content: Content { markdown: markdown.to_string(), diff --git a/crates/webclaw-core/src/diff.rs b/crates/webclaw-core/src/diff.rs index c6a5d71..e888d1c 100644 --- a/crates/webclaw-core/src/diff.rs +++ b/crates/webclaw-core/src/diff.rs @@ -148,6 +148,7 @@ mod tests { image: None, favicon: None, word_count, + http_status: None, }, content: Content { markdown: markdown.to_string(), diff --git a/crates/webclaw-core/src/llm/hub_detect.rs b/crates/webclaw-core/src/llm/hub_detect.rs index b8c7211..273d626 100644 --- a/crates/webclaw-core/src/llm/hub_detect.rs +++ b/crates/webclaw-core/src/llm/hub_detect.rs @@ -136,6 +136,7 @@ mod tests { image: None, favicon: None, word_count: 0, + http_status: None, }, content: Content { markdown: markdown.to_string(), diff --git a/crates/webclaw-core/src/llm/metadata.rs b/crates/webclaw-core/src/llm/metadata.rs index 6f7666e..9ae27a2 100644 --- 
a/crates/webclaw-core/src/llm/metadata.rs +++ b/crates/webclaw-core/src/llm/metadata.rs @@ -8,6 +8,22 @@ pub(crate) fn build_metadata_header( out: &mut String, result: &ExtractionResult, url: Option<&str>, +) { + build_metadata_header_with_opts(out, result, url, true); +} + +/// Same as [`build_metadata_header`], which forwards here with +/// `include_status = true`, but lets the caller drop the `> Status:` line. +/// +/// `--mode summary` / `--mode toc` callers pass `include_status = false` so +/// the link-list / outline output stays uncluttered (M7 / issue #19): the +/// status line matters most on full-extraction output, where the reader +/// needs to tell a 404 error page from a real article. +pub(crate) fn build_metadata_header_with_opts( + out: &mut String, + result: &ExtractionResult, + url: Option<&str>, + include_status: bool, ) { let meta = &result.metadata; @@ -16,6 +32,16 @@ pub(crate) fn build_metadata_header( if let Some(u) = effective_url { out.push_str(&format!("> URL: {u}\n")); } + // M7 (issue #19): HTTP status immediately after URL so callers can + // distinguish a real 404 from a thin-body 200 without parsing the page + // body. Emitted only when populated (network path); local-file / + // --stdin / extract_with_options direct calls leave http_status=None. + // Summary / toc modes suppress this line via include_status=false. 
+ if include_status + && let Some(code) = meta.http_status + { + out.push_str(&format!("> Status: {code}\n")); + } if let Some(t) = &meta.title && !t.is_empty() { diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs index 507f258..f92ccd1 100644 --- a/crates/webclaw-core/src/llm/mod.rs +++ b/crates/webclaw-core/src/llm/mod.rs @@ -344,6 +344,7 @@ mod tests { image: None, favicon: None, word_count: 42, + http_status: None, }, content: Content { markdown: markdown.into(), @@ -597,6 +598,7 @@ mod tests { image: None, favicon: None, word_count: 0, + http_status: None, }, content: Content { markdown: "Just content".into(), @@ -1214,4 +1216,124 @@ mod tests { assert!(out.contains("## Structured data"), "missing header in:\n{out}"); assert!(out.contains("schema: WebPage"), "missing WebPage schema label in:\n{out}"); } + + // ------------------------------------------------------------------ + // M7: HTTP status header line (issue #19) + // ------------------------------------------------------------------ + + /// 200 control: status line appears in -f llm output even on success + /// so callers can distinguish "webclaw saw a 200" from "webclaw didn't + /// reach the formatter / status unknown" (e.g. local-file path). + #[test] + fn test_status_header_appears_on_200() { + let mut r = make_result("# Body"); + r.metadata.http_status = Some(200); + let out = to_llm_text(&r, None); + assert!( + out.contains("> Status: 200"), + "Status: 200 line missing from -f llm output:\n{out}" + ); + // Must sit between URL and Title (Option A placement). 
+ let url_pos = out.find("> URL:").expect("URL line missing"); + let status_pos = out.find("> Status:").expect("Status line missing"); + let title_pos = out.find("> Title:").expect("Title line missing"); + assert!(url_pos < status_pos, "Status must come AFTER URL"); + assert!(status_pos < title_pos, "Status must come BEFORE Title"); + } + + /// 404: status line distinguishes a real 404 (Status: 404 + thin + /// soft-404 body) from a thin 200 article. This is the core M7 bug + /// — `webclaw https://www.dailysabah.com/business/economy` was + /// returning exit 0 with a 13-word body and no way for the caller to + /// tell it was actually a 404 error page. + #[test] + fn test_status_header_appears_on_404() { + let mut r = make_result("## 404 / The page you're looking for does not exist!"); + r.metadata.http_status = Some(404); + let out = to_llm_text(&r, None); + assert!( + out.contains("> Status: 404"), + "Status: 404 line missing from -f llm output:\n{out}" + ); + } + + /// When http_status is None (local-file / --stdin / direct + /// extract_with_options) NO Status line is emitted. Backward-compat + /// for callers that pre-date M7 and parse via line index. + #[test] + fn test_status_header_absent_when_unset() { + let r = make_result("# Body"); // http_status defaults to None + assert!(r.metadata.http_status.is_none()); + let out = to_llm_text(&r, None); + assert!( + !out.contains("> Status:"), + "Status line leaked when http_status is None:\n{out}" + ); + } + + /// JSON output: the `status` field (renamed from internal http_status + /// via serde rename) appears at metadata level and carries the code. 
+ #[test] + fn test_status_header_format_in_json() { + let mut r = make_result("# Body"); + r.metadata.http_status = Some(404); + let json = serde_json::to_string_pretty(&r).expect("serialize"); + assert!( + json.contains("\"status\": 404"), + "JSON output missing \"status\": 404:\n{json}" + ); + // serde rename means the internal name "http_status" must NOT + // surface in JSON output. + assert!( + !json.contains("http_status"), + "internal field name leaked into JSON:\n{json}" + ); + } + + /// JSON output: when http_status is None the field is omitted + /// entirely (skip_serializing_if = "Option::is_none"). + #[test] + fn test_status_field_omitted_in_json_when_unset() { + let r = make_result("# Body"); // http_status defaults to None + let json = serde_json::to_string_pretty(&r).expect("serialize"); + assert!( + !json.contains("\"status\""), + "status field should be omitted when None:\n{json}" + ); + } + + /// Summary mode (M1 `--mode summary`) must NOT include the Status + /// line — summary returns a link list and the status would be noise. + #[test] + fn test_status_omitted_in_summary_mode() { + let mut r = make_result("# Body"); + r.metadata.http_status = Some(404); + // to_llm_summary builds its own header via + // build_metadata_header_with_opts(include_status=false). + let out = to_llm_summary(&r, None); + assert!( + !out.contains("> Status:"), + "Status line leaked into summary mode output:\n{out}" + ); + // Only Status is suppressed: a URL on the fixture must still render. + if r.metadata.url.is_some() { + assert!( + out.contains("> URL:"), + "URL line missing from summary mode output:\n{out}" + ); + } + } + + /// TOC mode (M1 `--mode toc`) must NOT include the Status line either — + /// the outline is structural metadata, status would clutter it. 
+ #[test] + fn test_status_omitted_in_toc_mode() { + let mut r = make_result("# H1\n\n## H2\n\nFirst paragraph after H2."); + r.metadata.http_status = Some(404); + let out = to_llm_toc(&r, None); + assert!( + !out.contains("> Status:"), + "Status line leaked into toc mode output:\n{out}" + ); + } } diff --git a/crates/webclaw-core/src/llm/output_size.rs b/crates/webclaw-core/src/llm/output_size.rs index 4f3b94f..f9aa7ed 100644 --- a/crates/webclaw-core/src/llm/output_size.rs +++ b/crates/webclaw-core/src/llm/output_size.rs @@ -14,7 +14,7 @@ use crate::types::ExtractionResult; use super::body; use super::links; -use super::metadata::build_metadata_header; +use super::metadata::build_metadata_header_with_opts; // --------------------------------------------------------------------------- // Summary mode — link/title list, no body @@ -26,7 +26,9 @@ use super::metadata::build_metadata_header; pub fn to_llm_summary(result: &ExtractionResult, url: Option<&str>) -> String { let links = collect_summary_links(result); let mut out = String::new(); - build_metadata_header(&mut out, result, url); + // M7: suppress the `> Status:` line in summary mode — the link list + // is conceptually navigation, not protocol-level outcome. + build_metadata_header_with_opts(&mut out, result, url, false); if !out.is_empty() { out.push('\n'); } @@ -88,7 +90,9 @@ pub fn to_llm_toc(result: &ExtractionResult, url: Option<&str>) -> String { let entries = collect_toc_entries(result); let mut out = String::new(); - build_metadata_header(&mut out, result, url); + // M7: suppress the `> Status:` line in toc mode — the outline is + // structural, not protocol-level outcome. 
+ build_metadata_header_with_opts(&mut out, result, url, false); if !out.is_empty() { out.push('\n'); } @@ -355,6 +359,7 @@ mod tests { image: None, favicon: None, word_count: 0, + http_status: None, }, content: Content { markdown: markdown.to_string(), diff --git a/crates/webclaw-core/src/metadata.rs b/crates/webclaw-core/src/metadata.rs index c7f142b..ccac3c7 100644 --- a/crates/webclaw-core/src/metadata.rs +++ b/crates/webclaw-core/src/metadata.rs @@ -52,6 +52,7 @@ pub fn extract(doc: &Html, url: Option<&str>) -> Metadata { image, favicon, word_count: 0, // filled later by the extractor + http_status: None, // filled by webclaw-fetch when reachable; None for local-file / --stdin } } diff --git a/crates/webclaw-core/src/types.rs b/crates/webclaw-core/src/types.rs index ebe7a92..927f302 100644 --- a/crates/webclaw-core/src/types.rs +++ b/crates/webclaw-core/src/types.rs @@ -15,7 +15,7 @@ pub struct ExtractionResult { pub structured_data: Vec, } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct Metadata { pub title: Option, pub description: Option, @@ -27,6 +27,20 @@ pub struct Metadata { pub image: Option, pub favicon: Option, pub word_count: usize, + /// HTTP status code from the final response (after redirects). `None` + /// when extraction was not preceded by an HTTP fetch — e.g. `--file`, + /// `--stdin`, or any call into `extract_with_options` directly. + /// Serialized in JSON output as `"status"` (renamed for caller + /// ergonomics; M7 / issue #19). Surfaced in `-f llm` as a + /// `> Status: {code}` line right after `> URL:` and in `-f text` as a + /// leading `Status: {code}` line (no `> ` prefix), so callers can tell + /// a real 404 from a thin-body 200 without parsing the body. 
+ #[serde( + rename = "status", + default, + skip_serializing_if = "Option::is_none" + )] + pub http_status: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index 20ef276..e7d35d1 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -542,9 +542,13 @@ impl FetchClient { let resp = client.get(json_url.as_str()).send().await?; let response = Response::from_wreq(resp).await?; if response.is_success() { + let reddit_status = response.status(); let bytes = response.body(); match crate::reddit::parse_reddit_json(bytes, url) { - Ok(result) => return Ok(result), + Ok(mut result) => { + result.metadata.http_status = Some(reddit_status); + return Ok(result); + } Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"), } } @@ -588,7 +592,9 @@ impl FetchClient { ); let pdf_result = webclaw_pdf::extract_pdf(bytes, self.pdf_mode.clone())?; - Ok(pdf_to_extraction_result(&pdf_result, &final_url)) + let mut result = pdf_to_extraction_result(&pdf_result, &final_url); + result.metadata.http_status = Some(status); + Ok(result) } else if let Some(doc_type) = crate::document::is_document_content_type(&headers, &final_url) { @@ -606,6 +612,7 @@ impl FetchClient { let mut result = crate::document::extract_document(bytes, doc_type)?; result.metadata.url = Some(final_url); + result.metadata.http_status = Some(status); Ok(result) } else { let html = response.into_text(); @@ -617,12 +624,15 @@ impl FetchClient { if crate::linkedin::is_linkedin_post(&final_url) { if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) { debug!("linkedin extraction succeeded"); + let mut result = result; + result.metadata.http_status = Some(status); return Ok(result); } debug!("linkedin extraction failed, falling back to standard"); } - let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?; + let mut 
extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?; + extraction.metadata.http_status = Some(status); Ok(extraction) } @@ -889,6 +899,7 @@ fn pdf_to_extraction_result( image: None, favicon: None, word_count, + http_status: None, }, content: webclaw_core::Content { markdown, diff --git a/crates/webclaw-fetch/src/document.rs b/crates/webclaw-fetch/src/document.rs index 3d7d89d..0d9b232 100644 --- a/crates/webclaw-fetch/src/document.rs +++ b/crates/webclaw-fetch/src/document.rs @@ -110,6 +110,7 @@ pub fn extract_document( image: None, favicon: None, word_count, + http_status: None, }, content: webclaw_core::Content { markdown, diff --git a/crates/webclaw-fetch/src/linkedin.rs b/crates/webclaw-fetch/src/linkedin.rs index 0cabd1c..d8322b3 100644 --- a/crates/webclaw-fetch/src/linkedin.rs +++ b/crates/webclaw-fetch/src/linkedin.rs @@ -216,6 +216,7 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option image: None, favicon: None, word_count, + http_status: None, }, content: Content { markdown, diff --git a/crates/webclaw-fetch/src/reddit.rs b/crates/webclaw-fetch/src/reddit.rs index 7676ccd..5e934de 100644 --- a/crates/webclaw-fetch/src/reddit.rs +++ b/crates/webclaw-fetch/src/reddit.rs @@ -92,6 +92,7 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result Metadata { image: None, favicon: None, word_count: 0, + http_status: None, } }