feat(core): HTTP status header line in -f llm/text/json output

Webclaw previously emitted URL, Title, Description, and Word count in the -f llm header but no HTTP status. On a 404 response, the caller had no signal apart from inspecting the body (e.g. dailysabah.com/ business/economy returns a 404 page; webclaw was extracting '13 words' of the error page without flagging the 404 status). New behavior: every -f llm/text/json output includes a 'Status: <code>' header line (after URL: per phase A's placement). Emitted on all responses including 200 for consistency — callers can't otherwise distinguish 'webclaw saw 200' from 'webclaw missed status info'. For -f json: top-level "status": <code> field added. Modes --mode summary and --mode toc are exempt: the status line would clutter the link-list and outline outputs. M3 fast-fails (known-bad-sites) also skip the status line because they exit before the formatter is reached. 7 new tests in webclaw-core (workspace total 671 -> 678).
2026-06-12 23:05:12 +02:00 · 2026-05-23 21:29:26 +02:00 · 2026-05-23 21:29:26 +02:00 · dfcd51d9e0
commit dfcd51d9e0
parent 66974366d7
14 changed files with 208 additions and 8 deletions
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -858,7 +858,20 @@ fn render_body(
                    serde_json::to_string_pretty(result).expect("serialization failed")
                }
            }
-            OutputFormat::Text => result.content.plain_text.clone(),
+            OutputFormat::Text => {
+                // M7 (issue #19): prepend `Status: <code>` line so callers
+                // using `-f text` can distinguish a real 404 from a
+                // thin-body 200 without parsing the body. No `> `
+                // blockquote prefix (text format has no header section).
+                // Only emitted when populated; local-file/--stdin paths
+                // leave http_status=None and produce body-only output.
+                let body = result.content.plain_text.clone();
+                if let Some(code) = result.metadata.http_status {
+                    format!("Status: {code}\n{body}")
+                } else {
+                    body
+                }
+            }
            OutputFormat::Llm => to_llm_text_with_options(
                result,
                result.metadata.url.as_deref(),
@ -2974,6 +2987,7 @@ mod tests {
                image: None,
                favicon: None,
                word_count: markdown.split_whitespace().count(),
+                http_status: None,
            },
            content: Content {
                markdown: markdown.to_string(),
--- a/crates/webclaw-core/src/diff.rs
+++ b/crates/webclaw-core/src/diff.rs
@ -148,6 +148,7 @@ mod tests {
                image: None,
                favicon: None,
                word_count,
+                http_status: None,
            },
            content: Content {
                markdown: markdown.to_string(),
--- a/crates/webclaw-core/src/llm/hub_detect.rs
+++ b/crates/webclaw-core/src/llm/hub_detect.rs
@ -136,6 +136,7 @@ mod tests {
                image: None,
                favicon: None,
                word_count: 0,
+                http_status: None,
            },
            content: Content {
                markdown: markdown.to_string(),
--- a/crates/webclaw-core/src/llm/metadata.rs
+++ b/crates/webclaw-core/src/llm/metadata.rs
@ -8,6 +8,22 @@ pub(crate) fn build_metadata_header(
    out: &mut String,
    result: &ExtractionResult,
    url: Option<&str>,
+) {
+    build_metadata_header_with_opts(out, result, url, true);
+}
+
+/// Same as [`build_metadata_header`] but with an `include_status` toggle.
+///
+/// `--mode summary` / `--mode toc` callers pass `include_status=false` so
+/// the link-list / outline output stays uncluttered (M7 / issue #19 — the
+/// status line is most useful on full-extraction output where the caller
+/// is reading the body and needs to know whether they're looking at a 404
+/// error page vs a real article).
+pub(crate) fn build_metadata_header_with_opts(
+    out: &mut String,
+    result: &ExtractionResult,
+    url: Option<&str>,
+    include_status: bool,
 ) {
    let meta = &result.metadata;

@ -16,6 +32,16 @@ pub(crate) fn build_metadata_header(
    if let Some(u) = effective_url {
        out.push_str(&format!("> URL: {u}\n"));
    }
+    // M7 (issue #19): HTTP status immediately after URL so callers can
+    // distinguish a real 404 from a thin-body 200 without parsing the page
+    // body. Emitted only when populated (network path); local-file /
+    // --stdin / extract_with_options direct calls leave http_status=None.
+    // Summary / toc modes suppress this line via include_status=false.
+    if include_status
+        && let Some(code) = meta.http_status
+    {
+        out.push_str(&format!("> Status: {code}\n"));
+    }
    if let Some(t) = &meta.title
        && !t.is_empty()
    {
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@ -344,6 +344,7 @@ mod tests {
                image: None,
                favicon: None,
                word_count: 42,
+                http_status: None,
            },
            content: Content {
                markdown: markdown.into(),
@ -597,6 +598,7 @@ mod tests {
                image: None,
                favicon: None,
                word_count: 0,
+                http_status: None,
            },
            content: Content {
                markdown: "Just content".into(),
@ -1214,4 +1216,124 @@ mod tests {
        assert!(out.contains("## Structured data"), "missing header in:\n{out}");
        assert!(out.contains("schema: WebPage"), "missing WebPage schema label in:\n{out}");
    }
+
+    // ------------------------------------------------------------------
+    // M7: HTTP status header line (issue #19)
+    // ------------------------------------------------------------------
+
+    /// 200 control: status line appears in -f llm output even on success
+    /// so callers can distinguish "webclaw saw a 200" from "webclaw didn't
+    /// reach the formatter / status unknown" (e.g. local-file path).
+    #[test]
+    fn test_status_header_appears_on_200() {
+        let mut r = make_result("# Body");
+        r.metadata.http_status = Some(200);
+        let out = to_llm_text(&r, None);
+        assert!(
+            out.contains("> Status: 200\n") || out.contains("> Status: 200"),
+            "Status: 200 line missing from -f llm output:\n{out}"
+        );
+        // Must sit between URL and Title (Option A placement).
+        let url_pos = out.find("> URL:").expect("URL line missing");
+        let status_pos = out.find("> Status:").expect("Status line missing");
+        let title_pos = out.find("> Title:").expect("Title line missing");
+        assert!(url_pos < status_pos, "Status must come AFTER URL");
+        assert!(status_pos < title_pos, "Status must come BEFORE Title");
+    }
+
+    /// 404: status line distinguishes a real 404 (Status: 404 + thin
+    /// soft-404 body) from a thin 200 article. This is the core M7 bug
+    /// — `webclaw https://www.dailysabah.com/business/economy` was
+    /// returning exit 0 with a 13-word body and no way for the caller to
+    /// tell it was actually a 404 error page.
+    #[test]
+    fn test_status_header_appears_on_404() {
+        let mut r = make_result("## 404 / The page you're looking for does not exist!");
+        r.metadata.http_status = Some(404);
+        let out = to_llm_text(&r, None);
+        assert!(
+            out.contains("> Status: 404"),
+            "Status: 404 line missing from -f llm output:\n{out}"
+        );
+    }
+
+    /// When http_status is None (local-file / --stdin / direct
+    /// extract_with_options) NO Status line is emitted. Backward-compat
+    /// for callers that pre-date M7 and parse via line index.
+    #[test]
+    fn test_status_header_absent_when_unset() {
+        let r = make_result("# Body"); // http_status defaults to None
+        assert!(r.metadata.http_status.is_none());
+        let out = to_llm_text(&r, None);
+        assert!(
+            !out.contains("> Status:"),
+            "Status line leaked when http_status is None:\n{out}"
+        );
+    }
+
+    /// JSON output: the `status` field (renamed from internal http_status
+    /// via serde rename) appears at metadata level and carries the code.
+    #[test]
+    fn test_status_header_format_in_json() {
+        let mut r = make_result("# Body");
+        r.metadata.http_status = Some(404);
+        let json = serde_json::to_string_pretty(&r).expect("serialize");
+        assert!(
+            json.contains("\"status\": 404"),
+            "JSON output missing \"status\": 404:\n{json}"
+        );
+        // serde rename means the internal name "http_status" must NOT
+        // surface in JSON output.
+        assert!(
+            !json.contains("http_status"),
+            "internal field name leaked into JSON:\n{json}"
+        );
+    }
+
+    /// JSON output: when http_status is None the field is omitted
+    /// entirely (skip_serializing_if = "Option::is_none").
+    #[test]
+    fn test_status_field_omitted_in_json_when_unset() {
+        let r = make_result("# Body"); // http_status defaults to None
+        let json = serde_json::to_string_pretty(&r).expect("serialize");
+        assert!(
+            !json.contains("\"status\""),
+            "status field should be omitted when None:\n{json}"
+        );
+    }
+
+    /// Summary mode (M1 `--mode summary`) must NOT include the Status
+    /// line — summary returns a link list and the status would be noise.
+    #[test]
+    fn test_status_omitted_in_summary_mode() {
+        let mut r = make_result("# Body");
+        r.metadata.http_status = Some(404);
+        // to_llm_summary builds its own header via
+        // build_metadata_header_with_opts(include_status=false).
+        let out = to_llm_summary(&r, None);
+        assert!(
+            !out.contains("> Status:"),
+            "Status line leaked into summary mode output:\n{out}"
+        );
+        // URL line should still be present though.
+        assert!(
+            out.contains("> URL:") || r.metadata.url.is_some_and(|u| out.contains(&u)) || true,
+            // URL is conditional on metadata; we don't assert presence,
+            // only that Status is absent regardless.
+            ""
+        );
+    }
+
+    /// TOC mode (M1 `--mode toc`) must NOT include the Status line either —
+    /// the outline is structural metadata, status would clutter it.
+    #[test]
+    fn test_status_omitted_in_toc_mode() {
+        let mut r = make_result("# H1\n\n## H2\n\nFirst paragraph after H2.");
+        r.metadata.http_status = Some(404);
+        let out = to_llm_toc(&r, None);
+        assert!(
+            !out.contains("> Status:"),
+            "Status line leaked into toc mode output:\n{out}"
+        );
+    }
 }
--- a/crates/webclaw-core/src/llm/output_size.rs
+++ b/crates/webclaw-core/src/llm/output_size.rs
@ -14,7 +14,7 @@ use crate::types::ExtractionResult;

 use super::body;
 use super::links;
-use super::metadata::build_metadata_header;
+use super::metadata::build_metadata_header_with_opts;

 // ---------------------------------------------------------------------------
 // Summary mode — link/title list, no body
@ -26,7 +26,9 @@ use super::metadata::build_metadata_header;
 pub fn to_llm_summary(result: &ExtractionResult, url: Option<&str>) -> String {
    let links = collect_summary_links(result);
    let mut out = String::new();
-    build_metadata_header(&mut out, result, url);
+    // M7: suppress the `> Status:` line in summary mode — the link list
+    // is conceptually navigation, not protocol-level outcome.
+    build_metadata_header_with_opts(&mut out, result, url, false);
    if !out.is_empty() {
        out.push('\n');
    }
@ -88,7 +90,9 @@ pub fn to_llm_toc(result: &ExtractionResult, url: Option<&str>) -> String {
    let entries = collect_toc_entries(result);

    let mut out = String::new();
-    build_metadata_header(&mut out, result, url);
+    // M7: suppress the `> Status:` line in toc mode — the outline is
+    // structural, not protocol-level outcome.
+    build_metadata_header_with_opts(&mut out, result, url, false);
    if !out.is_empty() {
        out.push('\n');
    }
@ -355,6 +359,7 @@ mod tests {
                image: None,
                favicon: None,
                word_count: 0,
+                http_status: None,
            },
            content: Content {
                markdown: markdown.to_string(),
--- a/crates/webclaw-core/src/metadata.rs
+++ b/crates/webclaw-core/src/metadata.rs
@ -52,6 +52,7 @@ pub fn extract(doc: &Html, url: Option<&str>) -> Metadata {
        image,
        favicon,
        word_count: 0, // filled later by the extractor
+        http_status: None, // filled by webclaw-fetch when reachable; None for local-file / --stdin
    }
 }

--- a/crates/webclaw-core/src/types.rs
+++ b/crates/webclaw-core/src/types.rs
@ -15,7 +15,7 @@ pub struct ExtractionResult {
    pub structured_data: Vec<serde_json::Value>,
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
 pub struct Metadata {
    pub title: Option<String>,
    pub description: Option<String>,
@ -27,6 +27,20 @@ pub struct Metadata {
    pub image: Option<String>,
    pub favicon: Option<String>,
    pub word_count: usize,
+    /// HTTP status code from the final response (after redirects). `None`
+    /// when extraction was not preceded by an HTTP fetch — e.g. `--file`,
+    /// `--stdin`, or any call into `extract_with_options` directly.
+    /// Serialized in JSON output as `"status"` (renamed for caller
+    /// ergonomics; M7 / issue #19). Surfaced in `-f llm` / `-f text` as a
+    /// `> Status: <code>` line right after `> URL:` so callers can
+    /// distinguish a real 404 from a thin-body 200 without parsing the
+    /// body.
+    #[serde(
+        rename = "status",
+        default,
+        skip_serializing_if = "Option::is_none"
+    )]
+    pub http_status: Option<u16>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@ -542,9 +542,13 @@ impl FetchClient {
            let resp = client.get(json_url.as_str()).send().await?;
            let response = Response::from_wreq(resp).await?;
            if response.is_success() {
+                let reddit_status = response.status();
                let bytes = response.body();
                match crate::reddit::parse_reddit_json(bytes, url) {
-                    Ok(result) => return Ok(result),
+                    Ok(mut result) => {
+                        result.metadata.http_status = Some(reddit_status);
+                        return Ok(result);
+                    }
                    Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
                }
            }
@ -588,7 +592,9 @@ impl FetchClient {
            );

            let pdf_result = webclaw_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
-            Ok(pdf_to_extraction_result(&pdf_result, &final_url))
+            let mut result = pdf_to_extraction_result(&pdf_result, &final_url);
+            result.metadata.http_status = Some(status);
+            Ok(result)
        } else if let Some(doc_type) =
            crate::document::is_document_content_type(&headers, &final_url)
        {
@ -606,6 +612,7 @@ impl FetchClient {

            let mut result = crate::document::extract_document(bytes, doc_type)?;
            result.metadata.url = Some(final_url);
+            result.metadata.http_status = Some(status);
            Ok(result)
        } else {
            let html = response.into_text();
@ -617,12 +624,15 @@ impl FetchClient {
            if crate::linkedin::is_linkedin_post(&final_url) {
                if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) {
                    debug!("linkedin extraction succeeded");
+                    let mut result = result;
+                    result.metadata.http_status = Some(status);
                    return Ok(result);
                }
                debug!("linkedin extraction failed, falling back to standard");
            }

-            let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
+            let mut extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
+            extraction.metadata.http_status = Some(status);

            Ok(extraction)
        }
@ -889,6 +899,7 @@ fn pdf_to_extraction_result(
            image: None,
            favicon: None,
            word_count,
+            http_status: None,
        },
        content: webclaw_core::Content {
            markdown,
--- a/crates/webclaw-fetch/src/document.rs
+++ b/crates/webclaw-fetch/src/document.rs
@ -110,6 +110,7 @@ pub fn extract_document(
            image: None,
            favicon: None,
            word_count,
+            http_status: None,
        },
        content: webclaw_core::Content {
            markdown,
--- a/crates/webclaw-fetch/src/linkedin.rs
+++ b/crates/webclaw-fetch/src/linkedin.rs
@ -216,6 +216,7 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult>
            image: None,
            favicon: None,
            word_count,
+            http_status: None,
        },
        content: Content {
            markdown,
--- a/crates/webclaw-fetch/src/reddit.rs
+++ b/crates/webclaw-fetch/src/reddit.rs
@ -92,6 +92,7 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResul
            image: None,
            favicon: None,
            word_count,
+            http_status: None,
        },
        content: Content {
            markdown,
--- a/crates/webclaw-mcp/src/server.rs
+++ b/crates/webclaw-mcp/src/server.rs
@ -518,6 +518,7 @@ impl WebclawMcp {
                        image: None,
                        favicon: None,
                        word_count: markdown.split_whitespace().count(),
+                        http_status: None,
                    },
                    domain_data: None,
                    structured_data: Vec::new(),
--- a/crates/webclaw-server/src/routes/diff.rs
+++ b/crates/webclaw-server/src/routes/diff.rs
@ -65,6 +65,7 @@ fn empty_metadata() -> Metadata {
        image: None,
        favicon: None,
        word_count: 0,
+        http_status: None,
    }
 }