diff --git a/Cargo.lock b/Cargo.lock index 51c0df5..6f6df3e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3113,6 +3113,7 @@ name = "webclaw-fetch" version = "0.3.0" dependencies = [ "calamine", + "http", "quick-xml 0.37.5", "rand 0.8.5", "serde", diff --git a/crates/webclaw-fetch/Cargo.toml b/crates/webclaw-fetch/Cargo.toml index dd1b046..bb16cd7 100644 --- a/crates/webclaw-fetch/Cargo.toml +++ b/crates/webclaw-fetch/Cargo.toml @@ -22,3 +22,4 @@ zip = "2" [dev-dependencies] tempfile = "3" +http = "1" diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index a5565dc..b6ebc0d 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -60,7 +60,7 @@ pub struct FetchResult { pub status: u16, /// Final URL after any redirects. pub url: String, - pub headers: HashMap, + pub headers: webclaw_http::HeaderMap, pub elapsed: Duration, } @@ -257,7 +257,7 @@ impl FetchClient { let status = response.status(); let final_url = response.url().to_string(); - let headers: HashMap = response.headers().clone(); + let headers = response.headers().clone(); let is_pdf = is_pdf_content_type(&headers); @@ -507,9 +507,10 @@ fn is_retryable_error(err: &FetchError) -> bool { matches!(err, FetchError::Request(_) | FetchError::BodyDecode(_)) } -fn is_pdf_content_type(headers: &HashMap) -> bool { +fn is_pdf_content_type(headers: &webclaw_http::HeaderMap) -> bool { headers .get("content-type") + .and_then(|ct| ct.to_str().ok()) .map(|ct| { let mime = ct.split(';').next().unwrap_or("").trim(); mime.eq_ignore_ascii_case("application/pdf") @@ -584,7 +585,7 @@ mod tests { html: "".to_string(), status: 200, url: "https://example.com".to_string(), - headers: HashMap::new(), + headers: webclaw_http::HeaderMap::new(), elapsed: Duration::from_millis(42), }), }; @@ -636,23 +637,20 @@ mod tests { #[test] fn test_is_pdf_content_type() { - let mut headers = HashMap::new(); - headers.insert("content-type".to_string(), "application/pdf".to_string()); + let mut headers = webclaw_http::HeaderMap::new(); + headers.insert("content-type", "application/pdf".parse().unwrap()); assert!(is_pdf_content_type(&headers)); - headers.insert( - "content-type".to_string(), - "application/pdf; charset=utf-8".to_string(), - ); + headers.insert("content-type", "application/pdf; charset=utf-8".parse().unwrap()); assert!(is_pdf_content_type(&headers)); - headers.insert("content-type".to_string(), "Application/PDF".to_string()); + headers.insert("content-type", "Application/PDF".parse().unwrap()); assert!(is_pdf_content_type(&headers)); - headers.insert("content-type".to_string(), "text/html".to_string()); + headers.insert("content-type", "text/html".parse().unwrap()); assert!(!is_pdf_content_type(&headers)); - let empty: HashMap = HashMap::new(); + let empty = webclaw_http::HeaderMap::new(); assert!(!is_pdf_content_type(&empty)); } diff --git a/crates/webclaw-fetch/src/document.rs b/crates/webclaw-fetch/src/document.rs index 0291d52..05c3b34 100644 --- a/crates/webclaw-fetch/src/document.rs +++ b/crates/webclaw-fetch/src/document.rs @@ -1,7 +1,6 @@ /// Document extraction for DOCX, XLSX, XLS, and CSV files. /// Auto-detects document type from Content-Type headers or URL extension, /// then extracts text content as markdown — same pattern as PDF extraction. -use std::collections::HashMap; use std::io::{Cursor, Read}; use tracing::debug; @@ -27,11 +26,17 @@ impl DocType { } } +impl std::fmt::Display for DocType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.label()) + } +} + /// Detect document type from response headers or URL extension. /// Returns `None` for non-document responses (HTML, PDF, etc.). -pub fn is_document_content_type(headers: &HashMap, url: &str) -> Option { +pub fn is_document_content_type(headers: &webclaw_http::HeaderMap, url: &str) -> Option { // Check Content-Type header first - if let Some(ct) = headers.get("content-type") { + if let Some(ct) = headers.get("content-type").and_then(|v| v.to_str().ok()) { let mime = ct.split(';').next().unwrap_or("").trim(); if mime.eq_ignore_ascii_case( @@ -155,7 +160,7 @@ fn parse_docx_xml(xml: &str) -> Result { let mut in_run = false; // inside (run) let mut in_text = false; // inside let mut current_text = String::new(); - let mut heading_level: Option = 0.into(); // None = normal paragraph + let mut heading_level: Option = None; // None = normal paragraph let mut in_ppr = false; // inside (paragraph properties) loop { @@ -469,15 +474,24 @@ fn strip_markdown_formatting(markdown: &str) -> String { #[cfg(test)] mod tests { use super::*; + use webclaw_http::HeaderMap; + + fn headers_with(name: &str, value: &str) -> HeaderMap { + let mut h = HeaderMap::new(); + h.insert( + name.parse::().unwrap(), + value.parse().unwrap(), + ); + h + } // --- Content-type detection --- #[test] fn test_detect_docx_content_type() { - let mut headers = HashMap::new(); - headers.insert( - "content-type".to_string(), - "application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string(), + let headers = headers_with( + "content-type", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ); assert_eq!( is_document_content_type(&headers, "https://example.com/file"), @@ -487,10 +501,9 @@ mod tests { #[test] fn test_detect_xlsx_content_type() { - let mut headers = HashMap::new(); - headers.insert( - "content-type".to_string(), - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string(), + let headers = headers_with( + "content-type", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ); assert_eq!( is_document_content_type(&headers, "https://example.com/file"), @@ -500,11 +513,7 @@ mod tests { #[test] fn test_detect_xls_content_type() { - let mut headers = HashMap::new(); - headers.insert( - "content-type".to_string(), - "application/vnd.ms-excel".to_string(), - ); + let headers = headers_with("content-type", "application/vnd.ms-excel"); assert_eq!( is_document_content_type(&headers, "https://example.com/file"), Some(DocType::Xls) @@ -513,8 +522,7 @@ mod tests { #[test] fn test_detect_csv_content_type() { - let mut headers = HashMap::new(); - headers.insert("content-type".to_string(), "text/csv".to_string()); + let headers = headers_with("content-type", "text/csv"); assert_eq!( is_document_content_type(&headers, "https://example.com/file"), Some(DocType::Csv) @@ -523,11 +531,7 @@ mod tests { #[test] fn test_detect_csv_content_type_with_charset() { - let mut headers = HashMap::new(); - headers.insert( - "content-type".to_string(), - "text/csv; charset=utf-8".to_string(), - ); + let headers = headers_with("content-type", "text/csv; charset=utf-8"); assert_eq!( is_document_content_type(&headers, "https://example.com/file"), Some(DocType::Csv) @@ -536,7 +540,7 @@ mod tests { #[test] fn test_detect_by_url_extension() { - let empty: HashMap = HashMap::new(); + let empty = HeaderMap::new(); assert_eq!( is_document_content_type(&empty, "https://example.com/report.docx"), Some(DocType::Docx) @@ -557,7 +561,7 @@ mod tests { #[test] fn test_detect_url_extension_with_query() { - let empty: HashMap = HashMap::new(); + let empty = HeaderMap::new(); assert_eq!( is_document_content_type(&empty, "https://example.com/report.docx?token=abc"), Some(DocType::Docx) @@ -566,7 +570,7 @@ mod tests { #[test] fn test_detect_url_extension_case_insensitive() { - let empty: HashMap = HashMap::new(); + let empty = HeaderMap::new(); assert_eq!( is_document_content_type(&empty, "https://example.com/FILE.XLSX"), Some(DocType::Xlsx) @@ -575,8 +579,7 @@ mod tests { #[test] fn test_detect_none_for_html() { - let mut headers = HashMap::new(); - headers.insert("content-type".to_string(), "text/html".to_string()); + let headers = headers_with("content-type", "text/html"); assert_eq!( is_document_content_type(&headers, "https://example.com/page"), None @@ -585,8 +588,7 @@ mod tests { #[test] fn test_content_type_takes_precedence_over_url() { - let mut headers = HashMap::new(); - headers.insert("content-type".to_string(), "text/csv".to_string()); + let headers = headers_with("content-type", "text/csv"); // URL says .xlsx but Content-Type says CSV — header wins assert_eq!( is_document_content_type(&headers, "https://example.com/data.xlsx"), diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs index baec4cb..b03a208 100644 --- a/crates/webclaw-fetch/src/lib.rs +++ b/crates/webclaw-fetch/src/lib.rs @@ -17,4 +17,5 @@ pub use crawler::{CrawlConfig, CrawlResult, CrawlState, Crawler, PageResult}; pub use error::FetchError; pub use proxy::{parse_proxy_file, parse_proxy_line}; pub use sitemap::SitemapEntry; +pub use webclaw_http::HeaderMap; pub use webclaw_pdf::PdfMode; diff --git a/crates/webclaw-mcp/src/cloud.rs b/crates/webclaw-mcp/src/cloud.rs index bf05ffe..ac602e4 100644 --- a/crates/webclaw-mcp/src/cloud.rs +++ b/crates/webclaw-mcp/src/cloud.rs @@ -2,7 +2,6 @@ /// /// When local fetch returns a challenge page, this module retries /// via api.webclaw.io. Requires WEBCLAW_API_KEY to be set. -use std::collections::HashMap; use std::time::Duration; use serde_json::{Value, json}; @@ -72,7 +71,8 @@ impl CloudClient { let status = resp.status(); if !status.is_success() { let text = resp.text().await.unwrap_or_default(); - return Err(format!("Cloud API error {status}: {text}")); + let truncated = truncate_error(&text); + return Err(format!("Cloud API error {status}: {truncated}")); } resp.json::() @@ -93,7 +93,8 @@ impl CloudClient { let status = resp.status(); if !status.is_success() { let text = resp.text().await.unwrap_or_default(); - return Err(format!("Cloud API error {status}: {text}")); + let truncated = truncate_error(&text); + return Err(format!("Cloud API error {status}: {truncated}")); } resp.json::() @@ -102,9 +103,18 @@ impl CloudClient { } } +/// Truncate error body to avoid flooding logs with huge HTML responses. +fn truncate_error(text: &str) -> &str { + const MAX_LEN: usize = 500; + match text.char_indices().nth(MAX_LEN) { + Some((byte_pos, _)) => &text[..byte_pos], + None => text, + } +} + /// Check if fetched HTML looks like a bot protection challenge page. /// Detects common bot protection challenge pages. -pub fn is_bot_protected(html: &str, headers: &HashMap) -> bool { +pub fn is_bot_protected(html: &str, headers: &webclaw_fetch::HeaderMap) -> bool { let html_lower = html.to_lowercase(); // Cloudflare challenge page @@ -148,9 +158,7 @@ pub fn is_bot_protected(html: &str, headers: &HashMap) -> bool { } // Cloudflare via headers + challenge body - let has_cf_headers = headers - .iter() - .any(|(k, _)| k.eq_ignore_ascii_case("cf-ray") || k.eq_ignore_ascii_case("cf-mitigated")); + let has_cf_headers = headers.get("cf-ray").is_some() || headers.get("cf-mitigated").is_some(); if has_cf_headers && (html_lower.contains("just a moment") || html_lower.contains("checking your browser")) {