mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
fix: adapt to webclaw-tls v0.1.1 HeaderMap API change
Response.headers() now returns &http::HeaderMap instead of &HashMap<String, String>. Updated FetchResult, is_pdf_content_type, is_document_content_type, is_bot_protected, and all related tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
68b9406ff5
commit
199dab6dfa
6 changed files with 62 additions and 51 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -3113,6 +3113,7 @@ name = "webclaw-fetch"
|
|||
version = "0.3.0"
|
||||
dependencies = [
|
||||
"calamine",
|
||||
"http",
|
||||
"quick-xml 0.37.5",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
|
|
|
|||
|
|
@ -22,3 +22,4 @@ zip = "2"
|
|||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
http = "1"
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ pub struct FetchResult {
|
|||
pub status: u16,
|
||||
/// Final URL after any redirects.
|
||||
pub url: String,
|
||||
pub headers: HashMap<String, String>,
|
||||
pub headers: webclaw_http::HeaderMap,
|
||||
pub elapsed: Duration,
|
||||
}
|
||||
|
||||
|
|
@ -257,7 +257,7 @@ impl FetchClient {
|
|||
let status = response.status();
|
||||
let final_url = response.url().to_string();
|
||||
|
||||
let headers: HashMap<String, String> = response.headers().clone();
|
||||
let headers = response.headers().clone();
|
||||
|
||||
let is_pdf = is_pdf_content_type(&headers);
|
||||
|
||||
|
|
@ -507,9 +507,10 @@ fn is_retryable_error(err: &FetchError) -> bool {
|
|||
matches!(err, FetchError::Request(_) | FetchError::BodyDecode(_))
|
||||
}
|
||||
|
||||
fn is_pdf_content_type(headers: &HashMap<String, String>) -> bool {
|
||||
fn is_pdf_content_type(headers: &webclaw_http::HeaderMap) -> bool {
|
||||
headers
|
||||
.get("content-type")
|
||||
.and_then(|ct| ct.to_str().ok())
|
||||
.map(|ct| {
|
||||
let mime = ct.split(';').next().unwrap_or("").trim();
|
||||
mime.eq_ignore_ascii_case("application/pdf")
|
||||
|
|
@ -584,7 +585,7 @@ mod tests {
|
|||
html: "<html></html>".to_string(),
|
||||
status: 200,
|
||||
url: "https://example.com".to_string(),
|
||||
headers: HashMap::new(),
|
||||
headers: webclaw_http::HeaderMap::new(),
|
||||
elapsed: Duration::from_millis(42),
|
||||
}),
|
||||
};
|
||||
|
|
@ -636,23 +637,20 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_is_pdf_content_type() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert("content-type".to_string(), "application/pdf".to_string());
|
||||
let mut headers = webclaw_http::HeaderMap::new();
|
||||
headers.insert("content-type", "application/pdf".parse().unwrap());
|
||||
assert!(is_pdf_content_type(&headers));
|
||||
|
||||
headers.insert(
|
||||
"content-type".to_string(),
|
||||
"application/pdf; charset=utf-8".to_string(),
|
||||
);
|
||||
headers.insert("content-type", "application/pdf; charset=utf-8".parse().unwrap());
|
||||
assert!(is_pdf_content_type(&headers));
|
||||
|
||||
headers.insert("content-type".to_string(), "Application/PDF".to_string());
|
||||
headers.insert("content-type", "Application/PDF".parse().unwrap());
|
||||
assert!(is_pdf_content_type(&headers));
|
||||
|
||||
headers.insert("content-type".to_string(), "text/html".to_string());
|
||||
headers.insert("content-type", "text/html".parse().unwrap());
|
||||
assert!(!is_pdf_content_type(&headers));
|
||||
|
||||
let empty: HashMap<String, String> = HashMap::new();
|
||||
let empty = webclaw_http::HeaderMap::new();
|
||||
assert!(!is_pdf_content_type(&empty));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
/// Document extraction for DOCX, XLSX, XLS, and CSV files.
|
||||
/// Auto-detects document type from Content-Type headers or URL extension,
|
||||
/// then extracts text content as markdown — same pattern as PDF extraction.
|
||||
use std::collections::HashMap;
|
||||
use std::io::{Cursor, Read};
|
||||
|
||||
use tracing::debug;
|
||||
|
|
@ -27,11 +26,17 @@ impl DocType {
|
|||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DocType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.label())
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect document type from response headers or URL extension.
|
||||
/// Returns `None` for non-document responses (HTML, PDF, etc.).
|
||||
pub fn is_document_content_type(headers: &HashMap<String, String>, url: &str) -> Option<DocType> {
|
||||
pub fn is_document_content_type(headers: &webclaw_http::HeaderMap, url: &str) -> Option<DocType> {
|
||||
// Check Content-Type header first
|
||||
if let Some(ct) = headers.get("content-type") {
|
||||
if let Some(ct) = headers.get("content-type").and_then(|v| v.to_str().ok()) {
|
||||
let mime = ct.split(';').next().unwrap_or("").trim();
|
||||
|
||||
if mime.eq_ignore_ascii_case(
|
||||
|
|
@ -155,7 +160,7 @@ fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
|
|||
let mut in_run = false; // inside <w:r> (run)
|
||||
let mut in_text = false; // inside <w:t>
|
||||
let mut current_text = String::new();
|
||||
let mut heading_level: Option<u8> = 0.into(); // None = normal paragraph
|
||||
let mut heading_level: Option<u8> = None; // None = normal paragraph
|
||||
let mut in_ppr = false; // inside <w:pPr> (paragraph properties)
|
||||
|
||||
loop {
|
||||
|
|
@ -469,15 +474,24 @@ fn strip_markdown_formatting(markdown: &str) -> String {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use webclaw_http::HeaderMap;
|
||||
|
||||
fn headers_with(name: &str, value: &str) -> HeaderMap {
|
||||
let mut h = HeaderMap::new();
|
||||
h.insert(
|
||||
name.parse::<http::header::HeaderName>().unwrap(),
|
||||
value.parse().unwrap(),
|
||||
);
|
||||
h
|
||||
}
|
||||
|
||||
// --- Content-type detection ---
|
||||
|
||||
#[test]
|
||||
fn test_detect_docx_content_type() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert(
|
||||
"content-type".to_string(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string(),
|
||||
let headers = headers_with(
|
||||
"content-type",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
);
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/file"),
|
||||
|
|
@ -487,10 +501,9 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_detect_xlsx_content_type() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert(
|
||||
"content-type".to_string(),
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string(),
|
||||
let headers = headers_with(
|
||||
"content-type",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
);
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/file"),
|
||||
|
|
@ -500,11 +513,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_detect_xls_content_type() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert(
|
||||
"content-type".to_string(),
|
||||
"application/vnd.ms-excel".to_string(),
|
||||
);
|
||||
let headers = headers_with("content-type", "application/vnd.ms-excel");
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/file"),
|
||||
Some(DocType::Xls)
|
||||
|
|
@ -513,8 +522,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_detect_csv_content_type() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert("content-type".to_string(), "text/csv".to_string());
|
||||
let headers = headers_with("content-type", "text/csv");
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/file"),
|
||||
Some(DocType::Csv)
|
||||
|
|
@ -523,11 +531,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_detect_csv_content_type_with_charset() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert(
|
||||
"content-type".to_string(),
|
||||
"text/csv; charset=utf-8".to_string(),
|
||||
);
|
||||
let headers = headers_with("content-type", "text/csv; charset=utf-8");
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/file"),
|
||||
Some(DocType::Csv)
|
||||
|
|
@ -536,7 +540,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_detect_by_url_extension() {
|
||||
let empty: HashMap<String, String> = HashMap::new();
|
||||
let empty = HeaderMap::new();
|
||||
assert_eq!(
|
||||
is_document_content_type(&empty, "https://example.com/report.docx"),
|
||||
Some(DocType::Docx)
|
||||
|
|
@ -557,7 +561,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_detect_url_extension_with_query() {
|
||||
let empty: HashMap<String, String> = HashMap::new();
|
||||
let empty = HeaderMap::new();
|
||||
assert_eq!(
|
||||
is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
|
||||
Some(DocType::Docx)
|
||||
|
|
@ -566,7 +570,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_detect_url_extension_case_insensitive() {
|
||||
let empty: HashMap<String, String> = HashMap::new();
|
||||
let empty = HeaderMap::new();
|
||||
assert_eq!(
|
||||
is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
|
||||
Some(DocType::Xlsx)
|
||||
|
|
@ -575,8 +579,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_detect_none_for_html() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert("content-type".to_string(), "text/html".to_string());
|
||||
let headers = headers_with("content-type", "text/html");
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/page"),
|
||||
None
|
||||
|
|
@ -585,8 +588,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_content_type_takes_precedence_over_url() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert("content-type".to_string(), "text/csv".to_string());
|
||||
let headers = headers_with("content-type", "text/csv");
|
||||
// URL says .xlsx but Content-Type says CSV — header wins
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/data.xlsx"),
|
||||
|
|
|
|||
|
|
@ -17,4 +17,5 @@ pub use crawler::{CrawlConfig, CrawlResult, CrawlState, Crawler, PageResult};
|
|||
pub use error::FetchError;
|
||||
pub use proxy::{parse_proxy_file, parse_proxy_line};
|
||||
pub use sitemap::SitemapEntry;
|
||||
pub use webclaw_http::HeaderMap;
|
||||
pub use webclaw_pdf::PdfMode;
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@
|
|||
///
|
||||
/// When local fetch returns a challenge page, this module retries
|
||||
/// via api.webclaw.io. Requires WEBCLAW_API_KEY to be set.
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
|
||||
use serde_json::{Value, json};
|
||||
|
|
@ -72,7 +71,8 @@ impl CloudClient {
|
|||
let status = resp.status();
|
||||
if !status.is_success() {
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(format!("Cloud API error {status}: {text}"));
|
||||
let truncated = truncate_error(&text);
|
||||
return Err(format!("Cloud API error {status}: {truncated}"));
|
||||
}
|
||||
|
||||
resp.json::<Value>()
|
||||
|
|
@ -93,7 +93,8 @@ impl CloudClient {
|
|||
let status = resp.status();
|
||||
if !status.is_success() {
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(format!("Cloud API error {status}: {text}"));
|
||||
let truncated = truncate_error(&text);
|
||||
return Err(format!("Cloud API error {status}: {truncated}"));
|
||||
}
|
||||
|
||||
resp.json::<Value>()
|
||||
|
|
@ -102,9 +103,18 @@ impl CloudClient {
|
|||
}
|
||||
}
|
||||
|
||||
/// Limit an error body to its first 500 characters so that large responses
/// (e.g. whole HTML pages) do not flood logs.
///
/// The limit is counted in `char`s, not bytes, and the cut always lands on a
/// character boundary, so slicing cannot panic on multi-byte UTF-8 input.
/// Inputs of 500 characters or fewer are returned unchanged.
fn truncate_error(text: &str) -> &str {
    const MAX_LEN: usize = 500;
    // Byte offset of the character at index MAX_LEN, i.e. where the first
    // MAX_LEN characters end; fall back to the full length for short input.
    let cut = text
        .char_indices()
        .nth(MAX_LEN)
        .map_or(text.len(), |(offset, _)| offset);
    &text[..cut]
}
|
||||
|
||||
/// Check if fetched HTML looks like a bot protection challenge page.
|
||||
/// Detects common bot protection challenge pages.
|
||||
pub fn is_bot_protected(html: &str, headers: &HashMap<String, String>) -> bool {
|
||||
pub fn is_bot_protected(html: &str, headers: &webclaw_fetch::HeaderMap) -> bool {
|
||||
let html_lower = html.to_lowercase();
|
||||
|
||||
// Cloudflare challenge page
|
||||
|
|
@ -148,9 +158,7 @@ pub fn is_bot_protected(html: &str, headers: &HashMap<String, String>) -> bool {
|
|||
}
|
||||
|
||||
// Cloudflare via headers + challenge body
|
||||
let has_cf_headers = headers
|
||||
.iter()
|
||||
.any(|(k, _)| k.eq_ignore_ascii_case("cf-ray") || k.eq_ignore_ascii_case("cf-mitigated"));
|
||||
let has_cf_headers = headers.get("cf-ray").is_some() || headers.get("cf-mitigated").is_some();
|
||||
if has_cf_headers
|
||||
&& (html_lower.contains("just a moment") || html_lower.contains("checking your browser"))
|
||||
{
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue