Mirror of https://github.com/0xMassi/webclaw.git, synced 2026-05-15 18:25:24 +02:00
Initial release: webclaw v0.1.0 — web content extraction for LLMs
CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed | https://webclaw.io
Commit c99ec684fa
79 changed files with 24074 additions and 0 deletions
crates/webclaw-fetch/src/browser.rs (new file, 96 lines)
@@ -0,0 +1,96 @@
/// Browser fingerprint selection and rotation.
/// Maps our simple `BrowserProfile` enum to primp's impersonation profiles.
use primp::{Impersonate, ImpersonateOS};

/// Which browser identity to present at the TLS/HTTP layer.
#[derive(Debug, Clone, Default)]
pub enum BrowserProfile {
    #[default]
    Chrome,
    Firefox,
    /// Randomly pick from all available profiles on each request.
    Random,
}

/// A complete impersonation profile: browser + OS.
#[derive(Debug, Clone)]
pub struct ImpersonateProfile {
    pub browser: Impersonate,
    pub os: ImpersonateOS,
}

/// All Chrome profiles we ship, newest first.
pub fn chrome_profiles() -> Vec<ImpersonateProfile> {
    vec![
        ImpersonateProfile {
            browser: Impersonate::ChromeV145,
            os: ImpersonateOS::Windows,
        },
        ImpersonateProfile {
            browser: Impersonate::ChromeV145,
            os: ImpersonateOS::MacOS,
        },
        ImpersonateProfile {
            browser: Impersonate::ChromeV144,
            os: ImpersonateOS::Windows,
        },
        ImpersonateProfile {
            browser: Impersonate::ChromeV144,
            os: ImpersonateOS::Linux,
        },
    ]
}

/// All Firefox profiles we ship, newest first.
pub fn firefox_profiles() -> Vec<ImpersonateProfile> {
    vec![
        ImpersonateProfile {
            browser: Impersonate::FirefoxV146,
            os: ImpersonateOS::Windows,
        },
        ImpersonateProfile {
            browser: Impersonate::FirefoxV146,
            os: ImpersonateOS::Linux,
        },
        ImpersonateProfile {
            browser: Impersonate::FirefoxV140,
            os: ImpersonateOS::Windows,
        },
    ]
}

/// Safari + Edge + Opera profiles for maximum diversity in Random mode.
pub fn extra_profiles() -> Vec<ImpersonateProfile> {
    vec![
        ImpersonateProfile {
            browser: Impersonate::SafariV18_5,
            os: ImpersonateOS::MacOS,
        },
        ImpersonateProfile {
            browser: Impersonate::SafariV26,
            os: ImpersonateOS::MacOS,
        },
        ImpersonateProfile {
            browser: Impersonate::EdgeV145,
            os: ImpersonateOS::Windows,
        },
        ImpersonateProfile {
            browser: Impersonate::OperaV127,
            os: ImpersonateOS::Windows,
        },
    ]
}

pub fn latest_chrome() -> ImpersonateProfile {
    ImpersonateProfile {
        browser: Impersonate::ChromeV145,
        os: ImpersonateOS::Windows,
    }
}

pub fn latest_firefox() -> ImpersonateProfile {
    ImpersonateProfile {
        browser: Impersonate::FirefoxV146,
        os: ImpersonateOS::Windows,
    }
}
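For orientation, here is a minimal sketch (not part of the commit) of how these profile lists combine under `BrowserProfile::Random`; it mirrors what `collect_profiles` in client.rs below does, and assumes the same `rand` crate API the workspace already uses.

use rand::seq::SliceRandom;

/// Hypothetical helper: gather every shipped profile and choose one uniformly.
fn pick_any_profile() -> ImpersonateProfile {
    let mut all = chrome_profiles();
    all.extend(firefox_profiles());
    all.extend(extra_profiles());
    // The lists above are non-empty by construction, so `choose` cannot fail.
    all.choose(&mut rand::thread_rng()).unwrap().clone()
}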
crates/webclaw-fetch/src/client.rs (new file, 798 lines)
@@ -0,0 +1,798 @@
/// HTTP client with browser TLS fingerprint impersonation.
/// Wraps primp to provide a simple fetch interface with optional
/// content extraction via webclaw-core. Supports single and batch operations.
/// Automatically detects PDF responses and extracts text via webclaw-pdf.
///
/// Two proxy modes:
/// - **Static**: single proxy (or none) baked into pre-built clients at construction.
/// - **Rotating**: pre-built pool of clients, each with a different proxy + fingerprint.
///   Same-host URLs are routed to the same client for HTTP/2 connection reuse.
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::Arc;
use std::time::{Duration, Instant};

use rand::seq::SliceRandom;
use tokio::sync::Semaphore;
use tracing::{debug, instrument, warn};
use webclaw_pdf::PdfMode;

use crate::browser::{self, BrowserProfile, ImpersonateProfile};
use crate::error::FetchError;

/// Configuration for building a [`FetchClient`].
#[derive(Debug, Clone)]
pub struct FetchConfig {
    pub browser: BrowserProfile,
    /// Single proxy URL. Used when `proxy_pool` is empty.
    pub proxy: Option<String>,
    /// Pool of proxy URLs to rotate through.
    /// When non-empty, each proxy gets a pre-built client with a
    /// random browser fingerprint. Same-host URLs reuse the same client
    /// for HTTP/2 connection multiplexing.
    pub proxy_pool: Vec<String>,
    pub timeout: Duration,
    pub follow_redirects: bool,
    pub max_redirects: u32,
    pub headers: HashMap<String, String>,
    pub pdf_mode: PdfMode,
}

impl Default for FetchConfig {
    fn default() -> Self {
        Self {
            browser: BrowserProfile::Chrome,
            proxy: None,
            proxy_pool: Vec::new(),
            timeout: Duration::from_secs(30),
            follow_redirects: true,
            max_redirects: 10,
            headers: HashMap::from([("Accept-Language".to_string(), "en-US,en;q=0.9".to_string())]),
            pdf_mode: PdfMode::default(),
        }
    }
}
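
// Added note (not in the original file): a typical override keeps the defaults
// and changes only a few fields via struct-update syntax, e.g.
//
//     let config = FetchConfig {
//         browser: BrowserProfile::Random,
//         timeout: Duration::from_secs(10),
//         ..FetchConfig::default()
//     };
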
/// Result of a successful fetch.
#[derive(Debug, Clone)]
pub struct FetchResult {
    pub html: String,
    pub status: u16,
    /// Final URL after any redirects.
    pub url: String,
    pub headers: HashMap<String, String>,
    pub elapsed: Duration,
}

/// Result for a single URL in a batch fetch operation.
#[derive(Debug)]
pub struct BatchResult {
    pub url: String,
    pub result: Result<FetchResult, FetchError>,
}

/// Result for a single URL in a batch fetch-and-extract operation.
#[derive(Debug)]
pub struct BatchExtractResult {
    pub url: String,
    pub result: Result<webclaw_core::ExtractionResult, FetchError>,
}

/// Internal representation of the client pool strategy.
enum ClientPool {
    /// Pre-built clients with a fixed proxy (or no proxy).
    /// Fingerprint rotation still works via the pool when `random` is true.
    Static {
        clients: Vec<primp::Client>,
        random: bool,
    },
    /// Pre-built pool of clients, each with a different proxy + fingerprint.
    /// Requests pick a client deterministically by host for HTTP/2 connection reuse.
    Rotating { clients: Vec<primp::Client> },
}

/// HTTP client that impersonates browser TLS fingerprints via primp.
///
/// Operates in two modes:
/// - **Static pool**: pre-built primp clients, optionally with fingerprint rotation.
///   Used when no `proxy_pool` is configured. Fast (no per-request construction).
/// - **Rotating pool**: pre-built primp clients, one per proxy in the pool.
///   Same-host URLs are routed to the same client for HTTP/2 multiplexing.
pub struct FetchClient {
    pool: ClientPool,
    pdf_mode: PdfMode,
}

impl FetchClient {
    /// Build a new client from config.
    ///
    /// When `config.proxy_pool` is non-empty, pre-builds one primp client per proxy,
    /// each with a randomly assigned fingerprint. Same-host URLs get routed to the
    /// same client for HTTP/2 connection reuse.
    ///
    /// When `proxy_pool` is empty, pre-builds primp clients at construction time
    /// (one per fingerprint for `Random` profiles, one for fixed profiles).
    pub fn new(config: FetchConfig) -> Result<Self, FetchError> {
        let profiles = collect_profiles(&config.browser);
        let pdf_mode = config.pdf_mode.clone();

        let pool = if config.proxy_pool.is_empty() {
            let clients = profiles
                .into_iter()
                .map(|p| build_primp_client(&config, &p, config.proxy.as_deref()))
                .collect::<Result<Vec<_>, _>>()?;

            let random = matches!(config.browser, BrowserProfile::Random);
            debug!(
                count = clients.len(),
                random, "fetch client ready (static pool)"
            );

            ClientPool::Static { clients, random }
        } else {
            let mut rng = rand::thread_rng();

            let clients = config
                .proxy_pool
                .iter()
                .map(|proxy| {
                    let p = profiles.choose(&mut rng).unwrap().clone();
                    build_primp_client(&config, &p, Some(proxy))
                })
                .collect::<Result<Vec<_>, _>>()?;

            debug!(
                clients = clients.len(),
                profiles = profiles.len(),
                "fetch client ready (pre-built rotating pool)"
            );

            ClientPool::Rotating { clients }
        };

        Ok(Self { pool, pdf_mode })
    }

    /// Fetch a URL and return the raw HTML + response metadata.
    ///
    /// Automatically retries on transient failures (network errors, 5xx, 429)
    /// with increasing backoff: 0s, 1s, 3s (3 attempts total).
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
        let delays = [
            Duration::ZERO,
            Duration::from_secs(1),
            Duration::from_secs(3),
        ];
        let mut last_err = None;

        for (attempt, delay) in delays.iter().enumerate() {
            if attempt > 0 {
                tokio::time::sleep(*delay).await;
            }

            match self.fetch_once(url).await {
                Ok(result) => {
                    if is_retryable_status(result.status) && attempt < delays.len() - 1 {
                        warn!(
                            url,
                            status = result.status,
                            attempt = attempt + 1,
                            "retryable status, will retry"
                        );
                        last_err = Some(FetchError::Build(format!("HTTP {}", result.status)));
                        continue;
                    }
                    if attempt > 0 {
                        debug!(url, attempt = attempt + 1, "retry succeeded");
                    }
                    return Ok(result);
                }
                Err(e) => {
                    if !is_retryable_error(&e) || attempt == delays.len() - 1 {
                        return Err(e);
                    }
                    warn!(
                        url,
                        error = %e,
                        attempt = attempt + 1,
                        "transient error, will retry"
                    );
                    last_err = Some(e);
                }
            }
        }

        Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
    }

    /// Single fetch attempt (no retry).
    async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
        let start = Instant::now();

        let client = match &self.pool {
            ClientPool::Static { clients, random } => {
                if *random {
                    let host = extract_host(url);
                    pick_for_host(clients, &host)
                } else {
                    &clients[0]
                }
            }
            ClientPool::Rotating { clients } => pick_random(clients),
        };

        let response = client.get(url).send().await?;

        let status = response.status().as_u16();
        let final_url = response.url().to_string();

        let headers: HashMap<String, String> = response
            .headers()
            .iter()
            .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
            .collect();

        let html = response
            .text()
            .await
            .map_err(|e| FetchError::BodyDecode(e.to_string()))?;

        let elapsed = start.elapsed();
        debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");

        Ok(FetchResult {
            html,
            status,
            url: final_url,
            headers,
            elapsed,
        })
    }

    /// Fetch a URL then extract structured content.
    ///
    /// Automatically detects PDF responses via Content-Type header and routes
    /// to webclaw-pdf for text extraction. HTML responses go through webclaw-core.
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch_and_extract(
        &self,
        url: &str,
    ) -> Result<webclaw_core::ExtractionResult, FetchError> {
        self.fetch_and_extract_with_options(url, &webclaw_core::ExtractionOptions::default())
            .await
    }

    /// Fetch a URL then extract structured content with custom extraction options.
    ///
    /// Same as [`fetch_and_extract`] but accepts `ExtractionOptions` for CSS selector
    /// filtering, main-content-only mode, etc. Options only apply to HTML responses;
    /// PDF extraction ignores them (no DOM to filter).
    #[instrument(skip(self, options), fields(url = %url))]
    pub async fn fetch_and_extract_with_options(
        &self,
        url: &str,
        options: &webclaw_core::ExtractionOptions,
    ) -> Result<webclaw_core::ExtractionResult, FetchError> {
        // Reddit fallback: use their JSON API to get post + full comment tree
        if crate::reddit::is_reddit_url(url) {
            let json_url = crate::reddit::json_url(url);
            debug!("reddit detected, fetching {json_url}");

            let client = self.pick_client(&json_url);
            let response = client.get(&json_url).send().await?;
            if response.status().is_success() {
                let bytes = response
                    .bytes()
                    .await
                    .map_err(|e| FetchError::BodyDecode(e.to_string()))?;
                match crate::reddit::parse_reddit_json(&bytes, url) {
                    Ok(result) => return Ok(result),
                    Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
                }
            }
        }

        let start = Instant::now();
        let client = self.pick_client(url);
        let response = client.get(url).send().await?;

        let status = response.status().as_u16();
        let final_url = response.url().to_string();

        let headers: HashMap<String, String> = response
            .headers()
            .iter()
            .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
            .collect();

        let is_pdf = is_pdf_content_type(&headers);

        if is_pdf {
            debug!(status, "detected PDF response, using pdf extraction");

            let bytes = response
                .bytes()
                .await
                .map_err(|e| FetchError::BodyDecode(e.to_string()))?;

            let elapsed = start.elapsed();
            debug!(
                status,
                bytes = bytes.len(),
                elapsed_ms = %elapsed.as_millis(),
                "PDF fetch complete"
            );

            let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
            Ok(pdf_to_extraction_result(&pdf_result, &final_url))
        } else {
            let html = response
                .text()
                .await
                .map_err(|e| FetchError::BodyDecode(e.to_string()))?;

            let elapsed = start.elapsed();
            debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");

            // LinkedIn: extract from embedded <code> JSON blobs
            if crate::linkedin::is_linkedin_post(&final_url) {
                if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) {
                    debug!("linkedin extraction succeeded");
                    return Ok(result);
                }
                debug!("linkedin extraction failed, falling back to standard");
            }

            let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
            Ok(extraction)
        }
    }

    /// Fetch multiple URLs concurrently with bounded parallelism.
    ///
    /// Spawns one task per URL, bounded by a semaphore. Results are returned
    /// in the same order as the input URLs, regardless of completion order.
    pub async fn fetch_batch(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
    ) -> Vec<BatchResult> {
        let semaphore = Arc::new(Semaphore::new(concurrency));
        let mut handles = Vec::with_capacity(urls.len());

        for (idx, url) in urls.iter().enumerate() {
            let permit = Arc::clone(&semaphore);
            let client = Arc::clone(self);
            let url = url.to_string();

            handles.push(tokio::spawn(async move {
                let _permit = permit.acquire().await.expect("semaphore closed");
                let result = client.fetch(&url).await;
                (idx, BatchResult { url, result })
            }));
        }

        collect_ordered(handles, urls.len()).await
    }

    /// Fetch and extract multiple URLs concurrently with bounded parallelism.
    ///
    /// Same semantics as [`fetch_batch`] but runs extraction on each response.
    /// Results preserve input URL order.
    pub async fn fetch_and_extract_batch(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
    ) -> Vec<BatchExtractResult> {
        let semaphore = Arc::new(Semaphore::new(concurrency));
        let mut handles = Vec::with_capacity(urls.len());

        for (idx, url) in urls.iter().enumerate() {
            let permit = Arc::clone(&semaphore);
            let client = Arc::clone(self);
            let url = url.to_string();

            handles.push(tokio::spawn(async move {
                let _permit = permit.acquire().await.expect("semaphore closed");
                let result = client.fetch_and_extract(&url).await;
                (idx, BatchExtractResult { url, result })
            }));
        }

        collect_ordered(handles, urls.len()).await
    }

    /// Returns the number of proxies in the rotation pool, or 0 if static mode.
    pub fn proxy_pool_size(&self) -> usize {
        match &self.pool {
            ClientPool::Static { .. } => 0,
            ClientPool::Rotating { clients } => clients.len(),
        }
    }

    /// Pick a client from the pool for a given URL.
    fn pick_client(&self, url: &str) -> &primp::Client {
        match &self.pool {
            ClientPool::Static { clients, random } => {
                if *random {
                    let host = extract_host(url);
                    pick_for_host(clients, &host)
                } else {
                    &clients[0]
                }
            }
            ClientPool::Rotating { clients } => pick_random(clients),
        }
    }
}
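
// Added note (not in the original file): `fetch_batch` takes `self: &Arc<Self>`
// because each spawned task needs an owned handle to the client, e.g.
//
//     let client = Arc::new(FetchClient::new(FetchConfig::default())?);
//     let results = client.fetch_batch(&["https://example.com"], 4).await;
//     assert_eq!(results[0].url, "https://example.com");
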
/// Collect the impersonation profiles to use based on the browser profile.
fn collect_profiles(profile: &BrowserProfile) -> Vec<ImpersonateProfile> {
    match profile {
        BrowserProfile::Random => {
            let mut profiles = Vec::new();
            profiles.extend(browser::chrome_profiles());
            profiles.extend(browser::firefox_profiles());
            profiles.extend(browser::extra_profiles());
            profiles
        }
        BrowserProfile::Chrome => vec![browser::latest_chrome()],
        BrowserProfile::Firefox => vec![browser::latest_firefox()],
    }
}

/// Extract the host from a URL, returning empty string on parse failure.
fn extract_host(url: &str) -> String {
    url::Url::parse(url)
        .ok()
        .and_then(|u| u.host_str().map(String::from))
        .unwrap_or_default()
}

/// Pick a client deterministically based on a host string.
/// Same host always gets the same client, enabling HTTP/2 connection reuse.
fn pick_for_host<'a>(clients: &'a [primp::Client], host: &str) -> &'a primp::Client {
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    host.hash(&mut hasher);
    let idx = (hasher.finish() as usize) % clients.len();
    &clients[idx]
}

/// Pick a random client from the pool for per-request rotation.
fn pick_random(clients: &[primp::Client]) -> &primp::Client {
    use rand::Rng;
    let idx = rand::thread_rng().gen_range(0..clients.len());
    &clients[idx]
}

/// Status codes worth retrying: server errors + rate limiting.
fn is_retryable_status(status: u16) -> bool {
    status == 429
        || status == 502
        || status == 503
        || status == 504
        || status == 520
        || status == 521
        || status == 522
        || status == 523
        || status == 524
}
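
// Added note (not in the original file): 502/503/504 are standard gateway
// errors; 520-524 are Cloudflare's origin-error family, which sits in front
// of a large share of crawl targets.
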
/// Errors worth retrying: network/connection failures (not client errors).
fn is_retryable_error(err: &FetchError) -> bool {
    matches!(err, FetchError::Request(_) | FetchError::BodyDecode(_))
}

fn is_pdf_content_type(headers: &HashMap<String, String>) -> bool {
    headers
        .get("content-type")
        .map(|ct| {
            let mime = ct.split(';').next().unwrap_or("").trim();
            mime.eq_ignore_ascii_case("application/pdf")
        })
        .unwrap_or(false)
}

/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
fn pdf_to_extraction_result(
    pdf: &webclaw_pdf::PdfResult,
    url: &str,
) -> webclaw_core::ExtractionResult {
    let markdown = webclaw_pdf::to_markdown(pdf);
    let word_count = markdown.split_whitespace().count();

    webclaw_core::ExtractionResult {
        metadata: webclaw_core::Metadata {
            title: pdf.metadata.title.clone(),
            description: pdf.metadata.subject.clone(),
            author: pdf.metadata.author.clone(),
            published_date: None,
            language: None,
            url: Some(url.to_string()),
            site_name: None,
            image: None,
            favicon: None,
            word_count,
        },
        content: webclaw_core::Content {
            markdown,
            plain_text: pdf.text.clone(),
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    }
}

/// Collect spawned tasks and reorder results to match input order.
async fn collect_ordered<T>(
    handles: Vec<tokio::task::JoinHandle<(usize, T)>>,
    len: usize,
) -> Vec<T> {
    let mut slots: Vec<Option<T>> = (0..len).map(|_| None).collect();

    for handle in handles {
        match handle.await {
            Ok((idx, result)) => {
                slots[idx] = Some(result);
            }
            Err(e) => {
                warn!(error = %e, "batch task panicked");
            }
        }
    }

    slots.into_iter().flatten().collect()
}

/// Build a single primp Client from config + impersonation profile + optional proxy.
fn build_primp_client(
    config: &FetchConfig,
    profile: &ImpersonateProfile,
    proxy: Option<&str>,
) -> Result<primp::Client, FetchError> {
    let redirect_policy = if config.follow_redirects {
        primp::redirect::Policy::limited(config.max_redirects as usize)
    } else {
        primp::redirect::Policy::none()
    };

    let mut headers = primp::header::HeaderMap::new();
    for (k, v) in &config.headers {
        if let (Ok(name), Ok(val)) = (
            primp::header::HeaderName::from_bytes(k.as_bytes()),
            primp::header::HeaderValue::from_str(v),
        ) {
            headers.insert(name, val);
        }
    }

    let mut builder = primp::Client::builder()
        .impersonate(profile.browser)
        .impersonate_os(profile.os)
        .cookie_store(true)
        .timeout(config.timeout)
        .redirect(redirect_policy)
        .default_headers(headers);

    if let Some(proxy_url) = proxy {
        builder = builder
            .proxy(primp::Proxy::all(proxy_url).map_err(|e| FetchError::Build(e.to_string()))?);
    }

    builder
        .build()
        .map_err(|e| FetchError::Build(e.to_string()))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_batch_result_struct() {
        let ok = BatchResult {
            url: "https://example.com".to_string(),
            result: Ok(FetchResult {
                html: "<html></html>".to_string(),
                status: 200,
                url: "https://example.com".to_string(),
                headers: HashMap::new(),
                elapsed: Duration::from_millis(42),
            }),
        };
        assert_eq!(ok.url, "https://example.com");
        assert!(ok.result.is_ok());
        assert_eq!(ok.result.unwrap().status, 200);

        let err = BatchResult {
            url: "https://bad.example".to_string(),
            result: Err(FetchError::InvalidUrl("bad url".into())),
        };
        assert!(err.result.is_err());
    }

    #[test]
    fn test_batch_extract_result_struct() {
        let err = BatchExtractResult {
            url: "https://example.com".to_string(),
            result: Err(FetchError::BodyDecode("timeout".into())),
        };
        assert_eq!(err.url, "https://example.com");
        assert!(err.result.is_err());
    }

    #[tokio::test]
    async fn test_batch_preserves_order() {
        let handles: Vec<tokio::task::JoinHandle<(usize, String)>> = vec![
            tokio::spawn(async { (2, "c".to_string()) }),
            tokio::spawn(async { (0, "a".to_string()) }),
            tokio::spawn(async { (1, "b".to_string()) }),
        ];

        let results = collect_ordered(handles, 3).await;
        assert_eq!(results, vec!["a", "b", "c"]);
    }

    #[tokio::test]
    async fn test_collect_ordered_handles_gaps() {
        let handles: Vec<tokio::task::JoinHandle<(usize, String)>> = vec![
            tokio::spawn(async { (0, "first".to_string()) }),
            tokio::spawn(async { (2, "third".to_string()) }),
        ];

        let results = collect_ordered(handles, 3).await;
        assert_eq!(results.len(), 2);
        assert_eq!(results[0], "first");
        assert_eq!(results[1], "third");
    }

    #[test]
    fn test_is_pdf_content_type() {
        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "application/pdf".to_string());
        assert!(is_pdf_content_type(&headers));

        headers.insert(
            "content-type".to_string(),
            "application/pdf; charset=utf-8".to_string(),
        );
        assert!(is_pdf_content_type(&headers));

        headers.insert("content-type".to_string(), "Application/PDF".to_string());
        assert!(is_pdf_content_type(&headers));

        headers.insert("content-type".to_string(), "text/html".to_string());
        assert!(!is_pdf_content_type(&headers));

        let empty: HashMap<String, String> = HashMap::new();
        assert!(!is_pdf_content_type(&empty));
    }

    #[test]
    fn test_pdf_to_extraction_result() {
        let pdf = webclaw_pdf::PdfResult {
            text: "Hello from PDF.".into(),
            page_count: 2,
            metadata: webclaw_pdf::PdfMetadata {
                title: Some("My Doc".into()),
                author: Some("Author".into()),
                subject: Some("Testing".into()),
                creator: None,
            },
        };

        let result = pdf_to_extraction_result(&pdf, "https://example.com/doc.pdf");

        assert_eq!(result.metadata.title.as_deref(), Some("My Doc"));
        assert_eq!(result.metadata.author.as_deref(), Some("Author"));
        assert_eq!(result.metadata.description.as_deref(), Some("Testing"));
        assert_eq!(
            result.metadata.url.as_deref(),
            Some("https://example.com/doc.pdf")
        );
        assert!(result.content.markdown.contains("# My Doc"));
        assert!(result.content.markdown.contains("Hello from PDF."));
        assert_eq!(result.content.plain_text, "Hello from PDF.");
        assert!(result.content.links.is_empty());
        assert!(result.domain_data.is_none());
        assert!(result.metadata.word_count > 0);
    }

    #[test]
    fn test_static_pool_no_proxy() {
        let config = FetchConfig::default();
        let client = FetchClient::new(config).unwrap();
        assert_eq!(client.proxy_pool_size(), 0);
    }

    #[test]
    fn test_rotating_pool_prebuilds_clients() {
        let config = FetchConfig {
            proxy_pool: vec![
                "http://proxy1:8080".into(),
                "http://proxy2:8080".into(),
                "http://proxy3:8080".into(),
            ],
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();
        assert_eq!(client.proxy_pool_size(), 3);
    }

    #[test]
    fn test_pick_for_host_deterministic() {
        let config = FetchConfig {
            browser: BrowserProfile::Random,
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();

        let clients = match &client.pool {
            ClientPool::Static { clients, .. } => clients,
            ClientPool::Rotating { clients } => clients,
        };

        let a1 = pick_for_host(clients, "example.com") as *const _;
        let a2 = pick_for_host(clients, "example.com") as *const _;
        let a3 = pick_for_host(clients, "example.com") as *const _;
        assert_eq!(a1, a2);
        assert_eq!(a2, a3);
    }

    #[test]
    fn test_pick_for_host_distributes() {
        let config = FetchConfig {
            proxy_pool: (0..10).map(|i| format!("http://proxy{i}:8080")).collect(),
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();

        let clients = match &client.pool {
            ClientPool::Static { clients, .. } | ClientPool::Rotating { clients } => clients,
        };

        let hosts = [
            "example.com",
            "google.com",
            "github.com",
            "rust-lang.org",
            "crates.io",
        ];

        let indices: Vec<usize> = hosts
            .iter()
            .map(|h| {
                let ptr = pick_for_host(clients, h) as *const _;
                clients.iter().position(|c| std::ptr::eq(c, ptr)).unwrap()
            })
            .collect();

        let unique: std::collections::HashSet<_> = indices.iter().collect();
        assert!(
            unique.len() >= 2,
            "expected host distribution across clients, got indices: {indices:?}"
        );
    }

    #[test]
    fn test_extract_host() {
        assert_eq!(extract_host("https://example.com/path"), "example.com");
        assert_eq!(
            extract_host("https://sub.example.com:8080/foo"),
            "sub.example.com"
        );
        assert_eq!(extract_host("not-a-url"), "");
    }

    #[test]
    fn test_default_config_has_empty_proxy_pool() {
        let config = FetchConfig::default();
        assert!(config.proxy_pool.is_empty());
        assert!(config.proxy.is_none());
    }
}
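Pulling the pieces of client.rs together, a minimal end-to-end sketch (illustrative only; the `#[tokio::main]` wrapper and the printed fields are assumptions, not part of the commit). It uses the crate-root re-exports defined in lib.rs below.

use std::sync::Arc;
use webclaw_fetch::{FetchClient, FetchConfig};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Build one client up front; pools are pre-built at construction.
    let client = Arc::new(FetchClient::new(FetchConfig::default())?);

    // Single fetch with the built-in retry policy.
    let page = client.fetch("https://example.com").await?;
    println!("{} ({} bytes)", page.status, page.html.len());

    // Structured extraction; PDF, Reddit, and LinkedIn URLs are special-cased.
    let doc = client.fetch_and_extract("https://example.com").await?;
    println!("{:?}", doc.metadata.title);
    Ok(())
}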
crates/webclaw-fetch/src/crawler.rs (new file, 557 lines)
@@ -0,0 +1,557 @@
/// Recursive same-origin web crawler built on top of [`FetchClient`].
///
/// Starts from a seed URL, extracts content, discovers links, and follows
/// them breadth-first up to a configurable depth/page limit. Uses a semaphore
/// for bounded concurrency and per-request delays for politeness.
///
/// When `use_sitemap` is enabled, the crawler first discovers URLs from the
/// site's sitemaps and seeds the BFS frontier before crawling.
use std::collections::HashSet;
use std::sync::Arc;
use std::time::{Duration, Instant};

use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tracing::{debug, info, warn};
use url::Url;

use crate::client::{FetchClient, FetchConfig};
use crate::error::FetchError;
use crate::sitemap;

/// Controls crawl scope, depth, concurrency, and politeness.
#[derive(Debug, Clone)]
pub struct CrawlConfig {
    /// Fetch configuration (browser profile, proxy, timeout, etc.)
    pub fetch: FetchConfig,
    /// How deep to follow links. 1 = only immediate links from seed page.
    pub max_depth: usize,
    /// Hard cap on total pages fetched (including the seed).
    pub max_pages: usize,
    /// Max concurrent in-flight requests.
    pub concurrency: usize,
    /// Minimum delay before each request (politeness).
    pub delay: Duration,
    /// Only follow URLs whose path starts with this prefix (e.g. "/docs/").
    pub path_prefix: Option<String>,
    /// Seed BFS frontier from sitemap discovery before crawling.
    pub use_sitemap: bool,
    /// Glob patterns for paths to include. If non-empty, only matching URLs are crawled.
    /// E.g. `["/api/*", "/guides/*"]`, matched against the URL path.
    pub include_patterns: Vec<String>,
    /// Glob patterns for paths to exclude. Checked after include_patterns.
    /// E.g. `["/changelog/*", "/blog/*"]`; matching URLs are skipped.
    pub exclude_patterns: Vec<String>,
    /// Optional channel sender for streaming per-page results as they complete.
    /// When set, each `PageResult` is sent on this channel immediately after extraction.
    pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>,
}

impl Default for CrawlConfig {
    fn default() -> Self {
        Self {
            fetch: FetchConfig::default(),
            max_depth: 1,
            max_pages: 50,
            concurrency: 5,
            delay: Duration::from_millis(100),
            path_prefix: None,
            use_sitemap: false,
            include_patterns: Vec::new(),
            exclude_patterns: Vec::new(),
            progress_tx: None,
        }
    }
}
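
// Added note (not in the original file): a docs-only crawl is the typical
// override, e.g.
//
//     let config = CrawlConfig {
//         max_depth: 2,
//         max_pages: 100,
//         path_prefix: Some("/docs/".into()),
//         ..CrawlConfig::default()
//     };
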
/// Aggregated results from a crawl run.
#[derive(Debug, Serialize, Deserialize)]
pub struct CrawlResult {
    pub pages: Vec<PageResult>,
    pub total: usize,
    pub ok: usize,
    pub errors: usize,
    pub elapsed_secs: f64,
}

/// Outcome of extracting a single page during the crawl.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageResult {
    pub url: String,
    pub depth: usize,
    pub extraction: Option<webclaw_core::ExtractionResult>,
    pub error: Option<String>,
    #[serde(skip)]
    pub elapsed: Duration,
}

/// Recursive crawler that wraps a shared [`FetchClient`].
pub struct Crawler {
    client: Arc<FetchClient>,
    config: CrawlConfig,
    seed_origin: String,
}

impl Crawler {
    /// Build a new crawler from a seed URL and config.
    /// Constructs the underlying `FetchClient` from `config.fetch`.
    pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> {
        let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?;
        let seed_origin = origin_key(&seed);

        let client = FetchClient::new(config.fetch.clone())?;

        Ok(Self {
            client: Arc::new(client),
            config,
            seed_origin,
        })
    }

    /// Crawl starting from `start_url`, returning results for every page visited.
    ///
    /// Uses breadth-first traversal: all pages at depth N are fetched (concurrently,
    /// bounded by `config.concurrency`) before moving to depth N+1.
    ///
    /// When `config.use_sitemap` is true, sitemap URLs are discovered first and
    /// added to the initial frontier at depth 0 alongside the seed URL.
    pub async fn crawl(&self, start_url: &str) -> CrawlResult {
        let start = Instant::now();

        let seed = match Url::parse(start_url) {
            Ok(u) => u,
            Err(_) => {
                return CrawlResult {
                    pages: vec![PageResult {
                        url: start_url.to_string(),
                        depth: 0,
                        extraction: None,
                        error: Some(format!("invalid URL: {start_url}")),
                        elapsed: Duration::ZERO,
                    }],
                    total: 1,
                    ok: 0,
                    errors: 1,
                    elapsed_secs: 0.0,
                };
            }
        };

        let semaphore = Arc::new(Semaphore::new(self.config.concurrency));
        let mut visited: HashSet<String> = HashSet::new();
        let mut pages: Vec<PageResult> = Vec::new();

        // BFS frontier: vec of (normalized_url, depth) for the current level
        let mut frontier: Vec<(String, usize)> = vec![(normalize(&seed), 0)];

        // Seed frontier from sitemap if enabled
        if self.config.use_sitemap {
            let base_url = format!("{}://{}", seed.scheme(), seed.host_str().unwrap_or(""));
            match sitemap::discover(&self.client, &base_url).await {
                Ok(entries) => {
                    let before = frontier.len();
                    for entry in entries {
                        if self.qualify_link(&entry.url, &visited).is_some() {
                            let parsed = match Url::parse(&entry.url) {
                                Ok(u) => u,
                                Err(_) => continue,
                            };
                            let norm = normalize(&parsed);
                            frontier.push((norm, 0));
                        }
                    }
                    let added = frontier.len() - before;
                    info!(
                        sitemap_urls = added,
                        "seeded frontier from sitemap discovery"
                    );
                }
                Err(e) => {
                    warn!(error = %e, "sitemap discovery failed, continuing with seed URL only");
                }
            }
        }

        while !frontier.is_empty() && pages.len() < self.config.max_pages {
            // Dedup this level's frontier against the visited set and page cap
            let batch: Vec<(String, usize)> = frontier
                .drain(..)
                .filter(|(url, _)| visited.insert(url.clone()))
                .take(self.config.max_pages.saturating_sub(pages.len()))
                .collect();

            if batch.is_empty() {
                break;
            }

            // Spawn one task per URL, bounded by semaphore
            let mut handles = Vec::with_capacity(batch.len());

            for (url, depth) in &batch {
                let permit = Arc::clone(&semaphore);
                let client = Arc::clone(&self.client);
                let url = url.clone();
                let depth = *depth;
                let delay = self.config.delay;

                handles.push(tokio::spawn(async move {
                    // Acquire permit — blocks if concurrency limit reached
                    let _permit = permit.acquire().await.expect("semaphore closed");
                    tokio::time::sleep(delay).await;

                    let page_start = Instant::now();
                    let result = client.fetch_and_extract(&url).await;
                    let elapsed = page_start.elapsed();

                    match result {
                        Ok(extraction) => {
                            debug!(
                                url = %url, depth,
                                elapsed_ms = %elapsed.as_millis(),
                                "page extracted"
                            );
                            PageResult {
                                url,
                                depth,
                                extraction: Some(extraction),
                                error: None,
                                elapsed,
                            }
                        }
                        Err(e) => {
                            warn!(url = %url, depth, error = %e, "page failed");
                            PageResult {
                                url,
                                depth,
                                extraction: None,
                                error: Some(e.to_string()),
                                elapsed,
                            }
                        }
                    }
                }));
            }

            // Collect results and harvest links for the next depth level
            let mut next_frontier: Vec<(String, usize)> = Vec::new();

            for handle in handles {
                let page = match handle.await {
                    Ok(page) => page,
                    Err(e) => {
                        warn!(error = %e, "crawl task panicked");
                        continue;
                    }
                };
                let depth = page.depth;

                if depth < self.config.max_depth
                    && let Some(ref extraction) = page.extraction
                {
                    for link in &extraction.content.links {
                        if let Some(candidate) = self.qualify_link(&link.href, &visited) {
                            next_frontier.push((candidate, depth + 1));
                        }
                    }
                }

                // Stream progress if a channel is configured
                if let Some(tx) = &self.config.progress_tx {
                    let _ = tx.send(page.clone());
                }

                pages.push(page);

                if pages.len() >= self.config.max_pages {
                    break;
                }
            }

            frontier = next_frontier;
        }

        let total_elapsed = start.elapsed();
        let ok_count = pages.iter().filter(|p| p.extraction.is_some()).count();
        let err_count = pages.len() - ok_count;
        info!(
            total = pages.len(),
            ok = ok_count,
            errors = err_count,
            elapsed_ms = %total_elapsed.as_millis(),
            "crawl complete"
        );

        CrawlResult {
            total: pages.len(),
            ok: ok_count,
            errors: err_count,
            elapsed_secs: total_elapsed.as_secs_f64(),
            pages,
        }
    }

    /// Check if a discovered link should be added to the frontier.
    /// Returns `Some(normalized_url)` if it passes all filters, `None` otherwise.
    fn qualify_link(&self, href: &str, visited: &HashSet<String>) -> Option<String> {
        let parsed = Url::parse(href).ok()?;

        // Only http(s) schemes
        match parsed.scheme() {
            "http" | "https" => {}
            _ => return None,
        }

        // Same-origin check (scheme + host + port)
        if origin_key(&parsed) != self.seed_origin {
            return None;
        }

        // Path prefix filter
        if let Some(ref prefix) = self.config.path_prefix
            && !parsed.path().starts_with(prefix.as_str())
        {
            return None;
        }

        // Include patterns: if any are set, path must match at least one
        let path = parsed.path();
        if !self.config.include_patterns.is_empty()
            && !self
                .config
                .include_patterns
                .iter()
                .any(|pat| glob_match(pat, path))
        {
            return None;
        }

        // Exclude patterns: if path matches any, skip
        if self
            .config
            .exclude_patterns
            .iter()
            .any(|pat| glob_match(pat, path))
        {
            return None;
        }

        // Skip common non-page file extensions
        const SKIP_EXTENSIONS: &[&str] = &[
            ".pdf", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico", ".css", ".js",
            ".zip", ".tar", ".gz", ".xml", ".rss", ".mp3", ".mp4", ".avi", ".mov", ".woff",
            ".woff2", ".ttf", ".eot",
        ];
        if SKIP_EXTENSIONS.iter().any(|ext| path.ends_with(ext)) {
            return None;
        }

        let normalized = normalize(&parsed);

        if visited.contains(&normalized) {
            return None;
        }

        Some(normalized)
    }
}

/// Canonical origin string for comparing same-origin: "scheme://host[:port]".
fn origin_key(url: &Url) -> String {
    let port_suffix = match url.port() {
        Some(p) => format!(":{p}"),
        None => String::new(),
    };
    let host = url.host_str().unwrap_or("");
    let host = host.strip_prefix("www.").unwrap_or(host);
    format!("{}://{}{}", url.scheme(), host, port_suffix)
}

/// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
/// lowercase scheme + host. Preserves query params and path case.
fn normalize(url: &Url) -> String {
    let scheme = url.scheme();
    let host = url.host_str().unwrap_or("").to_ascii_lowercase();
    let port_suffix = match url.port() {
        Some(p) => format!(":{p}"),
        None => String::new(),
    };

    let mut path = url.path().to_string();
    if path.len() > 1 && path.ends_with('/') {
        path.pop();
    }

    let query = match url.query() {
        Some(q) => format!("?{q}"),
        None => String::new(),
    };

    // Fragment intentionally omitted
    format!("{scheme}://{host}{port_suffix}{path}{query}")
}

/// Simple glob matching for URL paths. Supports:
/// - `*` matches any characters within a single path segment (no `/`)
/// - `**` matches any characters including `/` (any number of segments)
/// - Literal characters match exactly
///
/// Examples:
/// - `/api/*` matches `/api/users` but not `/api/users/123`
/// - `/api/**` matches `/api/users`, `/api/users/123`, `/api/a/b/c`
/// - `/docs/*/intro` matches `/docs/v2/intro`
fn glob_match(pattern: &str, path: &str) -> bool {
    glob_match_inner(pattern.as_bytes(), path.as_bytes())
}

fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
    let mut pi = 0;
    let mut ti = 0;
    let mut star_pi = usize::MAX;
    let mut star_ti = 0;

    while ti < text.len() {
        if pi < pat.len() && pat[pi] == b'*' && pi + 1 < pat.len() && pat[pi + 1] == b'*' {
            // `**` — match everything including slashes
            // Skip all consecutive `*`
            while pi < pat.len() && pat[pi] == b'*' {
                pi += 1;
            }
            // Skip trailing `/` after `**`
            if pi < pat.len() && pat[pi] == b'/' {
                pi += 1;
            }
            if pi >= pat.len() {
                return true; // `**` at end matches everything
            }
            // Try matching the rest of pattern against every suffix of text
            for start in ti..=text.len() {
                if glob_match_inner(&pat[pi..], &text[start..]) {
                    return true;
                }
            }
            return false;
        } else if pi < pat.len() && pat[pi] == b'*' {
            // `*` — match any chars except `/`
            star_pi = pi;
            star_ti = ti;
            pi += 1;
        } else if pi < pat.len() && (pat[pi] == text[ti] || pat[pi] == b'?') {
            pi += 1;
            ti += 1;
        } else if star_pi != usize::MAX {
            // Backtrack: `*` absorbs one more char (but not `/`)
            if text[star_ti] == b'/' {
                return false;
            }
            star_ti += 1;
            ti = star_ti;
            pi = star_pi + 1;
        } else {
            return false;
        }
    }

    // Consume trailing `*` or `**` in pattern
    while pi < pat.len() && pat[pi] == b'*' {
        pi += 1;
    }

    pi >= pat.len()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn normalize_strips_fragment() {
        let url = Url::parse("https://example.com/page#section").unwrap();
        assert_eq!(normalize(&url), "https://example.com/page");
    }

    #[test]
    fn normalize_strips_trailing_slash() {
        let url = Url::parse("https://example.com/docs/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/docs");
    }

    #[test]
    fn normalize_keeps_root_slash() {
        let url = Url::parse("https://example.com/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/");
    }

    #[test]
    fn normalize_preserves_query() {
        let url = Url::parse("https://example.com/search?q=rust&page=2").unwrap();
        assert_eq!(normalize(&url), "https://example.com/search?q=rust&page=2");
    }

    #[test]
    fn normalize_lowercases_host() {
        let url = Url::parse("https://Example.COM/Path").unwrap();
        assert_eq!(normalize(&url), "https://example.com/Path");
    }

    #[test]
    fn origin_includes_explicit_port() {
        let url = Url::parse("https://example.com:8443/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com:8443");
    }

    #[test]
    fn origin_omits_default_port() {
        let url = Url::parse("https://example.com/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com");
    }

    #[test]
    fn different_schemes_are_different_origins() {
        let http = Url::parse("http://example.com/").unwrap();
        let https = Url::parse("https://example.com/").unwrap();
        assert_ne!(origin_key(&http), origin_key(&https));
    }

    // -- glob_match tests --

    #[test]
    fn glob_star_matches_single_segment() {
        assert!(glob_match("/api/*", "/api/users"));
        assert!(glob_match("/api/*", "/api/products"));
        assert!(!glob_match("/api/*", "/api/users/123"));
    }

    #[test]
    fn glob_doublestar_matches_multiple_segments() {
        assert!(glob_match("/api/**", "/api/users"));
        assert!(glob_match("/api/**", "/api/users/123"));
        assert!(glob_match("/api/**", "/api/a/b/c/d"));
        assert!(!glob_match("/api/**", "/docs/intro"));
    }

    #[test]
    fn glob_exact_match() {
        assert!(glob_match("/about", "/about"));
        assert!(!glob_match("/about", "/about/team"));
    }

    #[test]
    fn glob_middle_wildcard() {
        assert!(glob_match("/docs/*/intro", "/docs/v2/intro"));
        assert!(!glob_match("/docs/*/intro", "/docs/v2/v3/intro"));
    }

    #[test]
    fn glob_no_pattern_matches_nothing() {
        // Empty pattern only matches empty string
        assert!(glob_match("", ""));
        assert!(!glob_match("", "/foo"));
    }

    #[test]
    fn glob_trailing_star() {
        assert!(glob_match("/blog*", "/blog"));
        assert!(glob_match("/blog*", "/blog-post"));
        assert!(!glob_match("/blog*", "/blog/post")); // * doesn't cross /
    }
}
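As with the fetch client, a short usage sketch for the crawler (illustrative only; the seed URL is an assumption):

use webclaw_fetch::{CrawlConfig, Crawler};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let crawler = Crawler::new("https://example.com/docs/", CrawlConfig::default())?;
    let result = crawler.crawl("https://example.com/docs/").await;
    // `pages` preserves BFS order; failed pages carry `error` instead of `extraction`.
    println!("{} ok, {} errors in {:.1}s", result.ok, result.errors, result.elapsed_secs);
    Ok(())
}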
crates/webclaw-fetch/src/error.rs (new file, 24 lines)
@@ -0,0 +1,24 @@
/// Fetch-layer errors. Wraps primp/network failures into a single type
/// that callers can match on without leaking transport details.
use thiserror::Error;

#[derive(Debug, Error)]
pub enum FetchError {
    #[error("request failed: {0}")]
    Request(#[from] primp::Error),

    #[error("invalid url: {0}")]
    InvalidUrl(String),

    #[error("response body decode failed: {0}")]
    BodyDecode(String),

    #[error("extraction failed: {0}")]
    Extraction(#[from] webclaw_core::ExtractError),

    #[error("PDF extraction failed: {0}")]
    Pdf(#[from] webclaw_pdf::PdfError),

    #[error("client build failed: {0}")]
    Build(String),
}
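Since `FetchError` is the crate's single error surface, callers can branch on transport versus extraction failures without knowing about primp. A small sketch (the category strings are assumptions, not part of the commit):

use webclaw_fetch::FetchError;

fn describe(err: &FetchError) -> &'static str {
    match err {
        FetchError::Request(_) => "network/transport failure",
        FetchError::InvalidUrl(_) => "caller passed a bad URL",
        FetchError::BodyDecode(_) => "body arrived but could not be decoded",
        FetchError::Extraction(_) | FetchError::Pdf(_) => "content-processing failure",
        FetchError::Build(_) => "client construction failure or retry exhaustion",
    }
}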
crates/webclaw-fetch/src/lib.rs (new file, 20 lines)
@@ -0,0 +1,20 @@
/// webclaw-fetch: HTTP client layer with browser TLS fingerprint impersonation.
/// Uses primp under the hood to make requests that look like real
/// browsers at the TLS, HTTP/2, and header levels.
/// Automatically detects PDF responses and delegates to webclaw-pdf.
pub mod browser;
pub mod client;
pub mod crawler;
pub mod error;
pub mod linkedin;
pub mod proxy;
pub mod reddit;
pub mod sitemap;

pub use browser::BrowserProfile;
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
pub use crawler::{CrawlConfig, CrawlResult, Crawler, PageResult};
pub use error::FetchError;
pub use proxy::{parse_proxy_file, parse_proxy_line};
pub use sitemap::SitemapEntry;
pub use webclaw_pdf::PdfMode;
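These re-exports mean downstream code only needs the crate root, for example:

use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig, PdfMode};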
crates/webclaw-fetch/src/linkedin.rs (new file, 279 lines)
@@ -0,0 +1,279 @@
/// LinkedIn post extraction from authenticated HTML.
|
||||
///
|
||||
/// LinkedIn's SPA stores all data in `<code>` tags as HTML-escaped JSON.
|
||||
/// The `included` array contains typed entities: Update (post), Comment,
|
||||
/// Profile, etc. We parse these to reconstruct post + comments as markdown.
|
||||
use serde_json::Value;
|
||||
use tracing::debug;
|
||||
use webclaw_core::{Content, ExtractionResult, Metadata};
|
||||
|
||||
/// Check if a URL is a LinkedIn post/activity.
|
||||
pub fn is_linkedin_post(url: &str) -> bool {
|
||||
let host = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("");
|
||||
(host == "www.linkedin.com" || host == "linkedin.com")
|
||||
&& (url.contains("/feed/update/") || url.contains("/posts/"))
|
||||
}
|
||||
|
||||
/// Extract `<code>` block contents from HTML using simple string scanning.
|
||||
/// LinkedIn wraps JSON data in `<code>` tags with HTML-escaped content.
|
||||
fn extract_code_blocks(html: &str) -> Vec<String> {
|
||||
let mut blocks = Vec::new();
|
||||
let mut search_from = 0;
|
||||
while let Some(start) = html[search_from..].find("<code") {
|
||||
let abs_start = search_from + start;
|
||||
// Find end of opening tag
|
||||
let Some(tag_end) = html[abs_start..].find('>') else {
|
||||
break;
|
||||
};
|
||||
let content_start = abs_start + tag_end + 1;
|
||||
let Some(end) = html[content_start..].find("</code>") else {
|
||||
break;
|
||||
};
|
||||
let content = &html[content_start..content_start + end];
|
||||
if content.len() > 1000 {
|
||||
blocks.push(html_unescape(content));
|
||||
}
|
||||
search_from = content_start + end + 7;
|
||||
}
|
||||
blocks
|
||||
}
|
||||
|
||||
/// Extract post + comments from LinkedIn's SSR HTML (requires auth cookies).
|
||||
pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult> {
|
||||
let code_blocks = extract_code_blocks(html);
|
||||
|
||||
// Find the largest <code> block with "included" — that's the main data payload
|
||||
let mut best_included: Option<Vec<Value>> = None;
|
||||
for raw in &code_blocks {
|
||||
if let Ok(obj) = serde_json::from_str::<Value>(raw)
|
||||
&& let Some(arr) = obj.get("included").and_then(|v| v.as_array())
|
||||
{
|
||||
let current_len = best_included.as_ref().map(|a| a.len()).unwrap_or(0);
|
||||
if arr.len() > current_len {
|
||||
                best_included = Some(arr.clone());
            }
        }
    }

    let included = best_included?;
    debug!(entities = included.len(), "linkedin: found included array");

    // Collect profiles (entityUrn → "First Last")
    let mut profiles = std::collections::HashMap::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if t.contains("Profile") {
            let urn = item.get("entityUrn").and_then(|v| v.as_str()).unwrap_or("");
            let first = item.get("firstName").and_then(|v| v.as_str()).unwrap_or("");
            let last = item.get("lastName").and_then(|v| v.as_str()).unwrap_or("");
            let headline = item.get("headline").and_then(|v| v.as_str()).unwrap_or("");
            if !first.is_empty() {
                profiles.insert(
                    urn.to_string(),
                    (
                        format!("{first} {last}").trim().to_string(),
                        headline.to_string(),
                    ),
                );
            }
        }
    }

    // Find the main post (Update type)
    let mut markdown = String::new();
    let mut post_author = String::new();
    let mut post_headline = String::new();

    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if !t.contains("Update") {
            continue;
        }

        // Get author from actor profile
        if let Some(actor) = item.get("actor") {
            // actor can have a nested profile reference or inline data
            let author_urn = actor
                .get("*author")
                .or(actor.get("author"))
                .and_then(|v| v.as_str())
                .unwrap_or("");
            if let Some((name, headline)) = profiles.get(author_urn) {
                post_author = name.clone();
                post_headline = headline.clone();
            }
            // Or inline name
            if post_author.is_empty()
                && let Some(name) = actor.get("name").and_then(|v| v.as_object())
            {
                let text = name.get("text").and_then(|v| v.as_str()).unwrap_or("");
                if !text.is_empty() {
                    post_author = text.to_string();
                }
            }
            if post_headline.is_empty()
                && let Some(desc) = actor.get("description").and_then(|v| v.as_object())
            {
                let text = desc.get("text").and_then(|v| v.as_str()).unwrap_or("");
                if !text.is_empty() {
                    post_headline = text.to_string();
                }
            }
        }

        // Get post body from commentary
        if let Some(commentary) = item.get("commentary")
            && let Some(text) = commentary
                .get("text")
                .and_then(|v| v.as_object())
                .and_then(|o| o.get("text"))
                .and_then(|v| v.as_str())
        {
            if !post_author.is_empty() {
                markdown.push_str(&format!("# {post_author}\n\n"));
            }
            if !post_headline.is_empty() {
                markdown.push_str(&format!("*{post_headline}*\n\n"));
            }
            markdown.push_str("---\n\n");
            // Unescape literal \n from JSON
            markdown.push_str(&text.replace("\\n", "\n"));
            markdown.push_str("\n\n");
        }
    }

    if markdown.is_empty() {
        return None;
    }

    // Collect comments — LinkedIn stores comment text in `commentary.text`
    // and commenter name in `commenter.title.text`
    let mut comments: Vec<(String, String)> = Vec::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if !t.contains("Comment") {
            continue;
        }

        // Get comment text from commentary.text
        let text = item
            .get("commentary")
            .and_then(|c| c.get("text"))
            .and_then(|v| v.as_str())
            .unwrap_or("");
        if text.is_empty() {
            continue;
        }

        // Get commenter name from commenter.title.text
        let name = item
            .get("commenter")
            .and_then(|c| c.get("title"))
            .and_then(|n| n.get("text"))
            .and_then(|v| v.as_str())
            .unwrap_or("Someone");

        comments.push((name.to_string(), text.to_string()));
    }

    if !comments.is_empty() {
        markdown.push_str("---\n\n## Comments\n\n");
        for (name, text) in &comments {
            markdown.push_str(&format!("- **{name}**: {text}\n\n"));
        }
    }

    let word_count = markdown.split_whitespace().count();
    debug!(
        word_count,
        comments = comments.len(),
        "linkedin extraction done"
    );

    Some(ExtractionResult {
        metadata: Metadata {
            title: if post_author.is_empty() {
                None
            } else {
                Some(format!("{post_author}'s LinkedIn Post"))
            },
            description: None,
            author: if post_author.is_empty() {
                None
            } else {
                Some(post_author)
            },
            published_date: None,
            language: None,
            url: Some(url.to_string()),
            site_name: Some("LinkedIn".into()),
            image: None,
            favicon: None,
            word_count,
        },
        content: Content {
            markdown,
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
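
// Illustrative sketch (added commentary, not in the original commit): the rough
// shape of the LinkedIn `included` entities consumed above. The values are
// invented; only the keys the code actually reads ($type, entityUrn, firstName,
// lastName, headline, actor, commentary, commenter.title.text) come from the code.
//
//   { "$type": "...Profile", "entityUrn": "urn:li:fsd_profile:123",
//     "firstName": "Ada", "lastName": "Lovelace", "headline": "Engineer" }
//   { "$type": "...Update", "actor": { "*author": "urn:li:fsd_profile:123" },
//     "commentary": { "text": { "text": "Post body..." } } }
//   { "$type": "...Comment", "commentary": { "text": "Nice post!" },
//     "commenter": { "title": { "text": "Grace Hopper" } } }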

/// Unescape HTML entities (named + numeric decimal).
fn html_unescape(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '&' {
            out.push(c);
            continue;
        }
        // Collect until ';'
        let mut entity = String::new();
        for c2 in chars.by_ref() {
            if c2 == ';' {
                break;
            }
            entity.push(c2);
            if entity.len() > 10 {
                break;
            }
        }
        match entity.as_str() {
            "quot" => out.push('"'),
            "amp" => out.push('&'),
            "lt" => out.push('<'),
            "gt" => out.push('>'),
            "apos" => out.push('\''),
            s if s.starts_with('#') => {
                let num = &s[1..];
                if let Ok(n) = num.parse::<u32>()
                    && let Some(ch) = char::from_u32(n)
                {
                    out.push(ch);
                    continue;
                }
                out.push('&');
                out.push_str(&entity);
                out.push(';');
            }
            _ => {
                out.push('&');
                out.push_str(&entity);
                out.push(';');
            }
        }
    }
    out
}
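
// Usage sketch (illustrative, not part of the original file), following the
// behavior implemented above:
//
//   assert_eq!(html_unescape("Fish &amp; Chips"), "Fish & Chips");
//   assert_eq!(html_unescape("&#233;clair"), "éclair"); // numeric decimal
//   assert_eq!(html_unescape("&unknown; stays"), "&unknown; stays"); // unknown entities pass through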

122
crates/webclaw-fetch/src/proxy.rs
Normal file
@ -0,0 +1,122 @@
/// Proxy file parsing utilities.
///
/// Format: `host:port:user:pass` (one per line).
/// Lines starting with `#` and blank lines are skipped.
/// Also accepts `host:port` (no auth).
use crate::error::FetchError;

/// Parse a single proxy line into an HTTP proxy URL.
///
/// Accepts two formats:
/// - `host:port:user:pass` -> `http://user:pass@host:port`
/// - `host:port` -> `http://host:port`
pub fn parse_proxy_line(line: &str) -> Option<String> {
    let parts: Vec<&str> = line.trim().splitn(4, ':').collect();
    match parts.len() {
        4 => Some(format!(
            "http://{}:{}@{}:{}",
            parts[2], parts[3], parts[0], parts[1]
        )),
        2 => Some(format!("http://{}:{}", parts[0], parts[1])),
        _ => None,
    }
}
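
// Behavior note (illustrative sketch, not in the original file): `splitn(4, ':')`
// folds any extra colons into the fourth field, so passwords containing ':'
// survive intact:
//
//   assert_eq!(
//       parse_proxy_line("host:8080:user:pa:ss").as_deref(),
//       Some("http://user:pa:ss@host:8080"),
//   );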

/// Load proxies from a file, returning parsed HTTP proxy URLs.
///
/// Skips blank lines and `#` comments. Returns an error if the file
/// can't be read or contains no valid entries.
pub fn parse_proxy_file(path: &str) -> Result<Vec<String>, FetchError> {
    let content = std::fs::read_to_string(path)
        .map_err(|e| FetchError::Build(format!("failed to read proxy file: {e}")))?;

    let proxies: Vec<String> = content
        .lines()
        .filter_map(|line| {
            let trimmed = line.trim();
            if trimmed.is_empty() || trimmed.starts_with('#') {
                None
            } else {
                parse_proxy_line(trimmed)
            }
        })
        .collect();

    if proxies.is_empty() {
        return Err(FetchError::Build(
            "proxy file is empty or has no valid entries".into(),
        ));
    }

    Ok(proxies)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    #[test]
    fn parse_host_port_user_pass() {
        let result = parse_proxy_line("proxy.example.com:8080:alice:s3cret");
        assert_eq!(
            result.as_deref(),
            Some("http://alice:s3cret@proxy.example.com:8080")
        );
    }

    #[test]
    fn parse_host_port_only() {
        let result = parse_proxy_line("10.0.0.1:3128");
        assert_eq!(result.as_deref(), Some("http://10.0.0.1:3128"));
    }

    #[test]
    fn parse_trims_whitespace() {
        let result = parse_proxy_line(" host:9999:user:pass ");
        assert_eq!(result.as_deref(), Some("http://user:pass@host:9999"));
    }

    #[test]
    fn parse_invalid_returns_none() {
        assert!(parse_proxy_line("just-a-hostname").is_none());
        assert!(parse_proxy_line("a:b:c").is_none()); // 3 parts is invalid
        assert!(parse_proxy_line("").is_none());
    }

    #[test]
    fn parse_file_happy_path() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("proxies.txt");
        let mut f = std::fs::File::create(&path).unwrap();
        writeln!(f, "# residential pool").unwrap();
        writeln!(f, "host1:8080:user1:pass1").unwrap();
        writeln!(f).unwrap(); // blank line
        writeln!(f, "host2:3128").unwrap();
        writeln!(f, "# datacenter").unwrap();
        writeln!(f, "host3:9999:u:p").unwrap();
        drop(f);

        let proxies = parse_proxy_file(path.to_str().unwrap()).unwrap();
        assert_eq!(proxies.len(), 3);
        assert_eq!(proxies[0], "http://user1:pass1@host1:8080");
        assert_eq!(proxies[1], "http://host2:3128");
        assert_eq!(proxies[2], "http://u:p@host3:9999");
    }

    #[test]
    fn parse_file_empty_errors() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("empty.txt");
        std::fs::write(&path, "# only comments\n\n").unwrap();

        let err = parse_proxy_file(path.to_str().unwrap());
        assert!(err.is_err());
    }

    #[test]
    fn parse_file_missing_errors() {
        let err = parse_proxy_file("/nonexistent/proxies.txt");
        assert!(err.is_err());
    }
}

172
crates/webclaw-fetch/src/reddit.rs
Normal file
@ -0,0 +1,172 @@
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
///
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
/// comment tree as structured JSON, which we convert to clean markdown.
use serde::Deserialize;
use tracing::debug;
use webclaw_core::{Content, ExtractionResult, Metadata};

/// Check if a URL points to a Reddit post/comment page.
pub fn is_reddit_url(url: &str) -> bool {
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    matches!(
        host,
        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
    )
}

/// Build the `.json` URL from a Reddit page URL.
pub fn json_url(url: &str) -> String {
    let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
    format!("{clean}.json")
}
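
// Usage sketch (illustrative, not part of the original file): query strings and
// trailing slashes are stripped before `.json` is appended.
//
//   assert!(is_reddit_url("https://old.reddit.com/r/rust/comments/abc/post/"));
//   assert_eq!(
//       json_url("https://www.reddit.com/r/rust/comments/abc/post/?share_id=x"),
//       "https://www.reddit.com/r/rust/comments/abc/post.json",
//   );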

/// Convert Reddit JSON API response into an ExtractionResult.
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
    let listings: Vec<Listing> =
        serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;

    let mut markdown = String::new();
    let mut title = None;
    let mut author = None;
    let mut subreddit = None;

    // First listing = the post itself
    if let Some(post_listing) = listings.first() {
        for child in &post_listing.data.children {
            if child.kind == "t3" {
                let d = &child.data;
                title = d.title.clone();
                author = d.author.clone();
                subreddit = d.subreddit_name_prefixed.clone();

                if let Some(ref t) = title {
                    markdown.push_str(&format!("# {t}\n\n"));
                }
                if let (Some(a), Some(sr)) = (&author, &subreddit) {
                    markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
                }
                if let Some(ref body) = d.selftext
                    && !body.is_empty()
                {
                    markdown.push_str(body);
                    markdown.push_str("\n\n");
                }
                if let Some(ref url_field) = d.url_overridden_by_dest
                    && !url_field.is_empty()
                {
                    markdown.push_str(&format!("[Link]({url_field})\n\n"));
                }
                markdown.push_str("---\n\n");
            }
        }
    }

    // Second listing = comment tree
    if let Some(comment_listing) = listings.get(1) {
        markdown.push_str("## Comments\n\n");
        for child in &comment_listing.data.children {
            render_comment(child, 0, &mut markdown);
        }
    }

    let word_count = markdown.split_whitespace().count();
    debug!(word_count, "reddit json extracted");

    Ok(ExtractionResult {
        metadata: Metadata {
            title,
            description: None,
            author,
            published_date: None,
            language: Some("en".into()),
            url: Some(url.to_string()),
            site_name: subreddit,
            image: None,
            favicon: None,
            word_count,
        },
        content: Content {
            markdown,
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}

fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
    if thing.kind != "t1" {
        return;
    }
    let d = &thing.data;
    let indent = "  ".repeat(depth);
    let author = d.author.as_deref().unwrap_or("[deleted]");
    let body = d.body.as_deref().unwrap_or("[removed]");
    let score = d.score.unwrap_or(0);

    out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
    for line in body.lines() {
        out.push_str(&format!("{indent}  {line}\n"));
    }
    out.push('\n');

    // Recurse into replies
    if let Some(Replies::Listing(listing)) = &d.replies {
        for child in &listing.data.children {
            render_comment(child, depth + 1, out);
        }
    }
}
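
// Output shape sketch (illustrative, not in the original file): a `t1` comment
// with one nested reply renders as an indented markdown list. Usernames and
// scores here are invented examples:
//
//   - **u/alice** (42 pts)
//     Great writeup!
//
//     - **u/bob** (7 pts)
//       Agreed.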

// --- Reddit JSON types (minimal) ---

#[derive(Deserialize)]
struct Listing {
    data: ListingData,
}

#[derive(Deserialize)]
struct ListingData {
    children: Vec<Thing>,
}

#[derive(Deserialize)]
struct Thing {
    kind: String,
    data: ThingData,
}

#[derive(Deserialize)]
struct ThingData {
    // Post fields (t3)
    title: Option<String>,
    selftext: Option<String>,
    subreddit_name_prefixed: Option<String>,
    url_overridden_by_dest: Option<String>,
    // Comment fields (t1)
    author: Option<String>,
    body: Option<String>,
    score: Option<i64>,
    replies: Option<Replies>,
}

/// Reddit replies can be either a nested Listing or an empty string.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
    Listing(Listing),
    #[allow(dead_code)]
    Empty(String),
}
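
// Deserialization sketch (illustrative, not part of the original file): the
// untagged enum accepts both shapes Reddit emits for `replies`.
//
//   let nested: Replies = serde_json::from_str(r#"{"data":{"children":[]}}"#).unwrap();
//   let empty: Replies = serde_json::from_str(r#""""#).unwrap(); // replies == ""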

582
crates/webclaw-fetch/src/sitemap.rs
Normal file
@ -0,0 +1,582 @@
/// Sitemap parsing and URL discovery.
///
/// Discovers URLs from a site's sitemaps using a 3-step process:
/// 1. Parse robots.txt for `Sitemap:` directives
/// 2. Try /sitemap.xml as fallback
/// 3. Recursively resolve sitemap index files
///
/// All HTTP requests go through FetchClient to inherit TLS fingerprinting.
use std::collections::HashSet;

use quick_xml::Reader;
use quick_xml::events::Event;
use serde::Serialize;
use tracing::{debug, warn};

use crate::client::FetchClient;
use crate::error::FetchError;

/// Maximum depth when recursively fetching sitemap index files.
/// Prevents infinite loops from circular sitemap references.
const MAX_RECURSION_DEPTH: usize = 3;

/// A single URL discovered from a sitemap.
#[derive(Debug, Clone, Serialize)]
pub struct SitemapEntry {
    pub url: String,
    pub last_modified: Option<String>,
    pub priority: Option<f64>,
    pub change_freq: Option<String>,
}

/// Discover all URLs from a site's sitemaps.
///
/// Discovery order:
/// 1. Fetch /robots.txt, parse `Sitemap:` directives
/// 2. Fetch /sitemap.xml directly
/// 3. If sitemap index, recursively fetch child sitemaps
/// 4. Deduplicate by URL
///
/// Returns an empty vec (not an error) if no sitemaps are found.
pub async fn discover(
    client: &FetchClient,
    base_url: &str,
) -> Result<Vec<SitemapEntry>, FetchError> {
    let base = base_url.trim_end_matches('/');
    let mut sitemap_urls: Vec<String> = Vec::new();

    // Step 1: Try robots.txt
    let robots_url = format!("{base}/robots.txt");
    debug!(url = %robots_url, "fetching robots.txt");

    match client.fetch(&robots_url).await {
        Ok(result) if result.status == 200 => {
            let found = parse_robots_txt(&result.html);
            debug!(count = found.len(), "sitemap URLs from robots.txt");
            sitemap_urls.extend(found);
        }
        Ok(result) => {
            debug!(status = result.status, "robots.txt not found");
        }
        Err(e) => {
            debug!(error = %e, "failed to fetch robots.txt");
        }
    }

    // Step 2: Always try /sitemap.xml as well (may not be listed in robots.txt)
    let default_sitemap = format!("{base}/sitemap.xml");
    if !sitemap_urls.iter().any(|u| u == &default_sitemap) {
        sitemap_urls.push(default_sitemap);
    }

    // Step 3: Fetch and parse each sitemap, handling indexes recursively
    let mut seen_urls: HashSet<String> = HashSet::new();
    let mut entries: Vec<SitemapEntry> = Vec::new();

    fetch_sitemaps(client, &sitemap_urls, &mut entries, &mut seen_urls, 0).await;

    debug!(total = entries.len(), "sitemap discovery complete");
    Ok(entries)
}
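
// Usage sketch (illustrative; how `FetchClient` is constructed is assumed here,
// not taken from this file):
//
//   let client = FetchClient::default();
//   let entries = discover(&client, "https://example.com").await?;
//   for e in entries.iter().take(10) {
//       println!("{} (lastmod: {:?})", e.url, e.last_modified);
//   }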

/// Recursively fetch and parse sitemap URLs, handling both urlsets and indexes.
async fn fetch_sitemaps(
    client: &FetchClient,
    urls: &[String],
    entries: &mut Vec<SitemapEntry>,
    seen_urls: &mut HashSet<String>,
    depth: usize,
) {
    if depth > MAX_RECURSION_DEPTH {
        warn!(depth, "sitemap recursion limit reached, stopping");
        return;
    }

    for sitemap_url in urls {
        debug!(url = %sitemap_url, depth, "fetching sitemap");

        let xml = match client.fetch(sitemap_url).await {
            Ok(result) if result.status == 200 => result.html,
            Ok(result) => {
                debug!(url = %sitemap_url, status = result.status, "sitemap not found");
                continue;
            }
            Err(e) => {
                debug!(url = %sitemap_url, error = %e, "failed to fetch sitemap");
                continue;
            }
        };

        match detect_sitemap_type(&xml) {
            SitemapType::UrlSet => {
                let parsed = parse_urlset(&xml);
                for entry in parsed {
                    if seen_urls.insert(entry.url.clone()) {
                        entries.push(entry);
                    }
                }
            }
            SitemapType::Index => {
                let child_urls = parse_sitemap_index(&xml);
                debug!(count = child_urls.len(), "found child sitemaps in index");

                // Box the recursive call to avoid large future sizes
                Box::pin(fetch_sitemaps(
                    client,
                    &child_urls,
                    entries,
                    seen_urls,
                    depth + 1,
                ))
                .await;
            }
            SitemapType::Unknown => {
                debug!(url = %sitemap_url, "unrecognized sitemap format, skipping");
            }
        }
    }
}

// ---------------------------------------------------------------------------
// Pure parsing functions (no I/O, fully testable)
// ---------------------------------------------------------------------------

/// Extract `Sitemap:` directive URLs from robots.txt content.
pub fn parse_robots_txt(text: &str) -> Vec<String> {
    text.lines()
        .filter_map(|line| {
            let trimmed = line.trim();
            // Case-insensitive match for "Sitemap:" prefix
            if trimmed.len() > 8 && trimmed[..8].eq_ignore_ascii_case("sitemap:") {
                let url = trimmed[8..].trim();
                if !url.is_empty() {
                    return Some(url.to_string());
                }
            }
            None
        })
        .collect()
}

/// Parse a sitemap XML string. Handles both `<urlset>` and `<sitemapindex>`.
/// Returns entries from urlsets and recursion targets from indexes.
pub fn parse_sitemap_xml(xml: &str) -> Vec<SitemapEntry> {
    match detect_sitemap_type(xml) {
        SitemapType::UrlSet => parse_urlset(xml),
        SitemapType::Index => {
            // For the public parsing API, convert index <loc> entries into
            // SitemapEntry with just the URL. The async `discover` function
            // handles actual recursive fetching.
            parse_sitemap_index(xml)
                .into_iter()
                .map(|url| SitemapEntry {
                    url,
                    last_modified: None,
                    priority: None,
                    change_freq: None,
                })
                .collect()
        }
        SitemapType::Unknown => Vec::new(),
    }
}

#[derive(Debug, PartialEq)]
enum SitemapType {
    UrlSet,
    Index,
    Unknown,
}

/// Peek at the first element to determine if this is a urlset or sitemapindex.
fn detect_sitemap_type(xml: &str) -> SitemapType {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
                let name = e.local_name();
                return match name.as_ref() {
                    b"urlset" => SitemapType::UrlSet,
                    b"sitemapindex" => SitemapType::Index,
                    _ => continue, // skip processing instructions, comments
                };
            }
            Ok(Event::Eof) => return SitemapType::Unknown,
            Err(_) => return SitemapType::Unknown,
            _ => continue,
        }
    }
}

/// Parse `<url>` entries from a `<urlset>` sitemap.
fn parse_urlset(xml: &str) -> Vec<SitemapEntry> {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    let mut entries = Vec::new();

    // State for current <url> element being parsed
    let mut in_url = false;
    let mut current_tag: Option<UrlTag> = None;
    let mut loc: Option<String> = None;
    let mut lastmod: Option<String> = None;
    let mut priority: Option<f64> = None;
    let mut changefreq: Option<String> = None;

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"url" => {
                        in_url = true;
                        loc = None;
                        lastmod = None;
                        priority = None;
                        changefreq = None;
                    }
                    b"loc" if in_url => current_tag = Some(UrlTag::Loc),
                    b"lastmod" if in_url => current_tag = Some(UrlTag::LastMod),
                    b"priority" if in_url => current_tag = Some(UrlTag::Priority),
                    b"changefreq" if in_url => current_tag = Some(UrlTag::ChangeFreq),
                    _ => current_tag = None,
                }
            }
            Ok(Event::Text(ref e)) => {
                if let Some(ref tag) = current_tag
                    && let Ok(text) = e.unescape()
                {
                    let text = text.trim().to_string();
                    if !text.is_empty() {
                        match tag {
                            UrlTag::Loc => loc = Some(text),
                            UrlTag::LastMod => lastmod = Some(text),
                            UrlTag::Priority => priority = text.parse().ok(),
                            UrlTag::ChangeFreq => changefreq = Some(text),
                        }
                    }
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.local_name();
                if name.as_ref() == b"url" && in_url {
                    if let Some(url) = loc.take() {
                        entries.push(SitemapEntry {
                            url,
                            last_modified: lastmod.take(),
                            priority: priority.take(),
                            change_freq: changefreq.take(),
                        });
                    }
                    in_url = false;
                }
                current_tag = None;
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                warn!(error = %e, "XML parse error in sitemap, returning partial results");
                break;
            }
            _ => {}
        }
        buf.clear();
    }

    entries
}

#[derive(Debug)]
enum UrlTag {
    Loc,
    LastMod,
    Priority,
    ChangeFreq,
}

/// Parse `<sitemap>` entries from a `<sitemapindex>`, returning child sitemap URLs.
fn parse_sitemap_index(xml: &str) -> Vec<String> {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    let mut urls = Vec::new();

    let mut in_sitemap = false;
    let mut in_loc = false;

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"sitemap" => in_sitemap = true,
                    b"loc" if in_sitemap => in_loc = true,
                    _ => {}
                }
            }
            Ok(Event::Text(ref e)) => {
                if in_loc && let Ok(text) = e.unescape() {
                    let text = text.trim().to_string();
                    if !text.is_empty() {
                        urls.push(text);
                    }
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"sitemap" => {
                        in_sitemap = false;
                        in_loc = false;
                    }
                    b"loc" => in_loc = false,
                    _ => {}
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                warn!(error = %e, "XML parse error in sitemap index, returning partial results");
                break;
            }
            _ => {}
        }
        buf.clear();
    }

    urls
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_urlset() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/</loc>
    <lastmod>2026-01-15</lastmod>
    <changefreq>daily</changefreq>
    <priority>1.0</priority>
  </url>
  <url>
    <loc>https://example.com/about</loc>
    <lastmod>2026-01-10</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://example.com/blog/post-1</loc>
  </url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 3);

        assert_eq!(entries[0].url, "https://example.com/");
        assert_eq!(entries[0].last_modified.as_deref(), Some("2026-01-15"));
        assert_eq!(entries[0].change_freq.as_deref(), Some("daily"));
        assert_eq!(entries[0].priority, Some(1.0));

        assert_eq!(entries[1].url, "https://example.com/about");
        assert_eq!(entries[1].priority, Some(0.8));

        assert_eq!(entries[2].url, "https://example.com/blog/post-1");
        assert_eq!(entries[2].last_modified, None);
        assert_eq!(entries[2].priority, None);
        assert_eq!(entries[2].change_freq, None);
    }

    #[test]
    fn test_parse_sitemap_index() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/sitemap-posts.xml</loc>
    <lastmod>2026-03-01</lastmod>
  </sitemap>
  <sitemap>
    <loc>https://example.com/sitemap-pages.xml</loc>
  </sitemap>
</sitemapindex>"#;

        let urls = parse_sitemap_index(xml);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0], "https://example.com/sitemap-posts.xml");
        assert_eq!(urls[1], "https://example.com/sitemap-pages.xml");
    }

    #[test]
    fn test_parse_sitemap_xml_dispatches_urlset() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/page</loc></url>
</urlset>"#;

        let entries = parse_sitemap_xml(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].url, "https://example.com/page");
    }

    #[test]
    fn test_parse_sitemap_xml_dispatches_index() {
        let xml = r#"<?xml version="1.0"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://example.com/sitemap-1.xml</loc></sitemap>
</sitemapindex>"#;

        let entries = parse_sitemap_xml(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].url, "https://example.com/sitemap-1.xml");
        // Index entries have no metadata when parsed through the public API
        assert_eq!(entries[0].priority, None);
    }

    #[test]
    fn test_parse_robots_txt() {
        let robots = "User-agent: *\n\
                      Disallow: /admin/\n\
                      \n\
                      Sitemap: https://example.com/sitemap.xml\n\
                      sitemap: https://example.com/sitemap-news.xml\n\
                      SITEMAP: https://example.com/sitemap-images.xml\n\
                      \n\
                      User-agent: Googlebot\n\
                      Allow: /\n";

        let urls = parse_robots_txt(robots);
        assert_eq!(urls.len(), 3);
        assert_eq!(urls[0], "https://example.com/sitemap.xml");
        assert_eq!(urls[1], "https://example.com/sitemap-news.xml");
        assert_eq!(urls[2], "https://example.com/sitemap-images.xml");
    }

    #[test]
    fn test_parse_robots_txt_empty_value() {
        // "Sitemap:" with no URL should be skipped
        let robots = "Sitemap:\nSitemap: \nSitemap: https://example.com/s.xml\n";
        let urls = parse_robots_txt(robots);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0], "https://example.com/s.xml");
    }

    #[test]
    fn test_deduplicate() {
        // parse_sitemap_xml deduplicates via the discover() path, but
        // we can verify that parsing the same URL twice produces entries
        // that the HashSet in discover() would collapse.
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/page</loc></url>
  <url><loc>https://example.com/page</loc></url>
  <url><loc>https://example.com/other</loc></url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 3, "parser returns all entries");

        // Simulate the dedup that discover() does
        let mut seen = HashSet::new();
        let deduped: Vec<_> = entries
            .into_iter()
            .filter(|e| seen.insert(e.url.clone()))
            .collect();
        assert_eq!(deduped.len(), 2, "dedup collapses duplicates");
    }

    #[test]
    fn test_empty_sitemap() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>"#;

        let entries = parse_urlset(xml);
        assert!(entries.is_empty());
    }

    #[test]
    fn test_malformed_xml() {
        let xml = "this is not xml at all <><><";
        let entries = parse_sitemap_xml(xml);
        assert!(entries.is_empty(), "malformed XML returns empty vec");
    }

    #[test]
    fn test_malformed_xml_partial() {
        // Partial XML that starts valid but breaks mid-stream
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/good</loc></url>
  <url><loc>broken
"#;
        let entries = parse_sitemap_xml(xml);
        // Should return at least the successfully parsed entry
        assert!(!entries.is_empty());
        assert_eq!(entries[0].url, "https://example.com/good");
    }

    #[test]
    fn test_missing_loc() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <lastmod>2026-01-01</lastmod>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://example.com/valid</loc>
  </url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 1, "entry without <loc> is skipped");
        assert_eq!(entries[0].url, "https://example.com/valid");
    }

    #[test]
    fn test_priority_parsing() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/high</loc>
    <priority>1.0</priority>
  </url>
  <url>
    <loc>https://example.com/mid</loc>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://example.com/low</loc>
    <priority>0.1</priority>
  </url>
  <url>
    <loc>https://example.com/invalid</loc>
    <priority>not-a-number</priority>
  </url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 4);

        assert_eq!(entries[0].priority, Some(1.0));
        assert_eq!(entries[1].priority, Some(0.5));
        assert_eq!(entries[2].priority, Some(0.1));
        assert_eq!(entries[3].priority, None, "invalid priority parses as None");
    }

    #[test]
    fn test_detect_sitemap_type() {
        let urlset = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        assert_eq!(detect_sitemap_type(urlset), SitemapType::UrlSet);

        let index = r#"<?xml version="1.0"?><sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></sitemapindex>"#;
        assert_eq!(detect_sitemap_type(index), SitemapType::Index);

        assert_eq!(detect_sitemap_type("garbage"), SitemapType::Unknown);
        assert_eq!(detect_sitemap_type(""), SitemapType::Unknown);
    }
}