chore: rebrand webclaw to noxa

This commit is contained in:
Jacob Magar 2026-04-11 00:10:38 -04:00
parent a4c351d5ae
commit 8674b60b4e
86 changed files with 781 additions and 2121 deletions

View file

@ -0,0 +1,26 @@
[package]
name = "noxa-fetch"
description = "HTTP client with browser TLS fingerprint impersonation via wreq"
# Version/edition/license are inherited from the workspace root.
version.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
# Workspace-internal crates.
noxa-core = { workspace = true }
noxa-pdf = { path = "../noxa-pdf" }
# Shared workspace dependencies.
serde = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true }
# wreq (BoringSSL-based) provides browser-grade TLS + HTTP/2 fingerprinting;
# the feature list enables cookie storage and all common response encodings.
wreq = { version = "6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] }
http = "1"
bytes = "1"
url = "2"
rand = "0.8"
# quick-xml: sitemap parsing; calamine + zip: office-document extraction.
quick-xml = { version = "0.37", features = ["serde"] }
serde_json.workspace = true
calamine = "0.34"
zip = "2"
[dev-dependencies]
tempfile = "3"

View file

@ -0,0 +1,51 @@
//! Browser fingerprint selection and rotation.
//! Maps our BrowserProfile enum to noxa-http client builder methods.
/// Which browser identity to present at the TLS/HTTP layer.
///
/// Derives `Copy`/`PartialEq`/`Eq` for consistency with [`BrowserVariant`]
/// (all variants are unit variants, so the derives are free).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum BrowserProfile {
    /// Impersonate Chrome (the default).
    #[default]
    Chrome,
    /// Impersonate Firefox.
    Firefox,
    /// Randomly pick from all available profiles on each request.
    Random,
}
/// A browser variant for building noxa-http clients.
///
/// Derives `PartialEq`/`Eq`/`Hash` so variants can be compared and used
/// as map/set keys (derive eagerly on small unit-variant enums).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BrowserVariant {
    Chrome,
    ChromeMacos,
    Firefox,
    Safari,
    Edge,
}
/// All Chrome variants we ship.
pub fn chrome_variants() -> Vec<BrowserVariant> {
    [BrowserVariant::Chrome, BrowserVariant::ChromeMacos].to_vec()
}
/// All Firefox variants we ship.
pub fn firefox_variants() -> Vec<BrowserVariant> {
    [BrowserVariant::Firefox].to_vec()
}
/// All variants for maximum diversity in Random mode.
pub fn all_variants() -> Vec<BrowserVariant> {
    [
        BrowserVariant::Chrome,
        BrowserVariant::ChromeMacos,
        BrowserVariant::Firefox,
        BrowserVariant::Safari,
        BrowserVariant::Edge,
    ]
    .to_vec()
}
/// The Chrome variant used when `BrowserProfile::Chrome` is selected.
pub fn latest_chrome() -> BrowserVariant {
    BrowserVariant::Chrome
}
/// The Firefox variant used when `BrowserProfile::Firefox` is selected.
pub fn latest_firefox() -> BrowserVariant {
    BrowserVariant::Firefox
}

View file

@ -0,0 +1,836 @@
//! HTTP client with browser TLS fingerprint impersonation.
//! Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
//! Supports single and batch operations with proxy rotation.
//! Automatically detects PDF responses and extracts text via noxa-pdf.
//!
//! Two proxy modes:
//! - **Static**: single proxy (or none) baked into pre-built clients at construction.
//! - **Rotating**: pre-built pool of clients, each with a different proxy + fingerprint.
//!   Same-host URLs are routed to the same client for HTTP/2 connection reuse.
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::Arc;
use std::time::{Duration, Instant};
use rand::seq::SliceRandom;
use tokio::sync::Semaphore;
use tracing::{debug, instrument, warn};
use noxa_pdf::PdfMode;
use crate::browser::{self, BrowserProfile, BrowserVariant};
use crate::error::FetchError;
/// Configuration for building a [`FetchClient`].
#[derive(Debug, Clone)]
pub struct FetchConfig {
    /// Browser fingerprint profile to impersonate.
    pub browser: BrowserProfile,
    /// Single proxy URL. Used when `proxy_pool` is empty.
    pub proxy: Option<String>,
    /// Pool of proxy URLs to rotate through.
    /// When non-empty, each proxy gets a pre-built client with a
    /// random browser fingerprint. Same-host URLs reuse the same client
    /// for HTTP/2 connection multiplexing.
    pub proxy_pool: Vec<String>,
    /// Per-request timeout (passed to `tls::build_client`).
    pub timeout: Duration,
    /// Whether redirects are followed.
    pub follow_redirects: bool,
    /// Redirect cap when `follow_redirects` is true.
    pub max_redirects: u32,
    /// Header name/value pairs passed to client construction
    /// (presumably applied to every request — see `tls::build_client`).
    pub headers: HashMap<String, String>,
    /// How PDF responses are processed (forwarded to noxa-pdf).
    pub pdf_mode: PdfMode,
}
impl Default for FetchConfig {
    /// Defaults: Chrome fingerprint, no proxy, 12 s timeout, redirects on
    /// (max 10), an `Accept-Language` header, and default PDF handling.
    fn default() -> Self {
        let mut headers = HashMap::new();
        headers.insert("Accept-Language".to_owned(), "en-US,en;q=0.9".to_owned());
        Self {
            browser: BrowserProfile::Chrome,
            proxy: None,
            proxy_pool: Vec::new(),
            timeout: Duration::from_secs(12),
            follow_redirects: true,
            max_redirects: 10,
            headers,
            pdf_mode: PdfMode::default(),
        }
    }
}
/// Result of a successful fetch.
#[derive(Debug, Clone)]
pub struct FetchResult {
    /// Response body, decoded lossily as UTF-8.
    pub html: String,
    /// HTTP status code of the final response.
    pub status: u16,
    /// Final URL after any redirects.
    pub url: String,
    /// Headers of the final response.
    pub headers: http::HeaderMap,
    /// Wall-clock time for the whole fetch.
    pub elapsed: Duration,
}
/// Result for a single URL in a batch fetch operation.
#[derive(Debug)]
pub struct BatchResult {
    /// The URL this entry corresponds to (input order is preserved by the batch API).
    pub url: String,
    /// Per-URL fetch outcome; errors are kept per-entry rather than failing the batch.
    pub result: Result<FetchResult, FetchError>,
}
/// Result for a single URL in a batch fetch-and-extract operation.
#[derive(Debug)]
pub struct BatchExtractResult {
    /// The URL this entry corresponds to.
    pub url: String,
    /// Per-URL extraction outcome; errors are kept per-entry rather than failing the batch.
    pub result: Result<noxa_core::ExtractionResult, FetchError>,
}
/// Buffered response that owns its body. Provides the same sync API
/// that noxa-http::Response used to provide.
struct Response {
    // HTTP status code.
    status: u16,
    // Response URI as reported by wreq (stringified).
    url: String,
    headers: http::HeaderMap,
    // Fully-buffered response body.
    body: bytes::Bytes,
}
impl Response {
    /// Drain a wreq response into a fully-owned `Response`.
    ///
    /// Status, URI, and headers are captured before the body is read,
    /// since `bytes()` consumes the response.
    async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
        let (status, url, headers) = (
            resp.status().as_u16(),
            resp.uri().to_string(),
            resp.headers().clone(),
        );
        let body = resp
            .bytes()
            .await
            .map_err(|e| FetchError::BodyDecode(e.to_string()))?;
        Ok(Self { status, url, headers, body })
    }
    /// HTTP status code.
    fn status(&self) -> u16 {
        self.status
    }
    /// Response URL.
    fn url(&self) -> &str {
        &self.url
    }
    /// Response headers.
    fn headers(&self) -> &http::HeaderMap {
        &self.headers
    }
    /// Raw body bytes.
    fn body(&self) -> &[u8] {
        &self.body
    }
    /// True for 2xx status codes.
    fn is_success(&self) -> bool {
        self.status >= 200 && self.status < 300
    }
    /// Body as lossy UTF-8, borrowing when the body is valid UTF-8.
    fn text(&self) -> std::borrow::Cow<'_, str> {
        String::from_utf8_lossy(&self.body)
    }
    /// Consume the response, returning the body as an owned lossy-UTF-8 string.
    fn into_text(self) -> String {
        String::from_utf8_lossy(&self.body).into_owned()
    }
}
/// Internal representation of the client pool strategy.
/// Chosen once in [`FetchClient::new`] based on whether a proxy pool is configured.
enum ClientPool {
    /// Pre-built clients with a fixed proxy (or no proxy).
    /// Fingerprint rotation still works via the pool when `random` is true.
    Static {
        // One client per browser variant.
        clients: Vec<wreq::Client>,
        // True when the profile is `BrowserProfile::Random`.
        random: bool,
    },
    /// Pre-built pool of clients, each with a different proxy + fingerprint.
    /// Requests pick a client deterministically by host for HTTP/2 connection reuse.
    Rotating { clients: Vec<wreq::Client> },
}
/// HTTP client with browser TLS + HTTP/2 fingerprinting via wreq.
///
/// Operates in two modes:
/// - **Static pool**: pre-built clients, optionally with fingerprint rotation.
///   Used when no `proxy_pool` is configured. Fast (no per-request construction).
/// - **Rotating pool**: pre-built clients, one per proxy in the pool.
///   Same-host URLs are routed to the same client for HTTP/2 multiplexing.
pub struct FetchClient {
    // Pool strategy chosen at construction time.
    pool: ClientPool,
    // PDF handling mode, forwarded to noxa-pdf when a PDF response is detected.
    pdf_mode: PdfMode,
}
impl FetchClient {
    /// Build a new client from config.
    ///
    /// With an empty `proxy_pool`, builds one client per browser variant
    /// (static pool). Otherwise builds one client per proxy URL, each with
    /// a randomly chosen fingerprint (rotating pool).
    pub fn new(config: FetchConfig) -> Result<Self, FetchError> {
        let variants = collect_variants(&config.browser);
        let pdf_mode = config.pdf_mode.clone();
        let pool = if config.proxy_pool.is_empty() {
            // Static mode: one client per fingerprint variant, all sharing
            // the same (optional) single proxy.
            let clients = variants
                .into_iter()
                .map(|v| {
                    crate::tls::build_client(
                        v,
                        config.timeout,
                        &config.headers,
                        config.proxy.as_deref(),
                    )
                })
                .collect::<Result<Vec<_>, _>>()?;
            let random = matches!(config.browser, BrowserProfile::Random);
            debug!(
                count = clients.len(),
                random, "fetch client ready (static pool)"
            );
            ClientPool::Static { clients, random }
        } else {
            // Rotating mode: one client per proxy, each with a fingerprint
            // drawn at random from the profile's variants.
            let mut rng = rand::thread_rng();
            let clients = config
                .proxy_pool
                .iter()
                .map(|proxy| {
                    let v = *variants
                        .choose(&mut rng)
                        .expect("collect_variants always returns at least one variant");
                    crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy))
                })
                .collect::<Result<Vec<_>, _>>()?;
            debug!(
                clients = clients.len(),
                "fetch client ready (pre-built rotating pool)"
            );
            ClientPool::Rotating { clients }
        };
        Ok(Self { pool, pdf_mode })
    }
    /// Fetch a URL and return the raw HTML + response metadata.
    ///
    /// Automatically retries on transient failures (network errors, 5xx, 429)
    /// with exponential backoff: 0s, 1s (2 attempts total). A retryable
    /// status on the final attempt is still returned as `Ok` so callers
    /// can inspect the response.
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
        let delays = [Duration::ZERO, Duration::from_secs(1)];
        let mut last_err = None;
        for (attempt, delay) in delays.iter().enumerate() {
            if attempt > 0 {
                tokio::time::sleep(*delay).await;
            }
            match self.fetch_once(url).await {
                Ok(result) => {
                    if is_retryable_status(result.status) && attempt < delays.len() - 1 {
                        warn!(
                            url,
                            status = result.status,
                            attempt = attempt + 1,
                            "retryable status, will retry"
                        );
                        last_err = Some(FetchError::Build(format!("HTTP {}", result.status)));
                        continue;
                    }
                    if attempt > 0 {
                        debug!(url, attempt = attempt + 1, "retry succeeded");
                    }
                    return Ok(result);
                }
                Err(e) => {
                    // Non-transient errors (or the final attempt) propagate immediately.
                    if !is_retryable_error(&e) || attempt == delays.len() - 1 {
                        return Err(e);
                    }
                    warn!(
                        url,
                        error = %e,
                        attempt = attempt + 1,
                        "transient error, will retry"
                    );
                    last_err = Some(e);
                }
            }
        }
        Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
    }
    /// Single fetch attempt (no retries).
    async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
        let start = Instant::now();
        let client = self.pick_client(url);
        let resp = client.get(url).send().await?;
        let response = Response::from_wreq(resp).await?;
        response_to_result(response, start)
    }
    /// Fetch a URL then extract structured content.
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch_and_extract(
        &self,
        url: &str,
    ) -> Result<noxa_core::ExtractionResult, FetchError> {
        self.fetch_and_extract_with_options(url, &noxa_core::ExtractionOptions::default())
            .await
    }
    /// Fetch a URL then extract structured content with custom extraction options.
    ///
    /// Special cases, in order:
    /// - Reddit URLs are served from Reddit's JSON API when possible.
    /// - A detected challenge page triggers one cookie-warmup retry via the homepage.
    /// - PDF and office-document responses are routed to dedicated extractors.
    /// - LinkedIn posts are parsed from embedded JSON before the standard path.
    #[instrument(skip(self, options), fields(url = %url))]
    pub async fn fetch_and_extract_with_options(
        &self,
        url: &str,
        options: &noxa_core::ExtractionOptions,
    ) -> Result<noxa_core::ExtractionResult, FetchError> {
        // Reddit fallback: use their JSON API to get post + full comment tree.
        if crate::reddit::is_reddit_url(url) {
            let json_url = crate::reddit::json_url(url);
            debug!("reddit detected, fetching {json_url}");
            let client = self.pick_client(url);
            let resp = client.get(&json_url).send().await?;
            let response = Response::from_wreq(resp).await?;
            if response.is_success() {
                let bytes = response.body();
                match crate::reddit::parse_reddit_json(bytes, url) {
                    Ok(result) => return Ok(result),
                    Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
                }
            }
        }
        let start = Instant::now();
        let client = self.pick_client(url);
        let resp = client.get(url).send().await?;
        let mut response = Response::from_wreq(resp).await?;
        // Cookie warmup: if we get a challenge page, visit the homepage first
        // to collect Akamai cookies (_abck, bm_sz, etc.), then retry.
        if is_challenge_response(&response)
            && let Some(homepage) = extract_homepage(url)
        {
            debug!("challenge detected, warming cookies via {homepage}");
            // Best-effort: warmup failures are ignored, the retry decides.
            let _ = client.get(&homepage).send().await;
            let resp = client.get(url).send().await?;
            response = Response::from_wreq(resp).await?;
            debug!("retried after cookie warmup: status={}", response.status());
        }
        let status = response.status();
        let final_url = response.url().to_string();
        let headers = response.headers().clone();
        let is_pdf = is_pdf_content_type(&headers);
        if is_pdf {
            debug!(status, "detected PDF response, using pdf extraction");
            let bytes = response.body();
            let elapsed = start.elapsed();
            debug!(
                status,
                bytes = bytes.len(),
                elapsed_ms = %elapsed.as_millis(),
                "PDF fetch complete"
            );
            let pdf_result = noxa_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
            Ok(pdf_to_extraction_result(&pdf_result, &final_url))
        } else if let Some(doc_type) =
            crate::document::is_document_content_type(&headers, &final_url)
        {
            debug!(status, doc_type = ?doc_type, "detected document response, extracting");
            let bytes = response.body();
            let elapsed = start.elapsed();
            debug!(
                status,
                bytes = bytes.len(),
                elapsed_ms = %elapsed.as_millis(),
                "document fetch complete"
            );
            let mut result = crate::document::extract_document(bytes, doc_type)?;
            result.metadata.url = Some(final_url);
            Ok(result)
        } else {
            let html = response.into_text();
            let elapsed = start.elapsed();
            debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");
            // LinkedIn: extract from embedded <code> JSON blobs
            if crate::linkedin::is_linkedin_post(&final_url) {
                if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) {
                    debug!("linkedin extraction succeeded");
                    return Ok(result);
                }
                debug!("linkedin extraction failed, falling back to standard");
            }
            let extraction = noxa_core::extract_with_options(&html, Some(&final_url), options)?;
            Ok(extraction)
        }
    }
    /// Fetch multiple URLs concurrently with bounded parallelism.
    /// Results are returned in input order; panicked tasks are dropped.
    pub async fn fetch_batch(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
    ) -> Vec<BatchResult> {
        let semaphore = Arc::new(Semaphore::new(concurrency));
        let mut handles = Vec::with_capacity(urls.len());
        for (idx, url) in urls.iter().enumerate() {
            let permit = Arc::clone(&semaphore);
            let client = Arc::clone(self);
            let url = url.to_string();
            handles.push(tokio::spawn(async move {
                let _permit = permit.acquire().await.expect("semaphore closed");
                let result = client.fetch(&url).await;
                (idx, BatchResult { url, result })
            }));
        }
        collect_ordered(handles, urls.len()).await
    }
    /// Fetch and extract multiple URLs concurrently with bounded parallelism.
    pub async fn fetch_and_extract_batch(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
    ) -> Vec<BatchExtractResult> {
        self.fetch_and_extract_batch_with_options(
            urls,
            concurrency,
            &noxa_core::ExtractionOptions::default(),
        )
        .await
    }
    /// Fetch and extract multiple URLs concurrently with custom extraction options.
    /// Results are returned in input order; panicked tasks are dropped.
    pub async fn fetch_and_extract_batch_with_options(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
        options: &noxa_core::ExtractionOptions,
    ) -> Vec<BatchExtractResult> {
        let semaphore = Arc::new(Semaphore::new(concurrency));
        let mut handles = Vec::with_capacity(urls.len());
        for (idx, url) in urls.iter().enumerate() {
            let permit = Arc::clone(&semaphore);
            let client = Arc::clone(self);
            let url = url.to_string();
            let opts = options.clone();
            handles.push(tokio::spawn(async move {
                let _permit = permit.acquire().await.expect("semaphore closed");
                let result = client.fetch_and_extract_with_options(&url, &opts).await;
                (idx, BatchExtractResult { url, result })
            }));
        }
        collect_ordered(handles, urls.len()).await
    }
    /// Returns the number of proxies in the rotation pool, or 0 if static mode.
    pub fn proxy_pool_size(&self) -> usize {
        match &self.pool {
            ClientPool::Static { .. } => 0,
            ClientPool::Rotating { clients } => clients.len(),
        }
    }
    /// Pick a client from the pool for a given URL.
    ///
    /// Both the random static pool and the rotating pool route by host so
    /// that repeated requests to one host share a client — and therefore its
    /// HTTP/2 connection — as documented on [`ClientPool::Rotating`].
    fn pick_client(&self, url: &str) -> &wreq::Client {
        match &self.pool {
            ClientPool::Static { clients, random } => {
                if *random {
                    let host = extract_host(url);
                    pick_for_host(clients, &host)
                } else {
                    &clients[0]
                }
            }
            // Fix: previously picked randomly, contradicting the documented
            // deterministic same-host routing contract.
            ClientPool::Rotating { clients } => {
                let host = extract_host(url);
                pick_for_host(clients, &host)
            }
        }
    }
}
/// Map a browser profile to the set of fingerprint variants it may use.
fn collect_variants(profile: &BrowserProfile) -> Vec<BrowserVariant> {
    match profile {
        BrowserProfile::Chrome => vec![browser::latest_chrome()],
        BrowserProfile::Firefox => vec![browser::latest_firefox()],
        BrowserProfile::Random => browser::all_variants(),
    }
}
/// Convert a buffered Response into a FetchResult, logging timing.
///
/// Metadata is read before `into_text` consumes the response.
fn response_to_result(response: Response, start: Instant) -> Result<FetchResult, FetchError> {
    let status = response.status();
    let url = response.url().to_string();
    let headers = response.headers().clone();
    let html = response.into_text();
    let elapsed = start.elapsed();
    debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");
    Ok(FetchResult { html, status, url, headers, elapsed })
}
/// Host component of `url`, or `""` when the URL fails to parse or has no host.
fn extract_host(url: &str) -> String {
    match url::Url::parse(url) {
        Ok(parsed) => parsed.host_str().unwrap_or("").to_string(),
        Err(_) => String::new(),
    }
}
/// Pick a client deterministically based on a host string.
/// Same host always gets the same client, enabling HTTP/2 connection reuse.
fn pick_for_host<'a>(clients: &'a [wreq::Client], host: &str) -> &'a wreq::Client {
    use std::collections::hash_map::DefaultHasher;
    let mut hasher = DefaultHasher::new();
    host.hash(&mut hasher);
    // Hash-mod indexing: stable for a given host, spread across the pool.
    &clients[hasher.finish() as usize % clients.len()]
}
/// Uniformly random pool member, for per-request fingerprint rotation.
fn pick_random(clients: &[wreq::Client]) -> &wreq::Client {
    use rand::Rng;
    let mut rng = rand::thread_rng();
    &clients[rng.gen_range(0..clients.len())]
}
/// Status codes worth retrying: server errors + rate limiting.
/// 429 (rate limit), 502-504 (gateway errors), 520-524 (Cloudflare-range
/// origin errors). Plain 500 is deliberately not retried.
fn is_retryable_status(status: u16) -> bool {
    matches!(status, 429 | 502..=504 | 520..=524)
}
/// Errors worth retrying: network/connection failures (not client errors).
/// `Request` and `BodyDecode` cover transport-level and body-buffering
/// failures; all other variants are treated as permanent and fail fast.
fn is_retryable_error(err: &FetchError) -> bool {
    matches!(err, FetchError::Request(_) | FetchError::BodyDecode(_))
}
/// True when the Content-Type header's MIME type — ignoring parameters
/// such as `; charset=...` — is `application/pdf`, case-insensitively.
fn is_pdf_content_type(headers: &http::HeaderMap) -> bool {
    let Some(raw) = headers.get("content-type").and_then(|ct| ct.to_str().ok()) else {
        return false;
    };
    let mime = raw.split(';').next().unwrap_or("").trim();
    mime.eq_ignore_ascii_case("application/pdf")
}
/// Detect if a response looks like a bot protection challenge page.
///
/// Challenge pages are small: empty bodies and anything over ~15 KB are
/// assumed to be real content. Matches the generic challenge title, or the
/// Akamai sensor marker on very small (< 5 KB) bodies.
fn is_challenge_response(response: &Response) -> bool {
    let len = response.body().len();
    if len == 0 || len > 15_000 {
        return false;
    }
    let lower = response.text().to_lowercase();
    lower.contains("<title>challenge page</title>")
        || (len < 5_000 && lower.contains("bazadebezolkohpepadr"))
}
/// Extract the homepage URL (scheme + authority) from a full URL.
///
/// Preserves an explicit port so the cookie-warmup request hits the same
/// origin as the target URL (previously `host:8080` URLs warmed the wrong
/// origin). Returns `None` when the URL cannot be parsed or has no host
/// (previously a host-less URL produced the nonsensical `"scheme:///"`).
fn extract_homepage(url: &str) -> Option<String> {
    let parsed = url::Url::parse(url).ok()?;
    let host = parsed.host_str()?;
    Some(match parsed.port() {
        Some(port) => format!("{}://{}:{}/", parsed.scheme(), host, port),
        None => format!("{}://{}/", parsed.scheme(), host),
    })
}
/// Bridge a noxa-pdf `PdfResult` into the shared noxa-core extraction shape.
/// PDF subject maps to `description`; fields with no PDF equivalent are `None`.
fn pdf_to_extraction_result(
    pdf: &noxa_pdf::PdfResult,
    url: &str,
) -> noxa_core::ExtractionResult {
    let markdown = noxa_pdf::to_markdown(pdf);
    // Word count is computed from the rendered markdown, before it moves
    // into the Content struct below.
    let metadata = noxa_core::Metadata {
        title: pdf.metadata.title.clone(),
        description: pdf.metadata.subject.clone(),
        author: pdf.metadata.author.clone(),
        published_date: None,
        language: None,
        url: Some(url.to_string()),
        site_name: None,
        image: None,
        favicon: None,
        word_count: markdown.split_whitespace().count(),
    };
    noxa_core::ExtractionResult {
        metadata,
        content: noxa_core::Content {
            plain_text: pdf.text.clone(),
            markdown,
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        },
        domain_data: None,
        structured_data: Vec::new(),
    }
}
/// Collect spawned tasks and reorder results to match input order.
async fn collect_ordered<T>(
handles: Vec<tokio::task::JoinHandle<(usize, T)>>,
len: usize,
) -> Vec<T> {
let mut slots: Vec<Option<T>> = (0..len).map(|_| None).collect();
for handle in handles {
match handle.await {
Ok((idx, result)) => {
slots[idx] = Some(result);
}
Err(e) => {
warn!(error = %e, "batch task panicked");
}
}
}
slots.into_iter().flatten().collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    // BatchResult is a plain data carrier; verify both Ok and Err shapes.
    #[test]
    fn test_batch_result_struct() {
        let ok = BatchResult {
            url: "https://example.com".to_string(),
            result: Ok(FetchResult {
                html: "<html></html>".to_string(),
                status: 200,
                url: "https://example.com".to_string(),
                headers: http::HeaderMap::new(),
                elapsed: Duration::from_millis(42),
            }),
        };
        assert_eq!(ok.url, "https://example.com");
        assert!(ok.result.is_ok());
        assert_eq!(ok.result.unwrap().status, 200);
        let err = BatchResult {
            url: "https://bad.example".to_string(),
            result: Err(FetchError::InvalidUrl("bad url".into())),
        };
        assert!(err.result.is_err());
    }
    // Same shape check for the extract variant.
    #[test]
    fn test_batch_extract_result_struct() {
        let err = BatchExtractResult {
            url: "https://example.com".to_string(),
            result: Err(FetchError::BodyDecode("timeout".into())),
        };
        assert_eq!(err.url, "https://example.com");
        assert!(err.result.is_err());
    }
    // collect_ordered must reorder out-of-order task completions by index.
    #[tokio::test]
    async fn test_batch_preserves_order() {
        let handles: Vec<tokio::task::JoinHandle<(usize, String)>> = vec![
            tokio::spawn(async { (2, "c".to_string()) }),
            tokio::spawn(async { (0, "a".to_string()) }),
            tokio::spawn(async { (1, "b".to_string()) }),
        ];
        let results = collect_ordered(handles, 3).await;
        assert_eq!(results, vec!["a", "b", "c"]);
    }
    // A missing index (e.g. a panicked task) leaves a gap that is dropped,
    // while surviving results keep their relative order.
    #[tokio::test]
    async fn test_collect_ordered_handles_gaps() {
        let handles: Vec<tokio::task::JoinHandle<(usize, String)>> = vec![
            tokio::spawn(async { (0, "first".to_string()) }),
            tokio::spawn(async { (2, "third".to_string()) }),
        ];
        let results = collect_ordered(handles, 3).await;
        assert_eq!(results.len(), 2);
        assert_eq!(results[0], "first");
        assert_eq!(results[1], "third");
    }
    // PDF detection must ignore MIME parameters and be case-insensitive.
    #[test]
    fn test_is_pdf_content_type() {
        let mut headers = http::HeaderMap::new();
        headers.insert("content-type", "application/pdf".parse().unwrap());
        assert!(is_pdf_content_type(&headers));
        headers.insert(
            "content-type",
            "application/pdf; charset=utf-8".parse().unwrap(),
        );
        assert!(is_pdf_content_type(&headers));
        headers.insert("content-type", "Application/PDF".parse().unwrap());
        assert!(is_pdf_content_type(&headers));
        headers.insert("content-type", "text/html".parse().unwrap());
        assert!(!is_pdf_content_type(&headers));
        let empty = http::HeaderMap::new();
        assert!(!is_pdf_content_type(&empty));
    }
    // PDF metadata should map onto the core extraction shape
    // (subject -> description, markdown gets a title heading).
    #[test]
    fn test_pdf_to_extraction_result() {
        let pdf = noxa_pdf::PdfResult {
            text: "Hello from PDF.".into(),
            page_count: 2,
            metadata: noxa_pdf::PdfMetadata {
                title: Some("My Doc".into()),
                author: Some("Author".into()),
                subject: Some("Testing".into()),
                creator: None,
            },
        };
        let result = pdf_to_extraction_result(&pdf, "https://example.com/doc.pdf");
        assert_eq!(result.metadata.title.as_deref(), Some("My Doc"));
        assert_eq!(result.metadata.author.as_deref(), Some("Author"));
        assert_eq!(result.metadata.description.as_deref(), Some("Testing"));
        assert_eq!(
            result.metadata.url.as_deref(),
            Some("https://example.com/doc.pdf")
        );
        assert!(result.content.markdown.contains("# My Doc"));
        assert!(result.content.markdown.contains("Hello from PDF."));
        assert_eq!(result.content.plain_text, "Hello from PDF.");
        assert!(result.content.links.is_empty());
        assert!(result.domain_data.is_none());
        assert!(result.metadata.word_count > 0);
    }
    // Default config (no proxy_pool) must select the static pool.
    #[test]
    fn test_static_pool_no_proxy() {
        let config = FetchConfig::default();
        let client = FetchClient::new(config).unwrap();
        assert_eq!(client.proxy_pool_size(), 0);
    }
    // A non-empty proxy_pool must pre-build exactly one client per proxy.
    #[test]
    fn test_rotating_pool_prebuilds_clients() {
        let config = FetchConfig {
            proxy_pool: vec![
                "http://proxy1:8080".into(),
                "http://proxy2:8080".into(),
                "http://proxy3:8080".into(),
            ],
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();
        assert_eq!(client.proxy_pool_size(), 3);
    }
    // Host-based picking must be stable: same host, same client pointer.
    #[test]
    fn test_pick_for_host_deterministic() {
        let config = FetchConfig {
            browser: BrowserProfile::Random,
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();
        let clients = match &client.pool {
            ClientPool::Static { clients, .. } => clients,
            ClientPool::Rotating { clients } => clients,
        };
        let a1 = pick_for_host(clients, "example.com") as *const _;
        let a2 = pick_for_host(clients, "example.com") as *const _;
        let a3 = pick_for_host(clients, "example.com") as *const _;
        assert_eq!(a1, a2);
        assert_eq!(a2, a3);
    }
    // With 10 clients and 5 distinct hosts, at least two different pool
    // slots should be hit (guards against a constant hash).
    #[test]
    fn test_pick_for_host_distributes() {
        let config = FetchConfig {
            proxy_pool: (0..10).map(|i| format!("http://proxy{i}:8080")).collect(),
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();
        let clients = match &client.pool {
            ClientPool::Static { clients, .. } | ClientPool::Rotating { clients } => clients,
        };
        let hosts = [
            "example.com",
            "google.com",
            "github.com",
            "rust-lang.org",
            "crates.io",
        ];
        let indices: Vec<usize> = hosts
            .iter()
            .map(|h| {
                let ptr = pick_for_host(clients, h) as *const _;
                clients.iter().position(|c| std::ptr::eq(c, ptr)).unwrap()
            })
            .collect();
        let unique: std::collections::HashSet<_> = indices.iter().collect();
        assert!(
            unique.len() >= 2,
            "expected host distribution across clients, got indices: {indices:?}"
        );
    }
    // Host extraction: ports are stripped, unparseable input yields "".
    #[test]
    fn test_extract_host() {
        assert_eq!(extract_host("https://example.com/path"), "example.com");
        assert_eq!(
            extract_host("https://sub.example.com:8080/foo"),
            "sub.example.com"
        );
        assert_eq!(extract_host("not-a-url"), "");
    }
    // Defaults must not configure any proxying.
    #[test]
    fn test_default_config_has_empty_proxy_pool() {
        let config = FetchConfig::default();
        assert!(config.proxy_pool.is_empty());
        assert!(config.proxy.is_none());
    }
}

View file

@ -0,0 +1,648 @@
//! Recursive same-origin web crawler built on top of [`FetchClient`].
//!
//! Starts from a seed URL, extracts content, discovers links, and follows
//! them breadth-first up to a configurable depth/page limit. Uses a semaphore
//! for bounded concurrency and per-request delays for politeness.
//!
//! When `use_sitemap` is enabled, the crawler first discovers URLs from the
//! site's sitemaps and seeds the BFS frontier before crawling.
use std::collections::HashSet;
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::{Duration, Instant};
use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tracing::{debug, info, warn};
use url::Url;
use crate::client::{FetchClient, FetchConfig};
use crate::error::FetchError;
use crate::sitemap;
/// Controls crawl scope, depth, concurrency, and politeness.
#[derive(Debug, Clone)]
pub struct CrawlConfig {
    /// Fetch configuration (browser profile, proxy, timeout, etc.)
    pub fetch: FetchConfig,
    /// How deep to follow links. 1 = only immediate links from seed page.
    pub max_depth: usize,
    /// Hard cap on total pages fetched (including the seed).
    pub max_pages: usize,
    /// Max concurrent in-flight requests.
    pub concurrency: usize,
    /// Minimum delay before each request (politeness).
    /// Applied per task, just before its fetch.
    pub delay: Duration,
    /// Only follow URLs whose path starts with this prefix (e.g. "/docs/").
    pub path_prefix: Option<String>,
    /// Seed BFS frontier from sitemap discovery before crawling.
    pub use_sitemap: bool,
    /// Glob patterns for paths to include. If non-empty, only matching URLs are crawled.
    /// E.g. `["/api/*", "/guides/*"]` — matched against the URL path.
    pub include_patterns: Vec<String>,
    /// Glob patterns for paths to exclude. Checked after include_patterns.
    /// E.g. `["/changelog/*", "/blog/*"]` — matching URLs are skipped.
    pub exclude_patterns: Vec<String>,
    /// Optional channel sender for streaming per-page results as they complete.
    /// When set, each `PageResult` is sent on this channel immediately after extraction.
    /// Send failures (no receivers) are ignored.
    pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>,
    /// When set to `true`, the crawler breaks out of the main loop early.
    /// Callers (e.g. a Ctrl+C handler) can flip this to request graceful cancellation.
    pub cancel_flag: Option<Arc<AtomicBool>>,
}
impl Default for CrawlConfig {
    /// Conservative defaults: depth 1, 50 pages max, 5 concurrent requests,
    /// 100 ms politeness delay, no path filtering, no sitemap seeding,
    /// no progress streaming, no cancellation hook.
    fn default() -> Self {
        Self {
            fetch: FetchConfig::default(),
            max_depth: 1,
            max_pages: 50,
            concurrency: 5,
            delay: Duration::from_millis(100),
            path_prefix: None,
            use_sitemap: false,
            include_patterns: Vec::new(),
            exclude_patterns: Vec::new(),
            progress_tx: None,
            cancel_flag: None,
        }
    }
}
/// Aggregated results from a crawl run.
#[derive(Debug, Serialize, Deserialize)]
pub struct CrawlResult {
    /// Per-page outcomes, collected in BFS batch order.
    pub pages: Vec<PageResult>,
    /// Total pages attempted (equals `pages.len()`).
    pub total: usize,
    /// Pages whose extraction succeeded.
    pub ok: usize,
    /// Pages that failed (`total - ok`).
    pub errors: usize,
    /// Wall-clock duration of the whole crawl, in seconds.
    pub elapsed_secs: f64,
    /// URLs visited during this crawl (for resume state).
    #[serde(skip)]
    pub visited: HashSet<String>,
    /// Remaining frontier when crawl was cancelled (for resume state).
    #[serde(skip)]
    pub remaining_frontier: Vec<(String, usize)>,
}
/// Outcome of extracting a single page during the crawl.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageResult {
    /// Normalized URL that was fetched.
    pub url: String,
    /// BFS depth at which this page was discovered (seed and sitemap URLs = 0).
    pub depth: usize,
    /// Extraction output on success; `None` when `error` is set.
    pub extraction: Option<noxa_core::ExtractionResult>,
    /// Failure message on error; `None` on success.
    pub error: Option<String>,
    /// Wall-clock fetch+extract time. Not serialized (defaults to zero on load).
    #[serde(skip)]
    pub elapsed: Duration,
}
/// Serializable crawl state for resume after Ctrl+C cancellation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlState {
    /// URL the crawl originally started from.
    pub seed_url: String,
    /// URLs already fetched (restored into the visited set on resume).
    pub visited: Vec<String>,
    /// Pending `(url, depth)` pairs to continue from.
    pub frontier: Vec<(String, usize)>,
    /// Pages completed before the state was saved.
    pub completed_pages: usize,
    /// Original page cap, kept so a resumed crawl honors the same limit.
    pub max_pages: usize,
    /// Original depth cap.
    pub max_depth: usize,
}
/// Recursive crawler that wraps a shared [`FetchClient`].
pub struct Crawler {
    // Shared fetch client; Arc so per-page tasks can hold clones.
    client: Arc<FetchClient>,
    // Scope/politeness settings for this crawl.
    config: CrawlConfig,
    // Origin key of the seed URL (via `origin_key`); presumably used to keep
    // the crawl same-origin per the module docs — confirm in `qualify_link`.
    seed_origin: String,
}
impl Crawler {
/// Build a new crawler from a seed URL and config.
/// Constructs the underlying `FetchClient` from `config.fetch`.
pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> {
let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?;
let seed_origin = origin_key(&seed);
let client = FetchClient::new(config.fetch.clone())?;
Ok(Self {
client: Arc::new(client),
config,
seed_origin,
})
}
/// Save current crawl state to a JSON file for later resume.
pub fn save_state(
path: &Path,
seed_url: &str,
visited: &HashSet<String>,
frontier: &[(String, usize)],
completed_pages: usize,
max_pages: usize,
max_depth: usize,
) -> Result<(), String> {
let state = CrawlState {
seed_url: seed_url.to_string(),
visited: visited.iter().cloned().collect(),
frontier: frontier.to_vec(),
completed_pages,
max_pages,
max_depth,
};
let json =
serde_json::to_string_pretty(&state).map_err(|e| format!("serialize state: {e}"))?;
std::fs::write(path, json).map_err(|e| format!("write state to {}: {e}", path.display()))
}
/// Load crawl state from a JSON file. Returns `None` if file doesn't exist.
pub fn load_state(path: &Path) -> Option<CrawlState> {
let content = std::fs::read_to_string(path).ok()?;
serde_json::from_str(&content).ok()
}
/// Returns true if the cancel flag has been set.
fn is_cancelled(&self) -> bool {
self.config
.cancel_flag
.as_ref()
.is_some_and(|f| f.load(Ordering::Relaxed))
}
/// Crawl starting from `start_url`, returning results for every page visited.
///
/// Uses breadth-first traversal: all pages at depth N are fetched (concurrently,
/// bounded by `config.concurrency`) before moving to depth N+1.
///
/// When `config.use_sitemap` is true, sitemap URLs are discovered first and
/// added to the initial frontier at depth 0 alongside the seed URL.
///
/// If `resume_state` is provided, the crawl resumes from the saved state
/// (pre-populated visited set and frontier) instead of starting fresh.
pub async fn crawl(&self, start_url: &str, resume_state: Option<CrawlState>) -> CrawlResult {
let start = Instant::now();
let seed = match Url::parse(start_url) {
Ok(u) => u,
Err(_) => {
return CrawlResult {
pages: vec![PageResult {
url: start_url.to_string(),
depth: 0,
extraction: None,
error: Some(format!("invalid URL: {start_url}")),
elapsed: Duration::ZERO,
}],
total: 1,
ok: 0,
errors: 1,
elapsed_secs: 0.0,
visited: HashSet::new(),
remaining_frontier: Vec::new(),
};
}
};
let semaphore = Arc::new(Semaphore::new(self.config.concurrency));
let mut visited: HashSet<String>;
let mut pages: Vec<PageResult> = Vec::new();
let mut frontier: Vec<(String, usize)>;
// Resume from saved state or start fresh
if let Some(state) = resume_state {
visited = state.visited.into_iter().collect();
frontier = state.frontier;
info!(
visited = visited.len(),
frontier = frontier.len(),
"resuming crawl from saved state"
);
} else {
visited = HashSet::new();
frontier = vec![(normalize(&seed), 0)];
// Seed frontier from sitemap if enabled
if self.config.use_sitemap {
let base_url = format!("{}://{}", seed.scheme(), seed.host_str().unwrap_or(""));
match sitemap::discover(&self.client, &base_url).await {
Ok(entries) => {
let before = frontier.len();
for entry in entries {
if self.qualify_link(&entry.url, &visited).is_some() {
let parsed = match Url::parse(&entry.url) {
Ok(u) => u,
Err(_) => continue,
};
let norm = normalize(&parsed);
frontier.push((norm, 0));
}
}
let added = frontier.len() - before;
info!(
sitemap_urls = added,
"seeded frontier from sitemap discovery"
);
}
Err(e) => {
warn!(error = %e, "sitemap discovery failed, continuing with seed URL only");
}
}
}
}
while !frontier.is_empty() && pages.len() < self.config.max_pages {
// Check cancel flag before processing each batch
if self.is_cancelled() {
info!("crawl cancelled by user");
break;
}
// Dedup this level's frontier against the visited set and page cap
let batch: Vec<(String, usize)> = frontier
.drain(..)
.filter(|(url, _)| visited.insert(url.clone()))
.take(self.config.max_pages.saturating_sub(pages.len()))
.collect();
if batch.is_empty() {
break;
}
// Spawn one task per URL, bounded by semaphore
let mut handles = Vec::with_capacity(batch.len());
for (url, depth) in &batch {
let permit = Arc::clone(&semaphore);
let client = Arc::clone(&self.client);
let url = url.clone();
let depth = *depth;
let delay = self.config.delay;
handles.push(tokio::spawn(async move {
// Acquire permit — blocks if concurrency limit reached
let _permit = permit.acquire().await.expect("semaphore closed");
tokio::time::sleep(delay).await;
let page_start = Instant::now();
let result = client.fetch_and_extract(&url).await;
let elapsed = page_start.elapsed();
match result {
Ok(extraction) => {
debug!(
url = %url, depth,
elapsed_ms = %elapsed.as_millis(),
"page extracted"
);
PageResult {
url,
depth,
extraction: Some(extraction),
error: None,
elapsed,
}
}
Err(e) => {
warn!(url = %url, depth, error = %e, "page failed");
PageResult {
url,
depth,
extraction: None,
error: Some(e.to_string()),
elapsed,
}
}
}
}));
}
// Collect results and harvest links for the next depth level
let mut next_frontier: Vec<(String, usize)> = Vec::new();
for handle in handles {
let page = match handle.await {
Ok(page) => page,
Err(e) => {
warn!(error = %e, "crawl task panicked");
continue;
}
};
let depth = page.depth;
if depth < self.config.max_depth
&& let Some(ref extraction) = page.extraction
{
for link in &extraction.content.links {
if let Some(candidate) = self.qualify_link(&link.href, &visited) {
next_frontier.push((candidate, depth + 1));
}
}
}
// Stream progress if a channel is configured
if let Some(tx) = &self.config.progress_tx {
let _ = tx.send(page.clone());
}
pages.push(page);
if pages.len() >= self.config.max_pages {
break;
}
// Check cancel flag between page results
if self.is_cancelled() {
info!("crawl cancelled by user (mid-batch)");
break;
}
}
frontier = next_frontier;
}
let total_elapsed = start.elapsed();
let ok_count = pages.iter().filter(|p| p.extraction.is_some()).count();
let err_count = pages.len() - ok_count;
info!(
total = pages.len(),
ok = ok_count,
errors = err_count,
elapsed_ms = %total_elapsed.as_millis(),
"crawl complete"
);
CrawlResult {
total: pages.len(),
ok: ok_count,
errors: err_count,
elapsed_secs: total_elapsed.as_secs_f64(),
remaining_frontier: frontier,
visited,
pages,
}
}
/// Check if a discovered link should be added to the frontier.
/// Returns `Some(normalized_url)` if it passes all filters, `None` otherwise.
fn qualify_link(&self, href: &str, visited: &HashSet<String>) -> Option<String> {
let parsed = Url::parse(href).ok()?;
// Only http(s) schemes
match parsed.scheme() {
"http" | "https" => {}
_ => return None,
}
// Same-origin check (scheme + host + port)
if origin_key(&parsed) != self.seed_origin {
return None;
}
// Path prefix filter
if let Some(ref prefix) = self.config.path_prefix
&& !parsed.path().starts_with(prefix.as_str())
{
return None;
}
// Include patterns: if any are set, path must match at least one
let path = parsed.path();
if !self.config.include_patterns.is_empty()
&& !self
.config
.include_patterns
.iter()
.any(|pat| glob_match(pat, path))
{
return None;
}
// Exclude patterns: if path matches any, skip
if self
.config
.exclude_patterns
.iter()
.any(|pat| glob_match(pat, path))
{
return None;
}
// Skip common non-page file extensions
const SKIP_EXTENSIONS: &[&str] = &[
".pdf", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico", ".css", ".js",
".zip", ".tar", ".gz", ".xml", ".rss", ".mp3", ".mp4", ".avi", ".mov", ".woff",
".woff2", ".ttf", ".eot",
];
if SKIP_EXTENSIONS.iter().any(|ext| path.ends_with(ext)) {
return None;
}
let normalized = normalize(&parsed);
if visited.contains(&normalized) {
return None;
}
Some(normalized)
}
}
/// Canonical origin string for same-origin comparison: "scheme://host[:port]".
/// A leading "www." is stripped from the host, so `www.example.com` and
/// `example.com` compare as the same origin.
fn origin_key(url: &Url) -> String {
    let raw_host = url.host_str().unwrap_or("");
    let host = raw_host.strip_prefix("www.").unwrap_or(raw_host);
    match url.port() {
        Some(port) => format!("{}://{}:{}", url.scheme(), host, port),
        None => format!("{}://{}", url.scheme(), host),
    }
}
/// Normalize a URL for dedup: strip the fragment, remove one trailing slash
/// (except for the root path "/"), lowercase the host. Query params and path
/// case are preserved.
fn normalize(url: &Url) -> String {
    let host = url.host_str().unwrap_or("").to_ascii_lowercase();
    let mut out = format!("{}://{host}", url.scheme());
    if let Some(port) = url.port() {
        out.push_str(&format!(":{port}"));
    }
    let path = url.path();
    // Trim exactly one trailing slash, but never reduce "/" to "".
    let path = if path.len() > 1 {
        path.strip_suffix('/').unwrap_or(path)
    } else {
        path
    };
    out.push_str(path);
    if let Some(query) = url.query() {
        out.push('?');
        out.push_str(query);
    }
    // The fragment is deliberately dropped.
    out
}
/// Simple glob matching for URL paths. Supports:
/// - `*` matches any characters within a single path segment (no `/`)
/// - `**` matches any characters including `/` (any number of segments)
/// - `?` matches any single character (including `/`)
/// - Literal characters match exactly
///
/// Examples:
/// - `/api/*` matches `/api/users` but not `/api/users/123`
/// - `/api/**` matches `/api/users`, `/api/users/123`, `/api/a/b/c`
/// - `/docs/*/intro` matches `/docs/v2/intro`
fn glob_match(pattern: &str, path: &str) -> bool {
    // Matching is done on raw bytes; URL paths are ASCII-safe for the
    // metacharacters involved.
    glob_match_inner(pattern.as_bytes(), path.as_bytes())
}
/// Byte-level glob matcher with single-state backtracking.
///
/// `star_pi`/`star_ti` remember the most recent `*` so that a failed literal
/// comparison can backtrack and let that `*` absorb one more byte (never a
/// `/`). `**` is handled by recursing on every remaining suffix of `text`.
fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
    let mut pi = 0; // current index into pat
    let mut ti = 0; // current index into text
    let mut star_pi = usize::MAX; // pat index of the last `*` seen (MAX = none)
    let mut star_ti = 0; // text index where that `*` began matching
    while ti < text.len() {
        if pi < pat.len() && pat[pi] == b'*' && pi + 1 < pat.len() && pat[pi + 1] == b'*' {
            // `**` — match everything including slashes
            // Skip all consecutive `*`
            while pi < pat.len() && pat[pi] == b'*' {
                pi += 1;
            }
            // Skip trailing `/` after `**`
            if pi < pat.len() && pat[pi] == b'/' {
                pi += 1;
            }
            if pi >= pat.len() {
                return true; // `**` at end matches everything
            }
            // Try matching the rest of pattern against every suffix of text
            for start in ti..=text.len() {
                if glob_match_inner(&pat[pi..], &text[start..]) {
                    return true;
                }
            }
            return false;
        } else if pi < pat.len() && pat[pi] == b'*' {
            // `*` — match any chars except `/`
            star_pi = pi;
            star_ti = ti;
            pi += 1;
        } else if pi < pat.len() && (pat[pi] == text[ti] || pat[pi] == b'?') {
            // Literal byte match, or `?` which matches any single byte.
            pi += 1;
            ti += 1;
        } else if star_pi != usize::MAX {
            // Backtrack: `*` absorbs one more char (but not `/`)
            if text[star_ti] == b'/' {
                return false;
            }
            star_ti += 1;
            ti = star_ti;
            pi = star_pi + 1;
        } else {
            return false;
        }
    }
    // Consume trailing `*` or `**` in pattern
    while pi < pat.len() && pat[pi] == b'*' {
        pi += 1;
    }
    pi >= pat.len()
}
// Unit tests: URL normalization, origin comparison, and path glob matching.
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn normalize_strips_fragment() {
        let url = Url::parse("https://example.com/page#section").unwrap();
        assert_eq!(normalize(&url), "https://example.com/page");
    }
    #[test]
    fn normalize_strips_trailing_slash() {
        let url = Url::parse("https://example.com/docs/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/docs");
    }
    #[test]
    fn normalize_keeps_root_slash() {
        let url = Url::parse("https://example.com/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/");
    }
    #[test]
    fn normalize_preserves_query() {
        let url = Url::parse("https://example.com/search?q=rust&page=2").unwrap();
        assert_eq!(normalize(&url), "https://example.com/search?q=rust&page=2");
    }
    #[test]
    fn normalize_lowercases_host() {
        let url = Url::parse("https://Example.COM/Path").unwrap();
        assert_eq!(normalize(&url), "https://example.com/Path");
    }
    #[test]
    fn origin_includes_explicit_port() {
        let url = Url::parse("https://example.com:8443/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com:8443");
    }
    #[test]
    fn origin_omits_default_port() {
        let url = Url::parse("https://example.com/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com");
    }
    #[test]
    fn different_schemes_are_different_origins() {
        let http = Url::parse("http://example.com/").unwrap();
        let https = Url::parse("https://example.com/").unwrap();
        assert_ne!(origin_key(&http), origin_key(&https));
    }
    // -- glob_match tests --
    #[test]
    fn glob_star_matches_single_segment() {
        assert!(glob_match("/api/*", "/api/users"));
        assert!(glob_match("/api/*", "/api/products"));
        assert!(!glob_match("/api/*", "/api/users/123"));
    }
    #[test]
    fn glob_doublestar_matches_multiple_segments() {
        assert!(glob_match("/api/**", "/api/users"));
        assert!(glob_match("/api/**", "/api/users/123"));
        assert!(glob_match("/api/**", "/api/a/b/c/d"));
        assert!(!glob_match("/api/**", "/docs/intro"));
    }
    #[test]
    fn glob_exact_match() {
        assert!(glob_match("/about", "/about"));
        assert!(!glob_match("/about", "/about/team"));
    }
    #[test]
    fn glob_middle_wildcard() {
        assert!(glob_match("/docs/*/intro", "/docs/v2/intro"));
        assert!(!glob_match("/docs/*/intro", "/docs/v2/v3/intro"));
    }
    #[test]
    fn glob_no_pattern_matches_nothing() {
        // Empty pattern only matches empty string
        assert!(glob_match("", ""));
        assert!(!glob_match("", "/foo"));
    }
    #[test]
    fn glob_trailing_star() {
        assert!(glob_match("/blog*", "/blog"));
        assert!(glob_match("/blog*", "/blog-post"));
        assert!(!glob_match("/blog*", "/blog/post")); // * doesn't cross /
    }
}

View file

@ -0,0 +1,745 @@
//! Document extraction for DOCX, XLSX, XLS, and CSV files.
//! Auto-detects document type from Content-Type headers or URL extension,
//! then extracts text content as markdown — same pattern as PDF extraction.
use std::io::{Cursor, Read};
use tracing::debug;
use crate::error::FetchError;
/// Document formats this module can extract, detected from the Content-Type
/// header or the URL's file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocType {
    /// Word document (OOXML: a ZIP archive containing `word/document.xml`).
    Docx,
    /// Excel workbook (OOXML), read via calamine.
    Xlsx,
    /// Legacy Excel workbook, read via calamine.
    Xls,
    /// Plain-text comma-separated values.
    Csv,
}
impl DocType {
    /// Short uppercase label used in log and error messages.
    fn label(self) -> &'static str {
        match self {
            DocType::Docx => "DOCX",
            DocType::Xlsx => "XLSX",
            DocType::Xls => "XLS",
            DocType::Csv => "CSV",
        }
    }
}
impl std::fmt::Display for DocType {
    // Display mirrors the label ("DOCX", "XLSX", ...).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.label())
    }
}
/// Detect document type from response headers or URL extension.
/// Returns `None` for non-document responses (HTML, PDF, etc.).
///
/// The Content-Type header wins when it names a known document MIME type;
/// otherwise the URL's extension (query string stripped, case-insensitive)
/// is consulted.
pub fn is_document_content_type(headers: &http::HeaderMap, url: &str) -> Option<DocType> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    const XLSX_MIME: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    // 1) Trust the Content-Type header when it matches a known MIME type.
    if let Some(ct) = headers.get("content-type").and_then(|v| v.to_str().ok()) {
        let mime = ct.split(';').next().unwrap_or("").trim();
        if mime.eq_ignore_ascii_case(DOCX_MIME) {
            return Some(DocType::Docx);
        }
        if mime.eq_ignore_ascii_case(XLSX_MIME) {
            return Some(DocType::Xlsx);
        }
        if mime.eq_ignore_ascii_case("application/vnd.ms-excel") {
            return Some(DocType::Xls);
        }
        if mime.eq_ignore_ascii_case("text/csv") {
            return Some(DocType::Csv);
        }
    }
    // 2) Fall back to the URL's extension, ignoring any query string.
    let path = url.split('?').next().unwrap_or(url).to_ascii_lowercase();
    let by_extension = [
        (".docx", DocType::Docx),
        (".xlsx", DocType::Xlsx),
        (".xls", DocType::Xls),
        (".csv", DocType::Csv),
    ];
    by_extension
        .into_iter()
        .find_map(|(ext, ty)| path.ends_with(ext).then_some(ty))
}
/// Extract text content from document bytes, returning an ExtractionResult.
///
/// Dispatches on `doc_type` to the matching format parser, then derives a
/// plain-text view and a word count from the generated markdown.
///
/// # Errors
/// Propagates the format parser's `FetchError` (bad archive, XML parse
/// failure, unreadable bytes).
pub fn extract_document(
    bytes: &[u8],
    doc_type: DocType,
) -> Result<noxa_core::ExtractionResult, FetchError> {
    debug!(
        doc_type = doc_type.label(),
        bytes = bytes.len(),
        "extracting document"
    );
    let markdown = match doc_type {
        DocType::Docx => extract_docx(bytes)?,
        DocType::Xlsx => extract_xlsx(bytes)?,
        DocType::Xls => extract_xls(bytes)?,
        DocType::Csv => extract_csv(bytes)?,
    };
    // Word count is computed on the de-formatted text, not the raw markdown.
    let plain_text = strip_markdown_formatting(&markdown);
    let word_count = plain_text.split_whitespace().count();
    Ok(noxa_core::ExtractionResult {
        metadata: noxa_core::Metadata {
            // Documents carry no HTML-style metadata; only word_count is set.
            title: None,
            description: None,
            author: None,
            published_date: None,
            language: None,
            url: None,
            site_name: None,
            image: None,
            favicon: None,
            word_count,
        },
        content: noxa_core::Content {
            markdown,
            plain_text,
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
/// Extract text from a DOCX file (ZIP of XML).
/// Reads `word/document.xml`, extracts `<w:t>` text nodes, detects heading styles.
fn extract_docx(bytes: &[u8]) -> Result<String, FetchError> {
    let mut archive = zip::ZipArchive::new(Cursor::new(bytes))
        .map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?;
    // The document body lives in a single well-known archive entry.
    let mut xml = String::new();
    archive
        .by_name("word/document.xml")
        .map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?
        .read_to_string(&mut xml)
        .map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?;
    parse_docx_xml(&xml)
}
/// Parse DOCX XML (word/document.xml) into markdown.
///
/// Walks the XML looking for paragraph elements (`<w:p>`). Within each paragraph,
/// collects text from `<w:t>` tags and detects heading styles from `<w:pStyle>`.
/// `<w:br>` becomes a newline and `<w:tab>` a tab. Empty paragraphs are
/// dropped; headings are rendered as `#`…`######` lines.
/// NOTE(review): only the `p` tag is namespace-checked via `is_w_namespace`;
/// other tags match on local name alone.
fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
    use quick_xml::Reader;
    use quick_xml::events::Event;
    let mut reader = Reader::from_str(xml);
    let mut paragraphs: Vec<String> = Vec::new();
    // State tracking for the current paragraph
    let mut in_paragraph = false;
    let mut in_run = false; // inside <w:r> (run)
    let mut in_text = false; // inside <w:t>
    let mut current_text = String::new();
    let mut heading_level: Option<u8> = None; // None = normal paragraph
    let mut in_ppr = false; // inside <w:pPr> (paragraph properties)
    loop {
        match reader.read_event() {
            // Empty elements (<w:br/>, <w:pStyle .../>) are handled like Start.
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
                let name_bytes = e.name().as_ref().to_vec();
                let local = local_name(&name_bytes);
                match local {
                    b"p" if is_w_namespace(&name_bytes) => {
                        // New paragraph: reset accumulated text and style.
                        in_paragraph = true;
                        current_text.clear();
                        heading_level = None;
                    }
                    b"pPr" if in_paragraph => in_ppr = true,
                    b"pStyle" if in_ppr => {
                        heading_level = extract_heading_level(e);
                    }
                    b"r" if in_paragraph => in_run = true,
                    b"t" if in_run => in_text = true,
                    b"br" if in_paragraph => {
                        current_text.push('\n');
                    }
                    b"tab" if in_paragraph => {
                        current_text.push('\t');
                    }
                    _ => {}
                }
            }
            Ok(Event::End(ref e)) => {
                let name_bytes = e.name().as_ref().to_vec();
                let local = local_name(&name_bytes);
                match local {
                    b"p" if in_paragraph => {
                        // Paragraph finished: emit it (with heading prefix) if non-empty.
                        let text = current_text.trim().to_string();
                        if !text.is_empty() {
                            let formatted = match heading_level {
                                Some(1) => format!("# {text}"),
                                Some(2) => format!("## {text}"),
                                Some(3) => format!("### {text}"),
                                Some(4) => format!("#### {text}"),
                                Some(5) => format!("##### {text}"),
                                Some(6) => format!("###### {text}"),
                                _ => text,
                            };
                            paragraphs.push(formatted);
                        }
                        in_paragraph = false;
                    }
                    b"pPr" => in_ppr = false,
                    b"r" => {
                        in_run = false;
                        in_text = false;
                    }
                    b"t" => in_text = false,
                    _ => {}
                }
            }
            // Only text inside <w:t> counts; everything else is markup noise.
            Ok(Event::Text(ref e)) if in_text => {
                if let Ok(text) = e.unescape() {
                    current_text.push_str(&text);
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(FetchError::Build(format!("DOCX XML parse error: {e}")));
            }
            _ => {}
        }
    }
    Ok(paragraphs.join("\n\n"))
}
/// Check if a qualified name belongs to the `w:` (wordprocessingML) namespace.
/// Handles both `w:p` (prefixed) and just `p` (default namespace) forms;
/// any other prefix is rejected.
fn is_w_namespace(name: &[u8]) -> bool {
    // quick-xml hands us the raw qualified-name bytes.
    matches!(name, b"w:p" | b"p")
}
/// Extract the local name from a possibly namespaced XML tag.
/// `w:p` -> `p`, `p` -> `p` (split happens at the FIRST colon).
fn local_name(name: &[u8]) -> &[u8] {
    let mut parts = name.splitn(2, |&b| b == b':');
    let head = parts.next().unwrap_or(name);
    // With no colon there is no second part; the whole name is local.
    parts.next().unwrap_or(head)
}
/// Extract heading level from a `<w:pStyle w:val="Heading1"/>` element.
/// Recognizes "Title" (→ level 1) and "HeadingN" (clamped to 6), matched
/// case-insensitively. Returns `None` for any other style.
fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option<u8> {
    for attr in e.attributes().flatten() {
        // Only the (possibly namespaced) "val" attribute is interesting.
        if local_name(attr.key.as_ref()) != b"val" {
            continue;
        }
        let value = String::from_utf8_lossy(&attr.value).to_ascii_lowercase();
        if value == "title" {
            return Some(1);
        }
        if let Some(digits) = value.strip_prefix("heading")
            && let Ok(level) = digits.parse::<u8>()
        {
            return Some(level.min(6));
        }
    }
    None
}
/// Extract spreadsheet content using calamine (XLSX format).
/// Thin wrapper over `extract_spreadsheet` with an "XLSX" error label.
fn extract_xlsx(bytes: &[u8]) -> Result<String, FetchError> {
    extract_spreadsheet(bytes, "XLSX")
}
/// Extract spreadsheet content using calamine (XLS format).
/// Thin wrapper over `extract_spreadsheet` with an "XLS" error label.
fn extract_xls(bytes: &[u8]) -> Result<String, FetchError> {
    extract_spreadsheet(bytes, "XLS")
}
/// Shared spreadsheet extraction for both XLSX and XLS via calamine.
/// Every non-empty sheet becomes a "## Sheet: name" section containing a
/// markdown table; `label` is only used in error messages.
fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result<String, FetchError> {
    use calamine::Reader;
    let mut workbook: calamine::Sheets<_> =
        calamine::open_workbook_auto_from_rs(Cursor::new(bytes))
            .map_err(|e| FetchError::Build(format!("{label} open: {e}")))?;
    let names: Vec<String> = workbook.sheet_names().to_vec();
    let mut sections: Vec<String> = Vec::new();
    for name in &names {
        let range = workbook
            .worksheet_range(name)
            .map_err(|e| FetchError::Build(format!("{label} sheet '{name}': {e}")))?;
        // Stringify every cell up front so table rendering is uniform.
        let rows: Vec<Vec<String>> = range
            .rows()
            .map(|row| row.iter().map(cell_to_string).collect())
            .collect();
        if !rows.is_empty() {
            let mut section = format!("## Sheet: {name}\n\n");
            section.push_str(&rows_to_markdown_table(&rows));
            sections.push(section);
        }
    }
    if sections.is_empty() {
        Ok("(empty spreadsheet)".to_string())
    } else {
        Ok(sections.join("\n\n"))
    }
}
/// Convert a calamine cell value to a display string.
fn cell_to_string(cell: &calamine::Data) -> String {
    use calamine::Data;
    match cell {
        Data::Empty => String::new(),
        Data::String(s) => s.clone(),
        Data::Int(n) => n.to_string(),
        // Floats go through format_float so whole numbers drop the ".0".
        Data::Float(f) => format_float(*f),
        Data::Bool(b) => b.to_string(),
        // Error cells render via their Debug representation, prefixed with '#'.
        Data::Error(e) => format!("#{e:?}"),
        // Date/time values use calamine's Display/ISO string forms as-is.
        Data::DateTime(dt) => format!("{dt}"),
        Data::DateTimeIso(s) => s.clone(),
        Data::DurationIso(s) => s.clone(),
    }
}
/// Format a float, dropping the trailing `.0` when the value is a whole
/// number that fits in an i64; everything else uses the default Display form.
fn format_float(f: f64) -> String {
    let is_whole = f.fract() == 0.0 && f.abs() < i64::MAX as f64;
    if is_whole {
        (f as i64).to_string()
    } else {
        f.to_string()
    }
}
/// Decode CSV bytes (lossy UTF-8) and render them as a markdown table.
fn extract_csv(bytes: &[u8]) -> Result<String, FetchError> {
    let text = String::from_utf8_lossy(bytes);
    let rows = parse_csv_rows(&text);
    if rows.is_empty() {
        Ok("(empty CSV)".to_string())
    } else {
        Ok(rows_to_markdown_table(&rows))
    }
}
/// Parse CSV text into rows of fields, handling quoted fields that contain
/// commas and newlines, and `""` as an escaped quote. Fields are trimmed;
/// rows whose fields are all empty are dropped.
fn parse_csv_rows(text: &str) -> Vec<Vec<String>> {
    let mut rows: Vec<Vec<String>> = Vec::new();
    let mut row: Vec<String> = Vec::new();
    let mut field = String::new();
    let mut quoted = false;
    let mut iter = text.chars().peekable();
    while let Some(ch) = iter.next() {
        match (quoted, ch) {
            (true, '"') => {
                // `""` inside quotes is an escaped quote; a lone `"` closes the field.
                if iter.next_if_eq(&'"').is_some() {
                    field.push('"');
                } else {
                    quoted = false;
                }
            }
            (true, other) => field.push(other),
            (false, '"') => quoted = true,
            (false, ',') => {
                row.push(std::mem::take(&mut field).trim().to_string());
            }
            (false, '\n') => {
                row.push(std::mem::take(&mut field).trim().to_string());
                let finished = std::mem::take(&mut row);
                // Drop rows that are entirely empty.
                if finished.iter().any(|f| !f.is_empty()) {
                    rows.push(finished);
                }
            }
            // Carriage returns are skipped; the following \n ends the row.
            (false, '\r') => {}
            (false, other) => field.push(other),
        }
    }
    // Flush a trailing field/row that was not newline-terminated.
    if !field.is_empty() || !row.is_empty() {
        row.push(field.trim().to_string());
        if row.iter().any(|f| !f.is_empty()) {
            rows.push(row);
        }
    }
    rows
}
/// Convert rows (first row = header) into a markdown table.
/// Short rows are right-padded with empty cells up to the widest row.
fn rows_to_markdown_table(rows: &[Vec<String>]) -> String {
    let width = rows.iter().map(Vec::len).max().unwrap_or(0);
    if rows.is_empty() || width == 0 {
        return String::new();
    }
    // Render one row, padding missing cells with "".
    let render = |row: &Vec<String>| {
        let cells: Vec<&str> = (0..width)
            .map(|i| row.get(i).map_or("", String::as_str))
            .collect();
        format!("| {} |", cells.join(" | "))
    };
    let separator = format!("| {} |", vec!["---"; width].join(" | "));
    let mut lines = vec![render(&rows[0]), separator];
    lines.extend(rows[1..].iter().map(render));
    lines.join("\n")
}
/// Strip markdown formatting to get plain text: heading hashes are removed,
/// table separator rows are dropped, and table rows collapse to their cell
/// text joined by single spaces.
fn strip_markdown_formatting(markdown: &str) -> String {
    let mut out = String::with_capacity(markdown.len());
    for raw in markdown.lines() {
        let line = raw.trim_start_matches('#').trim();
        // Skip "| --- | --- |"-style separator rows entirely.
        if line.starts_with("| ---") || line == "|---|" {
            continue;
        }
        match line.strip_prefix('|').and_then(|s| s.strip_suffix('|')) {
            Some(inner) => {
                // Table row: join trimmed cells with spaces.
                let cells: Vec<&str> = inner.split('|').map(str::trim).collect();
                out.push_str(&cells.join(" "));
            }
            None => out.push_str(line),
        }
        out.push('\n');
    }
    out.trim().to_string()
}
// Unit tests: content-type detection, CSV/DOCX parsing, markdown rendering.
#[cfg(test)]
mod tests {
    use super::*;
    use http::HeaderMap;
    fn headers_with(name: &str, value: &str) -> HeaderMap {
        let mut h = HeaderMap::new();
        h.insert(
            name.parse::<http::header::HeaderName>().unwrap(),
            value.parse().unwrap(),
        );
        h
    }
    // --- Content-type detection ---
    #[test]
    fn test_detect_docx_content_type() {
        let headers = headers_with(
            "content-type",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Docx)
        );
    }
    #[test]
    fn test_detect_xlsx_content_type() {
        let headers = headers_with(
            "content-type",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xlsx)
        );
    }
    #[test]
    fn test_detect_xls_content_type() {
        let headers = headers_with("content-type", "application/vnd.ms-excel");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xls)
        );
    }
    #[test]
    fn test_detect_csv_content_type() {
        let headers = headers_with("content-type", "text/csv");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }
    #[test]
    fn test_detect_csv_content_type_with_charset() {
        let headers = headers_with("content-type", "text/csv; charset=utf-8");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }
    #[test]
    fn test_detect_by_url_extension() {
        let empty = HeaderMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx"),
            Some(DocType::Docx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.xlsx"),
            Some(DocType::Xlsx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/old.xls"),
            Some(DocType::Xls)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.csv"),
            Some(DocType::Csv)
        );
    }
    #[test]
    fn test_detect_url_extension_with_query() {
        let empty = HeaderMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
            Some(DocType::Docx)
        );
    }
    #[test]
    fn test_detect_url_extension_case_insensitive() {
        let empty = HeaderMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
            Some(DocType::Xlsx)
        );
    }
    #[test]
    fn test_detect_none_for_html() {
        let headers = headers_with("content-type", "text/html");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/page"),
            None
        );
    }
    #[test]
    fn test_content_type_takes_precedence_over_url() {
        let headers = headers_with("content-type", "text/csv");
        // URL says .xlsx but Content-Type says CSV — header wins
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/data.xlsx"),
            Some(DocType::Csv)
        );
    }
    // --- CSV parsing ---
    #[test]
    fn test_csv_simple() {
        let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| Name | Age | City |"));
        assert!(result.contains("| --- | --- | --- |"));
        assert!(result.contains("| Alice | 30 | NYC |"));
        assert!(result.contains("| Bob | 25 | LA |"));
    }
    #[test]
    fn test_csv_quoted_fields() {
        let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("Has a, comma"));
        assert!(result.contains("Said \"hello\""));
    }
    #[test]
    fn test_csv_empty() {
        let result = extract_csv(b"").unwrap();
        assert_eq!(result, "(empty CSV)");
    }
    #[test]
    fn test_csv_windows_line_endings() {
        let csv = "A,B\r\n1,2\r\n3,4\r\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| A | B |"));
        assert!(result.contains("| 1 | 2 |"));
    }
    // --- DOCX XML parsing ---
    #[test]
    fn test_docx_xml_simple_paragraphs() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>Hello world</w:t></w:r></w:p>
<w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world\n\nSecond paragraph");
    }
    #[test]
    fn test_docx_xml_headings() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
<w:r><w:t>Title</w:t></w:r>
</w:p>
<w:p><w:r><w:t>Body text</w:t></w:r></w:p>
<w:p>
<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
<w:r><w:t>Subtitle</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert!(result.contains("# Title"));
        assert!(result.contains("Body text"));
        assert!(result.contains("## Subtitle"));
    }
    #[test]
    fn test_docx_xml_multiple_runs() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>Hello </w:t></w:r>
<w:r><w:t>world</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world");
    }
    #[test]
    fn test_docx_xml_empty_paragraphs_skipped() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p></w:p>
<w:p><w:r><w:t>Content</w:t></w:r></w:p>
<w:p><w:r><w:t>   </w:t></w:r></w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Content");
    }
    // --- Markdown table ---
    #[test]
    fn test_rows_to_markdown_table() {
        let rows = vec![
            vec!["A".to_string(), "B".to_string()],
            vec!["1".to_string(), "2".to_string()],
            vec!["3".to_string(), "4".to_string()],
        ];
        let table = rows_to_markdown_table(&rows);
        assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |");
    }
    #[test]
    fn test_rows_to_markdown_table_ragged() {
        let rows = vec![
            vec!["A".to_string(), "B".to_string(), "C".to_string()],
            vec!["1".to_string()], // fewer columns
        ];
        let table = rows_to_markdown_table(&rows);
        // Padded empty cells render as "|  |" (two spaces: the " | " joiner on
        // both sides of an empty string). The previous expectation
        // "| 1 | | |" (single spaces) could never match the actual output.
        assert!(table.contains("| 1 |  |  |"));
    }
    // --- Extract result ---
    #[test]
    fn test_extract_csv_result() {
        let csv = "Name,Score\nAlice,100\n";
        let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap();
        assert!(result.content.markdown.contains("| Name | Score |"));
        assert!(result.metadata.word_count > 0);
        assert!(result.content.links.is_empty());
        assert!(result.domain_data.is_none());
    }
    // --- Strip markdown ---
    #[test]
    fn test_strip_markdown() {
        let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |";
        let plain = strip_markdown_formatting(md);
        assert!(plain.contains("Title"));
        assert!(plain.contains("Some text"));
        assert!(plain.contains("A B"));
        assert!(!plain.contains("---"));
    }
}

View file

@ -0,0 +1,24 @@
//! Fetch-layer errors. Wraps HTTP/network failures into a single type
//! that callers can match on without leaking transport details.
use thiserror::Error;
/// Unified error type for the fetch layer.
#[derive(Debug, Error)]
pub enum FetchError {
    /// The underlying wreq HTTP request failed at the transport level.
    #[error("request failed: {0}")]
    Request(#[from] wreq::Error),
    /// The caller-supplied URL could not be parsed or used.
    #[error("invalid url: {0}")]
    InvalidUrl(String),
    /// The response body could not be read or decoded as expected.
    #[error("response body decode failed: {0}")]
    BodyDecode(String),
    /// Content extraction failed (propagated from noxa-core).
    #[error("extraction failed: {0}")]
    Extraction(#[from] noxa_core::ExtractError),
    /// PDF extraction failed (propagated from noxa-pdf).
    #[error("PDF extraction failed: {0}")]
    Pdf(#[from] noxa_pdf::PdfError),
    /// Construction/parsing failure — also used by document extraction for
    /// malformed DOCX/XLSX payloads, not just client building.
    #[error("client build failed: {0}")]
    Build(String),
}

View file

@ -0,0 +1,22 @@
//! noxa-fetch: HTTP client layer with browser TLS fingerprint impersonation.
//! Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
//! Automatically detects PDF responses and delegates to noxa-pdf.
// Module tree.
pub mod browser;
pub mod client;
pub mod crawler;
pub mod document;
pub mod error;
pub mod linkedin;
pub mod proxy;
pub mod reddit;
pub mod sitemap;
pub mod tls;
// Flattened public API re-exports.
pub use browser::BrowserProfile;
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
pub use crawler::{CrawlConfig, CrawlResult, CrawlState, Crawler, PageResult};
pub use error::FetchError;
pub use http::HeaderMap;
pub use proxy::{parse_proxy_file, parse_proxy_line};
pub use sitemap::SitemapEntry;
// Re-exported from a sibling crate so callers need only depend on noxa-fetch.
pub use noxa_pdf::PdfMode;

View file

@ -0,0 +1,279 @@
//! LinkedIn post extraction from authenticated HTML.
//!
//! LinkedIn's SPA stores all data in `<code>` tags as HTML-escaped JSON.
//! The `included` array contains typed entities: Update (post), Comment,
//! Profile, etc. We parse these to reconstruct post + comments as markdown.
use serde_json::Value;
use tracing::debug;
use noxa_core::{Content, ExtractionResult, Metadata};
/// Check if a URL is a LinkedIn post/activity.
///
/// The host must be exactly `linkedin.com` or `www.linkedin.com`, and the
/// URL must contain a post path marker (`/feed/update/` or `/posts/`).
pub fn is_linkedin_post(url: &str) -> bool {
    let without_scheme = url.split("://").nth(1).unwrap_or(url);
    let host = without_scheme.split('/').next().unwrap_or("");
    let on_linkedin = matches!(host, "www.linkedin.com" | "linkedin.com");
    on_linkedin && (url.contains("/feed/update/") || url.contains("/posts/"))
}
/// Extract `<code>` block contents from HTML using simple string scanning.
/// LinkedIn wraps JSON data in `<code>` tags with HTML-escaped content;
/// only blocks larger than 1000 bytes are kept (smaller ones are UI noise).
fn extract_code_blocks(html: &str) -> Vec<String> {
    let mut blocks = Vec::new();
    let mut cursor = 0;
    while let Some(rel) = html[cursor..].find("<code") {
        let open_at = cursor + rel;
        // Locate the end of the opening tag, then the matching close tag.
        let Some(gt) = html[open_at..].find('>') else {
            break;
        };
        let body_start = open_at + gt + 1;
        let Some(close) = html[body_start..].find("</code>") else {
            break;
        };
        let body = &html[body_start..body_start + close];
        if body.len() > 1000 {
            blocks.push(html_unescape(body));
        }
        cursor = body_start + close + "</code>".len();
    }
    blocks
}
/// Extract post + comments from LinkedIn's SSR HTML (requires auth cookies).
///
/// LinkedIn server-renders page data as JSON blobs inside `<code>` elements.
/// This picks the blob with the largest `included` entity array, then walks
/// those entities to recover the post author, body text, and comments,
/// assembling everything into a single markdown document.
///
/// Returns `None` if no JSON payload with a post body is found.
pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult> {
    let code_blocks = extract_code_blocks(html);
    // Find the largest <code> block with "included" — that's the main data payload
    let mut best_included: Option<Vec<Value>> = None;
    for raw in &code_blocks {
        if let Ok(obj) = serde_json::from_str::<Value>(raw)
            && let Some(arr) = obj.get("included").and_then(|v| v.as_array())
        {
            let current_len = best_included.as_ref().map(|a| a.len()).unwrap_or(0);
            if arr.len() > current_len {
                best_included = Some(arr.clone());
            }
        }
    }
    let included = best_included?;
    debug!(entities = included.len(), "linkedin: found included array");
    // Collect profiles (entityUrn → "First Last")
    // Map value is (display name, headline); trim handles a missing last name.
    let mut profiles = std::collections::HashMap::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if t.contains("Profile") {
            let urn = item.get("entityUrn").and_then(|v| v.as_str()).unwrap_or("");
            let first = item.get("firstName").and_then(|v| v.as_str()).unwrap_or("");
            let last = item.get("lastName").and_then(|v| v.as_str()).unwrap_or("");
            let headline = item.get("headline").and_then(|v| v.as_str()).unwrap_or("");
            if !first.is_empty() {
                profiles.insert(
                    urn.to_string(),
                    (
                        format!("{first} {last}").trim().to_string(),
                        headline.to_string(),
                    ),
                );
            }
        }
    }
    // Find the main post (Update type)
    let mut markdown = String::new();
    let mut post_author = String::new();
    let mut post_headline = String::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if !t.contains("Update") {
            continue;
        }
        // Get author from actor profile
        if let Some(actor) = item.get("actor") {
            // actor can have a nested profile reference or inline data.
            // "*author" (reference-field spelling) is tried first, then
            // the plain "author" key.
            let author_urn = actor
                .get("*author")
                .or(actor.get("author"))
                .and_then(|v| v.as_str())
                .unwrap_or("");
            if let Some((name, headline)) = profiles.get(author_urn) {
                post_author = name.clone();
                post_headline = headline.clone();
            }
            // Or inline name
            if post_author.is_empty()
                && let Some(name) = actor.get("name").and_then(|v| v.as_object())
            {
                let text = name.get("text").and_then(|v| v.as_str()).unwrap_or("");
                if !text.is_empty() {
                    post_author = text.to_string();
                }
            }
            if post_headline.is_empty()
                && let Some(desc) = actor.get("description").and_then(|v| v.as_object())
            {
                let text = desc.get("text").and_then(|v| v.as_str()).unwrap_or("");
                if !text.is_empty() {
                    post_headline = text.to_string();
                }
            }
        }
        // Get post body from commentary
        if let Some(commentary) = item.get("commentary")
            && let Some(text) = commentary
                .get("text")
                .and_then(|v| v.as_object())
                .and_then(|o| o.get("text"))
                .and_then(|v| v.as_str())
        {
            // Header lines are emitted only when the author/headline were found.
            if !post_author.is_empty() {
                markdown.push_str(&format!("# {post_author}\n\n"));
            }
            if !post_headline.is_empty() {
                markdown.push_str(&format!("*{post_headline}*\n\n"));
            }
            markdown.push_str("---\n\n");
            // Unescape literal \n from JSON
            markdown.push_str(&text.replace("\\n", "\n"));
            markdown.push_str("\n\n");
        }
    }
    // No post body found in any Update entity — give up on this page.
    if markdown.is_empty() {
        return None;
    }
    // Collect comments — LinkedIn stores comment text in `commentary.text`
    // and commenter name in `commenter.name.text`
    let mut comments: Vec<(String, String)> = Vec::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if !t.contains("Comment") {
            continue;
        }
        // Get comment text from commentary.text
        let text = item
            .get("commentary")
            .and_then(|c| c.get("text"))
            .and_then(|v| v.as_str())
            .unwrap_or("");
        if text.is_empty() {
            continue;
        }
        // Get commenter name from commenter.title.text
        let name = item
            .get("commenter")
            .and_then(|c| c.get("title"))
            .and_then(|n| n.get("text"))
            .and_then(|v| v.as_str())
            .unwrap_or("Someone");
        comments.push((name.to_string(), text.to_string()));
    }
    if !comments.is_empty() {
        markdown.push_str("---\n\n## Comments\n\n");
        for (name, text) in &comments {
            markdown.push_str(&format!("- **{name}**: {text}\n\n"));
        }
    }
    let word_count = markdown.split_whitespace().count();
    debug!(
        word_count,
        comments = comments.len(),
        "linkedin extraction done"
    );
    Some(ExtractionResult {
        metadata: Metadata {
            title: if post_author.is_empty() {
                None
            } else {
                Some(format!("{post_author}'s LinkedIn Post"))
            },
            description: None,
            author: if post_author.is_empty() {
                None
            } else {
                Some(post_author)
            },
            published_date: None,
            language: None,
            url: Some(url.to_string()),
            site_name: Some("LinkedIn".into()),
            image: None,
            favicon: None,
            word_count,
        },
        content: Content {
            markdown,
            // Plain text / links / images are not extracted on this path.
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
/// Unescape HTML entities (named + numeric decimal/hex).
///
/// Handles the five XML-predefined named entities (`&quot;` `&amp;` `&lt;`
/// `&gt;` `&apos;`) plus numeric character references in decimal (`&#65;`)
/// and hexadecimal (`&#x41;`) form. Anything unrecognized — including an
/// `&` that is never terminated by `;` — is passed through verbatim.
fn html_unescape(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c != '&' {
            out.push(c);
            continue;
        }
        // Collect a candidate entity name up to ';'. Real entities are
        // short, so bail after 10 chars instead of scanning to the end.
        let mut entity = String::new();
        let mut terminated = false;
        for c2 in chars.by_ref() {
            if c2 == ';' {
                terminated = true;
                break;
            }
            entity.push(c2);
            if entity.len() > 10 {
                break;
            }
        }
        // Only a properly ';'-terminated sequence can be an entity.
        let decoded = if terminated {
            match entity.as_str() {
                "quot" => Some('"'),
                "amp" => Some('&'),
                "lt" => Some('<'),
                "gt" => Some('>'),
                "apos" => Some('\''),
                num if num.starts_with("#x") || num.starts_with("#X") => {
                    u32::from_str_radix(&num[2..], 16)
                        .ok()
                        .and_then(char::from_u32)
                }
                num if num.starts_with('#') => {
                    num[1..].parse::<u32>().ok().and_then(char::from_u32)
                }
                _ => None,
            }
        } else {
            None
        };
        match decoded {
            Some(ch) => out.push(ch),
            None => {
                // Not an entity: emit the original text unchanged. The ';'
                // is re-emitted only if one was actually consumed — the old
                // code appended a spurious ';' after unterminated ampersands
                // (e.g. "Tom & Jerry" became "Tom & Jerry;").
                out.push('&');
                out.push_str(&entity);
                if terminated {
                    out.push(';');
                }
            }
        }
    }
    out
}

View file

@ -0,0 +1,122 @@
/// Proxy file parsing utilities.
///
/// Format: `host:port:user:pass` (one per line).
/// Lines starting with `#` and blank lines are skipped.
/// Also accepts `host:port` (no auth).
use crate::error::FetchError;
/// Parse a single proxy line into an HTTP proxy URL.
///
/// Two shapes are accepted:
/// - `host:port:user:pass` -> `http://user:pass@host:port`
/// - `host:port`           -> `http://host:port`
///
/// Any other field count yields `None`.
pub fn parse_proxy_line(line: &str) -> Option<String> {
    let fields: Vec<&str> = line.trim().splitn(4, ':').collect();
    match fields.as_slice() {
        // Authenticated: credentials move into the URL userinfo section.
        [host, port, user, pass] => Some(format!("http://{user}:{pass}@{host}:{port}")),
        // Unauthenticated: plain host:port.
        [host, port] => Some(format!("http://{host}:{port}")),
        _ => None,
    }
}
/// Load proxies from a file, returning parsed HTTP proxy URLs.
///
/// Blank lines and `#` comment lines are ignored; every remaining line is
/// run through [`parse_proxy_line`] and silently dropped if malformed.
/// Returns an error if the file can't be read or yields no valid entries.
pub fn parse_proxy_file(path: &str) -> Result<Vec<String>, FetchError> {
    let content = std::fs::read_to_string(path)
        .map_err(|e| FetchError::Build(format!("failed to read proxy file: {e}")))?;
    let mut proxies = Vec::new();
    for raw in content.lines() {
        let line = raw.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        if let Some(proxy_url) = parse_proxy_line(line) {
            proxies.push(proxy_url);
        }
    }
    if proxies.is_empty() {
        return Err(FetchError::Build(
            "proxy file is empty or has no valid entries".into(),
        ));
    }
    Ok(proxies)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    // parse_proxy_line: 4-field authenticated form.
    #[test]
    fn parse_host_port_user_pass() {
        let result = parse_proxy_line("proxy.example.com:8080:alice:s3cret");
        assert_eq!(
            result.as_deref(),
            Some("http://alice:s3cret@proxy.example.com:8080")
        );
    }
    // parse_proxy_line: 2-field bare host:port form (no credentials).
    #[test]
    fn parse_host_port_only() {
        let result = parse_proxy_line("10.0.0.1:3128");
        assert_eq!(result.as_deref(), Some("http://10.0.0.1:3128"));
    }
    // Surrounding whitespace must not affect the parse.
    #[test]
    fn parse_trims_whitespace() {
        let result = parse_proxy_line("  host:9999:user:pass  ");
        assert_eq!(result.as_deref(), Some("http://user:pass@host:9999"));
    }
    // Anything that isn't exactly 2 or 4 colon-separated fields is rejected.
    #[test]
    fn parse_invalid_returns_none() {
        assert!(parse_proxy_line("just-a-hostname").is_none());
        assert!(parse_proxy_line("a:b:c").is_none()); // 3 parts is invalid
        assert!(parse_proxy_line("").is_none());
    }
    // End-to-end file parse: comments and blank lines are skipped and
    // valid entries come back in file order.
    #[test]
    fn parse_file_happy_path() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("proxies.txt");
        let mut f = std::fs::File::create(&path).unwrap();
        writeln!(f, "# residential pool").unwrap();
        writeln!(f, "host1:8080:user1:pass1").unwrap();
        writeln!(f).unwrap(); // blank line
        writeln!(f, "host2:3128").unwrap();
        writeln!(f, "# datacenter").unwrap();
        writeln!(f, "host3:9999:u:p").unwrap();
        drop(f);
        let proxies = parse_proxy_file(path.to_str().unwrap()).unwrap();
        assert_eq!(proxies.len(), 3);
        assert_eq!(proxies[0], "http://user1:pass1@host1:8080");
        assert_eq!(proxies[1], "http://host2:3128");
        assert_eq!(proxies[2], "http://u:p@host3:9999");
    }
    // A file containing only comments/blank lines is an error, not Ok(vec![]).
    #[test]
    fn parse_file_empty_errors() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("empty.txt");
        std::fs::write(&path, "# only comments\n\n").unwrap();
        let err = parse_proxy_file(path.to_str().unwrap());
        assert!(err.is_err());
    }
    // An unreadable path surfaces as an error (no panic).
    #[test]
    fn parse_file_missing_errors() {
        let err = parse_proxy_file("/nonexistent/proxies.txt");
        assert!(err.is_err());
    }
}

View file

@ -0,0 +1,172 @@
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
///
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
/// comment tree as structured JSON, which we convert to clean markdown.
use serde::Deserialize;
use tracing::debug;
use noxa_core::{Content, ExtractionResult, Metadata};
/// Check if a URL points to a Reddit post/comment page.
///
/// Matches `reddit.com` and any subdomain of it (`www.`, `old.`, `np.`,
/// `new.`, `m.`, …) case-insensitively, ignoring an explicit port. A scheme
/// is optional — bare `reddit.com/...` strings also match. The previous
/// fixed host list missed valid hosts like `m.reddit.com`.
pub fn is_reddit_url(url: &str) -> bool {
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("")
        // Drop an explicit port such as `reddit.com:443`.
        .split(':')
        .next()
        .unwrap_or("")
        .to_ascii_lowercase();
    // Suffix check requires the dot, so `notreddit.com` does NOT match.
    host == "reddit.com" || host.ends_with(".reddit.com")
}
/// Build the `.json` URL from a Reddit page URL.
///
/// Strips the query string and the fragment (the previous version left
/// `#...` in place, producing broken `...#frag.json` URLs), drops any
/// trailing slash, then appends `.json`.
pub fn json_url(url: &str) -> String {
    let clean = url
        .split(['?', '#'])
        .next()
        .unwrap_or(url)
        .trim_end_matches('/');
    format!("{clean}.json")
}
/// Convert Reddit JSON API response into an ExtractionResult.
///
/// The `.json` endpoint returns an array of two listings: the first holds
/// the post itself (a `t3` thing), the second holds the comment tree
/// (`t1` things). Both are rendered into one markdown document.
///
/// # Errors
/// Returns a descriptive message if the payload doesn't deserialize as a
/// listing array.
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
    let listings: Vec<Listing> =
        serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
    let mut markdown = String::new();
    let mut title = None;
    let mut author = None;
    let mut subreddit = None;
    // First listing = the post itself
    if let Some(post_listing) = listings.first() {
        for child in &post_listing.data.children {
            // "t3" is Reddit's kind tag for a link/self post.
            if child.kind == "t3" {
                let d = &child.data;
                title = d.title.clone();
                author = d.author.clone();
                subreddit = d.subreddit_name_prefixed.clone();
                if let Some(ref t) = title {
                    markdown.push_str(&format!("# {t}\n\n"));
                }
                if let (Some(a), Some(sr)) = (&author, &subreddit) {
                    markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
                }
                // Self-post body text, when present.
                if let Some(ref body) = d.selftext
                    && !body.is_empty()
                {
                    markdown.push_str(body);
                    markdown.push_str("\n\n");
                }
                // Outbound link for link posts.
                if let Some(ref url_field) = d.url_overridden_by_dest
                    && !url_field.is_empty()
                {
                    markdown.push_str(&format!("[Link]({url_field})\n\n"));
                }
                markdown.push_str("---\n\n");
            }
        }
    }
    // Second listing = comment tree
    if let Some(comment_listing) = listings.get(1) {
        markdown.push_str("## Comments\n\n");
        for child in &comment_listing.data.children {
            render_comment(child, 0, &mut markdown);
        }
    }
    let word_count = markdown.split_whitespace().count();
    debug!(word_count, "reddit json extracted");
    Ok(ExtractionResult {
        metadata: Metadata {
            title,
            description: None,
            author,
            published_date: None,
            language: Some("en".into()),
            url: Some(url.to_string()),
            site_name: subreddit,
            image: None,
            favicon: None,
            word_count,
        },
        content: Content {
            markdown,
            // Plain text / links / images are not extracted on this path.
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
/// Render one comment (and, recursively, its replies) as an indented
/// markdown list item appended to `out`. Non-comment things are ignored.
fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
    // Only "t1" things are comments; skip anything else (e.g. "more" stubs).
    if thing.kind != "t1" {
        return;
    }
    let data = &thing.data;
    let pad = "  ".repeat(depth);
    let who = data.author.as_deref().unwrap_or("[deleted]");
    let text = data.body.as_deref().unwrap_or("[removed]");
    let pts = data.score.unwrap_or(0);
    out.push_str(&format!("{pad}- **u/{who}** ({pts} pts)\n"));
    // Keep multi-line bodies aligned under the bullet.
    for line in text.lines() {
        out.push_str(&format!("{pad}  {line}\n"));
    }
    out.push('\n');
    // Walk the reply tree one level deeper (replies may also be "").
    if let Some(Replies::Listing(replies)) = &data.replies {
        for reply in &replies.data.children {
            render_comment(reply, depth + 1, out);
        }
    }
}
// --- Reddit JSON types (minimal) ---
// Only the fields we actually read are modeled; serde ignores the rest of
// Reddit's (large) payload.
/// Outer wrapper: every API page is a "Listing" object.
#[derive(Deserialize)]
struct Listing {
    data: ListingData,
}
/// Inner object of a listing; `children` are the actual "things".
#[derive(Deserialize)]
struct ListingData {
    children: Vec<Thing>,
}
/// A Reddit "thing": `kind` is a type tag ("t3" = post, "t1" = comment).
#[derive(Deserialize)]
struct Thing {
    kind: String,
    data: ThingData,
}
/// Union of the post and comment fields we care about; fields missing from
/// the payload simply deserialize to None.
#[derive(Deserialize)]
struct ThingData {
    // Post fields (t3)
    title: Option<String>,
    selftext: Option<String>,
    subreddit_name_prefixed: Option<String>,
    url_overridden_by_dest: Option<String>,
    // Comment fields (t1)
    author: Option<String>,
    body: Option<String>,
    score: Option<i64>,
    replies: Option<Replies>,
}
/// Reddit replies can be either a nested Listing or an empty string.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
    Listing(Listing),
    // The empty-string form ("replies": ""); its value is never read.
    #[allow(dead_code)]
    Empty(String),
}

View file

@ -0,0 +1,601 @@
/// Sitemap parsing and URL discovery.
///
/// Discovers URLs from a site's sitemaps using a 3-step process:
/// 1. Parse robots.txt for `Sitemap:` directives
/// 2. Try common sitemap paths as fallback
/// 3. Recursively resolve sitemap index files
///
/// All HTTP requests go through FetchClient to inherit TLS fingerprinting.
use std::collections::HashSet;
use quick_xml::Reader;
use quick_xml::events::Event;
use serde::Serialize;
use tracing::{debug, warn};
use crate::client::FetchClient;
use crate::error::FetchError;
/// Maximum depth when recursively fetching sitemap index files.
/// Prevents infinite loops from circular sitemap references.
const MAX_RECURSION_DEPTH: usize = 3;
/// Common sitemap paths to try when robots.txt doesn't list any.
/// Covers the standard locations plus the WordPress default.
const FALLBACK_SITEMAP_PATHS: &[&str] = &[
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/wp-sitemap.xml",
    "/sitemap/sitemap-index.xml",
];
/// A single URL discovered from a sitemap.
#[derive(Debug, Clone, Serialize)]
pub struct SitemapEntry {
    /// Absolute URL from the `<loc>` element.
    pub url: String,
    /// Raw `<lastmod>` value, if present (kept as a string, not parsed).
    pub last_modified: Option<String>,
    /// `<priority>` parsed as a float; None when absent or non-numeric.
    pub priority: Option<f64>,
    /// Raw `<changefreq>` value, if present.
    pub change_freq: Option<String>,
}
/// Discover all URLs from a site's sitemaps.
///
/// Discovery order:
/// 1. Fetch /robots.txt and parse its `Sitemap:` directives
/// 2. Append the common fallback paths (skipping any already found)
/// 3. Recursively resolve sitemap index files
/// 4. Deduplicate entries by URL
///
/// Returns an empty vec (not an error) if no sitemaps are found.
pub async fn discover(
    client: &FetchClient,
    base_url: &str,
) -> Result<Vec<SitemapEntry>, FetchError> {
    let base = base_url.trim_end_matches('/');

    // Step 1: robots.txt `Sitemap:` directives seed the candidate list.
    let robots_url = format!("{base}/robots.txt");
    debug!(url = %robots_url, "fetching robots.txt");
    let mut sitemap_urls: Vec<String> = match client.fetch(&robots_url).await {
        Ok(result) if result.status == 200 => {
            let found = parse_robots_txt(&result.html);
            debug!(count = found.len(), "sitemap URLs from robots.txt");
            found
        }
        Ok(result) => {
            debug!(status = result.status, "robots.txt not found");
            Vec::new()
        }
        Err(e) => {
            debug!(error = %e, "failed to fetch robots.txt");
            Vec::new()
        }
    };

    // Step 2: add the well-known fallback paths not already listed.
    for path in FALLBACK_SITEMAP_PATHS {
        let candidate = format!("{base}{path}");
        if !sitemap_urls.contains(&candidate) {
            sitemap_urls.push(candidate);
        }
    }

    // Step 3: fetch everything, recursing through index files and
    // deduplicating page URLs as we go.
    let mut seen_urls: HashSet<String> = HashSet::new();
    let mut entries: Vec<SitemapEntry> = Vec::new();
    fetch_sitemaps(client, &sitemap_urls, &mut entries, &mut seen_urls, 0).await;
    debug!(total = entries.len(), "sitemap discovery complete");
    Ok(entries)
}
/// Recursively fetch and parse sitemap URLs, handling both urlsets and indexes.
///
/// Fetch failures and non-200 responses are logged and skipped, never fatal.
/// `seen_urls` deduplicates page URLs across all sitemaps; `depth` guards
/// against circular index references via MAX_RECURSION_DEPTH.
async fn fetch_sitemaps(
    client: &FetchClient,
    urls: &[String],
    entries: &mut Vec<SitemapEntry>,
    seen_urls: &mut HashSet<String>,
    depth: usize,
) {
    if depth > MAX_RECURSION_DEPTH {
        warn!(depth, "sitemap recursion limit reached, stopping");
        return;
    }
    for sitemap_url in urls {
        debug!(url = %sitemap_url, depth, "fetching sitemap");
        let xml = match client.fetch(sitemap_url).await {
            Ok(result) if result.status == 200 => result.html,
            Ok(result) => {
                debug!(url = %sitemap_url, status = result.status, "sitemap not found");
                continue;
            }
            Err(e) => {
                debug!(url = %sitemap_url, error = %e, "failed to fetch sitemap");
                continue;
            }
        };
        match detect_sitemap_type(&xml) {
            SitemapType::UrlSet => {
                let parsed = parse_urlset(&xml);
                for entry in parsed {
                    // insert() is false for duplicates — keep only new URLs.
                    if seen_urls.insert(entry.url.clone()) {
                        entries.push(entry);
                    }
                }
            }
            SitemapType::Index => {
                let child_urls = parse_sitemap_index(&xml);
                debug!(count = child_urls.len(), "found child sitemaps in index");
                // Box the recursive call to avoid large future sizes
                Box::pin(fetch_sitemaps(
                    client,
                    &child_urls,
                    entries,
                    seen_urls,
                    depth + 1,
                ))
                .await;
            }
            SitemapType::Unknown => {
                debug!(url = %sitemap_url, "unrecognized sitemap format, skipping");
            }
        }
    }
}
// ---------------------------------------------------------------------------
// Pure parsing functions (no I/O, fully testable)
// ---------------------------------------------------------------------------
/// Extract `Sitemap:` directive URLs from robots.txt content.
///
/// The directive name is matched case-insensitively (`Sitemap:`, `sitemap:`,
/// `SITEMAP:` all count) and the value is trimmed; directives with an empty
/// value are skipped. Splitting on the first `:` (instead of byte-slicing a
/// fixed `[..8]` prefix) keeps this panic-free when a line contains
/// multi-byte UTF-8 characters around the old slice boundary — robots.txt
/// is untrusted input.
pub fn parse_robots_txt(text: &str) -> Vec<String> {
    text.lines()
        .filter_map(|line| {
            // Lines without any ':' can't be directives.
            let (key, value) = line.trim().split_once(':')?;
            if !key.eq_ignore_ascii_case("sitemap") {
                return None;
            }
            let url = value.trim();
            (!url.is_empty()).then(|| url.to_string())
        })
        .collect()
}
/// Parse a sitemap XML string. Handles both `<urlset>` and `<sitemapindex>`.
/// Returns entries from urlsets and recursion targets from indexes.
pub fn parse_sitemap_xml(xml: &str) -> Vec<SitemapEntry> {
    match detect_sitemap_type(xml) {
        SitemapType::UrlSet => parse_urlset(xml),
        SitemapType::Index => {
            // For the public parsing API an index's <loc> values become
            // URL-only entries; actual recursive fetching happens in the
            // async `discover` path.
            let mut entries = Vec::new();
            for url in parse_sitemap_index(xml) {
                entries.push(SitemapEntry {
                    url,
                    last_modified: None,
                    priority: None,
                    change_freq: None,
                });
            }
            entries
        }
        SitemapType::Unknown => Vec::new(),
    }
}
/// Root-element classification for a fetched sitemap document.
#[derive(Debug, PartialEq)]
enum SitemapType {
    /// `<urlset>` root: a leaf sitemap listing page URLs.
    UrlSet,
    /// `<sitemapindex>` root: lists child sitemaps to recurse into.
    Index,
    /// No recognizable root element (or unparsable XML).
    Unknown,
}
/// Peek at the document's elements to decide whether this is a urlset or a
/// sitemapindex; anything else (or a parse error / EOF) is Unknown.
fn detect_sitemap_type(xml: &str) -> SitemapType {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
                match e.local_name().as_ref() {
                    b"urlset" => return SitemapType::UrlSet,
                    b"sitemapindex" => return SitemapType::Index,
                    // Skip other elements (processing instructions,
                    // comments, unexpected wrappers) and keep scanning.
                    _ => {}
                }
            }
            Ok(Event::Eof) | Err(_) => return SitemapType::Unknown,
            _ => {}
        }
    }
}
/// Parse `<url>` entries from a `<urlset>` sitemap.
///
/// Streams the XML with quick-xml, accumulating one `SitemapEntry` per
/// `<url>` element. An entry without a `<loc>` is dropped. On a parse
/// error, the entries collected so far are returned (partial results).
fn parse_urlset(xml: &str) -> Vec<SitemapEntry> {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    let mut entries = Vec::new();
    // State for current <url> element being parsed
    let mut in_url = false;
    let mut current_tag: Option<UrlTag> = None;
    let mut loc: Option<String> = None;
    let mut lastmod: Option<String> = None;
    let mut priority: Option<f64> = None;
    let mut changefreq: Option<String> = None;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"url" => {
                        // Opening a new <url>: reset all per-entry state.
                        in_url = true;
                        loc = None;
                        lastmod = None;
                        priority = None;
                        changefreq = None;
                    }
                    b"loc" if in_url => current_tag = Some(UrlTag::Loc),
                    b"lastmod" if in_url => current_tag = Some(UrlTag::LastMod),
                    b"priority" if in_url => current_tag = Some(UrlTag::Priority),
                    b"changefreq" if in_url => current_tag = Some(UrlTag::ChangeFreq),
                    _ => current_tag = None,
                }
            }
            Ok(Event::Text(ref e)) => {
                // Text only matters while inside a recognized child tag.
                if let Some(ref tag) = current_tag
                    && let Ok(text) = e.unescape()
                {
                    let text = text.trim().to_string();
                    if !text.is_empty() {
                        match tag {
                            UrlTag::Loc => loc = Some(text),
                            UrlTag::LastMod => lastmod = Some(text),
                            // A non-numeric <priority> silently becomes None.
                            UrlTag::Priority => priority = text.parse().ok(),
                            UrlTag::ChangeFreq => changefreq = Some(text),
                        }
                    }
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.local_name();
                if name.as_ref() == b"url" && in_url {
                    // </url>: emit the entry only if it carried a <loc>.
                    if let Some(url) = loc.take() {
                        entries.push(SitemapEntry {
                            url,
                            last_modified: lastmod.take(),
                            priority: priority.take(),
                            change_freq: changefreq.take(),
                        });
                    }
                    in_url = false;
                }
                current_tag = None;
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                warn!(error = %e, "XML parse error in sitemap, returning partial results");
                break;
            }
            _ => {}
        }
        buf.clear();
    }
    entries
}
/// Which child element of `<url>` the parser is currently reading text for.
#[derive(Debug)]
enum UrlTag {
    Loc,
    LastMod,
    Priority,
    ChangeFreq,
}
/// Parse `<sitemap>` entries from a `<sitemapindex>`, returning child sitemap URLs.
///
/// Collects the text of every `<loc>` nested inside a `<sitemap>` element.
/// On a parse error, the URLs collected so far are returned.
fn parse_sitemap_index(xml: &str) -> Vec<String> {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    let mut urls = Vec::new();
    // Two-flag state: only <loc> text inside a <sitemap> counts.
    let mut in_sitemap = false;
    let mut in_loc = false;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"sitemap" => in_sitemap = true,
                    b"loc" if in_sitemap => in_loc = true,
                    _ => {}
                }
            }
            Ok(Event::Text(ref e)) => {
                if in_loc && let Ok(text) = e.unescape() {
                    let text = text.trim().to_string();
                    if !text.is_empty() {
                        urls.push(text);
                    }
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"sitemap" => {
                        // Closing </sitemap> resets both flags defensively.
                        in_sitemap = false;
                        in_loc = false;
                    }
                    b"loc" => in_loc = false,
                    _ => {}
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                warn!(error = %e, "XML parse error in sitemap index, returning partial results");
                break;
            }
            _ => {}
        }
        buf.clear();
    }
    urls
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    // Full urlset parse: all four child elements, plus a minimal entry.
    #[test]
    fn test_parse_urlset() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url>
        <loc>https://example.com/</loc>
        <lastmod>2026-01-15</lastmod>
        <changefreq>daily</changefreq>
        <priority>1.0</priority>
        </url>
        <url>
        <loc>https://example.com/about</loc>
        <lastmod>2026-01-10</lastmod>
        <changefreq>monthly</changefreq>
        <priority>0.8</priority>
        </url>
        <url>
        <loc>https://example.com/blog/post-1</loc>
        </url>
        </urlset>"#;
        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].url, "https://example.com/");
        assert_eq!(entries[0].last_modified.as_deref(), Some("2026-01-15"));
        assert_eq!(entries[0].change_freq.as_deref(), Some("daily"));
        assert_eq!(entries[0].priority, Some(1.0));
        assert_eq!(entries[1].url, "https://example.com/about");
        assert_eq!(entries[1].priority, Some(0.8));
        assert_eq!(entries[2].url, "https://example.com/blog/post-1");
        assert_eq!(entries[2].last_modified, None);
        assert_eq!(entries[2].priority, None);
        assert_eq!(entries[2].change_freq, None);
    }
    // Index parse returns child sitemap URLs in document order.
    #[test]
    fn test_parse_sitemap_index() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <sitemap>
        <loc>https://example.com/sitemap-posts.xml</loc>
        <lastmod>2026-03-01</lastmod>
        </sitemap>
        <sitemap>
        <loc>https://example.com/sitemap-pages.xml</loc>
        </sitemap>
        </sitemapindex>"#;
        let urls = parse_sitemap_index(xml);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0], "https://example.com/sitemap-posts.xml");
        assert_eq!(urls[1], "https://example.com/sitemap-pages.xml");
    }
    // The public entry point routes <urlset> documents to parse_urlset.
    #[test]
    fn test_parse_sitemap_xml_dispatches_urlset() {
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url><loc>https://example.com/page</loc></url>
        </urlset>"#;
        let entries = parse_sitemap_xml(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].url, "https://example.com/page");
    }
    // ...and routes <sitemapindex> documents to URL-only entries.
    #[test]
    fn test_parse_sitemap_xml_dispatches_index() {
        let xml = r#"<?xml version="1.0"?>
        <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <sitemap><loc>https://example.com/sitemap-1.xml</loc></sitemap>
        </sitemapindex>"#;
        let entries = parse_sitemap_xml(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].url, "https://example.com/sitemap-1.xml");
        // Index entries have no metadata when parsed through the public API
        assert_eq!(entries[0].priority, None);
    }
    // Sitemap directives are matched case-insensitively.
    #[test]
    fn test_parse_robots_txt() {
        let robots = "User-agent: *\n\
                      Disallow: /admin/\n\
                      \n\
                      Sitemap: https://example.com/sitemap.xml\n\
                      sitemap: https://example.com/sitemap-news.xml\n\
                      SITEMAP: https://example.com/sitemap-images.xml\n\
                      \n\
                      User-agent: Googlebot\n\
                      Allow: /\n";
        let urls = parse_robots_txt(robots);
        assert_eq!(urls.len(), 3);
        assert_eq!(urls[0], "https://example.com/sitemap.xml");
        assert_eq!(urls[1], "https://example.com/sitemap-news.xml");
        assert_eq!(urls[2], "https://example.com/sitemap-images.xml");
    }
    #[test]
    fn test_parse_robots_txt_empty_value() {
        // "Sitemap:" with no URL should be skipped
        let robots = "Sitemap:\nSitemap: \nSitemap: https://example.com/s.xml\n";
        let urls = parse_robots_txt(robots);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0], "https://example.com/s.xml");
    }
    #[test]
    fn test_deduplicate() {
        // parse_sitemap_xml deduplicates via the discover() path, but
        // we can verify that parsing the same URL twice produces entries
        // that the HashSet in discover() would collapse.
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url><loc>https://example.com/page</loc></url>
        <url><loc>https://example.com/page</loc></url>
        <url><loc>https://example.com/other</loc></url>
        </urlset>"#;
        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 3, "parser returns all entries");
        // Simulate the dedup that discover() does
        let mut seen = HashSet::new();
        let deduped: Vec<_> = entries
            .into_iter()
            .filter(|e| seen.insert(e.url.clone()))
            .collect();
        assert_eq!(deduped.len(), 2, "dedup collapses duplicates");
    }
    // A urlset with no <url> children yields no entries.
    #[test]
    fn test_empty_sitemap() {
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        </urlset>"#;
        let entries = parse_urlset(xml);
        assert!(entries.is_empty());
    }
    // Garbage input must not panic — it just yields nothing.
    #[test]
    fn test_malformed_xml() {
        let xml = "this is not xml at all <><><";
        let entries = parse_sitemap_xml(xml);
        assert!(entries.is_empty(), "malformed XML returns empty vec");
    }
    #[test]
    fn test_malformed_xml_partial() {
        // Partial XML that starts valid but breaks mid-stream
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url><loc>https://example.com/good</loc></url>
        <url><loc>broken
        "#;
        let entries = parse_sitemap_xml(xml);
        // Should return at least the successfully parsed entry
        assert!(entries.len() >= 1);
        assert_eq!(entries[0].url, "https://example.com/good");
    }
    // Entries without a <loc> are dropped; the rest survive.
    #[test]
    fn test_missing_loc() {
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url>
        <lastmod>2026-01-01</lastmod>
        <priority>0.5</priority>
        </url>
        <url>
        <loc>https://example.com/valid</loc>
        </url>
        </urlset>"#;
        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 1, "entry without <loc> is skipped");
        assert_eq!(entries[0].url, "https://example.com/valid");
    }
    // <priority> values parse as floats; non-numeric values become None.
    #[test]
    fn test_priority_parsing() {
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url>
        <loc>https://example.com/high</loc>
        <priority>1.0</priority>
        </url>
        <url>
        <loc>https://example.com/mid</loc>
        <priority>0.5</priority>
        </url>
        <url>
        <loc>https://example.com/low</loc>
        <priority>0.1</priority>
        </url>
        <url>
        <loc>https://example.com/invalid</loc>
        <priority>not-a-number</priority>
        </url>
        </urlset>"#;
        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 4);
        assert_eq!(entries[0].priority, Some(1.0));
        assert_eq!(entries[1].priority, Some(0.5));
        assert_eq!(entries[2].priority, Some(0.1));
        assert_eq!(entries[3].priority, None, "invalid priority parses as None");
    }
    // Root-element detection for both known roots plus garbage/empty input.
    #[test]
    fn test_detect_sitemap_type() {
        let urlset = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        assert_eq!(detect_sitemap_type(urlset), SitemapType::UrlSet);
        let index = r#"<?xml version="1.0"?><sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></sitemapindex>"#;
        assert_eq!(detect_sitemap_type(index), SitemapType::Index);
        assert_eq!(detect_sitemap_type("garbage"), SitemapType::Unknown);
        assert_eq!(detect_sitemap_type(""), SitemapType::Unknown);
    }
    #[test]
    fn test_fallback_paths_constant() {
        // Verify the constant has the expected paths
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap.xml"));
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml"));
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml"));
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml"));
    }
}

View file

@ -0,0 +1,372 @@
//! Browser TLS + HTTP/2 fingerprint profiles built on wreq (BoringSSL).
//!
//! Replaces the old noxa-http/noxa-tls patched rustls stack.
//! Each profile configures TLS options (cipher suites, curves, extensions,
//! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
//! stream dependency, priorities) to match real browser fingerprints.
use std::time::Duration;
use wreq::http2::{
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
};
use wreq::tls::{AlpsProtocol, CertificateCompressionAlgorithm, TlsOptions, TlsVersion};
use wreq::{Client, Emulation};
use crate::browser::BrowserVariant;
use crate::error::FetchError;
// NOTE(review): per the module docs, these strings feed wreq's TLS options
// to mimic real browser ClientHellos — the ordering appears to be part of
// the fingerprint, so verify against fresh browser captures before editing.
/// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order).
const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";
/// Chrome signature algorithms.
const CHROME_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkcs1_sha256:ecdsa_secp384r1_sha384:rsa_pss_rsae_sha384:rsa_pkcs1_sha384:rsa_pss_rsae_sha512:rsa_pkcs1_sha512";
/// Chrome curves (post-quantum ML-KEM + X25519 + P-256 + P-384).
const CHROME_CURVES: &str = "X25519MLKEM768:X25519:P-256:P-384";
/// Firefox cipher list.
const FIREFOX_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_CHACHA20_POLY1305_SHA256:TLS_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";
/// Firefox signature algorithms.
const FIREFOX_SIGALGS: &str = "ecdsa_secp256r1_sha256:ecdsa_secp384r1_sha384:ecdsa_secp521r1_sha512:rsa_pss_rsae_sha256:rsa_pss_rsae_sha384:rsa_pss_rsae_sha512:rsa_pkcs1_sha256:rsa_pkcs1_sha384:rsa_pkcs1_sha512:ecdsa_sha1:rsa_pkcs1_sha1";
/// Firefox curves.
const FIREFOX_CURVES: &str = "X25519MLKEM768:X25519:P-256:P-384:P-521";
/// Safari cipher list.
const SAFARI_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_CBC_SHA";
/// Safari signature algorithms.
const SAFARI_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkcs1_sha256:ecdsa_secp384r1_sha384:rsa_pss_rsae_sha384:ecdsa_secp521r1_sha512:rsa_pss_rsae_sha512:rsa_pkcs1_sha384:rsa_pkcs1_sha512";
/// Safari curves (no post-quantum group, unlike Chrome/Firefox).
const SAFARI_CURVES: &str = "X25519:P-256:P-384:P-521";
// --- Chrome HTTP headers in correct wire order ---
const CHROME_HEADERS: &[(&str, &str)] = &[
(
"sec-ch-ua",
r#""Google Chrome";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
),
("sec-ch-ua-mobile", "?0"),
("sec-ch-ua-platform", "\"Windows\""),
("upgrade-insecure-requests", "1"),
(
"user-agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
),
(
"accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
),
("sec-fetch-site", "none"),
("sec-fetch-mode", "navigate"),
("sec-fetch-user", "?1"),
("sec-fetch-dest", "document"),
("accept-encoding", "gzip, deflate, br, zstd"),
("accept-language", "en-US,en;q=0.9"),
("priority", "u=0, i"),
];
const CHROME_MACOS_HEADERS: &[(&str, &str)] = &[
(
"sec-ch-ua",
r#""Google Chrome";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
),
("sec-ch-ua-mobile", "?0"),
("sec-ch-ua-platform", "\"macOS\""),
("upgrade-insecure-requests", "1"),
(
"user-agent",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
),
(
"accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
),
("sec-fetch-site", "none"),
("sec-fetch-mode", "navigate"),
("sec-fetch-user", "?1"),
("sec-fetch-dest", "document"),
("accept-encoding", "gzip, deflate, br, zstd"),
("accept-language", "en-US,en;q=0.9"),
("priority", "u=0, i"),
];
// Firefox-on-Windows profile. Note Firefox sends no sec-ch-ua client hints
// and uses q=0.5 for the language fallback. Slice order is the wire order —
// do not reorder entries.
// NOTE(review): UA claims Firefox 135; keep in sync with `firefox_tls`/
// `firefox_h2` when bumping.
const FIREFOX_HEADERS: &[(&str, &str)] = &[
    (
        "user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
    ),
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    ),
    ("accept-language", "en-US,en;q=0.5"),
    ("accept-encoding", "gzip, deflate, br, zstd"),
    ("upgrade-insecure-requests", "1"),
    ("sec-fetch-dest", "document"),
    ("sec-fetch-mode", "navigate"),
    ("sec-fetch-site", "none"),
    ("sec-fetch-user", "?1"),
    ("priority", "u=0, i"),
];
// Safari-on-macOS profile. No client hints, no zstd in accept-encoding, no
// sec-fetch-user/priority. The interleaved ordering (sec-fetch-* mixed with
// accept-*) looks odd but is presumably Safari's real emission order —
// verify against a live capture before changing. Slice order is the wire
// order — do not reorder entries.
const SAFARI_HEADERS: &[(&str, &str)] = &[
    (
        "user-agent",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3.1 Safari/605.1.15",
    ),
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    ),
    ("sec-fetch-site", "none"),
    ("accept-language", "en-US,en;q=0.9"),
    ("sec-fetch-mode", "navigate"),
    ("accept-encoding", "gzip, deflate, br"),
    ("sec-fetch-dest", "document"),
];
// Edge-on-Windows profile: Chromium-based, so it mirrors `CHROME_HEADERS`
// except for the sec-ch-ua brand list and the `Edg/…` UA suffix. Edge shares
// Chrome's TLS/H2 fingerprint (see `build_client`). Slice order is the wire
// order — do not reorder entries.
const EDGE_HEADERS: &[(&str, &str)] = &[
    (
        "sec-ch-ua",
        r#""Microsoft Edge";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
    ),
    ("sec-ch-ua-mobile", "?0"),
    ("sec-ch-ua-platform", "\"Windows\""),
    ("upgrade-insecure-requests", "1"),
    (
        "user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36 Edg/145.0.0.0",
    ),
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    ),
    ("sec-fetch-site", "none"),
    ("sec-fetch-mode", "navigate"),
    ("sec-fetch-user", "?1"),
    ("sec-fetch-dest", "document"),
    ("accept-encoding", "gzip, deflate, br, zstd"),
    ("accept-language", "en-US,en;q=0.9"),
    ("priority", "u=0, i"),
];
/// TLS options presenting Chrome's handshake fingerprint.
///
/// Preference lists come from the `CHROME_*` constants; the remaining
/// toggles mirror Chrome-specific ClientHello behavior (GREASE, extension
/// permutation, ALPS, Brotli cert compression).
fn chrome_tls() -> TlsOptions {
    // Cipher / sigalg / curve preference lists.
    let opts = TlsOptions::builder()
        .cipher_list(CHROME_CIPHERS)
        .sigalgs_list(CHROME_SIGALGS)
        .curves_list(CHROME_CURVES);
    // Negotiable protocol window: TLS 1.2 through 1.3.
    let opts = opts
        .min_tls_version(TlsVersion::TLS_1_2)
        .max_tls_version(TlsVersion::TLS_1_3);
    // Chrome-specific extension behavior.
    opts.grease_enabled(true)
        .permute_extensions(true)
        .enable_ech_grease(true)
        .pre_shared_key(true)
        .enable_ocsp_stapling(true)
        .enable_signed_cert_timestamps(true)
        .alps_protocols([AlpsProtocol::HTTP2])
        .alps_use_new_codepoint(true)
        .aes_hw_override(true)
        .certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI])
        .build()
}
/// TLS options presenting Firefox's handshake fingerprint.
///
/// Unlike Chrome, Firefox does not permute extension order and offers both
/// zlib and Brotli certificate compression.
fn firefox_tls() -> TlsOptions {
    // Cipher / sigalg / curve preference lists.
    let opts = TlsOptions::builder()
        .cipher_list(FIREFOX_CIPHERS)
        .sigalgs_list(FIREFOX_SIGALGS)
        .curves_list(FIREFOX_CURVES);
    // Negotiable protocol window: TLS 1.2 through 1.3.
    let opts = opts
        .min_tls_version(TlsVersion::TLS_1_2)
        .max_tls_version(TlsVersion::TLS_1_3);
    // Firefox-specific extension behavior.
    opts.grease_enabled(true)
        .permute_extensions(false)
        .enable_ech_grease(true)
        .pre_shared_key(true)
        .enable_ocsp_stapling(true)
        .enable_signed_cert_timestamps(true)
        .certificate_compression_algorithms(&[
            CertificateCompressionAlgorithm::ZLIB,
            CertificateCompressionAlgorithm::BROTLI,
        ])
        .build()
}
/// TLS options presenting Safari's handshake fingerprint.
///
/// Safari sends no ECH GREASE and no pre-shared-key extension, and offers
/// only zlib certificate compression.
fn safari_tls() -> TlsOptions {
    // Cipher / sigalg / curve preference lists.
    let opts = TlsOptions::builder()
        .cipher_list(SAFARI_CIPHERS)
        .sigalgs_list(SAFARI_SIGALGS)
        .curves_list(SAFARI_CURVES);
    // Negotiable protocol window: TLS 1.2 through 1.3.
    let opts = opts
        .min_tls_version(TlsVersion::TLS_1_2)
        .max_tls_version(TlsVersion::TLS_1_3);
    // Safari-specific extension behavior.
    opts.grease_enabled(true)
        .permute_extensions(false)
        .enable_ech_grease(false)
        .pre_shared_key(false)
        .enable_ocsp_stapling(true)
        .enable_signed_cert_timestamps(true)
        .certificate_compression_algorithms(&[CertificateCompressionAlgorithm::ZLIB])
        .build()
}
/// HTTP/2 options matching Chrome's SETTINGS frame and frame ordering.
///
/// The numeric values, SETTINGS emission order, pseudo-header order, and the
/// HEADERS stream-dependency weight are all part of the HTTP/2 fingerprint
/// (Akamai-style); change any of them only against a live Chrome capture.
fn chrome_h2() -> Http2Options {
    Http2Options::builder()
        // 6 MiB per-stream window, 15 MiB connection window.
        .initial_window_size(6_291_456)
        .initial_connection_window_size(15_728_640)
        .max_header_list_size(262_144)
        .header_table_size(65_536)
        // NOTE(review): advertising MAX_CONCURRENT_STREAMS=1000 — confirm
        // current Chrome actually sends this setting.
        .max_concurrent_streams(1000u32)
        .enable_push(false)
        // Order in which SETTINGS identifiers appear in the frame.
        .settings_order(
            SettingsOrder::builder()
                .extend([
                    SettingId::HeaderTableSize,
                    SettingId::EnablePush,
                    SettingId::MaxConcurrentStreams,
                    SettingId::InitialWindowSize,
                    SettingId::MaxFrameSize,
                    SettingId::MaxHeaderListSize,
                    SettingId::EnableConnectProtocol,
                    SettingId::NoRfc7540Priorities,
                ])
                .build(),
        )
        // Chrome's pseudo-header order: m,a,s,p.
        .headers_pseudo_order(
            PseudoOrder::builder()
                .extend([
                    PseudoId::Method,
                    PseudoId::Authority,
                    PseudoId::Scheme,
                    PseudoId::Path,
                ])
                .build(),
        )
        // HEADERS frame priority: depends on stream 0, weight 219, exclusive.
        .headers_stream_dependency(StreamDependency::new(StreamId::zero(), 219, true))
        .build()
}
/// HTTP/2 options matching Firefox's SETTINGS frame and frame ordering.
///
/// Values, SETTINGS order, and pseudo-header order are fingerprint-critical;
/// verify against a live Firefox capture before changing.
fn firefox_h2() -> Http2Options {
    Http2Options::builder()
        // 128 KiB per-stream window, ~12 MiB connection window.
        .initial_window_size(131_072)
        .initial_connection_window_size(12_517_377)
        .max_header_list_size(65_536)
        .header_table_size(65_536)
        // Firefox emits only these three SETTINGS identifiers.
        .settings_order(
            SettingsOrder::builder()
                .extend([
                    SettingId::HeaderTableSize,
                    SettingId::InitialWindowSize,
                    SettingId::MaxFrameSize,
                ])
                .build(),
        )
        // Firefox's pseudo-header order: m,p,a,s.
        .headers_pseudo_order(
            PseudoOrder::builder()
                .extend([
                    PseudoId::Method,
                    PseudoId::Path,
                    PseudoId::Authority,
                    PseudoId::Scheme,
                ])
                .build(),
        )
        .build()
}
/// HTTP/2 options matching Safari's SETTINGS frame and frame ordering.
///
/// Values, SETTINGS order, pseudo-header order, and the non-exclusive
/// weight-255 stream dependency are fingerprint-critical; verify against a
/// live Safari capture before changing.
fn safari_h2() -> Http2Options {
    Http2Options::builder()
        // 2 MiB per-stream window, ~10 MiB connection window.
        .initial_window_size(2_097_152)
        .initial_connection_window_size(10_420_225)
        // NOTE(review): max_header_list_size(0) presumably means "advertise
        // 0"/unlimited per wreq's semantics — confirm against wreq docs.
        .max_header_list_size(0)
        .header_table_size(4_096)
        .enable_push(false)
        .max_concurrent_streams(100u32)
        .settings_order(
            SettingsOrder::builder()
                .extend([
                    SettingId::EnablePush,
                    SettingId::MaxConcurrentStreams,
                    SettingId::InitialWindowSize,
                    SettingId::MaxFrameSize,
                ])
                .build(),
        )
        // Safari's pseudo-header order: m,s,a,p.
        .headers_pseudo_order(
            PseudoOrder::builder()
                .extend([
                    PseudoId::Method,
                    PseudoId::Scheme,
                    PseudoId::Authority,
                    PseudoId::Path,
                ])
                .build(),
        )
        // HEADERS frame priority: stream 0, weight 255, non-exclusive.
        .headers_stream_dependency(StreamDependency::new(StreamId::zero(), 255, false))
        .build()
}
/// Convert a static `(name, value)` table into an [`http::HeaderMap`],
/// preserving table order (which is the wire order for these profiles).
///
/// Entries whose name or value fails parsing are skipped; the tables are
/// compile-time constants, so in practice nothing is dropped.
fn build_headers(pairs: &[(&str, &str)]) -> http::HeaderMap {
    let mut map = http::HeaderMap::with_capacity(pairs.len());
    for &(raw_name, raw_value) in pairs {
        let Ok(name) = http::header::HeaderName::from_bytes(raw_name.as_bytes()) else {
            continue;
        };
        let Ok(value) = http::header::HeaderValue::from_str(raw_value) else {
            continue;
        };
        map.insert(name, value);
    }
    map
}
/// Build a wreq Client for a specific browser variant.
pub fn build_client(
variant: BrowserVariant,
timeout: Duration,
extra_headers: &std::collections::HashMap<String, String>,
proxy: Option<&str>,
) -> Result<Client, FetchError> {
let (tls, h2, headers) = match variant {
BrowserVariant::Chrome => (chrome_tls(), chrome_h2(), CHROME_HEADERS),
BrowserVariant::ChromeMacos => (chrome_tls(), chrome_h2(), CHROME_MACOS_HEADERS),
BrowserVariant::Firefox => (firefox_tls(), firefox_h2(), FIREFOX_HEADERS),
BrowserVariant::Safari => (safari_tls(), safari_h2(), SAFARI_HEADERS),
BrowserVariant::Edge => (chrome_tls(), chrome_h2(), EDGE_HEADERS),
};
let mut header_map = build_headers(headers);
// Append extra headers after profile defaults
for (k, v) in extra_headers {
if let (Ok(n), Ok(val)) = (
http::header::HeaderName::from_bytes(k.as_bytes()),
http::header::HeaderValue::from_str(v),
) {
header_map.insert(n, val);
}
}
let emulation = Emulation::builder()
.tls_options(tls)
.http2_options(h2)
.headers(header_map)
.build();
let mut builder = Client::builder()
.emulation(emulation)
.redirect(wreq::redirect::Policy::limited(10))
.cookie_store(true)
.timeout(timeout);
if let Some(proxy_url) = proxy {
let proxy =
wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
builder = builder.proxy(proxy);
}
builder
.build()
.map_err(|e| FetchError::Build(e.to_string()))
}