mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-05-13 17:02:36 +02:00
chore: rebrand webclaw to noxa
This commit is contained in:
parent
a4c351d5ae
commit
8674b60b4e
86 changed files with 781 additions and 2121 deletions
26
crates/noxa-fetch/Cargo.toml
Normal file
26
crates/noxa-fetch/Cargo.toml
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
[package]
name = "noxa-fetch"
description = "HTTP client with browser TLS fingerprint impersonation via wreq"
version.workspace = true
edition.workspace = true
license.workspace = true

[dependencies]
# Workspace-internal crates.
noxa-core = { workspace = true }
noxa-pdf = { path = "../noxa-pdf" }
serde = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true }
# Browser-grade TLS + HTTP/2 fingerprinting client (BoringSSL-based).
wreq = { version = "6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] }
http = "1"
bytes = "1"
url = "2"
rand = "0.8"
# XML parsing — presumably for sitemap discovery (src/crawler.rs uses a sitemap module); confirm.
quick-xml = { version = "0.37", features = ["serde"] }
serde_json.workspace = true
# Spreadsheet reading; zip for office-document containers.
calamine = "0.34"
zip = "2"

[dev-dependencies]
tempfile = "3"
|
||||
51
crates/noxa-fetch/src/browser.rs
Normal file
51
crates/noxa-fetch/src/browser.rs
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
//! Browser fingerprint selection and rotation.
//! Maps our BrowserProfile enum to noxa-http client builder methods.

/// Which browser identity to present at the TLS/HTTP layer.
///
/// This is the user-facing knob; the client expands it into one or
/// more concrete [`BrowserVariant`]s when building its pool.
#[derive(Debug, Clone, Default)]
pub enum BrowserProfile {
    /// Impersonate Chrome (the default).
    #[default]
    Chrome,
    /// Impersonate Firefox.
    Firefox,
    /// Randomly pick from all available profiles on each request.
    Random,
}
|
||||
|
||||
/// A browser variant for building noxa-http clients.
///
/// Finer-grained than [`BrowserProfile`]: each variant selects a
/// concrete fingerprint when passed to the TLS client builder.
#[derive(Debug, Clone, Copy)]
pub enum BrowserVariant {
    /// Chrome on its default platform.
    Chrome,
    /// Chrome on macOS.
    ChromeMacos,
    Firefox,
    Safari,
    Edge,
}
|
||||
|
||||
/// All Chrome variants we ship.
|
||||
pub fn chrome_variants() -> Vec<BrowserVariant> {
|
||||
vec![BrowserVariant::Chrome, BrowserVariant::ChromeMacos]
|
||||
}
|
||||
|
||||
/// All Firefox variants we ship.
|
||||
pub fn firefox_variants() -> Vec<BrowserVariant> {
|
||||
vec![BrowserVariant::Firefox]
|
||||
}
|
||||
|
||||
/// All variants for maximum diversity in Random mode.
|
||||
pub fn all_variants() -> Vec<BrowserVariant> {
|
||||
vec![
|
||||
BrowserVariant::Chrome,
|
||||
BrowserVariant::ChromeMacos,
|
||||
BrowserVariant::Firefox,
|
||||
BrowserVariant::Safari,
|
||||
BrowserVariant::Edge,
|
||||
]
|
||||
}
|
||||
|
||||
/// The preferred (most recent) Chrome variant.
pub fn latest_chrome() -> BrowserVariant {
    BrowserVariant::Chrome
}
|
||||
|
||||
/// The preferred (most recent) Firefox variant.
pub fn latest_firefox() -> BrowserVariant {
    BrowserVariant::Firefox
}
|
||||
836
crates/noxa-fetch/src/client.rs
Normal file
836
crates/noxa-fetch/src/client.rs
Normal file
|
|
@ -0,0 +1,836 @@
|
|||
/// HTTP client with browser TLS fingerprint impersonation.
|
||||
/// Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
|
||||
/// Supports single and batch operations with proxy rotation.
|
||||
/// Automatically detects PDF responses and extracts text via noxa-pdf.
|
||||
///
|
||||
/// Two proxy modes:
|
||||
/// - **Static**: single proxy (or none) baked into pre-built clients at construction.
|
||||
/// - **Rotating**: pre-built pool of clients, each with a different proxy + fingerprint.
|
||||
/// Same-host URLs are routed to the same client for HTTP/2 connection reuse.
|
||||
use std::collections::HashMap;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use tokio::sync::Semaphore;
|
||||
use tracing::{debug, instrument, warn};
|
||||
use noxa_pdf::PdfMode;
|
||||
|
||||
use crate::browser::{self, BrowserProfile, BrowserVariant};
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Configuration for building a [`FetchClient`].
#[derive(Debug, Clone)]
pub struct FetchConfig {
    /// Browser identity to present at the TLS/HTTP layer.
    pub browser: BrowserProfile,
    /// Single proxy URL. Used when `proxy_pool` is empty.
    pub proxy: Option<String>,
    /// Pool of proxy URLs to rotate through.
    /// When non-empty, each proxy gets a pre-built client with a
    /// random browser fingerprint. Same-host URLs reuse the same client
    /// for HTTP/2 connection multiplexing.
    pub proxy_pool: Vec<String>,
    /// Per-request timeout, passed to the TLS client builder.
    pub timeout: Duration,
    /// Whether redirects are followed automatically.
    pub follow_redirects: bool,
    /// Maximum number of redirects to follow when enabled.
    pub max_redirects: u32,
    /// Extra headers applied to every request.
    pub headers: HashMap<String, String>,
    /// How PDF responses are handled (forwarded to noxa-pdf).
    pub pdf_mode: PdfMode,
}
|
||||
|
||||
impl Default for FetchConfig {
    /// Sensible defaults: Chrome fingerprint, no proxy, 12s timeout,
    /// redirects on (max 10), and a browser-like Accept-Language header.
    fn default() -> Self {
        Self {
            browser: BrowserProfile::Chrome,
            proxy: None,
            proxy_pool: Vec::new(),
            timeout: Duration::from_secs(12),
            follow_redirects: true,
            max_redirects: 10,
            headers: HashMap::from([("Accept-Language".to_string(), "en-US,en;q=0.9".to_string())]),
            pdf_mode: PdfMode::default(),
        }
    }
}
|
||||
|
||||
/// Result of a successful fetch.
#[derive(Debug, Clone)]
pub struct FetchResult {
    /// Response body decoded (lossily) as UTF-8 text.
    pub html: String,
    /// HTTP status code.
    pub status: u16,
    /// Final URL after any redirects.
    pub url: String,
    /// Response headers as received.
    pub headers: http::HeaderMap,
    /// Wall-clock time the fetch took.
    pub elapsed: Duration,
}
|
||||
|
||||
/// Result for a single URL in a batch fetch operation.
#[derive(Debug)]
pub struct BatchResult {
    /// The URL that was requested (input order is preserved by the batch API).
    pub url: String,
    /// The fetch outcome for that URL.
    pub result: Result<FetchResult, FetchError>,
}
|
||||
|
||||
/// Result for a single URL in a batch fetch-and-extract operation.
#[derive(Debug)]
pub struct BatchExtractResult {
    /// The URL that was requested.
    pub url: String,
    /// The extraction outcome for that URL.
    pub result: Result<noxa_core::ExtractionResult, FetchError>,
}
|
||||
|
||||
/// Buffered response that owns its body. Provides the same sync API
/// that noxa-http::Response used to provide.
struct Response {
    // HTTP status code.
    status: u16,
    // Final URL as reported by wreq.
    url: String,
    // Response headers, cloned out of the wreq response.
    headers: http::HeaderMap,
    // Fully buffered body bytes.
    body: bytes::Bytes,
}
|
||||
|
||||
impl Response {
    /// Buffer a wreq response into an owned Response.
    ///
    /// Consumes the streaming body up front; decode failures surface
    /// as [`FetchError::BodyDecode`].
    async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
        let status = resp.status().as_u16();
        let url = resp.uri().to_string();
        let headers = resp.headers().clone();
        let body = resp
            .bytes()
            .await
            .map_err(|e| FetchError::BodyDecode(e.to_string()))?;
        Ok(Self {
            status,
            url,
            headers,
            body,
        })
    }

    /// HTTP status code.
    fn status(&self) -> u16 {
        self.status
    }
    /// Final URL of the response.
    fn url(&self) -> &str {
        &self.url
    }
    /// Response headers.
    fn headers(&self) -> &http::HeaderMap {
        &self.headers
    }
    /// Raw body bytes.
    fn body(&self) -> &[u8] {
        &self.body
    }
    /// True for any 2xx status.
    fn is_success(&self) -> bool {
        (200..300).contains(&self.status)
    }

    /// Body as text; invalid UTF-8 sequences become U+FFFD.
    fn text(&self) -> std::borrow::Cow<'_, str> {
        String::from_utf8_lossy(&self.body)
    }

    /// Consume the response and return the body as an owned (lossy UTF-8) String.
    fn into_text(self) -> String {
        String::from_utf8_lossy(&self.body).into_owned()
    }
}
|
||||
|
||||
/// Internal representation of the client pool strategy.
enum ClientPool {
    /// Pre-built clients with a fixed proxy (or no proxy).
    /// Fingerprint rotation still works via the pool when `random` is true.
    Static {
        clients: Vec<wreq::Client>,
        random: bool,
    },
    /// Pre-built pool of clients, each with a different proxy + fingerprint.
    /// NOTE(review): `pick_client` currently selects from this pool at
    /// random per request, not deterministically by host — confirm which
    /// behavior is intended before relying on HTTP/2 connection reuse here.
    Rotating { clients: Vec<wreq::Client> },
}
|
||||
|
||||
/// HTTP client with browser TLS + HTTP/2 fingerprinting via wreq.
///
/// Operates in two modes:
/// - **Static pool**: pre-built clients, optionally with fingerprint rotation.
///   Used when no `proxy_pool` is configured. Fast (no per-request construction).
/// - **Rotating pool**: pre-built clients, one per proxy in the pool,
///   selected per request (see `pick_client` for the exact routing).
pub struct FetchClient {
    // Strategy for choosing a wreq client per request.
    pool: ClientPool,
    // PDF extraction mode forwarded to noxa-pdf when a PDF response is detected.
    pdf_mode: PdfMode,
}
|
||||
|
||||
impl FetchClient {
    /// Build a new client from config.
    ///
    /// With an empty `proxy_pool` this pre-builds one client per browser
    /// variant (static pool, optionally sharing a single proxy); otherwise
    /// it pre-builds one client per proxy, each with a randomly chosen
    /// variant (rotating pool).
    pub fn new(config: FetchConfig) -> Result<Self, FetchError> {
        let variants = collect_variants(&config.browser);
        let pdf_mode = config.pdf_mode.clone();

        let pool = if config.proxy_pool.is_empty() {
            // Static mode: one client per variant, all using the same
            // optional single proxy.
            let clients = variants
                .into_iter()
                .map(|v| {
                    crate::tls::build_client(
                        v,
                        config.timeout,
                        &config.headers,
                        config.proxy.as_deref(),
                    )
                })
                .collect::<Result<Vec<_>, _>>()?;

            let random = matches!(config.browser, BrowserProfile::Random);
            debug!(
                count = clients.len(),
                random, "fetch client ready (static pool)"
            );

            ClientPool::Static { clients, random }
        } else {
            let mut rng = rand::thread_rng();

            // Rotating mode: one client per proxy, each paired with a
            // random fingerprint from the selected variants.
            let clients = config
                .proxy_pool
                .iter()
                .map(|proxy| {
                    // collect_variants always returns at least one entry,
                    // so choose() cannot return None here.
                    let v = *variants.choose(&mut rng).unwrap();
                    crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy))
                })
                .collect::<Result<Vec<_>, _>>()?;

            debug!(
                clients = clients.len(),
                "fetch client ready (pre-built rotating pool)"
            );

            ClientPool::Rotating { clients }
        };

        Ok(Self { pool, pdf_mode })
    }

    /// Fetch a URL and return the raw HTML + response metadata.
    ///
    /// Automatically retries on transient failures (network errors, 5xx, 429)
    /// with exponential backoff: 0s, 1s (2 attempts total).
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
        let delays = [Duration::ZERO, Duration::from_secs(1)];
        let mut last_err = None;

        for (attempt, delay) in delays.iter().enumerate() {
            if attempt > 0 {
                tokio::time::sleep(*delay).await;
            }

            match self.fetch_once(url).await {
                Ok(result) => {
                    // Retry retryable statuses only while attempts remain;
                    // on the final attempt the response is returned as Ok so
                    // callers can still inspect status/body.
                    if is_retryable_status(result.status) && attempt < delays.len() - 1 {
                        warn!(
                            url,
                            status = result.status,
                            attempt = attempt + 1,
                            "retryable status, will retry"
                        );
                        last_err = Some(FetchError::Build(format!("HTTP {}", result.status)));
                        continue;
                    }
                    if attempt > 0 {
                        debug!(url, attempt = attempt + 1, "retry succeeded");
                    }
                    return Ok(result);
                }
                Err(e) => {
                    // Non-retryable errors and final-attempt failures
                    // propagate immediately.
                    if !is_retryable_error(&e) || attempt == delays.len() - 1 {
                        return Err(e);
                    }
                    warn!(
                        url,
                        error = %e,
                        attempt = attempt + 1,
                        "transient error, will retry"
                    );
                    last_err = Some(e);
                }
            }
        }

        Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
    }

    /// Single fetch attempt (no retry logic).
    async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
        let start = Instant::now();
        let client = self.pick_client(url);

        let resp = client.get(url).send().await?;
        let response = Response::from_wreq(resp).await?;
        response_to_result(response, start)
    }

    /// Fetch a URL then extract structured content.
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch_and_extract(
        &self,
        url: &str,
    ) -> Result<noxa_core::ExtractionResult, FetchError> {
        self.fetch_and_extract_with_options(url, &noxa_core::ExtractionOptions::default())
            .await
    }

    /// Fetch a URL then extract structured content with custom extraction options.
    ///
    /// Special-cases Reddit (JSON API), PDFs, office documents, and
    /// LinkedIn posts before falling back to standard HTML extraction.
    #[instrument(skip(self, options), fields(url = %url))]
    pub async fn fetch_and_extract_with_options(
        &self,
        url: &str,
        options: &noxa_core::ExtractionOptions,
    ) -> Result<noxa_core::ExtractionResult, FetchError> {
        // Reddit fallback: use their JSON API to get post + full comment tree.
        if crate::reddit::is_reddit_url(url) {
            let json_url = crate::reddit::json_url(url);
            debug!("reddit detected, fetching {json_url}");

            let client = self.pick_client(url);
            let resp = client.get(&json_url).send().await?;
            let response = Response::from_wreq(resp).await?;
            if response.is_success() {
                let bytes = response.body();
                match crate::reddit::parse_reddit_json(bytes, url) {
                    Ok(result) => return Ok(result),
                    Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
                }
            }
        }

        let start = Instant::now();
        let client = self.pick_client(url);
        let resp = client.get(url).send().await?;
        let mut response = Response::from_wreq(resp).await?;

        // Cookie warmup: if we get a challenge page, visit the homepage first
        // to collect Akamai cookies (_abck, bm_sz, etc.), then retry.
        if is_challenge_response(&response)
            && let Some(homepage) = extract_homepage(url)
        {
            debug!("challenge detected, warming cookies via {homepage}");
            // Warmup fetch is best-effort: result deliberately ignored.
            let _ = client.get(&homepage).send().await;
            let resp = client.get(url).send().await?;
            response = Response::from_wreq(resp).await?;
            debug!("retried after cookie warmup: status={}", response.status());
        }

        let status = response.status();
        let final_url = response.url().to_string();

        let headers = response.headers().clone();

        let is_pdf = is_pdf_content_type(&headers);

        if is_pdf {
            debug!(status, "detected PDF response, using pdf extraction");

            let bytes = response.body();

            let elapsed = start.elapsed();
            debug!(
                status,
                bytes = bytes.len(),
                elapsed_ms = %elapsed.as_millis(),
                "PDF fetch complete"
            );

            let pdf_result = noxa_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
            Ok(pdf_to_extraction_result(&pdf_result, &final_url))
        } else if let Some(doc_type) =
            crate::document::is_document_content_type(&headers, &final_url)
        {
            debug!(status, doc_type = ?doc_type, "detected document response, extracting");

            let bytes = response.body();

            let elapsed = start.elapsed();
            debug!(
                status,
                bytes = bytes.len(),
                elapsed_ms = %elapsed.as_millis(),
                "document fetch complete"
            );

            let mut result = crate::document::extract_document(bytes, doc_type)?;
            result.metadata.url = Some(final_url);
            Ok(result)
        } else {
            let html = response.into_text();

            let elapsed = start.elapsed();
            debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");

            // LinkedIn: extract from embedded <code> JSON blobs
            if crate::linkedin::is_linkedin_post(&final_url) {
                if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) {
                    debug!("linkedin extraction succeeded");
                    return Ok(result);
                }
                debug!("linkedin extraction failed, falling back to standard");
            }

            let extraction = noxa_core::extract_with_options(&html, Some(&final_url), options)?;

            Ok(extraction)
        }
    }

    /// Fetch multiple URLs concurrently with bounded parallelism.
    ///
    /// Results come back in input order; tasks that panic are dropped
    /// (see `collect_ordered`).
    pub async fn fetch_batch(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
    ) -> Vec<BatchResult> {
        let semaphore = Arc::new(Semaphore::new(concurrency));
        let mut handles = Vec::with_capacity(urls.len());

        for (idx, url) in urls.iter().enumerate() {
            let permit = Arc::clone(&semaphore);
            let client = Arc::clone(self);
            let url = url.to_string();

            handles.push(tokio::spawn(async move {
                let _permit = permit.acquire().await.expect("semaphore closed");
                let result = client.fetch(&url).await;
                (idx, BatchResult { url, result })
            }));
        }

        collect_ordered(handles, urls.len()).await
    }

    /// Fetch and extract multiple URLs concurrently with bounded parallelism.
    pub async fn fetch_and_extract_batch(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
    ) -> Vec<BatchExtractResult> {
        self.fetch_and_extract_batch_with_options(
            urls,
            concurrency,
            &noxa_core::ExtractionOptions::default(),
        )
        .await
    }

    /// Fetch and extract multiple URLs concurrently with custom extraction options.
    pub async fn fetch_and_extract_batch_with_options(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
        options: &noxa_core::ExtractionOptions,
    ) -> Vec<BatchExtractResult> {
        let semaphore = Arc::new(Semaphore::new(concurrency));
        let mut handles = Vec::with_capacity(urls.len());

        for (idx, url) in urls.iter().enumerate() {
            let permit = Arc::clone(&semaphore);
            let client = Arc::clone(self);
            let url = url.to_string();
            // Options are cloned per task so each spawned future is 'static.
            let opts = options.clone();

            handles.push(tokio::spawn(async move {
                let _permit = permit.acquire().await.expect("semaphore closed");
                let result = client.fetch_and_extract_with_options(&url, &opts).await;
                (idx, BatchExtractResult { url, result })
            }));
        }

        collect_ordered(handles, urls.len()).await
    }

    /// Returns the number of proxies in the rotation pool, or 0 if static mode.
    pub fn proxy_pool_size(&self) -> usize {
        match &self.pool {
            ClientPool::Static { .. } => 0,
            ClientPool::Rotating { clients } => clients.len(),
        }
    }

    /// Pick a client from the pool for a given URL.
    ///
    /// NOTE(review): the type-level docs claim rotating mode routes
    /// same-host URLs to the same client, but this picks randomly per
    /// request — confirm which behavior is intended.
    fn pick_client(&self, url: &str) -> &wreq::Client {
        match &self.pool {
            ClientPool::Static { clients, random } => {
                if *random {
                    // Host-sticky choice keeps HTTP/2 connections warm
                    // and the fingerprint consistent per site.
                    let host = extract_host(url);
                    pick_for_host(clients, &host)
                } else {
                    &clients[0]
                }
            }
            ClientPool::Rotating { clients } => pick_random(clients),
        }
    }
}
|
||||
|
||||
/// Collect the browser variants to use based on the browser profile.
|
||||
fn collect_variants(profile: &BrowserProfile) -> Vec<BrowserVariant> {
|
||||
match profile {
|
||||
BrowserProfile::Random => browser::all_variants(),
|
||||
BrowserProfile::Chrome => vec![browser::latest_chrome()],
|
||||
BrowserProfile::Firefox => vec![browser::latest_firefox()],
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a buffered Response into a FetchResult.
///
/// `start` is the instant the fetch began; elapsed time is measured here
/// so it includes body buffering.
fn response_to_result(response: Response, start: Instant) -> Result<FetchResult, FetchError> {
    let status = response.status();
    let final_url = response.url().to_string();
    let headers = response.headers().clone();
    // Consumes the response; body is decoded lossily as UTF-8.
    let html = response.into_text();
    let elapsed = start.elapsed();

    debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");

    Ok(FetchResult {
        html,
        status,
        url: final_url,
        headers,
        elapsed,
    })
}
|
||||
|
||||
/// Extract the host from a URL, returning empty string on parse failure.
|
||||
fn extract_host(url: &str) -> String {
|
||||
url::Url::parse(url)
|
||||
.ok()
|
||||
.and_then(|u| u.host_str().map(String::from))
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Pick a client deterministically based on a host string.
|
||||
/// Same host always gets the same client, enabling HTTP/2 connection reuse.
|
||||
fn pick_for_host<'a>(clients: &'a [wreq::Client], host: &str) -> &'a wreq::Client {
|
||||
let mut hasher = std::collections::hash_map::DefaultHasher::new();
|
||||
host.hash(&mut hasher);
|
||||
let idx = (hasher.finish() as usize) % clients.len();
|
||||
&clients[idx]
|
||||
}
|
||||
|
||||
/// Pick a random client from the pool for per-request rotation.
|
||||
fn pick_random(clients: &[wreq::Client]) -> &wreq::Client {
|
||||
use rand::Rng;
|
||||
let idx = rand::thread_rng().gen_range(0..clients.len());
|
||||
&clients[idx]
|
||||
}
|
||||
|
||||
/// Status codes worth retrying: rate limiting (429), gateway errors
/// (502–504), and the Cloudflare origin-failure range (520–524).
///
/// Plain 500 is not retried — per the current policy only the codes
/// below count as transient.
fn is_retryable_status(status: u16) -> bool {
    matches!(status, 429 | 502..=504 | 520..=524)
}
|
||||
|
||||
/// Errors worth retrying: network/connection failures (not client errors).
///
/// Only transport-level failures (`Request`) and body-decode failures
/// qualify; everything else is treated as permanent.
fn is_retryable_error(err: &FetchError) -> bool {
    matches!(err, FetchError::Request(_) | FetchError::BodyDecode(_))
}
|
||||
|
||||
fn is_pdf_content_type(headers: &http::HeaderMap) -> bool {
|
||||
headers
|
||||
.get("content-type")
|
||||
.and_then(|ct| ct.to_str().ok())
|
||||
.map(|ct| {
|
||||
let mime = ct.split(';').next().unwrap_or("").trim();
|
||||
mime.eq_ignore_ascii_case("application/pdf")
|
||||
})
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Detect if a response looks like a bot protection challenge page.
///
/// Heuristic: real challenge pages are small, so anything empty or over
/// 15 KB is assumed to be genuine content and skipped cheaply before the
/// body is lowercased.
fn is_challenge_response(response: &Response) -> bool {
    let len = response.body().len();
    if len > 15_000 || len == 0 {
        return false;
    }

    let text = response.text();
    let lower = text.to_lowercase();

    if lower.contains("<title>challenge page</title>") {
        return true;
    }

    // Marker string that appears in bot-manager challenge pages
    // (presumably Akamai — the warmup caller mentions _abck/bm_sz cookies);
    // the tighter 5 KB bound reduces false positives on real content.
    if lower.contains("bazadebezolkohpepadr") && len < 5_000 {
        return true;
    }

    false
}
|
||||
|
||||
/// Extract the homepage URL (scheme + host) from a full URL.
|
||||
fn extract_homepage(url: &str) -> Option<String> {
|
||||
url::Url::parse(url)
|
||||
.ok()
|
||||
.map(|u| format!("{}://{}/", u.scheme(), u.host_str().unwrap_or("")))
|
||||
}
|
||||
|
||||
/// Convert a noxa-pdf PdfResult into a noxa-core ExtractionResult.
///
/// PDF metadata maps as: title -> title, subject -> description,
/// author -> author; fields with no PDF counterpart are None/empty.
/// `word_count` is computed from the rendered markdown.
fn pdf_to_extraction_result(
    pdf: &noxa_pdf::PdfResult,
    url: &str,
) -> noxa_core::ExtractionResult {
    let markdown = noxa_pdf::to_markdown(pdf);
    let word_count = markdown.split_whitespace().count();

    noxa_core::ExtractionResult {
        metadata: noxa_core::Metadata {
            title: pdf.metadata.title.clone(),
            // The PDF "subject" field is the closest analog to a description.
            description: pdf.metadata.subject.clone(),
            author: pdf.metadata.author.clone(),
            published_date: None,
            language: None,
            url: Some(url.to_string()),
            site_name: None,
            image: None,
            favicon: None,
            word_count,
        },
        content: noxa_core::Content {
            markdown,
            plain_text: pdf.text.clone(),
            // PDFs carry no link/image/code-block structure through this path.
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    }
}
|
||||
|
||||
/// Collect spawned tasks and reorder results to match input order.
|
||||
async fn collect_ordered<T>(
|
||||
handles: Vec<tokio::task::JoinHandle<(usize, T)>>,
|
||||
len: usize,
|
||||
) -> Vec<T> {
|
||||
let mut slots: Vec<Option<T>> = (0..len).map(|_| None).collect();
|
||||
|
||||
for handle in handles {
|
||||
match handle.await {
|
||||
Ok((idx, result)) => {
|
||||
slots[idx] = Some(result);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(error = %e, "batch task panicked");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
slots.into_iter().flatten().collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Plain struct-construction smoke tests for the batch result types.
    #[test]
    fn test_batch_result_struct() {
        let ok = BatchResult {
            url: "https://example.com".to_string(),
            result: Ok(FetchResult {
                html: "<html></html>".to_string(),
                status: 200,
                url: "https://example.com".to_string(),
                headers: http::HeaderMap::new(),
                elapsed: Duration::from_millis(42),
            }),
        };
        assert_eq!(ok.url, "https://example.com");
        assert!(ok.result.is_ok());
        assert_eq!(ok.result.unwrap().status, 200);

        let err = BatchResult {
            url: "https://bad.example".to_string(),
            result: Err(FetchError::InvalidUrl("bad url".into())),
        };
        assert!(err.result.is_err());
    }

    #[test]
    fn test_batch_extract_result_struct() {
        let err = BatchExtractResult {
            url: "https://example.com".to_string(),
            result: Err(FetchError::BodyDecode("timeout".into())),
        };
        assert_eq!(err.url, "https://example.com");
        assert!(err.result.is_err());
    }

    // collect_ordered must reorder out-of-order completions by index.
    #[tokio::test]
    async fn test_batch_preserves_order() {
        let handles: Vec<tokio::task::JoinHandle<(usize, String)>> = vec![
            tokio::spawn(async { (2, "c".to_string()) }),
            tokio::spawn(async { (0, "a".to_string()) }),
            tokio::spawn(async { (1, "b".to_string()) }),
        ];

        let results = collect_ordered(handles, 3).await;
        assert_eq!(results, vec!["a", "b", "c"]);
    }

    // Missing indices (e.g. panicked tasks) compact out while preserving order.
    #[tokio::test]
    async fn test_collect_ordered_handles_gaps() {
        let handles: Vec<tokio::task::JoinHandle<(usize, String)>> = vec![
            tokio::spawn(async { (0, "first".to_string()) }),
            tokio::spawn(async { (2, "third".to_string()) }),
        ];

        let results = collect_ordered(handles, 3).await;
        assert_eq!(results.len(), 2);
        assert_eq!(results[0], "first");
        assert_eq!(results[1], "third");
    }

    // Content-type matching: parameters stripped, case-insensitive, absent header = false.
    #[test]
    fn test_is_pdf_content_type() {
        let mut headers = http::HeaderMap::new();
        headers.insert("content-type", "application/pdf".parse().unwrap());
        assert!(is_pdf_content_type(&headers));

        headers.insert(
            "content-type",
            "application/pdf; charset=utf-8".parse().unwrap(),
        );
        assert!(is_pdf_content_type(&headers));

        headers.insert("content-type", "Application/PDF".parse().unwrap());
        assert!(is_pdf_content_type(&headers));

        headers.insert("content-type", "text/html".parse().unwrap());
        assert!(!is_pdf_content_type(&headers));

        let empty = http::HeaderMap::new();
        assert!(!is_pdf_content_type(&empty));
    }

    // Verifies the PDF metadata -> ExtractionResult field mapping.
    #[test]
    fn test_pdf_to_extraction_result() {
        let pdf = noxa_pdf::PdfResult {
            text: "Hello from PDF.".into(),
            page_count: 2,
            metadata: noxa_pdf::PdfMetadata {
                title: Some("My Doc".into()),
                author: Some("Author".into()),
                subject: Some("Testing".into()),
                creator: None,
            },
        };

        let result = pdf_to_extraction_result(&pdf, "https://example.com/doc.pdf");

        assert_eq!(result.metadata.title.as_deref(), Some("My Doc"));
        assert_eq!(result.metadata.author.as_deref(), Some("Author"));
        assert_eq!(result.metadata.description.as_deref(), Some("Testing"));
        assert_eq!(
            result.metadata.url.as_deref(),
            Some("https://example.com/doc.pdf")
        );
        assert!(result.content.markdown.contains("# My Doc"));
        assert!(result.content.markdown.contains("Hello from PDF."));
        assert_eq!(result.content.plain_text, "Hello from PDF.");
        assert!(result.content.links.is_empty());
        assert!(result.domain_data.is_none());
        assert!(result.metadata.word_count > 0);
    }

    // Default config (no proxy_pool) should produce a static pool.
    #[test]
    fn test_static_pool_no_proxy() {
        let config = FetchConfig::default();
        let client = FetchClient::new(config).unwrap();
        assert_eq!(client.proxy_pool_size(), 0);
    }

    // A non-empty proxy_pool pre-builds one client per proxy.
    #[test]
    fn test_rotating_pool_prebuilds_clients() {
        let config = FetchConfig {
            proxy_pool: vec![
                "http://proxy1:8080".into(),
                "http://proxy2:8080".into(),
                "http://proxy3:8080".into(),
            ],
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();
        assert_eq!(client.proxy_pool_size(), 3);
    }

    // Same host must always hash to the same client (pointer equality).
    #[test]
    fn test_pick_for_host_deterministic() {
        let config = FetchConfig {
            browser: BrowserProfile::Random,
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();

        let clients = match &client.pool {
            ClientPool::Static { clients, .. } => clients,
            ClientPool::Rotating { clients } => clients,
        };

        let a1 = pick_for_host(clients, "example.com") as *const _;
        let a2 = pick_for_host(clients, "example.com") as *const _;
        let a3 = pick_for_host(clients, "example.com") as *const _;
        assert_eq!(a1, a2);
        assert_eq!(a2, a3);
    }

    // Different hosts should spread over the pool (probabilistic: at least
    // 2 distinct clients out of 10 for 5 hosts).
    #[test]
    fn test_pick_for_host_distributes() {
        let config = FetchConfig {
            proxy_pool: (0..10).map(|i| format!("http://proxy{i}:8080")).collect(),
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();

        let clients = match &client.pool {
            ClientPool::Static { clients, .. } | ClientPool::Rotating { clients } => clients,
        };

        let hosts = [
            "example.com",
            "google.com",
            "github.com",
            "rust-lang.org",
            "crates.io",
        ];

        let indices: Vec<usize> = hosts
            .iter()
            .map(|h| {
                let ptr = pick_for_host(clients, h) as *const _;
                clients.iter().position(|c| std::ptr::eq(c, ptr)).unwrap()
            })
            .collect();

        let unique: std::collections::HashSet<_> = indices.iter().collect();
        assert!(
            unique.len() >= 2,
            "expected host distribution across clients, got indices: {indices:?}"
        );
    }

    #[test]
    fn test_extract_host() {
        assert_eq!(extract_host("https://example.com/path"), "example.com");
        assert_eq!(
            extract_host("https://sub.example.com:8080/foo"),
            "sub.example.com"
        );
        assert_eq!(extract_host("not-a-url"), "");
    }

    #[test]
    fn test_default_config_has_empty_proxy_pool() {
        let config = FetchConfig::default();
        assert!(config.proxy_pool.is_empty());
        assert!(config.proxy.is_none());
    }
}
|
||||
648
crates/noxa-fetch/src/crawler.rs
Normal file
648
crates/noxa-fetch/src/crawler.rs
Normal file
|
|
@ -0,0 +1,648 @@
|
|||
/// Recursive same-origin web crawler built on top of [`FetchClient`].
|
||||
///
|
||||
/// Starts from a seed URL, extracts content, discovers links, and follows
|
||||
/// them breadth-first up to a configurable depth/page limit. Uses a semaphore
|
||||
/// for bounded concurrency and per-request delays for politeness.
|
||||
///
|
||||
/// When `use_sitemap` is enabled, the crawler first discovers URLs from the
|
||||
/// site's sitemaps and seeds the BFS frontier before crawling.
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::Semaphore;
|
||||
use tracing::{debug, info, warn};
|
||||
use url::Url;
|
||||
|
||||
use crate::client::{FetchClient, FetchConfig};
|
||||
use crate::error::FetchError;
|
||||
use crate::sitemap;
|
||||
|
||||
/// Controls crawl scope, depth, concurrency, and politeness.
#[derive(Debug, Clone)]
pub struct CrawlConfig {
    /// Fetch configuration (browser profile, proxy, timeout, etc.)
    pub fetch: FetchConfig,
    /// How deep to follow links. 1 = only immediate links from seed page.
    pub max_depth: usize,
    /// Hard cap on total pages fetched (including the seed).
    pub max_pages: usize,
    /// Max concurrent in-flight requests.
    pub concurrency: usize,
    /// Minimum delay before each request (politeness).
    pub delay: Duration,
    /// Only follow URLs whose path starts with this prefix (e.g. "/docs/").
    pub path_prefix: Option<String>,
    /// Seed BFS frontier from sitemap discovery before crawling.
    pub use_sitemap: bool,
    /// Glob patterns for paths to include. If non-empty, only matching URLs are crawled.
    /// E.g. `["/api/*", "/guides/*"]` — matched against the URL path.
    pub include_patterns: Vec<String>,
    /// Glob patterns for paths to exclude. Checked after include_patterns.
    /// E.g. `["/changelog/*", "/blog/*"]` — matching URLs are skipped.
    pub exclude_patterns: Vec<String>,
    /// Optional channel sender for streaming per-page results as they complete.
    /// When set, each `PageResult` is sent on this channel immediately after extraction.
    pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>,
    /// When set to `true`, the crawler breaks out of the main loop early.
    /// Callers (e.g. a Ctrl+C handler) can flip this to request graceful cancellation.
    pub cancel_flag: Option<Arc<AtomicBool>>,
}
|
||||
|
||||
impl Default for CrawlConfig {
    /// Conservative defaults: shallow depth, modest page cap, light
    /// concurrency, a 100 ms politeness delay, and all optional features
    /// (sitemap seeding, filters, progress streaming, cancellation) off.
    fn default() -> Self {
        Self {
            fetch: FetchConfig::default(),
            max_depth: 1,
            max_pages: 50,
            concurrency: 5,
            delay: Duration::from_millis(100),
            path_prefix: None,
            use_sitemap: false,
            include_patterns: Vec::new(),
            exclude_patterns: Vec::new(),
            progress_tx: None,
            cancel_flag: None,
        }
    }
}
|
||||
|
||||
/// Aggregated results from a crawl run.
///
/// `total == ok + errors` holds, counting every page attempted.
/// The `#[serde(skip)]` fields carry resume bookkeeping and are not part of
/// the serialized report.
#[derive(Debug, Serialize, Deserialize)]
pub struct CrawlResult {
    /// Per-page outcomes in completion order.
    pub pages: Vec<PageResult>,
    /// Total pages attempted (successes + failures).
    pub total: usize,
    /// Pages that produced an extraction.
    pub ok: usize,
    /// Pages that ended in an error.
    pub errors: usize,
    /// Wall-clock duration of the whole crawl, in seconds.
    pub elapsed_secs: f64,
    /// URLs visited during this crawl (for resume state).
    #[serde(skip)]
    pub visited: HashSet<String>,
    /// Remaining frontier when crawl was cancelled (for resume state).
    #[serde(skip)]
    pub remaining_frontier: Vec<(String, usize)>,
}
|
||||
|
||||
/// Outcome of extracting a single page during the crawl.
///
/// Exactly one of `extraction` / `error` is populated in practice:
/// successes carry `Some(extraction)` and `error: None`, failures the reverse.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageResult {
    /// Normalized URL that was fetched.
    pub url: String,
    /// BFS depth at which this page was discovered (seed = 0).
    pub depth: usize,
    /// Extracted content on success.
    pub extraction: Option<noxa_core::ExtractionResult>,
    /// Stringified fetch/extraction error on failure.
    pub error: Option<String>,
    /// Per-page fetch+extract wall time; excluded from serialized output.
    #[serde(skip)]
    pub elapsed: Duration,
}
|
||||
|
||||
/// Serializable crawl state for resume after Ctrl+C cancellation.
///
/// Written by [`Crawler::save_state`] and read back by [`Crawler::load_state`];
/// `visited` and `frontier` are enough to continue the BFS where it stopped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlState {
    /// Original seed URL the crawl was started from.
    pub seed_url: String,
    /// Every URL already fetched (or claimed) before cancellation.
    pub visited: Vec<String>,
    /// Pending `(url, depth)` pairs that had not been fetched yet.
    pub frontier: Vec<(String, usize)>,
    /// Number of pages completed before the save.
    pub completed_pages: usize,
    /// Page cap in effect at save time (for sanity-checking on resume).
    pub max_pages: usize,
    /// Depth limit in effect at save time.
    pub max_depth: usize,
}
|
||||
|
||||
/// Recursive crawler that wraps a shared [`FetchClient`].
///
/// The client is held in an `Arc` so each spawned per-page task can clone a
/// handle; `seed_origin` is precomputed once for same-origin link filtering.
pub struct Crawler {
    /// Shared HTTP client used by all concurrent page tasks.
    client: Arc<FetchClient>,
    /// Crawl limits, filters, and politeness settings.
    config: CrawlConfig,
    /// Canonical origin of the seed URL (see `origin_key`), used to reject
    /// off-origin links.
    seed_origin: String,
}
|
||||
|
||||
impl Crawler {
    /// Build a new crawler from a seed URL and config.
    /// Constructs the underlying `FetchClient` from `config.fetch`.
    ///
    /// # Errors
    /// Returns `FetchError::InvalidUrl` if `seed_url` does not parse, or any
    /// error `FetchClient::new` produces for the given fetch config.
    pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> {
        let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?;
        let seed_origin = origin_key(&seed);

        let client = FetchClient::new(config.fetch.clone())?;

        Ok(Self {
            client: Arc::new(client),
            config,
            seed_origin,
        })
    }

    /// Save current crawl state to a JSON file for later resume.
    ///
    /// Associated function (no `&self`) so a Ctrl+C handler can persist state
    /// without holding the crawler. Errors are stringified for the caller.
    pub fn save_state(
        path: &Path,
        seed_url: &str,
        visited: &HashSet<String>,
        frontier: &[(String, usize)],
        completed_pages: usize,
        max_pages: usize,
        max_depth: usize,
    ) -> Result<(), String> {
        let state = CrawlState {
            seed_url: seed_url.to_string(),
            visited: visited.iter().cloned().collect(),
            frontier: frontier.to_vec(),
            completed_pages,
            max_pages,
            max_depth,
        };
        let json =
            serde_json::to_string_pretty(&state).map_err(|e| format!("serialize state: {e}"))?;
        std::fs::write(path, json).map_err(|e| format!("write state to {}: {e}", path.display()))
    }

    /// Load crawl state from a JSON file. Returns `None` if file doesn't exist.
    ///
    /// Also returns `None` on unreadable or malformed JSON — any failure is
    /// treated as "no resume state available".
    pub fn load_state(path: &Path) -> Option<CrawlState> {
        let content = std::fs::read_to_string(path).ok()?;
        serde_json::from_str(&content).ok()
    }

    /// Returns true if the cancel flag has been set.
    ///
    /// Relaxed ordering suffices: this flag is a lone advisory boolean and
    /// does not guard any other shared data.
    fn is_cancelled(&self) -> bool {
        self.config
            .cancel_flag
            .as_ref()
            .is_some_and(|f| f.load(Ordering::Relaxed))
    }

    /// Crawl starting from `start_url`, returning results for every page visited.
    ///
    /// Uses breadth-first traversal: all pages at depth N are fetched (concurrently,
    /// bounded by `config.concurrency`) before moving to depth N+1.
    ///
    /// When `config.use_sitemap` is true, sitemap URLs are discovered first and
    /// added to the initial frontier at depth 0 alongside the seed URL.
    ///
    /// If `resume_state` is provided, the crawl resumes from the saved state
    /// (pre-populated visited set and frontier) instead of starting fresh.
    ///
    /// This method never returns an error: an unparseable `start_url` yields a
    /// one-page `CrawlResult` whose single entry carries the error string.
    pub async fn crawl(&self, start_url: &str, resume_state: Option<CrawlState>) -> CrawlResult {
        let start = Instant::now();

        let seed = match Url::parse(start_url) {
            Ok(u) => u,
            Err(_) => {
                // Invalid seed: report it as a single failed page rather than
                // surfacing a Result, so callers handle one shape of output.
                return CrawlResult {
                    pages: vec![PageResult {
                        url: start_url.to_string(),
                        depth: 0,
                        extraction: None,
                        error: Some(format!("invalid URL: {start_url}")),
                        elapsed: Duration::ZERO,
                    }],
                    total: 1,
                    ok: 0,
                    errors: 1,
                    elapsed_secs: 0.0,
                    visited: HashSet::new(),
                    remaining_frontier: Vec::new(),
                };
            }
        };

        let semaphore = Arc::new(Semaphore::new(self.config.concurrency));
        let mut visited: HashSet<String>;
        let mut pages: Vec<PageResult> = Vec::new();
        let mut frontier: Vec<(String, usize)>;

        // Resume from saved state or start fresh
        if let Some(state) = resume_state {
            visited = state.visited.into_iter().collect();
            frontier = state.frontier;
            info!(
                visited = visited.len(),
                frontier = frontier.len(),
                "resuming crawl from saved state"
            );
        } else {
            visited = HashSet::new();
            frontier = vec![(normalize(&seed), 0)];

            // Seed frontier from sitemap if enabled
            if self.config.use_sitemap {
                let base_url = format!("{}://{}", seed.scheme(), seed.host_str().unwrap_or(""));
                match sitemap::discover(&self.client, &base_url).await {
                    Ok(entries) => {
                        let before = frontier.len();
                        for entry in entries {
                            // Sitemap URLs go through the same origin/pattern
                            // filters as discovered links, at depth 0.
                            if self.qualify_link(&entry.url, &visited).is_some() {
                                let parsed = match Url::parse(&entry.url) {
                                    Ok(u) => u,
                                    Err(_) => continue,
                                };
                                let norm = normalize(&parsed);
                                frontier.push((norm, 0));
                            }
                        }
                        let added = frontier.len() - before;
                        info!(
                            sitemap_urls = added,
                            "seeded frontier from sitemap discovery"
                        );
                    }
                    Err(e) => {
                        // Sitemap failure is non-fatal; the BFS still has the seed.
                        warn!(error = %e, "sitemap discovery failed, continuing with seed URL only");
                    }
                }
            }
        }

        while !frontier.is_empty() && pages.len() < self.config.max_pages {
            // Check cancel flag before processing each batch
            if self.is_cancelled() {
                info!("crawl cancelled by user");
                break;
            }

            // Dedup this level's frontier against the visited set and page cap.
            // NOTE(review): `drain(..)` empties the frontier even when `take`
            // stops early, so entries beyond the page cap are dropped rather
            // than kept for `remaining_frontier` — confirm this is intended
            // for resume semantics.
            let batch: Vec<(String, usize)> = frontier
                .drain(..)
                .filter(|(url, _)| visited.insert(url.clone()))
                .take(self.config.max_pages.saturating_sub(pages.len()))
                .collect();

            if batch.is_empty() {
                break;
            }

            // Spawn one task per URL, bounded by semaphore
            let mut handles = Vec::with_capacity(batch.len());

            for (url, depth) in &batch {
                let permit = Arc::clone(&semaphore);
                let client = Arc::clone(&self.client);
                let url = url.clone();
                let depth = *depth;
                let delay = self.config.delay;

                handles.push(tokio::spawn(async move {
                    // Acquire permit — blocks if concurrency limit reached
                    let _permit = permit.acquire().await.expect("semaphore closed");
                    tokio::time::sleep(delay).await;

                    let page_start = Instant::now();
                    let result = client.fetch_and_extract(&url).await;
                    let elapsed = page_start.elapsed();

                    match result {
                        Ok(extraction) => {
                            debug!(
                                url = %url, depth,
                                elapsed_ms = %elapsed.as_millis(),
                                "page extracted"
                            );
                            PageResult {
                                url,
                                depth,
                                extraction: Some(extraction),
                                error: None,
                                elapsed,
                            }
                        }
                        Err(e) => {
                            warn!(url = %url, depth, error = %e, "page failed");
                            PageResult {
                                url,
                                depth,
                                extraction: None,
                                error: Some(e.to_string()),
                                elapsed,
                            }
                        }
                    }
                }));
            }

            // Collect results and harvest links for the next depth level
            let mut next_frontier: Vec<(String, usize)> = Vec::new();

            for handle in handles {
                let page = match handle.await {
                    Ok(page) => page,
                    Err(e) => {
                        // A panicked task loses that one page; the crawl continues.
                        warn!(error = %e, "crawl task panicked");
                        continue;
                    }
                };
                let depth = page.depth;

                // Only expand links while under the depth limit and when the
                // page actually produced an extraction.
                if depth < self.config.max_depth
                    && let Some(ref extraction) = page.extraction
                {
                    for link in &extraction.content.links {
                        if let Some(candidate) = self.qualify_link(&link.href, &visited) {
                            next_frontier.push((candidate, depth + 1));
                        }
                    }
                }

                // Stream progress if a channel is configured
                if let Some(tx) = &self.config.progress_tx {
                    // Send failures (no receivers) are deliberately ignored.
                    let _ = tx.send(page.clone());
                }

                pages.push(page);

                if pages.len() >= self.config.max_pages {
                    break;
                }

                // Check cancel flag between page results
                if self.is_cancelled() {
                    info!("crawl cancelled by user (mid-batch)");
                    break;
                }
            }

            frontier = next_frontier;
        }

        let total_elapsed = start.elapsed();
        let ok_count = pages.iter().filter(|p| p.extraction.is_some()).count();
        let err_count = pages.len() - ok_count;
        info!(
            total = pages.len(),
            ok = ok_count,
            errors = err_count,
            elapsed_ms = %total_elapsed.as_millis(),
            "crawl complete"
        );

        CrawlResult {
            total: pages.len(),
            ok: ok_count,
            errors: err_count,
            elapsed_secs: total_elapsed.as_secs_f64(),
            remaining_frontier: frontier,
            visited,
            pages,
        }
    }

    /// Check if a discovered link should be added to the frontier.
    /// Returns `Some(normalized_url)` if it passes all filters, `None` otherwise.
    ///
    /// Filter order: scheme → same-origin → path prefix → include globs →
    /// exclude globs → asset-extension skip → already-visited.
    fn qualify_link(&self, href: &str, visited: &HashSet<String>) -> Option<String> {
        let parsed = Url::parse(href).ok()?;

        // Only http(s) schemes
        match parsed.scheme() {
            "http" | "https" => {}
            _ => return None,
        }

        // Same-origin check (scheme + host + port)
        if origin_key(&parsed) != self.seed_origin {
            return None;
        }

        // Path prefix filter
        if let Some(ref prefix) = self.config.path_prefix
            && !parsed.path().starts_with(prefix.as_str())
        {
            return None;
        }

        // Include patterns: if any are set, path must match at least one
        let path = parsed.path();
        if !self.config.include_patterns.is_empty()
            && !self
                .config
                .include_patterns
                .iter()
                .any(|pat| glob_match(pat, path))
        {
            return None;
        }

        // Exclude patterns: if path matches any, skip
        if self
            .config
            .exclude_patterns
            .iter()
            .any(|pat| glob_match(pat, path))
        {
            return None;
        }

        // Skip common non-page file extensions
        const SKIP_EXTENSIONS: &[&str] = &[
            ".pdf", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico", ".css", ".js",
            ".zip", ".tar", ".gz", ".xml", ".rss", ".mp3", ".mp4", ".avi", ".mov", ".woff",
            ".woff2", ".ttf", ".eot",
        ];
        if SKIP_EXTENSIONS.iter().any(|ext| path.ends_with(ext)) {
            return None;
        }

        let normalized = normalize(&parsed);

        if visited.contains(&normalized) {
            return None;
        }

        Some(normalized)
    }
}
|
||||
|
||||
/// Canonical origin string for comparing same-origin: "scheme://host[:port]".
|
||||
fn origin_key(url: &Url) -> String {
|
||||
let port_suffix = match url.port() {
|
||||
Some(p) => format!(":{p}"),
|
||||
None => String::new(),
|
||||
};
|
||||
let host = url.host_str().unwrap_or("");
|
||||
let host = host.strip_prefix("www.").unwrap_or(host);
|
||||
format!("{}://{}{}", url.scheme(), host, port_suffix)
|
||||
}
|
||||
|
||||
/// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
|
||||
/// lowercase scheme + host. Preserves query params and path case.
|
||||
fn normalize(url: &Url) -> String {
|
||||
let scheme = url.scheme();
|
||||
let host = url.host_str().unwrap_or("").to_ascii_lowercase();
|
||||
let port_suffix = match url.port() {
|
||||
Some(p) => format!(":{p}"),
|
||||
None => String::new(),
|
||||
};
|
||||
|
||||
let mut path = url.path().to_string();
|
||||
if path.len() > 1 && path.ends_with('/') {
|
||||
path.pop();
|
||||
}
|
||||
|
||||
let query = match url.query() {
|
||||
Some(q) => format!("?{q}"),
|
||||
None => String::new(),
|
||||
};
|
||||
|
||||
// Fragment intentionally omitted
|
||||
format!("{scheme}://{host}{port_suffix}{path}{query}")
|
||||
}
|
||||
|
||||
/// Simple glob matching for URL paths. Supports:
/// - `*` matches any characters within a single path segment (no `/`)
/// - `**` matches any characters including `/` (any number of segments)
/// - `?` matches any single character
/// - Literal characters match exactly
///
/// Examples:
/// - `/api/*` matches `/api/users` but not `/api/users/123`
/// - `/api/**` matches `/api/users`, `/api/users/123`, `/api/a/b/c`
/// - `/docs/*/intro` matches `/docs/v2/intro`
fn glob_match(pattern: &str, path: &str) -> bool {
    glob_match_inner(pattern.as_bytes(), path.as_bytes())
}

/// Byte-level matcher behind [`glob_match`].
///
/// Uses the classic single-star backtracking scheme: remember where the most
/// recent `*` appeared and how much text it has absorbed so far; on a mismatch,
/// let that star absorb one more byte (never a `/`) and retry. `**` is handled
/// by recursing on every possible suffix of the remaining text.
fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
    let mut p = 0; // cursor into pat
    let mut t = 0; // cursor into text
    let mut last_star = usize::MAX; // index of the most recent single `*`
    let mut star_text = 0; // first text byte that star has NOT yet absorbed

    while t < text.len() {
        let is_double_star = p + 1 < pat.len() && pat[p] == b'*' && pat[p + 1] == b'*';

        if is_double_star {
            // `**`: collapse the whole run of stars, then an optional `/`.
            while p < pat.len() && pat[p] == b'*' {
                p += 1;
            }
            if p < pat.len() && pat[p] == b'/' {
                p += 1;
            }
            if p >= pat.len() {
                return true; // trailing `**` swallows the rest of the text
            }
            // Try the remainder of the pattern against every suffix of text.
            return (t..=text.len()).any(|start| glob_match_inner(&pat[p..], &text[start..]));
        }

        if p < pat.len() && pat[p] == b'*' {
            // Single `*`: tentatively match zero bytes; remember for backtracking.
            last_star = p;
            star_text = t;
            p += 1;
        } else if p < pat.len() && (pat[p] == text[t] || pat[p] == b'?') {
            // Literal byte (or `?`) matches — advance both cursors.
            p += 1;
            t += 1;
        } else if last_star != usize::MAX {
            // Backtrack: the star absorbs one more byte, unless it's a `/`.
            if text[star_text] == b'/' {
                return false;
            }
            star_text += 1;
            t = star_text;
            p = last_star + 1;
        } else {
            return false;
        }
    }

    // Text exhausted; only trailing stars may remain in the pattern.
    while p < pat.len() && pat[p] == b'*' {
        p += 1;
    }
    p >= pat.len()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // -- normalize tests --

    #[test]
    fn normalize_strips_fragment() {
        let url = Url::parse("https://example.com/page#section").unwrap();
        assert_eq!(normalize(&url), "https://example.com/page");
    }

    #[test]
    fn normalize_strips_trailing_slash() {
        let url = Url::parse("https://example.com/docs/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/docs");
    }

    #[test]
    fn normalize_keeps_root_slash() {
        // The bare root path is the one trailing slash that must survive.
        let url = Url::parse("https://example.com/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/");
    }

    #[test]
    fn normalize_preserves_query() {
        let url = Url::parse("https://example.com/search?q=rust&page=2").unwrap();
        assert_eq!(normalize(&url), "https://example.com/search?q=rust&page=2");
    }

    #[test]
    fn normalize_lowercases_host() {
        // Host is case-folded for dedup; the path keeps its original case.
        let url = Url::parse("https://Example.COM/Path").unwrap();
        assert_eq!(normalize(&url), "https://example.com/Path");
    }

    // -- origin_key tests --

    #[test]
    fn origin_includes_explicit_port() {
        let url = Url::parse("https://example.com:8443/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com:8443");
    }

    #[test]
    fn origin_omits_default_port() {
        let url = Url::parse("https://example.com/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com");
    }

    #[test]
    fn different_schemes_are_different_origins() {
        let http = Url::parse("http://example.com/").unwrap();
        let https = Url::parse("https://example.com/").unwrap();
        assert_ne!(origin_key(&http), origin_key(&https));
    }

    // -- glob_match tests --

    #[test]
    fn glob_star_matches_single_segment() {
        assert!(glob_match("/api/*", "/api/users"));
        assert!(glob_match("/api/*", "/api/products"));
        assert!(!glob_match("/api/*", "/api/users/123"));
    }

    #[test]
    fn glob_doublestar_matches_multiple_segments() {
        assert!(glob_match("/api/**", "/api/users"));
        assert!(glob_match("/api/**", "/api/users/123"));
        assert!(glob_match("/api/**", "/api/a/b/c/d"));
        assert!(!glob_match("/api/**", "/docs/intro"));
    }

    #[test]
    fn glob_exact_match() {
        assert!(glob_match("/about", "/about"));
        assert!(!glob_match("/about", "/about/team"));
    }

    #[test]
    fn glob_middle_wildcard() {
        assert!(glob_match("/docs/*/intro", "/docs/v2/intro"));
        assert!(!glob_match("/docs/*/intro", "/docs/v2/v3/intro"));
    }

    #[test]
    fn glob_no_pattern_matches_nothing() {
        // Empty pattern only matches empty string
        assert!(glob_match("", ""));
        assert!(!glob_match("", "/foo"));
    }

    #[test]
    fn glob_trailing_star() {
        assert!(glob_match("/blog*", "/blog"));
        assert!(glob_match("/blog*", "/blog-post"));
        assert!(!glob_match("/blog*", "/blog/post")); // * doesn't cross /
    }
}
|
||||
745
crates/noxa-fetch/src/document.rs
Normal file
745
crates/noxa-fetch/src/document.rs
Normal file
|
|
@ -0,0 +1,745 @@
|
|||
/// Document extraction for DOCX, XLSX, XLS, and CSV files.
|
||||
/// Auto-detects document type from Content-Type headers or URL extension,
|
||||
/// then extracts text content as markdown — same pattern as PDF extraction.
|
||||
use std::io::{Cursor, Read};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Supported non-HTML document formats that can be converted to markdown.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocType {
    /// Word document (OOXML ZIP container, `.docx`).
    Docx,
    /// Excel workbook (OOXML ZIP container, `.xlsx`).
    Xlsx,
    /// Legacy binary Excel workbook (`.xls`).
    Xls,
    /// Plain-text comma-separated values (`.csv`).
    Csv,
}
|
||||
|
||||
impl DocType {
|
||||
fn label(self) -> &'static str {
|
||||
match self {
|
||||
DocType::Docx => "DOCX",
|
||||
DocType::Xlsx => "XLSX",
|
||||
DocType::Xls => "XLS",
|
||||
DocType::Csv => "CSV",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DocType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.label())
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect document type from response headers or URL extension.
|
||||
/// Returns `None` for non-document responses (HTML, PDF, etc.).
|
||||
pub fn is_document_content_type(headers: &http::HeaderMap, url: &str) -> Option<DocType> {
|
||||
// Check Content-Type header first
|
||||
if let Some(ct) = headers.get("content-type").and_then(|v| v.to_str().ok()) {
|
||||
let mime = ct.split(';').next().unwrap_or("").trim();
|
||||
|
||||
if mime.eq_ignore_ascii_case(
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
) {
|
||||
return Some(DocType::Docx);
|
||||
}
|
||||
if mime.eq_ignore_ascii_case(
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
) {
|
||||
return Some(DocType::Xlsx);
|
||||
}
|
||||
if mime.eq_ignore_ascii_case("application/vnd.ms-excel") {
|
||||
return Some(DocType::Xls);
|
||||
}
|
||||
if mime.eq_ignore_ascii_case("text/csv") {
|
||||
return Some(DocType::Csv);
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to URL extension
|
||||
let path = url.split('?').next().unwrap_or(url);
|
||||
let lower = path.to_ascii_lowercase();
|
||||
|
||||
if lower.ends_with(".docx") {
|
||||
return Some(DocType::Docx);
|
||||
}
|
||||
if lower.ends_with(".xlsx") {
|
||||
return Some(DocType::Xlsx);
|
||||
}
|
||||
if lower.ends_with(".xls") {
|
||||
return Some(DocType::Xls);
|
||||
}
|
||||
if lower.ends_with(".csv") {
|
||||
return Some(DocType::Csv);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Extract text content from document bytes, returning an ExtractionResult.
///
/// Dispatches to the per-format extractor, then wraps the markdown in a
/// minimal `ExtractionResult`: metadata is empty apart from `word_count`
/// (documents carry no HTML-style title/description/links here).
///
/// # Errors
/// Propagates whatever the per-format extractor returns (`FetchError::Build`
/// for container/parse failures, `FetchError::BodyDecode` for read failures).
pub fn extract_document(
    bytes: &[u8],
    doc_type: DocType,
) -> Result<noxa_core::ExtractionResult, FetchError> {
    debug!(
        doc_type = doc_type.label(),
        bytes = bytes.len(),
        "extracting document"
    );

    let markdown = match doc_type {
        DocType::Docx => extract_docx(bytes)?,
        DocType::Xlsx => extract_xlsx(bytes)?,
        DocType::Xls => extract_xls(bytes)?,
        DocType::Csv => extract_csv(bytes)?,
    };

    // word_count is computed on the markdown with formatting stripped.
    let plain_text = strip_markdown_formatting(&markdown);
    let word_count = plain_text.split_whitespace().count();

    Ok(noxa_core::ExtractionResult {
        metadata: noxa_core::Metadata {
            title: None,
            description: None,
            author: None,
            published_date: None,
            language: None,
            url: None,
            site_name: None,
            image: None,
            favicon: None,
            word_count,
        },
        content: noxa_core::Content {
            markdown,
            plain_text,
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
|
||||
|
||||
/// Extract text from a DOCX file (ZIP of XML).
|
||||
/// Reads `word/document.xml`, extracts `<w:t>` text nodes, detects heading styles.
|
||||
fn extract_docx(bytes: &[u8]) -> Result<String, FetchError> {
|
||||
let cursor = Cursor::new(bytes);
|
||||
let mut archive =
|
||||
zip::ZipArchive::new(cursor).map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?;
|
||||
|
||||
let xml = {
|
||||
let mut file = archive
|
||||
.by_name("word/document.xml")
|
||||
.map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?;
|
||||
let mut buf = String::new();
|
||||
file.read_to_string(&mut buf)
|
||||
.map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?;
|
||||
buf
|
||||
};
|
||||
|
||||
parse_docx_xml(&xml)
|
||||
}
|
||||
|
||||
/// Parse DOCX XML (word/document.xml) into markdown.
///
/// Walks the XML looking for paragraph elements (`<w:p>`). Within each paragraph,
/// collects text from `<w:t>` tags and detects heading styles from `<w:pStyle>`.
/// Paragraph breaks become blank lines; `<w:br>` and `<w:tab>` map to `\n`
/// and `\t` inside a paragraph.
///
/// # Errors
/// Returns `FetchError::Build` if quick-xml reports a parse error.
fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
    use quick_xml::Reader;
    use quick_xml::events::Event;

    let mut reader = Reader::from_str(xml);
    let mut paragraphs: Vec<String> = Vec::new();

    // State tracking for the current paragraph
    let mut in_paragraph = false;
    let mut in_run = false; // inside <w:r> (run)
    let mut in_text = false; // inside <w:t>
    let mut current_text = String::new();
    let mut heading_level: Option<u8> = None; // None = normal paragraph
    let mut in_ppr = false; // inside <w:pPr> (paragraph properties)

    loop {
        match reader.read_event() {
            // Start and Empty are handled together so self-closing tags like
            // <w:br/> and <w:tab/> are still observed.
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
                let name_bytes = e.name().as_ref().to_vec();
                let local = local_name(&name_bytes);
                match local {
                    b"p" if is_w_namespace(&name_bytes) => {
                        // New paragraph: reset accumulated text and style.
                        in_paragraph = true;
                        current_text.clear();
                        heading_level = None;
                    }
                    b"pPr" if in_paragraph => in_ppr = true,
                    b"pStyle" if in_ppr => {
                        heading_level = extract_heading_level(e);
                    }
                    b"r" if in_paragraph => in_run = true,
                    b"t" if in_run => in_text = true,
                    b"br" if in_paragraph => {
                        current_text.push('\n');
                    }
                    b"tab" if in_paragraph => {
                        current_text.push('\t');
                    }
                    _ => {}
                }
            }
            Ok(Event::End(ref e)) => {
                let name_bytes = e.name().as_ref().to_vec();
                let local = local_name(&name_bytes);
                match local {
                    b"p" if in_paragraph => {
                        // Paragraph closed: emit it with the markdown heading
                        // prefix matching the detected style, if any.
                        let text = current_text.trim().to_string();
                        if !text.is_empty() {
                            let formatted = match heading_level {
                                Some(1) => format!("# {text}"),
                                Some(2) => format!("## {text}"),
                                Some(3) => format!("### {text}"),
                                Some(4) => format!("#### {text}"),
                                Some(5) => format!("##### {text}"),
                                Some(6) => format!("###### {text}"),
                                _ => text,
                            };
                            paragraphs.push(formatted);
                        }
                        in_paragraph = false;
                    }
                    b"pPr" => in_ppr = false,
                    b"r" => {
                        in_run = false;
                        in_text = false;
                    }
                    b"t" => in_text = false,
                    _ => {}
                }
            }
            Ok(Event::Text(ref e)) if in_text => {
                // Only text directly inside <w:t> contributes to output.
                if let Ok(text) = e.unescape() {
                    current_text.push_str(&text);
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(FetchError::Build(format!("DOCX XML parse error: {e}")));
            }
            _ => {}
        }
    }

    Ok(paragraphs.join("\n\n"))
}
|
||||
|
||||
/// Check if a qualified name belongs to the `w:` (wordprocessingML) namespace.
/// Handles both the prefixed form (`w:p`) and the default-namespace form (`p`).
fn is_w_namespace(name: &[u8]) -> bool {
    // quick-xml hands over the full qualified-name bytes.
    matches!(name, b"w:p" | b"p")
}
|
||||
|
||||
/// Extract the local name from a possibly namespaced XML tag.
/// Everything after the FIRST colon is kept: `w:p` -> `p`, `p` -> `p`,
/// `a:b:c` -> `b:c`.
fn local_name(name: &[u8]) -> &[u8] {
    name.iter()
        .position(|&b| b == b':')
        .map_or(name, |colon| &name[colon + 1..])
}
|
||||
|
||||
/// Extract heading level from a `<w:pStyle w:val="Heading1"/>` element.
|
||||
fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option<u8> {
|
||||
for attr in e.attributes().flatten() {
|
||||
let local = local_name(attr.key.as_ref());
|
||||
if local == b"val" {
|
||||
let val = String::from_utf8_lossy(&attr.value);
|
||||
let lower = val.to_ascii_lowercase();
|
||||
|
||||
// Match "heading1", "heading2", etc. and "title" -> h1
|
||||
if lower == "title" {
|
||||
return Some(1);
|
||||
}
|
||||
if let Some(rest) = lower.strip_prefix("heading")
|
||||
&& let Ok(n) = rest.parse::<u8>()
|
||||
{
|
||||
return Some(n.min(6));
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Extract spreadsheet content using calamine (XLSX format).
/// Thin wrapper so error messages carry the right format label.
fn extract_xlsx(bytes: &[u8]) -> Result<String, FetchError> {
    extract_spreadsheet(bytes, "XLSX")
}
|
||||
|
||||
/// Extract spreadsheet content using calamine (XLS format).
/// Thin wrapper so error messages carry the right format label.
fn extract_xls(bytes: &[u8]) -> Result<String, FetchError> {
    extract_spreadsheet(bytes, "XLS")
}
|
||||
|
||||
/// Shared spreadsheet extraction for both XLSX and XLS via calamine.
/// Reads all sheets and formats each as a markdown table under a
/// `## Sheet: <name>` heading; `label` is only used in error messages.
///
/// # Errors
/// `FetchError::Build` if calamine cannot open the workbook or read a sheet.
fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result<String, FetchError> {
    use calamine::Reader;

    // `open_workbook_auto_from_rs` sniffs the container, so one code path
    // serves both XLSX and legacy XLS.
    let cursor = Cursor::new(bytes);
    let mut workbook: calamine::Sheets<_> = calamine::open_workbook_auto_from_rs(cursor)
        .map_err(|e| FetchError::Build(format!("{label} open: {e}")))?;

    // Names are cloned up front because worksheet_range borrows the workbook.
    let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
    let mut sections: Vec<String> = Vec::new();

    for name in &sheet_names {
        let range = workbook
            .worksheet_range(name)
            .map_err(|e| FetchError::Build(format!("{label} sheet '{name}': {e}")))?;

        let rows: Vec<Vec<String>> = range
            .rows()
            .map(|row| row.iter().map(cell_to_string).collect())
            .collect();

        // Sheets with no data rows are omitted entirely.
        if rows.is_empty() {
            continue;
        }

        let mut section = format!("## Sheet: {name}\n\n");
        section.push_str(&rows_to_markdown_table(&rows));
        sections.push(section);
    }

    if sections.is_empty() {
        return Ok("(empty spreadsheet)".to_string());
    }

    Ok(sections.join("\n\n"))
}
|
||||
|
||||
/// Convert a calamine cell value to a display string.
///
/// Empty cells become empty strings (not a placeholder), whole-number floats
/// are rendered without the trailing `.0`, and error cells are rendered as
/// `#<variant>` via `Debug`.
fn cell_to_string(cell: &calamine::Data) -> String {
    use calamine::Data;
    match cell {
        Data::Empty => String::new(),
        Data::String(s) => s.clone(),
        Data::Int(n) => n.to_string(),
        Data::Float(f) => format_float(*f),
        Data::Bool(b) => b.to_string(),
        Data::Error(e) => format!("#{e:?}"),
        // Serial date-times use calamine's own Display formatting.
        Data::DateTime(dt) => format!("{dt}"),
        Data::DateTimeIso(s) => s.clone(),
        Data::DurationIso(s) => s.clone(),
    }
}
|
||||
|
||||
/// Format a float, dropping trailing `.0` for clean integer display.
///
/// Whole-valued floats inside i64 range render as integers ("3" not "3.0");
/// everything else (fractional, huge, NaN, infinite) uses the default float
/// formatting.
fn format_float(f: f64) -> String {
    let renders_as_integer = f.fract() == 0.0 && f.abs() < i64::MAX as f64;
    if renders_as_integer {
        (f as i64).to_string()
    } else {
        f.to_string()
    }
}
|
||||
|
||||
/// Extract CSV text and convert to markdown table.
|
||||
fn extract_csv(bytes: &[u8]) -> Result<String, FetchError> {
|
||||
let text = String::from_utf8_lossy(bytes);
|
||||
let rows = parse_csv_rows(&text);
|
||||
|
||||
if rows.is_empty() {
|
||||
return Ok("(empty CSV)".to_string());
|
||||
}
|
||||
|
||||
Ok(rows_to_markdown_table(&rows))
|
||||
}
|
||||
|
||||
/// Parse CSV text into rows of fields, handling quoted fields with
/// commas/newlines and doubled-quote (`""`) escapes.
///
/// Each completed field is whitespace-trimmed (even when quoted), `\r` is
/// ignored so CRLF behaves like LF, and rows whose every field is empty are
/// dropped.
fn parse_csv_rows(text: &str) -> Vec<Vec<String>> {
    let mut rows: Vec<Vec<String>> = Vec::new();
    let mut row: Vec<String> = Vec::new();
    let mut field = String::new();
    let mut quoted = false;
    let mut it = text.chars().peekable();

    while let Some(ch) = it.next() {
        if quoted {
            if ch == '"' {
                // A doubled quote is a literal `"`; a lone quote closes the field.
                if it.peek() == Some(&'"') {
                    it.next();
                    field.push('"');
                } else {
                    quoted = false;
                }
            } else {
                field.push(ch);
            }
            continue;
        }

        match ch {
            '"' => quoted = true,
            ',' => {
                row.push(field.trim().to_string());
                field.clear();
            }
            '\n' => {
                row.push(field.trim().to_string());
                field.clear();
                // Keep the row only if at least one field has content.
                if row.iter().any(|f| !f.is_empty()) {
                    rows.push(std::mem::take(&mut row));
                } else {
                    row.clear();
                }
            }
            // Carriage returns are swallowed; the matching '\n' ends the row.
            '\r' => {}
            other => field.push(other),
        }
    }

    // Flush the final field/row when the input lacks a trailing newline.
    if !field.is_empty() || !row.is_empty() {
        row.push(field.trim().to_string());
        if row.iter().any(|f| !f.is_empty()) {
            rows.push(row);
        }
    }

    rows
}
|
||||
|
||||
/// Render rows (first row = header) as a GitHub-style markdown table.
///
/// Ragged rows are padded with empty cells up to the widest row. Returns
/// an empty string when there are no rows or no columns at all.
fn rows_to_markdown_table(rows: &[Vec<String>]) -> String {
    // Table width = widest row; bail out on empty input or zero columns.
    let Some(width) = rows.iter().map(|r| r.len()).max().filter(|&w| w > 0) else {
        return String::new();
    };

    // Render one row, padding missing trailing cells with "".
    let render = |row: &Vec<String>| -> String {
        let cells: Vec<&str> = (0..width)
            .map(|i| row.get(i).map_or("", |s| s.as_str()))
            .collect();
        format!("| {} |", cells.join(" | "))
    };

    let mut lines = Vec::with_capacity(rows.len() + 1);
    lines.push(render(&rows[0]));
    lines.push(format!("| {} |", vec!["---"; width].join(" | ")));
    lines.extend(rows[1..].iter().map(render));
    lines.join("\n")
}
|
||||
|
||||
/// Strip markdown formatting (headings, tables) down to plain text.
///
/// Heading `#` markers are removed, table separator/alignment rows are
/// dropped, and table rows are flattened to space-joined cell text.
///
/// Fix: the old check only skipped rows starting with `| ---` or exactly
/// `|---|`, so compact (`|---|---|`) and alignment (`|:---:|`) separator
/// rows leaked into the output as fake data rows. A separator is now any
/// line starting with `|` whose characters are only `|`, `-`, `:`, spaces,
/// with at least one `-`.
fn strip_markdown_formatting(markdown: &str) -> String {
    let mut plain = String::with_capacity(markdown.len());
    for line in markdown.lines() {
        let trimmed = line.trim_start_matches('#').trim();

        // Skip table separator rows in any style: `| --- |`, `|---|---|`, `|:---:|`.
        let is_separator = trimmed.starts_with('|')
            && trimmed.contains('-')
            && trimmed.chars().all(|c| matches!(c, '|' | '-' | ':' | ' '));
        if is_separator {
            continue;
        }

        if let Some(inner) = trimmed.strip_prefix('|').and_then(|s| s.strip_suffix('|')) {
            // Table row: join cells with spaces.
            let cells: Vec<&str> = inner.split('|').map(str::trim).collect();
            plain.push_str(&cells.join(" "));
            plain.push('\n');
            continue;
        }
        plain.push_str(trimmed);
        plain.push('\n');
    }
    plain.trim().to_string()
}
|
||||
|
||||
/// Unit tests for document-type detection and the CSV/DOCX/markdown
/// conversion helpers defined above.
#[cfg(test)]
mod tests {
    use super::*;
    use http::HeaderMap;

    // Build a HeaderMap containing a single header, for content-type tests.
    fn headers_with(name: &str, value: &str) -> HeaderMap {
        let mut h = HeaderMap::new();
        h.insert(
            name.parse::<http::header::HeaderName>().unwrap(),
            value.parse().unwrap(),
        );
        h
    }

    // --- Content-type detection ---

    #[test]
    fn test_detect_docx_content_type() {
        let headers = headers_with(
            "content-type",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Docx)
        );
    }

    #[test]
    fn test_detect_xlsx_content_type() {
        let headers = headers_with(
            "content-type",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xlsx)
        );
    }

    #[test]
    fn test_detect_xls_content_type() {
        let headers = headers_with("content-type", "application/vnd.ms-excel");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xls)
        );
    }

    #[test]
    fn test_detect_csv_content_type() {
        let headers = headers_with("content-type", "text/csv");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }

    // Content-type parameters (e.g. charset) must not break detection.
    #[test]
    fn test_detect_csv_content_type_with_charset() {
        let headers = headers_with("content-type", "text/csv; charset=utf-8");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }

    // With no Content-Type header, detection falls back to the URL extension.
    #[test]
    fn test_detect_by_url_extension() {
        let empty = HeaderMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx"),
            Some(DocType::Docx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.xlsx"),
            Some(DocType::Xlsx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/old.xls"),
            Some(DocType::Xls)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.csv"),
            Some(DocType::Csv)
        );
    }

    #[test]
    fn test_detect_url_extension_with_query() {
        let empty = HeaderMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
            Some(DocType::Docx)
        );
    }

    #[test]
    fn test_detect_url_extension_case_insensitive() {
        let empty = HeaderMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
            Some(DocType::Xlsx)
        );
    }

    #[test]
    fn test_detect_none_for_html() {
        let headers = headers_with("content-type", "text/html");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/page"),
            None
        );
    }

    #[test]
    fn test_content_type_takes_precedence_over_url() {
        let headers = headers_with("content-type", "text/csv");
        // URL says .xlsx but Content-Type says CSV — header wins
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/data.xlsx"),
            Some(DocType::Csv)
        );
    }

    // --- CSV parsing ---

    #[test]
    fn test_csv_simple() {
        let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| Name | Age | City |"));
        assert!(result.contains("| --- | --- | --- |"));
        assert!(result.contains("| Alice | 30 | NYC |"));
        assert!(result.contains("| Bob | 25 | LA |"));
    }

    // Quoted fields may contain commas and escaped ("" -> ") quotes.
    #[test]
    fn test_csv_quoted_fields() {
        let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("Has a, comma"));
        assert!(result.contains("Said \"hello\""));
    }

    #[test]
    fn test_csv_empty() {
        let result = extract_csv(b"").unwrap();
        assert_eq!(result, "(empty CSV)");
    }

    #[test]
    fn test_csv_windows_line_endings() {
        let csv = "A,B\r\n1,2\r\n3,4\r\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| A | B |"));
        assert!(result.contains("| 1 | 2 |"));
    }

    // --- DOCX XML parsing ---

    #[test]
    fn test_docx_xml_simple_paragraphs() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>Hello world</w:t></w:r></w:p>
<w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world\n\nSecond paragraph");
    }

    // Heading styles (HeadingN) should map to markdown heading levels.
    #[test]
    fn test_docx_xml_headings() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
<w:r><w:t>Title</w:t></w:r>
</w:p>
<w:p><w:r><w:t>Body text</w:t></w:r></w:p>
<w:p>
<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
<w:r><w:t>Subtitle</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert!(result.contains("# Title"));
        assert!(result.contains("Body text"));
        assert!(result.contains("## Subtitle"));
    }

    // Multiple <w:r> runs in one paragraph concatenate into a single line.
    #[test]
    fn test_docx_xml_multiple_runs() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>Hello </w:t></w:r>
<w:r><w:t>world</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world");
    }

    #[test]
    fn test_docx_xml_empty_paragraphs_skipped() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p></w:p>
<w:p><w:r><w:t>Content</w:t></w:r></w:p>
<w:p><w:r><w:t> </w:t></w:r></w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Content");
    }

    // --- Markdown table ---

    #[test]
    fn test_rows_to_markdown_table() {
        let rows = vec![
            vec!["A".to_string(), "B".to_string()],
            vec!["1".to_string(), "2".to_string()],
            vec!["3".to_string(), "4".to_string()],
        ];
        let table = rows_to_markdown_table(&rows);
        assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |");
    }

    // Short rows are padded with empty cells to the widest row's width.
    #[test]
    fn test_rows_to_markdown_table_ragged() {
        let rows = vec![
            vec!["A".to_string(), "B".to_string(), "C".to_string()],
            vec!["1".to_string()], // fewer columns
        ];
        let table = rows_to_markdown_table(&rows);
        assert!(table.contains("| 1 |  |  |"));
    }

    // --- Extract result ---

    #[test]
    fn test_extract_csv_result() {
        let csv = "Name,Score\nAlice,100\n";
        let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap();
        assert!(result.content.markdown.contains("| Name | Score |"));
        assert!(result.metadata.word_count > 0);
        assert!(result.content.links.is_empty());
        assert!(result.domain_data.is_none());
    }

    // --- Strip markdown ---

    #[test]
    fn test_strip_markdown() {
        let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |";
        let plain = strip_markdown_formatting(md);
        assert!(plain.contains("Title"));
        assert!(plain.contains("Some text"));
        assert!(plain.contains("A B"));
        assert!(!plain.contains("---"));
    }
}
|
||||
24
crates/noxa-fetch/src/error.rs
Normal file
24
crates/noxa-fetch/src/error.rs
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
//! Fetch-layer errors. Wraps HTTP/network failures into a single type
//! that callers can match on without leaking transport details.
|
||||
use thiserror::Error;
|
||||
|
||||
/// Unified error type for the fetch layer.
#[derive(Debug, Error)]
pub enum FetchError {
    /// Transport-level failure from the underlying wreq HTTP client.
    #[error("request failed: {0}")]
    Request(#[from] wreq::Error),

    /// The target URL could not be parsed or was otherwise unusable.
    #[error("invalid url: {0}")]
    InvalidUrl(String),

    /// The response body could not be decoded into usable text/bytes.
    #[error("response body decode failed: {0}")]
    BodyDecode(String),

    /// Content extraction failed downstream in noxa-core.
    #[error("extraction failed: {0}")]
    Extraction(#[from] noxa_core::ExtractError),

    /// PDF text extraction failed in noxa-pdf.
    #[error("PDF extraction failed: {0}")]
    Pdf(#[from] noxa_pdf::PdfError),

    /// Client/setup failure; also used for local processing errors
    /// (e.g. spreadsheet parsing, proxy-file loading).
    #[error("client build failed: {0}")]
    Build(String),
}
|
||||
22
crates/noxa-fetch/src/lib.rs
Normal file
22
crates/noxa-fetch/src/lib.rs
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
//! noxa-fetch: HTTP client layer with browser TLS fingerprint impersonation.
|
||||
//! Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
|
||||
//! Automatically detects PDF responses and delegates to noxa-pdf.
|
||||
pub mod browser;
|
||||
pub mod client;
|
||||
pub mod crawler;
|
||||
pub mod document;
|
||||
pub mod error;
|
||||
pub mod linkedin;
|
||||
pub mod proxy;
|
||||
pub mod reddit;
|
||||
pub mod sitemap;
|
||||
pub mod tls;
|
||||
|
||||
pub use browser::BrowserProfile;
|
||||
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
|
||||
pub use crawler::{CrawlConfig, CrawlResult, CrawlState, Crawler, PageResult};
|
||||
pub use error::FetchError;
|
||||
pub use http::HeaderMap;
|
||||
pub use proxy::{parse_proxy_file, parse_proxy_line};
|
||||
pub use sitemap::SitemapEntry;
|
||||
pub use noxa_pdf::PdfMode;
|
||||
279
crates/noxa-fetch/src/linkedin.rs
Normal file
279
crates/noxa-fetch/src/linkedin.rs
Normal file
|
|
@ -0,0 +1,279 @@
|
|||
//! LinkedIn post extraction from authenticated HTML.
//!
//! LinkedIn's SPA stores all data in `<code>` tags as HTML-escaped JSON.
//! The `included` array contains typed entities: Update (post), Comment,
//! Profile, etc. We parse these to reconstruct post + comments as markdown.
|
||||
use serde_json::Value;
|
||||
use tracing::debug;
|
||||
use noxa_core::{Content, ExtractionResult, Metadata};
|
||||
|
||||
/// Returns true when `url` points at a LinkedIn post/activity page.
pub fn is_linkedin_post(url: &str) -> bool {
    // Host = text between the scheme separator and the first path slash.
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    let host = after_scheme.split('/').next().unwrap_or("");
    let linkedin_host = matches!(host, "www.linkedin.com" | "linkedin.com");
    linkedin_host && (url.contains("/feed/update/") || url.contains("/posts/"))
}
|
||||
|
||||
/// Extract `<code>` block contents from HTML using simple string scanning.
|
||||
/// LinkedIn wraps JSON data in `<code>` tags with HTML-escaped content.
|
||||
fn extract_code_blocks(html: &str) -> Vec<String> {
|
||||
let mut blocks = Vec::new();
|
||||
let mut search_from = 0;
|
||||
while let Some(start) = html[search_from..].find("<code") {
|
||||
let abs_start = search_from + start;
|
||||
// Find end of opening tag
|
||||
let Some(tag_end) = html[abs_start..].find('>') else {
|
||||
break;
|
||||
};
|
||||
let content_start = abs_start + tag_end + 1;
|
||||
let Some(end) = html[content_start..].find("</code>") else {
|
||||
break;
|
||||
};
|
||||
let content = &html[content_start..content_start + end];
|
||||
if content.len() > 1000 {
|
||||
blocks.push(html_unescape(content));
|
||||
}
|
||||
search_from = content_start + end + 7;
|
||||
}
|
||||
blocks
|
||||
}
|
||||
|
||||
/// Extract post + comments from LinkedIn's SSR HTML (requires auth cookies).
///
/// Returns `None` when no `included` JSON payload is found or when no post
/// body (commentary) can be extracted from it — e.g. the page was served
/// without authentication.
pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult> {
    let code_blocks = extract_code_blocks(html);

    // Find the largest <code> block with "included" — that's the main data payload
    let mut best_included: Option<Vec<Value>> = None;
    for raw in &code_blocks {
        if let Ok(obj) = serde_json::from_str::<Value>(raw)
            && let Some(arr) = obj.get("included").and_then(|v| v.as_array())
        {
            let current_len = best_included.as_ref().map(|a| a.len()).unwrap_or(0);
            if arr.len() > current_len {
                best_included = Some(arr.clone());
            }
        }
    }

    let included = best_included?;
    debug!(entities = included.len(), "linkedin: found included array");

    // Collect profiles (entityUrn → ("First Last", headline)) so that Update
    // entities can resolve their author by URN reference below.
    let mut profiles = std::collections::HashMap::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if t.contains("Profile") {
            let urn = item.get("entityUrn").and_then(|v| v.as_str()).unwrap_or("");
            let first = item.get("firstName").and_then(|v| v.as_str()).unwrap_or("");
            let last = item.get("lastName").and_then(|v| v.as_str()).unwrap_or("");
            let headline = item.get("headline").and_then(|v| v.as_str()).unwrap_or("");
            if !first.is_empty() {
                profiles.insert(
                    urn.to_string(),
                    (
                        format!("{first} {last}").trim().to_string(),
                        headline.to_string(),
                    ),
                );
            }
        }
    }

    // Find the main post (Update type)
    let mut markdown = String::new();
    let mut post_author = String::new();
    let mut post_headline = String::new();

    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if !t.contains("Update") {
            continue;
        }

        // Get author from actor profile
        if let Some(actor) = item.get("actor") {
            // actor can have a nested profile reference or inline data
            let author_urn = actor
                .get("*author")
                .or(actor.get("author"))
                .and_then(|v| v.as_str())
                .unwrap_or("");
            if let Some((name, headline)) = profiles.get(author_urn) {
                post_author = name.clone();
                post_headline = headline.clone();
            }
            // Or inline name
            if post_author.is_empty()
                && let Some(name) = actor.get("name").and_then(|v| v.as_object())
            {
                let text = name.get("text").and_then(|v| v.as_str()).unwrap_or("");
                if !text.is_empty() {
                    post_author = text.to_string();
                }
            }
            if post_headline.is_empty()
                && let Some(desc) = actor.get("description").and_then(|v| v.as_object())
            {
                let text = desc.get("text").and_then(|v| v.as_str()).unwrap_or("");
                if !text.is_empty() {
                    post_headline = text.to_string();
                }
            }
        }

        // Get post body from commentary
        if let Some(commentary) = item.get("commentary")
            && let Some(text) = commentary
                .get("text")
                .and_then(|v| v.as_object())
                .and_then(|o| o.get("text"))
                .and_then(|v| v.as_str())
        {
            if !post_author.is_empty() {
                markdown.push_str(&format!("# {post_author}\n\n"));
            }
            if !post_headline.is_empty() {
                markdown.push_str(&format!("*{post_headline}*\n\n"));
            }
            markdown.push_str("---\n\n");
            // Unescape literal \n from JSON
            markdown.push_str(&text.replace("\\n", "\n"));
            markdown.push_str("\n\n");
        }
    }

    // No commentary found anywhere — treat as an extraction miss.
    if markdown.is_empty() {
        return None;
    }

    // Collect comments — LinkedIn stores comment text in `commentary.text`
    // and commenter name in `commenter.name.text`
    let mut comments: Vec<(String, String)> = Vec::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if !t.contains("Comment") {
            continue;
        }

        // Get comment text from commentary.text
        let text = item
            .get("commentary")
            .and_then(|c| c.get("text"))
            .and_then(|v| v.as_str())
            .unwrap_or("");
        if text.is_empty() {
            continue;
        }

        // Get commenter name from commenter.title.text
        let name = item
            .get("commenter")
            .and_then(|c| c.get("title"))
            .and_then(|n| n.get("text"))
            .and_then(|v| v.as_str())
            .unwrap_or("Someone");

        comments.push((name.to_string(), text.to_string()));
    }

    if !comments.is_empty() {
        markdown.push_str("---\n\n## Comments\n\n");
        for (name, text) in &comments {
            markdown.push_str(&format!("- **{name}**: {text}\n\n"));
        }
    }

    let word_count = markdown.split_whitespace().count();
    debug!(
        word_count,
        comments = comments.len(),
        "linkedin extraction done"
    );

    Some(ExtractionResult {
        metadata: Metadata {
            title: if post_author.is_empty() {
                None
            } else {
                Some(format!("{post_author}'s LinkedIn Post"))
            },
            description: None,
            author: if post_author.is_empty() {
                None
            } else {
                Some(post_author)
            },
            published_date: None,
            language: None,
            url: Some(url.to_string()),
            site_name: Some("LinkedIn".into()),
            image: None,
            favicon: None,
            word_count,
        },
        content: Content {
            markdown,
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
|
||||
|
||||
/// Unescape HTML entities: named (`&amp;` …), decimal (`&#65;`), and hex
/// (`&#x41;`) numeric references.
///
/// Unknown entities are reproduced verbatim. A bare `&` that is never
/// terminated by `;` (within a 10-char window) is passed through exactly
/// as written — the old code appended a spurious `;` in that case, so
/// `"Tom & Jerry"` came out as `"Tom & Jerry;"`.
fn html_unescape(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '&' {
            out.push(c);
            continue;
        }
        // Collect the candidate entity name up to ';' (bounded length so a
        // stray '&' can't swallow the rest of the string).
        let mut entity = String::new();
        let mut terminated = false;
        while let Some(&next) = chars.peek() {
            if next == ';' {
                chars.next();
                terminated = true;
                break;
            }
            if entity.len() >= 10 {
                break;
            }
            entity.push(next);
            chars.next();
        }
        if !terminated {
            // Not an entity — emit exactly what we consumed, no added ';'.
            out.push('&');
            out.push_str(&entity);
            continue;
        }
        let decoded = match entity.as_str() {
            "quot" => Some('"'),
            "amp" => Some('&'),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "apos" => Some('\''),
            num if num.starts_with("#x") || num.starts_with("#X") => {
                u32::from_str_radix(&num[2..], 16).ok().and_then(char::from_u32)
            }
            num if num.starts_with('#') => {
                num[1..].parse::<u32>().ok().and_then(char::from_u32)
            }
            _ => None,
        };
        match decoded {
            Some(ch) => out.push(ch),
            None => {
                // Unknown/invalid entity — reproduce it verbatim.
                out.push('&');
                out.push_str(&entity);
                out.push(';');
            }
        }
    }
    out
}
|
||||
122
crates/noxa-fetch/src/proxy.rs
Normal file
122
crates/noxa-fetch/src/proxy.rs
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
//! Proxy file parsing utilities.
//!
//! Format: `host:port:user:pass` (one per line).
//! Lines starting with `#` and blank lines are skipped.
//! Also accepts `host:port` (no auth).
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Parse a single proxy line into an HTTP proxy URL.
///
/// Accepts two formats:
/// - `host:port:user:pass` -> `http://user:pass@host:port`
/// - `host:port` -> `http://host:port`
///
/// Anything else (including three-field lines) yields `None`.
pub fn parse_proxy_line(line: &str) -> Option<String> {
    let fields: Vec<&str> = line.trim().splitn(4, ':').collect();
    match fields.as_slice() {
        [host, port, user, pass] => Some(format!("http://{user}:{pass}@{host}:{port}")),
        [host, port] => Some(format!("http://{host}:{port}")),
        _ => None,
    }
}
|
||||
|
||||
/// Load proxies from a file, returning parsed HTTP proxy URLs.
|
||||
///
|
||||
/// Skips blank lines and `#` comments. Returns an error if the file
|
||||
/// can't be read or contains no valid entries.
|
||||
pub fn parse_proxy_file(path: &str) -> Result<Vec<String>, FetchError> {
|
||||
let content = std::fs::read_to_string(path)
|
||||
.map_err(|e| FetchError::Build(format!("failed to read proxy file: {e}")))?;
|
||||
|
||||
let proxies: Vec<String> = content
|
||||
.lines()
|
||||
.filter_map(|line| {
|
||||
let trimmed = line.trim();
|
||||
if trimmed.is_empty() || trimmed.starts_with('#') {
|
||||
None
|
||||
} else {
|
||||
parse_proxy_line(trimmed)
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
if proxies.is_empty() {
|
||||
return Err(FetchError::Build(
|
||||
"proxy file is empty or has no valid entries".into(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(proxies)
|
||||
}
|
||||
|
||||
/// Unit tests for proxy-line parsing and proxy-file loading.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    #[test]
    fn parse_host_port_user_pass() {
        let result = parse_proxy_line("proxy.example.com:8080:alice:s3cret");
        assert_eq!(
            result.as_deref(),
            Some("http://alice:s3cret@proxy.example.com:8080")
        );
    }

    #[test]
    fn parse_host_port_only() {
        let result = parse_proxy_line("10.0.0.1:3128");
        assert_eq!(result.as_deref(), Some("http://10.0.0.1:3128"));
    }

    #[test]
    fn parse_trims_whitespace() {
        let result = parse_proxy_line(" host:9999:user:pass ");
        assert_eq!(result.as_deref(), Some("http://user:pass@host:9999"));
    }

    #[test]
    fn parse_invalid_returns_none() {
        assert!(parse_proxy_line("just-a-hostname").is_none());
        assert!(parse_proxy_line("a:b:c").is_none()); // 3 parts is invalid
        assert!(parse_proxy_line("").is_none());
    }

    // Mixed file: comments and blank lines skipped, valid entries parsed in order.
    #[test]
    fn parse_file_happy_path() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("proxies.txt");
        let mut f = std::fs::File::create(&path).unwrap();
        writeln!(f, "# residential pool").unwrap();
        writeln!(f, "host1:8080:user1:pass1").unwrap();
        writeln!(f).unwrap(); // blank line
        writeln!(f, "host2:3128").unwrap();
        writeln!(f, "# datacenter").unwrap();
        writeln!(f, "host3:9999:u:p").unwrap();
        drop(f);

        let proxies = parse_proxy_file(path.to_str().unwrap()).unwrap();
        assert_eq!(proxies.len(), 3);
        assert_eq!(proxies[0], "http://user1:pass1@host1:8080");
        assert_eq!(proxies[1], "http://host2:3128");
        assert_eq!(proxies[2], "http://u:p@host3:9999");
    }

    // A file containing only comments/blank lines is an error, not an empty Vec.
    #[test]
    fn parse_file_empty_errors() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("empty.txt");
        std::fs::write(&path, "# only comments\n\n").unwrap();

        let err = parse_proxy_file(path.to_str().unwrap());
        assert!(err.is_err());
    }

    #[test]
    fn parse_file_missing_errors() {
        let err = parse_proxy_file("/nonexistent/proxies.txt");
        assert!(err.is_err());
    }
}
|
||||
172
crates/noxa-fetch/src/reddit.rs
Normal file
172
crates/noxa-fetch/src/reddit.rs
Normal file
|
|
@ -0,0 +1,172 @@
|
|||
//! Reddit JSON API fallback for extracting posts + comments without JS rendering.
//!
//! Reddit's new `shreddit` frontend only SSRs the post body — comments are
//! loaded client-side. Appending `.json` to any Reddit URL returns the full
//! comment tree as structured JSON, which we convert to clean markdown.
|
||||
use serde::Deserialize;
|
||||
use tracing::debug;
|
||||
use noxa_core::{Content, ExtractionResult, Metadata};
|
||||
|
||||
/// Returns true when `url`'s host is one of Reddit's web frontends.
pub fn is_reddit_url(url: &str) -> bool {
    // Host = text between the scheme separator and the first path slash.
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    let host = after_scheme.split('/').next().unwrap_or("");
    [
        "reddit.com",
        "www.reddit.com",
        "old.reddit.com",
        "np.reddit.com",
        "new.reddit.com",
    ]
    .contains(&host)
}
|
||||
|
||||
/// Turn a Reddit page URL into its `.json` API equivalent
/// (query string removed, trailing slashes stripped).
pub fn json_url(url: &str) -> String {
    let without_query = match url.split_once('?') {
        Some((base, _)) => base,
        None => url,
    };
    format!("{}.json", without_query.trim_end_matches('/'))
}
|
||||
|
||||
/// Convert Reddit JSON API response into an ExtractionResult.
///
/// The `.json` endpoint returns an array of listings: the first holds the
/// post (`t3`), the second (when present) holds the comment tree (`t1`s).
/// Both are rendered into a single markdown body.
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
    let listings: Vec<Listing> =
        serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;

    let mut markdown = String::new();
    let mut title = None;
    let mut author = None;
    let mut subreddit = None;

    // First listing = the post itself
    if let Some(post_listing) = listings.first() {
        for child in &post_listing.data.children {
            if child.kind == "t3" {
                let d = &child.data;
                title = d.title.clone();
                author = d.author.clone();
                subreddit = d.subreddit_name_prefixed.clone();

                if let Some(ref t) = title {
                    markdown.push_str(&format!("# {t}\n\n"));
                }
                if let (Some(a), Some(sr)) = (&author, &subreddit) {
                    markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
                }
                // Self-post body (empty for pure link posts).
                if let Some(ref body) = d.selftext
                    && !body.is_empty()
                {
                    markdown.push_str(body);
                    markdown.push_str("\n\n");
                }
                // Link posts carry their target here.
                if let Some(ref url_field) = d.url_overridden_by_dest
                    && !url_field.is_empty()
                {
                    markdown.push_str(&format!("[Link]({url_field})\n\n"));
                }
                markdown.push_str("---\n\n");
            }
        }
    }

    // Second listing = comment tree
    if let Some(comment_listing) = listings.get(1) {
        markdown.push_str("## Comments\n\n");
        for child in &comment_listing.data.children {
            render_comment(child, 0, &mut markdown);
        }
    }

    let word_count = markdown.split_whitespace().count();
    debug!(word_count, "reddit json extracted");

    Ok(ExtractionResult {
        metadata: Metadata {
            title,
            description: None,
            author,
            published_date: None,
            language: Some("en".into()),
            url: Some(url.to_string()),
            site_name: subreddit,
            image: None,
            favicon: None,
            word_count,
        },
        content: Content {
            markdown,
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
|
||||
|
||||
fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
|
||||
if thing.kind != "t1" {
|
||||
return;
|
||||
}
|
||||
let d = &thing.data;
|
||||
let indent = " ".repeat(depth);
|
||||
let author = d.author.as_deref().unwrap_or("[deleted]");
|
||||
let body = d.body.as_deref().unwrap_or("[removed]");
|
||||
let score = d.score.unwrap_or(0);
|
||||
|
||||
out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
|
||||
for line in body.lines() {
|
||||
out.push_str(&format!("{indent} {line}\n"));
|
||||
}
|
||||
out.push('\n');
|
||||
|
||||
// Recurse into replies
|
||||
if let Some(Replies::Listing(listing)) = &d.replies {
|
||||
for child in &listing.data.children {
|
||||
render_comment(child, depth + 1, out);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- Reddit JSON types (minimal) ---

/// A page of children returned by the Reddit API.
#[derive(Deserialize)]
struct Listing {
    data: ListingData,
}

#[derive(Deserialize)]
struct ListingData {
    children: Vec<Thing>,
}

/// A typed Reddit object; `kind` distinguishes posts (`t3`) from comments (`t1`).
#[derive(Deserialize)]
struct Thing {
    kind: String,
    data: ThingData,
}

/// Union of the post and comment fields we consume. Everything is optional
/// because posts and comments share this one struct.
#[derive(Deserialize)]
struct ThingData {
    // Post fields (t3)
    title: Option<String>,
    selftext: Option<String>,
    subreddit_name_prefixed: Option<String>,
    url_overridden_by_dest: Option<String>,
    // Comment fields (t1)
    author: Option<String>,
    body: Option<String>,
    score: Option<i64>,
    replies: Option<Replies>,
}

/// Reddit replies can be either a nested Listing or an empty string.
/// `untagged` lets serde try each shape in turn.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
    Listing(Listing),
    #[allow(dead_code)]
    Empty(String),
}
|
||||
601
crates/noxa-fetch/src/sitemap.rs
Normal file
601
crates/noxa-fetch/src/sitemap.rs
Normal file
|
|
@ -0,0 +1,601 @@
|
|||
//! Sitemap parsing and URL discovery.
//!
//! Discovers URLs from a site's sitemaps using a 3-step process:
//! 1. Parse robots.txt for `Sitemap:` directives
//! 2. Try common sitemap paths as fallback
//! 3. Recursively resolve sitemap index files
//!
//! All HTTP requests go through FetchClient to inherit TLS fingerprinting.
|
||||
use std::collections::HashSet;
|
||||
|
||||
use quick_xml::Reader;
|
||||
use quick_xml::events::Event;
|
||||
use serde::Serialize;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Maximum depth when recursively fetching sitemap index files.
/// Prevents infinite loops from circular sitemap references.
const MAX_RECURSION_DEPTH: usize = 3;

/// Common sitemap paths to try when robots.txt doesn't list any.
/// All of them are appended as candidates by `discover`; duplicates of
/// URLs already found via robots.txt are skipped there.
const FALLBACK_SITEMAP_PATHS: &[&str] = &[
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/wp-sitemap.xml",
    "/sitemap/sitemap-index.xml",
];
|
||||
|
||||
/// A single URL discovered from a sitemap.
#[derive(Debug, Clone, Serialize)]
pub struct SitemapEntry {
    /// Page URL from the `<loc>` element.
    pub url: String,
    /// Raw `<lastmod>` value, if present (kept as a string, not parsed).
    pub last_modified: Option<String>,
    /// `<priority>` parsed as f64; `None` if absent or unparseable.
    pub priority: Option<f64>,
    /// `<changefreq>` value (e.g. "daily"), if present.
    pub change_freq: Option<String>,
}
|
||||
|
||||
/// Discover all URLs from a site's sitemaps.
///
/// Discovery order:
/// 1. Fetch /robots.txt, parse `Sitemap:` directives
/// 2. Try common sitemap paths as fallback (skipping any already found)
/// 3. If sitemap index, recursively fetch child sitemaps
/// 4. Deduplicate by URL
///
/// Returns an empty vec (not an error) if no sitemaps are found.
pub async fn discover(
    client: &FetchClient,
    base_url: &str,
) -> Result<Vec<SitemapEntry>, FetchError> {
    // Normalize so "{base}{path}" never produces a double slash.
    let base = base_url.trim_end_matches('/');
    let mut sitemap_urls: Vec<String> = Vec::new();

    // Step 1: Try robots.txt
    let robots_url = format!("{base}/robots.txt");
    debug!(url = %robots_url, "fetching robots.txt");

    match client.fetch(&robots_url).await {
        Ok(result) if result.status == 200 => {
            let found = parse_robots_txt(&result.html);
            debug!(count = found.len(), "sitemap URLs from robots.txt");
            sitemap_urls.extend(found);
        }
        Ok(result) => {
            debug!(status = result.status, "robots.txt not found");
        }
        Err(e) => {
            // A missing/unreachable robots.txt is non-fatal; the fallback
            // paths below still give us candidates.
            debug!(error = %e, "failed to fetch robots.txt");
        }
    }

    // Step 2: Try common sitemap paths (skipping any already discovered via robots.txt)
    for path in FALLBACK_SITEMAP_PATHS {
        let candidate = format!("{base}{path}");
        if !sitemap_urls.iter().any(|u| u == &candidate) {
            sitemap_urls.push(candidate);
        }
    }

    // Step 3: Fetch and parse each sitemap, handling indexes recursively
    let mut seen_urls: HashSet<String> = HashSet::new();
    let mut entries: Vec<SitemapEntry> = Vec::new();

    fetch_sitemaps(client, &sitemap_urls, &mut entries, &mut seen_urls, 0).await;

    debug!(total = entries.len(), "sitemap discovery complete");
    Ok(entries)
}
|
||||
|
||||
/// Recursively fetch and parse sitemap URLs, handling both urlsets and indexes.
///
/// All failures (non-200 responses, fetch errors, unrecognized formats) are
/// logged and skipped so one bad sitemap never aborts the whole discovery.
/// `seen_urls` deduplicates entries across sitemaps; `depth` bounds recursion
/// into `<sitemapindex>` files.
async fn fetch_sitemaps(
    client: &FetchClient,
    urls: &[String],
    entries: &mut Vec<SitemapEntry>,
    seen_urls: &mut HashSet<String>,
    depth: usize,
) {
    // Guard against circular <sitemapindex> references.
    if depth > MAX_RECURSION_DEPTH {
        warn!(depth, "sitemap recursion limit reached, stopping");
        return;
    }

    for sitemap_url in urls {
        debug!(url = %sitemap_url, depth, "fetching sitemap");

        let xml = match client.fetch(sitemap_url).await {
            Ok(result) if result.status == 200 => result.html,
            Ok(result) => {
                debug!(url = %sitemap_url, status = result.status, "sitemap not found");
                continue;
            }
            Err(e) => {
                debug!(url = %sitemap_url, error = %e, "failed to fetch sitemap");
                continue;
            }
        };

        match detect_sitemap_type(&xml) {
            SitemapType::UrlSet => {
                let parsed = parse_urlset(&xml);
                for entry in parsed {
                    // insert() returns false for duplicates, giving
                    // cross-sitemap deduplication by URL.
                    if seen_urls.insert(entry.url.clone()) {
                        entries.push(entry);
                    }
                }
            }
            SitemapType::Index => {
                let child_urls = parse_sitemap_index(&xml);
                debug!(count = child_urls.len(), "found child sitemaps in index");

                // Box the recursive call to avoid large future sizes
                Box::pin(fetch_sitemaps(
                    client,
                    &child_urls,
                    entries,
                    seen_urls,
                    depth + 1,
                ))
                .await;
            }
            SitemapType::Unknown => {
                debug!(url = %sitemap_url, "unrecognized sitemap format, skipping");
            }
        }
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Pure parsing functions (no I/O, fully testable)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Extract `Sitemap:` directive URLs from robots.txt content.
///
/// Matching is case-insensitive ("Sitemap:", "sitemap:", "SITEMAP:"), and
/// directives with an empty value are skipped.
///
/// Fix: the previous version sliced `trimmed[..8]`, which panics when byte 8
/// is not a UTF-8 character boundary (e.g. a robots.txt line made of
/// multi-byte characters). Comparing on `as_bytes()` cannot panic.
pub fn parse_robots_txt(text: &str) -> Vec<String> {
    // ASCII prefix we look for at the start of each (trimmed) line.
    const PREFIX: &[u8] = b"sitemap:";

    text.lines()
        .filter_map(|line| {
            let trimmed = line.trim();
            let bytes = trimmed.as_bytes();
            // Byte-wise, case-insensitive prefix check: safe on any UTF-8
            // input because we never slice the &str at an arbitrary index.
            if bytes.len() > PREFIX.len() && bytes[..PREFIX.len()].eq_ignore_ascii_case(PREFIX) {
                // Slicing at PREFIX.len() is a char boundary here: the first
                // 8 bytes just matched a pure-ASCII prefix.
                let url = trimmed[PREFIX.len()..].trim();
                if !url.is_empty() {
                    return Some(url.to_string());
                }
            }
            None
        })
        .collect()
}
|
||||
|
||||
/// Parse a sitemap XML string. Handles both `<urlset>` and `<sitemapindex>`.
|
||||
/// Returns entries from urlsets and recursion targets from indexes.
|
||||
pub fn parse_sitemap_xml(xml: &str) -> Vec<SitemapEntry> {
|
||||
match detect_sitemap_type(xml) {
|
||||
SitemapType::UrlSet => parse_urlset(xml),
|
||||
SitemapType::Index => {
|
||||
// For the public parsing API, convert index <loc> entries into
|
||||
// SitemapEntry with just the URL. The async `discover` function
|
||||
// handles actual recursive fetching.
|
||||
parse_sitemap_index(xml)
|
||||
.into_iter()
|
||||
.map(|url| SitemapEntry {
|
||||
url,
|
||||
last_modified: None,
|
||||
priority: None,
|
||||
change_freq: None,
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
SitemapType::Unknown => Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Classification of a sitemap document, decided by its root-ish element.
#[derive(Debug, PartialEq)]
enum SitemapType {
    /// A `<urlset>` of concrete page URLs.
    UrlSet,
    /// A `<sitemapindex>` pointing at child sitemaps.
    Index,
    /// Neither recognized element was found (or the XML was invalid).
    Unknown,
}
|
||||
|
||||
/// Peek at the first element to determine if this is a urlset or sitemapindex.
///
/// Scans start/empty tags until one named `urlset` or `sitemapindex` is
/// seen (local name only, so namespace prefixes are tolerated); XML
/// declarations and comments are skipped. Parse errors or EOF before a
/// recognizable element yield `Unknown`.
fn detect_sitemap_type(xml: &str) -> SitemapType {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
                let name = e.local_name();
                // `continue` in the catch-all arm diverges, so this `match`
                // only `return`s on the two recognized element names.
                return match name.as_ref() {
                    b"urlset" => SitemapType::UrlSet,
                    b"sitemapindex" => SitemapType::Index,
                    _ => continue, // skip processing instructions, comments
                };
            }
            Ok(Event::Eof) => return SitemapType::Unknown,
            Err(_) => return SitemapType::Unknown,
            _ => continue,
        }
    }
}
|
||||
|
||||
/// Parse `<url>` entries from a `<urlset>` sitemap.
///
/// Streaming state machine over quick-xml events: tracks whether the cursor
/// is inside a `<url>` element and which child tag the next text node
/// belongs to. Entries without a `<loc>` are dropped; an unparseable
/// `<priority>` becomes `None`. On an XML error the entries collected so
/// far are returned rather than discarded.
fn parse_urlset(xml: &str) -> Vec<SitemapEntry> {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    let mut entries = Vec::new();

    // State for current <url> element being parsed
    let mut in_url = false;
    let mut current_tag: Option<UrlTag> = None;
    let mut loc: Option<String> = None;
    let mut lastmod: Option<String> = None;
    let mut priority: Option<f64> = None;
    let mut changefreq: Option<String> = None;

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"url" => {
                        // Reset all per-entry state at each <url> open tag.
                        in_url = true;
                        loc = None;
                        lastmod = None;
                        priority = None;
                        changefreq = None;
                    }
                    b"loc" if in_url => current_tag = Some(UrlTag::Loc),
                    b"lastmod" if in_url => current_tag = Some(UrlTag::LastMod),
                    b"priority" if in_url => current_tag = Some(UrlTag::Priority),
                    b"changefreq" if in_url => current_tag = Some(UrlTag::ChangeFreq),
                    _ => current_tag = None,
                }
            }
            Ok(Event::Text(ref e)) => {
                // Text only matters when we know which tag it belongs to
                // and it unescapes cleanly.
                if let Some(ref tag) = current_tag
                    && let Ok(text) = e.unescape()
                {
                    let text = text.trim().to_string();
                    if !text.is_empty() {
                        match tag {
                            UrlTag::Loc => loc = Some(text),
                            UrlTag::LastMod => lastmod = Some(text),
                            UrlTag::Priority => priority = text.parse().ok(),
                            UrlTag::ChangeFreq => changefreq = Some(text),
                        }
                    }
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.local_name();
                if name.as_ref() == b"url" && in_url {
                    // A <url> without <loc> has no usable URL; skip it.
                    if let Some(url) = loc.take() {
                        entries.push(SitemapEntry {
                            url,
                            last_modified: lastmod.take(),
                            priority: priority.take(),
                            change_freq: changefreq.take(),
                        });
                    }
                    in_url = false;
                }
                current_tag = None;
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                warn!(error = %e, "XML parse error in sitemap, returning partial results");
                break;
            }
            _ => {}
        }
        buf.clear();
    }

    entries
}
|
||||
|
||||
/// Which child element of `<url>` the parser is currently inside; decides
/// where the next text node is stored in `parse_urlset`.
#[derive(Debug)]
enum UrlTag {
    Loc,
    LastMod,
    Priority,
    ChangeFreq,
}
|
||||
|
||||
/// Parse `<sitemap>` entries from a `<sitemapindex>`, returning child sitemap URLs.
///
/// Only the `<loc>` of each `<sitemap>` is collected; `<lastmod>` etc. are
/// ignored. On an XML error the URLs collected so far are returned.
fn parse_sitemap_index(xml: &str) -> Vec<String> {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    let mut urls = Vec::new();

    // Two-flag state: inside a <sitemap>, and inside its <loc>.
    let mut in_sitemap = false;
    let mut in_loc = false;

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"sitemap" => in_sitemap = true,
                    // Only honor <loc> nested inside <sitemap>.
                    b"loc" if in_sitemap => in_loc = true,
                    _ => {}
                }
            }
            Ok(Event::Text(ref e)) => {
                if in_loc && let Ok(text) = e.unescape() {
                    let text = text.trim().to_string();
                    if !text.is_empty() {
                        urls.push(text);
                    }
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"sitemap" => {
                        in_sitemap = false;
                        in_loc = false;
                    }
                    b"loc" => in_loc = false,
                    _ => {}
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                warn!(error = %e, "XML parse error in sitemap index, returning partial results");
                break;
            }
            _ => {}
        }
        buf.clear();
    }

    urls
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Happy path: full <urlset> with optional fields present and absent.
    #[test]
    fn test_parse_urlset() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/</loc>
    <lastmod>2026-01-15</lastmod>
    <changefreq>daily</changefreq>
    <priority>1.0</priority>
  </url>
  <url>
    <loc>https://example.com/about</loc>
    <lastmod>2026-01-10</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://example.com/blog/post-1</loc>
  </url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 3);

        assert_eq!(entries[0].url, "https://example.com/");
        assert_eq!(entries[0].last_modified.as_deref(), Some("2026-01-15"));
        assert_eq!(entries[0].change_freq.as_deref(), Some("daily"));
        assert_eq!(entries[0].priority, Some(1.0));

        assert_eq!(entries[1].url, "https://example.com/about");
        assert_eq!(entries[1].priority, Some(0.8));

        // Entry with only <loc>: all metadata fields default to None.
        assert_eq!(entries[2].url, "https://example.com/blog/post-1");
        assert_eq!(entries[2].last_modified, None);
        assert_eq!(entries[2].priority, None);
        assert_eq!(entries[2].change_freq, None);
    }

    // Index parsing returns child sitemap URLs only (lastmod ignored).
    #[test]
    fn test_parse_sitemap_index() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/sitemap-posts.xml</loc>
    <lastmod>2026-03-01</lastmod>
  </sitemap>
  <sitemap>
    <loc>https://example.com/sitemap-pages.xml</loc>
  </sitemap>
</sitemapindex>"#;

        let urls = parse_sitemap_index(xml);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0], "https://example.com/sitemap-posts.xml");
        assert_eq!(urls[1], "https://example.com/sitemap-pages.xml");
    }

    // parse_sitemap_xml dispatches <urlset> to parse_urlset.
    #[test]
    fn test_parse_sitemap_xml_dispatches_urlset() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/page</loc></url>
</urlset>"#;

        let entries = parse_sitemap_xml(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].url, "https://example.com/page");
    }

    // parse_sitemap_xml dispatches <sitemapindex> to parse_sitemap_index.
    #[test]
    fn test_parse_sitemap_xml_dispatches_index() {
        let xml = r#"<?xml version="1.0"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://example.com/sitemap-1.xml</loc></sitemap>
</sitemapindex>"#;

        let entries = parse_sitemap_xml(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].url, "https://example.com/sitemap-1.xml");
        // Index entries have no metadata when parsed through the public API
        assert_eq!(entries[0].priority, None);
    }

    // Sitemap directive matching is case-insensitive.
    #[test]
    fn test_parse_robots_txt() {
        let robots = "User-agent: *\n\
                      Disallow: /admin/\n\
                      \n\
                      Sitemap: https://example.com/sitemap.xml\n\
                      sitemap: https://example.com/sitemap-news.xml\n\
                      SITEMAP: https://example.com/sitemap-images.xml\n\
                      \n\
                      User-agent: Googlebot\n\
                      Allow: /\n";

        let urls = parse_robots_txt(robots);
        assert_eq!(urls.len(), 3);
        assert_eq!(urls[0], "https://example.com/sitemap.xml");
        assert_eq!(urls[1], "https://example.com/sitemap-news.xml");
        assert_eq!(urls[2], "https://example.com/sitemap-images.xml");
    }

    #[test]
    fn test_parse_robots_txt_empty_value() {
        // "Sitemap:" with no URL should be skipped
        let robots = "Sitemap:\nSitemap: \nSitemap: https://example.com/s.xml\n";
        let urls = parse_robots_txt(robots);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0], "https://example.com/s.xml");
    }

    #[test]
    fn test_deduplicate() {
        // parse_sitemap_xml deduplicates via the discover() path, but
        // we can verify that parsing the same URL twice produces entries
        // that the HashSet in discover() would collapse.
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/page</loc></url>
  <url><loc>https://example.com/page</loc></url>
  <url><loc>https://example.com/other</loc></url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 3, "parser returns all entries");

        // Simulate the dedup that discover() does
        let mut seen = HashSet::new();
        let deduped: Vec<_> = entries
            .into_iter()
            .filter(|e| seen.insert(e.url.clone()))
            .collect();
        assert_eq!(deduped.len(), 2, "dedup collapses duplicates");
    }

    #[test]
    fn test_empty_sitemap() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>"#;

        let entries = parse_urlset(xml);
        assert!(entries.is_empty());
    }

    // Completely invalid input is classified Unknown -> empty vec.
    #[test]
    fn test_malformed_xml() {
        let xml = "this is not xml at all <><><";
        let entries = parse_sitemap_xml(xml);
        assert!(entries.is_empty(), "malformed XML returns empty vec");
    }

    #[test]
    fn test_malformed_xml_partial() {
        // Partial XML that starts valid but breaks mid-stream
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/good</loc></url>
  <url><loc>broken
"#;
        let entries = parse_sitemap_xml(xml);
        // Should return at least the successfully parsed entry
        assert!(entries.len() >= 1);
        assert_eq!(entries[0].url, "https://example.com/good");
    }

    // A <url> without <loc> is silently dropped.
    #[test]
    fn test_missing_loc() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <lastmod>2026-01-01</lastmod>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://example.com/valid</loc>
  </url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 1, "entry without <loc> is skipped");
        assert_eq!(entries[0].url, "https://example.com/valid");
    }

    // Numeric priorities parse; junk priorities become None (not an error).
    #[test]
    fn test_priority_parsing() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/high</loc>
    <priority>1.0</priority>
  </url>
  <url>
    <loc>https://example.com/mid</loc>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://example.com/low</loc>
    <priority>0.1</priority>
  </url>
  <url>
    <loc>https://example.com/invalid</loc>
    <priority>not-a-number</priority>
  </url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 4);

        assert_eq!(entries[0].priority, Some(1.0));
        assert_eq!(entries[1].priority, Some(0.5));
        assert_eq!(entries[2].priority, Some(0.1));
        assert_eq!(entries[3].priority, None, "invalid priority parses as None");
    }

    #[test]
    fn test_detect_sitemap_type() {
        let urlset = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        assert_eq!(detect_sitemap_type(urlset), SitemapType::UrlSet);

        let index = r#"<?xml version="1.0"?><sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></sitemapindex>"#;
        assert_eq!(detect_sitemap_type(index), SitemapType::Index);

        assert_eq!(detect_sitemap_type("garbage"), SitemapType::Unknown);
        assert_eq!(detect_sitemap_type(""), SitemapType::Unknown);
    }

    #[test]
    fn test_fallback_paths_constant() {
        // Verify the constant has the expected paths
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap.xml"));
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml"));
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml"));
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml"));
    }
}
|
||||
372
crates/noxa-fetch/src/tls.rs
Normal file
372
crates/noxa-fetch/src/tls.rs
Normal file
|
|
@ -0,0 +1,372 @@
|
|||
//! Browser TLS + HTTP/2 fingerprint profiles built on wreq (BoringSSL).
|
||||
//!
|
||||
//! Replaces the old noxa-http/noxa-tls patched rustls stack.
|
||||
//! Each profile configures TLS options (cipher suites, curves, extensions,
|
||||
//! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
|
||||
//! stream dependency, priorities) to match real browser fingerprints.
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use wreq::http2::{
|
||||
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
|
||||
};
|
||||
use wreq::tls::{AlpsProtocol, CertificateCompressionAlgorithm, TlsOptions, TlsVersion};
|
||||
use wreq::{Client, Emulation};
|
||||
|
||||
use crate::browser::BrowserVariant;
|
||||
use crate::error::FetchError;
|
||||
|
||||
// NOTE: the ordering inside each colon-separated list is part of the
// fingerprint — do not sort or reorder these strings.

/// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order).
const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";

/// Chrome signature algorithms.
const CHROME_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkcs1_sha256:ecdsa_secp384r1_sha384:rsa_pss_rsae_sha384:rsa_pkcs1_sha384:rsa_pss_rsae_sha512:rsa_pkcs1_sha512";

/// Chrome curves (post-quantum ML-KEM + X25519 + P-256 + P-384).
const CHROME_CURVES: &str = "X25519MLKEM768:X25519:P-256:P-384";

/// Firefox cipher list.
const FIREFOX_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_CHACHA20_POLY1305_SHA256:TLS_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";

/// Firefox signature algorithms.
const FIREFOX_SIGALGS: &str = "ecdsa_secp256r1_sha256:ecdsa_secp384r1_sha384:ecdsa_secp521r1_sha512:rsa_pss_rsae_sha256:rsa_pss_rsae_sha384:rsa_pss_rsae_sha512:rsa_pkcs1_sha256:rsa_pkcs1_sha384:rsa_pkcs1_sha512:ecdsa_sha1:rsa_pkcs1_sha1";

/// Firefox curves.
const FIREFOX_CURVES: &str = "X25519MLKEM768:X25519:P-256:P-384:P-521";

/// Safari cipher list.
const SAFARI_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_CBC_SHA";

/// Safari signature algorithms.
const SAFARI_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkcs1_sha256:ecdsa_secp384r1_sha384:rsa_pss_rsae_sha384:ecdsa_secp521r1_sha512:rsa_pss_rsae_sha512:rsa_pkcs1_sha384:rsa_pkcs1_sha512";

/// Safari curves (no post-quantum group, unlike Chrome/Firefox above).
const SAFARI_CURVES: &str = "X25519:P-256:P-384:P-521";
|
||||
|
||||
// --- Chrome HTTP headers in correct wire order ---
//
// Slice order is the order headers appear on the wire, which is itself part
// of the browser fingerprint; do not reorder. `build_headers` converts these
// into an http::HeaderMap, preserving this order.

/// Chrome on Windows.
const CHROME_HEADERS: &[(&str, &str)] = &[
    (
        "sec-ch-ua",
        r#""Google Chrome";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
    ),
    ("sec-ch-ua-mobile", "?0"),
    ("sec-ch-ua-platform", "\"Windows\""),
    ("upgrade-insecure-requests", "1"),
    (
        "user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
    ),
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    ),
    ("sec-fetch-site", "none"),
    ("sec-fetch-mode", "navigate"),
    ("sec-fetch-user", "?1"),
    ("sec-fetch-dest", "document"),
    ("accept-encoding", "gzip, deflate, br, zstd"),
    ("accept-language", "en-US,en;q=0.9"),
    ("priority", "u=0, i"),
];

/// Chrome on macOS — identical to CHROME_HEADERS except for the platform
/// client hint and the user-agent OS segment.
const CHROME_MACOS_HEADERS: &[(&str, &str)] = &[
    (
        "sec-ch-ua",
        r#""Google Chrome";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
    ),
    ("sec-ch-ua-mobile", "?0"),
    ("sec-ch-ua-platform", "\"macOS\""),
    ("upgrade-insecure-requests", "1"),
    (
        "user-agent",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
    ),
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    ),
    ("sec-fetch-site", "none"),
    ("sec-fetch-mode", "navigate"),
    ("sec-fetch-user", "?1"),
    ("sec-fetch-dest", "document"),
    ("accept-encoding", "gzip, deflate, br, zstd"),
    ("accept-language", "en-US,en;q=0.9"),
    ("priority", "u=0, i"),
];

/// Firefox on Windows — no sec-ch-ua client hints (Firefox doesn't send them).
const FIREFOX_HEADERS: &[(&str, &str)] = &[
    (
        "user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
    ),
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    ),
    ("accept-language", "en-US,en;q=0.5"),
    ("accept-encoding", "gzip, deflate, br, zstd"),
    ("upgrade-insecure-requests", "1"),
    ("sec-fetch-dest", "document"),
    ("sec-fetch-mode", "navigate"),
    ("sec-fetch-site", "none"),
    ("sec-fetch-user", "?1"),
    ("priority", "u=0, i"),
];

/// Safari on macOS — shorter list; note no zstd in accept-encoding.
const SAFARI_HEADERS: &[(&str, &str)] = &[
    (
        "user-agent",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3.1 Safari/605.1.15",
    ),
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    ),
    ("sec-fetch-site", "none"),
    ("accept-language", "en-US,en;q=0.9"),
    ("sec-fetch-mode", "navigate"),
    ("accept-encoding", "gzip, deflate, br"),
    ("sec-fetch-dest", "document"),
];

/// Edge on Windows — Chromium-based, so mirrors CHROME_HEADERS with
/// Edge-branded sec-ch-ua and user-agent values.
const EDGE_HEADERS: &[(&str, &str)] = &[
    (
        "sec-ch-ua",
        r#""Microsoft Edge";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
    ),
    ("sec-ch-ua-mobile", "?0"),
    ("sec-ch-ua-platform", "\"Windows\""),
    ("upgrade-insecure-requests", "1"),
    (
        "user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36 Edg/145.0.0.0",
    ),
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    ),
    ("sec-fetch-site", "none"),
    ("sec-fetch-mode", "navigate"),
    ("sec-fetch-user", "?1"),
    ("sec-fetch-dest", "document"),
    ("accept-encoding", "gzip, deflate, br, zstd"),
    ("accept-language", "en-US,en;q=0.9"),
    ("priority", "u=0, i"),
];
|
||||
|
||||
/// TLS options for Chrome-family profiles (Chrome, Chrome/macOS, Edge):
/// GREASE + permuted extensions, ECH GREASE, PSK, ALPS with the new
/// codepoint, and Brotli-only certificate compression.
fn chrome_tls() -> TlsOptions {
    TlsOptions::builder()
        .cipher_list(CHROME_CIPHERS)
        .sigalgs_list(CHROME_SIGALGS)
        .curves_list(CHROME_CURVES)
        .min_tls_version(TlsVersion::TLS_1_2)
        .max_tls_version(TlsVersion::TLS_1_3)
        .grease_enabled(true)
        // Chrome shuffles ClientHello extension order per connection.
        .permute_extensions(true)
        .enable_ech_grease(true)
        .pre_shared_key(true)
        .enable_ocsp_stapling(true)
        .enable_signed_cert_timestamps(true)
        .alps_protocols([AlpsProtocol::HTTP2])
        .alps_use_new_codepoint(true)
        .aes_hw_override(true)
        .certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI])
        .build()
}
|
||||
|
||||
/// TLS options for the Firefox profile. Unlike Chrome: fixed (non-permuted)
/// extension order, no ALPS, and zlib+brotli certificate compression.
fn firefox_tls() -> TlsOptions {
    TlsOptions::builder()
        .cipher_list(FIREFOX_CIPHERS)
        .sigalgs_list(FIREFOX_SIGALGS)
        .curves_list(FIREFOX_CURVES)
        .min_tls_version(TlsVersion::TLS_1_2)
        .max_tls_version(TlsVersion::TLS_1_3)
        .grease_enabled(true)
        .permute_extensions(false)
        .enable_ech_grease(true)
        .pre_shared_key(true)
        .enable_ocsp_stapling(true)
        .enable_signed_cert_timestamps(true)
        .certificate_compression_algorithms(&[
            CertificateCompressionAlgorithm::ZLIB,
            CertificateCompressionAlgorithm::BROTLI,
        ])
        .build()
}
|
||||
|
||||
/// TLS options for the Safari profile. No extension permutation, no ECH
/// GREASE, no PSK, and zlib-only certificate compression.
fn safari_tls() -> TlsOptions {
    TlsOptions::builder()
        .cipher_list(SAFARI_CIPHERS)
        .sigalgs_list(SAFARI_SIGALGS)
        .curves_list(SAFARI_CURVES)
        .min_tls_version(TlsVersion::TLS_1_2)
        .max_tls_version(TlsVersion::TLS_1_3)
        .grease_enabled(true)
        .permute_extensions(false)
        .enable_ech_grease(false)
        .pre_shared_key(false)
        .enable_ocsp_stapling(true)
        .enable_signed_cert_timestamps(true)
        .certificate_compression_algorithms(&[CertificateCompressionAlgorithm::ZLIB])
        .build()
}
|
||||
|
||||
/// HTTP/2 options matching Chrome: window sizes, SETTINGS frame order,
/// m:a:s:p pseudo-header order, and a stream dependency with weight 219.
/// The SETTINGS order and pseudo-header order are fingerprint-relevant.
fn chrome_h2() -> Http2Options {
    Http2Options::builder()
        .initial_window_size(6_291_456)
        .initial_connection_window_size(15_728_640)
        .max_header_list_size(262_144)
        .header_table_size(65_536)
        .max_concurrent_streams(1000u32)
        .enable_push(false)
        .settings_order(
            SettingsOrder::builder()
                .extend([
                    SettingId::HeaderTableSize,
                    SettingId::EnablePush,
                    SettingId::MaxConcurrentStreams,
                    SettingId::InitialWindowSize,
                    SettingId::MaxFrameSize,
                    SettingId::MaxHeaderListSize,
                    SettingId::EnableConnectProtocol,
                    SettingId::NoRfc7540Priorities,
                ])
                .build(),
        )
        .headers_pseudo_order(
            PseudoOrder::builder()
                .extend([
                    PseudoId::Method,
                    PseudoId::Authority,
                    PseudoId::Scheme,
                    PseudoId::Path,
                ])
                .build(),
        )
        // Exclusive dependency on stream 0, weight 219.
        .headers_stream_dependency(StreamDependency::new(StreamId::zero(), 219, true))
        .build()
}
|
||||
|
||||
fn firefox_h2() -> Http2Options {
|
||||
Http2Options::builder()
|
||||
.initial_window_size(131_072)
|
||||
.initial_connection_window_size(12_517_377)
|
||||
.max_header_list_size(65_536)
|
||||
.header_table_size(65_536)
|
||||
.settings_order(
|
||||
SettingsOrder::builder()
|
||||
.extend([
|
||||
SettingId::HeaderTableSize,
|
||||
SettingId::InitialWindowSize,
|
||||
SettingId::MaxFrameSize,
|
||||
])
|
||||
.build(),
|
||||
)
|
||||
.headers_pseudo_order(
|
||||
PseudoOrder::builder()
|
||||
.extend([
|
||||
PseudoId::Method,
|
||||
PseudoId::Path,
|
||||
PseudoId::Authority,
|
||||
PseudoId::Scheme,
|
||||
])
|
||||
.build(),
|
||||
)
|
||||
.build()
|
||||
}
|
||||
|
||||
fn safari_h2() -> Http2Options {
|
||||
Http2Options::builder()
|
||||
.initial_window_size(2_097_152)
|
||||
.initial_connection_window_size(10_420_225)
|
||||
.max_header_list_size(0)
|
||||
.header_table_size(4_096)
|
||||
.enable_push(false)
|
||||
.max_concurrent_streams(100u32)
|
||||
.settings_order(
|
||||
SettingsOrder::builder()
|
||||
.extend([
|
||||
SettingId::EnablePush,
|
||||
SettingId::MaxConcurrentStreams,
|
||||
SettingId::InitialWindowSize,
|
||||
SettingId::MaxFrameSize,
|
||||
])
|
||||
.build(),
|
||||
)
|
||||
.headers_pseudo_order(
|
||||
PseudoOrder::builder()
|
||||
.extend([
|
||||
PseudoId::Method,
|
||||
PseudoId::Scheme,
|
||||
PseudoId::Authority,
|
||||
PseudoId::Path,
|
||||
])
|
||||
.build(),
|
||||
)
|
||||
.headers_stream_dependency(StreamDependency::new(StreamId::zero(), 255, false))
|
||||
.build()
|
||||
}
|
||||
|
||||
fn build_headers(pairs: &[(&str, &str)]) -> http::HeaderMap {
|
||||
let mut map = http::HeaderMap::with_capacity(pairs.len());
|
||||
for (name, value) in pairs {
|
||||
if let (Ok(n), Ok(v)) = (
|
||||
http::header::HeaderName::from_bytes(name.as_bytes()),
|
||||
http::header::HeaderValue::from_str(value),
|
||||
) {
|
||||
map.insert(n, v);
|
||||
}
|
||||
}
|
||||
map
|
||||
}
|
||||
|
||||
/// Build a wreq Client for a specific browser variant.
|
||||
pub fn build_client(
|
||||
variant: BrowserVariant,
|
||||
timeout: Duration,
|
||||
extra_headers: &std::collections::HashMap<String, String>,
|
||||
proxy: Option<&str>,
|
||||
) -> Result<Client, FetchError> {
|
||||
let (tls, h2, headers) = match variant {
|
||||
BrowserVariant::Chrome => (chrome_tls(), chrome_h2(), CHROME_HEADERS),
|
||||
BrowserVariant::ChromeMacos => (chrome_tls(), chrome_h2(), CHROME_MACOS_HEADERS),
|
||||
BrowserVariant::Firefox => (firefox_tls(), firefox_h2(), FIREFOX_HEADERS),
|
||||
BrowserVariant::Safari => (safari_tls(), safari_h2(), SAFARI_HEADERS),
|
||||
BrowserVariant::Edge => (chrome_tls(), chrome_h2(), EDGE_HEADERS),
|
||||
};
|
||||
|
||||
let mut header_map = build_headers(headers);
|
||||
|
||||
// Append extra headers after profile defaults
|
||||
for (k, v) in extra_headers {
|
||||
if let (Ok(n), Ok(val)) = (
|
||||
http::header::HeaderName::from_bytes(k.as_bytes()),
|
||||
http::header::HeaderValue::from_str(v),
|
||||
) {
|
||||
header_map.insert(n, val);
|
||||
}
|
||||
}
|
||||
|
||||
let emulation = Emulation::builder()
|
||||
.tls_options(tls)
|
||||
.http2_options(h2)
|
||||
.headers(header_map)
|
||||
.build();
|
||||
|
||||
let mut builder = Client::builder()
|
||||
.emulation(emulation)
|
||||
.redirect(wreq::redirect::Policy::limited(10))
|
||||
.cookie_store(true)
|
||||
.timeout(timeout);
|
||||
|
||||
if let Some(proxy_url) = proxy {
|
||||
let proxy =
|
||||
wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
|
||||
builder = builder.proxy(proxy);
|
||||
}
|
||||
|
||||
builder
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(e.to_string()))
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue