mirror of https://github.com/0xMassi/webclaw.git
synced 2026-05-13 17:02:36 +02:00

chore: rebrand webclaw to noxa

This commit is contained in:
parent a4c351d5ae
commit 8674b60b4e

86 changed files with 781 additions and 2121 deletions
@@ -1,19 +1,19 @@
 [package]
-name = "webclaw-cli"
+name = "noxa-cli"
 description = "CLI for extracting web content into LLM-optimized formats"
 version.workspace = true
 edition.workspace = true
 license.workspace = true
 
 [[bin]]
-name = "webclaw"
+name = "noxa"
 path = "src/main.rs"
 
 [dependencies]
-webclaw-core = { workspace = true }
-webclaw-fetch = { workspace = true }
-webclaw-llm = { workspace = true }
-webclaw-pdf = { workspace = true }
+noxa-core = { workspace = true }
+noxa-fetch = { workspace = true }
+noxa-llm = { workspace = true }
+noxa-pdf = { workspace = true }
 dotenvy = { workspace = true }
 rand = "0.8"
 serde_json = { workspace = true }
@@ -1,16 +1,16 @@
 /// Cloud API client for automatic fallback when local extraction fails.
 ///
-/// When WEBCLAW_API_KEY is set (or --api-key is passed), the CLI can fall back
-/// to api.webclaw.io for bot-protected or JS-rendered sites. With --cloud flag,
+/// When NOXA_API_KEY is set (or --api-key is passed), the CLI can fall back
+/// to api.noxa.io for bot-protected or JS-rendered sites. With --cloud flag,
 /// all requests go through the cloud API directly.
 ///
-/// NOTE: The canonical, full-featured cloud module lives in webclaw-mcp/src/cloud.rs
+/// NOTE: The canonical, full-featured cloud module lives in noxa-mcp/src/cloud.rs
 /// (smart_fetch, bot detection, JS rendering checks). This is the minimal subset
-/// needed by the CLI. Kept separate to avoid pulling in rmcp via webclaw-mcp.
-/// and adding webclaw-mcp as a dependency would pull in rmcp.
+/// needed by the CLI. Kept separate to avoid pulling in rmcp via noxa-mcp.
+/// and adding noxa-mcp as a dependency would pull in rmcp.
 use serde_json::{Value, json};
 
-const API_BASE: &str = "https://api.webclaw.io/v1";
+const API_BASE: &str = "https://api.noxa.io/v1";
 
 pub struct CloudClient {
     api_key: String,

@@ -18,11 +18,11 @@ pub struct CloudClient {
 }
 
 impl CloudClient {
-    /// Create from explicit key or WEBCLAW_API_KEY env var.
+    /// Create from explicit key or NOXA_API_KEY env var.
     pub fn new(explicit_key: Option<&str>) -> Option<Self> {
         let key = explicit_key
             .map(String::from)
-            .or_else(|| std::env::var("WEBCLAW_API_KEY").ok())
+            .or_else(|| std::env::var("NOXA_API_KEY").ok())
            .filter(|k| !k.is_empty())?;
 
         Some(Self {
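A note on the renamed lookup: CloudClient::new resolves the key as explicit flag first, then the NOXA_API_KEY env var, and filters out empty values. A self-contained sketch of that resolution order (std only, runnable as-is):

```rust
// Sketch of the explicit-key -> env-var fallback used by CloudClient::new.
// Empty values are filtered out so NOXA_API_KEY="" behaves like "not set".
fn resolve_api_key(explicit: Option<&str>) -> Option<String> {
    explicit
        .map(String::from)
        .or_else(|| std::env::var("NOXA_API_KEY").ok())
        .filter(|k| !k.is_empty())
}

fn main() {
    // An explicit key always wins over the environment.
    assert_eq!(resolve_api_key(Some("sk-test")).as_deref(), Some("sk-test"));
    // An empty explicit key is filtered out entirely (it does not fall back).
    assert_eq!(resolve_api_key(Some("")), None);
}
```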
@@ -1,5 +1,5 @@
 #![allow(dead_code)]
-/// CLI entry point -- wires webclaw-core and webclaw-fetch into a single command.
+/// CLI entry point -- wires noxa-core and noxa-fetch into a single command.
 /// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
 mod cloud;
 

@@ -11,16 +11,16 @@ use std::sync::atomic::{AtomicBool, Ordering};
 
 use clap::{Parser, ValueEnum};
 use tracing_subscriber::EnvFilter;
-use webclaw_core::{
+use noxa_core::{
     ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
     to_llm_text,
 };
-use webclaw_fetch::{
+use noxa_fetch::{
     BatchExtractResult, BrowserProfile, CrawlConfig, CrawlResult, Crawler, FetchClient,
     FetchConfig, FetchResult, PageResult, SitemapEntry,
 };
-use webclaw_llm::LlmProvider;
-use webclaw_pdf::PdfMode;
+use noxa_llm::LlmProvider;
+use noxa_pdf::PdfMode;
 
 /// Known anti-bot challenge page titles (case-insensitive prefix match).
 const ANTIBOT_TITLES: &[&str] = &[

@@ -73,19 +73,19 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
         EmptyReason::Antibot => eprintln!(
             "\x1b[33mwarning:\x1b[0m Anti-bot protection detected on {url}\n\
              This site requires CAPTCHA solving or browser rendering.\n\
-             Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
+             Use the noxa Cloud API for automatic bypass: https://noxa.io/pricing"
         ),
         EmptyReason::JsRequired => eprintln!(
             "\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
              This site requires JavaScript rendering (SPA).\n\
-             Use the webclaw Cloud API for JS rendering: https://webclaw.io/pricing"
+             Use the noxa Cloud API for JS rendering: https://noxa.io/pricing"
         ),
         EmptyReason::None => {}
     }
 }
 
 #[derive(Parser)]
-#[command(name = "webclaw", about = "Extract web content for LLMs", version)]
+#[command(name = "noxa", about = "Extract web content for LLMs", version)]
 struct Cli {
     /// URLs to fetch (multiple allowed)
     #[arg()]

@@ -104,11 +104,11 @@ struct Cli {
     browser: Browser,
 
     /// Proxy URL (http://user:pass@host:port or socks5://host:port)
-    #[arg(short, long, env = "WEBCLAW_PROXY")]
+    #[arg(short, long, env = "NOXA_PROXY")]
     proxy: Option<String>,
 
     /// File with proxies (host:port:user:pass, one per line). Rotates per request.
-    #[arg(long, env = "WEBCLAW_PROXY_FILE")]
+    #[arg(long, env = "NOXA_PROXY_FILE")]
     proxy_file: Option<String>,
 
     /// Request timeout in seconds

@@ -177,7 +177,7 @@ struct Cli {
 
     /// Webhook URL: POST a JSON payload when an operation completes.
     /// Works with crawl, batch, watch (on change), and single URL modes.
-    #[arg(long, env = "WEBCLAW_WEBHOOK_URL")]
+    #[arg(long, env = "NOXA_WEBHOOK_URL")]
     webhook: Option<String>,
 
     /// Extract brand identity (colors, fonts, logo)

@@ -248,20 +248,20 @@ struct Cli {
     summarize: Option<usize>,
 
     /// Force a specific LLM provider (ollama, openai, anthropic)
-    #[arg(long, env = "WEBCLAW_LLM_PROVIDER")]
+    #[arg(long, env = "NOXA_LLM_PROVIDER")]
     llm_provider: Option<String>,
 
     /// Override the LLM model name
-    #[arg(long, env = "WEBCLAW_LLM_MODEL")]
+    #[arg(long, env = "NOXA_LLM_MODEL")]
     llm_model: Option<String>,
 
     /// Override the LLM base URL (Ollama or OpenAI-compatible)
-    #[arg(long, env = "WEBCLAW_LLM_BASE_URL")]
+    #[arg(long, env = "NOXA_LLM_BASE_URL")]
     llm_base_url: Option<String>,
 
     // -- Cloud API options --
-    /// Webclaw Cloud API key for automatic fallback on bot-protected or JS-rendered sites
-    #[arg(long, env = "WEBCLAW_API_KEY")]
+    /// Noxa Cloud API key for automatic fallback on bot-protected or JS-rendered sites
+    #[arg(long, env = "NOXA_API_KEY")]
     api_key: Option<String>,
 
     /// Force all requests through the cloud API (skip local extraction)
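All of the renamed WEBCLAW_* variables above follow one clap pattern: a long flag that also reads an env var. A minimal sketch, assuming clap with the "derive" and "env" features enabled; the `Demo` struct is a stand-in, not the real Cli:

```rust
// Sketch of a clap flag backed by an env var, as used by the renamed
// NOXA_* options above. Assumes clap = { features = ["derive", "env"] }.
use clap::Parser;

#[derive(Parser)]
#[command(name = "demo")]
struct Demo {
    /// Cloud API key: --api-key wins, then NOXA_API_KEY, then None.
    #[arg(long, env = "NOXA_API_KEY")]
    api_key: Option<String>,
}

fn main() {
    let demo = Demo::parse();
    // clap resolves the flag first, then the env var, then leaves None.
    println!("api_key set: {}", demo.api_key.is_some());
}
```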
@@ -330,9 +330,9 @@ impl From<Browser> for BrowserProfile {
 
 fn init_logging(verbose: bool) {
     let filter = if verbose {
-        EnvFilter::new("webclaw=debug")
+        EnvFilter::new("noxa=debug")
     } else {
-        EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new("warn"))
+        EnvFilter::try_from_env("NOXA_LOG").unwrap_or_else(|_| EnvFilter::new("warn"))
     };
 
     tracing_subscriber::fmt().with_env_filter(filter).init();

@@ -347,7 +347,7 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
     let (proxy, proxy_pool) = if cli.proxy.is_some() {
         (cli.proxy.clone(), Vec::new())
     } else if let Some(ref path) = cli.proxy_file {
-        match webclaw_fetch::parse_proxy_file(path) {
+        match noxa_fetch::parse_proxy_file(path) {
             Ok(pool) => (None, pool),
             Err(e) => {
                 eprintln!("warning: {e}");

@@ -356,7 +356,7 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
         }
     } else if std::path::Path::new("proxies.txt").exists() {
         // Auto-load proxies.txt from working directory if present
-        match webclaw_fetch::parse_proxy_file("proxies.txt") {
+        match noxa_fetch::parse_proxy_file("proxies.txt") {
            Ok(pool) if !pool.is_empty() => {
                 eprintln!("loaded {} proxies from proxies.txt", pool.len());
                 (None, pool)
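The two hunks above only rename the parse_proxy_file call site. For reference, a sketch of parsing the documented proxies.txt line format (host:port:user:pass, one per line) into proxy URLs; this is a hypothetical helper, not the crate's actual implementation:

```rust
// Sketch: parse "host:port:user:pass" lines (the format documented for
// NOXA_PROXY_FILE) into http proxy URLs. Hypothetical helper, not the
// crate's real parse_proxy_file.
fn parse_proxy_line(line: &str) -> Option<String> {
    let parts: Vec<&str> = line.trim().splitn(4, ':').collect();
    match parts.as_slice() {
        [host, port, user, pass] => Some(format!("http://{user}:{pass}@{host}:{port}")),
        _ => None,
    }
}

fn main() {
    let pool: Vec<String> = "1.2.3.4:8080:alice:secret\n# comment\n"
        .lines()
        .filter(|l| !l.trim().is_empty() && !l.starts_with('#'))
        .filter_map(parse_proxy_line)
        .collect();
    assert_eq!(pool, vec!["http://alice:secret@1.2.3.4:8080".to_string()]);
}
```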
@@ -652,7 +652,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
     // --cloud: skip local, go straight to cloud API
     if cli.cloud {
         let c =
-            cloud_client.ok_or("--cloud requires WEBCLAW_API_KEY (set via env or --api-key)")?;
+            cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?;
         let options = build_extraction_options(cli);
         let format_str = match cli.format {
             OutputFormat::Markdown => "markdown",

@@ -1349,7 +1349,7 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
     let client =
         FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
 
-    let entries = webclaw_fetch::sitemap::discover(&client, url)
+    let entries = noxa_fetch::sitemap::discover(&client, url)
         .await
         .map_err(|e| format!("sitemap discovery failed: {e}"))?;
 

@@ -1469,7 +1469,7 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
         let details = serde_json::to_string_pretty(payload).unwrap_or_default();
         serde_json::json!({
             "embeds": [{
-                "title": format!("webclaw: {event}"),
+                "title": format!("noxa: {event}"),
                 "description": format!("```json\n{details}\n```"),
                 "color": 5814783
             }]

@@ -1482,7 +1482,7 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
             .unwrap_or("notification");
         let details = serde_json::to_string_pretty(payload).unwrap_or_default();
         serde_json::json!({
-            "text": format!("*webclaw: {event}*\n```{details}```")
+            "text": format!("*noxa: {event}*\n```{details}```")
         })
         .to_string()
     } else {
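The two fire_webhook hunks rename the titles inside the Discord and Slack payloads. A runnable sketch of both payload shapes with serde_json::json!; the event name and details are placeholder data:

```rust
// Sketch: build the Discord-embed and Slack-text webhook bodies the CLI
// posts on completion. "crawl_complete" and the details value are
// placeholder data, not output from a real run.
use serde_json::json;

fn main() {
    let event = "crawl_complete";
    let details = serde_json::to_string_pretty(&json!({ "pages": 42 })).unwrap_or_default();

    // Discord expects an "embeds" array; the color is a decimal RGB value.
    let discord = json!({
        "embeds": [{
            "title": format!("noxa: {event}"),
            "description": format!("```json\n{details}\n```"),
            "color": 5814783
        }]
    });

    // Slack incoming webhooks take a flat "text" field with mrkdwn markup.
    let slack = json!({
        "text": format!("*noxa: {event}*\n```{details}```")
    });

    println!("{discord}\n{slack}");
}
```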
@@ -1575,7 +1575,7 @@ async fn run_watch_single(
         }
     };
 
-    let diff = webclaw_core::diff::diff(&previous, &current);
+    let diff = noxa_core::diff::diff(&previous, &current);
 
     if diff.status == ChangeStatus::Same {
         eprintln!("[watch] No changes ({})", timestamp());

@@ -1687,7 +1687,7 @@ async fn run_watch_multi(
                 match r.result {
                     Ok(current) => {
                         if let Some(previous) = snapshots.get(&r.url) {
-                            let diff = webclaw_core::diff::diff(previous, &current);
+                            let diff = noxa_core::diff::diff(previous, &current);
                             if diff.status == ChangeStatus::Same {
                                 same_count += 1;
                             } else {

@@ -1790,7 +1790,7 @@ async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
     // Extract current version (handles PDF detection for URLs)
     let new_result = fetch_and_extract(cli).await?.into_extraction()?;
 
-    let diff = webclaw_core::diff::diff(&old, &new_result);
+    let diff = noxa_core::diff::diff(&old, &new_result);
     print_diff_output(&diff, &cli.format);
 
     Ok(())

@@ -1799,7 +1799,7 @@ async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
 async fn run_brand(cli: &Cli) -> Result<(), String> {
     let result = fetch_html(cli).await?;
     let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await;
-    let brand = webclaw_core::brand::extract_brand(
+    let brand = noxa_core::brand::extract_brand(
         &enriched,
         Some(result.url.as_str()).filter(|s| !s.is_empty()),
     );

@@ -1815,7 +1815,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
     if let Some(ref name) = cli.llm_provider {
         match name.as_str() {
             "ollama" => {
-                let provider = webclaw_llm::providers::ollama::OllamaProvider::new(
+                let provider = noxa_llm::providers::ollama::OllamaProvider::new(
                     cli.llm_base_url.clone(),
                     cli.llm_model.clone(),
                 );

@@ -1825,7 +1825,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
                 Ok(Box::new(provider))
             }
             "openai" => {
-                let provider = webclaw_llm::providers::openai::OpenAiProvider::new(
+                let provider = noxa_llm::providers::openai::OpenAiProvider::new(
                     None,
                     cli.llm_base_url.clone(),
                     cli.llm_model.clone(),

@@ -1834,7 +1834,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
                 Ok(Box::new(provider))
             }
             "anthropic" => {
-                let provider = webclaw_llm::providers::anthropic::AnthropicProvider::new(
+                let provider = noxa_llm::providers::anthropic::AnthropicProvider::new(
                     None,
                     cli.llm_model.clone(),
                 )

@@ -1846,7 +1846,7 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
             )),
         }
     } else {
-        let chain = webclaw_llm::ProviderChain::default().await;
+        let chain = noxa_llm::ProviderChain::default().await;
         if chain.is_empty() {
             return Err(
                 "no LLM providers available -- start Ollama or set OPENAI_API_KEY / ANTHROPIC_API_KEY"

@@ -1876,7 +1876,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
         let schema: serde_json::Value =
             serde_json::from_str(&schema_str).map_err(|e| format!("invalid JSON schema: {e}"))?;
 
-        let extracted = webclaw_llm::extract::extract_json(
+        let extracted = noxa_llm::extract::extract_json(
             &result.content.plain_text,
             &schema,
             provider.as_ref(),

@@ -1890,7 +1890,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
             serde_json::to_string_pretty(&extracted).expect("serialization failed")
         );
     } else if let Some(ref prompt) = cli.extract_prompt {
-        let extracted = webclaw_llm::extract::extract_with_prompt(
+        let extracted = noxa_llm::extract::extract_with_prompt(
             &result.content.plain_text,
             prompt,
             provider.as_ref(),

@@ -1904,7 +1904,7 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
             serde_json::to_string_pretty(&extracted).expect("serialization failed")
         );
     } else if let Some(sentences) = cli.summarize {
-        let summary = webclaw_llm::summarize::summarize(
+        let summary = noxa_llm::summarize::summarize(
             &result.content.plain_text,
             Some(sentences),
             provider.as_ref(),

@@ -1975,15 +1975,15 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
 
         // Run the appropriate LLM operation
         let llm_result = if let Some(ref schema) = schema {
-            webclaw_llm::extract::extract_json(text, schema, provider.as_ref(), model)
+            noxa_llm::extract::extract_json(text, schema, provider.as_ref(), model)
                 .await
                 .map(LlmOutput::Json)
         } else if let Some(ref prompt) = cli.extract_prompt {
-            webclaw_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
+            noxa_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
                 .await
                 .map(LlmOutput::Json)
         } else if let Some(sentences) = cli.summarize {
-            webclaw_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
+            noxa_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
                 .await
                 .map(LlmOutput::Text)
         } else {

@@ -2080,7 +2080,7 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
     let api_key = cli
         .api_key
         .as_deref()
-        .ok_or("--research requires WEBCLAW_API_KEY (set via env or --api-key)")?;
+        .ok_or("--research requires NOXA_API_KEY (set via env or --api-key)")?;
 
     let client = reqwest::Client::builder()
         .timeout(std::time::Duration::from_secs(600))

@@ -2099,7 +2099,7 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
 
     // Start job
     let resp = client
-        .post("https://api.webclaw.io/v1/research")
+        .post("https://api.noxa.io/v1/research")
         .header("Authorization", format!("Bearer {api_key}"))
         .json(&body)
         .send()

@@ -2122,7 +2122,7 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
         tokio::time::sleep(std::time::Duration::from_secs(3)).await;
 
         let status_resp = client
-            .get(format!("https://api.webclaw.io/v1/research/{job_id}"))
+            .get(format!("https://api.noxa.io/v1/research/{job_id}"))
             .header("Authorization", format!("Bearer {api_key}"))
             .send()
             .await
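The research hunks show the start-then-poll shape of the cloud research endpoint. A condensed sketch of that loop with reqwest and tokio; the "id"/"status" response fields and the "done" value are assumptions for illustration:

```rust
// Sketch of the start-then-poll loop used by run_research. The endpoint
// paths mirror the diff; the "id"/"status" response fields and the
// "done" value are assumptions for illustration.
use std::time::Duration;

async fn poll_research(api_key: &str, query: &str) -> Result<serde_json::Value, String> {
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(600))
        .build()
        .map_err(|e| e.to_string())?;

    // Start the job.
    let job: serde_json::Value = client
        .post("https://api.noxa.io/v1/research")
        .header("Authorization", format!("Bearer {api_key}"))
        .json(&serde_json::json!({ "query": query }))
        .send()
        .await
        .map_err(|e| e.to_string())?
        .json()
        .await
        .map_err(|e| e.to_string())?;
    let job_id = job["id"].as_str().ok_or("missing job id")?.to_string();

    // Poll every 3 seconds until the job reports completion.
    loop {
        tokio::time::sleep(Duration::from_secs(3)).await;
        let status: serde_json::Value = client
            .get(format!("https://api.noxa.io/v1/research/{job_id}"))
            .header("Authorization", format!("Bearer {api_key}"))
            .send()
            .await
            .map_err(|e| e.to_string())?
            .json()
            .await
            .map_err(|e| e.to_string())?;
        if status["status"] == "done" {
            return Ok(status);
        }
    }
}

#[tokio::main]
async fn main() {
    // Only runs against the live API if a key is configured.
    if let Ok(key) = std::env::var("NOXA_API_KEY") {
        match poll_research(&key, "What changed in HTTP/3?").await {
            Ok(result) => println!("{result}"),
            Err(e) => eprintln!("research failed: {e}"),
        }
    } else {
        eprintln!("NOXA_API_KEY not set; skipping live call");
    }
}
```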
@@ -2448,7 +2448,7 @@ mod tests {
 
     #[test]
     fn write_to_file_creates_dirs() {
-        let dir = std::env::temp_dir().join("webclaw_test_output_dir");
+        let dir = std::env::temp_dir().join("noxa_test_output_dir");
         let _ = std::fs::remove_dir_all(&dir);
         write_to_file(&dir, "nested/deep/file.md", "hello").unwrap();
         let content = std::fs::read_to_string(dir.join("nested/deep/file.md")).unwrap();

@@ -1,5 +1,5 @@
 [package]
-name = "webclaw-core"
+name = "noxa-core"
 description = "Pure HTML content extraction engine for LLMs"
 version.workspace = true
 edition.workspace = true

@@ -1,6 +1,6 @@
 pub mod brand;
 pub(crate) mod data_island;
-/// webclaw-core: Pure HTML content extraction engine for LLMs.
+/// noxa-core: Pure HTML content extraction engine for LLMs.
 ///
 /// Takes raw HTML + optional URL, returns structured content
 /// (metadata, markdown, plain text, links, images, code blocks).

@@ -1,13 +1,13 @@
 [package]
-name = "webclaw-fetch"
+name = "noxa-fetch"
 description = "HTTP client with browser TLS fingerprint impersonation via wreq"
 version.workspace = true
 edition.workspace = true
 license.workspace = true
 
 [dependencies]
-webclaw-core = { workspace = true }
-webclaw-pdf = { path = "../webclaw-pdf" }
+noxa-core = { workspace = true }
+noxa-pdf = { path = "../noxa-pdf" }
 serde = { workspace = true }
 thiserror = { workspace = true }
 tracing = { workspace = true }

@@ -1,5 +1,5 @@
 //! Browser fingerprint selection and rotation.
-//! Maps our BrowserProfile enum to webclaw-http client builder methods.
+//! Maps our BrowserProfile enum to noxa-http client builder methods.
 
 /// Which browser identity to present at the TLS/HTTP layer.
 #[derive(Debug, Clone, Default)]

@@ -11,7 +11,7 @@ pub enum BrowserProfile {
     Random,
 }
 
-/// A browser variant for building webclaw-http clients.
+/// A browser variant for building noxa-http clients.
 #[derive(Debug, Clone, Copy)]
 pub enum BrowserVariant {
     Chrome,

@@ -1,7 +1,7 @@
 /// HTTP client with browser TLS fingerprint impersonation.
 /// Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
 /// Supports single and batch operations with proxy rotation.
-/// Automatically detects PDF responses and extracts text via webclaw-pdf.
+/// Automatically detects PDF responses and extracts text via noxa-pdf.
 ///
 /// Two proxy modes:
 /// - **Static**: single proxy (or none) baked into pre-built clients at construction.

@@ -15,7 +15,7 @@ use std::time::{Duration, Instant};
 use rand::seq::SliceRandom;
 use tokio::sync::Semaphore;
 use tracing::{debug, instrument, warn};
-use webclaw_pdf::PdfMode;
+use noxa_pdf::PdfMode;
 
 use crate::browser::{self, BrowserProfile, BrowserVariant};
 use crate::error::FetchError;

@@ -75,11 +75,11 @@ pub struct BatchResult {
 #[derive(Debug)]
 pub struct BatchExtractResult {
     pub url: String,
-    pub result: Result<webclaw_core::ExtractionResult, FetchError>,
+    pub result: Result<noxa_core::ExtractionResult, FetchError>,
 }
 
 /// Buffered response that owns its body. Provides the same sync API
-/// that webclaw-http::Response used to provide.
+/// that noxa-http::Response used to provide.
 struct Response {
     status: u16,
     url: String,

@@ -268,8 +268,8 @@ impl FetchClient {
     pub async fn fetch_and_extract(
         &self,
         url: &str,
-    ) -> Result<webclaw_core::ExtractionResult, FetchError> {
-        self.fetch_and_extract_with_options(url, &webclaw_core::ExtractionOptions::default())
+    ) -> Result<noxa_core::ExtractionResult, FetchError> {
+        self.fetch_and_extract_with_options(url, &noxa_core::ExtractionOptions::default())
             .await
     }
 

@@ -278,8 +278,8 @@ impl FetchClient {
     pub async fn fetch_and_extract_with_options(
         &self,
         url: &str,
-        options: &webclaw_core::ExtractionOptions,
-    ) -> Result<webclaw_core::ExtractionResult, FetchError> {
+        options: &noxa_core::ExtractionOptions,
+    ) -> Result<noxa_core::ExtractionResult, FetchError> {
         // Reddit fallback: use their JSON API to get post + full comment tree.
         if crate::reddit::is_reddit_url(url) {
             let json_url = crate::reddit::json_url(url);

@@ -334,7 +334,7 @@ impl FetchClient {
                 "PDF fetch complete"
             );
 
-            let pdf_result = webclaw_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
+            let pdf_result = noxa_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
             Ok(pdf_to_extraction_result(&pdf_result, &final_url))
         } else if let Some(doc_type) =
             crate::document::is_document_content_type(&headers, &final_url)

@@ -369,7 +369,7 @@ impl FetchClient {
             debug!("linkedin extraction failed, falling back to standard");
         }
 
-        let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
+        let extraction = noxa_core::extract_with_options(&html, Some(&final_url), options)?;
 
         Ok(extraction)
     }

@@ -408,7 +408,7 @@ impl FetchClient {
         self.fetch_and_extract_batch_with_options(
             urls,
             concurrency,
-            &webclaw_core::ExtractionOptions::default(),
+            &noxa_core::ExtractionOptions::default(),
         )
         .await
     }

@@ -418,7 +418,7 @@ impl FetchClient {
         self: &Arc<Self>,
         urls: &[&str],
         concurrency: usize,
-        options: &webclaw_core::ExtractionOptions,
+        options: &noxa_core::ExtractionOptions,
     ) -> Vec<BatchExtractResult> {
         let semaphore = Arc::new(Semaphore::new(concurrency));
         let mut handles = Vec::with_capacity(urls.len());

@@ -572,16 +572,16 @@ fn extract_homepage(url: &str) -> Option<String> {
         .map(|u| format!("{}://{}/", u.scheme(), u.host_str().unwrap_or("")))
 }
 
-/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
+/// Convert a noxa-pdf PdfResult into a noxa-core ExtractionResult.
 fn pdf_to_extraction_result(
-    pdf: &webclaw_pdf::PdfResult,
+    pdf: &noxa_pdf::PdfResult,
     url: &str,
-) -> webclaw_core::ExtractionResult {
-    let markdown = webclaw_pdf::to_markdown(pdf);
+) -> noxa_core::ExtractionResult {
+    let markdown = noxa_pdf::to_markdown(pdf);
     let word_count = markdown.split_whitespace().count();
 
-    webclaw_core::ExtractionResult {
-        metadata: webclaw_core::Metadata {
+    noxa_core::ExtractionResult {
+        metadata: noxa_core::Metadata {
             title: pdf.metadata.title.clone(),
             description: pdf.metadata.subject.clone(),
             author: pdf.metadata.author.clone(),

@@ -593,7 +593,7 @@ fn pdf_to_extraction_result(
             favicon: None,
             word_count,
         },
-        content: webclaw_core::Content {
+        content: noxa_core::Content {
             markdown,
             plain_text: pdf.text.clone(),
             links: Vec::new(),

@@ -713,10 +713,10 @@ mod tests {
 
     #[test]
     fn test_pdf_to_extraction_result() {
-        let pdf = webclaw_pdf::PdfResult {
+        let pdf = noxa_pdf::PdfResult {
             text: "Hello from PDF.".into(),
             page_count: 2,
-            metadata: webclaw_pdf::PdfMetadata {
+            metadata: noxa_pdf::PdfMetadata {
                 title: Some("My Doc".into()),
                 author: Some("Author".into()),
                 subject: Some("Testing".into()),

@@ -91,7 +91,7 @@ pub struct CrawlResult {
 pub struct PageResult {
     pub url: String,
     pub depth: usize,
-    pub extraction: Option<webclaw_core::ExtractionResult>,
+    pub extraction: Option<noxa_core::ExtractionResult>,
     pub error: Option<String>,
     #[serde(skip)]
     pub elapsed: Duration,

@@ -81,7 +81,7 @@ pub fn is_document_content_type(headers: &http::HeaderMap, url: &str) -> Option<
 pub fn extract_document(
     bytes: &[u8],
     doc_type: DocType,
-) -> Result<webclaw_core::ExtractionResult, FetchError> {
+) -> Result<noxa_core::ExtractionResult, FetchError> {
     debug!(
         doc_type = doc_type.label(),
         bytes = bytes.len(),

@@ -98,8 +98,8 @@ pub fn extract_document(
     let plain_text = strip_markdown_formatting(&markdown);
     let word_count = plain_text.split_whitespace().count();
 
-    Ok(webclaw_core::ExtractionResult {
-        metadata: webclaw_core::Metadata {
+    Ok(noxa_core::ExtractionResult {
+        metadata: noxa_core::Metadata {
             title: None,
             description: None,
             author: None,

@@ -111,7 +111,7 @@ pub fn extract_document(
             favicon: None,
             word_count,
         },
-        content: webclaw_core::Content {
+        content: noxa_core::Content {
             markdown,
             plain_text,
             links: Vec::new(),

@@ -14,10 +14,10 @@ pub enum FetchError {
     BodyDecode(String),
 
     #[error("extraction failed: {0}")]
-    Extraction(#[from] webclaw_core::ExtractError),
+    Extraction(#[from] noxa_core::ExtractError),
 
     #[error("PDF extraction failed: {0}")]
-    Pdf(#[from] webclaw_pdf::PdfError),
+    Pdf(#[from] noxa_pdf::PdfError),
 
     #[error("client build failed: {0}")]
     Build(String),
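The #[from] attributes renamed here are what let the ? operator convert the sibling crates' errors into FetchError. A self-contained sketch of that thiserror pattern; InnerError is a stand-in for noxa_core::ExtractError / noxa_pdf::PdfError:

```rust
// Sketch of the thiserror #[from] pattern used by FetchError: the ?
// operator auto-converts the inner error into the outer enum. InnerError
// is a stand-in for the real noxa_core / noxa_pdf error types.
use thiserror::Error;

#[derive(Debug, Error)]
#[error("inner failure: {0}")]
struct InnerError(String);

#[derive(Debug, Error)]
enum FetchError {
    #[error("extraction failed: {0}")]
    Extraction(#[from] InnerError),

    #[error("client build failed: {0}")]
    Build(String),
}

fn extract() -> Result<(), InnerError> {
    Err(InnerError("bad html".into()))
}

fn fetch() -> Result<(), FetchError> {
    extract()?; // InnerError -> FetchError::Extraction via #[from]
    Ok(())
}

fn main() {
    // Prints: extraction failed: inner failure: bad html
    println!("{}", fetch().unwrap_err());
}
```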
@@ -1,6 +1,6 @@
-//! webclaw-fetch: HTTP client layer with browser TLS fingerprint impersonation.
+//! noxa-fetch: HTTP client layer with browser TLS fingerprint impersonation.
 //! Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
-//! Automatically detects PDF responses and delegates to webclaw-pdf.
+//! Automatically detects PDF responses and delegates to noxa-pdf.
 pub mod browser;
 pub mod client;
 pub mod crawler;

@@ -19,4 +19,4 @@ pub use error::FetchError;
 pub use http::HeaderMap;
 pub use proxy::{parse_proxy_file, parse_proxy_line};
 pub use sitemap::SitemapEntry;
-pub use webclaw_pdf::PdfMode;
+pub use noxa_pdf::PdfMode;

@@ -5,7 +5,7 @@
 /// Profile, etc. We parse these to reconstruct post + comments as markdown.
 use serde_json::Value;
 use tracing::debug;
-use webclaw_core::{Content, ExtractionResult, Metadata};
+use noxa_core::{Content, ExtractionResult, Metadata};
 
 /// Check if a URL is a LinkedIn post/activity.
 pub fn is_linkedin_post(url: &str) -> bool {

@@ -5,7 +5,7 @@
 /// comment tree as structured JSON, which we convert to clean markdown.
 use serde::Deserialize;
 use tracing::debug;
-use webclaw_core::{Content, ExtractionResult, Metadata};
+use noxa_core::{Content, ExtractionResult, Metadata};
 
 /// Check if a URL points to a Reddit post/comment page.
 pub fn is_reddit_url(url: &str) -> bool {

@@ -1,6 +1,6 @@
 //! Browser TLS + HTTP/2 fingerprint profiles built on wreq (BoringSSL).
 //!
-//! Replaces the old webclaw-http/webclaw-tls patched rustls stack.
+//! Replaces the old noxa-http/noxa-tls patched rustls stack.
 //! Each profile configures TLS options (cipher suites, curves, extensions,
 //! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
 //! stream dependency, priorities) to match real browser fingerprints.

@@ -1,6 +1,6 @@
 [package]
-name = "webclaw-llm"
-description = "LLM integration for webclaw — local-first hybrid architecture (Ollama -> OpenAI -> Anthropic)"
+name = "noxa-llm"
+description = "LLM integration for noxa — local-first hybrid architecture (Ollama -> OpenAI -> Anthropic)"
 version.workspace = true
 edition.workspace = true
 license.workspace = true

@@ -1,8 +1,8 @@
-/// webclaw-llm: LLM integration with local-first hybrid architecture.
+/// noxa-llm: LLM integration with local-first hybrid architecture.
 ///
 /// Provider chain tries Ollama (local) first, falls back to OpenAI, then Anthropic.
 /// Provides schema-based extraction, prompt extraction, and summarization
-/// on top of webclaw-core's content pipeline.
+/// on top of noxa-core's content pipeline.
 pub mod chain;
 pub mod clean;
 pub mod error;
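The crate doc above names the local-first chain: Ollama first, then OpenAI, then Anthropic. A generic sketch of that fallback-chain idea over trait objects; the Provider trait and both impls are stand-ins, not the crate's real API:

```rust
// Generic sketch of a local-first provider chain: try each provider in
// order and return the first success. Provider and both impls are
// stand-ins for noxa-llm's real types.
trait Provider {
    fn name(&self) -> &str;
    fn complete(&self, prompt: &str) -> Result<String, String>;
}

struct Local;  // e.g. Ollama: tried first, no API key needed
struct Remote; // e.g. OpenAI/Anthropic: fallback when local fails

impl Provider for Local {
    fn name(&self) -> &str { "local" }
    fn complete(&self, _p: &str) -> Result<String, String> {
        Err("ollama not running".into())
    }
}

impl Provider for Remote {
    fn name(&self) -> &str { "remote" }
    fn complete(&self, p: &str) -> Result<String, String> {
        Ok(format!("remote answer to: {p}"))
    }
}

fn chain_complete(chain: &[Box<dyn Provider>], prompt: &str) -> Result<String, String> {
    for p in chain {
        match p.complete(prompt) {
            Ok(out) => return Ok(out),
            Err(e) => eprintln!("{} failed: {e}, trying next", p.name()),
        }
    }
    Err("no LLM providers available".into())
}

fn main() {
    let chain: Vec<Box<dyn Provider>> = vec![Box::new(Local), Box::new(Remote)];
    println!("{}", chain_complete(&chain, "summarize this").unwrap());
}
```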
@@ -151,7 +151,7 @@ mod tests {
 
     // Env var fallback tests mutate process-global state and race with parallel tests.
     // The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
-    // cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
+    // cargo test -p noxa-llm env_var -- --ignored --test-threads=1
     #[test]
     #[ignore = "mutates process env; run with --test-threads=1"]
     fn env_var_key_fallback() {

@@ -29,7 +29,7 @@ mod tests {
     #[test]
     fn none_override_with_no_env_returns_none() {
         assert_eq!(
-            load_api_key(None, "WEBCLAW_TEST_NONEXISTENT_KEY_12345"),
+            load_api_key(None, "NOXA_TEST_NONEXISTENT_KEY_12345"),
             None
         );
     }

@@ -140,7 +140,7 @@ mod tests {
 
     // Env var fallback is a trivial `env::var().ok()` -- not worth the flakiness
     // of manipulating process-global state. Run in isolation if needed:
-    // cargo test -p webclaw-llm env_var_fallback -- --ignored --test-threads=1
+    // cargo test -p noxa-llm env_var_fallback -- --ignored --test-threads=1
     #[test]
     #[ignore = "mutates process env; run with --test-threads=1"]
     fn env_var_fallback() {

@@ -162,7 +162,7 @@ mod tests {
 
     // Env var fallback tests mutate process-global state and race with parallel tests.
     // The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
-    // cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
+    // cargo test -p noxa-llm env_var -- --ignored --test-threads=1
     #[test]
     #[ignore = "mutates process env; run with --test-threads=1"]
     fn env_var_key_fallback() {
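These test hunks repeat one convention: tests that mutate process-global env vars are #[ignore]d and run on demand with --test-threads=1. A minimal sketch of the convention; DEMO_KEY_FOR_TEST is a placeholder name, and under the 2024 edition set_var/remove_var additionally require unsafe blocks:

```rust
// Sketch of the ignored env-var test convention referenced above.
// Run explicitly with: cargo test env_var -- --ignored --test-threads=1
// DEMO_KEY_FOR_TEST is a placeholder; under edition 2024, set_var and
// remove_var are unsafe and need unsafe blocks.
#[cfg(test)]
mod tests {
    #[test]
    #[ignore = "mutates process env; run with --test-threads=1"]
    fn env_var_key_fallback() {
        // Env mutation is process-global, which is exactly why this test
        // is excluded from the parallel default run.
        std::env::set_var("DEMO_KEY_FOR_TEST", "sk-demo");
        assert_eq!(
            std::env::var("DEMO_KEY_FOR_TEST").ok().as_deref(),
            Some("sk-demo")
        );
        std::env::remove_var("DEMO_KEY_FOR_TEST");
    }
}
```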
@@ -1,4 +1,4 @@
-/// Shared test utilities for webclaw-llm.
+/// Shared test utilities for noxa-llm.
 ///
 /// Provides a configurable mock LLM provider for unit tests across
 /// extract, chain, and other modules that need a fake LLM backend.

@@ -1,19 +1,19 @@
 [package]
-name = "webclaw-mcp"
-description = "MCP server for webclaw web extraction toolkit"
+name = "noxa-mcp"
+description = "MCP server for noxa web extraction toolkit"
 version.workspace = true
 edition.workspace = true
 license.workspace = true
 
 [[bin]]
-name = "webclaw-mcp"
+name = "noxa-mcp"
 path = "src/main.rs"
 
 [dependencies]
-webclaw-core = { workspace = true }
-webclaw-fetch = { workspace = true }
-webclaw-llm = { workspace = true }
-webclaw-pdf = { workspace = true }
+noxa-core = { workspace = true }
+noxa-fetch = { workspace = true }
+noxa-llm = { workspace = true }
+noxa-pdf = { workspace = true }
 rmcp = { version = "1.2", features = ["server", "macros", "transport-io", "schemars"] }
 schemars = "1.0"
 dotenvy = { workspace = true }

@@ -1,25 +1,26 @@
 /// Cloud API fallback for protected sites.
 ///
 /// When local fetch returns a challenge page, this module retries
-/// via api.webclaw.io. Requires WEBCLAW_API_KEY to be set.
+/// via api.noxa.io. Requires NOXA_API_KEY to be set.
 use std::time::Duration;
 
 use serde_json::{Value, json};
 use tracing::info;
 
-const API_BASE: &str = "https://api.webclaw.io/v1";
-
-/// Lightweight client for the webclaw cloud API.
+const API_BASE: &str = "https://api.noxa.io/v1";
+
+/// Lightweight client for the noxa cloud API.
 pub struct CloudClient {
     api_key: String,
     http: reqwest::Client,
 }
 
 impl CloudClient {
-    /// Create a new cloud client from WEBCLAW_API_KEY env var.
+    /// Create a new cloud client from NOXA_API_KEY env var.
     /// Returns None if the key is not set.
     pub fn from_env() -> Option<Self> {
-        let key = std::env::var("WEBCLAW_API_KEY").ok()?;
+        let key = std::env::var("NOXA_API_KEY").ok()?;
         if key.is_empty() {
             return None;
         }

@@ -114,7 +115,7 @@ fn truncate_error(text: &str) -> &str {
 
 /// Check if fetched HTML looks like a bot protection challenge page.
 /// Detects common bot protection challenge pages.
-pub fn is_bot_protected(html: &str, headers: &webclaw_fetch::HeaderMap) -> bool {
+pub fn is_bot_protected(html: &str, headers: &noxa_fetch::HeaderMap) -> bool {
     let html_lower = html.to_lowercase();
 
     // Cloudflare challenge page

@@ -199,7 +200,7 @@ pub fn needs_js_rendering(word_count: usize, html: &str) -> bool {
 /// Result of a smart fetch: either local extraction or cloud API response.
 pub enum SmartFetchResult {
     /// Successfully extracted locally.
-    Local(Box<webclaw_core::ExtractionResult>),
+    Local(Box<noxa_core::ExtractionResult>),
     /// Fell back to cloud API. Contains the API response JSON.
     Cloud(Value),
 }
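The hunk context names needs_js_rendering(word_count, html), the heuristic that flags SPA shells where large HTML yields almost no extracted text. The diff does not show its body; a sketch of the idea with invented thresholds:

```rust
// Sketch of a "JS rendering needed" heuristic in the spirit of
// needs_js_rendering(word_count, html): big HTML, tiny extracted text.
// The 50-word and 50 KiB thresholds are invented for illustration.
fn needs_js_rendering(word_count: usize, html: &str) -> bool {
    word_count < 50 && html.len() > 50 * 1024
}

fn main() {
    let spa_shell = "<div id=\"root\"></div>".repeat(5000); // ~100 KiB, no text
    assert!(needs_js_rendering(3, &spa_shell));
    assert!(!needs_js_rendering(800, "<html>long article...</html>"));
}
```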
@@ -210,7 +211,7 @@ pub enum SmartFetchResult {
 /// If no API key is configured and local fetch is blocked, returns an error
 /// with a helpful message.
 pub async fn smart_fetch(
-    client: &webclaw_fetch::FetchClient,
+    client: &noxa_fetch::FetchClient,
     cloud: Option<&CloudClient>,
     url: &str,
     include_selectors: &[String],

@@ -239,7 +240,7 @@ pub async fn smart_fetch(
     }
 
     // Step 3: Extract locally
-    let options = webclaw_core::ExtractionOptions {
+    let options = noxa_core::ExtractionOptions {
         include_selectors: include_selectors.to_vec(),
         exclude_selectors: exclude_selectors.to_vec(),
         only_main_content,

@@ -247,7 +248,7 @@ pub async fn smart_fetch(
     };
 
     let extraction =
-        webclaw_core::extract_with_options(&fetch_result.html, Some(&fetch_result.url), &options)
+        noxa_core::extract_with_options(&fetch_result.html, Some(&fetch_result.url), &options)
             .map_err(|e| format!("Extraction failed: {e}"))?;
 
     // Step 4: Check for JS-rendered pages (low content from large HTML)

@@ -295,8 +296,8 @@ async fn cloud_fallback(
             Ok(SmartFetchResult::Cloud(resp))
         }
         None => Err(format!(
-            "Bot protection detected on {url}. Set WEBCLAW_API_KEY for automatic cloud bypass. \
-             Get a key at https://webclaw.io"
+            "Bot protection detected on {url}. Set NOXA_API_KEY for automatic cloud bypass. \
+             Get a key at https://noxa.io"
         )),
     }
 }
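Taken together, the cloud.rs hunks describe smart_fetch's local-first flow: extract locally, and only retry through the cloud API when bot protection (or JS rendering) is detected and a key is configured. A condensed sketch with stand-in types; the detector and cloud call are simplified placeholders:

```rust
// Condensed sketch of the smart_fetch decision flow: local extraction
// first, cloud fallback only when blocked. All types here are stand-ins
// for the real noxa-fetch/noxa-core/CloudClient plumbing.
enum SmartFetchResult {
    Local(String), // extracted content
    Cloud(String), // cloud API response
}

fn is_bot_protected(html: &str) -> bool {
    html.to_lowercase().contains("checking your browser")
}

fn smart_fetch(html: &str, cloud_key: Option<&str>, url: &str) -> Result<SmartFetchResult, String> {
    if is_bot_protected(html) {
        return match cloud_key {
            // With a key: retry via the cloud API (stubbed here).
            Some(_) => Ok(SmartFetchResult::Cloud(format!("cloud result for {url}"))),
            // Without a key: fail with the actionable message from the diff.
            None => Err(format!(
                "Bot protection detected on {url}. Set NOXA_API_KEY for automatic cloud bypass."
            )),
        };
    }
    Ok(SmartFetchResult::Local(format!("extracted {} bytes locally", html.len())))
}

fn main() {
    let blocked = "<title>Checking your browser</title>";
    assert!(smart_fetch(blocked, None, "https://example.com").is_err());
    assert!(matches!(
        smart_fetch(blocked, Some("key"), "https://example.com"),
        Ok(SmartFetchResult::Cloud(_))
    ));
    assert!(matches!(
        smart_fetch("<p>hello</p>", None, "https://example.com"),
        Ok(SmartFetchResult::Local(_))
    ));
}
```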
@@ -1,4 +1,4 @@
-/// webclaw-mcp: MCP (Model Context Protocol) server for webclaw.
+/// noxa-mcp: MCP (Model Context Protocol) server for noxa.
 /// Exposes web extraction tools over stdio transport for AI agents
 /// like Claude Desktop, Claude Code, and other MCP clients.
 mod cloud;

@@ -8,7 +8,7 @@ mod tools;
 use rmcp::ServiceExt;
 use rmcp::transport::stdio;
 
-use server::WebclawMcp;
+use server::NoxaMcp;
 
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {

@@ -21,7 +21,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         .with_ansi(false)
         .init();
 
-    let service = WebclawMcp::new().await.serve(stdio()).await?;
+    let service = NoxaMcp::new().await.serve(stdio()).await?;
 
     service.waiting().await?;
     Ok(())

@@ -1,9 +1,9 @@
-/// MCP server implementation for webclaw.
+/// MCP server implementation for noxa.
 /// Exposes web extraction capabilities as tools for AI agents.
 ///
 /// Uses a local-first architecture: fetches pages directly, then falls back
-/// to the webclaw cloud API (api.webclaw.io) when bot protection or
-/// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback.
+/// to the noxa cloud API (api.noxa.io) when bot protection or
+/// JS rendering is detected. Set NOXA_API_KEY for automatic fallback.
 use std::sync::Arc;
 use std::time::Duration;
 

@@ -18,19 +18,19 @@ use url::Url;
 use crate::cloud::{self, CloudClient, SmartFetchResult};
 use crate::tools::*;
 
-pub struct WebclawMcp {
+pub struct NoxaMcp {
     tool_router: ToolRouter<Self>,
-    fetch_client: Arc<webclaw_fetch::FetchClient>,
-    llm_chain: Option<webclaw_llm::ProviderChain>,
+    fetch_client: Arc<noxa_fetch::FetchClient>,
+    llm_chain: Option<noxa_llm::ProviderChain>,
     cloud: Option<CloudClient>,
 }
 
 /// Parse a browser string into a BrowserProfile.
-fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
+fn parse_browser(browser: Option<&str>) -> noxa_fetch::BrowserProfile {
     match browser {
-        Some("firefox") => webclaw_fetch::BrowserProfile::Firefox,
-        Some("random") => webclaw_fetch::BrowserProfile::Random,
-        _ => webclaw_fetch::BrowserProfile::Chrome,
+        Some("firefox") => noxa_fetch::BrowserProfile::Firefox,
+        Some("random") => noxa_fetch::BrowserProfile::Random,
+        _ => noxa_fetch::BrowserProfile::Chrome,
     }
 }
 

@@ -58,28 +58,28 @@ const LOCAL_FETCH_TIMEOUT: Duration = Duration::from_secs(30);
 const RESEARCH_MAX_POLLS: u32 = 200;
 
 #[tool_router]
-impl WebclawMcp {
+impl NoxaMcp {
     pub async fn new() -> Self {
-        let mut config = webclaw_fetch::FetchConfig::default();
+        let mut config = noxa_fetch::FetchConfig::default();
 
         // Load proxy config from env vars or local file
-        if let Ok(proxy) = std::env::var("WEBCLAW_PROXY") {
-            info!("using single proxy from WEBCLAW_PROXY");
+        if let Ok(proxy) = std::env::var("NOXA_PROXY") {
+            info!("using single proxy from NOXA_PROXY");
             config.proxy = Some(proxy);
         }
 
-        let proxy_file = std::env::var("WEBCLAW_PROXY_FILE")
+        let proxy_file = std::env::var("NOXA_PROXY_FILE")
             .ok()
             .unwrap_or_else(|| "proxies.txt".to_string());
         if std::path::Path::new(&proxy_file).exists()
-            && let Ok(pool) = webclaw_fetch::parse_proxy_file(&proxy_file)
+            && let Ok(pool) = noxa_fetch::parse_proxy_file(&proxy_file)
            && !pool.is_empty()
         {
            info!(count = pool.len(), file = %proxy_file, "loaded proxy pool");
            config.proxy_pool = pool;
        }
 
-        let fetch_client = match webclaw_fetch::FetchClient::new(config) {
+        let fetch_client = match noxa_fetch::FetchClient::new(config) {
             Ok(client) => client,
             Err(e) => {
                 error!("failed to build FetchClient: {e}");

@@ -87,7 +87,7 @@ impl WebclawMcp {
             }
         };
 
-        let chain = webclaw_llm::ProviderChain::default().await;
+        let chain = noxa_llm::ProviderChain::default().await;
         let llm_chain = if chain.is_empty() {
             warn!("no LLM providers available -- extract/summarize tools will fail");
             None

@@ -98,11 +98,11 @@ impl WebclawMcp {
 
         let cloud = CloudClient::from_env();
         if cloud.is_some() {
-            info!("cloud API fallback enabled (WEBCLAW_API_KEY set)");
+            info!("cloud API fallback enabled (NOXA_API_KEY set)");
         } else {
             warn!(
-                "WEBCLAW_API_KEY not set -- bot-protected sites will return challenge pages. \
-                 Get a key at https://webclaw.io"
+                "NOXA_API_KEY not set -- bot-protected sites will return challenge pages. \
+                 Get a key at https://noxa.io"
             );
         }
 

@@ -129,7 +129,7 @@ impl WebclawMcp {
     }
 
     /// Scrape a single URL and extract its content as markdown, LLM-optimized text, plain text, or full JSON.
-    /// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected.
+    /// Automatically falls back to the noxa cloud API when bot protection or JS rendering is detected.
     #[tool]
     async fn scrape(&self, Parameters(params): Parameters<ScrapeParams>) -> Result<String, String> {
         validate_url(&params.url)?;

@@ -147,21 +147,21 @@ impl WebclawMcp {
             .map(|c| c.join("; "));
 
         // Use a custom client if non-default browser or cookies are provided
-        let is_default_browser = matches!(browser, webclaw_fetch::BrowserProfile::Chrome);
+        let is_default_browser = matches!(browser, noxa_fetch::BrowserProfile::Chrome);
         let needs_custom = !is_default_browser || cookie_header.is_some();
         let custom_client;
-        let client: &webclaw_fetch::FetchClient = if needs_custom {
+        let client: &noxa_fetch::FetchClient = if needs_custom {
             let mut headers = std::collections::HashMap::new();
             headers.insert("Accept-Language".to_string(), "en-US,en;q=0.9".to_string());
             if let Some(ref cookies) = cookie_header {
                 headers.insert("Cookie".to_string(), cookies.clone());
             }
-            let config = webclaw_fetch::FetchConfig {
+            let config = noxa_fetch::FetchConfig {
                 browser,
                 headers,
                 ..Default::default()
             };
-            custom_client = webclaw_fetch::FetchClient::new(config)
+            custom_client = noxa_fetch::FetchClient::new(config)
                 .map_err(|e| format!("Failed to build client: {e}"))?;
             &custom_client
         } else {
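The scrape hunk shows cookies joined into a single Cookie header before a custom client is built. A std-only sketch of that header assembly; the cookie values are placeholders:

```rust
// Sketch of the cookie-header assembly from the scrape tool: a list of
// "name=value" cookies is joined with "; " into one Cookie header.
// The cookie values are placeholder data.
use std::collections::HashMap;

fn main() {
    let cookies: Option<Vec<String>> =
        Some(vec!["session=abc123".to_string(), "theme=dark".to_string()]);
    let cookie_header = cookies.map(|c| c.join("; "));

    let mut headers = HashMap::new();
    headers.insert("Accept-Language".to_string(), "en-US,en;q=0.9".to_string());
    if let Some(ref cookies) = cookie_header {
        headers.insert("Cookie".to_string(), cookies.clone());
    }
    assert_eq!(headers["Cookie"], "session=abc123; theme=dark");
}
```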
@@ -183,7 +183,7 @@ impl WebclawMcp {
         match result {
             SmartFetchResult::Local(extraction) => {
                 let output = match format {
-                    "llm" => webclaw_core::to_llm_text(&extraction, Some(&params.url)),
+                    "llm" => noxa_core::to_llm_text(&extraction, Some(&params.url)),
                     "text" => extraction.content.plain_text,
                     "json" => serde_json::to_string_pretty(&extraction).unwrap_or_default(),
                     _ => extraction.content.markdown,

@@ -221,7 +221,7 @@ impl WebclawMcp {
 
         let format = params.format.as_deref().unwrap_or("markdown");
 
-        let config = webclaw_fetch::CrawlConfig {
+        let config = noxa_fetch::CrawlConfig {
             max_depth: params.depth.unwrap_or(2) as usize,
             max_pages: params.max_pages.unwrap_or(50),
             concurrency: params.concurrency.unwrap_or(5),

@@ -229,7 +229,7 @@ impl WebclawMcp {
             ..Default::default()
         };
 
-        let crawler = webclaw_fetch::Crawler::new(&params.url, config)
+        let crawler = noxa_fetch::Crawler::new(&params.url, config)
             .map_err(|e| format!("Crawler init failed: {e}"))?;
 
         let result = crawler.crawl(&params.url, None).await;

@@ -243,7 +243,7 @@ impl WebclawMcp {
             output.push_str(&format!("--- {} (depth {}) ---\n", page.url, page.depth));
             if let Some(ref extraction) = page.extraction {
                 let content = match format {
-                    "llm" => webclaw_core::to_llm_text(extraction, Some(&page.url)),
+                    "llm" => noxa_core::to_llm_text(extraction, Some(&page.url)),
                     "text" => extraction.content.plain_text.clone(),
                     _ => extraction.content.markdown.clone(),
                 };

@@ -261,7 +261,7 @@ impl WebclawMcp {
     #[tool]
     async fn map(&self, Parameters(params): Parameters<MapParams>) -> Result<String, String> {
         validate_url(&params.url)?;
-        let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, &params.url)
+        let entries = noxa_fetch::sitemap::discover(&self.fetch_client, &params.url)
             .await
             .map_err(|e| format!("Sitemap discovery failed: {e}"))?;
 

@@ -302,7 +302,7 @@ impl WebclawMcp {
             match &r.result {
                 Ok(extraction) => {
                     let content = match format {
-                        "llm" => webclaw_core::to_llm_text(extraction, Some(&r.url)),
+                        "llm" => noxa_core::to_llm_text(extraction, Some(&r.url)),
                         "text" => extraction.content.plain_text.clone(),
                         _ => extraction.content.markdown.clone(),
                     };

@@ -319,7 +319,7 @@ impl WebclawMcp {
     }
 
     /// Extract structured data from a web page using an LLM. Provide either a JSON schema or a natural language prompt.
-    /// Falls back to the webclaw cloud API when no local LLM is available or bot protection is detected.
+    /// Falls back to the noxa cloud API when no local LLM is available or bot protection is detected.
     #[tool]
     async fn extract(
         &self,

@@ -334,7 +334,7 @@ impl WebclawMcp {
         // No local LLM — fall back to cloud API directly
         if self.llm_chain.is_none() {
             let cloud = self.cloud.as_ref().ok_or(
-                "No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or WEBCLAW_API_KEY for cloud fallback.",
+                "No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
             )?;
             let mut body = json!({"url": params.url});
             if let Some(ref schema) = params.schema {

@@ -351,7 +351,7 @@ impl WebclawMcp {
 
         let llm_content = match self.smart_fetch_llm(&params.url).await? {
             SmartFetchResult::Local(extraction) => {
-                webclaw_core::to_llm_text(&extraction, Some(&params.url))
+                noxa_core::to_llm_text(&extraction, Some(&params.url))
             }
             SmartFetchResult::Cloud(resp) => resp
                 .get("llm")

@@ -362,12 +362,12 @@ impl WebclawMcp {
         };
 
         let data = if let Some(ref schema) = params.schema {
-            webclaw_llm::extract::extract_json(&llm_content, schema, chain, None)
+            noxa_llm::extract::extract_json(&llm_content, schema, chain, None)
                 .await
                 .map_err(|e| format!("LLM extraction failed: {e}"))?
         } else {
             let prompt = params.prompt.as_deref().unwrap();
-            webclaw_llm::extract::extract_with_prompt(&llm_content, prompt, chain, None)
+            noxa_llm::extract::extract_with_prompt(&llm_content, prompt, chain, None)
                 .await
                 .map_err(|e| format!("LLM extraction failed: {e}"))?
         };

@@ -376,7 +376,7 @@ impl WebclawMcp {
     }
 
     /// Summarize the content of a web page using an LLM.
-    /// Falls back to the webclaw cloud API when no local LLM is available or bot protection is detected.
+    /// Falls back to the noxa cloud API when no local LLM is available or bot protection is detected.
     #[tool]
     async fn summarize(
         &self,

@@ -387,7 +387,7 @@ impl WebclawMcp {
         // No local LLM — fall back to cloud API directly
         if self.llm_chain.is_none() {
             let cloud = self.cloud.as_ref().ok_or(
-                "No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or WEBCLAW_API_KEY for cloud fallback.",
+                "No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
             )?;
             let mut body = json!({"url": params.url});
             if let Some(sentences) = params.max_sentences {

@@ -405,7 +405,7 @@ impl WebclawMcp {
 
         let llm_content = match self.smart_fetch_llm(&params.url).await? {
             SmartFetchResult::Local(extraction) => {
-                webclaw_core::to_llm_text(&extraction, Some(&params.url))
+                noxa_core::to_llm_text(&extraction, Some(&params.url))
             }
             SmartFetchResult::Cloud(resp) => resp
                 .get("llm")

@@ -415,17 +415,17 @@ impl WebclawMcp {
                 .to_string(),
         };
 
-        webclaw_llm::summarize::summarize(&llm_content, params.max_sentences, chain, None)
+        noxa_llm::summarize::summarize(&llm_content, params.max_sentences, chain, None)
             .await
             .map_err(|e| format!("Summarization failed: {e}"))
     }
 
     /// Compare the current content of a URL against a previous extraction snapshot, showing what changed.
-    /// Automatically falls back to the webclaw cloud API when bot protection is detected.
+    /// Automatically falls back to the noxa cloud API when bot protection is detected.
     #[tool]
     async fn diff(&self, Parameters(params): Parameters<DiffParams>) -> Result<String, String> {
         validate_url(&params.url)?;
-        let previous: webclaw_core::ExtractionResult =
+        let previous: noxa_core::ExtractionResult =
             serde_json::from_str(&params.previous_snapshot)
                 .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
 

@@ -442,7 +442,7 @@ impl WebclawMcp {
 
         match result {
             SmartFetchResult::Local(current) => {
-                let content_diff = webclaw_core::diff::diff(&previous, &current);
+                let content_diff = noxa_core::diff::diff(&previous, &current);
                 Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default())
             }
             SmartFetchResult::Cloud(resp) => {

@@ -457,8 +457,8 @@ impl WebclawMcp {
                     );
                 }
 
-                let current = webclaw_core::ExtractionResult {
-                    content: webclaw_core::Content {
+                let current = noxa_core::ExtractionResult {
+                    content: noxa_core::Content {
                         markdown: markdown.to_string(),
                         plain_text: markdown.to_string(),
                         links: Vec::new(),

@@ -466,7 +466,7 @@ impl WebclawMcp {
                         code_blocks: Vec::new(),
                         raw_html: None,
                     },
-                    metadata: webclaw_core::Metadata {
+                    metadata: noxa_core::Metadata {
                         title: None,
                         description: None,
                         author: None,

@@ -482,14 +482,14 @@ impl WebclawMcp {
                     structured_data: Vec::new(),
                 };
 
-                let content_diff = webclaw_core::diff::diff(&previous, &current);
+                let content_diff = noxa_core::diff::diff(&previous, &current);
                 Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default())
             }
         }
     }
 
     /// Extract brand identity (colors, fonts, logo, favicon) from a website's HTML and CSS.
-    /// Automatically falls back to the webclaw cloud API when bot protection is detected.
+    /// Automatically falls back to the noxa cloud API when bot protection is detected.
     #[tool]
     async fn brand(&self, Parameters(params): Parameters<BrandParams>) -> Result<String, String> {
         validate_url(&params.url)?;

@@ -508,21 +508,21 @@ impl WebclawMcp {
                 return Ok(serde_json::to_string_pretty(&resp).unwrap_or_default());
             } else {
                 return Err(format!(
-                    "Bot protection detected on {}. Set WEBCLAW_API_KEY for automatic cloud bypass. \
-                     Get a key at https://webclaw.io",
+                    "Bot protection detected on {}. Set NOXA_API_KEY for automatic cloud bypass. \
+                     Get a key at https://noxa.io",
                     params.url
                 ));
             }
         }
 
         let identity =
-            webclaw_core::brand::extract_brand(&fetch_result.html, Some(&fetch_result.url));
+            noxa_core::brand::extract_brand(&fetch_result.html, Some(&fetch_result.url));
 
         Ok(serde_json::to_string_pretty(&identity).unwrap_or_default())
     }
 
-    /// Run a deep research investigation on a topic or question. Requires WEBCLAW_API_KEY.
-    /// Saves full result to ~/.webclaw/research/ and returns the file path + key findings.
+    /// Run a deep research investigation on a topic or question. Requires NOXA_API_KEY.
+    /// Saves full result to ~/.noxa/research/ and returns the file path + key findings.
     /// Checks cache first — same query returns the cached result without spending credits.
     #[tool]
     async fn research(

@@ -532,7 +532,7 @@ impl WebclawMcp {
         let cloud = self
             .cloud
             .as_ref()
-            .ok_or("Research requires WEBCLAW_API_KEY. Get a key at https://webclaw.io")?;
+            .ok_or("Research requires NOXA_API_KEY. Get a key at https://noxa.io")?;
 
         let research_dir = research_dir();
         let slug = slugify(&params.query);

@@ -622,17 +622,17 @@ impl WebclawMcp {
 
         Err(format!(
             "Research job {job_id} timed out after ~10 minutes of polling. \
-             Check status manually via the webclaw API: GET /v1/research/{job_id}"
+             Check status manually via the noxa API: GET /v1/research/{job_id}"
         ))
     }
 
-    /// Search the web for a query and return structured results. Requires WEBCLAW_API_KEY.
+    /// Search the web for a query and return structured results. Requires NOXA_API_KEY.
     #[tool]
     async fn search(&self, Parameters(params): Parameters<SearchParams>) -> Result<String, String> {
         let cloud = self
             .cloud
             .as_ref()
-            .ok_or("Search requires WEBCLAW_API_KEY. Get a key at https://webclaw.io")?;
+            .ok_or("Search requires NOXA_API_KEY. Get a key at https://noxa.io")?;
 
         let mut body = json!({ "query": params.query });
         if let Some(num) = params.num_results {

@@ -670,12 +670,12 @@ impl WebclawMcp {
 }
 
 #[tool_handler]
-impl ServerHandler for WebclawMcp {
+impl ServerHandler for NoxaMcp {
     fn get_info(&self) -> ServerInfo {
         ServerInfo::new(ServerCapabilities::builder().enable_tools().build())
-            .with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION")))
+            .with_server_info(Implementation::new("noxa-mcp", env!("CARGO_PKG_VERSION")))
             .with_instructions(String::from(
-                "Webclaw MCP server -- web content extraction for AI agents. \
+                "Noxa MCP server -- web content extraction for AI agents. \
                  Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.",
             ))
     }

@@ -688,7 +688,7 @@ impl ServerHandler for WebclawMcp {
 fn research_dir() -> std::path::PathBuf {
     let dir = dirs::home_dir()
         .unwrap_or_else(|| std::path::PathBuf::from("."))
-        .join(".webclaw")
+        .join(".noxa")
         .join("research");
     std::fs::create_dir_all(&dir).ok();
     dir
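research_dir above is small enough to show whole. A runnable version of the same pattern using the dirs crate, mirroring the renamed ~/.noxa/research location:

```rust
// Sketch of the research_dir pattern: resolve the home directory, fall
// back to ".", and create ~/.noxa/research if missing. Uses the dirs crate.
fn research_dir() -> std::path::PathBuf {
    let dir = dirs::home_dir()
        .unwrap_or_else(|| std::path::PathBuf::from("."))
        .join(".noxa")
        .join("research");
    // Ignore the error: a read-only home just means caching is skipped.
    std::fs::create_dir_all(&dir).ok();
    dir
}

fn main() {
    println!("research cache at {}", research_dir().display());
}
```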
@@ -1,6 +1,6 @@
 [package]
-name = "webclaw-pdf"
-description = "PDF text extraction for webclaw"
+name = "noxa-pdf"
+description = "PDF text extraction for noxa"
 version.workspace = true
 edition.workspace = true
 license.workspace = true

@@ -1,4 +1,4 @@
-/// PDF text extraction for webclaw.
+/// PDF text extraction for noxa.
 ///
 /// Uses pdf-extract (backed by lopdf) to pull text from PDF bytes.
 /// No OCR -- text-based PDFs only. Scanned PDFs return EmptyPdf in Auto mode.