chore: rebrand webclaw to noxa

This commit is contained in:
Jacob Magar 2026-04-11 00:10:38 -04:00
parent a4c351d5ae
commit 8674b60b4e
86 changed files with 781 additions and 2121 deletions

View file

@ -0,0 +1,11 @@
[package]
name = "noxa-pdf"
description = "PDF text extraction for noxa"
version.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
pdf-extract = "0.7"
thiserror = { workspace = true }
tracing = { workspace = true }

View file

@ -0,0 +1,14 @@
/// PDF extraction errors. Kept simple -- no OCR, no complex recovery.
use thiserror::Error;
#[derive(Debug, Error)]
pub enum PdfError {
#[error("PDF extraction failed: {0}")]
ExtractionFailed(String),
#[error("invalid PDF: {0}")]
InvalidPdf(String),
#[error("empty PDF: no text content found")]
EmptyPdf,
}

292
crates/noxa-pdf/src/lib.rs Normal file
View file

@ -0,0 +1,292 @@
/// PDF text extraction for noxa.
///
/// Uses pdf-extract (backed by lopdf) to pull text from PDF bytes.
/// No OCR -- text-based PDFs only. Scanned PDFs return EmptyPdf in Auto mode.
pub mod error;
pub use error::PdfError;
// pdf-extract re-exports all of lopdf via `pub use lopdf::*`
use pdf_extract::{Dictionary, Document, Object};
use tracing::debug;
/// Controls how strictly we treat empty/sparse PDFs.
#[derive(Debug, Clone, Default)]
pub enum PdfMode {
/// Try text extraction; error if no text found (catches scanned PDFs early).
#[default]
Auto,
/// Return whatever text is found, even if empty. Caller decides what to do.
Fast,
}
/// Successful PDF extraction output.
#[derive(Debug, Clone)]
pub struct PdfResult {
pub text: String,
pub page_count: usize,
pub metadata: PdfMetadata,
}
/// PDF document metadata from the info dictionary.
#[derive(Debug, Clone, Default)]
pub struct PdfMetadata {
pub title: Option<String>,
pub author: Option<String>,
pub subject: Option<String>,
pub creator: Option<String>,
}
const MAX_PDF_SIZE: usize = 50 * 1024 * 1024; // 50MB
/// Extract text content from raw PDF bytes.
///
/// Uses pdf-extract for text extraction and lopdf (transitive dep) for
/// metadata and page count. In `Auto` mode, returns `PdfError::EmptyPdf`
/// if no text is found (likely a scanned/image-only PDF).
pub fn extract_pdf(bytes: &[u8], mode: PdfMode) -> Result<PdfResult, PdfError> {
if bytes.len() > MAX_PDF_SIZE {
return Err(PdfError::InvalidPdf(format!(
"PDF too large ({} bytes, max {})",
bytes.len(),
MAX_PDF_SIZE
)));
}
if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
return Err(PdfError::InvalidPdf("missing PDF header".into()));
}
let doc = Document::load_mem(bytes).map_err(|e| PdfError::InvalidPdf(e.to_string()))?;
let page_count = doc.get_pages().len();
let metadata = read_metadata(&doc);
debug!(pages = page_count, "PDF document loaded");
// Extract text via pdf-extract (higher-level API over lopdf)
let text = pdf_extract::extract_text_from_mem(bytes)
.map_err(|e| PdfError::ExtractionFailed(e.to_string()))?;
let text = normalize_text(&text);
if text.is_empty() {
if matches!(mode, PdfMode::Auto) {
return Err(PdfError::EmptyPdf);
}
debug!("PDF text extraction returned empty (Fast mode, returning as-is)");
}
debug!(chars = text.len(), "PDF text extracted");
Ok(PdfResult {
text,
page_count,
metadata,
})
}
/// Format a PdfResult as markdown for downstream consumers.
///
/// Adds title as a heading if available, followed by the extracted text body.
pub fn to_markdown(result: &PdfResult) -> String {
let mut out = String::new();
if let Some(ref title) = result.metadata.title
&& !title.is_empty()
{
out.push_str("# ");
out.push_str(title);
out.push_str("\n\n");
}
out.push_str(&result.text);
out
}
/// Read metadata from the PDF info dictionary.
/// Gracefully returns defaults for any missing or unreadable fields.
fn read_metadata(doc: &Document) -> PdfMetadata {
let info = match doc.trailer.get(b"Info") {
Ok(obj) => match doc.dereference(obj) {
Ok((_, Object::Dictionary(dict))) => Some(dict),
_ => None,
},
Err(_) => None,
};
let Some(info) = info else {
return PdfMetadata::default();
};
PdfMetadata {
title: info_string(info, b"Title"),
author: info_string(info, b"Author"),
subject: info_string(info, b"Subject"),
creator: info_string(info, b"Creator"),
}
}
/// Extract a string value from a PDF info dictionary entry.
/// Handles both String and Name object types.
fn info_string(dict: &Dictionary, key: &[u8]) -> Option<String> {
let obj = dict.get(key).ok()?;
let raw = match obj {
Object::String(bytes, _) => bytes.clone(),
Object::Name(bytes) => bytes.clone(),
_ => return None,
};
// PDF strings can be UTF-16BE (BOM: FE FF) or PDFDocEncoding (~Latin-1)
let text = if raw.len() >= 2 && raw[0] == 0xFE && raw[1] == 0xFF {
// UTF-16BE: skip BOM, decode pairs
let pairs: Vec<u16> = raw[2..]
.chunks_exact(2)
.map(|c| u16::from_be_bytes([c[0], c[1]]))
.collect();
String::from_utf16_lossy(&pairs)
} else {
// PDFDocEncoding -- first 128 chars match ASCII, rest is Latin-1-ish.
// Good enough: lossy UTF-8 covers the common case.
String::from_utf8_lossy(&raw).into_owned()
};
let trimmed = text.trim().to_string();
if trimmed.is_empty() {
None
} else {
Some(trimmed)
}
}
/// Collapse excessive whitespace from pdf-extract output.
/// PDF text extraction often produces irregular spacing and blank lines.
fn normalize_text(raw: &str) -> String {
let mut lines: Vec<&str> = Vec::new();
let mut prev_blank = false;
for line in raw.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
if !prev_blank && !lines.is_empty() {
lines.push("");
prev_blank = true;
}
} else {
lines.push(trimmed);
prev_blank = false;
}
}
// Strip trailing blank lines
while lines.last() == Some(&"") {
lines.pop();
}
lines.join("\n")
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_pdf_metadata_default() {
let meta = PdfMetadata::default();
assert!(meta.title.is_none());
assert!(meta.author.is_none());
assert!(meta.subject.is_none());
assert!(meta.creator.is_none());
}
#[test]
fn test_to_markdown_with_title() {
let result = PdfResult {
text: "Hello world.\n\nSecond paragraph.".into(),
page_count: 1,
metadata: PdfMetadata {
title: Some("Test Document".into()),
..Default::default()
},
};
let md = to_markdown(&result);
assert!(md.starts_with("# Test Document\n\n"));
assert!(md.contains("Hello world."));
assert!(md.contains("Second paragraph."));
}
#[test]
fn test_to_markdown_without_title() {
let result = PdfResult {
text: "Just text.".into(),
page_count: 1,
metadata: PdfMetadata::default(),
};
let md = to_markdown(&result);
assert_eq!(md, "Just text.");
}
#[test]
fn test_to_markdown_empty_title_skipped() {
let result = PdfResult {
text: "Body.".into(),
page_count: 1,
metadata: PdfMetadata {
title: Some("".into()),
..Default::default()
},
};
let md = to_markdown(&result);
assert!(!md.starts_with('#'));
assert_eq!(md, "Body.");
}
#[test]
fn test_empty_bytes_returns_error() {
let result = extract_pdf(&[], PdfMode::Auto);
assert!(matches!(result, Err(PdfError::InvalidPdf(_))));
}
#[test]
fn test_garbage_bytes_returns_error() {
let result = extract_pdf(b"not a pdf at all", PdfMode::Auto);
assert!(matches!(result, Err(PdfError::InvalidPdf(_))));
}
#[test]
fn test_truncated_pdf_header_returns_error() {
// Has the PDF magic but nothing else -- lopdf will reject it
let result = extract_pdf(b"%PDF-1.4\n", PdfMode::Auto);
assert!(result.is_err());
}
#[test]
fn test_oversized_pdf_rejected() {
let big = vec![0u8; MAX_PDF_SIZE + 1];
let result = extract_pdf(&big, PdfMode::Auto);
assert!(matches!(result, Err(PdfError::InvalidPdf(msg)) if msg.contains("too large")));
}
#[test]
fn test_normalize_text_collapses_blanks() {
let input = "Line one.\n\n\n\nLine two.\n\n\n";
let output = normalize_text(input);
assert_eq!(output, "Line one.\n\nLine two.");
}
#[test]
fn test_normalize_text_trims_lines() {
let input = " hello \n world ";
let output = normalize_text(input);
assert_eq!(output, "hello\nworld");
}
#[test]
fn test_normalize_text_empty() {
assert_eq!(normalize_text(""), "");
assert_eq!(normalize_text(" \n \n "), "");
}
}