chore: rebrand webclaw to noxa

2026-05-12 08:22:37 +02:00 · 2026-04-11 00:10:38 -04:00 · 2026-04-11 00:10:38 -04:00 · 8674b60b4e
commit 8674b60b4e
parent a4c351d5ae
86 changed files with 781 additions and 2121 deletions
--- a/crates/noxa-pdf/Cargo.toml
+++ b/crates/noxa-pdf/Cargo.toml
@ -0,0 +1,11 @@
+[package]
+name = "noxa-pdf"
+description = "PDF text extraction for noxa"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+pdf-extract = "0.7"
+thiserror = { workspace = true }
+tracing = { workspace = true }
--- a/crates/noxa-pdf/src/error.rs
+++ b/crates/noxa-pdf/src/error.rs
@ -0,0 +1,14 @@
+/// PDF extraction errors. Kept simple -- no OCR, no complex recovery.
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum PdfError {
+    #[error("PDF extraction failed: {0}")]
+    ExtractionFailed(String),
+
+    #[error("invalid PDF: {0}")]
+    InvalidPdf(String),
+
+    #[error("empty PDF: no text content found")]
+    EmptyPdf,
+}
--- a/crates/noxa-pdf/src/lib.rs
+++ b/crates/noxa-pdf/src/lib.rs
@ -0,0 +1,292 @@
+/// PDF text extraction for noxa.
+///
+/// Uses pdf-extract (backed by lopdf) to pull text from PDF bytes.
+/// No OCR -- text-based PDFs only. Scanned PDFs return EmptyPdf in Auto mode.
+pub mod error;
+
+pub use error::PdfError;
+
+// pdf-extract re-exports all of lopdf via `pub use lopdf::*`
+use pdf_extract::{Dictionary, Document, Object};
+use tracing::debug;
+
+/// Controls how strictly we treat empty/sparse PDFs.
+#[derive(Debug, Clone, Default)]
+pub enum PdfMode {
+    /// Try text extraction; error if no text found (catches scanned PDFs early).
+    #[default]
+    Auto,
+    /// Return whatever text is found, even if empty. Caller decides what to do.
+    Fast,
+}
+
+/// Successful PDF extraction output.
+#[derive(Debug, Clone)]
+pub struct PdfResult {
+    pub text: String,
+    pub page_count: usize,
+    pub metadata: PdfMetadata,
+}
+
+/// PDF document metadata from the info dictionary.
+#[derive(Debug, Clone, Default)]
+pub struct PdfMetadata {
+    pub title: Option<String>,
+    pub author: Option<String>,
+    pub subject: Option<String>,
+    pub creator: Option<String>,
+}
+
+const MAX_PDF_SIZE: usize = 50 * 1024 * 1024; // 50MB
+
+/// Extract text content from raw PDF bytes.
+///
+/// Uses pdf-extract for text extraction and lopdf (transitive dep) for
+/// metadata and page count. In `Auto` mode, returns `PdfError::EmptyPdf`
+/// if no text is found (likely a scanned/image-only PDF).
+pub fn extract_pdf(bytes: &[u8], mode: PdfMode) -> Result<PdfResult, PdfError> {
+    if bytes.len() > MAX_PDF_SIZE {
+        return Err(PdfError::InvalidPdf(format!(
+            "PDF too large ({} bytes, max {})",
+            bytes.len(),
+            MAX_PDF_SIZE
+        )));
+    }
+
+    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
+        return Err(PdfError::InvalidPdf("missing PDF header".into()));
+    }
+
+    let doc = Document::load_mem(bytes).map_err(|e| PdfError::InvalidPdf(e.to_string()))?;
+
+    let page_count = doc.get_pages().len();
+    let metadata = read_metadata(&doc);
+
+    debug!(pages = page_count, "PDF document loaded");
+
+    // Extract text via pdf-extract (higher-level API over lopdf)
+    let text = pdf_extract::extract_text_from_mem(bytes)
+        .map_err(|e| PdfError::ExtractionFailed(e.to_string()))?;
+
+    let text = normalize_text(&text);
+
+    if text.is_empty() {
+        if matches!(mode, PdfMode::Auto) {
+            return Err(PdfError::EmptyPdf);
+        }
+        debug!("PDF text extraction returned empty (Fast mode, returning as-is)");
+    }
+
+    debug!(chars = text.len(), "PDF text extracted");
+
+    Ok(PdfResult {
+        text,
+        page_count,
+        metadata,
+    })
+}
+
+/// Format a PdfResult as markdown for downstream consumers.
+///
+/// Adds title as a heading if available, followed by the extracted text body.
+pub fn to_markdown(result: &PdfResult) -> String {
+    let mut out = String::new();
+
+    if let Some(ref title) = result.metadata.title
+        && !title.is_empty()
+    {
+        out.push_str("# ");
+        out.push_str(title);
+        out.push_str("\n\n");
+    }
+
+    out.push_str(&result.text);
+    out
+}
+
+/// Read metadata from the PDF info dictionary.
+/// Gracefully returns defaults for any missing or unreadable fields.
+fn read_metadata(doc: &Document) -> PdfMetadata {
+    let info = match doc.trailer.get(b"Info") {
+        Ok(obj) => match doc.dereference(obj) {
+            Ok((_, Object::Dictionary(dict))) => Some(dict),
+            _ => None,
+        },
+        Err(_) => None,
+    };
+
+    let Some(info) = info else {
+        return PdfMetadata::default();
+    };
+
+    PdfMetadata {
+        title: info_string(info, b"Title"),
+        author: info_string(info, b"Author"),
+        subject: info_string(info, b"Subject"),
+        creator: info_string(info, b"Creator"),
+    }
+}
+
+/// Extract a string value from a PDF info dictionary entry.
+/// Handles both String and Name object types.
+fn info_string(dict: &Dictionary, key: &[u8]) -> Option<String> {
+    let obj = dict.get(key).ok()?;
+    let raw = match obj {
+        Object::String(bytes, _) => bytes.clone(),
+        Object::Name(bytes) => bytes.clone(),
+        _ => return None,
+    };
+
+    // PDF strings can be UTF-16BE (BOM: FE FF) or PDFDocEncoding (~Latin-1)
+    let text = if raw.len() >= 2 && raw[0] == 0xFE && raw[1] == 0xFF {
+        // UTF-16BE: skip BOM, decode pairs
+        let pairs: Vec<u16> = raw[2..]
+            .chunks_exact(2)
+            .map(|c| u16::from_be_bytes([c[0], c[1]]))
+            .collect();
+        String::from_utf16_lossy(&pairs)
+    } else {
+        // PDFDocEncoding -- first 128 chars match ASCII, rest is Latin-1-ish.
+        // Good enough: lossy UTF-8 covers the common case.
+        String::from_utf8_lossy(&raw).into_owned()
+    };
+
+    let trimmed = text.trim().to_string();
+    if trimmed.is_empty() {
+        None
+    } else {
+        Some(trimmed)
+    }
+}
+
+/// Collapse excessive whitespace from pdf-extract output.
+/// PDF text extraction often produces irregular spacing and blank lines.
+fn normalize_text(raw: &str) -> String {
+    let mut lines: Vec<&str> = Vec::new();
+    let mut prev_blank = false;
+
+    for line in raw.lines() {
+        let trimmed = line.trim();
+        if trimmed.is_empty() {
+            if !prev_blank && !lines.is_empty() {
+                lines.push("");
+                prev_blank = true;
+            }
+        } else {
+            lines.push(trimmed);
+            prev_blank = false;
+        }
+    }
+
+    // Strip trailing blank lines
+    while lines.last() == Some(&"") {
+        lines.pop();
+    }
+
+    lines.join("\n")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_pdf_metadata_default() {
+        let meta = PdfMetadata::default();
+        assert!(meta.title.is_none());
+        assert!(meta.author.is_none());
+        assert!(meta.subject.is_none());
+        assert!(meta.creator.is_none());
+    }
+
+    #[test]
+    fn test_to_markdown_with_title() {
+        let result = PdfResult {
+            text: "Hello world.\n\nSecond paragraph.".into(),
+            page_count: 1,
+            metadata: PdfMetadata {
+                title: Some("Test Document".into()),
+                ..Default::default()
+            },
+        };
+
+        let md = to_markdown(&result);
+        assert!(md.starts_with("# Test Document\n\n"));
+        assert!(md.contains("Hello world."));
+        assert!(md.contains("Second paragraph."));
+    }
+
+    #[test]
+    fn test_to_markdown_without_title() {
+        let result = PdfResult {
+            text: "Just text.".into(),
+            page_count: 1,
+            metadata: PdfMetadata::default(),
+        };
+
+        let md = to_markdown(&result);
+        assert_eq!(md, "Just text.");
+    }
+
+    #[test]
+    fn test_to_markdown_empty_title_skipped() {
+        let result = PdfResult {
+            text: "Body.".into(),
+            page_count: 1,
+            metadata: PdfMetadata {
+                title: Some("".into()),
+                ..Default::default()
+            },
+        };
+
+        let md = to_markdown(&result);
+        assert!(!md.starts_with('#'));
+        assert_eq!(md, "Body.");
+    }
+
+    #[test]
+    fn test_empty_bytes_returns_error() {
+        let result = extract_pdf(&[], PdfMode::Auto);
+        assert!(matches!(result, Err(PdfError::InvalidPdf(_))));
+    }
+
+    #[test]
+    fn test_garbage_bytes_returns_error() {
+        let result = extract_pdf(b"not a pdf at all", PdfMode::Auto);
+        assert!(matches!(result, Err(PdfError::InvalidPdf(_))));
+    }
+
+    #[test]
+    fn test_truncated_pdf_header_returns_error() {
+        // Has the PDF magic but nothing else -- lopdf will reject it
+        let result = extract_pdf(b"%PDF-1.4\n", PdfMode::Auto);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_oversized_pdf_rejected() {
+        let big = vec![0u8; MAX_PDF_SIZE + 1];
+        let result = extract_pdf(&big, PdfMode::Auto);
+        assert!(matches!(result, Err(PdfError::InvalidPdf(msg)) if msg.contains("too large")));
+    }
+
+    #[test]
+    fn test_normalize_text_collapses_blanks() {
+        let input = "Line one.\n\n\n\nLine two.\n\n\n";
+        let output = normalize_text(input);
+        assert_eq!(output, "Line one.\n\nLine two.");
+    }
+
+    #[test]
+    fn test_normalize_text_trims_lines() {
+        let input = "  hello  \n  world  ";
+        let output = normalize_text(input);
+        assert_eq!(output, "hello\nworld");
+    }
+
+    #[test]
+    fn test_normalize_text_empty() {
+        assert_eq!(normalize_text(""), "");
+        assert_eq!(normalize_text("  \n  \n  "), "");
+    }
+}