Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed | https://webclaw.io
2026-05-11 07:52:37 +02:00 · 2026-03-23 18:31:11 +01:00 · 2026-03-23 18:31:11 +01:00 · c99ec684fa
commit c99ec684fa
79 changed files with 24074 additions and 0 deletions
--- a/crates/webclaw-core/src/llm/metadata.rs
+++ b/crates/webclaw-core/src/llm/metadata.rs
@ -0,0 +1,47 @@
+//! Metadata header building for LLM-optimized output.
+//!
+//! Produces `> ` prefixed lines with URL, title, author, etc.
+//! Omits empty/zero fields to minimize token waste.
+use crate::types::ExtractionResult;
+
+/// Appends a `> `-prefixed metadata header to `out`.
+///
+/// Each field that is present is written on its own line as
+/// `> Label: value`, in this order: URL, Title, Description, Author,
+/// Published, Language, Word count. A field is skipped when it is `None`,
+/// an empty string, or (for the word count) zero, so the header never
+/// contains a label with nothing after it.
+///
+/// # Arguments
+///
+/// * `out` - buffer the header lines are appended to; existing content is
+///   left untouched.
+/// * `result` - extraction result whose `metadata` supplies the field values.
+/// * `url` - explicit URL; when present and non-empty it takes precedence
+///   over `metadata.url`.
+pub(crate) fn build_metadata_header(
+    out: &mut String,
+    result: &ExtractionResult,
+    url: Option<&str>,
+) {
+    let meta = &result.metadata;
+
+    // URL: prefer explicit arg, fall back to metadata. An empty string is
+    // treated as absent on both sides, so a blank explicit URL neither emits
+    // a bare `> URL: ` line nor hides a usable URL stored in the metadata.
+    let effective_url = url
+        .filter(|u| !u.is_empty())
+        .or_else(|| meta.url.as_deref().filter(|u| !u.is_empty()));
+    if let Some(u) = effective_url {
+        out.push_str(&format!("> URL: {u}\n"));
+    }
+    if let Some(t) = &meta.title
+        && !t.is_empty()
+    {
+        out.push_str(&format!("> Title: {t}\n"));
+    }
+    if let Some(d) = &meta.description
+        && !d.is_empty()
+    {
+        out.push_str(&format!("> Description: {d}\n"));
+    }
+    if let Some(a) = &meta.author
+        && !a.is_empty()
+    {
+        out.push_str(&format!("> Author: {a}\n"));
+    }
+    if let Some(d) = &meta.published_date
+        && !d.is_empty()
+    {
+        out.push_str(&format!("> Published: {d}\n"));
+    }
+    if let Some(l) = &meta.language
+        && !l.is_empty()
+    {
+        out.push_str(&format!("> Language: {l}\n"));
+    }
+    // A zero word count carries no information, so it is skipped like an
+    // empty string field.
+    if meta.word_count > 0 {
+        out.push_str(&format!("> Word count: {}\n", meta.word_count));
+    }
+}