Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io
This commit is contained in:
Valerio 2026-03-23 18:31:11 +01:00
commit c99ec684fa
79 changed files with 24074 additions and 0 deletions

View file

@ -0,0 +1,47 @@
/// Metadata header building for LLM-optimized output.
///
/// Produces `> ` prefixed lines with URL, title, author, etc.
/// Omits empty/zero fields to minimize token waste.
use crate::types::ExtractionResult;
pub(crate) fn build_metadata_header(
out: &mut String,
result: &ExtractionResult,
url: Option<&str>,
) {
let meta = &result.metadata;
// URL: prefer explicit arg, fall back to metadata
let effective_url = url.or(meta.url.as_deref());
if let Some(u) = effective_url {
out.push_str(&format!("> URL: {u}\n"));
}
if let Some(t) = &meta.title
&& !t.is_empty()
{
out.push_str(&format!("> Title: {t}\n"));
}
if let Some(d) = &meta.description
&& !d.is_empty()
{
out.push_str(&format!("> Description: {d}\n"));
}
if let Some(a) = &meta.author
&& !a.is_empty()
{
out.push_str(&format!("> Author: {a}\n"));
}
if let Some(d) = &meta.published_date
&& !d.is_empty()
{
out.push_str(&format!("> Published: {d}\n"));
}
if let Some(l) = &meta.language
&& !l.is_empty()
{
out.push_str(&format!("> Language: {l}\n"));
}
if meta.word_count > 0 {
out.push_str(&format!("> Word count: {}\n", meta.word_count));
}
}