mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-05-11 07:52:37 +02:00
Initial release: webclaw v0.1.0 — web content extraction for LLMs
CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed | https://webclaw.io
This commit is contained in:
commit
c99ec684fa
79 changed files with 24074 additions and 0 deletions
47
crates/webclaw-core/src/llm/metadata.rs
Normal file
47
crates/webclaw-core/src/llm/metadata.rs
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
//! Metadata header building for LLM-optimized output.
//!
//! Produces `> `-prefixed lines with URL, title, author, etc.
//! Omits empty/zero fields to minimize token waste.
|
||||
use crate::types::ExtractionResult;
|
||||
|
||||
pub(crate) fn build_metadata_header(
|
||||
out: &mut String,
|
||||
result: &ExtractionResult,
|
||||
url: Option<&str>,
|
||||
) {
|
||||
let meta = &result.metadata;
|
||||
|
||||
// URL: prefer explicit arg, fall back to metadata
|
||||
let effective_url = url.or(meta.url.as_deref());
|
||||
if let Some(u) = effective_url {
|
||||
out.push_str(&format!("> URL: {u}\n"));
|
||||
}
|
||||
if let Some(t) = &meta.title
|
||||
&& !t.is_empty()
|
||||
{
|
||||
out.push_str(&format!("> Title: {t}\n"));
|
||||
}
|
||||
if let Some(d) = &meta.description
|
||||
&& !d.is_empty()
|
||||
{
|
||||
out.push_str(&format!("> Description: {d}\n"));
|
||||
}
|
||||
if let Some(a) = &meta.author
|
||||
&& !a.is_empty()
|
||||
{
|
||||
out.push_str(&format!("> Author: {a}\n"));
|
||||
}
|
||||
if let Some(d) = &meta.published_date
|
||||
&& !d.is_empty()
|
||||
{
|
||||
out.push_str(&format!("> Published: {d}\n"));
|
||||
}
|
||||
if let Some(l) = &meta.language
|
||||
&& !l.is_empty()
|
||||
{
|
||||
out.push_str(&format!("> Language: {l}\n"));
|
||||
}
|
||||
if meta.word_count > 0 {
|
||||
out.push_str(&format!("> Word count: {}\n", meta.word_count));
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue