Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed | https://webclaw.io
2026-05-15 18:25:24 +02:00 · 2026-03-23 18:31:11 +01:00 · 2026-03-23 18:31:11 +01:00 · c99ec684fa
commit c99ec684fa
79 changed files with 24074 additions and 0 deletions
--- a/crates/webclaw-fetch/src/linkedin.rs
+++ b/crates/webclaw-fetch/src/linkedin.rs
@ -0,0 +1,279 @@
+/// LinkedIn post extraction from authenticated HTML.
+///
+/// LinkedIn's SPA stores all data in `<code>` tags as HTML-escaped JSON.
+/// The `included` array contains typed entities: Update (post), Comment,
+/// Profile, etc. We parse these to reconstruct post + comments as markdown.
+use serde_json::Value;
+use tracing::debug;
+use webclaw_core::{Content, ExtractionResult, Metadata};
+
+/// Check if a URL is a LinkedIn post/activity.
+pub fn is_linkedin_post(url: &str) -> bool {
+    let host = url
+        .split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split('/')
+        .next()
+        .unwrap_or("");
+    (host == "www.linkedin.com" || host == "linkedin.com")
+        && (url.contains("/feed/update/") || url.contains("/posts/"))
+}
+
+/// Extract `<code>` block contents from HTML using simple string scanning.
+/// LinkedIn wraps JSON data in `<code>` tags with HTML-escaped content.
+fn extract_code_blocks(html: &str) -> Vec<String> {
+    let mut blocks = Vec::new();
+    let mut search_from = 0;
+    while let Some(start) = html[search_from..].find("<code") {
+        let abs_start = search_from + start;
+        // Find end of opening tag
+        let Some(tag_end) = html[abs_start..].find('>') else {
+            break;
+        };
+        let content_start = abs_start + tag_end + 1;
+        let Some(end) = html[content_start..].find("</code>") else {
+            break;
+        };
+        let content = &html[content_start..content_start + end];
+        if content.len() > 1000 {
+            blocks.push(html_unescape(content));
+        }
+        search_from = content_start + end + 7;
+    }
+    blocks
+}
+
+/// Extract post + comments from LinkedIn's SSR HTML (requires auth cookies).
+pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult> {
+    let code_blocks = extract_code_blocks(html);
+
+    // Find the largest <code> block with "included" — that's the main data payload
+    let mut best_included: Option<Vec<Value>> = None;
+    for raw in &code_blocks {
+        if let Ok(obj) = serde_json::from_str::<Value>(raw)
+            && let Some(arr) = obj.get("included").and_then(|v| v.as_array())
+        {
+            let current_len = best_included.as_ref().map(|a| a.len()).unwrap_or(0);
+            if arr.len() > current_len {
+                best_included = Some(arr.clone());
+            }
+        }
+    }
+
+    let included = best_included?;
+    debug!(entities = included.len(), "linkedin: found included array");
+
+    // Collect profiles (entityUrn → "First Last")
+    let mut profiles = std::collections::HashMap::new();
+    for item in &included {
+        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
+        if t.contains("Profile") {
+            let urn = item.get("entityUrn").and_then(|v| v.as_str()).unwrap_or("");
+            let first = item.get("firstName").and_then(|v| v.as_str()).unwrap_or("");
+            let last = item.get("lastName").and_then(|v| v.as_str()).unwrap_or("");
+            let headline = item.get("headline").and_then(|v| v.as_str()).unwrap_or("");
+            if !first.is_empty() {
+                profiles.insert(
+                    urn.to_string(),
+                    (
+                        format!("{first} {last}").trim().to_string(),
+                        headline.to_string(),
+                    ),
+                );
+            }
+        }
+    }
+
+    // Find the main post (Update type)
+    let mut markdown = String::new();
+    let mut post_author = String::new();
+    let mut post_headline = String::new();
+
+    for item in &included {
+        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
+        if !t.contains("Update") {
+            continue;
+        }
+
+        // Get author from actor profile
+        if let Some(actor) = item.get("actor") {
+            // actor can have a nested profile reference or inline data
+            let author_urn = actor
+                .get("*author")
+                .or(actor.get("author"))
+                .and_then(|v| v.as_str())
+                .unwrap_or("");
+            if let Some((name, headline)) = profiles.get(author_urn) {
+                post_author = name.clone();
+                post_headline = headline.clone();
+            }
+            // Or inline name
+            if post_author.is_empty()
+                && let Some(name) = actor.get("name").and_then(|v| v.as_object())
+            {
+                let text = name.get("text").and_then(|v| v.as_str()).unwrap_or("");
+                if !text.is_empty() {
+                    post_author = text.to_string();
+                }
+            }
+            if post_headline.is_empty()
+                && let Some(desc) = actor.get("description").and_then(|v| v.as_object())
+            {
+                let text = desc.get("text").and_then(|v| v.as_str()).unwrap_or("");
+                if !text.is_empty() {
+                    post_headline = text.to_string();
+                }
+            }
+        }
+
+        // Get post body from commentary
+        if let Some(commentary) = item.get("commentary")
+            && let Some(text) = commentary
+                .get("text")
+                .and_then(|v| v.as_object())
+                .and_then(|o| o.get("text"))
+                .and_then(|v| v.as_str())
+        {
+            if !post_author.is_empty() {
+                markdown.push_str(&format!("# {post_author}\n\n"));
+            }
+            if !post_headline.is_empty() {
+                markdown.push_str(&format!("*{post_headline}*\n\n"));
+            }
+            markdown.push_str("---\n\n");
+            // Unescape literal \n from JSON
+            markdown.push_str(&text.replace("\\n", "\n"));
+            markdown.push_str("\n\n");
+        }
+    }
+
+    if markdown.is_empty() {
+        return None;
+    }
+
+    // Collect comments — LinkedIn stores comment text in `commentary.text`
+    // and commenter name in `commenter.name.text`
+    let mut comments: Vec<(String, String)> = Vec::new();
+    for item in &included {
+        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
+        if !t.contains("Comment") {
+            continue;
+        }
+
+        // Get comment text from commentary.text
+        let text = item
+            .get("commentary")
+            .and_then(|c| c.get("text"))
+            .and_then(|v| v.as_str())
+            .unwrap_or("");
+        if text.is_empty() {
+            continue;
+        }
+
+        // Get commenter name from commenter.title.text
+        let name = item
+            .get("commenter")
+            .and_then(|c| c.get("title"))
+            .and_then(|n| n.get("text"))
+            .and_then(|v| v.as_str())
+            .unwrap_or("Someone");
+
+        comments.push((name.to_string(), text.to_string()));
+    }
+
+    if !comments.is_empty() {
+        markdown.push_str("---\n\n## Comments\n\n");
+        for (name, text) in &comments {
+            markdown.push_str(&format!("- **{name}**: {text}\n\n"));
+        }
+    }
+
+    let word_count = markdown.split_whitespace().count();
+    debug!(
+        word_count,
+        comments = comments.len(),
+        "linkedin extraction done"
+    );
+
+    Some(ExtractionResult {
+        metadata: Metadata {
+            title: if post_author.is_empty() {
+                None
+            } else {
+                Some(format!("{post_author}'s LinkedIn Post"))
+            },
+            description: None,
+            author: if post_author.is_empty() {
+                None
+            } else {
+                Some(post_author)
+            },
+            published_date: None,
+            language: None,
+            url: Some(url.to_string()),
+            site_name: Some("LinkedIn".into()),
+            image: None,
+            favicon: None,
+            word_count,
+        },
+        content: Content {
+            markdown,
+            plain_text: String::new(),
+            links: vec![],
+            images: vec![],
+            code_blocks: vec![],
+            raw_html: None,
+        },
+        domain_data: None,
+        structured_data: vec![],
+    })
+}
+
+/// Unescape HTML entities (named + numeric decimal).
+fn html_unescape(s: &str) -> String {
+    let mut out = String::with_capacity(s.len());
+    let mut chars = s.chars().peekable();
+    while let Some(c) = chars.next() {
+        if c != '&' {
+            out.push(c);
+            continue;
+        }
+        // Collect until ';'
+        let mut entity = String::new();
+        for c2 in chars.by_ref() {
+            if c2 == ';' {
+                break;
+            }
+            entity.push(c2);
+            if entity.len() > 10 {
+                break;
+            }
+        }
+        match entity.as_str() {
+            "quot" => out.push('"'),
+            "amp" => out.push('&'),
+            "lt" => out.push('<'),
+            "gt" => out.push('>'),
+            "apos" => out.push('\''),
+            s if s.starts_with('#') => {
+                let num = &s[1..];
+                if let Ok(n) = num.parse::<u32>()
+                    && let Some(ch) = char::from_u32(n)
+                {
+                    out.push(ch);
+                    continue;
+                }
+                out.push('&');
+                out.push_str(&entity);
+                out.push(';');
+            }
+            _ => {
+                out.push('&');
+                out.push_str(&entity);
+                out.push(';');
+            }
+        }
+    }
+    out
+}