/// LinkedIn post extraction from authenticated HTML.
///
/// LinkedIn's SPA stores all data in `<code>` tags as HTML-escaped JSON.
/// The `included` array contains typed entities: Update (post), Comment,
/// Profile, etc. We parse these to reconstruct post + comments as markdown.
use serde_json::Value;
use tracing::debug;
use webclaw_core::{Content, ExtractionResult, Metadata};

/// Check if a URL is a LinkedIn post/activity.
///
/// The host must be `linkedin.com` or `www.linkedin.com` (compared
/// case-insensitively, an explicit `:port` is ignored) and the URL must
/// contain `/feed/update/` or `/posts/`. URLs without a scheme are accepted.
pub fn is_linkedin_post(url: &str) -> bool {
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    let authority = after_scheme.split('/').next().unwrap_or("");
    // Host names are case-insensitive and may carry an explicit port.
    let host = authority.split(':').next().unwrap_or("");

    let is_linkedin_host =
        host.eq_ignore_ascii_case("www.linkedin.com") || host.eq_ignore_ascii_case("linkedin.com");

    is_linkedin_host && (url.contains("/feed/update/") || url.contains("/posts/"))
}

/// Extract `<code>` block contents from HTML using simple string scanning.
/// LinkedIn wraps JSON data in `<code>` tags with HTML-escaped content.
///
/// Only blocks whose raw content is longer than 1000 bytes are kept; each kept
/// block is returned HTML-unescaped. Scanning stops at the first `<code` tag
/// that is never closed.
fn extract_code_blocks(html: &str) -> Vec<String> {
    const OPEN_TAG: &str = "<code";
    const CLOSE_TAG: &str = "</code>";
    // Blocks at or below this size are skipped.
    const MIN_BLOCK_LEN: usize = 1000;

    let mut blocks = Vec::new();
    let mut search_from = 0;

    while let Some(start) = html[search_from..].find(OPEN_TAG) {
        let abs_start = search_from + start;
        // Skip over the opening tag's attributes to the end of the tag.
        let Some(tag_end) = html[abs_start..].find('>') else {
            break;
        };
        let content_start = abs_start + tag_end + 1;
        let Some(end) = html[content_start..].find(CLOSE_TAG) else {
            break;
        };

        let content = &html[content_start..content_start + end];
        if content.len() > MIN_BLOCK_LEN {
            blocks.push(html_unescape(content));
        }
        search_from = content_start + end + CLOSE_TAG.len();
    }

    blocks
}

/// Extract post + comments from LinkedIn's SSR HTML (requires auth cookies).
pub fn extract_linkedin_post(html: &str, url: &str) -> Option { let code_blocks = extract_code_blocks(html); // Find the largest block with "included" — that's the main data payload let mut best_included: Option> = None; for raw in &code_blocks { if let Ok(obj) = serde_json::from_str::(raw) && let Some(arr) = obj.get("included").and_then(|v| v.as_array()) { let current_len = best_included.as_ref().map(|a| a.len()).unwrap_or(0); if arr.len() > current_len { best_included = Some(arr.clone()); } } } let included = best_included?; debug!(entities = included.len(), "linkedin: found included array"); // Collect profiles (entityUrn → "First Last") let mut profiles = std::collections::HashMap::new(); for item in &included { let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or(""); if t.contains("Profile") { let urn = item.get("entityUrn").and_then(|v| v.as_str()).unwrap_or(""); let first = item.get("firstName").and_then(|v| v.as_str()).unwrap_or(""); let last = item.get("lastName").and_then(|v| v.as_str()).unwrap_or(""); let headline = item.get("headline").and_then(|v| v.as_str()).unwrap_or(""); if !first.is_empty() { profiles.insert( urn.to_string(), ( format!("{first} {last}").trim().to_string(), headline.to_string(), ), ); } } } // Find the main post (Update type) let mut markdown = String::new(); let mut post_author = String::new(); let mut post_headline = String::new(); for item in &included { let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or(""); if !t.contains("Update") { continue; } // Get author from actor profile if let Some(actor) = item.get("actor") { // actor can have a nested profile reference or inline data let author_urn = actor .get("*author") .or(actor.get("author")) .and_then(|v| v.as_str()) .unwrap_or(""); if let Some((name, headline)) = profiles.get(author_urn) { post_author = name.clone(); post_headline = headline.clone(); } // Or inline name if post_author.is_empty() && let Some(name) = actor.get("name").and_then(|v| v.as_object()) 
{ let text = name.get("text").and_then(|v| v.as_str()).unwrap_or(""); if !text.is_empty() { post_author = text.to_string(); } } if post_headline.is_empty() && let Some(desc) = actor.get("description").and_then(|v| v.as_object()) { let text = desc.get("text").and_then(|v| v.as_str()).unwrap_or(""); if !text.is_empty() { post_headline = text.to_string(); } } } // Get post body from commentary if let Some(commentary) = item.get("commentary") && let Some(text) = commentary .get("text") .and_then(|v| v.as_object()) .and_then(|o| o.get("text")) .and_then(|v| v.as_str()) { if !post_author.is_empty() { markdown.push_str(&format!("# {post_author}\n\n")); } if !post_headline.is_empty() { markdown.push_str(&format!("*{post_headline}*\n\n")); } markdown.push_str("---\n\n"); // Unescape literal \n from JSON markdown.push_str(&text.replace("\\n", "\n")); markdown.push_str("\n\n"); } } if markdown.is_empty() { return None; } // Collect comments — LinkedIn stores comment text in `commentary.text` // and commenter name in `commenter.name.text` let mut comments: Vec<(String, String)> = Vec::new(); for item in &included { let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or(""); if !t.contains("Comment") { continue; } // Get comment text from commentary.text let text = item .get("commentary") .and_then(|c| c.get("text")) .and_then(|v| v.as_str()) .unwrap_or(""); if text.is_empty() { continue; } // Get commenter name from commenter.title.text let name = item .get("commenter") .and_then(|c| c.get("title")) .and_then(|n| n.get("text")) .and_then(|v| v.as_str()) .unwrap_or("Someone"); comments.push((name.to_string(), text.to_string())); } if !comments.is_empty() { markdown.push_str("---\n\n## Comments\n\n"); for (name, text) in &comments { markdown.push_str(&format!("- **{name}**: {text}\n\n")); } } let word_count = markdown.split_whitespace().count(); debug!( word_count, comments = comments.len(), "linkedin extraction done" ); Some(ExtractionResult { metadata: Metadata { title: 
if post_author.is_empty() { None } else { Some(format!("{post_author}'s LinkedIn Post")) }, description: None, author: if post_author.is_empty() { None } else { Some(post_author) }, published_date: None, language: None, url: Some(url.to_string()), site_name: Some("LinkedIn".into()), image: None, favicon: None, word_count, }, content: Content { markdown, plain_text: String::new(), links: vec![], images: vec![], code_blocks: vec![], raw_html: None, }, domain_data: None, structured_data: vec![], }) } /// Unescape HTML entities (named + numeric decimal). fn html_unescape(s: &str) -> String { let mut out = String::with_capacity(s.len()); let mut chars = s.chars().peekable(); while let Some(c) = chars.next() { if c != '&' { out.push(c); continue; } // Collect until ';' let mut entity = String::new(); for c2 in chars.by_ref() { if c2 == ';' { break; } entity.push(c2); if entity.len() > 10 { break; } } match entity.as_str() { "quot" => out.push('"'), "amp" => out.push('&'), "lt" => out.push('<'), "gt" => out.push('>'), "apos" => out.push('\''), s if s.starts_with('#') => { let num = &s[1..]; if let Ok(n) = num.parse::() && let Some(ch) = char::from_u32(n) { out.push(ch); continue; } out.push('&'); out.push_str(&entity); out.push(';'); } _ => { out.push('&'); out.push_str(&entity); out.push(';'); } } } out }