//! LinkedIn post structured extractor. //! //! Uses the public embed endpoint `/embed/feed/update/{urn}` which //! LinkedIn provides for sites that want to render a post inline. No //! auth required, returns SSR HTML with the full post body, OG tags, //! image, and a link back to the original post. //! //! Accepts both URN forms (`urn:li:share:N` and `urn:li:activity:N`) //! and pretty post URLs (`/posts/{user}_{slug}-{id}-{suffix}`) by //! pulling the trailing numeric id and converting to an activity URN. use regex::Regex; use serde_json::{Value, json}; use std::sync::OnceLock; use super::ExtractorInfo; use crate::client::FetchClient; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { name: "linkedin_post", label: "LinkedIn post", description: "Returns post body, author name, image, and original URL via LinkedIn's public embed endpoint.", url_patterns: &[ "https://www.linkedin.com/feed/update/urn:li:share:{id}", "https://www.linkedin.com/feed/update/urn:li:activity:{id}", "https://www.linkedin.com/posts/{user}_{slug}-{id}-{suffix}", ], }; pub fn matches(url: &str) -> bool { let host = host_of(url); if !matches!(host, "www.linkedin.com" | "linkedin.com") { return false; } url.contains("/feed/update/urn:li:") || url.contains("/posts/") } pub async fn extract(client: &FetchClient, url: &str) -> Result { let urn = extract_urn(url).ok_or_else(|| { FetchError::Build(format!( "linkedin_post: cannot extract URN from '{url}' (expected /feed/update/urn:li:... or /posts/{{slug}}-{{id}})" )) })?; let embed_url = format!("https://www.linkedin.com/embed/feed/update/{urn}"); let resp = client.fetch(&embed_url).await?; if resp.status != 200 { return Err(FetchError::Build(format!( "linkedin embed returned status {} for {urn}", resp.status ))); } let html = &resp.html; let og = parse_og_tags(html); let body = parse_post_body(html); let author = parse_author(html); let canonical_url = og.get("url").cloned().unwrap_or_else(|| embed_url.clone()); Ok(json!({ "url": url, "embed_url": embed_url, "urn": urn, "canonical_url": canonical_url, "data_completeness": "embed", "title": og.get("title").cloned(), "body": body, "author_name": author, "image_url": og.get("image").cloned(), "site_name": og.get("site_name").cloned().unwrap_or_else(|| "LinkedIn".into()), })) } // --------------------------------------------------------------------------- // URN extraction // --------------------------------------------------------------------------- /// Pull a `urn:li:share:N` or `urn:li:activity:N` from any LinkedIn URL. /// `/posts/{slug}-{id}-{suffix}` URLs encode the activity id as the second- /// to-last `-` separated chunk. Both forms map to a URN we can hit the /// embed endpoint with. fn extract_urn(url: &str) -> Option { if let Some(idx) = url.find("urn:li:") { let tail = &url[idx..]; let end = tail.find(['/', '?', '#']).unwrap_or(tail.len()); let urn = &tail[..end]; // Validate shape: urn:li:{type}:{digits} let mut parts = urn.split(':'); if parts.next() == Some("urn") && parts.next() == Some("li") && parts.next().is_some() && parts .next() .filter(|p| p.chars().all(|c| c.is_ascii_digit())) .is_some() { return Some(urn.to_string()); } } // /posts/{user}_{slug}-{19-digit-id}-{4-char-hash}/ — id is the second- // to-last segment after the last `-`. if url.contains("/posts/") { static RE: OnceLock = OnceLock::new(); let re = RE.get_or_init(|| Regex::new(r"/posts/[^/]*?-(\d{15,})-[A-Za-z0-9]{2,}/?").unwrap()); if let Some(c) = re.captures(url) && let Some(id) = c.get(1) { return Some(format!("urn:li:activity:{}", id.as_str())); } } None } // --------------------------------------------------------------------------- // HTML scraping // --------------------------------------------------------------------------- /// Pull `og:foo` → value pairs out of ``. /// Returns lowercased keys with leading `og:` stripped. fn parse_og_tags(html: &str) -> std::collections::HashMap { static RE: OnceLock = OnceLock::new(); let re = RE.get_or_init(|| { Regex::new(r#"(?i)]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap() }); let mut out = std::collections::HashMap::new(); for c in re.captures_iter(html) { let k = c .get(1) .map(|m| m.as_str().to_lowercase()) .unwrap_or_default(); let v = c .get(2) .map(|m| html_decode(m.as_str())) .unwrap_or_default(); out.entry(k).or_insert(v); } out } /// Extract the post body text from the embed page. LinkedIn renders it /// inside `

{text}

` /// where the inner content can include nested `` tags for links. fn parse_post_body(html: &str) -> Option { static RE: OnceLock = OnceLock::new(); let re = RE.get_or_init(|| { Regex::new( r#"(?s)]+class="[^"]*attributed-text-segment-list__content[^"]*"[^>]*>(.*?)

"#, ) .unwrap() }); let inner = re.captures(html).and_then(|c| c.get(1))?.as_str(); Some(strip_tags(inner).trim().to_string()) } /// Author name lives in the `` like: /// "55 founding members are in… | Orc Dev" /// The chunk after the final `|` is the author display name. Falls back /// to the og:title minus the post body if there's no title. fn parse_author(html: &str) -> Option<String> { static RE_TITLE: OnceLock<Regex> = OnceLock::new(); let re = RE_TITLE.get_or_init(|| Regex::new(r"<title>([^<]+)").unwrap()); let title = re.captures(html).and_then(|c| c.get(1))?.as_str(); title .rsplit_once('|') .map(|(_, name)| html_decode(name.trim())) } /// Replace the small set of HTML entities LinkedIn (and Instagram, etc.) /// stuff into OG content attributes. fn html_decode(s: &str) -> String { s.replace("&", "&") .replace("<", "<") .replace(">", ">") .replace(""", "\"") .replace("'", "'") .replace("@", "@") .replace("•", "•") .replace("…", "…") } /// Crude HTML tag stripper for the post body. Preserves text inside /// nested anchors so URLs don't disappear, and collapses runs of /// whitespace introduced by line wrapping. fn strip_tags(html: &str) -> String { static RE: OnceLock = OnceLock::new(); let re = RE.get_or_init(|| Regex::new(r"<[^>]+>").unwrap()); let no_tags = re.replace_all(html, "").to_string(); html_decode(&no_tags) } fn host_of(url: &str) -> &str { url.split("://") .nth(1) .unwrap_or(url) .split('/') .next() .unwrap_or("") } #[cfg(test)] mod tests { use super::*; #[test] fn matches_li_post_urls() { assert!(matches( "https://www.linkedin.com/feed/update/urn:li:share:7452618582213144577/" )); assert!(matches( "https://www.linkedin.com/feed/update/urn:li:activity:7452618583290892288" )); assert!(matches( "https://www.linkedin.com/posts/somebody_some-slug-7452618583290892288-aB1c" )); assert!(!matches("https://www.linkedin.com/in/foo")); assert!(!matches("https://www.linkedin.com/")); assert!(!matches("https://example.com/feed/update/urn:li:share:1")); } #[test] fn extract_urn_from_share_url() { assert_eq!( extract_urn("https://www.linkedin.com/feed/update/urn:li:share:7452618582213144577/"), Some("urn:li:share:7452618582213144577".into()) ); } #[test] fn extract_urn_from_pretty_post_url() { assert_eq!( extract_urn( "https://www.linkedin.com/posts/somebody_some-slug-7452618583290892288-aB1c/" ), Some("urn:li:activity:7452618583290892288".into()) ); } #[test] fn parse_og_tags_basic() { let html = r#" "#; let og = parse_og_tags(html); assert_eq!( og.get("image").map(String::as_str), Some("https://x.com/a.png") ); assert_eq!( og.get("url").map(String::as_str), Some("https://example.com/x") ); } #[test] fn parse_post_body_strips_anchor_tags() { let html = r#"

Hello link world

"#; assert_eq!(parse_post_body(html).as_deref(), Some("Hello link world")); } #[test] fn html_decode_handles_common_entities() { assert_eq!(html_decode("AT&T @jane"), "AT&T @jane"); } }