mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-07 22:15:12 +02:00
267 lines
9.3 KiB
Rust
267 lines
9.3 KiB
Rust
|
|
//! LinkedIn post structured extractor.
|
||
|
|
//!
|
||
|
|
//! Uses the public embed endpoint `/embed/feed/update/{urn}` which
|
||
|
|
//! LinkedIn provides for sites that want to render a post inline. No
|
||
|
|
//! auth required, returns SSR HTML with the full post body, OG tags,
|
||
|
|
//! image, and a link back to the original post.
|
||
|
|
//!
|
||
|
|
//! Accepts both URN forms (`urn:li:share:N` and `urn:li:activity:N`)
|
||
|
|
//! and pretty post URLs (`/posts/{user}_{slug}-{id}-{suffix}`) by
|
||
|
|
//! pulling the trailing numeric id and converting to an activity URN.
|
||
|
|
|
||
|
|
use regex::Regex;
|
||
|
|
use serde_json::{Value, json};
|
||
|
|
use std::sync::OnceLock;
|
||
|
|
|
||
|
|
use super::ExtractorInfo;
|
||
|
|
use crate::client::FetchClient;
|
||
|
|
use crate::error::FetchError;
|
||
|
|
|
||
|
|
pub const INFO: ExtractorInfo = ExtractorInfo {
|
||
|
|
name: "linkedin_post",
|
||
|
|
label: "LinkedIn post",
|
||
|
|
description: "Returns post body, author name, image, and original URL via LinkedIn's public embed endpoint.",
|
||
|
|
url_patterns: &[
|
||
|
|
"https://www.linkedin.com/feed/update/urn:li:share:{id}",
|
||
|
|
"https://www.linkedin.com/feed/update/urn:li:activity:{id}",
|
||
|
|
"https://www.linkedin.com/posts/{user}_{slug}-{id}-{suffix}",
|
||
|
|
],
|
||
|
|
};
|
||
|
|
|
||
|
|
pub fn matches(url: &str) -> bool {
|
||
|
|
let host = host_of(url);
|
||
|
|
if !matches!(host, "www.linkedin.com" | "linkedin.com") {
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
url.contains("/feed/update/urn:li:") || url.contains("/posts/")
|
||
|
|
}
|
||
|
|
|
||
|
|
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
|
||
|
|
let urn = extract_urn(url).ok_or_else(|| {
|
||
|
|
FetchError::Build(format!(
|
||
|
|
"linkedin_post: cannot extract URN from '{url}' (expected /feed/update/urn:li:... or /posts/{{slug}}-{{id}})"
|
||
|
|
))
|
||
|
|
})?;
|
||
|
|
|
||
|
|
let embed_url = format!("https://www.linkedin.com/embed/feed/update/{urn}");
|
||
|
|
let resp = client.fetch(&embed_url).await?;
|
||
|
|
if resp.status != 200 {
|
||
|
|
return Err(FetchError::Build(format!(
|
||
|
|
"linkedin embed returned status {} for {urn}",
|
||
|
|
resp.status
|
||
|
|
)));
|
||
|
|
}
|
||
|
|
|
||
|
|
let html = &resp.html;
|
||
|
|
let og = parse_og_tags(html);
|
||
|
|
let body = parse_post_body(html);
|
||
|
|
let author = parse_author(html);
|
||
|
|
let canonical_url = og.get("url").cloned().unwrap_or_else(|| embed_url.clone());
|
||
|
|
|
||
|
|
Ok(json!({
|
||
|
|
"url": url,
|
||
|
|
"embed_url": embed_url,
|
||
|
|
"urn": urn,
|
||
|
|
"canonical_url": canonical_url,
|
||
|
|
"data_completeness": "embed",
|
||
|
|
"title": og.get("title").cloned(),
|
||
|
|
"body": body,
|
||
|
|
"author_name": author,
|
||
|
|
"image_url": og.get("image").cloned(),
|
||
|
|
"site_name": og.get("site_name").cloned().unwrap_or_else(|| "LinkedIn".into()),
|
||
|
|
}))
|
||
|
|
}
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
// URN extraction
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
/// Pull a `urn:li:share:N` or `urn:li:activity:N` from any LinkedIn URL.
|
||
|
|
/// `/posts/{slug}-{id}-{suffix}` URLs encode the activity id as the second-
|
||
|
|
/// to-last `-` separated chunk. Both forms map to a URN we can hit the
|
||
|
|
/// embed endpoint with.
|
||
|
|
fn extract_urn(url: &str) -> Option<String> {
|
||
|
|
if let Some(idx) = url.find("urn:li:") {
|
||
|
|
let tail = &url[idx..];
|
||
|
|
let end = tail.find(['/', '?', '#']).unwrap_or(tail.len());
|
||
|
|
let urn = &tail[..end];
|
||
|
|
// Validate shape: urn:li:{type}:{digits}
|
||
|
|
let mut parts = urn.split(':');
|
||
|
|
if parts.next() == Some("urn")
|
||
|
|
&& parts.next() == Some("li")
|
||
|
|
&& parts.next().is_some()
|
||
|
|
&& parts
|
||
|
|
.next()
|
||
|
|
.filter(|p| p.chars().all(|c| c.is_ascii_digit()))
|
||
|
|
.is_some()
|
||
|
|
{
|
||
|
|
return Some(urn.to_string());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// /posts/{user}_{slug}-{19-digit-id}-{4-char-hash}/ — id is the second-
|
||
|
|
// to-last segment after the last `-`.
|
||
|
|
if url.contains("/posts/") {
|
||
|
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||
|
|
let re =
|
||
|
|
RE.get_or_init(|| Regex::new(r"/posts/[^/]*?-(\d{15,})-[A-Za-z0-9]{2,}/?").unwrap());
|
||
|
|
if let Some(c) = re.captures(url)
|
||
|
|
&& let Some(id) = c.get(1)
|
||
|
|
{
|
||
|
|
return Some(format!("urn:li:activity:{}", id.as_str()));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
None
|
||
|
|
}
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
// HTML scraping
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
/// Pull `og:foo` → value pairs out of `<meta property="og:..." content="...">`.
|
||
|
|
/// Returns lowercased keys with leading `og:` stripped.
|
||
|
|
fn parse_og_tags(html: &str) -> std::collections::HashMap<String, String> {
|
||
|
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||
|
|
let re = RE.get_or_init(|| {
|
||
|
|
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||
|
|
});
|
||
|
|
let mut out = std::collections::HashMap::new();
|
||
|
|
for c in re.captures_iter(html) {
|
||
|
|
let k = c
|
||
|
|
.get(1)
|
||
|
|
.map(|m| m.as_str().to_lowercase())
|
||
|
|
.unwrap_or_default();
|
||
|
|
let v = c
|
||
|
|
.get(2)
|
||
|
|
.map(|m| html_decode(m.as_str()))
|
||
|
|
.unwrap_or_default();
|
||
|
|
out.entry(k).or_insert(v);
|
||
|
|
}
|
||
|
|
out
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Extract the post body text from the embed page. LinkedIn renders it
|
||
|
|
/// inside `<p class="attributed-text-segment-list__content ...">{text}</p>`
|
||
|
|
/// where the inner content can include nested `<a>` tags for links.
|
||
|
|
fn parse_post_body(html: &str) -> Option<String> {
|
||
|
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||
|
|
let re = RE.get_or_init(|| {
|
||
|
|
Regex::new(
|
||
|
|
r#"(?s)<p[^>]+class="[^"]*attributed-text-segment-list__content[^"]*"[^>]*>(.*?)</p>"#,
|
||
|
|
)
|
||
|
|
.unwrap()
|
||
|
|
});
|
||
|
|
let inner = re.captures(html).and_then(|c| c.get(1))?.as_str();
|
||
|
|
Some(strip_tags(inner).trim().to_string())
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Author name lives in the `<title>` like:
|
||
|
|
/// "55 founding members are in… | Orc Dev"
|
||
|
|
/// The chunk after the final `|` is the author display name. Falls back
|
||
|
|
/// to the og:title minus the post body if there's no title.
|
||
|
|
fn parse_author(html: &str) -> Option<String> {
|
||
|
|
static RE_TITLE: OnceLock<Regex> = OnceLock::new();
|
||
|
|
let re = RE_TITLE.get_or_init(|| Regex::new(r"<title>([^<]+)</title>").unwrap());
|
||
|
|
let title = re.captures(html).and_then(|c| c.get(1))?.as_str();
|
||
|
|
title
|
||
|
|
.rsplit_once('|')
|
||
|
|
.map(|(_, name)| html_decode(name.trim()))
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Replace the small set of HTML entities LinkedIn (and Instagram, etc.)
|
||
|
|
/// stuff into OG content attributes.
|
||
|
|
fn html_decode(s: &str) -> String {
|
||
|
|
s.replace("&", "&")
|
||
|
|
.replace("<", "<")
|
||
|
|
.replace(">", ">")
|
||
|
|
.replace(""", "\"")
|
||
|
|
.replace("'", "'")
|
||
|
|
.replace("@", "@")
|
||
|
|
.replace("•", "•")
|
||
|
|
.replace("…", "…")
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Crude HTML tag stripper for the post body. Preserves text inside
|
||
|
|
/// nested anchors so URLs don't disappear, and collapses runs of
|
||
|
|
/// whitespace introduced by line wrapping.
|
||
|
|
fn strip_tags(html: &str) -> String {
|
||
|
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||
|
|
let re = RE.get_or_init(|| Regex::new(r"<[^>]+>").unwrap());
|
||
|
|
let no_tags = re.replace_all(html, "").to_string();
|
||
|
|
html_decode(&no_tags)
|
||
|
|
}
|
||
|
|
|
||
|
|
fn host_of(url: &str) -> &str {
|
||
|
|
url.split("://")
|
||
|
|
.nth(1)
|
||
|
|
.unwrap_or(url)
|
||
|
|
.split('/')
|
||
|
|
.next()
|
||
|
|
.unwrap_or("")
|
||
|
|
}
|
||
|
|
|
||
|
|
#[cfg(test)]
|
||
|
|
mod tests {
|
||
|
|
use super::*;
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn matches_li_post_urls() {
|
||
|
|
assert!(matches(
|
||
|
|
"https://www.linkedin.com/feed/update/urn:li:share:7452618582213144577/"
|
||
|
|
));
|
||
|
|
assert!(matches(
|
||
|
|
"https://www.linkedin.com/feed/update/urn:li:activity:7452618583290892288"
|
||
|
|
));
|
||
|
|
assert!(matches(
|
||
|
|
"https://www.linkedin.com/posts/somebody_some-slug-7452618583290892288-aB1c"
|
||
|
|
));
|
||
|
|
assert!(!matches("https://www.linkedin.com/in/foo"));
|
||
|
|
assert!(!matches("https://www.linkedin.com/"));
|
||
|
|
assert!(!matches("https://example.com/feed/update/urn:li:share:1"));
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn extract_urn_from_share_url() {
|
||
|
|
assert_eq!(
|
||
|
|
extract_urn("https://www.linkedin.com/feed/update/urn:li:share:7452618582213144577/"),
|
||
|
|
Some("urn:li:share:7452618582213144577".into())
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn extract_urn_from_pretty_post_url() {
|
||
|
|
assert_eq!(
|
||
|
|
extract_urn(
|
||
|
|
"https://www.linkedin.com/posts/somebody_some-slug-7452618583290892288-aB1c/"
|
||
|
|
),
|
||
|
|
Some("urn:li:activity:7452618583290892288".into())
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn parse_og_tags_basic() {
|
||
|
|
let html = r#"<meta property="og:image" content="https://x.com/a.png">
|
||
|
|
<meta property="og:url" content="https://example.com/x">"#;
|
||
|
|
let og = parse_og_tags(html);
|
||
|
|
assert_eq!(
|
||
|
|
og.get("image").map(String::as_str),
|
||
|
|
Some("https://x.com/a.png")
|
||
|
|
);
|
||
|
|
assert_eq!(
|
||
|
|
og.get("url").map(String::as_str),
|
||
|
|
Some("https://example.com/x")
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn parse_post_body_strips_anchor_tags() {
|
||
|
|
let html = r#"<p class="attributed-text-segment-list__content text-color-text" dir="ltr">Hello <a href="x">link</a> world</p>"#;
|
||
|
|
assert_eq!(parse_post_body(html).as_deref(), Some("Hello link world"));
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn html_decode_handles_common_entities() {
|
||
|
|
assert_eq!(html_decode("AT&T @jane"), "AT&T @jane");
|
||
|
|
}
|
||
|
|
}
|