webclaw/crates/webclaw-fetch/src/extractors/linkedin_post.rs
Valerio 3bb0a4bca0 feat(extractors): add LinkedIn + Instagram with profile-to-posts fan-out
3 social-network extractors that work entirely without auth, using
public embed/preview endpoints + Instagram's own SEO-facing API:

- linkedin_post:      /embed/feed/update/{urn} returns full body,
                      author, image, OG tags. Accepts both the urn:li:share
                      and urn:li:activity URN forms plus the pretty
                      /posts/{slug}-{id}-{suffix} URLs.

- instagram_post:     /p/{shortcode}/embed/captioned/ returns the full
                      caption, username, thumbnail. Same endpoint serves
                      reels and IGTV, kind correctly classified.

- instagram_profile:  /api/v1/users/web_profile_info/?username=X with the
                      x-ig-app-id header (Instagram's public web-app id,
                      sent by their own JS bundle). Returns the full
                      profile + the 12 most recent posts with shortcodes,
                      kinds, like/comment counts, thumbnails, and caption
                      previews. Falls back to OG-tag scraping of the
                      public HTML if the API ever 401/403s.

The IG profile output is shaped so callers can fan out cleanly:
  for p in profile.recent_posts:
      scrape('instagram_post', p.url)
giving you 'whole profile + every recent post' in one loop. End-to-end
tested against ticketswave: 1 profile call + 12 post calls in ~3.5s.
Pagination beyond 12 posts requires authenticated cookies and is left
for the cloud where we can stash a session.

Infrastructure change: added FetchClient::fetch_with_headers so
extractors can satisfy site-specific request headers (here x-ig-app-id;
later github_pr will use this for Authorization, etc.) without polluting
the global FetchConfig.headers map. Same retry semantics as fetch().

Catalog now exposes 17 extractors via /v1/extractors. Total unit tests
across the module: 47 passing. Clippy clean. Fmt clean.

Live test on the maintainer's example URLs:
- LinkedIn post (urn:li:share:7452618582213144577): 'Orc Dev' / full body
  / shipper.club link / CDN image extracted in 250ms.
- Instagram post (DT-RICMjeK5): 835-char Slovak caption, ticketswave
  username, thumbnail. 200ms.
- Instagram profile (ticketswave): 18,473 followers (exact, not
  rounded), is_verified=True, is_business=True, biography with emojis,
  12 recent posts with shortcodes + kinds + likes. 400ms.

Out of scope for this wave (require infra we don't have):
- linkedin_profile: returns 999 to all bot UAs, needs OAuth
- facebook_post / facebook_page: content is JS-loaded, needs cloud Chrome
- facebook_profile (personal): not publicly accessible by design
2026-04-22 14:39:49 +02:00

266 lines
9.3 KiB
Rust

//! LinkedIn post structured extractor.
//!
//! Uses the public embed endpoint `/embed/feed/update/{urn}` which
//! LinkedIn provides for sites that want to render a post inline. No
//! auth required, returns SSR HTML with the full post body, OG tags,
//! image, and a link back to the original post.
//!
//! Accepts both URN forms (`urn:li:share:N` and `urn:li:activity:N`)
//! and pretty post URLs (`/posts/{user}_{slug}-{id}-{suffix}`) by
//! pulling the trailing numeric id and converting to an activity URN.
use regex::Regex;
use serde_json::{Value, json};
use std::sync::OnceLock;
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Descriptive metadata for the LinkedIn post extractor: a machine name,
/// a display label, a one-line description, and the URL shapes it handles.
pub const INFO: ExtractorInfo = ExtractorInfo {
// Identifier for this extractor.
name: "linkedin_post",
// Human-readable label.
label: "LinkedIn post",
description: "Returns post body, author name, image, and original URL via LinkedIn's public embed endpoint.",
// Illustrative URL shapes with `{...}` placeholders. URL matching in this
// file is performed by `matches`, which does not read these strings.
url_patterns: &[
"https://www.linkedin.com/feed/update/urn:li:share:{id}",
"https://www.linkedin.com/feed/update/urn:li:activity:{id}",
"https://www.linkedin.com/posts/{user}_{slug}-{id}-{suffix}",
],
};
/// Report whether `url` looks like a LinkedIn post this extractor handles.
///
/// The host must be exactly `www.linkedin.com` or `linkedin.com`, and the
/// URL must contain either a `/feed/update/urn:li:` segment or a `/posts/`
/// segment.
pub fn matches(url: &str) -> bool {
    let on_linkedin = matches!(host_of(url), "www.linkedin.com" | "linkedin.com");
    let is_urn_update = url.contains("/feed/update/urn:li:");
    on_linkedin && (is_urn_update || url.contains("/posts/"))
}
/// Fetch a LinkedIn post through the public embed endpoint and return its
/// scraped fields as a JSON object.
///
/// The URN is derived from `url` with `extract_urn`, the page at
/// `https://www.linkedin.com/embed/feed/update/{urn}` is fetched, and the
/// OG tags, post body, and author name are scraped from the returned HTML.
/// `canonical_url` is the `og:url` value when present, otherwise the embed
/// URL; `site_name` defaults to `"LinkedIn"`.
///
/// # Errors
///
/// Returns `FetchError::Build` when no URN can be derived from `url` or
/// when the embed endpoint answers with a status other than 200. Errors
/// from `client.fetch` are propagated unchanged.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let Some(urn) = extract_urn(url) else {
        return Err(FetchError::Build(format!(
            "linkedin_post: cannot extract URN from '{url}' (expected /feed/update/urn:li:... or /posts/{{slug}}-{{id}})"
        )));
    };

    let embed_url = format!("https://www.linkedin.com/embed/feed/update/{urn}");
    let resp = client.fetch(&embed_url).await?;
    if resp.status != 200 {
        let message = format!(
            "linkedin embed returned status {} for {urn}",
            resp.status
        );
        return Err(FetchError::Build(message));
    }

    // Scrape everything we need from the server-rendered embed page.
    let og = parse_og_tags(&resp.html);
    let body = parse_post_body(&resp.html);
    let author = parse_author(&resp.html);

    let canonical_url = match og.get("url") {
        Some(found) => found.clone(),
        None => embed_url.clone(),
    };
    let site_name = og
        .get("site_name")
        .cloned()
        .unwrap_or_else(|| String::from("LinkedIn"));
    let title = og.get("title").cloned();
    let image_url = og.get("image").cloned();

    Ok(json!({
        "url": url,
        "embed_url": embed_url,
        "urn": urn,
        "canonical_url": canonical_url,
        "data_completeness": "embed",
        "title": title,
        "body": body,
        "author_name": author,
        "image_url": image_url,
        "site_name": site_name,
    }))
}
// ---------------------------------------------------------------------------
// URN extraction
// ---------------------------------------------------------------------------
/// Derive the URN to request from the embed endpoint for a LinkedIn post URL.
///
/// Two URL shapes are understood:
///
/// * URLs that carry a URN directly (`.../urn:li:share:N`,
///   `.../urn:li:activity:N`). The URN runs up to the next `/`, `?` or `#`
///   and is returned verbatim once it passes the shape check
///   `urn:li:{type}:{digits}`: exactly four `:`-separated parts, a
///   non-empty type, and a non-empty id made only of ASCII digits.
/// * Pretty URLs (`/posts/{user}_{slug}-{id}-{suffix}`), where the numeric
///   id is the second-to-last `-`-separated chunk. It is wrapped as
///   `urn:li:activity:{id}`.
///
/// Returns `None` when neither shape yields a usable id.
fn extract_urn(url: &str) -> Option<String> {
    if let Some(start) = url.find("urn:li:") {
        let tail = &url[start..];
        let end = tail.find(['/', '?', '#']).unwrap_or(tail.len());
        let urn = &tail[..end];

        // `all()` is vacuously true on an empty string, so emptiness has to
        // be rejected explicitly; leftover segments are rejected as well so
        // that only `urn:li:{type}:{digits}` is ever sent to the endpoint.
        let mut parts = urn.split(':');
        let well_formed = parts.next() == Some("urn")
            && parts.next() == Some("li")
            && parts.next().is_some_and(|kind| !kind.is_empty())
            && parts
                .next()
                .is_some_and(|id| !id.is_empty() && id.bytes().all(|b| b.is_ascii_digit()))
            && parts.next().is_none();
        if well_formed {
            return Some(urn.to_string());
        }
    }

    // /posts/{user}_{slug}-{19-digit-id}-{4-char-hash}/ — the id sits between
    // the last two `-` separators of the final path segment.
    if !url.contains("/posts/") {
        return None;
    }
    static RE: OnceLock<Regex> = OnceLock::new();
    let re =
        RE.get_or_init(|| Regex::new(r"/posts/[^/]*?-(\d{15,})-[A-Za-z0-9]{2,}/?").unwrap());
    let id = re.captures(url)?.get(1)?;
    Some(format!("urn:li:activity:{}", id.as_str()))
}
// ---------------------------------------------------------------------------
// HTML scraping
// ---------------------------------------------------------------------------
/// Collect Open Graph metadata from `<meta property="og:..." content="...">`
/// tags.
///
/// Keys are lowercased and have the `og:` prefix removed; values are passed
/// through `html_decode`. When a key appears more than once, the first
/// occurrence in the document wins.
fn parse_og_tags(html: &str) -> std::collections::HashMap<String, String> {
    static OG_META: OnceLock<Regex> = OnceLock::new();
    let og_meta = OG_META.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    });

    let mut tags = std::collections::HashMap::new();
    for caps in og_meta.captures_iter(html) {
        let key = caps
            .get(1)
            .map_or_else(String::new, |m| m.as_str().to_lowercase());
        // Keep the earliest value for a repeated key.
        if tags.contains_key(&key) {
            continue;
        }
        let value = caps
            .get(2)
            .map_or_else(String::new, |m| html_decode(m.as_str()));
        tags.insert(key, value);
    }
    tags
}
/// Pull the post text out of the embed page.
///
/// The text is taken from the first `<p>` whose `class` attribute contains
/// `attributed-text-segment-list__content`. Markup inside that paragraph
/// (for example nested `<a>` tags) is removed with `strip_tags`, and the
/// result is trimmed. Returns `None` when no such paragraph is present.
fn parse_post_body(html: &str) -> Option<String> {
    static BODY_PARAGRAPH: OnceLock<Regex> = OnceLock::new();
    let body_paragraph = BODY_PARAGRAPH.get_or_init(|| {
        Regex::new(
            r#"(?s)<p[^>]+class="[^"]*attributed-text-segment-list__content[^"]*"[^>]*>(.*?)</p>"#,
        )
        .unwrap()
    });

    let caps = body_paragraph.captures(html)?;
    let inner_html = caps.get(1)?.as_str();
    let text = strip_tags(inner_html);
    Some(text.trim().to_string())
}
/// Read the author display name from the page `<title>`, which looks like:
/// "55 founding members are in… | Orc Dev"
///
/// The text after the final `|` is trimmed, entity-decoded, and returned.
/// Returns `None` when there is no `<title>` element or the title contains
/// no `|` separator.
fn parse_author(html: &str) -> Option<String> {
    static TITLE_TAG: OnceLock<Regex> = OnceLock::new();
    let title_tag = TITLE_TAG.get_or_init(|| Regex::new(r"<title>([^<]+)</title>").unwrap());

    let caps = title_tag.captures(html)?;
    let title = caps.get(1)?.as_str();
    let (_, name) = title.rsplit_once('|')?;
    Some(html_decode(name.trim()))
}
/// Decode the HTML character references that LinkedIn (and Instagram, etc.)
/// put into OG content attributes and page text.
///
/// Recognised references:
///
/// * named: `&amp;` `&lt;` `&gt;` `&quot;` `&apos;` `&nbsp;` `&hellip;`
/// * numeric, decimal or hexadecimal: `&#39;`, `&#064;`, `&#x2022;`, ...
///
/// Anything else that starts with `&` is copied through unchanged.
///
/// Decoding happens in a single left-to-right pass, so output is never
/// decoded a second time: `&amp;lt;` yields the literal text `&lt;`, not `<`.
fn html_decode(s: &str) -> String {
    /// How many leading bytes of a candidate reference are searched for the
    /// closing `;`. The longest reference handled here, `&#x10FFFF;`, is 10
    /// bytes, and the cap keeps input with many stray `&` linear.
    const MAX_REF_LEN: usize = 12;

    /// Decode the reference at the start of `tail` (which begins with `&`).
    /// Returns the decoded character and the number of bytes it occupied.
    fn decode_ref(tail: &str) -> Option<(char, usize)> {
        let semi = tail.bytes().take(MAX_REF_LEN).position(|b| b == b';')?;
        // `&` and `;` are ASCII, so both slice bounds are char boundaries.
        let name = &tail[1..semi];

        let decoded = match name.strip_prefix('#') {
            Some(number) => {
                let (digits, radix) = match number.strip_prefix(['x', 'X']) {
                    Some(hex) => (hex, 16),
                    None => (number, 10),
                };
                // `from_str_radix` tolerates a leading `+`; screen explicitly.
                if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
                    return None;
                }
                let code = u32::from_str_radix(digits, radix).ok()?;
                // Surrogates and out-of-range values fail `from_u32`; NUL is
                // left undecoded rather than injected into the output.
                char::from_u32(code).filter(|c| *c != '\0')?
            }
            None => match name {
                "amp" => '&',
                "lt" => '<',
                "gt" => '>',
                "quot" => '"',
                "apos" => '\'',
                "nbsp" => '\u{a0}',
                "hellip" => '…',
                _ => return None,
            },
        };
        Some((decoded, semi + 1))
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        match decode_ref(tail) {
            Some((ch, consumed)) => {
                out.push(ch);
                rest = &tail[consumed..];
            }
            None => {
                // Not a reference we know: keep the ampersand literally.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
/// Crude HTML tag stripper for the post body.
///
/// Every `<...>` tag is deleted while the text between tags is kept, so the
/// visible text of nested anchors survives. Entities in the remaining text
/// are then decoded with `html_decode`. Whitespace is left exactly as it
/// was; trimming is up to the caller.
fn strip_tags(html: &str) -> String {
    static TAG: OnceLock<Regex> = OnceLock::new();
    let tag = TAG.get_or_init(|| Regex::new(r"<[^>]+>").unwrap());
    let text_only = tag.replace_all(html, "");
    html_decode(&text_only)
}
/// Return the authority (host) portion of `url` as a borrowed slice.
///
/// A leading `scheme://` is skipped when present. The host then runs up to
/// the first `/`, `?` or `#`, so URLs with a query or fragment but no path
/// (`https://linkedin.com?x=1`) still yield a clean host. A `://` that only
/// appears later in the URL (for example inside a query value of a
/// scheme-less URL) is not treated as the scheme separator.
///
/// No normalisation is performed: case, port, and userinfo are returned
/// exactly as written. An empty input yields an empty string.
fn host_of(url: &str) -> &str {
    const AUTHORITY_END: [char; 3] = ['/', '?', '#'];

    let after_scheme = match url.split_once("://") {
        // Only a prefix free of path/query/fragment delimiters can be a scheme.
        Some((scheme, rest)) if !scheme.contains(AUTHORITY_END) => rest,
        _ => url,
    };
    let end = after_scheme
        .find(AUTHORITY_END)
        .unwrap_or(after_scheme.len());
    &after_scheme[..end]
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The matcher accepts the supported LinkedIn post shapes and rejects
    /// other LinkedIn pages and foreign hosts.
    #[test]
    fn matches_li_post_urls() {
        let accepted = [
            "https://www.linkedin.com/feed/update/urn:li:share:7452618582213144577/",
            "https://www.linkedin.com/feed/update/urn:li:activity:7452618583290892288",
            "https://www.linkedin.com/posts/somebody_some-slug-7452618583290892288-aB1c",
        ];
        for url in accepted {
            assert!(matches(url), "expected a match for {url}");
        }

        let rejected = [
            "https://www.linkedin.com/in/foo",
            "https://www.linkedin.com/",
            "https://example.com/feed/update/urn:li:share:1",
        ];
        for url in rejected {
            assert!(!matches(url), "expected no match for {url}");
        }
    }

    /// A URN embedded in the URL is returned verbatim, without the trailing slash.
    #[test]
    fn extract_urn_from_share_url() {
        let urn =
            extract_urn("https://www.linkedin.com/feed/update/urn:li:share:7452618582213144577/");
        assert_eq!(urn.as_deref(), Some("urn:li:share:7452618582213144577"));
    }

    /// A pretty `/posts/` URL is converted to an activity URN.
    #[test]
    fn extract_urn_from_pretty_post_url() {
        let urn = extract_urn(
            "https://www.linkedin.com/posts/somebody_some-slug-7452618583290892288-aB1c/",
        );
        assert_eq!(urn.as_deref(), Some("urn:li:activity:7452618583290892288"));
    }

    /// OG keys come back without the `og:` prefix.
    #[test]
    fn parse_og_tags_basic() {
        let html = [
            r#"<meta property="og:image" content="https://x.com/a.png">"#,
            r#"<meta property="og:url" content="https://example.com/x">"#,
        ]
        .join("\n");
        let og = parse_og_tags(&html);

        let expected = [
            ("image", "https://x.com/a.png"),
            ("url", "https://example.com/x"),
        ];
        for (key, value) in expected {
            assert_eq!(og.get(key).map(String::as_str), Some(value));
        }
    }

    /// Anchor markup is removed while its visible text is kept.
    #[test]
    fn parse_post_body_strips_anchor_tags() {
        let html = r#"<p class="attributed-text-segment-list__content text-color-text" dir="ltr">Hello <a href="x">link</a> world</p>"#;
        let body = parse_post_body(html);
        assert_eq!(body.as_deref(), Some("Hello link world"));
    }

    /// Named and numeric entities are both decoded.
    #[test]
    fn html_decode_handles_common_entities() {
        let decoded = html_decode("AT&amp;T &#064;jane");
        assert_eq!(decoded, "AT&T @jane");
    }
}