feat(extractors): add LinkedIn + Instagram with profile-to-posts fan-out

3 social-network extractors that work entirely without auth, using public embed/preview endpoints + Instagram's own SEO-facing API: - linkedin_post: /embed/feed/update/{urn} returns full body, author, image, OG tags. Accepts both the urn:li:share and urn:li:activity URN forms plus the pretty /posts/{slug}-{id}-{suffix} URLs. - instagram_post: /p/{shortcode}/embed/captioned/ returns the full caption, username, thumbnail. Same endpoint serves reels and IGTV, kind correctly classified. - instagram_profile: /api/v1/users/web_profile_info/?username=X with the x-ig-app-id header (Instagram's public web-app id, sent by their own JS bundle). Returns the full profile + the 12 most recent posts with shortcodes, kinds, like/comment counts, thumbnails, and caption previews. Falls back to OG-tag scraping of the public HTML if the API ever 401/403s. The IG profile output is shaped so callers can fan out cleanly: for p in profile.recent_posts: scrape('instagram_post', p.url) giving you 'whole profile + every recent post' in one loop. End-to-end tested against ticketswave: 1 profile call + 12 post calls in ~3.5s. Pagination beyond 12 posts requires authenticated cookies and is left for the cloud where we can stash a session. Infrastructure change: added FetchClient::fetch_with_headers so extractors can satisfy site-specific request headers (here x-ig-app-id; later github_pr will use this for Authorization, etc.) without polluting the global FetchConfig.headers map. Same retry semantics as fetch(). Catalog now exposes 17 extractors via /v1/extractors. Total unit tests across the module: 47 passing. Clippy clean. Fmt clean. Live test on the maintainer's example URLs: - LinkedIn post (urn:li:share:7452618582213144577): 'Orc Dev' / full body / shipper.club link / CDN image extracted in 250ms. - Instagram post (DT-RICMjeK5): 835-char Slovak caption, ticketswave username, thumbnail. 200ms. 
- Instagram profile (ticketswave): 18,473 followers (exact, not rounded), is_verified=True, is_business=True, biography with emojis, 12 recent posts with shortcodes + kinds + likes. 400ms. Out of scope for this wave (require infra we don't have): - linkedin_profile: returns 999 to all bot UAs, needs OAuth - facebook_post / facebook_page: content is JS-loaded, needs cloud Chrome - facebook_profile (personal): not publicly accessible by design
2026-06-06 22:05:13 +02:00 · 2026-04-22 14:39:49 +02:00 · 2026-04-22 14:39:49 +02:00 · 3bb0a4bca0
commit 3bb0a4bca0
parent b041f3cddd
7 changed files with 1085 additions and 1 deletions
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@ -279,14 +279,85 @@ impl FetchClient {

    /// Single fetch attempt.
+    ///
+    /// Delegates to `fetch_once_with_headers` with an empty extra-header
+    /// slice, so no per-request headers are added.
    async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
+        self.fetch_once_with_headers(url, &[]).await
+    }
+
+    /// Single fetch attempt with optional per-request headers appended
+    /// after the profile defaults. Used by extractors that need to
+    /// satisfy site-specific headers (e.g. `x-ig-app-id` for Instagram's
+    /// internal API).
+    ///
+    /// `extra` is a slice of `(name, value)` pairs applied in order to the
+    /// GET request before it is sent. No retry happens at this level.
+    ///
+    /// NOTE(review): whether a header in `extra` replaces or is sent
+    /// alongside a same-named profile default depends on the request
+    /// builder's `header` semantics — confirm against the HTTP client.
+    async fn fetch_once_with_headers(
+        &self,
+        url: &str,
+        extra: &[(&str, &str)],
+    ) -> Result<FetchResult, FetchError> {
        let start = Instant::now();
        let client = self.pick_client(url);

-        let resp = client.get(url).send().await?;
+        let mut req = client.get(url);
+        for (k, v) in extra {
+            req = req.header(*k, *v);
+        }
+        let resp = req.send().await?;
        let response = Response::from_wreq(resp).await?;
        response_to_result(response, start)
    }

+    /// Fetch a URL with extra per-request headers appended after the
+    /// browser-profile defaults. Same retry semantics as `fetch`.
+    ///
+    /// Use this when an upstream API requires a header the global
+    /// `FetchConfig.headers` shouldn't carry to other hosts (Instagram's
+    /// `x-ig-app-id`, GitHub's `Authorization` once we wire `GITHUB_TOKEN`,
+    /// Reddit's compliant UA when we add OAuth, etc.).
+    ///
+    /// At most two attempts are made, the second after a one-second
+    /// pause. A retryable HTTP status on the final attempt is returned as
+    /// `Ok` (callers must inspect `status`); an error is returned as `Err`
+    /// as soon as it is non-retryable or the attempts run out.
+    #[instrument(skip(self, extra), fields(url = %url, extra_count = extra.len()))]
+    pub async fn fetch_with_headers(
+        &self,
+        url: &str,
+        extra: &[(&str, &str)],
+    ) -> Result<FetchResult, FetchError> {
+        // One entry per attempt; the value is the pause taken before it.
+        let delays = [Duration::ZERO, Duration::from_secs(1)];
+        let mut last_err = None;
+
+        for (attempt, delay) in delays.iter().enumerate() {
+            if attempt > 0 {
+                tokio::time::sleep(*delay).await;
+            }
+            match self.fetch_once_with_headers(url, extra).await {
+                Ok(result) => {
+                    // Retry on a retryable status only while attempts remain;
+                    // otherwise hand the response back as-is.
+                    if is_retryable_status(result.status) && attempt < delays.len() - 1 {
+                        warn!(
+                            url,
+                            status = result.status,
+                            attempt = attempt + 1,
+                            "retryable status, will retry"
+                        );
+                        last_err = Some(FetchError::Build(format!("HTTP {}", result.status)));
+                        continue;
+                    }
+                    if attempt > 0 {
+                        debug!(url, attempt = attempt + 1, "retry succeeded");
+                    }
+                    return Ok(result);
+                }
+                Err(e) => {
+                    // Give up immediately on non-retryable errors or on the
+                    // last attempt.
+                    if !is_retryable_error(&e) || attempt == delays.len() - 1 {
+                        return Err(e);
+                    }
+                    warn!(
+                        url,
+                        error = %e,
+                        attempt = attempt + 1,
+                        "transient error, will retry"
+                    );
+                    last_err = Some(e);
+                }
+            }
+        }
+
+        // Defensive fallback: with the current `delays` the final loop
+        // iteration always returns, so this is not expected to be reached.
+        Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
+    }
+
    /// Fetch a URL then extract structured content.
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch_and_extract(
--- a/crates/webclaw-fetch/src/extractors/instagram_post.rs
+++ b/crates/webclaw-fetch/src/extractors/instagram_post.rs
@ -0,0 +1,235 @@
+//! Instagram post structured extractor.
+//!
+//! Uses Instagram's public embed endpoint
+//! `/p/{shortcode}/embed/captioned/` which returns SSR HTML with the
+//! full caption, author username, and thumbnail. No auth required.
+//! Reels and IGTV posts are served by that same `/p/` embed endpoint, so
+//! `/reel/{code}`, `/reels/{code}` and `/tv/{code}` URLs are accepted too
+//! and fetched via `/p/{code}/embed/captioned/`.
+
+use regex::Regex;
+use serde_json::{Value, json};
+use std::sync::OnceLock;
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+/// Static descriptor for the Instagram post extractor: its machine name,
+/// human-readable label, one-line description, and the URL shapes it
+/// is meant to handle (`/p/`, `/reel/`, `/tv/`).
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "instagram_post",
+    label: "Instagram post",
+    description: "Returns full caption, author username, thumbnail, and post type (post / reel / tv) via Instagram's public embed.",
+    url_patterns: &[
+        "https://www.instagram.com/p/{shortcode}/",
+        "https://www.instagram.com/reel/{shortcode}/",
+        "https://www.instagram.com/tv/{shortcode}/",
+    ],
+};
+
+/// Reports whether `url` points at an Instagram post, reel, or IGTV
+/// page: the host must be `instagram.com` (with or without `www.`) and
+/// the path must yield a shortcode.
+pub fn matches(url: &str) -> bool {
+    let on_instagram = matches!(host_of(url), "www.instagram.com" | "instagram.com");
+    on_instagram && parse_shortcode(url).is_some()
+}
+
+/// Fetches the public embed page for the post behind `url` and scrapes
+/// the author username, caption, and thumbnail out of its HTML.
+///
+/// # Errors
+///
+/// Returns `FetchError::Build` when no shortcode can be parsed from `url`
+/// or when the embed endpoint answers with a status other than 200;
+/// fetch errors from the client are propagated unchanged.
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let Some((kind, shortcode)) = parse_shortcode(url) else {
+        return Err(FetchError::Build(format!(
+            "instagram_post: cannot parse shortcode from '{url}'"
+        )));
+    };
+
+    // Instagram serves the same embed HTML for posts/reels/tv under /p/.
+    let embed_url = format!("https://www.instagram.com/p/{shortcode}/embed/captioned/");
+    let page = client.fetch(&embed_url).await?;
+    if page.status != 200 {
+        let status = page.status;
+        return Err(FetchError::Build(format!(
+            "instagram embed returned status {status} for {shortcode}"
+        )));
+    }
+
+    // The canonical link keeps the original kind (/p/, /reel/, /tv/).
+    let canonical_url = format!(
+        "https://www.instagram.com/{}/{shortcode}/",
+        path_segment_for(kind)
+    );
+    let markup = &page.html;
+
+    Ok(json!({
+        "url":               url,
+        "embed_url":         embed_url,
+        "shortcode":         shortcode,
+        "kind":              kind,
+        "data_completeness": "embed",
+        "author_username":   parse_username(markup),
+        "caption":           parse_caption(markup),
+        "thumbnail_url":     parse_thumbnail(markup),
+        "canonical_url":     canonical_url,
+    }))
+}
+
+// ---------------------------------------------------------------------------
+// URL parsing
+// ---------------------------------------------------------------------------
+
+/// Returns the authority (host) portion of `url`.
+///
+/// Everything after the first `://` is taken (or the whole input when no
+/// scheme is present) and cut at the first `/`, `?`, or `#`, so a URL
+/// with a query or fragment but no path — e.g.
+/// `https://instagram.com?hl=en` — still yields the bare host.
+fn host_of(url: &str) -> &str {
+    url.split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split(['/', '?', '#'])
+        .next()
+        .unwrap_or("")
+}
+
+/// Returns `(kind, shortcode)` where kind ∈ {`post`, `reel`, `tv`}.
+///
+/// The first path segment selects the kind (`p`, `reel`/`reels`, `tv`);
+/// the second is the shortcode. Query strings and fragments are ignored.
+///
+/// The shortcode is later interpolated into the embed URL, so it is
+/// accepted only when it consists of ASCII letters, digits, `-`, and `_`.
+/// Anything else (percent-escapes, stray punctuation) yields `None`
+/// instead of producing a malformed request.
+fn parse_shortcode(url: &str) -> Option<(&'static str, String)> {
+    let (_, path) = url.split("://").nth(1)?.split_once('/')?;
+    let path = path.split(['?', '#']).next()?;
+    // Empty segments are filtered out, so a returned segment is never "".
+    let mut segs = path.split('/').filter(|s| !s.is_empty());
+    let kind = match segs.next()? {
+        "p" => "post",
+        "reel" | "reels" => "reel",
+        "tv" => "tv",
+        _ => return None,
+    };
+    let shortcode = segs.next()?;
+    let well_formed = shortcode
+        .bytes()
+        .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_'));
+    well_formed.then(|| (kind, shortcode.to_string()))
+}
+
+/// Maps a post kind back to the URL path segment used for its canonical
+/// link: `reel` and `tv` keep their own segment, everything else is `p`.
+fn path_segment_for(kind: &str) -> &'static str {
+    if kind == "reel" {
+        "reel"
+    } else if kind == "tv" {
+        "tv"
+    } else {
+        "p"
+    }
+}
+
+// ---------------------------------------------------------------------------
+// HTML scraping
+// ---------------------------------------------------------------------------
+
+/// Pulls the author handle from the text that follows the first
+/// `class="CaptionUsername"` attribute in the embed HTML. The text is
+/// trimmed and entity-decoded; `None` when no such element is found.
+fn parse_username(html: &str) -> Option<String> {
+    static ANCHOR: OnceLock<Regex> = OnceLock::new();
+    let pattern = ANCHOR.get_or_init(|| {
+        Regex::new(r#"(?s)class="CaptionUsername"[^>]*>([^<]+)<"#).unwrap()
+    });
+    let handle = pattern.captures(html)?.get(1)?.as_str();
+    Some(html_decode(handle.trim()))
+}
+
+/// Caption sits inside `<div class="Caption">` after the username anchor.
+/// We grab the Caption block (non-greedy, so it ends at the first
+/// `</div>` encountered), drop the `<a class="CaptionUsername">` element
+/// together with its text, replace every remaining tag with a space,
+/// decode entities, and collapse whitespace.
+///
+/// Text inside other child elements is kept — only their markup is
+/// removed. NOTE(review): no boilerplate such as "View ... on Instagram"
+/// is filtered here; confirm the embed markup never nests it inside
+/// this block.
+///
+/// Returns `None` when the block is missing or empty after cleaning.
+fn parse_caption(html: &str) -> Option<String> {
+    static RE_OUTER: OnceLock<Regex> = OnceLock::new();
+    let outer = RE_OUTER
+        .get_or_init(|| Regex::new(r#"(?s)<div\s+class="Caption"[^>]*>(.*?)</div>"#).unwrap());
+    let block = outer.captures(html)?.get(1)?.as_str();
+
+    // Strip everything wrapped in <a class="CaptionUsername">...</a>.
+    static RE_USER: OnceLock<Regex> = OnceLock::new();
+    let user_re = RE_USER
+        .get_or_init(|| Regex::new(r#"(?s)<a[^>]*class="CaptionUsername"[^>]*>.*?</a>"#).unwrap());
+    let stripped = user_re.replace_all(block, "");
+
+    // Then strip anything remaining tagged. Tags become a space so that
+    // words separated only by markup (e.g. `<br>`) do not run together.
+    static RE_TAGS: OnceLock<Regex> = OnceLock::new();
+    let tag_re = RE_TAGS.get_or_init(|| Regex::new(r"<[^>]+>").unwrap());
+    let text = tag_re.replace_all(&stripped, " ");
+
+    let cleaned = collapse_whitespace(&html_decode(text.trim()));
+    if cleaned.is_empty() {
+        None
+    } else {
+        Some(cleaned)
+    }
+}
+
+/// Finds the post thumbnail URL: first the `src` of an `<img>` whose
+/// class contains `EmbeddedMediaImage`, otherwise the `og:image` meta
+/// tag. The URL is entity-decoded; `None` when neither is present.
+fn parse_thumbnail(html: &str) -> Option<String> {
+    static EMBED_IMG: OnceLock<Regex> = OnceLock::new();
+    static OG_IMAGE: OnceLock<Regex> = OnceLock::new();
+
+    // Shared extraction step: first capture group, entity-decoded.
+    let first_group = |re: &Regex| {
+        re.captures(html)
+            .and_then(|caps| caps.get(1))
+            .map(|m| html_decode(m.as_str()))
+    };
+
+    let embed_img = EMBED_IMG.get_or_init(|| {
+        Regex::new(r#"(?s)<img[^>]+class="[^"]*EmbeddedMediaImage[^"]*"[^>]+src="([^"]+)""#)
+            .unwrap()
+    });
+    first_group(embed_img).or_else(|| {
+        let og_image = OG_IMAGE.get_or_init(|| {
+            Regex::new(r#"(?i)<meta[^>]+property="og:image"[^>]+content="([^"]+)""#).unwrap()
+        });
+        first_group(og_image)
+    })
+}
+
+/// Decodes HTML character references in `s` in a single left-to-right
+/// pass.
+///
+/// Handles the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`,
+/// `&apos;`, `&hellip;` plus any decimal (`&#39;`) or hexadecimal
+/// (`&#x2022;`) numeric reference. Unknown or malformed references are
+/// left untouched.
+///
+/// Because each `&` is examined exactly once, already-escaped text is not
+/// decoded twice: `&amp;lt;` becomes the literal `&lt;`, not `<`.
+fn html_decode(s: &str) -> String {
+    // Longest reference body we accept between `&` and `;`; bounds the
+    // look-ahead so stray ampersands stay cheap.
+    const MAX_ENTITY_LEN: usize = 10;
+
+    let mut out = String::with_capacity(s.len());
+    let mut rest = s;
+    while let Some(amp) = rest.find('&') {
+        out.push_str(&rest[..amp]);
+        let tail = &rest[amp + 1..];
+        let decoded = tail
+            .bytes()
+            .take(MAX_ENTITY_LEN + 1)
+            .position(|b| b == b';')
+            .and_then(|end| decode_entity(&tail[..end]).map(|ch| (ch, end + 1)));
+        match decoded {
+            Some((ch, consumed)) => {
+                out.push(ch);
+                rest = &tail[consumed..];
+            }
+            None => {
+                // Not a reference we understand: keep the ampersand.
+                out.push('&');
+                rest = tail;
+            }
+        }
+    }
+    out.push_str(rest);
+    out
+}
+
+/// Resolves the body of a character reference (the text between `&` and
+/// `;`) to a character, or `None` when it is not recognised.
+fn decode_entity(name: &str) -> Option<char> {
+    match name {
+        "amp" => return Some('&'),
+        "lt" => return Some('<'),
+        "gt" => return Some('>'),
+        "quot" => return Some('"'),
+        "apos" => return Some('\''),
+        "hellip" => return Some('…'),
+        _ => {}
+    }
+    let number = name.strip_prefix('#')?;
+    let (digits, radix) = match number.strip_prefix(['x', 'X']) {
+        Some(hex) => (hex, 16),
+        None => (number, 10),
+    };
+    // Reject signs and other junk that `from_str_radix` would tolerate.
+    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
+        return None;
+    }
+    u32::from_str_radix(digits, radix)
+        .ok()
+        .and_then(char::from_u32)
+}
+
+/// Trims `s` and squeezes every run of whitespace down to one space.
+fn collapse_whitespace(s: &str) -> String {
+    let mut out = String::with_capacity(s.len());
+    for word in s.split_whitespace() {
+        if !out.is_empty() {
+            out.push(' ');
+        }
+        out.push_str(word);
+    }
+    out
+}
+
+// Unit tests for the pure URL-parsing and HTML-scraping helpers; nothing
+// here touches the network.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Post, reel, and tv URLs match; profile URLs, the bare root, and
+    // other hosts do not.
+    #[test]
+    fn matches_post_reel_tv_urls() {
+        assert!(matches("https://www.instagram.com/p/DT-RICMjeK5/"));
+        assert!(matches(
+            "https://www.instagram.com/p/DT-RICMjeK5/?img_index=1"
+        ));
+        assert!(matches("https://www.instagram.com/reel/abc123/"));
+        assert!(matches("https://www.instagram.com/tv/abc123/"));
+        assert!(!matches("https://www.instagram.com/ticketswave"));
+        assert!(!matches("https://www.instagram.com/"));
+        assert!(!matches("https://example.com/p/abc/"));
+    }
+
+    // Each path prefix maps to its kind; query strings and a missing
+    // trailing slash are tolerated.
+    #[test]
+    fn parse_shortcode_reads_each_kind() {
+        assert_eq!(
+            parse_shortcode("https://www.instagram.com/p/DT-RICMjeK5/?img_index=1"),
+            Some(("post", "DT-RICMjeK5".into()))
+        );
+        assert_eq!(
+            parse_shortcode("https://www.instagram.com/reel/abc123/"),
+            Some(("reel", "abc123".into()))
+        );
+        assert_eq!(
+            parse_shortcode("https://www.instagram.com/tv/abc123"),
+            Some(("tv", "abc123".into()))
+        );
+    }
+
+    // The username is the text of the CaptionUsername anchor.
+    #[test]
+    fn parse_username_pulls_anchor_text() {
+        let html = r#"<a class="CaptionUsername" href="...">ticketswave</a>"#;
+        assert_eq!(parse_username(html).as_deref(), Some("ticketswave"));
+    }
+
+    // The caption excludes the username anchor and its text.
+    #[test]
+    fn parse_caption_strips_username_anchor() {
+        let html = r#"<div class="Caption"><a class="CaptionUsername" href="...">ticketswave</a> Some caption text here</div>"#;
+        assert_eq!(
+            parse_caption(html).as_deref(),
+            Some("Some caption text here")
+        );
+    }
+}
--- a/crates/webclaw-fetch/src/extractors/instagram_profile.rs
+++ b/crates/webclaw-fetch/src/extractors/instagram_profile.rs
@ -0,0 +1,465 @@
+//! Instagram profile structured extractor.
+//!
+//! Hits Instagram's internal `web_profile_info` endpoint at
+//! `instagram.com/api/v1/users/web_profile_info/?username=X`. The
+//! `x-ig-app-id` header is Instagram's own public web-app id (not a
+//! secret) — the same value Instagram's own JavaScript bundle sends.
+//!
+//! Returns the full profile (bio, exact follower count, verified /
+//! business flags, profile picture) plus the **12 most recent posts**
+//! with shortcodes, like counts, types, thumbnails, and caption
+//! previews. Callers can fan out to `/v1/scrape/instagram_post` per
+//! shortcode to get the full caption + media.
+//!
+//! Pagination beyond 12 requires authenticated cookies + a CSRF token;
+//! we accept that as the practical ceiling for the unauth path. The
+//! cloud (with stored sessions) can paginate later as a follow-up.
+//!
+//! Falls back to OG-tag scraping of the public profile page whenever the
+//! API answers with a non-2xx status other than 404 (typically 401/403) —
+//! Instagram has tightened this endpoint multiple times, so we keep the
+//! second path warm.
+
+use serde::Deserialize;
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+/// Static descriptor for the Instagram profile extractor: its machine
+/// name, human-readable label, one-line description, and the URL shape
+/// it is meant to handle.
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "instagram_profile",
+    label: "Instagram profile",
+    description: "Returns full profile metadata + the 12 most recent posts (shortcode, url, type, likes, thumbnail).",
+    url_patterns: &["https://www.instagram.com/{username}/"],
+};
+
+/// Instagram's own public web-app identifier. Sent by their JS bundle
+/// on every API call, accepted by the unauth endpoint, not a secret.
+/// `extract` passes it as the `x-ig-app-id` request header.
+const IG_APP_ID: &str = "936619743392459";
+
+/// Reports whether `url` looks like an Instagram profile page: the host
+/// is `instagram.com` (with or without `www.`) and the path — ignoring
+/// query, fragment, and empty segments — has exactly one segment that is
+/// not one of the `RESERVED` site routes.
+pub fn matches(url: &str) -> bool {
+    if !matches!(host_of(url), "www.instagram.com" | "instagram.com") {
+        return false;
+    }
+
+    let after_host = url
+        .split("://")
+        .nth(1)
+        .and_then(|rest| rest.split_once('/'))
+        .map_or("", |(_, path)| path);
+    let path_only = after_host.split(['?', '#']).next().unwrap_or("");
+
+    // Empty segments are skipped, which also absorbs trailing slashes.
+    let mut segments = path_only.split('/').filter(|seg| !seg.is_empty());
+    match (segments.next(), segments.next()) {
+        (Some(only), None) => !RESERVED.contains(&only),
+        _ => false,
+    }
+}
+
+/// First-path-segment values that `matches` refuses to treat as a
+/// username (post/reel/tv prefixes and other site routes). The
+/// comparison in `matches` is exact and case-sensitive.
+const RESERVED: &[&str] = &[
+    "p",
+    "reel",
+    "reels",
+    "tv",
+    "explore",
+    "stories",
+    "directory",
+    "accounts",
+    "about",
+    "developer",
+    "press",
+    "api",
+    "ads",
+    "blog",
+    "fragments",
+    "terms",
+    "privacy",
+    "session",
+    "login",
+    "signup",
+];
+
+/// Fetches profile data for the username in `url` from the
+/// `web_profile_info` endpoint and returns it as a JSON object, including
+/// a `recent_posts` array built by `post_summary`.
+///
+/// # Errors
+///
+/// - `FetchError::Build` when no username can be parsed from `url` or the
+///   API answers 404.
+/// - `FetchError::BodyDecode` when a 2xx response body is not the
+///   expected JSON.
+/// - Any error from the client or from `og_fallback`, which is used for
+///   every other non-2xx status.
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let username = parse_username(url).ok_or_else(|| {
+        FetchError::Build(format!(
+            "instagram_profile: cannot parse username from '{url}'"
+        ))
+    })?;
+
+    let api_url =
+        format!("https://www.instagram.com/api/v1/users/web_profile_info/?username={username}");
+    // Per-request headers only; they are not added to the global config.
+    let extra_headers: &[(&str, &str)] = &[
+        ("x-ig-app-id", IG_APP_ID),
+        ("accept", "*/*"),
+        ("sec-fetch-site", "same-origin"),
+        ("x-requested-with", "XMLHttpRequest"),
+    ];
+    let resp = client.fetch_with_headers(&api_url, extra_headers).await?;
+
+    // 404 is reported as an error; it never reaches the fallback below.
+    if resp.status == 404 {
+        return Err(FetchError::Build(format!(
+            "instagram_profile: '{username}' not found"
+        )));
+    }
+    // Auth wall fallback: Instagram occasionally tightens this endpoint
+    // and starts returning 401/403/302 to a login page. When that
+    // happens we still want to give the caller something useful — the
+    // OG tags from the public HTML page (no posts list, but bio etc).
+    // Note the check covers every non-2xx status, not only auth errors.
+    if !(200..300).contains(&resp.status) {
+        return og_fallback(client, &username, url, resp.status).await;
+    }
+
+    let body: ApiResponse = serde_json::from_str(&resp.html)
+        .map_err(|e| FetchError::BodyDecode(format!("instagram_profile parse: {e}")))?;
+    let user = body.data.user;
+
+    // Built before the `json!` below, which moves several `user` fields.
+    let recent_posts: Vec<Value> = user
+        .edge_owner_to_timeline_media
+        .as_ref()
+        .map(|m| m.edges.iter().map(|e| post_summary(&e.node)).collect())
+        .unwrap_or_default();
+
+    Ok(json!({
+        "url":               url,
+        "canonical_url":     format!("https://www.instagram.com/{username}/"),
+        // Prefer the API's spelling of the handle; fall back to the URL's.
+        "username":          user.username.unwrap_or(username),
+        "data_completeness": "api",
+        "user_id":           user.id,
+        "full_name":         user.full_name,
+        "biography":         user.biography,
+        "biography_links":   user.bio_links,
+        "external_url":      user.external_url,
+        "category":          user.category_name,
+        "follower_count":    user.edge_followed_by.map(|c| c.count),
+        "following_count":   user.edge_follow.map(|c| c.count),
+        "post_count":        user.edge_owner_to_timeline_media.as_ref().map(|m| m.count),
+        "is_verified":       user.is_verified,
+        "is_private":        user.is_private,
+        "is_business":       user.is_business_account,
+        "is_professional":   user.is_professional_account,
+        "profile_pic_url":   user.profile_pic_url_hd.or(user.profile_pic_url),
+        "recent_posts":      recent_posts,
+    }))
+}
+
+/// Build the per-post summary the caller fans out from. Includes a
+/// constructed `url` so the loop is `for p in recent_posts: scrape('instagram_post', p.url)`.
+///
+/// Reels link to `/reel/{shortcode}/`, every other kind to
+/// `/p/{shortcode}/`. When the node has no (or an empty) shortcode, `url`
+/// is `null` rather than a dangling `https://www.instagram.com/p//` link
+/// that would only fail further down the fan-out.
+///
+/// `caption` is the text of the first caption edge, if any.
+fn post_summary(n: &MediaNode) -> Value {
+    let kind = classify(n);
+    let path = if kind == "reel" { "reel" } else { "p" };
+    let url = n
+        .shortcode
+        .as_deref()
+        .filter(|code| !code.is_empty())
+        .map(|code| format!("https://www.instagram.com/{path}/{code}/"));
+    let caption = n
+        .edge_media_to_caption
+        .as_ref()
+        .and_then(|c| c.edges.first())
+        .and_then(|e| e.node.text.clone());
+    json!({
+        "shortcode":     n.shortcode,
+        "url":           url,
+        "kind":          kind,
+        "is_video":      n.is_video.unwrap_or(false),
+        "video_views":   n.video_view_count,
+        // Prefer the dedicated thumbnail; fall back to the display image.
+        "thumbnail_url": n.thumbnail_src.clone().or_else(|| n.display_url.clone()),
+        "display_url":   n.display_url,
+        "like_count":    n.edge_media_preview_like.as_ref().map(|c| c.count),
+        "comment_count": n.edge_media_to_comment.as_ref().map(|c| c.count),
+        "taken_at":      n.taken_at_timestamp,
+        "caption":       caption,
+        "alt_text":      n.accessibility_caption,
+        "dimensions":    n.dimensions.as_ref().map(|d| json!({"width": d.width, "height": d.height})),
+        "product_type":  n.product_type,
+    })
+}
+
+/// Best-effort label for a media node. A `product_type` of `clips` wins
+/// and means `reel`; otherwise the GraphQL `__typename` decides between
+/// `carousel`, `video`, and `photo`, with `post` as the catch-all.
+fn classify(n: &MediaNode) -> &'static str {
+    match (n.product_type.as_deref(), n.typename.as_deref()) {
+        (Some("clips"), _) => "reel",
+        (_, Some("GraphSidecar")) => "carousel",
+        (_, Some("GraphVideo")) => "video",
+        (_, Some("GraphImage")) => "photo",
+        _ => "post",
+    }
+}
+
+/// Degraded path used when the API is blocked: fetch the public profile
+/// page and read what the Open Graph tags expose. The result carries
+/// `data_completeness: "og_only"` and a `fallback_reason`, leaves
+/// biography and the verified/business flags null, and has no posts.
+///
+/// # Errors
+///
+/// Returns `FetchError::Build` when the profile page does not answer 200;
+/// fetch errors from the client are propagated unchanged.
+async fn og_fallback(
+    client: &FetchClient,
+    username: &str,
+    original_url: &str,
+    api_status: u16,
+) -> Result<Value, FetchError> {
+    let canonical = format!("https://www.instagram.com/{username}/");
+    let page = client.fetch(&canonical).await?;
+    if page.status != 200 {
+        let html_status = page.status;
+        return Err(FetchError::Build(format!(
+            "instagram_profile: api status {api_status}, html status {html_status} for {username}"
+        )));
+    }
+
+    let og = parse_og_tags(&page.html);
+    let description = og.get("description").map(String::as_str);
+    let (followers, following, posts) = parse_counts_from_og_description(description);
+    // A missing title and an empty title both end up as `None`.
+    let full_name = og.get("title").and_then(|title| parse_full_name(title));
+    let profile_pic = og.get("image").cloned();
+
+    Ok(json!({
+        "url":               original_url,
+        "canonical_url":     canonical,
+        "username":          username,
+        "data_completeness": "og_only",
+        "fallback_reason":   format!("api returned {api_status}"),
+        "full_name":         full_name,
+        "follower_count":    followers,
+        "following_count":   following,
+        "post_count":        posts,
+        "profile_pic_url":   profile_pic,
+        "biography":         null_value(),
+        "is_verified":       null_value(),
+        "is_business":       null_value(),
+        "recent_posts":      Vec::<Value>::new(),
+    }))
+}
+
+/// Returns JSON `null`. Used by `og_fallback` for fields it cannot
+/// populate from OG tags.
+fn null_value() -> Value {
+    Value::Null
+}
+
+// ---------------------------------------------------------------------------
+// URL parsing
+// ---------------------------------------------------------------------------
+
+/// Returns the authority (host) portion of `url`.
+///
+/// Everything after the first `://` is taken (or the whole input when no
+/// scheme is present) and cut at the first `/`, `?`, or `#`, so a URL
+/// with a query or fragment but no path — e.g.
+/// `https://instagram.com?hl=en` — still yields the bare host.
+fn host_of(url: &str) -> &str {
+    url.split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split(['/', '?', '#'])
+        .next()
+        .unwrap_or("")
+}
+
+/// Extracts the profile handle — the first non-empty path segment — from
+/// an Instagram URL, ignoring any query string or fragment.
+///
+/// The handle is interpolated into the API query string by `extract`, so
+/// it is accepted only when it uses the characters Instagram allows in
+/// usernames (ASCII letters, digits, `.` and `_`). Anything else, such as
+/// `foo&count=1` or percent-escapes, yields `None` instead of smuggling
+/// extra parameters into the request.
+fn parse_username(url: &str) -> Option<String> {
+    let (_, path) = url.split("://").nth(1)?.split_once('/')?;
+    let first = path
+        .split(['?', '#'])
+        .next()?
+        .split('/')
+        .find(|s| !s.is_empty())?;
+    let valid = first
+        .bytes()
+        .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'.' | b'_'));
+    valid.then(|| first.to_string())
+}
+
+// ---------------------------------------------------------------------------
+// OG-fallback helpers (kept self-contained — same shape as the previous
+// version we shipped, retained as the safety net)
+// ---------------------------------------------------------------------------
+
+/// Collects `<meta property="og:…" content="…">` pairs from `html` into
+/// a map keyed by the lower-cased name after `og:`. Values are
+/// entity-decoded; when a key repeats, the first occurrence wins.
+fn parse_og_tags(html: &str) -> std::collections::HashMap<String, String> {
+    use regex::Regex;
+    use std::collections::HashMap;
+    use std::sync::OnceLock;
+
+    static OG_META: OnceLock<Regex> = OnceLock::new();
+    let og_meta = OG_META.get_or_init(|| {
+        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
+    });
+
+    let mut tags = HashMap::new();
+    for caps in og_meta.captures_iter(html) {
+        let key = caps
+            .get(1)
+            .map_or_else(String::new, |m| m.as_str().to_lowercase());
+        let value = caps
+            .get(2)
+            .map_or_else(String::new, |m| html_decode(m.as_str()));
+        tags.entry(key).or_insert(value);
+    }
+    tags
+}
+
+/// Derives the display name from an `og:title` value by entity-decoding
+/// it and keeping the trimmed text before the first `(`. Returns `None`
+/// for an empty title or when nothing precedes the parenthesis.
+fn parse_full_name(og_title: &str) -> Option<String> {
+    if og_title.is_empty() {
+        return None;
+    }
+    let decoded = html_decode(og_title);
+    let before_handle = match decoded.split_once('(') {
+        Some((head, _)) => head,
+        None => decoded.as_str(),
+    };
+    let name = before_handle.trim();
+    (!name.is_empty()).then(|| name.to_string())
+}
+
+/// Reads `(followers, following, posts)` out of an `og:description` of
+/// the form `"<n> Followers, <n> Following, <n> Posts"`. Each number goes
+/// through `parse_compact_number`; all three are `None` when the
+/// description is absent or does not match.
+fn parse_counts_from_og_description(desc: Option<&str>) -> (Option<i64>, Option<i64>, Option<i64>) {
+    use regex::Regex;
+    use std::sync::OnceLock;
+    static COUNTS: OnceLock<Regex> = OnceLock::new();
+
+    let Some(raw) = desc else {
+        return (None, None, None);
+    };
+    let text = html_decode(raw);
+    let counts = COUNTS.get_or_init(|| {
+        Regex::new(r"(?i)([\d.,]+[KMB]?)\s*Followers,\s*([\d.,]+[KMB]?)\s*Following,\s*([\d.,]+[KMB]?)\s*Posts").unwrap()
+    });
+
+    match counts.captures(&text) {
+        Some(caps) => {
+            let number = |idx: usize| {
+                caps.get(idx)
+                    .and_then(|m| parse_compact_number(m.as_str()))
+            };
+            (number(1), number(2), number(3))
+        }
+        None => (None, None, None),
+    }
+}
+
+/// Parses a human-formatted count such as `641`, `1,234`, `18K`, or
+/// `1.5M` into an integer.
+///
+/// - A trailing `K`, `M`, or `B` (either case — the caller's regex is
+///   case-insensitive) scales by a thousand, million, or billion.
+/// - Commas are treated as thousands separators and dropped.
+/// - The scaled value is rounded to the nearest integer so that binary
+///   floating-point error cannot shave off a unit (`2.01K` is 2010, not
+///   2009).
+///
+/// Returns `None` for anything that is not a finite, non-negative number.
+fn parse_compact_number(s: &str) -> Option<i64> {
+    let s = s.trim();
+    // The suffix is a single ASCII byte, so slicing one byte off is safe.
+    let (digits, multiplier) = match s.chars().last() {
+        Some('K' | 'k') => (&s[..s.len() - 1], 1_000.0_f64),
+        Some('M' | 'm') => (&s[..s.len() - 1], 1_000_000.0),
+        Some('B' | 'b') => (&s[..s.len() - 1], 1_000_000_000.0),
+        _ => (s, 1.0),
+    };
+    let cleaned: String = digits.chars().filter(|c| *c != ',').collect();
+    let value = cleaned.parse::<f64>().ok()? * multiplier;
+    (value.is_finite() && value >= 0.0).then(|| value.round() as i64)
+}
+
+/// Decodes HTML character references in `s` in a single left-to-right
+/// pass.
+///
+/// Handles the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`,
+/// `&apos;`, `&hellip;` plus any decimal (`&#39;`) or hexadecimal
+/// (`&#x2022;`) numeric reference. Unknown or malformed references are
+/// left untouched.
+///
+/// Because each `&` is examined exactly once, already-escaped text is not
+/// decoded twice: `&amp;lt;` becomes the literal `&lt;`, not `<`.
+fn html_decode(s: &str) -> String {
+    // Longest reference body we accept between `&` and `;`; bounds the
+    // look-ahead so stray ampersands stay cheap.
+    const MAX_ENTITY_LEN: usize = 10;
+
+    let mut out = String::with_capacity(s.len());
+    let mut rest = s;
+    while let Some(amp) = rest.find('&') {
+        out.push_str(&rest[..amp]);
+        let tail = &rest[amp + 1..];
+        let decoded = tail
+            .bytes()
+            .take(MAX_ENTITY_LEN + 1)
+            .position(|b| b == b';')
+            .and_then(|end| decode_entity(&tail[..end]).map(|ch| (ch, end + 1)));
+        match decoded {
+            Some((ch, consumed)) => {
+                out.push(ch);
+                rest = &tail[consumed..];
+            }
+            None => {
+                // Not a reference we understand: keep the ampersand.
+                out.push('&');
+                rest = tail;
+            }
+        }
+    }
+    out.push_str(rest);
+    out
+}
+
+/// Resolves the body of a character reference (the text between `&` and
+/// `;`) to a character, or `None` when it is not recognised.
+fn decode_entity(name: &str) -> Option<char> {
+    match name {
+        "amp" => return Some('&'),
+        "lt" => return Some('<'),
+        "gt" => return Some('>'),
+        "quot" => return Some('"'),
+        "apos" => return Some('\''),
+        "hellip" => return Some('…'),
+        _ => {}
+    }
+    let number = name.strip_prefix('#')?;
+    let (digits, radix) = match number.strip_prefix(['x', 'X']) {
+        Some(hex) => (hex, 16),
+        None => (number, 10),
+    };
+    // Reject signs and other junk that `from_str_radix` would tolerate.
+    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
+        return None;
+    }
+    u32::from_str_radix(digits, radix)
+        .ok()
+        .and_then(char::from_u32)
+}
+
+// ---------------------------------------------------------------------------
+// Instagram web_profile_info API types
+// ---------------------------------------------------------------------------
+
+/// Top-level envelope of the `web_profile_info` response; `data` is
+/// required.
+#[derive(Deserialize)]
+struct ApiResponse {
+    data: ApiData,
+}
+
+/// The `data` object; `user` is required.
+#[derive(Deserialize)]
+struct ApiData {
+    user: User,
+}
+
+/// Profile fields read by `extract`. Every field is optional, so a
+/// missing key deserializes as `None` instead of failing the parse.
+#[derive(Deserialize)]
+struct User {
+    id: Option<String>,
+    username: Option<String>,
+    full_name: Option<String>,
+    biography: Option<String>,
+    // Kept as raw JSON and passed through to the output unchanged.
+    bio_links: Option<Vec<serde_json::Value>>,
+    external_url: Option<String>,
+    category_name: Option<String>,
+    profile_pic_url: Option<String>,
+    // Preferred over `profile_pic_url` when present.
+    profile_pic_url_hd: Option<String>,
+    is_verified: Option<bool>,
+    is_private: Option<bool>,
+    is_business_account: Option<bool>,
+    is_professional_account: Option<bool>,
+    edge_followed_by: Option<EdgeCount>,
+    edge_follow: Option<EdgeCount>,
+    edge_owner_to_timeline_media: Option<MediaEdges>,
+}
+
+/// An edge object of which only the required `count` is read.
+#[derive(Deserialize)]
+struct EdgeCount {
+    count: i64,
+}
+
+/// Timeline media: a total `count` plus the list of post edges included
+/// in the response. Both fields are required when the object is present.
+#[derive(Deserialize)]
+struct MediaEdges {
+    count: i64,
+    edges: Vec<MediaEdge>,
+}
+
+/// Wrapper around a single media `node`.
+#[derive(Deserialize)]
+struct MediaEdge {
+    node: MediaNode,
+}
+
+/// One post as listed on the profile; consumed by `post_summary` and
+/// `classify`. Every field is optional.
+#[derive(Deserialize)]
+struct MediaNode {
+    // Serialized as `__typename`; `classify` compares it against
+    // `GraphSidecar`, `GraphVideo`, and `GraphImage`.
+    #[serde(rename = "__typename")]
+    typename: Option<String>,
+    shortcode: Option<String>,
+    is_video: Option<bool>,
+    video_view_count: Option<i64>,
+    display_url: Option<String>,
+    thumbnail_src: Option<String>,
+    accessibility_caption: Option<String>,
+    taken_at_timestamp: Option<i64>,
+    // `classify` treats the value `clips` as a reel.
+    product_type: Option<String>,
+    dimensions: Option<Dimensions>,
+    edge_media_preview_like: Option<EdgeCount>,
+    edge_media_to_comment: Option<EdgeCount>,
+    edge_media_to_caption: Option<CaptionEdges>,
+}
+
+/// Media dimensions; both fields are required when the object is present.
+#[derive(Deserialize)]
+struct Dimensions {
+    width: i64,
+    height: i64,
+}
+
+/// Caption edge list; `post_summary` reads only the first entry.
+#[derive(Deserialize)]
+struct CaptionEdges {
+    edges: Vec<CaptionEdge>,
+}
+
+/// Wrapper around a single caption `node`.
+#[derive(Deserialize)]
+struct CaptionEdge {
+    node: CaptionNode,
+}
+
+/// Caption payload; `text` may be absent.
+#[derive(Deserialize)]
+struct CaptionNode {
+    text: Option<String>,
+}
+
+// Unit tests for the pure URL-matching and OG-fallback helpers; nothing
+// here touches the network.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Single-segment paths on an Instagram host match; post URLs,
+    // reserved routes, the bare root, and other hosts do not.
+    #[test]
+    fn matches_profile_urls() {
+        assert!(matches("https://www.instagram.com/ticketswave"));
+        assert!(matches("https://www.instagram.com/ticketswave/"));
+        assert!(matches("https://instagram.com/0xmassi/?hl=en"));
+        assert!(!matches("https://www.instagram.com/p/DT-RICMjeK5/"));
+        assert!(!matches("https://www.instagram.com/explore"));
+        assert!(!matches("https://www.instagram.com/"));
+        assert!(!matches("https://example.com/foo"));
+    }
+
+    // The display name is whatever precedes the "(@handle)" part.
+    #[test]
+    fn parse_full_name_strips_handle() {
+        assert_eq!(
+            parse_full_name("Ticket Wave (&#064;ticketswave) &#x2022; Instagram photos and videos"),
+            Some("Ticket Wave".into())
+        );
+    }
+
+    // K/M suffixes scale the value; commas are thousands separators.
+    #[test]
+    fn compact_number_handles_kmb() {
+        assert_eq!(parse_compact_number("18K"), Some(18_000));
+        assert_eq!(parse_compact_number("1.5M"), Some(1_500_000));
+        assert_eq!(parse_compact_number("1,234"), Some(1_234));
+        assert_eq!(parse_compact_number("641"), Some(641));
+    }
+}
--- a/crates/webclaw-fetch/src/extractors/linkedin_post.rs
+++ b/crates/webclaw-fetch/src/extractors/linkedin_post.rs
@ -0,0 +1,266 @@
+//! LinkedIn post structured extractor.
+//!
+//! Uses the public embed endpoint `/embed/feed/update/{urn}` which
+//! LinkedIn provides for sites that want to render a post inline. No
+//! auth required, returns SSR HTML with the full post body, OG tags,
+//! image, and a link back to the original post.
+//!
+//! Accepts both URN forms (`urn:li:share:N` and `urn:li:activity:N`)
+//! and pretty post URLs (`/posts/{user}_{slug}-{id}-{suffix}`) by
+//! pulling the trailing numeric id and converting to an activity URN.
+
+use regex::Regex;
+use serde_json::{Value, json};
+use std::sync::OnceLock;
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+/// Catalog entry for this extractor: its machine name (`linkedin_post`), a
+/// human-readable label and description, and the URL shapes it advertises
+/// (share-URN, activity-URN and pretty `/posts/` URLs).
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "linkedin_post",
+    label: "LinkedIn post",
+    description: "Returns post body, author name, image, and original URL via LinkedIn's public embed endpoint.",
+    url_patterns: &[
+        "https://www.linkedin.com/feed/update/urn:li:share:{id}",
+        "https://www.linkedin.com/feed/update/urn:li:activity:{id}",
+        "https://www.linkedin.com/posts/{user}_{slug}-{id}-{suffix}",
+    ],
+};
+
+/// Returns `true` when `url` sits on a LinkedIn host (`www.linkedin.com` or
+/// `linkedin.com`) and contains either the `/feed/update/urn:li:` marker or
+/// a `/posts/` path component.
+pub fn matches(url: &str) -> bool {
+    let on_linkedin = matches!(host_of(url), "www.linkedin.com" | "linkedin.com");
+    let looks_like_post = url.contains("/feed/update/urn:li:") || url.contains("/posts/");
+    on_linkedin && looks_like_post
+}
+
+/// Fetch the public embed page for the post behind `url` and return its
+/// fields as a JSON object.
+///
+/// The URN is derived from `url`, the page at
+/// `https://www.linkedin.com/embed/feed/update/{urn}` is fetched, and the
+/// OG tags, post body and author are scraped from the returned HTML.
+///
+/// # Errors
+///
+/// Returns `FetchError::Build` when no URN can be derived from `url` or when
+/// the embed endpoint answers with a status other than 200. Errors from
+/// `client.fetch` are propagated unchanged.
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let Some(urn) = extract_urn(url) else {
+        return Err(FetchError::Build(format!(
+            "linkedin_post: cannot extract URN from '{url}' (expected /feed/update/urn:li:... or /posts/{{slug}}-{{id}})"
+        )));
+    };
+
+    let embed_url = format!("https://www.linkedin.com/embed/feed/update/{urn}");
+    let response = client.fetch(&embed_url).await?;
+    if response.status != 200 {
+        return Err(FetchError::Build(format!(
+            "linkedin embed returned status {} for {urn}",
+            response.status
+        )));
+    }
+
+    let page = &response.html;
+    let author = parse_author(page);
+    let body = parse_post_body(page);
+    let og = parse_og_tags(page);
+
+    // Prefer the page's own og:url; otherwise fall back to the embed URL.
+    let canonical_url = match og.get("url") {
+        Some(found) => found.clone(),
+        None => embed_url.clone(),
+    };
+    let site_name = og
+        .get("site_name")
+        .map_or_else(|| String::from("LinkedIn"), String::clone);
+
+    Ok(json!({
+        "url":               url,
+        "embed_url":         embed_url,
+        "urn":               urn,
+        "canonical_url":     canonical_url,
+        "data_completeness": "embed",
+        "title":             og.get("title").cloned(),
+        "body":              body,
+        "author_name":       author,
+        "image_url":         og.get("image").cloned(),
+        "site_name":         site_name,
+    }))
+}
+
+// ---------------------------------------------------------------------------
+// URN extraction
+// ---------------------------------------------------------------------------
+
+/// Pull a LinkedIn URN of the form `urn:li:{type}:{id}` (for example
+/// `urn:li:share:N` or `urn:li:activity:N`) out of a post URL.
+///
+/// Two URL shapes are understood:
+///
+/// * URLs that embed the URN literally. The URN runs up to the next `/`,
+///   `?`, `#` or `&` and must be exactly four `:`-separated parts: `urn`,
+///   `li`, a non-empty alphanumeric type and a non-empty run of ASCII digits.
+/// * Pretty `/posts/{user}_{slug}-{id}-{suffix}` URLs, where the numeric id
+///   is the second-to-last `-`-separated chunk. It is converted to
+///   `urn:li:activity:{id}`.
+///
+/// Returns `None` when neither shape yields a well-formed URN.
+fn extract_urn(url: &str) -> Option<String> {
+    if let Some(start) = url.find("urn:li:") {
+        let tail = &url[start..];
+        // `&` is included so a URN carried in a query parameter is cut cleanly.
+        let end = tail.find(['/', '?', '#', '&']).unwrap_or(tail.len());
+        let urn = &tail[..end];
+
+        // Validate shape: exactly urn:li:{type}:{digits}. The emptiness
+        // checks matter because `all()` is vacuously true on an empty string.
+        let mut parts = urn.split(':');
+        let well_formed = parts.next() == Some("urn")
+            && parts.next() == Some("li")
+            && parts
+                .next()
+                .is_some_and(|ty| !ty.is_empty() && ty.bytes().all(|b| b.is_ascii_alphanumeric()))
+            && parts
+                .next()
+                .is_some_and(|id| !id.is_empty() && id.bytes().all(|b| b.is_ascii_digit()))
+            && parts.next().is_none();
+        if well_formed {
+            return Some(urn.to_string());
+        }
+    }
+
+    // /posts/{user}_{slug}-{19-digit-id}-{4-char-hash}/ — the id is the
+    // second-to-last `-`-separated chunk of the final path segment.
+    if url.contains("/posts/") {
+        static RE: OnceLock<Regex> = OnceLock::new();
+        let re =
+            RE.get_or_init(|| Regex::new(r"/posts/[^/]*?-(\d{15,})-[A-Za-z0-9]{2,}/?").unwrap());
+        if let Some(c) = re.captures(url)
+            && let Some(id) = c.get(1)
+        {
+            return Some(format!("urn:li:activity:{}", id.as_str()));
+        }
+    }
+    None
+}
+
+// ---------------------------------------------------------------------------
+// HTML scraping
+// ---------------------------------------------------------------------------
+
+/// Collect Open Graph metadata from `<meta property="og:..." content="...">`
+/// tags.
+///
+/// Keys come back lowercased with the `og:` prefix removed; values are run
+/// through `html_decode`. When a property occurs more than once the first
+/// occurrence wins. Only tags that place `property` before `content` and
+/// use double-quoted attribute values are recognised.
+fn parse_og_tags(html: &str) -> std::collections::HashMap<String, String> {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let pattern = RE.get_or_init(|| {
+        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
+    });
+    pattern.captures_iter(html).fold(
+        std::collections::HashMap::new(),
+        |mut tags, caps| {
+            let key = caps
+                .get(1)
+                .map_or_else(String::new, |m| m.as_str().to_lowercase());
+            let value = caps
+                .get(2)
+                .map_or_else(String::new, |m| html_decode(m.as_str()));
+            // Keep the earliest value seen for each key.
+            tags.entry(key).or_insert(value);
+            tags
+        },
+    )
+}
+
+/// Return the text of the first `<p>` whose `class` attribute contains
+/// `attributed-text-segment-list__content`, which is where the embed page
+/// renders the post body.
+///
+/// Nested markup (such as `<a>` tags around links) is removed via
+/// `strip_tags` and the result is trimmed. Returns `None` when no such
+/// paragraph is present.
+fn parse_post_body(html: &str) -> Option<String> {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let body_re = RE.get_or_init(|| {
+        Regex::new(
+            r#"(?s)<p[^>]+class="[^"]*attributed-text-segment-list__content[^"]*"[^>]*>(.*?)</p>"#,
+        )
+        .unwrap()
+    });
+    let caps = body_re.captures(html)?;
+    let raw = caps.get(1)?.as_str();
+    let text = strip_tags(raw);
+    Some(text.trim().to_string())
+}
+
+/// Read the author display name from the page `<title>`, which looks like:
+///   "55 founding members are in… | Orc Dev"
+/// The chunk after the final `|` is trimmed, entity-decoded and returned.
+///
+/// Returns `None` when there is no `<title>` element or the title contains
+/// no `|`; no other source is consulted.
+fn parse_author(html: &str) -> Option<String> {
+    static RE_TITLE: OnceLock<Regex> = OnceLock::new();
+    let title_re = RE_TITLE.get_or_init(|| Regex::new(r"<title>([^<]+)</title>").unwrap());
+    let caps = title_re.captures(html)?;
+    let title = caps.get(1)?.as_str();
+    let (_, name) = title.rsplit_once('|')?;
+    Some(html_decode(name.trim()))
+}
+
+/// Decode the HTML character references that show up in scraped text
+/// (OG `content` attributes, `<title>`, post bodies).
+///
+/// Handles the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`,
+/// `&nbsp;` and `&hellip;`, plus any decimal (`&#064;`) or hexadecimal
+/// (`&#x2022;`) numeric reference. Unknown or malformed references are
+/// copied through unchanged.
+///
+/// Decoding is a single left-to-right pass, so already-escaped text is
+/// unescaped exactly once: `&amp;lt;` becomes `&lt;`, not `<`.
+fn html_decode(s: &str) -> String {
+    let mut out = String::with_capacity(s.len());
+    let mut rest = s;
+    while let Some(amp) = rest.find('&') {
+        out.push_str(&rest[..amp]);
+        let tail = &rest[amp..];
+        match decode_entity(tail) {
+            Some((ch, consumed)) => {
+                out.push(ch);
+                rest = &tail[consumed..];
+            }
+            None => {
+                // Not a reference we understand: keep the literal `&`.
+                out.push('&');
+                rest = &tail[1..];
+            }
+        }
+    }
+    out.push_str(rest);
+    out
+}
+
+/// Decode one character reference at the start of `tail`, which must begin
+/// with `&`. Returns the decoded character and the number of bytes consumed
+/// (including the leading `&` and trailing `;`).
+fn decode_entity(tail: &str) -> Option<(char, usize)> {
+    // Bounding the search keeps a stray `&` from scanning the whole input;
+    // the longest reference handled here (`&#x10FFFF;`) is 10 bytes.
+    const MAX_ENTITY_LEN: usize = 12;
+    let semi = tail.bytes().take(MAX_ENTITY_LEN).position(|b| b == b';')?;
+    let name = &tail[1..semi];
+    let ch = match name {
+        "amp" => '&',
+        "lt" => '<',
+        "gt" => '>',
+        "quot" => '"',
+        "apos" => '\'',
+        "nbsp" => '\u{a0}',
+        "hellip" => '…',
+        _ => {
+            let digits = name.strip_prefix('#')?;
+            let (radix, digits) = match digits.strip_prefix(['x', 'X']) {
+                Some(hex) => (16, hex),
+                None => (10, digits),
+            };
+            // Reject signs and empty strings, which `from_str_radix` alone
+            // would not.
+            if digits.is_empty() || !digits.bytes().all(|b| char::from(b).is_digit(radix)) {
+                return None;
+            }
+            let code = u32::from_str_radix(digits, radix).ok()?;
+            char::from_u32(code).filter(|c| *c != '\0')?
+        }
+    };
+    Some((ch, semi + 1))
+}
+
+/// Crude HTML tag stripper for the post body. Every `<...>` tag is removed
+/// while the text between tags is kept, so the visible text of nested
+/// anchors survives; the remainder is then entity-decoded with
+/// `html_decode`. Whitespace is left exactly as it appears in the input.
+fn strip_tags(html: &str) -> String {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let tag_re = RE.get_or_init(|| Regex::new(r"<[^>]+>").unwrap());
+    let text_only = tag_re.replace_all(html, "");
+    html_decode(&text_only)
+}
+
+/// Return the host component of `url` as a slice of the input.
+///
+/// Everything up to and including the first `://` is skipped (or nothing,
+/// when there is no scheme). The authority ends at the first `/`, `?` or
+/// `#`; any `user:pass@` prefix and a trailing `:port` are dropped. No case
+/// normalisation is performed. Returns `""` for an empty authority.
+fn host_of(url: &str) -> &str {
+    let after_scheme = url.split("://").nth(1).unwrap_or(url);
+    // A query or fragment can follow the host directly, without a path.
+    let end = after_scheme
+        .find(['/', '?', '#'])
+        .unwrap_or(after_scheme.len());
+    let authority = &after_scheme[..end];
+    let host_port = authority
+        .rsplit_once('@')
+        .map_or(authority, |(_, host)| host);
+    // Only strip a suffix that is purely digits, so a bracketed IPv6
+    // literal such as `[::1]` is left intact.
+    match host_port.rsplit_once(':') {
+        Some((host, port)) if port.bytes().all(|b| b.is_ascii_digit()) => host,
+        _ => host_port,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Share-URN, activity-URN and pretty `/posts/` URLs on LinkedIn match;
+    /// profile URLs, the site root and foreign hosts do not.
+    #[test]
+    fn matches_li_post_urls() {
+        let accepted = [
+            "https://www.linkedin.com/feed/update/urn:li:share:7452618582213144577/",
+            "https://www.linkedin.com/feed/update/urn:li:activity:7452618583290892288",
+            "https://www.linkedin.com/posts/somebody_some-slug-7452618583290892288-aB1c",
+        ];
+        for candidate in accepted {
+            assert!(matches(candidate));
+        }
+
+        let rejected = [
+            "https://www.linkedin.com/in/foo",
+            "https://www.linkedin.com/",
+            "https://example.com/feed/update/urn:li:share:1",
+        ];
+        for candidate in rejected {
+            assert!(!matches(candidate));
+        }
+    }
+
+    /// A literal share URN is returned as-is, without the trailing slash.
+    #[test]
+    fn extract_urn_from_share_url() {
+        let urn =
+            extract_urn("https://www.linkedin.com/feed/update/urn:li:share:7452618582213144577/");
+        assert_eq!(urn.as_deref(), Some("urn:li:share:7452618582213144577"));
+    }
+
+    /// The numeric id of a pretty post URL becomes an activity URN.
+    #[test]
+    fn extract_urn_from_pretty_post_url() {
+        let urn = extract_urn(
+            "https://www.linkedin.com/posts/somebody_some-slug-7452618583290892288-aB1c/",
+        );
+        assert_eq!(urn.as_deref(), Some("urn:li:activity:7452618583290892288"));
+    }
+
+    /// Each `og:*` meta tag is exposed under its prefix-less key.
+    #[test]
+    fn parse_og_tags_basic() {
+        let html = r#"<meta property="og:image" content="https://x.com/a.png">
+<meta property="og:url" content="https://example.com/x">"#;
+        let og = parse_og_tags(html);
+        let expected = [
+            ("image", "https://x.com/a.png"),
+            ("url", "https://example.com/x"),
+        ];
+        for (key, value) in expected {
+            assert_eq!(og.get(key).map(String::as_str), Some(value));
+        }
+    }
+
+    /// Anchor markup is removed while the link text stays in place.
+    #[test]
+    fn parse_post_body_strips_anchor_tags() {
+        let html = r#"<p class="attributed-text-segment-list__content text-color-text" dir="ltr">Hello <a href="x">link</a> world</p>"#;
+        let body = parse_post_body(html);
+        assert_eq!(body.as_deref(), Some("Hello link world"));
+    }
+
+    /// Named and numeric entities are both decoded.
+    #[test]
+    fn html_decode_handles_common_entities() {
+        let decoded = html_decode("AT&amp;T &#064;jane");
+        assert_eq!(decoded, "AT&T @jane");
+    }
+}
--- a/crates/webclaw-fetch/src/extractors/mod.rs
+++ b/crates/webclaw-fetch/src/extractors/mod.rs
@ -24,6 +24,9 @@ pub mod github_repo;
 pub mod hackernews;
 pub mod huggingface_dataset;
 pub mod huggingface_model;
+pub mod instagram_post;
+pub mod instagram_profile;
+pub mod linkedin_post;
 pub mod npm;
 pub mod pypi;
 pub mod reddit;
@ -67,6 +70,9 @@ pub fn list() -> Vec<ExtractorInfo> {
        docker_hub::INFO,
        dev_to::INFO,
        stackoverflow::INFO,
+        linkedin_post::INFO,
+        instagram_post::INFO,
+        instagram_profile::INFO,
    ]
 }

@ -171,6 +177,27 @@ pub async fn dispatch_by_url(
                .map(|v| (stackoverflow::INFO.name, v)),
        );
    }
+    if linkedin_post::matches(url) {
+        return Some(
+            linkedin_post::extract(client, url)
+                .await
+                .map(|v| (linkedin_post::INFO.name, v)),
+        );
+    }
+    if instagram_post::matches(url) {
+        return Some(
+            instagram_post::extract(client, url)
+                .await
+                .map(|v| (instagram_post::INFO.name, v)),
+        );
+    }
+    if instagram_profile::matches(url) {
+        return Some(
+            instagram_profile::extract(client, url)
+                .await
+                .map(|v| (instagram_profile::INFO.name, v)),
+        );
+    }
    None
 }

@ -259,6 +286,24 @@ pub async fn dispatch_by_name(
            })
            .await
        }
+        n if n == linkedin_post::INFO.name => {
+            run_or_mismatch(linkedin_post::matches(url), n, url, || {
+                linkedin_post::extract(client, url)
+            })
+            .await
+        }
+        n if n == instagram_post::INFO.name => {
+            run_or_mismatch(instagram_post::matches(url), n, url, || {
+                instagram_post::extract(client, url)
+            })
+            .await
+        }
+        n if n == instagram_profile::INFO.name => {
+            run_or_mismatch(instagram_profile::matches(url), n, url, || {
+                instagram_profile::extract(client, url)
+            })
+            .await
+        }
        _ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())),
    }
 }