mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
feat(extractors): add LinkedIn + Instagram with profile-to-posts fan-out
3 social-network extractors that work entirely without auth, using
public embed/preview endpoints + Instagram's own SEO-facing API:
- linkedin_post: /embed/feed/update/{urn} returns full body,
author, image, OG tags. Accepts both the urn:li:share
and urn:li:activity URN forms plus the pretty
/posts/{slug}-{id}-{suffix} URLs.
- instagram_post: /p/{shortcode}/embed/captioned/ returns the full
caption, username, thumbnail. Same endpoint serves
reels and IGTV, kind correctly classified.
- instagram_profile: /api/v1/users/web_profile_info/?username=X with the
x-ig-app-id header (Instagram's public web-app id,
sent by their own JS bundle). Returns the full
profile + the 12 most recent posts with shortcodes,
kinds, like/comment counts, thumbnails, and caption
previews. Falls back to OG-tag scraping of the
public HTML if the API ever 401/403s.
The IG profile output is shaped so callers can fan out cleanly:
    for p in profile.recent_posts:
        scrape('instagram_post', p.url)
giving you 'whole profile + every recent post' in one loop. End-to-end
tested against ticketswave: 1 profile call + 12 post calls in ~3.5s.
Pagination beyond 12 posts requires authenticated cookies and is left
for the cloud where we can stash a session.
Infrastructure change: added FetchClient::fetch_with_headers so
extractors can satisfy site-specific request headers (here x-ig-app-id;
later github_pr will use this for Authorization, etc.) without polluting
the global FetchConfig.headers map. Same retry semantics as fetch().
Catalog now exposes 17 extractors via /v1/extractors. Total unit tests
across the module: 47 passing. Clippy clean. Fmt clean.
Live test on the maintainer's example URLs:
- LinkedIn post (urn:li:share:7452618582213144577): 'Orc Dev' / full body
/ shipper.club link / CDN image extracted in 250ms.
- Instagram post (DT-RICMjeK5): 835-char Slovak caption, ticketswave
username, thumbnail. 200ms.
- Instagram profile (ticketswave): 18,473 followers (exact, not
rounded), is_verified=True, is_business=True, biography with emojis,
12 recent posts with shortcodes + kinds + likes. 400ms.
Out of scope for this wave (require infra we don't have):
- linkedin_profile: returns 999 to all bot UAs, needs OAuth
- facebook_post / facebook_page: content is JS-loaded, needs cloud Chrome
- facebook_profile (personal): not publicly accessible by design
This commit is contained in:
parent
b041f3cddd
commit
3bb0a4bca0
7 changed files with 1085 additions and 1 deletions
|
|
@ -279,14 +279,85 @@ impl FetchClient {
|
|||
|
||||
/// Single fetch attempt.
|
||||
async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||
self.fetch_once_with_headers(url, &[]).await
|
||||
}
|
||||
|
||||
/// Single fetch attempt with optional per-request headers appended
|
||||
/// after the profile defaults. Used by extractors that need to
|
||||
/// satisfy site-specific headers (e.g. `x-ig-app-id` for Instagram's
|
||||
/// internal API).
|
||||
async fn fetch_once_with_headers(
|
||||
&self,
|
||||
url: &str,
|
||||
extra: &[(&str, &str)],
|
||||
) -> Result<FetchResult, FetchError> {
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
|
||||
let resp = client.get(url).send().await?;
|
||||
let mut req = client.get(url);
|
||||
for (k, v) in extra {
|
||||
req = req.header(*k, *v);
|
||||
}
|
||||
let resp = req.send().await?;
|
||||
let response = Response::from_wreq(resp).await?;
|
||||
response_to_result(response, start)
|
||||
}
|
||||
|
||||
/// Fetch a URL with extra per-request headers appended after the
|
||||
/// browser-profile defaults. Same retry semantics as `fetch`.
|
||||
///
|
||||
/// Use this when an upstream API requires a header the global
|
||||
/// `FetchConfig.headers` shouldn't carry to other hosts (Instagram's
|
||||
/// `x-ig-app-id`, GitHub's `Authorization` once we wire `GITHUB_TOKEN`,
|
||||
/// Reddit's compliant UA when we add OAuth, etc.).
|
||||
#[instrument(skip(self, extra), fields(url = %url, extra_count = extra.len()))]
|
||||
pub async fn fetch_with_headers(
|
||||
&self,
|
||||
url: &str,
|
||||
extra: &[(&str, &str)],
|
||||
) -> Result<FetchResult, FetchError> {
|
||||
let delays = [Duration::ZERO, Duration::from_secs(1)];
|
||||
let mut last_err = None;
|
||||
|
||||
for (attempt, delay) in delays.iter().enumerate() {
|
||||
if attempt > 0 {
|
||||
tokio::time::sleep(*delay).await;
|
||||
}
|
||||
match self.fetch_once_with_headers(url, extra).await {
|
||||
Ok(result) => {
|
||||
if is_retryable_status(result.status) && attempt < delays.len() - 1 {
|
||||
warn!(
|
||||
url,
|
||||
status = result.status,
|
||||
attempt = attempt + 1,
|
||||
"retryable status, will retry"
|
||||
);
|
||||
last_err = Some(FetchError::Build(format!("HTTP {}", result.status)));
|
||||
continue;
|
||||
}
|
||||
if attempt > 0 {
|
||||
debug!(url, attempt = attempt + 1, "retry succeeded");
|
||||
}
|
||||
return Ok(result);
|
||||
}
|
||||
Err(e) => {
|
||||
if !is_retryable_error(&e) || attempt == delays.len() - 1 {
|
||||
return Err(e);
|
||||
}
|
||||
warn!(
|
||||
url,
|
||||
error = %e,
|
||||
attempt = attempt + 1,
|
||||
"transient error, will retry"
|
||||
);
|
||||
last_err = Some(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
|
||||
}
|
||||
|
||||
/// Fetch a URL then extract structured content.
|
||||
#[instrument(skip(self), fields(url = %url))]
|
||||
pub async fn fetch_and_extract(
|
||||
|
|
|
|||
235
crates/webclaw-fetch/src/extractors/instagram_post.rs
Normal file
235
crates/webclaw-fetch/src/extractors/instagram_post.rs
Normal file
|
|
@ -0,0 +1,235 @@
|
|||
//! Instagram post structured extractor.
|
||||
//!
|
||||
//! Uses Instagram's public embed endpoint
|
||||
//! `/p/{shortcode}/embed/captioned/` which returns SSR HTML with the
|
||||
//! full caption, author username, and thumbnail. No auth required.
|
||||
//! The same endpoint serves reels and IGTV under `/reel/{code}` and
|
||||
//! `/tv/{code}` URLs (we accept all three).
|
||||
|
||||
use regex::Regex;
|
||||
use serde_json::{Value, json};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry for the Instagram post extractor: its identifier,
/// display label, one-line description, and the URL shapes it accepts.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "instagram_post",
    label: "Instagram post",
    description: "Returns full caption, author username, thumbnail, and post type (post / reel / tv) via Instagram's public embed.",
    url_patterns: &[
        "https://www.instagram.com/p/{shortcode}/",
        "https://www.instagram.com/reel/{shortcode}/",
        "https://www.instagram.com/tv/{shortcode}/",
    ],
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
if !matches!(host, "www.instagram.com" | "instagram.com") {
|
||||
return false;
|
||||
}
|
||||
parse_shortcode(url).is_some()
|
||||
}
|
||||
|
||||
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
|
||||
let (kind, shortcode) = parse_shortcode(url).ok_or_else(|| {
|
||||
FetchError::Build(format!(
|
||||
"instagram_post: cannot parse shortcode from '{url}'"
|
||||
))
|
||||
})?;
|
||||
|
||||
// Instagram serves the same embed HTML for posts/reels/tv under /p/.
|
||||
let embed_url = format!("https://www.instagram.com/p/{shortcode}/embed/captioned/");
|
||||
let resp = client.fetch(&embed_url).await?;
|
||||
if resp.status != 200 {
|
||||
return Err(FetchError::Build(format!(
|
||||
"instagram embed returned status {} for {shortcode}",
|
||||
resp.status
|
||||
)));
|
||||
}
|
||||
|
||||
let html = &resp.html;
|
||||
let username = parse_username(html);
|
||||
let caption = parse_caption(html);
|
||||
let thumbnail = parse_thumbnail(html);
|
||||
|
||||
Ok(json!({
|
||||
"url": url,
|
||||
"embed_url": embed_url,
|
||||
"shortcode": shortcode,
|
||||
"kind": kind,
|
||||
"data_completeness": "embed",
|
||||
"author_username": username,
|
||||
"caption": caption,
|
||||
"thumbnail_url": thumbnail,
|
||||
"canonical_url": format!("https://www.instagram.com/{}/{shortcode}/", path_segment_for(kind)),
|
||||
}))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// URL parsing
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Returns the authority part of `url`: everything after the first
/// `://` (or the whole input when there is no scheme) up to the first
/// `/`, `?`, or `#`.
///
/// No normalisation is applied — case, port, and userinfo are returned
/// exactly as written, so callers comparing against a fixed host list
/// only match the plain lowercase form.
fn host_of(url: &str) -> &str {
    let rest = url.split_once("://").map_or(url, |(_, rest)| rest);
    // A query or fragment may follow the host directly, with no `/` between.
    rest.split(['/', '?', '#']).next().unwrap_or("")
}
|
||||
|
||||
/// Returns `(kind, shortcode)` where kind ∈ {`post`, `reel`, `tv`}.
///
/// The first path segment selects the kind (`p` → `post`, `reel` or
/// `reels` → `reel`, `tv` → `tv`); the second segment is the shortcode.
/// Query string and fragment are dropped before the path is located, and
/// empty segments (doubled or trailing slashes) are ignored.
///
/// Returns `None` when the URL has no `://` scheme separator, no path,
/// an unrecognised first segment, or no shortcode segment.
fn parse_shortcode(url: &str) -> Option<(&'static str, String)> {
    let (_, rest) = url.split_once("://")?;
    // Cut query/fragment first so a `/` inside them is never taken as path.
    let rest = rest.split(['?', '#']).next()?;
    let (_, path) = rest.split_once('/')?;

    let mut segs = path.split('/').filter(|s| !s.is_empty());
    let kind = match segs.next()? {
        "p" => "post",
        "reel" | "reels" => "reel",
        "tv" => "tv",
        _ => return None,
    };
    let shortcode = segs.next()?;
    Some((kind, shortcode.to_string()))
}
|
||||
|
||||
/// Maps a kind label back to the URL path segment used when building
/// the canonical URL: `reel` → `reel`, `tv` → `tv`, anything else → `p`.
fn path_segment_for(kind: &str) -> &'static str {
    match kind {
        "reel" => "reel",
        "tv" => "tv",
        _ => "p",
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HTML scraping
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Username appears as the anchor text inside `<a class="CaptionUsername">`.
|
||||
fn parse_username(html: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| Regex::new(r#"(?s)class="CaptionUsername"[^>]*>([^<]+)<"#).unwrap());
|
||||
re.captures(html)
|
||||
.and_then(|c| c.get(1))
|
||||
.map(|m| html_decode(m.as_str().trim()))
|
||||
}
|
||||
|
||||
/// Caption sits inside `<div class="Caption">` after the username anchor.
|
||||
/// We grab the whole Caption block and strip out the username link, time
|
||||
/// node, and any trailing "Photo by" / "View ... on Instagram" boilerplate.
|
||||
fn parse_caption(html: &str) -> Option<String> {
|
||||
static RE_OUTER: OnceLock<Regex> = OnceLock::new();
|
||||
let outer = RE_OUTER
|
||||
.get_or_init(|| Regex::new(r#"(?s)<div\s+class="Caption"[^>]*>(.*?)</div>"#).unwrap());
|
||||
let block = outer.captures(html)?.get(1)?.as_str();
|
||||
|
||||
// Strip everything wrapped in <a class="CaptionUsername">...</a>.
|
||||
static RE_USER: OnceLock<Regex> = OnceLock::new();
|
||||
let user_re = RE_USER
|
||||
.get_or_init(|| Regex::new(r#"(?s)<a[^>]*class="CaptionUsername"[^>]*>.*?</a>"#).unwrap());
|
||||
let stripped = user_re.replace_all(block, "");
|
||||
|
||||
// Then strip anything remaining tagged.
|
||||
static RE_TAGS: OnceLock<Regex> = OnceLock::new();
|
||||
let tag_re = RE_TAGS.get_or_init(|| Regex::new(r"<[^>]+>").unwrap());
|
||||
let text = tag_re.replace_all(&stripped, " ");
|
||||
|
||||
let cleaned = collapse_whitespace(&html_decode(text.trim()));
|
||||
if cleaned.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(cleaned)
|
||||
}
|
||||
}
|
||||
|
||||
/// Thumbnail is the `<img class="EmbeddedMediaImage">` inside the embed
|
||||
/// (or the og:image as fallback).
|
||||
fn parse_thumbnail(html: &str) -> Option<String> {
|
||||
static RE_IMG: OnceLock<Regex> = OnceLock::new();
|
||||
let img_re = RE_IMG.get_or_init(|| {
|
||||
Regex::new(r#"(?s)<img[^>]+class="[^"]*EmbeddedMediaImage[^"]*"[^>]+src="([^"]+)""#)
|
||||
.unwrap()
|
||||
});
|
||||
if let Some(m) = img_re.captures(html).and_then(|c| c.get(1)) {
|
||||
return Some(html_decode(m.as_str()));
|
||||
}
|
||||
static RE_OG: OnceLock<Regex> = OnceLock::new();
|
||||
let og_re = RE_OG.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:image"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
og_re
|
||||
.captures(html)
|
||||
.and_then(|c| c.get(1))
|
||||
.map(|m| html_decode(m.as_str()))
|
||||
}
|
||||
|
||||
/// Decodes the HTML character references that show up in scraped
/// attribute values and text.
///
/// Handles the named references `amp`, `lt`, `gt`, `quot`, `apos`,
/// `bull`, and `hellip`, plus any decimal (`#NNN`) or hexadecimal
/// (`#xHHH`) numeric reference that maps to a valid, non-NUL character.
/// Anything else — unknown names, malformed references, a bare
/// ampersand — is copied through unchanged.
///
/// The input is scanned once, left to right, so text produced by one
/// replacement is never decoded again (an escaped ampersand followed by
/// `lt;` stays literal text instead of turning into `<`).
fn html_decode(s: &str) -> String {
    /// Decodes a single reference at the start of `s` (which begins with
    /// an ampersand). Returns the character and the number of bytes used.
    fn decode_entity(s: &str) -> Option<(char, usize)> {
        // References are short; cap the search so a stray ampersand does
        // not scan the rest of the document looking for a semicolon.
        let end = s.bytes().take(12).position(|b| b == b';')?;
        let name = &s[1..end];
        let ch = match name {
            "amp" => '&',
            "lt" => '<',
            "gt" => '>',
            "quot" => '"',
            "apos" => '\'',
            "bull" => '\u{2022}',
            "hellip" => '\u{2026}',
            _ => {
                let digits = name.strip_prefix('#')?;
                let (radix, body) = match digits.strip_prefix(['x', 'X']) {
                    Some(hex) => (16, hex),
                    None => (10, digits),
                };
                // Reject signs and stray characters that the integer
                // parser would otherwise tolerate.
                if body.is_empty() || !body.bytes().all(|b| b.is_ascii_hexdigit()) {
                    return None;
                }
                let code = u32::from_str_radix(body, radix).ok()?;
                char::from_u32(code).filter(|c| *c != '\0')?
            }
        };
        Some((ch, end + 1))
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find('&') {
        out.push_str(&rest[..pos]);
        let tail = &rest[pos..];
        match decode_entity(tail) {
            Some((ch, used)) => {
                out.push(ch);
                rest = &tail[used..];
            }
            None => {
                out.push('&');
                rest = &tail[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
|
||||
|
||||
/// Trims `s` and replaces every run of whitespace with a single space.
fn collapse_whitespace(s: &str) -> String {
    s.split_whitespace().collect::<Vec<_>>().join(" ")
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // URL matching: post / reel / tv paths on instagram.com only.
    #[test]
    fn matches_post_reel_tv_urls() {
        assert!(matches("https://www.instagram.com/p/DT-RICMjeK5/"));
        assert!(matches(
            "https://www.instagram.com/p/DT-RICMjeK5/?img_index=1"
        ));
        assert!(matches("https://www.instagram.com/reel/abc123/"));
        assert!(matches("https://www.instagram.com/tv/abc123/"));
        assert!(!matches("https://www.instagram.com/ticketswave"));
        assert!(!matches("https://www.instagram.com/"));
        assert!(!matches("https://example.com/p/abc/"));
    }

    // Each path prefix maps to its kind label; query strings are ignored.
    #[test]
    fn parse_shortcode_reads_each_kind() {
        assert_eq!(
            parse_shortcode("https://www.instagram.com/p/DT-RICMjeK5/?img_index=1"),
            Some(("post", "DT-RICMjeK5".into()))
        );
        assert_eq!(
            parse_shortcode("https://www.instagram.com/reel/abc123/"),
            Some(("reel", "abc123".into()))
        );
        assert_eq!(
            parse_shortcode("https://www.instagram.com/tv/abc123"),
            Some(("tv", "abc123".into()))
        );
    }

    #[test]
    fn parse_username_pulls_anchor_text() {
        let html = r#"<a class="CaptionUsername" href="...">ticketswave</a>"#;
        assert_eq!(parse_username(html).as_deref(), Some("ticketswave"));
    }

    // The username anchor is removed from the caption text.
    #[test]
    fn parse_caption_strips_username_anchor() {
        let html = r#"<div class="Caption"><a class="CaptionUsername" href="...">ticketswave</a> Some caption text here</div>"#;
        assert_eq!(
            parse_caption(html).as_deref(),
            Some("Some caption text here")
        );
    }
}
|
||||
465
crates/webclaw-fetch/src/extractors/instagram_profile.rs
Normal file
465
crates/webclaw-fetch/src/extractors/instagram_profile.rs
Normal file
|
|
@ -0,0 +1,465 @@
|
|||
//! Instagram profile structured extractor.
|
||||
//!
|
||||
//! Hits Instagram's internal `web_profile_info` endpoint at
|
||||
//! `instagram.com/api/v1/users/web_profile_info/?username=X`. The
|
||||
//! `x-ig-app-id` header is Instagram's own public web-app id (not a
|
||||
//! secret) — the same value Instagram's own JavaScript bundle sends.
|
||||
//!
|
||||
//! Returns the full profile (bio, exact follower count, verified /
|
||||
//! business flags, profile picture) plus the **12 most recent posts**
|
||||
//! with shortcodes, like counts, types, thumbnails, and caption
|
||||
//! previews. Callers can fan out to `/v1/scrape/instagram_post` per
|
||||
//! shortcode to get the full caption + media.
|
||||
//!
|
||||
//! Pagination beyond 12 requires authenticated cookies + a CSRF token;
|
||||
//! we accept that as the practical ceiling for the unauth path. The
|
||||
//! cloud (with stored sessions) can paginate later as a follow-up.
|
||||
//!
|
||||
//! Falls back to OG-tag scraping of the public profile page if the API
|
||||
//! returns 401/403 — Instagram has tightened this endpoint multiple
|
||||
//! times, so we keep the second path warm.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry for the Instagram profile extractor: its identifier,
/// display label, one-line description, and the URL shape it accepts.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "instagram_profile",
    label: "Instagram profile",
    description: "Returns full profile metadata + the 12 most recent posts (shortcode, url, type, likes, thumbnail).",
    url_patterns: &["https://www.instagram.com/{username}/"],
};
|
||||
|
||||
/// Instagram's own public web-app identifier. Sent by their JS bundle
/// on every API call, accepted by the unauth endpoint, not a secret.
/// Used as the value of the `x-ig-app-id` request header.
const IG_APP_ID: &str = "936619743392459";
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
if !matches!(host, "www.instagram.com" | "instagram.com") {
|
||||
return false;
|
||||
}
|
||||
let path = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.and_then(|s| s.split_once('/'))
|
||||
.map(|(_, p)| p)
|
||||
.unwrap_or("");
|
||||
let stripped = path
|
||||
.split(['?', '#'])
|
||||
.next()
|
||||
.unwrap_or("")
|
||||
.trim_end_matches('/');
|
||||
let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
|
||||
segs.len() == 1 && !RESERVED.contains(&segs[0])
|
||||
}
|
||||
|
||||
/// First-level path segments that `matches` refuses to treat as a
/// username (post prefixes and site sections such as `explore` or
/// `accounts`). Compared case-sensitively.
const RESERVED: &[&str] = &[
    "p",
    "reel",
    "reels",
    "tv",
    "explore",
    "stories",
    "directory",
    "accounts",
    "about",
    "developer",
    "press",
    "api",
    "ads",
    "blog",
    "fragments",
    "terms",
    "privacy",
    "session",
    "login",
    "signup",
];
|
||||
|
||||
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
|
||||
let username = parse_username(url).ok_or_else(|| {
|
||||
FetchError::Build(format!(
|
||||
"instagram_profile: cannot parse username from '{url}'"
|
||||
))
|
||||
})?;
|
||||
|
||||
let api_url =
|
||||
format!("https://www.instagram.com/api/v1/users/web_profile_info/?username={username}");
|
||||
let extra_headers: &[(&str, &str)] = &[
|
||||
("x-ig-app-id", IG_APP_ID),
|
||||
("accept", "*/*"),
|
||||
("sec-fetch-site", "same-origin"),
|
||||
("x-requested-with", "XMLHttpRequest"),
|
||||
];
|
||||
let resp = client.fetch_with_headers(&api_url, extra_headers).await?;
|
||||
|
||||
if resp.status == 404 {
|
||||
return Err(FetchError::Build(format!(
|
||||
"instagram_profile: '{username}' not found"
|
||||
)));
|
||||
}
|
||||
// Auth wall fallback: Instagram occasionally tightens this endpoint
|
||||
// and starts returning 401/403/302 to a login page. When that
|
||||
// happens we still want to give the caller something useful — the
|
||||
// OG tags from the public HTML page (no posts list, but bio etc).
|
||||
if !(200..300).contains(&resp.status) {
|
||||
return og_fallback(client, &username, url, resp.status).await;
|
||||
}
|
||||
|
||||
let body: ApiResponse = serde_json::from_str(&resp.html)
|
||||
.map_err(|e| FetchError::BodyDecode(format!("instagram_profile parse: {e}")))?;
|
||||
let user = body.data.user;
|
||||
|
||||
let recent_posts: Vec<Value> = user
|
||||
.edge_owner_to_timeline_media
|
||||
.as_ref()
|
||||
.map(|m| m.edges.iter().map(|e| post_summary(&e.node)).collect())
|
||||
.unwrap_or_default();
|
||||
|
||||
Ok(json!({
|
||||
"url": url,
|
||||
"canonical_url": format!("https://www.instagram.com/{username}/"),
|
||||
"username": user.username.unwrap_or(username),
|
||||
"data_completeness": "api",
|
||||
"user_id": user.id,
|
||||
"full_name": user.full_name,
|
||||
"biography": user.biography,
|
||||
"biography_links": user.bio_links,
|
||||
"external_url": user.external_url,
|
||||
"category": user.category_name,
|
||||
"follower_count": user.edge_followed_by.map(|c| c.count),
|
||||
"following_count": user.edge_follow.map(|c| c.count),
|
||||
"post_count": user.edge_owner_to_timeline_media.as_ref().map(|m| m.count),
|
||||
"is_verified": user.is_verified,
|
||||
"is_private": user.is_private,
|
||||
"is_business": user.is_business_account,
|
||||
"is_professional": user.is_professional_account,
|
||||
"profile_pic_url": user.profile_pic_url_hd.or(user.profile_pic_url),
|
||||
"recent_posts": recent_posts,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Build the per-post summary the caller fans out from. Includes a
|
||||
/// constructed `url` so the loop is `for p in recent_posts: scrape('instagram_post', p.url)`.
|
||||
fn post_summary(n: &MediaNode) -> Value {
|
||||
let kind = classify(n);
|
||||
let url = match kind {
|
||||
"reel" => format!(
|
||||
"https://www.instagram.com/reel/{}/",
|
||||
n.shortcode.as_deref().unwrap_or("")
|
||||
),
|
||||
_ => format!(
|
||||
"https://www.instagram.com/p/{}/",
|
||||
n.shortcode.as_deref().unwrap_or("")
|
||||
),
|
||||
};
|
||||
let caption = n
|
||||
.edge_media_to_caption
|
||||
.as_ref()
|
||||
.and_then(|c| c.edges.first())
|
||||
.and_then(|e| e.node.text.clone());
|
||||
json!({
|
||||
"shortcode": n.shortcode,
|
||||
"url": url,
|
||||
"kind": kind,
|
||||
"is_video": n.is_video.unwrap_or(false),
|
||||
"video_views": n.video_view_count,
|
||||
"thumbnail_url": n.thumbnail_src.clone().or_else(|| n.display_url.clone()),
|
||||
"display_url": n.display_url,
|
||||
"like_count": n.edge_media_preview_like.as_ref().map(|c| c.count),
|
||||
"comment_count": n.edge_media_to_comment.as_ref().map(|c| c.count),
|
||||
"taken_at": n.taken_at_timestamp,
|
||||
"caption": caption,
|
||||
"alt_text": n.accessibility_caption,
|
||||
"dimensions": n.dimensions.as_ref().map(|d| json!({"width": d.width, "height": d.height})),
|
||||
"product_type": n.product_type,
|
||||
})
|
||||
}
|
||||
|
||||
/// Best-effort post-type classification. `clips` is reels; `feed` is
|
||||
/// the regular grid. Sidecar = multi-photo carousel.
|
||||
fn classify(n: &MediaNode) -> &'static str {
|
||||
if n.product_type.as_deref() == Some("clips") {
|
||||
return "reel";
|
||||
}
|
||||
match n.typename.as_deref() {
|
||||
Some("GraphSidecar") => "carousel",
|
||||
Some("GraphVideo") => "video",
|
||||
Some("GraphImage") => "photo",
|
||||
_ => "post",
|
||||
}
|
||||
}
|
||||
|
||||
/// Fallback when the API path is blocked: hit the public profile HTML,
|
||||
/// pull whatever OG tags we can. Returns less data and explicitly
|
||||
/// flags `data_completeness: "og_only"` so callers know.
|
||||
async fn og_fallback(
|
||||
client: &FetchClient,
|
||||
username: &str,
|
||||
original_url: &str,
|
||||
api_status: u16,
|
||||
) -> Result<Value, FetchError> {
|
||||
let canonical = format!("https://www.instagram.com/{username}/");
|
||||
let resp = client.fetch(&canonical).await?;
|
||||
if resp.status != 200 {
|
||||
return Err(FetchError::Build(format!(
|
||||
"instagram_profile: api status {api_status}, html status {} for {username}",
|
||||
resp.status
|
||||
)));
|
||||
}
|
||||
let og = parse_og_tags(&resp.html);
|
||||
let (followers, following, posts) =
|
||||
parse_counts_from_og_description(og.get("description").map(String::as_str));
|
||||
|
||||
Ok(json!({
|
||||
"url": original_url,
|
||||
"canonical_url": canonical,
|
||||
"username": username,
|
||||
"data_completeness": "og_only",
|
||||
"fallback_reason": format!("api returned {api_status}"),
|
||||
"full_name": parse_full_name(&og.get("title").cloned().unwrap_or_default()),
|
||||
"follower_count": followers,
|
||||
"following_count": following,
|
||||
"post_count": posts,
|
||||
"profile_pic_url": og.get("image").cloned(),
|
||||
"biography": null_value(),
|
||||
"is_verified": null_value(),
|
||||
"is_business": null_value(),
|
||||
"recent_posts": Vec::<Value>::new(),
|
||||
}))
|
||||
}
|
||||
|
||||
fn null_value() -> Value {
|
||||
Value::Null
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// URL parsing
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Returns the authority part of `url`: everything after the first
/// `://` (or the whole input when there is no scheme) up to the first
/// `/`, `?`, or `#`.
///
/// No normalisation is applied — case, port, and userinfo are returned
/// exactly as written, so callers comparing against a fixed host list
/// only match the plain lowercase form.
fn host_of(url: &str) -> &str {
    let rest = url.split_once("://").map_or(url, |(_, rest)| rest);
    // A query or fragment may follow the host directly, with no `/` between.
    rest.split(['/', '?', '#']).next().unwrap_or("")
}
|
||||
|
||||
/// Returns the first non-empty path segment of `url` as the username.
///
/// Query string and fragment are dropped before the path is located.
/// Returns `None` when the URL has no `://` scheme separator or no
/// non-empty path segment. No check is made that the segment is a
/// plausible handle; `matches` filters reserved paths beforehand.
fn parse_username(url: &str) -> Option<String> {
    let (_, rest) = url.split_once("://")?;
    // Cut query/fragment first so a `/` inside them is never taken as path.
    let rest = rest.split(['?', '#']).next()?;
    let (_, path) = rest.split_once('/')?;
    path.split('/')
        .find(|s| !s.is_empty())
        .map(str::to_string)
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// OG-fallback helpers (kept self-contained — same shape as the previous
|
||||
// version we shipped, retained as the safety net)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn parse_og_tags(html: &str) -> std::collections::HashMap<String, String> {
|
||||
use regex::Regex;
|
||||
use std::sync::OnceLock;
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
let mut out = std::collections::HashMap::new();
|
||||
for c in re.captures_iter(html) {
|
||||
let k = c
|
||||
.get(1)
|
||||
.map(|m| m.as_str().to_lowercase())
|
||||
.unwrap_or_default();
|
||||
let v = c
|
||||
.get(2)
|
||||
.map(|m| html_decode(m.as_str()))
|
||||
.unwrap_or_default();
|
||||
out.entry(k).or_insert(v);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// Extracts the display name from an `og:title` value: the text before
/// the first `(`, trimmed. Returns `None` when nothing is left.
///
/// `og_title` is expected to be already entity-decoded (values from
/// `parse_og_tags` are), so no further decoding is applied here —
/// decoding twice would corrupt text that legitimately contains an
/// escaped ampersand.
fn parse_full_name(og_title: &str) -> Option<String> {
    let name = og_title.split('(').next().unwrap_or(og_title).trim();
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
|
||||
|
||||
fn parse_counts_from_og_description(desc: Option<&str>) -> (Option<i64>, Option<i64>, Option<i64>) {
|
||||
let Some(text) = desc else {
|
||||
return (None, None, None);
|
||||
};
|
||||
let decoded = html_decode(text);
|
||||
use regex::Regex;
|
||||
use std::sync::OnceLock;
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r"(?i)([\d.,]+[KMB]?)\s*Followers,\s*([\d.,]+[KMB]?)\s*Following,\s*([\d.,]+[KMB]?)\s*Posts").unwrap()
|
||||
});
|
||||
if let Some(c) = re.captures(&decoded) {
|
||||
return (
|
||||
c.get(1).and_then(|m| parse_compact_number(m.as_str())),
|
||||
c.get(2).and_then(|m| parse_compact_number(m.as_str())),
|
||||
c.get(3).and_then(|m| parse_compact_number(m.as_str())),
|
||||
);
|
||||
}
|
||||
(None, None, None)
|
||||
}
|
||||
|
||||
/// Parses a human-formatted count such as `641`, `1,234`, `18K`,
/// `1.5M`, or `2B` into an integer.
///
/// * Surrounding whitespace and `,` thousands separators are ignored.
/// * A trailing `K`, `M`, or `B` (either case) multiplies by 10^3,
///   10^6, or 10^9.
/// * Whole numbers are handled with integer arithmetic, so they are
///   exact; `None` is returned if the multiplication overflows.
/// * Fractional values are rounded to the nearest integer, so binary
///   floating-point error cannot drop a unit (`1.005K` is 1005).
///
/// Returns `None` for empty, non-numeric, or non-finite input.
fn parse_compact_number(s: &str) -> Option<i64> {
    let s = s.trim();
    let (digits, multiplier) = match s.chars().last().map(|c| c.to_ascii_uppercase()) {
        // The suffix is a single ASCII byte, so cutting one byte is safe.
        Some('K') => (&s[..s.len() - 1], 1_000i64),
        Some('M') => (&s[..s.len() - 1], 1_000_000i64),
        Some('B') => (&s[..s.len() - 1], 1_000_000_000i64),
        _ => (s, 1i64),
    };
    let cleaned: String = digits.chars().filter(|c| *c != ',').collect();

    // Exact path for whole numbers.
    if let Ok(whole) = cleaned.parse::<i64>() {
        return whole.checked_mul(multiplier);
    }

    let value = cleaned.parse::<f64>().ok()?;
    if !value.is_finite() {
        return None;
    }
    // The float-to-int cast saturates at the i64 bounds.
    Some((value * multiplier as f64).round() as i64)
}
|
||||
|
||||
/// Decodes the HTML character references that show up in scraped
/// attribute values and text.
///
/// Handles the named references `amp`, `lt`, `gt`, `quot`, `apos`,
/// `bull`, and `hellip`, plus any decimal (`#NNN`) or hexadecimal
/// (`#xHHH`) numeric reference that maps to a valid, non-NUL character.
/// Anything else — unknown names, malformed references, a bare
/// ampersand — is copied through unchanged.
///
/// The input is scanned once, left to right, so text produced by one
/// replacement is never decoded again (an escaped ampersand followed by
/// `lt;` stays literal text instead of turning into `<`).
fn html_decode(s: &str) -> String {
    /// Decodes a single reference at the start of `s` (which begins with
    /// an ampersand). Returns the character and the number of bytes used.
    fn decode_entity(s: &str) -> Option<(char, usize)> {
        // References are short; cap the search so a stray ampersand does
        // not scan the rest of the document looking for a semicolon.
        let end = s.bytes().take(12).position(|b| b == b';')?;
        let name = &s[1..end];
        let ch = match name {
            "amp" => '&',
            "lt" => '<',
            "gt" => '>',
            "quot" => '"',
            "apos" => '\'',
            "bull" => '\u{2022}',
            "hellip" => '\u{2026}',
            _ => {
                let digits = name.strip_prefix('#')?;
                let (radix, body) = match digits.strip_prefix(['x', 'X']) {
                    Some(hex) => (16, hex),
                    None => (10, digits),
                };
                // Reject signs and stray characters that the integer
                // parser would otherwise tolerate.
                if body.is_empty() || !body.bytes().all(|b| b.is_ascii_hexdigit()) {
                    return None;
                }
                let code = u32::from_str_radix(body, radix).ok()?;
                char::from_u32(code).filter(|c| *c != '\0')?
            }
        };
        Some((ch, end + 1))
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find('&') {
        out.push_str(&rest[..pos]);
        let tail = &rest[pos..];
        match decode_entity(tail) {
            Some((ch, used)) => {
                out.push(ch);
                rest = &tail[used..];
            }
            None => {
                out.push('&');
                rest = &tail[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Instagram web_profile_info API types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ApiResponse {
|
||||
data: ApiData,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ApiData {
|
||||
user: User,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct User {
|
||||
id: Option<String>,
|
||||
username: Option<String>,
|
||||
full_name: Option<String>,
|
||||
biography: Option<String>,
|
||||
bio_links: Option<Vec<serde_json::Value>>,
|
||||
external_url: Option<String>,
|
||||
category_name: Option<String>,
|
||||
profile_pic_url: Option<String>,
|
||||
profile_pic_url_hd: Option<String>,
|
||||
is_verified: Option<bool>,
|
||||
is_private: Option<bool>,
|
||||
is_business_account: Option<bool>,
|
||||
is_professional_account: Option<bool>,
|
||||
edge_followed_by: Option<EdgeCount>,
|
||||
edge_follow: Option<EdgeCount>,
|
||||
edge_owner_to_timeline_media: Option<MediaEdges>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct EdgeCount {
|
||||
count: i64,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct MediaEdges {
|
||||
count: i64,
|
||||
edges: Vec<MediaEdge>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct MediaEdge {
|
||||
node: MediaNode,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct MediaNode {
|
||||
#[serde(rename = "__typename")]
|
||||
typename: Option<String>,
|
||||
shortcode: Option<String>,
|
||||
is_video: Option<bool>,
|
||||
video_view_count: Option<i64>,
|
||||
display_url: Option<String>,
|
||||
thumbnail_src: Option<String>,
|
||||
accessibility_caption: Option<String>,
|
||||
taken_at_timestamp: Option<i64>,
|
||||
product_type: Option<String>,
|
||||
dimensions: Option<Dimensions>,
|
||||
edge_media_preview_like: Option<EdgeCount>,
|
||||
edge_media_to_comment: Option<EdgeCount>,
|
||||
edge_media_to_caption: Option<CaptionEdges>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Dimensions {
|
||||
width: i64,
|
||||
height: i64,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct CaptionEdges {
|
||||
edges: Vec<CaptionEdge>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct CaptionEdge {
|
||||
node: CaptionNode,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct CaptionNode {
|
||||
text: Option<String>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Single-segment, non-reserved paths on instagram.com are profiles.
    #[test]
    fn matches_profile_urls() {
        assert!(matches("https://www.instagram.com/ticketswave"));
        assert!(matches("https://www.instagram.com/ticketswave/"));
        assert!(matches("https://instagram.com/0xmassi/?hl=en"));
        assert!(!matches("https://www.instagram.com/p/DT-RICMjeK5/"));
        assert!(!matches("https://www.instagram.com/explore"));
        assert!(!matches("https://www.instagram.com/"));
        assert!(!matches("https://example.com/foo"));
    }

    // Everything from the first `(` onwards is dropped from the title.
    #[test]
    fn parse_full_name_strips_handle() {
        assert_eq!(
            parse_full_name("Ticket Wave (@ticketswave) • Instagram photos and videos"),
            Some("Ticket Wave".into())
        );
    }

    #[test]
    fn compact_number_handles_kmb() {
        assert_eq!(parse_compact_number("18K"), Some(18_000));
        assert_eq!(parse_compact_number("1.5M"), Some(1_500_000));
        assert_eq!(parse_compact_number("1,234"), Some(1_234));
        assert_eq!(parse_compact_number("641"), Some(641));
    }
}
|
||||
266
crates/webclaw-fetch/src/extractors/linkedin_post.rs
Normal file
266
crates/webclaw-fetch/src/extractors/linkedin_post.rs
Normal file
|
|
@ -0,0 +1,266 @@
|
|||
//! LinkedIn post structured extractor.
|
||||
//!
|
||||
//! Uses the public embed endpoint `/embed/feed/update/{urn}` which
|
||||
//! LinkedIn provides for sites that want to render a post inline. No
|
||||
//! auth required, returns SSR HTML with the full post body, OG tags,
|
||||
//! image, and a link back to the original post.
|
||||
//!
|
||||
//! Accepts both URN forms (`urn:li:share:N` and `urn:li:activity:N`)
|
||||
//! and pretty post URLs (`/posts/{user}_{slug}-{id}-{suffix}`) by
|
||||
//! pulling the trailing numeric id and converting to an activity URN.
|
||||
|
||||
use regex::Regex;
|
||||
use serde_json::{Value, json};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::client::FetchClient;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Catalog entry for the LinkedIn post extractor: its machine-readable
/// name, display label, one-line description, and the URL shapes it
/// advertises.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "linkedin_post",
    label: "LinkedIn post",
    description: "Returns post body, author name, image, and original URL via LinkedIn's public embed endpoint.",
    url_patterns: &[
        "https://www.linkedin.com/feed/update/urn:li:share:{id}",
        "https://www.linkedin.com/feed/update/urn:li:activity:{id}",
        "https://www.linkedin.com/posts/{user}_{slug}-{id}-{suffix}",
    ],
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
if !matches!(host, "www.linkedin.com" | "linkedin.com") {
|
||||
return false;
|
||||
}
|
||||
url.contains("/feed/update/urn:li:") || url.contains("/posts/")
|
||||
}
|
||||
|
||||
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
|
||||
let urn = extract_urn(url).ok_or_else(|| {
|
||||
FetchError::Build(format!(
|
||||
"linkedin_post: cannot extract URN from '{url}' (expected /feed/update/urn:li:... or /posts/{{slug}}-{{id}})"
|
||||
))
|
||||
})?;
|
||||
|
||||
let embed_url = format!("https://www.linkedin.com/embed/feed/update/{urn}");
|
||||
let resp = client.fetch(&embed_url).await?;
|
||||
if resp.status != 200 {
|
||||
return Err(FetchError::Build(format!(
|
||||
"linkedin embed returned status {} for {urn}",
|
||||
resp.status
|
||||
)));
|
||||
}
|
||||
|
||||
let html = &resp.html;
|
||||
let og = parse_og_tags(html);
|
||||
let body = parse_post_body(html);
|
||||
let author = parse_author(html);
|
||||
let canonical_url = og.get("url").cloned().unwrap_or_else(|| embed_url.clone());
|
||||
|
||||
Ok(json!({
|
||||
"url": url,
|
||||
"embed_url": embed_url,
|
||||
"urn": urn,
|
||||
"canonical_url": canonical_url,
|
||||
"data_completeness": "embed",
|
||||
"title": og.get("title").cloned(),
|
||||
"body": body,
|
||||
"author_name": author,
|
||||
"image_url": og.get("image").cloned(),
|
||||
"site_name": og.get("site_name").cloned().unwrap_or_else(|| "LinkedIn".into()),
|
||||
}))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// URN extraction
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Pull a `urn:li:share:N` or `urn:li:activity:N` from any LinkedIn URL.
|
||||
/// `/posts/{slug}-{id}-{suffix}` URLs encode the activity id as the second-
|
||||
/// to-last `-` separated chunk. Both forms map to a URN we can hit the
|
||||
/// embed endpoint with.
|
||||
fn extract_urn(url: &str) -> Option<String> {
|
||||
if let Some(idx) = url.find("urn:li:") {
|
||||
let tail = &url[idx..];
|
||||
let end = tail.find(['/', '?', '#']).unwrap_or(tail.len());
|
||||
let urn = &tail[..end];
|
||||
// Validate shape: urn:li:{type}:{digits}
|
||||
let mut parts = urn.split(':');
|
||||
if parts.next() == Some("urn")
|
||||
&& parts.next() == Some("li")
|
||||
&& parts.next().is_some()
|
||||
&& parts
|
||||
.next()
|
||||
.filter(|p| p.chars().all(|c| c.is_ascii_digit()))
|
||||
.is_some()
|
||||
{
|
||||
return Some(urn.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// /posts/{user}_{slug}-{19-digit-id}-{4-char-hash}/ — id is the second-
|
||||
// to-last segment after the last `-`.
|
||||
if url.contains("/posts/") {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re =
|
||||
RE.get_or_init(|| Regex::new(r"/posts/[^/]*?-(\d{15,})-[A-Za-z0-9]{2,}/?").unwrap());
|
||||
if let Some(c) = re.captures(url)
|
||||
&& let Some(id) = c.get(1)
|
||||
{
|
||||
return Some(format!("urn:li:activity:{}", id.as_str()));
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HTML scraping
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Pull `og:foo` → value pairs out of `<meta property="og:..." content="...">`.
|
||||
/// Returns lowercased keys with leading `og:` stripped.
|
||||
fn parse_og_tags(html: &str) -> std::collections::HashMap<String, String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
let mut out = std::collections::HashMap::new();
|
||||
for c in re.captures_iter(html) {
|
||||
let k = c
|
||||
.get(1)
|
||||
.map(|m| m.as_str().to_lowercase())
|
||||
.unwrap_or_default();
|
||||
let v = c
|
||||
.get(2)
|
||||
.map(|m| html_decode(m.as_str()))
|
||||
.unwrap_or_default();
|
||||
out.entry(k).or_insert(v);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// Extract the post body text from the embed page. LinkedIn renders it
|
||||
/// inside `<p class="attributed-text-segment-list__content ...">{text}</p>`
|
||||
/// where the inner content can include nested `<a>` tags for links.
|
||||
fn parse_post_body(html: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(
|
||||
r#"(?s)<p[^>]+class="[^"]*attributed-text-segment-list__content[^"]*"[^>]*>(.*?)</p>"#,
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
let inner = re.captures(html).and_then(|c| c.get(1))?.as_str();
|
||||
Some(strip_tags(inner).trim().to_string())
|
||||
}
|
||||
|
||||
/// Author name lives in the `<title>` like:
|
||||
/// "55 founding members are in… | Orc Dev"
|
||||
/// The chunk after the final `|` is the author display name. Falls back
|
||||
/// to the og:title minus the post body if there's no title.
|
||||
fn parse_author(html: &str) -> Option<String> {
|
||||
static RE_TITLE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE_TITLE.get_or_init(|| Regex::new(r"<title>([^<]+)</title>").unwrap());
|
||||
let title = re.captures(html).and_then(|c| c.get(1))?.as_str();
|
||||
title
|
||||
.rsplit_once('|')
|
||||
.map(|(_, name)| html_decode(name.trim()))
|
||||
}
|
||||
|
||||
/// Decode the small set of HTML entities LinkedIn (and Instagram, etc.)
/// put into OG content attributes and titles.
///
/// Handles `&`, `<`, `>`, `"`, `'`, `@`, `•` and `…` in their common named,
/// decimal and hexadecimal spellings. Unknown entities and bare `&`
/// characters are passed through untouched.
///
/// The input is scanned once from left to right, so text produced by one
/// replacement is never decoded again: `&amp;lt;` becomes `&lt;`, not `<`.
fn html_decode(s: &str) -> String {
    const ENTITIES: &[(&str, &str)] = &[
        ("&amp;", "&"),
        ("&#38;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#34;", "\""),
        ("&#39;", "'"),
        ("&#x27;", "'"),
        ("&apos;", "'"),
        ("&#64;", "@"),
        ("&#x40;", "@"),
        ("&commat;", "@"),
        ("&bull;", "\u{2022}"),
        ("&#8226;", "\u{2022}"),
        ("&#x2022;", "\u{2022}"),
        ("&hellip;", "\u{2026}"),
        ("&#8230;", "\u{2026}"),
        ("&#x2026;", "\u{2026}"),
    ];

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        match ENTITIES.iter().find(|&&(entity, _)| tail.starts_with(entity)) {
            Some(&(entity, replacement)) => {
                out.push_str(replacement);
                rest = &tail[entity.len()..];
            }
            None => {
                // Not an entity we know: keep the ampersand literally.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
|
||||
|
||||
/// Crude HTML tag stripper for the post body. Preserves text inside
|
||||
/// nested anchors so URLs don't disappear, and collapses runs of
|
||||
/// whitespace introduced by line wrapping.
|
||||
fn strip_tags(html: &str) -> String {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| Regex::new(r"<[^>]+>").unwrap());
|
||||
let no_tags = re.replace_all(html, "").to_string();
|
||||
html_decode(&no_tags)
|
||||
}
|
||||
|
||||
/// Return the authority part of `url`: the text between the first `://`
/// and the next `/`.
///
/// When the input has no `://`, the text before the first `/` is returned
/// instead. Ports, userinfo, query strings and fragments are not removed.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    let _scheme = pieces.next();
    let after_scheme = match pieces.next() {
        Some(rest) => rest,
        None => url,
    };

    match after_scheme.find('/') {
        Some(slash) => &after_scheme[..slash],
        None => after_scheme,
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_li_post_urls() {
        assert!(matches(
            "https://www.linkedin.com/feed/update/urn:li:share:7452618582213144577/"
        ));
        assert!(matches(
            "https://www.linkedin.com/feed/update/urn:li:activity:7452618583290892288"
        ));
        assert!(matches(
            "https://www.linkedin.com/posts/somebody_some-slug-7452618583290892288-aB1c"
        ));
        assert!(!matches("https://www.linkedin.com/in/foo"));
        assert!(!matches("https://www.linkedin.com/"));
        assert!(!matches("https://example.com/feed/update/urn:li:share:1"));
    }

    #[test]
    fn extract_urn_from_share_url() {
        assert_eq!(
            extract_urn("https://www.linkedin.com/feed/update/urn:li:share:7452618582213144577/"),
            Some("urn:li:share:7452618582213144577".into())
        );
    }

    #[test]
    fn extract_urn_from_activity_url() {
        assert_eq!(
            extract_urn("https://www.linkedin.com/feed/update/urn:li:activity:7452618583290892288"),
            Some("urn:li:activity:7452618583290892288".into())
        );
    }

    #[test]
    fn extract_urn_from_pretty_post_url() {
        assert_eq!(
            extract_urn(
                "https://www.linkedin.com/posts/somebody_some-slug-7452618583290892288-aB1c/"
            ),
            Some("urn:li:activity:7452618583290892288".into())
        );
    }

    #[test]
    fn parse_og_tags_basic() {
        let html = r#"<meta property="og:image" content="https://x.com/a.png">
            <meta property="og:url" content="https://example.com/x">"#;
        let og = parse_og_tags(html);
        assert_eq!(
            og.get("image").map(String::as_str),
            Some("https://x.com/a.png")
        );
        assert_eq!(
            og.get("url").map(String::as_str),
            Some("https://example.com/x")
        );
    }

    #[test]
    fn parse_post_body_strips_anchor_tags() {
        let html = r#"<p class="attributed-text-segment-list__content text-color-text" dir="ltr">Hello <a href="x">link</a> world</p>"#;
        assert_eq!(parse_post_body(html).as_deref(), Some("Hello link world"));
    }

    #[test]
    fn parse_author_takes_text_after_last_pipe() {
        let html = "<title>Launch day | update | Orc Dev</title>";
        assert_eq!(parse_author(html).as_deref(), Some("Orc Dev"));
        assert_eq!(parse_author("<title>No separator here</title>"), None);
    }

    #[test]
    fn html_decode_handles_common_entities() {
        // The input must contain real entities; otherwise the assertion is
        // a tautology that passes for any implementation.
        assert_eq!(html_decode("AT&amp;T &#64;jane"), "AT&T @jane");
    }
}
|
||||
|
|
@ -24,6 +24,9 @@ pub mod github_repo;
|
|||
pub mod hackernews;
|
||||
pub mod huggingface_dataset;
|
||||
pub mod huggingface_model;
|
||||
pub mod instagram_post;
|
||||
pub mod instagram_profile;
|
||||
pub mod linkedin_post;
|
||||
pub mod npm;
|
||||
pub mod pypi;
|
||||
pub mod reddit;
|
||||
|
|
@ -67,6 +70,9 @@ pub fn list() -> Vec<ExtractorInfo> {
|
|||
docker_hub::INFO,
|
||||
dev_to::INFO,
|
||||
stackoverflow::INFO,
|
||||
linkedin_post::INFO,
|
||||
instagram_post::INFO,
|
||||
instagram_profile::INFO,
|
||||
]
|
||||
}
|
||||
|
||||
|
|
@ -171,6 +177,27 @@ pub async fn dispatch_by_url(
|
|||
.map(|v| (stackoverflow::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
if linkedin_post::matches(url) {
|
||||
return Some(
|
||||
linkedin_post::extract(client, url)
|
||||
.await
|
||||
.map(|v| (linkedin_post::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
if instagram_post::matches(url) {
|
||||
return Some(
|
||||
instagram_post::extract(client, url)
|
||||
.await
|
||||
.map(|v| (instagram_post::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
if instagram_profile::matches(url) {
|
||||
return Some(
|
||||
instagram_profile::extract(client, url)
|
||||
.await
|
||||
.map(|v| (instagram_profile::INFO.name, v)),
|
||||
);
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
|
|
@ -259,6 +286,24 @@ pub async fn dispatch_by_name(
|
|||
})
|
||||
.await
|
||||
}
|
||||
n if n == linkedin_post::INFO.name => {
|
||||
run_or_mismatch(linkedin_post::matches(url), n, url, || {
|
||||
linkedin_post::extract(client, url)
|
||||
})
|
||||
.await
|
||||
}
|
||||
n if n == instagram_post::INFO.name => {
|
||||
run_or_mismatch(instagram_post::matches(url), n, url, || {
|
||||
instagram_post::extract(client, url)
|
||||
})
|
||||
.await
|
||||
}
|
||||
n if n == instagram_profile::INFO.name => {
|
||||
run_or_mismatch(instagram_profile::matches(url), n, url, || {
|
||||
instagram_profile::extract(client, url)
|
||||
})
|
||||
.await
|
||||
}
|
||||
_ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())),
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue