mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-29 03:39:37 +02:00
feat(extractors): add LinkedIn + Instagram with profile-to-posts fan-out
3 social-network extractors that work entirely without auth, using
public embed/preview endpoints + Instagram's own SEO-facing API:
- linkedin_post: /embed/feed/update/{urn} returns full body,
author, image, OG tags. Accepts both the urn:li:share
and urn:li:activity URN forms plus the pretty
/posts/{slug}-{id}-{suffix} URLs.
- instagram_post: /p/{shortcode}/embed/captioned/ returns the full
caption, username, thumbnail. Same endpoint serves
reels and IGTV, kind correctly classified.
- instagram_profile: /api/v1/users/web_profile_info/?username=X with the
x-ig-app-id header (Instagram's public web-app id,
sent by their own JS bundle). Returns the full
profile + the 12 most recent posts with shortcodes,
kinds, like/comment counts, thumbnails, and caption
previews. Falls back to OG-tag scraping of the
public HTML if the API ever 401/403s.
The IG profile output is shaped so callers can fan out cleanly:
    for p in profile.recent_posts:
        scrape('instagram_post', p.url)
giving you 'whole profile + every recent post' in one loop. End-to-end
tested against ticketswave: 1 profile call + 12 post calls in ~3.5s.
Pagination beyond 12 posts requires authenticated cookies and is left
for the cloud where we can stash a session.
Infrastructure change: added FetchClient::fetch_with_headers so
extractors can satisfy site-specific request headers (here x-ig-app-id;
later github_pr will use this for Authorization, etc.) without polluting
the global FetchConfig.headers map. Same retry semantics as fetch().
Catalog now exposes 17 extractors via /v1/extractors. Total unit tests
across the module: 47 passing. Clippy clean. Fmt clean.
Live test on the maintainer's example URLs:
- LinkedIn post (urn:li:share:7452618582213144577): 'Orc Dev' / full body
/ shipper.club link / CDN image extracted in 250ms.
- Instagram post (DT-RICMjeK5): 835-char Slovak caption, ticketswave
username, thumbnail. 200ms.
- Instagram profile (ticketswave): 18,473 followers (exact, not
rounded), is_verified=True, is_business=True, biography with emojis,
12 recent posts with shortcodes + kinds + likes. 400ms.
Out of scope for this wave (require infra we don't have):
- linkedin_profile: returns 999 to all bot UAs, needs OAuth
- facebook_post / facebook_page: content is JS-loaded, needs cloud Chrome
- facebook_profile (personal): not publicly accessible by design
This commit is contained in:
parent
b041f3cddd
commit
3bb0a4bca0
7 changed files with 1085 additions and 1 deletions
|
|
@@ -279,14 +279,85 @@ impl FetchClient {
|
|||
|
||||
/// Single fetch attempt.
|
||||
async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||
self.fetch_once_with_headers(url, &[]).await
|
||||
}
|
||||
|
||||
/// Single fetch attempt with optional per-request headers appended
|
||||
/// after the profile defaults. Used by extractors that need to
|
||||
/// satisfy site-specific headers (e.g. `x-ig-app-id` for Instagram's
|
||||
/// internal API).
|
||||
async fn fetch_once_with_headers(
|
||||
&self,
|
||||
url: &str,
|
||||
extra: &[(&str, &str)],
|
||||
) -> Result<FetchResult, FetchError> {
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
|
||||
let resp = client.get(url).send().await?;
|
||||
let mut req = client.get(url);
|
||||
for (k, v) in extra {
|
||||
req = req.header(*k, *v);
|
||||
}
|
||||
let resp = req.send().await?;
|
||||
let response = Response::from_wreq(resp).await?;
|
||||
response_to_result(response, start)
|
||||
}
|
||||
|
||||
/// Fetch a URL with extra per-request headers appended after the
|
||||
/// browser-profile defaults. Same retry semantics as `fetch`.
|
||||
///
|
||||
/// Use this when an upstream API requires a header the global
|
||||
/// `FetchConfig.headers` shouldn't carry to other hosts (Instagram's
|
||||
/// `x-ig-app-id`, GitHub's `Authorization` once we wire `GITHUB_TOKEN`,
|
||||
/// Reddit's compliant UA when we add OAuth, etc.).
|
||||
#[instrument(skip(self, extra), fields(url = %url, extra_count = extra.len()))]
|
||||
pub async fn fetch_with_headers(
|
||||
&self,
|
||||
url: &str,
|
||||
extra: &[(&str, &str)],
|
||||
) -> Result<FetchResult, FetchError> {
|
||||
let delays = [Duration::ZERO, Duration::from_secs(1)];
|
||||
let mut last_err = None;
|
||||
|
||||
for (attempt, delay) in delays.iter().enumerate() {
|
||||
if attempt > 0 {
|
||||
tokio::time::sleep(*delay).await;
|
||||
}
|
||||
match self.fetch_once_with_headers(url, extra).await {
|
||||
Ok(result) => {
|
||||
if is_retryable_status(result.status) && attempt < delays.len() - 1 {
|
||||
warn!(
|
||||
url,
|
||||
status = result.status,
|
||||
attempt = attempt + 1,
|
||||
"retryable status, will retry"
|
||||
);
|
||||
last_err = Some(FetchError::Build(format!("HTTP {}", result.status)));
|
||||
continue;
|
||||
}
|
||||
if attempt > 0 {
|
||||
debug!(url, attempt = attempt + 1, "retry succeeded");
|
||||
}
|
||||
return Ok(result);
|
||||
}
|
||||
Err(e) => {
|
||||
if !is_retryable_error(&e) || attempt == delays.len() - 1 {
|
||||
return Err(e);
|
||||
}
|
||||
warn!(
|
||||
url,
|
||||
error = %e,
|
||||
attempt = attempt + 1,
|
||||
"transient error, will retry"
|
||||
);
|
||||
last_err = Some(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
|
||||
}
|
||||
|
||||
/// Fetch a URL then extract structured content.
|
||||
#[instrument(skip(self), fields(url = %url))]
|
||||
pub async fn fetch_and_extract(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue