mirror of
https://github.com/clucraft/PriceGhost.git
synced 2026-06-08 15:05:16 +02:00
- Add helper to detect savings/combo/bundle containers
- Prioritize JSON-LD data as primary price source (most reliable)
- Skip price elements inside savings containers
- Add minimum price threshold to filter out discount amounts

Fixes an issue where the $189.99 bundle savings amount was extracted instead of the actual $675.59 product price for items like the AMD Ryzen 9 9950X3D.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1086 lines
35 KiB
TypeScript
1086 lines
35 KiB
TypeScript
import axios, { AxiosError } from 'axios';
|
|
import { load, type CheerioAPI } from 'cheerio';
|
|
import puppeteer from 'puppeteer-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import {
|
|
parsePrice,
|
|
ParsedPrice,
|
|
findMostLikelyPrice,
|
|
} from '../utils/priceParser';
|
|
|
|
// Add stealth plugin to avoid bot detection (Cloudflare, etc.)
|
|
puppeteer.use(StealthPlugin());
|
|
|
|
/**
 * Availability of a scraped product. `'unknown'` is the value used when no
 * availability indicator could be determined.
 */
export type StockStatus =
  | 'in_stock'
  | 'out_of_stock'
  | 'unknown';
|
|
|
|
// Browser-based scraping for sites that block HTTP requests (e.g., Cloudflare)
|
|
/**
 * Renders `url` in headless Chromium (puppeteer-extra, with the stealth plugin
 * registered at module load) and returns the page's full HTML.
 *
 * After navigation the function performs a few mouse moves and a small scroll,
 * and waits up to 20 seconds for a Cloudflare challenge page (detected by its
 * title) to give way to the real page.
 *
 * @param url Absolute URL of the page to load.
 * @returns The serialized HTML of the page after the waits above.
 * @throws Whatever puppeteer throws on launch or navigation failure; the
 *   browser is always closed before the error propagates.
 */
async function scrapeWithBrowser(url: string): Promise<string> {
  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  const launchArgs = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-blink-features=AutomationControlled',
    '--disable-infobars',
    '--window-size=1920,1080',
    '--start-maximized',
  ];

  const browser = await puppeteer.launch({
    headless: true,
    args: launchArgs,
    executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || undefined,
    ignoreDefaultArgs: ['--enable-automation'],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Load the page and let network activity settle.
    await page.goto(url, { waitUntil: 'networkidle2', timeout: 45000 });

    // A couple of pointer moves so the session looks less scripted.
    await page.mouse.move(100, 200);
    await pause(500);
    await page.mouse.move(300, 400);

    // Poll the title until it no longer looks like a Cloudflare challenge
    // ("Just a moment...", "Checking your browser") or the deadline passes.
    const challengeDeadline = Date.now() + 20000;
    while (Date.now() < challengeDeadline) {
      const title = await page.title();
      const lowered = title.toLowerCase();
      const onChallengePage =
        lowered.includes('just a moment') || lowered.includes('checking your browser');
      if (!onChallengePage) {
        break;
      }

      console.log(`[Browser] Waiting for Cloudflare challenge to complete... (${title})`);
      // Keep the pointer moving at random positions while waiting.
      await page.mouse.move(100 + Math.random() * 500, 100 + Math.random() * 400);
      await pause(2000);
    }

    // Scroll a little, then give the page a second before taking the snapshot.
    // eslint-disable-next-line @typescript-eslint/no-implied-eval
    await page.evaluate('window.scrollBy(0, 300)');
    await pause(1000);

    return await page.content();
  } finally {
    await browser.close();
  }
}
|
|
|
|
/**
 * Result of scraping one product page. Fields that could not be determined
 * are `null` (or `'unknown'` for the stock status).
 */
export interface ScrapedProduct {
  /** Product title, or `null` when none was found. */
  name: string | null;
  /** Parsed price with currency, or `null` when none was found. */
  price: ParsedPrice | null;
  /** Product image URL, or `null` when none was found. */
  imageUrl: string | null;
  /** The URL that was scraped. */
  url: string;
  /** Detected availability. */
  stockStatus: StockStatus;
}
|
|
|
|
// Site-specific scraper configurations
|
|
/** A scraper specialised for one retailer's page structure. */
interface SiteScraper {
  /** Returns true when this scraper should handle `url`. */
  match: (url: string) => boolean;
  /**
   * Extracts whatever product fields it can from the loaded document.
   * Any field may be omitted; `url` is never part of the result.
   */
  scrape: ($: CheerioAPI, url: string) => Partial<Omit<ScrapedProduct, 'url'>>;
}
|
|
|
|
/** Cheerio document handle as received by `SiteScraper.scrape`. */
type ScraperDom = Parameters<SiteScraper['scrape']>[0];

/**
 * Tests `hostPattern` against the hostname of `url` rather than the whole URL,
 * so a retailer name that merely appears in a path or query string (for
 * example `?ref=amazon.com`) does not select that retailer's scraper.
 * When `url` cannot be parsed as an absolute URL the raw string is tested.
 */
function urlHostMatches(url: string, hostPattern: RegExp): boolean {
  let target = url;
  try {
    target = new URL(url).hostname;
  } catch (_e) {
    // Not an absolute URL; fall back to matching the raw string.
  }
  return hostPattern.test(target);
}

/**
 * Walks `selectors` in priority order and returns the first price that
 * `parsePrice` can read from the text of each selector's first match.
 */
function firstParsedPrice($: ScraperDom, selectors: readonly string[]): ParsedPrice | null {
  for (const selector of selectors) {
    const el = $(selector).first();
    if (!el.length) continue;
    const parsed = parsePrice(el.text().trim());
    if (parsed) return parsed;
  }
  return null;
}

/**
 * Retailer-specific scrapers, tried in array order; the first entry whose
 * `match` accepts the URL is used. Each `match` pattern is anchored to a
 * hostname label boundary, so "supertarget.com" is not treated as target.com
 * while regional hosts such as "amazon.com.mx" still match.
 */
const siteScrapers: SiteScraper[] = [
  // Amazon
  {
    match: (url) =>
      urlHostMatches(url, /(^|\.)amazon\.(com|co\.uk|ca|de|fr|es|it|co\.jp|in|com\.au)(?![a-z0-9-])/i),
    scrape: ($) => {
      // Amounts below this are assumed to be coupon values, not product prices.
      const minPrice = 2;

      // True when any ancestor of `el` looks like a coupon/savings container.
      const isInCouponContainer = (el: ReturnType<typeof $>) => {
        for (const parent of el.parents().toArray()) {
          const wrapped = $(parent);
          const idAndClass = (wrapped.attr('id') || '') + (wrapped.attr('class') || '');
          if (/coupon|savings|save\s*\$|clipcoupon|promoprice/i.test(idAndClass)) {
            return true;
          }
          // Text-based detection only applies to small containers, so large
          // page sections that merely mention a coupon are not excluded.
          const text = wrapped.text().toLowerCase();
          if (
            text.length < 100 &&
            (text.includes('save $') || text.includes('coupon') || text.includes('clip'))
          ) {
            return true;
          }
        }
        return false;
      };

      // Parses `text` and rejects amounts under the coupon threshold.
      const acceptPrice = (text: string): ParsedPrice | null => {
        const parsed = parsePrice(text);
        return parsed && parsed.price >= minPrice ? parsed : null;
      };

      // Primary price display areas, in priority order.
      const primaryPriceContainers = [
        '#corePrice_feature_div',
        '#corePriceDisplay_desktop_feature_div',
        '#apex_desktop_newAccordionRow',
        '#apex_offerDisplay_desktop',
      ];

      let price: ParsedPrice | null = null;

      for (const containerId of primaryPriceContainers) {
        const container = $(containerId);
        if (!container.length) continue;

        for (const node of container.find('.a-price .a-offscreen').toArray()) {
          const el = $(node);
          if (isInCouponContainer(el)) continue;
          if (/savings|coupon|save/i.test(el.parent().attr('class') || '')) continue;

          price = acceptPrice(el.text().trim());
          if (price) break;
        }

        if (price) break;
      }

      // Fallback: other known price selectors.
      if (!price) {
        const fallbackSelectors = [
          '#priceblock_dealprice',
          '#priceblock_saleprice',
          '#priceblock_ourprice',
          '#price_inside_buybox',
          '#newBuyBoxPrice',
          'span[data-a-color="price"] .a-offscreen',
        ];

        for (const selector of fallbackSelectors) {
          const el = $(selector).first();
          if (!el.length || isInCouponContainer(el)) continue;
          price = acceptPrice(el.text().trim());
          if (price) break;
        }
      }

      // Last resort: rebuild the price from the separate whole/fraction parts.
      // Only digits are kept, so every thousands separator and any trailing
      // decimal point inside the "whole" element is dropped before joining.
      if (!price) {
        const wholeDigits = $('#corePrice_feature_div .a-price-whole')
          .first()
          .text()
          .replace(/[^\d]/g, '');
        const fractionDigits = $('#corePrice_feature_div .a-price-fraction')
          .first()
          .text()
          .replace(/[^\d]/g, '');
        if (wholeDigits) {
          price = acceptPrice(`$${wholeDigits}${fractionDigits ? '.' + fractionDigits : ''}`);
        }
      }

      const name =
        $('#productTitle').text().trim() ||
        $('h1.a-size-large').text().trim() ||
        null;

      const imageUrl =
        $('#landingImage').attr('src') ||
        $('#imgBlkFront').attr('src') ||
        $('img[data-a-dynamic-image]').attr('src') ||
        null;

      // Stock status detection
      let stockStatus: StockStatus = 'unknown';
      const availabilityText = $('#availability').text().toLowerCase();
      const hasOutOfStockDiv = $('#outOfStock').length > 0;
      const hasAddToCart = $('#add-to-cart-button').length > 0;
      const hasBuyNow = $('#buy-now-button').length > 0;

      const looksUnavailable =
        hasOutOfStockDiv ||
        availabilityText.includes('currently unavailable') ||
        availabilityText.includes('out of stock') ||
        availabilityText.includes('not available') ||
        (!hasAddToCart && !hasBuyNow);

      if (looksUnavailable) {
        // Confirm with explicit unavailable messaging before declaring it;
        // otherwise the status stays 'unknown'.
        const bodyText = $('body').text().toLowerCase();
        if (
          hasOutOfStockDiv ||
          availabilityText.includes('out of stock') ||
          bodyText.includes('currently unavailable') ||
          bodyText.includes("we don't know when or if this item will be back in stock")
        ) {
          stockStatus = 'out_of_stock';
        }
      } else if (
        availabilityText.includes('in stock') ||
        // Word-boundary match so "unavailable" is not read as "available".
        /\bavailable\b/.test(availabilityText) ||
        hasAddToCart
      ) {
        stockStatus = 'in_stock';
      }

      return { name, price, imageUrl, stockStatus };
    },
  },

  // Walmart
  {
    match: (url) => urlHostMatches(url, /(^|\.)walmart\.com(?![a-z0-9-])/i),
    scrape: ($) => {
      let price: ParsedPrice | null = null;
      let name: string | null = null;
      let imageUrl: string | null = null;
      let stockStatus: StockStatus = 'unknown';

      // Walmart embeds product data in a __NEXT_DATA__ script tag.
      try {
        const nextDataScript = $('#__NEXT_DATA__').html();
        if (nextDataScript) {
          const nextData = JSON.parse(nextDataScript);
          const productData =
            nextData?.props?.pageProps?.initialData?.data?.product ||
            nextData?.props?.pageProps?.initialProps?.data?.product;

          if (productData) {
            const priceInfo =
              productData.priceInfo?.currentPrice ||
              productData.priceInfo?.priceRange?.minPrice;
            if (priceInfo) {
              const amount =
                typeof priceInfo.price === 'number' ? priceInfo.price : parseFloat(priceInfo.price);
              // Ignore unparsable or non-positive amounts so the HTML
              // fallbacks below still get a chance.
              if (Number.isFinite(amount) && amount > 0) {
                price = { price: amount, currency: priceInfo.currencyCode || 'USD' };
              }
            }

            name = productData.name || null;

            imageUrl =
              productData.imageInfo?.thumbnailUrl ||
              productData.imageInfo?.allImages?.[0]?.url ||
              null;

            const availability =
              productData.availabilityStatus || productData.fulfillment?.availabilityStatus;
            if (typeof availability === 'string') {
              const availLower = availability.toLowerCase();
              if (availLower === 'in_stock' || availLower === 'available') {
                stockStatus = 'in_stock';
              } else if (availLower === 'out_of_stock' || availLower === 'not_available') {
                stockStatus = 'out_of_stock';
              }
            }
          }
        }
      } catch (_e) {
        // Embedded JSON missing or malformed; fall back to HTML scraping.
      }

      // Fallback: HTML selectors. A `content` attribute wins over text.
      if (!price) {
        const priceSelectors = [
          '[itemprop="price"]',
          '[data-testid="price-wrap"] span[class*="price"]',
          '.price-characteristic',
          '[data-automation="product-price"]',
          'span[data-automation-id="product-price"]',
        ];

        for (const selector of priceSelectors) {
          const el = $(selector).first();
          if (!el.length) continue;
          price = parsePrice(el.attr('content') || el.text().trim());
          if (price) break;
        }
      }

      if (!name) {
        name =
          $('h1[itemprop="name"]').text().trim() ||
          $('h1#main-title').text().trim() ||
          $('[data-testid="product-title"]').text().trim() ||
          null;
      }

      if (!imageUrl) {
        imageUrl =
          $('[data-testid="hero-image-container"] img').attr('src') ||
          $('img[data-testid="hero-image"]').attr('src') ||
          $('meta[property="og:image"]').attr('content') ||
          null;
      }

      // Fallback stock status from HTML if the embedded data had none.
      if (stockStatus === 'unknown') {
        const hasAddToCart =
          $('[data-testid="add-to-cart-button"]').length > 0 ||
          $('button[aria-label*="Add to cart"]').length > 0;
        const hasOutOfStockMessage = $('[data-testid="out-of-stock-message"]').length > 0;
        const bodyText = $('body').text().toLowerCase();

        if (hasAddToCart) {
          stockStatus = 'in_stock';
        } else if (hasOutOfStockMessage || bodyText.includes('out of stock')) {
          // Only mark out of stock when the wording refers to this product.
          if (
            bodyText.includes('this item is currently out of stock') ||
            bodyText.includes('this product is currently unavailable') ||
            hasOutOfStockMessage
          ) {
            stockStatus = 'out_of_stock';
          }
        }
      }

      return { name, price, imageUrl, stockStatus };
    },
  },

  // Best Buy
  {
    match: (url) => urlHostMatches(url, /(^|\.)bestbuy\.com(?![a-z0-9-])/i),
    scrape: ($) => {
      const price = firstParsedPrice($, [
        '[data-testid="customer-price"] span',
        '.priceView-customer-price span',
        '.priceView-hero-price span',
        '[class*="customerPrice"]',
      ]);

      const name =
        $('h1.heading-5').text().trim() ||
        $('.sku-title h1').text().trim() ||
        null;

      const imageUrl =
        $('img.primary-image').attr('src') ||
        $('[data-testid="image-gallery-image"]').attr('src') ||
        null;

      return { name, price, imageUrl };
    },
  },

  // Target
  {
    match: (url) => urlHostMatches(url, /(^|\.)target\.com(?![a-z0-9-])/i),
    scrape: ($) => {
      const price = firstParsedPrice($, [
        '[data-test="product-price"]',
        '[data-test="current-price"]',
        '.styles__CurrentPriceFontSize-sc-1qc6t3e-1',
      ]);

      const name =
        $('[data-test="product-title"]').text().trim() ||
        $('h1[class*="Heading"]').text().trim() ||
        null;

      const imageUrl = $('[data-test="image-gallery-item-0"] img').attr('src') || null;

      return { name, price, imageUrl };
    },
  },

  // eBay
  {
    match: (url) =>
      urlHostMatches(url, /(^|\.)ebay\.(com|co\.uk|de|fr|ca|com\.au)(?![a-z0-9-])/i),
    scrape: ($) => {
      const price = firstParsedPrice($, [
        '[data-testid="x-price-primary"] .ux-textspans',
        '.x-price-primary .ux-textspans',
        '#prcIsum',
        '#mm-saleDscPrc',
        '.vi-price .notranslate',
      ]);

      const name =
        $('h1.x-item-title__mainTitle span').text().trim() ||
        $('h1[itemprop="name"]').text().trim() ||
        null;

      const imageUrl =
        $('[data-testid="ux-image-carousel"] img').attr('src') ||
        $('#icImg').attr('src') ||
        null;

      return { name, price, imageUrl };
    },
  },

  // Newegg
  {
    match: (url) => urlHostMatches(url, /(^|\.)newegg\.com(?![a-z0-9-])/i),
    scrape: ($) => {
      // HTML-fallback prices at or below this are rejected as probable
      // savings/discount amounts. Genuine products at or under this amount
      // are therefore left to the JSON-LD and itemprop paths.
      const minFallbackPrice = 50;

      // True when `el` sits inside a combo/savings/bundle/discount container
      // or its parent's text reads like a "you save" line.
      const isInSavingsContainer = (el: ReturnType<typeof $>) => {
        for (const parent of el.parents().toArray()) {
          const wrapped = $(parent);
          const classAndId = (wrapped.attr('class') || '') + (wrapped.attr('id') || '');
          if (/combo|save|saving|deal|bundle|discount/i.test(classAndId)) {
            return true;
          }
        }
        const parentText = el.parent().text().toLowerCase();
        return parentText.includes('you save') || parentText.includes('save $');
      };

      // Parses JSON, returning null instead of throwing on malformed input.
      const parseJson = (raw: string) => {
        try {
          return JSON.parse(raw);
        } catch (_e) {
          return null;
        }
      };

      let price: ParsedPrice | null = null;

      // First, JSON-LD data. Each script is parsed independently so one
      // malformed block does not hide a valid Product in a later one.
      for (const script of $('script[type="application/ld+json"]').toArray()) {
        const jsonLd = $(script).html();
        if (!jsonLd) continue;

        const data = parseJson(jsonLd);
        if (data == null) continue;

        const items = Array.isArray(data) ? data : [data];
        for (const item of items) {
          if (!item || item['@type'] !== 'Product' || !item.offers) continue;
          const offer = Array.isArray(item.offers) ? item.offers[0] : item.offers;
          if (!offer?.price) continue;

          const amount = parseFloat(String(offer.price));
          if (Number.isFinite(amount) && amount > 0) {
            price = { price: amount, currency: offer.priceCurrency || 'USD' };
            break;
          }
        }

        if (price) break;
      }

      // Fallback: HTML selectors, avoiding savings amounts.
      if (!price) {
        const mainPriceContainers = [
          '.product-buy-box .price-current',
          '.price-main-product .price-current',
          '.product-price .price-current',
          '#app .price-current', // Main app container
        ];

        for (const selector of mainPriceContainers) {
          for (const node of $(selector).toArray()) {
            const $el = $(node);
            if (isInSavingsContainer($el)) continue;

            const dollars = $el.find('strong').text().trim();
            let parsed: ParsedPrice | null;
            if (dollars) {
              // Dollars come from <strong>, cents from <sup>; keep only the
              // digits of <sup> so a leading "." is not doubled.
              const cleanDollars = dollars.replace(/[^0-9,]/g, '');
              if (!cleanDollars) continue;
              const cents = $el.find('sup').text().replace(/[^0-9]/g, '');
              parsed = parsePrice(`$${cleanDollars}${cents ? '.' + cents : ''}`);
            } else {
              // No <strong>: parse the element text as-is. Stripping
              // non-digits here would fuse "$675.59" into "67559".
              parsed = parsePrice($el.text().trim());
            }

            if (parsed && parsed.price > minFallbackPrice) {
              price = parsed;
              break;
            }
          }

          if (price) break;
        }
      }

      // Last resort: itemprop price
      if (!price) {
        const content = $('[itemprop="price"]').first().attr('content');
        if (content) {
          price = parsePrice(content);
        }
      }

      const name =
        $('h1.product-title').text().trim() ||
        $('.product-title').text().trim() ||
        $('[itemprop="name"]').text().trim() ||
        null;

      const imageUrl =
        $('img.product-view-img-original').attr('src') ||
        $('.product-view-img-original').attr('src') ||
        $('[itemprop="image"]').attr('content') ||
        null;

      // Stock status detection
      let stockStatus: StockStatus = 'unknown';
      const buyButton = $('.btn-primary.btn-wide').text().toLowerCase();
      const soldOutBanner = $('.product-inventory').text().toLowerCase();
      const outOfStockText = $('.product-flag-text').text().toLowerCase();

      if (
        soldOutBanner.includes('out of stock') ||
        soldOutBanner.includes('sold out') ||
        outOfStockText.includes('out of stock') ||
        $('.product-buy-box .btn-message-error').length > 0
      ) {
        stockStatus = 'out_of_stock';
      } else if (
        buyButton.includes('add to cart') ||
        buyButton.includes('buy now') ||
        $('.product-buy-box .btn-primary').length > 0
      ) {
        stockStatus = 'in_stock';
      }

      return { name, price, imageUrl, stockStatus };
    },
  },

  // Home Depot
  {
    match: (url) => urlHostMatches(url, /(^|\.)homedepot\.com(?![a-z0-9-])/i),
    scrape: ($) => {
      const price = firstParsedPrice($, [
        '[data-testid="price-format"] span',
        '.price-format__main-price span',
        '#ajaxPrice',
      ]);

      const name =
        $('h1.product-title__title').text().trim() ||
        $('h1[class*="product-details"]').text().trim() ||
        null;

      const imageUrl = $('img[data-testid="media-gallery-image"]').attr('src') || null;

      return { name, price, imageUrl };
    },
  },

  // Costco
  {
    match: (url) => urlHostMatches(url, /(^|\.)costco\.com(?![a-z0-9-])/i),
    scrape: ($) => {
      const price =
        parsePrice($('[automation-id="productPriceOutput"]').text().trim()) ||
        parsePrice($('.price').first().text().trim());

      const name =
        $('h1[itemprop="name"]').text().trim() ||
        $('h1.product-title').text().trim() ||
        null;

      const imageUrl = $('img.product-image').attr('src') || null;

      return { name, price, imageUrl };
    },
  },

  // AliExpress
  {
    match: (url) => urlHostMatches(url, /(^|\.)aliexpress\.com(?![a-z0-9-])/i),
    scrape: ($) => {
      const price = firstParsedPrice($, [
        '.product-price-value',
        '[class*="uniformBannerBoxPrice"]',
        '.snow-price_SnowPrice__mainS__1occeh',
      ]);

      const name =
        $('h1[data-pl="product-title"]').text().trim() ||
        $('h1.product-title-text').text().trim() ||
        null;

      const imageUrl = $('img.magnifier-image').attr('src') || null;

      return { name, price, imageUrl };
    },
  },
];
|
|
|
|
// Generic selectors as fallback
|
|
/**
 * Price selectors for sites without a dedicated scraper, in priority order:
 * structured-data attributes first, then conventional class/id names, then a
 * case-insensitive "class contains price" catch-all.
 *
 * The trailing `i` flag already makes the catch-all match "Price", "PRICE",
 * etc., so no separate capitalised variant is listed.
 */
const genericPriceSelectors = [
  '[itemprop="price"]',
  '[data-price]',
  '[data-product-price]',
  '.price',
  '.product-price',
  '.current-price',
  '.sale-price',
  '.final-price',
  '.offer-price',
  '#price',
  '[class*="price" i]',
];
|
|
|
|
/**
 * Product-name selectors for sites without a dedicated scraper, tried in
 * order from most to least specific; a bare `h1` is the final fallback.
 */
const genericNameSelectors: string[] = [
  // Structured data
  '[itemprop="name"]',
  // Headings whose class hints at a product title
  'h1[class*="product"]',
  'h1[class*="title"]',
  // Conventional class names
  '.product-title',
  '.product-name',
  // Any top-level heading
  'h1',
];
|
|
|
|
/**
 * Product-image selectors for sites without a dedicated scraper, tried in
 * order: structured data and Open Graph first, then common gallery markup.
 */
const genericImageSelectors: string[] = [
  // Structured data / social metadata
  '[itemprop="image"]',
  '[property="og:image"]',
  // Common gallery containers
  '.product-image img',
  '.main-image img',
  // Zoom widgets and product-classed images
  '[data-zoom-image]',
  'img[class*="product"]',
];
|
|
|
|
/** True when `candidate` carries a finite numeric price (rejects NaN from failed parses). */
function isUsablePrice(candidate: ParsedPrice | null | undefined): candidate is ParsedPrice {
  return candidate != null && Number.isFinite(candidate.price);
}

/**
 * Downloads the HTML for `url` with a browser-like header set. When the
 * server answers 403 the page is fetched again through the headless browser.
 *
 * @returns The HTML and whether the headless browser had to be used.
 * @throws Any request error other than a 403 response, and any error raised
 *   by the browser fallback.
 */
async function fetchPageHtml(url: string): Promise<{ html: string; usedBrowser: boolean }> {
  try {
    const response = await axios.get<string>(url, {
      headers: {
        'User-Agent':
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        Accept:
          'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cache-Control': 'no-cache',
        Pragma: 'no-cache',
        'Sec-Ch-Ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
      },
      timeout: 20000,
      maxRedirects: 5,
    });
    return { html: response.data, usedBrowser: false };
  } catch (axiosError) {
    // A 403 (Forbidden) usually means the plain request was blocked.
    if (axiosError instanceof AxiosError && axiosError.response?.status === 403) {
      console.log(`HTTP request blocked (403) for ${url}, falling back to browser scraping...`);
      return { html: await scrapeWithBrowser(url), usedBrowser: true };
    }
    throw axiosError;
  }
}

/**
 * Scrapes name, price, image and stock status for the product at `url`.
 *
 * Sources are consulted in order, each filling only fields still missing:
 * the matching site-specific scraper, JSON-LD structured data, generic
 * selectors, Open Graph meta tags, and finally AI extraction. The AI step
 * runs only when no price was found and `userId` is provided.
 *
 * Errors are logged, not thrown: on failure the result simply keeps `null` /
 * `'unknown'` for whatever could not be determined. A failing site-specific
 * scraper is logged and skipped so the remaining sources still run, and
 * prices whose numeric value is not finite are treated as not found.
 *
 * @param url Absolute URL of the product page.
 * @param userId Optional user id passed to the AI extractor.
 * @returns The scraped product; `url` is always echoed back.
 */
export async function scrapeProduct(url: string, userId?: number): Promise<ScrapedProduct> {
  const result: ScrapedProduct = {
    name: null,
    price: null,
    imageUrl: null,
    url,
    stockStatus: 'unknown',
  };

  try {
    const { html, usedBrowser } = await fetchPageHtml(url);
    const $ = load(html);

    if (usedBrowser) {
      console.log(`Successfully scraped ${url} using headless browser`);
    }

    // Try site-specific scraper first
    const siteScraper = siteScrapers.find((s) => s.match(url));
    if (siteScraper) {
      try {
        const siteResult = siteScraper.scrape($, url);
        if (siteResult.name) result.name = siteResult.name;
        if (isUsablePrice(siteResult.price)) result.price = siteResult.price;
        if (siteResult.imageUrl) result.imageUrl = siteResult.imageUrl;
        if (siteResult.stockStatus) result.stockStatus = siteResult.stockStatus;
      } catch (siteError) {
        // Keep going: the generic sources below may still succeed.
        console.error(`Site-specific scraper failed for ${url}:`, siteError);
      }
    }

    // Try JSON-LD structured data for anything still missing
    if (!result.price || !result.name || !result.imageUrl || result.stockStatus === 'unknown') {
      const jsonLdData = extractJsonLd($);
      if (jsonLdData) {
        if (!result.name && jsonLdData.name) result.name = jsonLdData.name;
        if (!result.price && isUsablePrice(jsonLdData.price)) result.price = jsonLdData.price;
        if (!result.imageUrl && jsonLdData.image) result.imageUrl = jsonLdData.image;
        if (result.stockStatus === 'unknown' && jsonLdData.stockStatus) {
          result.stockStatus = jsonLdData.stockStatus;
        }
      }
    }

    // Fallback to generic scraping
    if (!result.name) {
      result.name = extractGenericName($);
    }

    if (!result.price) {
      const genericPrice = extractGenericPrice($);
      if (isUsablePrice(genericPrice)) result.price = genericPrice;
    }

    if (!result.imageUrl) {
      result.imageUrl = extractGenericImage($, url);
    }

    if (result.stockStatus === 'unknown') {
      result.stockStatus = extractGenericStockStatus($);
    }

    // Try Open Graph meta tags as last resort
    if (!result.name) {
      result.name = $('meta[property="og:title"]').attr('content') || null;
    }
    if (!result.imageUrl) {
      result.imageUrl = $('meta[property="og:image"]').attr('content') || null;
    }

    // If we still don't have a price and userId is provided, try AI extraction
    if (!result.price && userId && html) {
      try {
        const { tryAIExtraction } = await import('./ai-extractor');
        const aiResult = await tryAIExtraction(url, html, userId);

        if (aiResult && aiResult.price && aiResult.confidence > 0.5) {
          console.log(`[AI] Successfully extracted price for ${url}: ${aiResult.price.price} (confidence: ${aiResult.confidence})`);
          result.price = aiResult.price;
          if (!result.name && aiResult.name) result.name = aiResult.name;
          if (!result.imageUrl && aiResult.imageUrl) result.imageUrl = aiResult.imageUrl;
          if (result.stockStatus === 'unknown' && aiResult.stockStatus !== 'unknown') {
            result.stockStatus = aiResult.stockStatus;
          }
        }
      } catch (aiError) {
        console.error(`[AI] Extraction failed for ${url}:`, aiError);
      }
    }
  } catch (error) {
    console.error(`Error scraping ${url}:`, error);
  }

  return result;
}
|
|
|
|
/**
 * The subset of a JSON-LD node this module reads. The same shape is used for
 * Product nodes and for wrapper nodes that only carry an `@graph` list.
 */
interface JsonLdProduct {
  /**
   * Node type. JSON-LD allows a single type or a list of types
   * (e.g. `["Product", "Thing"]`).
   */
  '@type'?: string | string[];
  /** Nested nodes when the document uses the `@graph` container form. */
  '@graph'?: JsonLdProduct[];
  /** Product name. */
  name?: string;
  /** Image as a URL, a list of URLs, or an object carrying a `url`. */
  image?: string | string[] | { url?: string };
  /** One offer or a list of offers. */
  offers?: JsonLdOffer | JsonLdOffer[];
}
|
|
|
|
/** Nested price block some sites place under an offer's `priceSpecification`. */
interface JsonLdPriceSpecification {
  /** Amount, as a number or a numeric string. */
  price?: string | number;
  /** Currency code accompanying `price`. */
  priceCurrency?: string;
}
|
|
|
|
/** The subset of a JSON-LD offer node this module reads. */
interface JsonLdOffer {
  /** Node type. */
  '@type'?: string;
  /** Direct price, as a number or a numeric string. */
  price?: string | number;
  /** Currency code for `price` / `lowPrice`. */
  priceCurrency?: string;
  /** Lower bound when the offer describes a price range. */
  lowPrice?: string | number;
  /** Nested price format used by some sites. */
  priceSpecification?: JsonLdPriceSpecification;
  /** Availability value; read as a string and matched case-insensitively. */
  availability?: string;
}
|
|
|
|
/**
 * Converts a JSON-LD price value to a positive finite number.
 *
 * Strings are cleaned of currency symbols, and the common grouped formats are
 * normalised before parsing: "1,299.00" and "1.299,00" both become 1299, and
 * "12,99" becomes 12.99. Returns `null` for missing, unparsable, or
 * non-positive values.
 */
function parseJsonLdPriceValue(value: string | number | undefined): number | null {
  if (value == null) return null;

  let amount: number;
  if (typeof value === 'number') {
    amount = value;
  } else {
    let text = String(value).trim().replace(/[^\d.,-]/g, '');
    if (/^\d{1,3}(,\d{3})+(\.\d+)?$/.test(text)) {
      // Comma thousands separators with an optional dot decimal.
      text = text.replace(/,/g, '');
    } else if (/^\d{1,3}(\.\d{3})+,\d{1,2}$/.test(text)) {
      // Dot thousands separators with a comma decimal.
      text = text.replace(/\./g, '').replace(',', '.');
    } else if (/^\d+,\d{1,2}$/.test(text)) {
      // Comma decimal without grouping.
      text = text.replace(',', '.');
    }
    amount = parseFloat(text);
  }

  return Number.isFinite(amount) && amount > 0 ? amount : null;
}

/**
 * Reads product data from the page's `application/ld+json` scripts.
 *
 * Scripts are examined in document order and each is parsed independently, so
 * a malformed script is skipped rather than ending the search. The first
 * script in which `findProduct` locates a Product node determines the result.
 *
 * Price is taken from the first offer, preferring `lowPrice`, then `price`,
 * then `priceSpecification.price`; a value that cannot be read as a positive
 * number is omitted. Currency defaults to `'USD'`.
 *
 * @returns The fields found on the Product node (any may be absent), or
 *   `null` when no script contains a Product.
 */
function extractJsonLd(
  $: CheerioAPI
): { name?: string; price?: ParsedPrice; image?: string; stockStatus?: StockStatus } | null {
  const scripts = $('script[type="application/ld+json"]');

  for (let i = 0; i < scripts.length; i++) {
    const content = $(scripts[i]).html();
    if (!content) continue;

    let product: JsonLdProduct | null;
    try {
      product = findProduct(JSON.parse(content) as JsonLdProduct | JsonLdProduct[]);
    } catch (_e) {
      // Malformed JSON in this script; try the next one.
      continue;
    }
    if (!product) continue;

    const result: { name?: string; price?: ParsedPrice; image?: string; stockStatus?: StockStatus } = {};

    if (product.name) {
      result.name = product.name;
    }

    // An empty offers array yields `undefined` here and is simply skipped.
    const offer = Array.isArray(product.offers) ? product.offers[0] : product.offers;
    if (offer) {
      const amount =
        parseJsonLdPriceValue(offer.lowPrice) ??
        parseJsonLdPriceValue(offer.price) ??
        parseJsonLdPriceValue(offer.priceSpecification?.price);

      if (amount !== null) {
        result.price = {
          price: amount,
          currency: offer.priceCurrency || offer.priceSpecification?.priceCurrency || 'USD',
        };
      }

      if (typeof offer.availability === 'string') {
        const avail = offer.availability.toLowerCase();
        if (avail.includes('instock') || avail.includes('in_stock')) {
          result.stockStatus = 'in_stock';
        } else if (
          avail.includes('outofstock') ||
          avail.includes('out_of_stock') ||
          avail.includes('soldout') ||
          avail.includes('sold_out')
        ) {
          result.stockStatus = 'out_of_stock';
        }
      }
    }

    if (product.image) {
      // For a list, use its first entry; entries may be URLs or objects
      // carrying a `url`.
      const candidate: unknown = Array.isArray(product.image) ? product.image[0] : product.image;
      if (typeof candidate === 'string') {
        if (candidate) result.image = candidate;
      } else if (candidate && typeof candidate === 'object') {
        const nestedUrl = (candidate as { url?: unknown }).url;
        if (typeof nestedUrl === 'string' && nestedUrl) {
          result.image = nestedUrl;
        }
      }
    }

    return result;
  }

  return null;
}
|
|
|
|
/**
 * Depth-first search for the first JSON-LD node typed as `Product`.
 *
 * Accepts a single node or an array of nodes, and descends into `@graph`
 * lists. A node counts as a Product when its `@type` is the string
 * `'Product'` or a list that contains it. Entries that are not objects
 * (null, strings, numbers) are ignored.
 *
 * @param data Parsed JSON-LD content.
 * @returns The matching node itself (not a copy), or `null` when none exists.
 */
function findProduct(data: JsonLdProduct | JsonLdProduct[]): JsonLdProduct | null {
  if (!data || typeof data !== 'object') return null;

  if (Array.isArray(data)) {
    for (const item of data) {
      const found = findProduct(item);
      if (found) return found;
    }
    return null;
  }

  // Read as unknown so the check works for both a single type and a list.
  const declaredType: unknown = data['@type'];
  const isProduct =
    declaredType === 'Product' ||
    (Array.isArray(declaredType) && declaredType.includes('Product'));
  if (isProduct) {
    return data;
  }

  const graph = data['@graph'];
  if (Array.isArray(graph)) {
    for (const item of graph) {
      const found = findProduct(item);
      if (found) return found;
    }
  }

  return null;
}
|
|
|
|
/**
 * True when a class string marks a superseded ("original" / "was" / "old")
 * price rather than the current one.
 *
 * Long, distinctive markers are matched anywhere in the string. The short
 * markers `was` and `old` must start at a word boundary (camelCase counts as
 * a boundary) and be followed by a boundary or by "price", so classes such as
 * "bold", "gold" or "wasabi" are not mistaken for old-price markers.
 */
function hasSupersededPriceMarker(classNames: string): boolean {
  if (/original|regular|compare|strikethrough|line-through/i.test(classNames)) {
    return true;
  }
  const normalized = classNames.replace(/([a-z0-9])([A-Z])/g, '$1 $2').toLowerCase();
  return /(^|[^a-z])(was|old)(?=price|[^a-z]|$)/.test(normalized);
}

/**
 * Extracts a price using the generic selector list.
 *
 * Selectors are tried in order; the first selector that yields at least one
 * positive price ends the search, and its candidates are handed to
 * `findMostLikelyPrice`. Elements whose own or parent class marks a
 * superseded price are skipped. For each element the `content` attribute is
 * preferred, then `data-price`, then the element text.
 */
function extractGenericPrice($: CheerioAPI): ParsedPrice | null {
  const prices: ParsedPrice[] = [];

  for (const selector of genericPriceSelectors) {
    $(selector).each((_, el) => {
      const $el = $(el);

      // Join with a space so the two class strings cannot fuse into a
      // different word at the seam.
      const classNames = `${$el.attr('class') || ''} ${$el.parent().attr('class') || ''}`;
      if (hasSupersededPriceMarker(classNames)) {
        return;
      }

      const text = $el.attr('content') || $el.attr('data-price') || $el.text();
      const parsed = parsePrice(text);
      if (parsed && parsed.price > 0) {
        prices.push(parsed);
      }
    });

    if (prices.length > 0) break;
  }

  return findMostLikelyPrice(prices);
}
|
|
|
|
/**
 * Finds a product name using the generic selector list.
 *
 * For each selector only the first matching element is considered; its
 * trimmed text is returned when it is non-empty and shorter than 500
 * characters. Returns `null` when no selector produces such text.
 */
function extractGenericName($: CheerioAPI): string | null {
  for (const selector of genericNameSelectors) {
    const match = $(selector).first();
    if (match.length === 0) continue;

    const candidate = match.text().trim();
    if (candidate === '' || candidate.length >= 500) continue;

    return candidate;
  }

  return null;
}
|
|
|
|
/**
 * Finds a product image URL using the generic selector list.
 *
 * For each selector the first matching element is inspected and the first
 * non-empty attribute among `src`, `content`, `data-zoom-image`, `data-src`
 * and `href` is used. `href` covers `<link itemprop="image" href="…">`
 * markup, which carries the URL in none of the other attributes.
 *
 * @param baseUrl Page URL used to resolve relative image paths.
 * @returns An absolute URL when resolution succeeds, the raw attribute value
 *   when it does not, or `null` when no selector yields a value.
 */
function extractGenericImage($: CheerioAPI, baseUrl: string): string | null {
  for (const selector of genericImageSelectors) {
    const element = $(selector).first();
    if (!element.length) continue;

    const src =
      element.attr('src') ||
      element.attr('content') ||
      element.attr('data-zoom-image') ||
      element.attr('data-src') ||
      element.attr('href');
    if (!src) continue;

    try {
      return new URL(src, baseUrl).href;
    } catch (_e) {
      // Could not resolve against baseUrl; return the value unchanged.
      return src;
    }
  }

  return null;
}
|
|
|
|
/**
 * Best-effort stock detection for sites without a dedicated scraper.
 *
 * Checks, in order:
 * 1. the `itemprop="availability"` value (`content`, else `href`);
 * 2. the presence of an add-to-cart control;
 * 3. explicit out-of-stock / sold-out badges;
 * 4. specific out-of-stock phrases in the main content area (or, when no
 *    main area is found, the first 5000 characters of the body text).
 *
 * Negative availability values are tested before positive ones, so
 * "Unavailable" or "Not available" is not read as "available".
 *
 * @returns `'unknown'` when none of the checks is conclusive.
 */
function extractGenericStockStatus($: CheerioAPI): StockStatus {
  // Structured availability is the most reliable signal.
  const availabilityNode = $('[itemprop="availability"]');
  const availability = (
    availabilityNode.attr('content') ||
    availabilityNode.attr('href') ||
    ''
  ).toLowerCase();

  if (/outofstock|out of stock|soldout|sold out|discontinued|unavailable|not available/.test(availability)) {
    return 'out_of_stock';
  }
  if (availability.includes('instock') || availability.includes('available')) {
    return 'in_stock';
  }

  // An add-to-cart control is a strong in-stock indicator.
  const hasAddToCart =
    $('button[class*="add-to-cart" i]').length > 0 ||
    $('button[id*="add-to-cart" i]').length > 0 ||
    $('[data-testid*="add-to-cart" i]').length > 0 ||
    $('button:contains("Add to Cart")').length > 0 ||
    $('input[value*="Add to Cart" i]').length > 0;

  if (hasAddToCart) {
    return 'in_stock';
  }

  // Explicit out-of-stock elements.
  const hasOutOfStockBadge =
    $('[class*="out-of-stock" i]').length > 0 ||
    $('[class*="sold-out" i]').length > 0 ||
    $('[data-testid*="out-of-stock" i]').length > 0;

  if (hasOutOfStockBadge) {
    return 'out_of_stock';
  }

  // Be conservative: prefer the main product area over the whole body to
  // avoid false positives from sidebar recommendations, etc.
  const mainContent = $('main, [role="main"], #main, .main-content, .product-detail, .pdp-main')
    .text()
    .toLowerCase();
  const textToCheck = mainContent || $('body').text().toLowerCase().slice(0, 5000);

  // Only full, product-specific phrases count.
  const strongOutOfStockPhrases = [
    'this item is out of stock',
    'this product is out of stock',
    'currently out of stock',
    'this item is currently unavailable',
    'this product is currently unavailable',
    'temporarily out of stock',
    'this item is sold out',
  ];

  if (strongOutOfStockPhrases.some((phrase) => textToCheck.includes(phrase))) {
    return 'out_of_stock';
  }

  // Default to unknown rather than guessing.
  return 'unknown';
}
|
|
|
|
/**
 * Convenience wrapper around `scrapeProduct` that returns only the price.
 *
 * @param url Absolute URL of the product page.
 * @returns The scraped price, or `null` when none was found.
 */
export async function scrapePrice(url: string): Promise<ParsedPrice | null> {
  const { price } = await scrapeProduct(url);
  return price;
}
|