diff --git a/backend/src/services/scraper.ts b/backend/src/services/scraper.ts index 7ae591a..1c0fa2e 100644 --- a/backend/src/services/scraper.ts +++ b/backend/src/services/scraper.ts @@ -13,14 +13,290 @@ export interface ScrapedProduct { url: string; } -// Common price selectors used across e-commerce sites -const priceSelectors = [ - // Schema.org +// Site-specific scraper configurations +interface SiteScraper { + match: (url: string) => boolean; + scrape: ($: CheerioAPI, url: string) => Partial; +} + +const siteScrapers: SiteScraper[] = [ + // Amazon + { + match: (url) => /amazon\.(com|co\.uk|ca|de|fr|es|it|co\.jp|in|com\.au)/i.test(url), + scrape: ($) => { + // Price selectors in order of preference (sale price first) + const priceSelectors = [ + '#corePrice_feature_div .a-price .a-offscreen', + '#corePriceDisplay_desktop_feature_div .a-price .a-offscreen', + '#priceblock_dealprice', + '#priceblock_saleprice', + '#priceblock_ourprice', + '.a-price .a-offscreen', + '#price_inside_buybox', + '#newBuyBoxPrice', + 'span[data-a-color="price"] .a-offscreen', + ]; + + let price: ParsedPrice | null = null; + for (const selector of priceSelectors) { + const el = $(selector).first(); + if (el.length) { + const text = el.text().trim(); + price = parsePrice(text); + if (price) break; + } + } + + // Product name + const name = $('#productTitle').text().trim() || + $('h1.a-size-large').text().trim() || + null; + + // Image + const imageUrl = $('#landingImage').attr('src') || + $('#imgBlkFront').attr('src') || + $('img[data-a-dynamic-image]').attr('src') || + null; + + return { name, price, imageUrl }; + }, + }, + + // Walmart + { + match: (url) => /walmart\.com/i.test(url), + scrape: ($) => { + // Walmart uses various price containers + const priceSelectors = [ + '[data-testid="price-wrap"] [itemprop="price"]', + '[itemprop="price"]', + '.price-characteristic', + '[data-automation="product-price"]', + '.prod-PriceHero .price-group', + ]; + + let price: ParsedPrice | null = null; + for (const selector of priceSelectors) { + const el = $(selector).first(); + if (el.length) { + const content = el.attr('content'); + const text = content || el.text().trim(); + price = parsePrice(text); + if (price) break; + } + } + + // Also try to get price from the whole dollars + cents pattern + if (!price) { + const dollars = $('[data-testid="price-wrap"] .f2').text().trim(); + const cents = $('[data-testid="price-wrap"] .f6').text().trim(); + if (dollars) { + price = parsePrice(`$${dollars}${cents ? '.' + cents : ''}`); + } + } + + const name = $('h1[itemprop="name"]').text().trim() || + $('h1.prod-ProductTitle').text().trim() || + null; + + const imageUrl = $('[data-testid="hero-image-container"] img').attr('src') || + $('img.prod-hero-image').attr('src') || + null; + + return { name, price, imageUrl }; + }, + }, + + // Best Buy + { + match: (url) => /bestbuy\.com/i.test(url), + scrape: ($) => { + const priceSelectors = [ + '[data-testid="customer-price"] span', + '.priceView-customer-price span', + '.priceView-hero-price span', + '[class*="customerPrice"]', + ]; + + let price: ParsedPrice | null = null; + for (const selector of priceSelectors) { + const el = $(selector).first(); + if (el.length) { + price = parsePrice(el.text().trim()); + if (price) break; + } + } + + const name = $('h1.heading-5').text().trim() || + $('.sku-title h1').text().trim() || + null; + + const imageUrl = $('img.primary-image').attr('src') || + $('[data-testid="image-gallery-image"]').attr('src') || + null; + + return { name, price, imageUrl }; + }, + }, + + // Target + { + match: (url) => /target\.com/i.test(url), + scrape: ($) => { + const priceSelectors = [ + '[data-test="product-price"]', + '[data-test="current-price"]', + '.styles__CurrentPriceFontSize-sc-1qc6t3e-1', + ]; + + let price: ParsedPrice | null = null; + for (const selector of priceSelectors) { + const el = $(selector).first(); + if (el.length) { + price = parsePrice(el.text().trim()); + if (price) break; + } + } + + const name = $('[data-test="product-title"]').text().trim() || + $('h1[class*="Heading"]').text().trim() || + null; + + const imageUrl = $('[data-test="image-gallery-item-0"] img').attr('src') || + null; + + return { name, price, imageUrl }; + }, + }, + + // eBay + { + match: (url) => /ebay\.(com|co\.uk|de|fr|ca|com\.au)/i.test(url), + scrape: ($) => { + const priceSelectors = [ + '[data-testid="x-price-primary"] .ux-textspans', + '.x-price-primary .ux-textspans', + '#prcIsum', + '#mm-saleDscPrc', + '.vi-price .notranslate', + ]; + + let price: ParsedPrice | null = null; + for (const selector of priceSelectors) { + const el = $(selector).first(); + if (el.length) { + price = parsePrice(el.text().trim()); + if (price) break; + } + } + + const name = $('h1.x-item-title__mainTitle span').text().trim() || + $('h1[itemprop="name"]').text().trim() || + null; + + const imageUrl = $('[data-testid="ux-image-carousel"] img').attr('src') || + $('#icImg').attr('src') || + null; + + return { name, price, imageUrl }; + }, + }, + + // Newegg + { + match: (url) => /newegg\.com/i.test(url), + scrape: ($) => { + const price = parsePrice($('.price-current').text().trim()) || + parsePrice($('[itemprop="price"]').attr('content') || ''); + + const name = $('h1.product-title').text().trim() || null; + const imageUrl = $('img.product-view-img-original').attr('src') || null; + + return { name, price, imageUrl }; + }, + }, + + // Home Depot + { + match: (url) => /homedepot\.com/i.test(url), + scrape: ($) => { + const priceSelectors = [ + '[data-testid="price-format"] span', + '.price-format__main-price span', + '#ajaxPrice', + ]; + + let price: ParsedPrice | null = null; + for (const selector of priceSelectors) { + const el = $(selector).first(); + if (el.length) { + price = parsePrice(el.text().trim()); + if (price) break; + } + } + + const name = $('h1.product-title__title').text().trim() || + $('h1[class*="product-details"]').text().trim() || + null; + + const imageUrl = $('img[data-testid="media-gallery-image"]').attr('src') || null; + + return { name, price, imageUrl }; + }, + }, + + // Costco + { + match: (url) => /costco\.com/i.test(url), + scrape: ($) => { + const price = parsePrice($('[automation-id="productPriceOutput"]').text().trim()) || + parsePrice($('.price').first().text().trim()); + + const name = $('h1[itemprop="name"]').text().trim() || + $('h1.product-title').text().trim() || + null; + + const imageUrl = $('img.product-image').attr('src') || null; + + return { name, price, imageUrl }; + }, + }, + + // AliExpress + { + match: (url) => /aliexpress\.com/i.test(url), + scrape: ($) => { + const priceSelectors = [ + '.product-price-value', + '[class*="uniformBannerBoxPrice"]', + '.snow-price_SnowPrice__mainS__1occeh', + ]; + + let price: ParsedPrice | null = null; + for (const selector of priceSelectors) { + const el = $(selector).first(); + if (el.length) { + price = parsePrice(el.text().trim()); + if (price) break; + } + } + + const name = $('h1[data-pl="product-title"]').text().trim() || + $('h1.product-title-text').text().trim() || + null; + + const imageUrl = $('img.magnifier-image').attr('src') || null; + + return { name, price, imageUrl }; + }, + }, +]; + +// Generic selectors as fallback +const genericPriceSelectors = [ '[itemprop="price"]', '[data-price]', '[data-product-price]', - - // Common class names '.price', '.product-price', '.current-price', @@ -28,40 +304,22 @@ const priceSelectors = [ '.final-price', '.offer-price', '#price', - '#priceblock_ourprice', - '#priceblock_dealprice', - '#priceblock_saleprice', - - // Amazon specific - '.a-price .a-offscreen', - '.a-price-whole', - '#corePrice_feature_div .a-price .a-offscreen', - '#corePriceDisplay_desktop_feature_div .a-price .a-offscreen', - - // Generic patterns - '[class*="price"]', - '[class*="Price"]', - '[id*="price"]', - '[id*="Price"]', + '[class*="price" i]', + '[class*="Price" i]', ]; -// Selectors for product name -const nameSelectors = [ +const genericNameSelectors = [ '[itemprop="name"]', 'h1[class*="product"]', 'h1[class*="title"]', - '#productTitle', '.product-title', '.product-name', 'h1', ]; -// Selectors for product image -const imageSelectors = [ +const genericImageSelectors = [ '[itemprop="image"]', '[property="og:image"]', - '#landingImage', - '#imgBlkFront', '.product-image img', '.main-image img', '[data-zoom-image]', @@ -80,44 +338,61 @@ export async function scrapeProduct(url: string): Promise { const response = await axios.get(url, { headers: { 'User-Agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', Accept: - 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', + 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', - Connection: 'keep-alive', + 'Cache-Control': 'no-cache', + Pragma: 'no-cache', + 'Sec-Ch-Ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"', + 'Sec-Ch-Ua-Mobile': '?0', + 'Sec-Ch-Ua-Platform': '"Windows"', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Sec-Fetch-User': '?1', 'Upgrade-Insecure-Requests': '1', }, - timeout: 15000, + timeout: 20000, maxRedirects: 5, }); const $ = load(response.data); - // Try to extract from JSON-LD structured data first - const jsonLdData = extractJsonLd($); - if (jsonLdData) { - if (jsonLdData.name) result.name = jsonLdData.name; - if (jsonLdData.price) result.price = jsonLdData.price; - if (jsonLdData.image) result.imageUrl = jsonLdData.image; + // Try site-specific scraper first + const siteScraper = siteScrapers.find((s) => s.match(url)); + if (siteScraper) { + const siteResult = siteScraper.scrape($, url); + if (siteResult.name) result.name = siteResult.name; + if (siteResult.price) result.price = siteResult.price; + if (siteResult.imageUrl) result.imageUrl = siteResult.imageUrl; } - // Extract product name + // Try JSON-LD structured data + if (!result.price || !result.name) { + const jsonLdData = extractJsonLd($); + if (jsonLdData) { + if (!result.name && jsonLdData.name) result.name = jsonLdData.name; + if (!result.price && jsonLdData.price) result.price = jsonLdData.price; + if (!result.imageUrl && jsonLdData.image) result.imageUrl = jsonLdData.image; + } + } + + // Fallback to generic scraping if (!result.name) { - result.name = extractName($); + result.name = extractGenericName($); } - // Extract price if (!result.price) { - result.price = extractPrice($); + result.price = extractGenericPrice($); } - // Extract image if (!result.imageUrl) { - result.imageUrl = extractImage($, url); + result.imageUrl = extractGenericImage($, url); } - // Try Open Graph meta tags as fallback + // Try Open Graph meta tags as last resort if (!result.name) { result.name = $('meta[property="og:title"]').attr('content') || null; } @@ -140,8 +415,10 @@ interface JsonLdProduct { } interface JsonLdOffer { + '@type'?: string; price?: string | number; priceCurrency?: string; + lowPrice?: string | number; } function extractJsonLd( @@ -157,8 +434,7 @@ function extractJsonLd( const product = findProduct(data); if (product) { - const result: { name?: string; price?: ParsedPrice; image?: string } = - {}; + const result: { name?: string; price?: ParsedPrice; image?: string } = {}; if (product.name) { result.name = product.name; @@ -168,9 +444,12 @@ function extractJsonLd( const offer = Array.isArray(product.offers) ? product.offers[0] : product.offers; - if (offer && offer.price) { + + // Get price, preferring lowPrice for ranges + const priceValue = offer.lowPrice || offer.price; + if (priceValue) { result.price = { - price: parseFloat(String(offer.price)), + price: parseFloat(String(priceValue)), currency: offer.priceCurrency || 'USD', }; } @@ -220,16 +499,23 @@ function findProduct(data: JsonLdProduct | JsonLdProduct[]): JsonLdProduct | nul return null; } -function extractPrice($: CheerioAPI): ParsedPrice | null { +function extractGenericPrice($: CheerioAPI): ParsedPrice | null { const prices: ParsedPrice[] = []; - for (const selector of priceSelectors) { + for (const selector of genericPriceSelectors) { const elements = $(selector); elements.each((_, el) => { - const text = - $(el).attr('content') || $(el).attr('data-price') || $(el).text(); + const $el = $(el); + // Skip if this looks like an "original" or "was" price + const classAttr = $el.attr('class') || ''; + const parentClass = $el.parent().attr('class') || ''; + if (/original|was|old|regular|compare|strikethrough|line-through/i.test(classAttr + parentClass)) { + return; + } + + const text = $el.attr('content') || $el.attr('data-price') || $el.text(); const parsed = parsePrice(text); - if (parsed) { + if (parsed && parsed.price > 0) { prices.push(parsed); } }); @@ -240,8 +526,8 @@ function extractPrice($: CheerioAPI): ParsedPrice | null { return findMostLikelyPrice(prices); } -function extractName($: CheerioAPI): string | null { - for (const selector of nameSelectors) { +function extractGenericName($: CheerioAPI): string | null { + for (const selector of genericNameSelectors) { const element = $(selector).first(); if (element.length) { const text = element.text().trim(); @@ -253,8 +539,8 @@ function extractName($: CheerioAPI): string | null { return null; } -function extractImage($: CheerioAPI, baseUrl: string): string | null { - for (const selector of imageSelectors) { +function extractGenericImage($: CheerioAPI, baseUrl: string): string | null { + for (const selector of genericImageSelectors) { const element = $(selector).first(); if (element.length) { const src = @@ -263,7 +549,6 @@ function extractImage($: CheerioAPI, baseUrl: string): string | null { element.attr('data-zoom-image') || element.attr('data-src'); if (src) { - // Handle relative URLs try { return new URL(src, baseUrl).href; } catch (_e) {