mirror of
https://github.com/clucraft/PriceGhost.git
synced 2026-06-08 15:05:16 +02:00
Fix Newegg scraper picking up bundle savings instead of product price
- Add a helper to detect savings/combo/bundle containers
- Prioritize JSON-LD data as the primary price source (most reliable)
- Skip price elements inside savings containers
- Add a minimum price threshold to filter out discount amounts

Fixes an issue where the $189.99 bundle savings amount was extracted instead of the actual $675.59 product price for items like the AMD Ryzen 9 9950X3D.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
1029a0b08c
commit
31732b814f
1 changed file with 91 additions and 41 deletions
|
|
@ -464,57 +464,107 @@ const siteScrapers: SiteScraper[] = [
|
|||
{
|
||||
match: (url) => /newegg\.com/i.test(url),
|
||||
scrape: ($) => {
|
||||
// Try multiple price selectors
|
||||
const priceSelectors = [
|
||||
'.price-current',
|
||||
'.price-current strong',
|
||||
'[itemprop="price"]',
|
||||
'.product-price .price-current',
|
||||
'.product-buy-box .price-current',
|
||||
'.price-main-product .price-current',
|
||||
];
|
||||
/**
 * Reports whether a price element belongs to a savings/combo/bundle area
 * rather than to the main product price.
 *
 * Two signals are checked, in order:
 *  1. Any ancestor whose `class` or `id` attribute contains one of the
 *     keywords combo, save, saving, deal, bundle or discount
 *     (case-insensitive substring match).
 *  2. The text of the element's immediate parent containing "you save" or
 *     "save $" (case-insensitive).
 *
 * @param el - Selection wrapping the candidate price element.
 * @returns `true` when either signal matches, `false` otherwise.
 */
const isInSavingsContainer = (el: ReturnType<typeof $>): boolean => {
  // Substring match on purpose: it also covers the Newegg-specific
  // `item-combo` / `product-combo` classes, so no separate check is needed.
  const savingsPattern = /combo|save|saving|deal|bundle|discount/i;

  for (const ancestor of el.parents().toArray()) {
    const className = $(ancestor).attr('class') || '';
    const id = $(ancestor).attr('id') || '';

    // Keep class and id apart with a space so the end of the class list and
    // the start of the id cannot fuse into a keyword neither one contains.
    if (savingsPattern.test(`${className} ${id}`)) {
      return true;
    }
  }

  // The whole text of the immediate parent is inspected, so "you save" /
  // "save $" wording in a sibling of the element triggers this as well.
  const parentText = el.parent().text().toLowerCase();
  return parentText.includes('you save') || parentText.includes('save $');
};
|
||||
|
||||
let price: ParsedPrice | null = null;
|
||||
for (const selector of priceSelectors) {
|
||||
const el = $(selector).first();
|
||||
if (el.length) {
|
||||
// For price-current, combine the dollar and cents parts
|
||||
if (selector.includes('price-current')) {
|
||||
const strong = el.find('strong').text().trim() || el.text().trim();
|
||||
const sup = el.find('sup').text().trim();
|
||||
if (strong) {
|
||||
const priceText = `$${strong}${sup ? '.' + sup : ''}`;
|
||||
price = parsePrice(priceText);
|
||||
if (price) break;
|
||||
|
||||
// First, try JSON-LD data - most reliable source
|
||||
try {
|
||||
const scripts = $('script[type="application/ld+json"]');
|
||||
scripts.each((_, script) => {
|
||||
if (price) return; // Already found
|
||||
const jsonLd = $(script).html();
|
||||
if (jsonLd) {
|
||||
const data = JSON.parse(jsonLd);
|
||||
// Handle array of JSON-LD objects
|
||||
const items = Array.isArray(data) ? data : [data];
|
||||
for (const item of items) {
|
||||
if (item['@type'] === 'Product' && item.offers) {
|
||||
const offer = Array.isArray(item.offers) ? item.offers[0] : item.offers;
|
||||
if (offer?.price) {
|
||||
price = {
|
||||
price: parseFloat(String(offer.price)),
|
||||
currency: offer.priceCurrency || 'USD',
|
||||
};
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Try content attribute for itemprop
|
||||
const content = el.attr('content');
|
||||
if (content) {
|
||||
price = parsePrice(content);
|
||||
if (price) break;
|
||||
}
|
||||
// Try text content
|
||||
price = parsePrice(el.text().trim());
|
||||
});
|
||||
} catch (_e) {
|
||||
// Ignore JSON parse errors
|
||||
}
|
||||
|
||||
// Fallback: Try HTML selectors, but be careful to avoid savings amounts
|
||||
if (!price) {
|
||||
// Target main product buy box price specifically
|
||||
const mainPriceContainers = [
|
||||
'.product-buy-box .price-current',
|
||||
'.price-main-product .price-current',
|
||||
'.product-price .price-current',
|
||||
'#app .price-current', // Main app container
|
||||
];
|
||||
|
||||
for (const selector of mainPriceContainers) {
|
||||
const elements = $(selector);
|
||||
elements.each((_, el) => {
|
||||
if (price) return; // Already found
|
||||
|
||||
const $el = $(el);
|
||||
// Skip if inside a savings/combo container
|
||||
if (isInSavingsContainer($el)) return;
|
||||
|
||||
// Combine dollar and cents parts
|
||||
const strong = $el.find('strong').text().trim() || $el.text().trim();
|
||||
const sup = $el.find('sup').text().trim();
|
||||
if (strong) {
|
||||
// Clean the strong text - remove any non-numeric chars except comma
|
||||
const cleanStrong = strong.replace(/[^0-9,]/g, '');
|
||||
if (cleanStrong) {
|
||||
const priceText = `$${cleanStrong}${sup ? '.' + sup : ''}`;
|
||||
const parsed = parsePrice(priceText);
|
||||
// Validate this looks like a real product price (Ryzen 9 should be $500+)
|
||||
if (parsed && parsed.price > 50) {
|
||||
price = parsed;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
if (price) break;
|
||||
}
|
||||
}
|
||||
|
||||
// Also try JSON-LD data
|
||||
// Last resort: itemprop price
|
||||
if (!price) {
|
||||
try {
|
||||
const jsonLd = $('script[type="application/ld+json"]').first().html();
|
||||
if (jsonLd) {
|
||||
const data = JSON.parse(jsonLd);
|
||||
if (data.offers?.price) {
|
||||
price = {
|
||||
price: parseFloat(String(data.offers.price)),
|
||||
currency: data.offers.priceCurrency || 'USD',
|
||||
};
|
||||
}
|
||||
const itemprop = $('[itemprop="price"]').first();
|
||||
if (itemprop.length) {
|
||||
const content = itemprop.attr('content');
|
||||
if (content) {
|
||||
price = parsePrice(content);
|
||||
}
|
||||
} catch (_e) {
|
||||
// Ignore JSON parse errors
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue