mirror of
https://github.com/clucraft/PriceGhost.git
synced 2026-06-29 15:59:39 +02:00
Fix Best Buy price extraction and add browser rendering for JS-heavy sites
- Fix TypeScript narrowing issue in Best Buy scraper
- Add browser rendering for JS-heavy sites (Best Buy, Target, Walmart, Costco)
- Improve Best Buy site-specific scraper with better selectors and logging
- Skip payment plan prices (/mo, per month, etc.)
- Enhance AI HTML preparation:
  - Extract JSON-LD data before removing scripts
  - Add price element extraction for better context
  - Increase character limit from 15000 to 25000

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
cf23ac9db1
commit
196ff8fd0b
2 changed files with 54 additions and 9 deletions
|
|
@ -87,6 +87,7 @@ function prepareHtmlForAI(html: string): string {
|
||||||
if (scriptContent) {
|
if (scriptContent) {
|
||||||
// Include any JSON-LD that might be product-related
|
// Include any JSON-LD that might be product-related
|
||||||
if (scriptContent.includes('price') ||
|
if (scriptContent.includes('price') ||
|
||||||
|
scriptContent.includes('Price') ||
|
||||||
scriptContent.includes('Product') ||
|
scriptContent.includes('Product') ||
|
||||||
scriptContent.includes('Offer')) {
|
scriptContent.includes('Offer')) {
|
||||||
jsonLdScripts.push(scriptContent);
|
jsonLdScripts.push(scriptContent);
|
||||||
|
|
@ -94,6 +95,26 @@ function prepareHtmlForAI(html: string): string {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Extract price-related elements specifically
|
||||||
|
const priceElements: string[] = [];
|
||||||
|
const priceSelectors = [
|
||||||
|
'[class*="price"]',
|
||||||
|
'[class*="Price"]',
|
||||||
|
'[data-testid*="price"]',
|
||||||
|
'[itemprop="price"]',
|
||||||
|
'[data-price]',
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of priceSelectors) {
|
||||||
|
$(selector).each((_, el) => {
|
||||||
|
const text = $(el).text().trim();
|
||||||
|
const parent = $(el).parent().text().trim().slice(0, 200);
|
||||||
|
if (text && text.match(/\$[\d,]+\.?\d*/)) {
|
||||||
|
priceElements.push(`Price element: "${text}" (context: "${parent.slice(0, 100)}")`);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// Now remove script, style, and other non-content elements
|
// Now remove script, style, and other non-content elements
|
||||||
$('script, style, noscript, iframe, svg, path, meta, link, comment').remove();
|
$('script, style, noscript, iframe, svg, path, meta, link, comment').remove();
|
||||||
|
|
||||||
|
|
@ -103,9 +124,10 @@ function prepareHtmlForAI(html: string): string {
|
||||||
// Try to focus on product-related sections if possible
|
// Try to focus on product-related sections if possible
|
||||||
const productSelectors = [
|
const productSelectors = [
|
||||||
'[itemtype*="Product"]',
|
'[itemtype*="Product"]',
|
||||||
'[class*="product"]',
|
'[class*="product-detail"]',
|
||||||
|
'[class*="productDetail"]',
|
||||||
|
'[class*="pdp-"]',
|
||||||
'[id*="product"]',
|
'[id*="product"]',
|
||||||
'[class*="pdp"]',
|
|
||||||
'main',
|
'main',
|
||||||
'[role="main"]',
|
'[role="main"]',
|
||||||
];
|
];
|
||||||
|
|
@ -118,16 +140,24 @@ function prepareHtmlForAI(html: string): string {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Combine JSON-LD data with HTML content
|
// Build final content with all price-related info at the top
|
||||||
let finalContent = content;
|
let finalContent = '';
|
||||||
|
|
||||||
if (jsonLdScripts.length > 0) {
|
if (jsonLdScripts.length > 0) {
|
||||||
finalContent = `JSON-LD Structured Data:\n${jsonLdScripts.join('\n')}\n\nHTML Content:\n${content}`;
|
finalContent += `=== JSON-LD Structured Data (MOST RELIABLE) ===\n${jsonLdScripts.join('\n')}\n\n`;
|
||||||
console.log(`[AI] Found ${jsonLdScripts.length} JSON-LD scripts with product data`);
|
console.log(`[AI] Found ${jsonLdScripts.length} JSON-LD scripts with product data`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Truncate to ~15000 characters to stay within token limits
|
if (priceElements.length > 0) {
|
||||||
if (finalContent.length > 15000) {
|
finalContent += `=== Price Elements Found ===\n${priceElements.slice(0, 10).join('\n')}\n\n`;
|
||||||
finalContent = finalContent.substring(0, 15000) + '\n... [truncated]';
|
console.log(`[AI] Found ${priceElements.length} price elements`);
|
||||||
|
}
|
||||||
|
|
||||||
|
finalContent += `=== HTML Content ===\n${content}`;
|
||||||
|
|
||||||
|
// Truncate to ~25000 characters to stay within token limits but capture more content
|
||||||
|
if (finalContent.length > 25000) {
|
||||||
|
finalContent = finalContent.substring(0, 25000) + '\n... [truncated]';
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`[AI] Prepared HTML content: ${finalContent.length} characters`);
|
console.log(`[AI] Prepared HTML content: ${finalContent.length} characters`);
|
||||||
|
|
|
||||||
|
|
@ -591,25 +591,35 @@ const siteScrapers: SiteScraper[] = [
|
||||||
'.priceView-customer-price span',
|
'.priceView-customer-price span',
|
||||||
'.priceView-hero-price span',
|
'.priceView-hero-price span',
|
||||||
'[class*="customerPrice"]',
|
'[class*="customerPrice"]',
|
||||||
|
'[class*="priceView"] span[aria-hidden="true"]',
|
||||||
|
'.pricing-price__regular-price',
|
||||||
|
'[data-testid="product-price"]',
|
||||||
|
'.price-box span',
|
||||||
];
|
];
|
||||||
|
|
||||||
let price: ParsedPrice | null = null;
|
let price: ParsedPrice | null = null;
|
||||||
for (const selector of priceSelectors) {
|
for (const selector of priceSelectors) {
|
||||||
const elements = $(selector);
|
const elements = $(selector);
|
||||||
|
console.log(`[BestBuy] Selector "${selector}" found ${elements.length} elements`);
|
||||||
// Check each element, skip payment plan prices (contain "/mo", "per month", etc.)
|
// Check each element, skip payment plan prices (contain "/mo", "per month", etc.)
|
||||||
elements.each((_, el) => {
|
elements.each((_, el) => {
|
||||||
if (price) return false; // Already found a valid price
|
if (price) return false; // Already found a valid price
|
||||||
const text = $(el).text().trim();
|
const text = $(el).text().trim();
|
||||||
|
if (!text) return true;
|
||||||
|
console.log(`[BestBuy] Found text: "${text.slice(0, 50)}"`);
|
||||||
const lowerText = text.toLowerCase();
|
const lowerText = text.toLowerCase();
|
||||||
// Skip if it looks like a monthly payment plan
|
// Skip if it looks like a monthly payment plan
|
||||||
if (lowerText.includes('/mo') ||
|
if (lowerText.includes('/mo') ||
|
||||||
lowerText.includes('per month') ||
|
lowerText.includes('per month') ||
|
||||||
lowerText.includes('monthly') ||
|
lowerText.includes('monthly') ||
|
||||||
lowerText.includes('financing')) {
|
lowerText.includes('financing') ||
|
||||||
|
lowerText.includes('payment')) {
|
||||||
|
console.log(`[BestBuy] Skipping payment plan price: "${text.slice(0, 30)}"`);
|
||||||
return true; // Continue to next element
|
return true; // Continue to next element
|
||||||
}
|
}
|
||||||
const parsed = parsePrice(text);
|
const parsed = parsePrice(text);
|
||||||
if (parsed) {
|
if (parsed) {
|
||||||
|
console.log(`[BestBuy] Parsed price: ${parsed.price} ${parsed.currency}`);
|
||||||
price = parsed;
|
price = parsed;
|
||||||
return false; // Break the loop
|
return false; // Break the loop
|
||||||
}
|
}
|
||||||
|
|
@ -619,12 +629,17 @@ const siteScrapers: SiteScraper[] = [
|
||||||
|
|
||||||
const name = $('h1.heading-5').text().trim() ||
|
const name = $('h1.heading-5').text().trim() ||
|
||||||
$('.sku-title h1').text().trim() ||
|
$('.sku-title h1').text().trim() ||
|
||||||
|
$('[data-testid="product-title"]').text().trim() ||
|
||||||
|
$('h1').first().text().trim() ||
|
||||||
null;
|
null;
|
||||||
|
console.log(`[BestBuy] Found name: "${name?.slice(0, 50)}"`);
|
||||||
|
|
||||||
const imageUrl = $('img.primary-image').attr('src') ||
|
const imageUrl = $('img.primary-image').attr('src') ||
|
||||||
$('[data-testid="image-gallery-image"]').attr('src') ||
|
$('[data-testid="image-gallery-image"]').attr('src') ||
|
||||||
|
$('img[class*="product-image"]').attr('src') ||
|
||||||
null;
|
null;
|
||||||
|
|
||||||
|
console.log(`[BestBuy] Final result - name: ${!!name}, price: ${price ? (price as ParsedPrice).price : null}, image: ${!!imageUrl}`);
|
||||||
return { name, price, imageUrl };
|
return { name, price, imageUrl };
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue