Add Puppeteer fallback for JavaScript-rendered prices

- If no price found in static HTML, automatically try headless browser
- Re-runs all extraction methods on browser-rendered HTML
- Fixes price extraction for Magento, React, Vue, and other JS-heavy sites
- AI extraction now also benefits from rendered HTML

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
clucraft 2026-01-23 13:21:07 -05:00
parent 61ffafdd8c
commit 25936f4c9d

View file

@@ -822,6 +822,63 @@ export async function scrapeProduct(url: string, userId?: number): Promise<Scrap
result.imageUrl = $('meta[property="og:image"]').attr('content') || null;
}
// Browser fallback: if no price has been found so far and the headless browser
// has not been used yet, fetch the page with scrapeWithBrowser and re-run the
// extraction steps on that HTML. Intended for JavaScript-rendered prices
// (Magento, React, Vue, etc.).
//
// Fields already present on `result` are never overwritten here: only a missing
// name/price/imageUrl and a stockStatus of 'unknown' are filled in.
// Any error thrown inside the block is logged and swallowed, so execution
// continues with whatever `result` already held.
if (!result.price && !usedBrowser) {
console.log(`[Scraper] No price found in static HTML for ${url}, trying headless browser...`);
try {
// Overwrites `html`, so code after this block that reads `html` sees the
// browser-fetched markup rather than the earlier content.
html = await scrapeWithBrowser(url);
// Set only after the fetch succeeded; stays false if scrapeWithBrowser throws.
usedBrowser = true;
// NOTE(review): `load` is presumably cheerio's loader — confirm against the file's imports.
const $browser = load(html);
// Step 1: site-specific scraper — the first entry in siteScrapers whose
// match(url) returns a truthy value, if any.
const siteScraper = siteScrapers.find((s) => s.match(url));
if (siteScraper) {
const siteResult = siteScraper.scrape($browser, url);
if (!result.name && siteResult.name) result.name = siteResult.name;
if (!result.price && siteResult.price) result.price = siteResult.price;
if (!result.imageUrl && siteResult.imageUrl) result.imageUrl = siteResult.imageUrl;
if (result.stockStatus === 'unknown' && siteResult.stockStatus) {
result.stockStatus = siteResult.stockStatus;
}
}
// Step 2: JSON-LD from the browser-fetched HTML. Gated on the price still
// being missing, so when step 1 found a price the JSON-LD name/image/stock
// values are not consulted at all.
if (!result.price) {
const jsonLdData = extractJsonLd($browser);
if (jsonLdData) {
if (!result.name && jsonLdData.name) result.name = jsonLdData.name;
if (!result.price && jsonLdData.price) result.price = jsonLdData.price;
if (!result.imageUrl && jsonLdData.image) result.imageUrl = jsonLdData.image;
if (result.stockStatus === 'unknown' && jsonLdData.stockStatus) {
result.stockStatus = jsonLdData.stockStatus;
}
}
}
// Step 3: generic extractors. Each field is checked independently, so a field
// that is still missing is attempted even if the price was found above.
if (!result.price) {
result.price = extractGenericPrice($browser);
}
if (!result.name) {
result.name = extractGenericName($browser);
}
if (!result.imageUrl) {
result.imageUrl = extractGenericImage($browser, url);
}
if (result.stockStatus === 'unknown') {
result.stockStatus = extractGenericStockStatus($browser);
}
// Success is logged only when a price was obtained; a browser run that still
// yields no price produces no further log line from this block.
if (result.price) {
console.log(`[Scraper] Successfully extracted price ${result.price.price} ${result.price.currency} using headless browser`);
}
} catch (browserError) {
// Best-effort fallback: log the failure and continue rather than rethrowing.
console.error(`[Scraper] Browser fallback failed for ${url}:`, browserError);
}
}
// If we have a price and userId is provided, try AI verification
if (result.price && userId && html) {
try {