mirror of
https://github.com/clucraft/PriceGhost.git
synced 2026-06-08 15:05:16 +02:00
Add Puppeteer fallback for Cloudflare-protected sites
When HTTP requests are blocked with 403 (e.g., B&H Photo's Cloudflare protection), the scraper now automatically retries using a headless Chrome browser via Puppeteer. Also updated Dockerfile to include Chromium dependencies for container deployment. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
7c8ab0721b
commit
c96861fefb
4 changed files with 1415 additions and 36 deletions
|
|
@ -1,5 +1,5 @@
|
|||
# Build stage
|
||||
FROM node:20-alpine AS builder
|
||||
FROM node:20-slim AS builder
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
|
@ -16,10 +16,38 @@ COPY . .
|
|||
RUN npm run build
|
||||
|
||||
# Production stage
|
||||
FROM node:20-alpine AS production
|
||||
FROM node:20-slim AS production
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install Chromium dependencies for Puppeteer
|
||||
RUN apt-get update && apt-get install -y \
|
||||
chromium \
|
||||
fonts-liberation \
|
||||
libasound2 \
|
||||
libatk-bridge2.0-0 \
|
||||
libatk1.0-0 \
|
||||
libcups2 \
|
||||
libdbus-1-3 \
|
||||
libdrm2 \
|
||||
libgbm1 \
|
||||
libgtk-3-0 \
|
||||
libnspr4 \
|
||||
libnss3 \
|
||||
libx11-xcb1 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
libxss1 \
|
||||
xdg-utils \
|
||||
--no-install-recommends \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set Puppeteer to use installed Chromium
|
||||
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
|
||||
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
|
||||
|
||||
# Copy package files
|
||||
COPY package*.json ./
|
||||
|
||||
|
|
@ -30,8 +58,10 @@ RUN npm install --omit=dev
|
|||
COPY --from=builder /app/dist ./dist
|
||||
|
||||
# Create non-root user
|
||||
RUN addgroup -g 1001 -S nodejs && \
|
||||
adduser -S nodejs -u 1001
|
||||
RUN groupadd -r nodejs && useradd -r -g nodejs nodejs
|
||||
|
||||
# Change ownership of app directory
|
||||
RUN chown -R nodejs:nodejs /app
|
||||
|
||||
USER nodejs
|
||||
|
||||
|
|
|
|||
1294
backend/package-lock.json
generated
1294
backend/package-lock.json
generated
File diff suppressed because it is too large
Load diff
|
|
@ -18,7 +18,8 @@
|
|||
"express": "^4.18.2",
|
||||
"jsonwebtoken": "^9.0.2",
|
||||
"node-cron": "^3.0.3",
|
||||
"pg": "^8.11.3"
|
||||
"pg": "^8.11.3",
|
||||
"puppeteer": "^22.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/bcrypt": "^5.0.2",
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import axios from 'axios';
|
||||
import axios, { AxiosError } from 'axios';
|
||||
import { load, type CheerioAPI } from 'cheerio';
|
||||
import puppeteer from 'puppeteer';
|
||||
import {
|
||||
parsePrice,
|
||||
ParsedPrice,
|
||||
|
|
@ -8,6 +9,48 @@ import {
|
|||
|
||||
export type StockStatus = 'in_stock' | 'out_of_stock' | 'unknown';
|
||||
|
||||
/**
 * Browser-based scraping for sites that block plain HTTP requests (e.g., Cloudflare).
 *
 * Launches a headless browser through Puppeteer, navigates to `url`, and returns
 * the HTML of the loaded page. The browser is closed on every exit path.
 *
 * @param url - Address of the page to load.
 * @returns The page's HTML as reported by `page.content()`.
 * @throws Any error raised by Puppeteer while launching, navigating, or waiting
 *   (including timeouts). A failure while closing the browser is logged rather
 *   than thrown, so it can neither replace the original error nor discard HTML
 *   that was already captured.
 */
async function scrapeWithBrowser(url: string): Promise<string> {
  const browser = await puppeteer.launch({
    headless: true,
    args: [
      // Sandbox is disabled.
      // NOTE(review): presumably required for running Chromium inside the container — confirm.
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-accelerated-2d-canvas',
      '--disable-gpu',
    ],
    // `||` (not `??`) on purpose: an empty env value also falls back to undefined.
    executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || undefined,
  });

  try {
    const page = await browser.newPage();

    // Set a realistic user agent
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    );

    // Set viewport
    await page.setViewport({ width: 1920, height: 1080 });

    // Navigate and wait until the network is (almost) idle, at most 30 seconds
    await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 30000,
    });

    // Ensure <body> is present (up to 5 seconds) before reading the HTML
    await page.waitForSelector('body', { timeout: 5000 });

    // Get the full HTML content
    return await page.content();
  } finally {
    // An exception thrown from `finally` would override whatever the `try` block
    // threw or returned, so a close failure is reported but never propagated.
    try {
      await browser.close();
    } catch (closeError: unknown) {
      console.warn(`Failed to close headless browser after scraping ${url}:`, closeError);
    }
  }
}
|
||||
|
||||
export interface ScrapedProduct {
|
||||
name: string | null;
|
||||
price: ParsedPrice | null;
|
||||
|
|
@ -567,7 +610,7 @@ const siteScrapers: SiteScraper[] = [
|
|||
// Try to get data from JSON-LD first
|
||||
try {
|
||||
const scripts = $('script[type="application/ld+json"]');
|
||||
scripts.each((_, script) => {
|
||||
scripts.each((_i, script) => {
|
||||
const content = $(script).html();
|
||||
if (!content) return;
|
||||
try {
|
||||
|
|
@ -599,11 +642,11 @@ const siteScrapers: SiteScraper[] = [
|
|||
}
|
||||
}
|
||||
} catch (_e) {
|
||||
// Continue to next script
|
||||
// JSON-LD parse error, continue
|
||||
}
|
||||
});
|
||||
} catch (_e) {
|
||||
// JSON-LD parsing failed
|
||||
// JSON-LD extraction error, continue
|
||||
}
|
||||
|
||||
// Fallback to HTML selectors
|
||||
|
|
@ -712,30 +755,49 @@ export async function scrapeProduct(url: string): Promise<ScrapedProduct> {
|
|||
};
|
||||
|
||||
try {
|
||||
const response = await axios.get<string>(url, {
|
||||
headers: {
|
||||
'User-Agent':
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
Accept:
|
||||
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Cache-Control': 'no-cache',
|
||||
Pragma: 'no-cache',
|
||||
'Sec-Ch-Ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
|
||||
'Sec-Ch-Ua-Mobile': '?0',
|
||||
'Sec-Ch-Ua-Platform': '"Windows"',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'none',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
},
|
||||
timeout: 20000,
|
||||
maxRedirects: 5,
|
||||
});
|
||||
let html: string;
|
||||
let usedBrowser = false;
|
||||
|
||||
const $ = load(response.data);
|
||||
try {
|
||||
const response = await axios.get<string>(url, {
|
||||
headers: {
|
||||
'User-Agent':
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
Accept:
|
||||
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Cache-Control': 'no-cache',
|
||||
Pragma: 'no-cache',
|
||||
'Sec-Ch-Ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
|
||||
'Sec-Ch-Ua-Mobile': '?0',
|
||||
'Sec-Ch-Ua-Platform': '"Windows"',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'none',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
},
|
||||
timeout: 20000,
|
||||
maxRedirects: 5,
|
||||
});
|
||||
html = response.data;
|
||||
} catch (axiosError) {
|
||||
// If we get a 403 (Forbidden), try using a headless browser
|
||||
if (axiosError instanceof AxiosError && axiosError.response?.status === 403) {
|
||||
console.log(`HTTP request blocked (403) for ${url}, falling back to browser scraping...`);
|
||||
html = await scrapeWithBrowser(url);
|
||||
usedBrowser = true;
|
||||
} else {
|
||||
throw axiosError;
|
||||
}
|
||||
}
|
||||
|
||||
const $ = load(html);
|
||||
|
||||
if (usedBrowser) {
|
||||
console.log(`Successfully scraped ${url} using headless browser`);
|
||||
}
|
||||
|
||||
// Try site-specific scraper first
|
||||
const siteScraper = siteScrapers.find((s) => s.match(url));
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue