Add Puppeteer fallback for Cloudflare-protected sites

When HTTP requests are blocked with 403 (e.g., B&H Photo's Cloudflare protection), the scraper now automatically retries using a headless Chrome browser via Puppeteer. Also updated Dockerfile to include Chromium dependencies for container deployment. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-06-08 15:05:16 +02:00 · 2026-01-21 20:55:49 -05:00 · 2026-01-21 20:55:49 -05:00 · c96861fefb
commit c96861fefb
parent 7c8ab0721b
4 changed files with 1415 additions and 36 deletions
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@ -1,5 +1,5 @@
 # Build stage
-FROM node:20-alpine AS builder
+FROM node:20-slim AS builder

 WORKDIR /app

@ -16,10 +16,38 @@ COPY . .
 RUN npm run build

 # Production stage
-FROM node:20-alpine AS production
+FROM node:20-slim AS production

 WORKDIR /app

+# Install Chromium dependencies for Puppeteer
+RUN apt-get update && apt-get install -y \
+    chromium \
+    fonts-liberation \
+    libasound2 \
+    libatk-bridge2.0-0 \
+    libatk1.0-0 \
+    libcups2 \
+    libdbus-1-3 \
+    libdrm2 \
+    libgbm1 \
+    libgtk-3-0 \
+    libnspr4 \
+    libnss3 \
+    libx11-xcb1 \
+    libxcomposite1 \
+    libxdamage1 \
+    libxfixes3 \
+    libxrandr2 \
+    libxss1 \
+    xdg-utils \
+    --no-install-recommends \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set Puppeteer to use installed Chromium
+ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
+ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
+
 # Copy package files
 COPY package*.json ./

@ -30,8 +58,10 @@ RUN npm install --omit=dev
 COPY --from=builder /app/dist ./dist

 # Create non-root user
-RUN addgroup -g 1001 -S nodejs && \
-    adduser -S nodejs -u 1001
+RUN groupadd -r nodejs && useradd -r -g nodejs nodejs
+
+# Change ownership of app directory
+RUN chown -R nodejs:nodejs /app

 USER nodejs

--- a/backend/package-lock.json
+++ b/backend/package-lock.json
--- a/backend/package.json
+++ b/backend/package.json
@ -18,7 +18,8 @@
    "express": "^4.18.2",
    "jsonwebtoken": "^9.0.2",
    "node-cron": "^3.0.3",
-    "pg": "^8.11.3"
+    "pg": "^8.11.3",
+    "puppeteer": "^22.0.0"
  },
  "devDependencies": {
    "@types/bcrypt": "^5.0.2",
--- a/backend/src/services/scraper.ts
+++ b/backend/src/services/scraper.ts
@ -1,5 +1,6 @@
-import axios from 'axios';
+import axios, { AxiosError } from 'axios';
 import { load, type CheerioAPI } from 'cheerio';
+import puppeteer from 'puppeteer';
 import {
  parsePrice,
  ParsedPrice,
@ -8,6 +9,48 @@ import {

 export type StockStatus = 'in_stock' | 'out_of_stock' | 'unknown';

+// Browser-based scraping for sites that block HTTP requests (e.g., Cloudflare)
+async function scrapeWithBrowser(url: string): Promise<string> {
+  const browser = await puppeteer.launch({
+    headless: true,
+    args: [
+      '--no-sandbox',
+      '--disable-setuid-sandbox',
+      '--disable-dev-shm-usage',
+      '--disable-accelerated-2d-canvas',
+      '--disable-gpu',
+    ],
+    executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || undefined,
+  });
+
+  try {
+    const page = await browser.newPage();
+
+    // Set a realistic user agent
+    await page.setUserAgent(
+      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
+    );
+
+    // Set viewport
+    await page.setViewport({ width: 1920, height: 1080 });
+
+    // Navigate to the page and wait for content to load
+    await page.goto(url, {
+      waitUntil: 'networkidle2',
+      timeout: 30000,
+    });
+
+    // Wait a bit for any dynamic content to render
+    await page.waitForSelector('body', { timeout: 5000 });
+
+    // Get the full HTML content
+    const html = await page.content();
+    return html;
+  } finally {
+    await browser.close();
+  }
+}
+
 export interface ScrapedProduct {
  name: string | null;
  price: ParsedPrice | null;
@ -567,7 +610,7 @@ const siteScrapers: SiteScraper[] = [
      // Try to get data from JSON-LD first
      try {
        const scripts = $('script[type="application/ld+json"]');
-        scripts.each((_, script) => {
+        scripts.each((_i, script) => {
          const content = $(script).html();
          if (!content) return;
          try {
@ -599,11 +642,11 @@ const siteScrapers: SiteScraper[] = [
              }
            }
          } catch (_e) {
-            // Continue to next script
+            // JSON-LD parse error, continue
          }
        });
      } catch (_e) {
-        // JSON-LD parsing failed
+        // JSON-LD extraction error, continue
      }

      // Fallback to HTML selectors
@ -712,30 +755,49 @@ export async function scrapeProduct(url: string): Promise<ScrapedProduct> {
  };

  try {
-    const response = await axios.get<string>(url, {
-      headers: {
-        'User-Agent':
-          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
-        Accept:
-          'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.9',
-        'Accept-Encoding': 'gzip, deflate, br',
-        'Cache-Control': 'no-cache',
-        Pragma: 'no-cache',
-        'Sec-Ch-Ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
-        'Sec-Ch-Ua-Mobile': '?0',
-        'Sec-Ch-Ua-Platform': '"Windows"',
-        'Sec-Fetch-Dest': 'document',
-        'Sec-Fetch-Mode': 'navigate',
-        'Sec-Fetch-Site': 'none',
-        'Sec-Fetch-User': '?1',
-        'Upgrade-Insecure-Requests': '1',
-      },
-      timeout: 20000,
-      maxRedirects: 5,
-    });
+    let html: string;
+    let usedBrowser = false;

-    const $ = load(response.data);
+    try {
+      const response = await axios.get<string>(url, {
+        headers: {
+          'User-Agent':
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+          Accept:
+            'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+          'Accept-Language': 'en-US,en;q=0.9',
+          'Accept-Encoding': 'gzip, deflate, br',
+          'Cache-Control': 'no-cache',
+          Pragma: 'no-cache',
+          'Sec-Ch-Ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
+          'Sec-Ch-Ua-Mobile': '?0',
+          'Sec-Ch-Ua-Platform': '"Windows"',
+          'Sec-Fetch-Dest': 'document',
+          'Sec-Fetch-Mode': 'navigate',
+          'Sec-Fetch-Site': 'none',
+          'Sec-Fetch-User': '?1',
+          'Upgrade-Insecure-Requests': '1',
+        },
+        timeout: 20000,
+        maxRedirects: 5,
+      });
+      html = response.data;
+    } catch (axiosError) {
+      // If we get a 403 (Forbidden), try using a headless browser
+      if (axiosError instanceof AxiosError && axiosError.response?.status === 403) {
+        console.log(`HTTP request blocked (403) for ${url}, falling back to browser scraping...`);
+        html = await scrapeWithBrowser(url);
+        usedBrowser = true;
+      } else {
+        throw axiosError;
+      }
+    }
+
+    const $ = load(html);
+
+    if (usedBrowser) {
+      console.log(`Successfully scraped ${url} using headless browser`);
+    }

    // Try site-specific scraper first
    const siteScraper = siteScrapers.find((s) => s.match(url));