Add Puppeteer fallback for Cloudflare-protected sites

When HTTP requests are blocked with 403 (e.g., B&H Photo's Cloudflare
protection), the scraper now automatically retries using a headless
Chrome browser via Puppeteer. The Dockerfile also switches from
node:20-alpine to node:20-slim and installs Chromium with its
shared-library dependencies for container deployment.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
clucraft 2026-01-21 20:55:49 -05:00
parent 7c8ab0721b
commit c96861fefb
4 changed files with 1415 additions and 36 deletions

View file

@ -1,5 +1,5 @@
# Build stage
FROM node:20-alpine AS builder
FROM node:20-slim AS builder
WORKDIR /app
@ -16,10 +16,38 @@ COPY . .
RUN npm run build
# Production stage
FROM node:20-alpine AS production
FROM node:20-slim AS production
WORKDIR /app
# Chromium and the shared libraries/fonts it loads at runtime (used by Puppeteer).
# update + install + list cleanup stay in one layer so the apt index is never
# stale and the package lists do not end up in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        chromium \
        fonts-liberation \
        libasound2 \
        libatk-bridge2.0-0 \
        libatk1.0-0 \
        libcups2 \
        libdbus-1-3 \
        libdrm2 \
        libgbm1 \
        libgtk-3-0 \
        libnspr4 \
        libnss3 \
        libx11-xcb1 \
        libxcomposite1 \
        libxdamage1 \
        libxfixes3 \
        libxrandr2 \
        libxss1 \
        xdg-utils \
    && rm -rf /var/lib/apt/lists/*
# Make Puppeteer use the Chromium installed from apt instead of downloading its own.
# PUPPETEER_SKIP_DOWNLOAD is the variable documented by current Puppeteer releases;
# PUPPETEER_SKIP_CHROMIUM_DOWNLOAD is the legacy name, kept so older releases that
# only read it still skip the download.
ENV PUPPETEER_SKIP_DOWNLOAD=true \
    PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
    PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
# Copy package files
COPY package*.json ./
@ -30,8 +58,10 @@ RUN npm install --omit=dev
COPY --from=builder /app/dist ./dist
# Create non-root user
RUN addgroup -g 1001 -S nodejs && \
adduser -S nodejs -u 1001
# Debian (shadow-utils) form of the non-root account.
# UID/GID are pinned to 1001, the IDs this image used before, so ownership of
# existing volumes and bind mounts keeps matching. --create-home gives the
# account a writable $HOME; NOTE(review): Chromium is expected to need one for
# its per-user config/crash data when launched by Puppeteer - confirm.
RUN groupadd --system --gid 1001 nodejs \
    && useradd --system --uid 1001 --gid nodejs --create-home nodejs
# Change ownership of app directory
# Runs while the build is still root: recursively hands /app (the WORKDIR that
# received the copied build output) to the nodejs user and group.
RUN chown -R nodejs:nodejs /app
# Drop root: instructions after this point and the container's main process run as nodejs.
USER nodejs