From 58ad638641dea4f40a10c637f2b4a96d3580e020 Mon Sep 17 00:00:00 2001 From: clucraft Date: Wed, 21 Jan 2026 21:13:09 -0500 Subject: [PATCH] Add human-like behavior to browser scraping --- backend/src/services/scraper.ts | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/backend/src/services/scraper.ts b/backend/src/services/scraper.ts index 126ea32..d5c8a33 100644 --- a/backend/src/services/scraper.ts +++ b/backend/src/services/scraper.ts @@ -21,11 +21,13 @@ async function scrapeWithBrowser(url: string): Promise { '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', - '--disable-accelerated-2d-canvas', - '--disable-gpu', + '--disable-blink-features=AutomationControlled', + '--disable-infobars', '--window-size=1920,1080', + '--start-maximized', ], executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || undefined, + ignoreDefaultArgs: ['--enable-automation'], }); try { @@ -40,9 +42,14 @@ async function scrapeWithBrowser(url: string): Promise { timeout: 45000, }); + // Add some human-like behavior + await page.mouse.move(100, 200); + await new Promise(resolve => setTimeout(resolve, 500)); + await page.mouse.move(300, 400); + // Wait for Cloudflare challenge to complete if present // Check if we're on a challenge page and wait for it to resolve - const maxWaitTime = 15000; + const maxWaitTime = 20000; const startTime = Date.now(); while (Date.now() - startTime < maxWaitTime) { @@ -53,11 +60,18 @@ async function scrapeWithBrowser(url: string): Promise { break; } console.log(`[Browser] Waiting for Cloudflare challenge to complete... (${title})`); + // Move mouse randomly while waiting + await page.mouse.move( + 100 + Math.random() * 500, + 100 + Math.random() * 400 + ); await new Promise(resolve => setTimeout(resolve, 2000)); } - // Additional wait for dynamic content - await new Promise(resolve => setTimeout(resolve, 2000)); + // Scroll down a bit like a human would + // eslint-disable-next-line @typescript-eslint/no-implied-eval + await page.evaluate('window.scrollBy(0, 300)'); + await new Promise(resolve => setTimeout(resolve, 1000)); // Get the full HTML content const html = await page.content();