Add human-like behavior to browser scraping

This commit is contained in:
clucraft 2026-01-21 21:13:09 -05:00
parent 9af18969f3
commit 58ad638641

View file

@@ -21,11 +21,13 @@ async function scrapeWithBrowser(url: string): Promise<string> {
'--no-sandbox', '--no-sandbox',
'--disable-setuid-sandbox', '--disable-setuid-sandbox',
'--disable-dev-shm-usage', '--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas', '--disable-blink-features=AutomationControlled',
'--disable-gpu', '--disable-infobars',
'--window-size=1920,1080', '--window-size=1920,1080',
'--start-maximized',
], ],
executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || undefined, executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || undefined,
ignoreDefaultArgs: ['--enable-automation'],
}); });
try { try {
@@ -40,9 +42,14 @@ async function scrapeWithBrowser(url: string): Promise<string> {
timeout: 45000, timeout: 45000,
}); });
// Add some human-like behavior
await page.mouse.move(100, 200);
await new Promise(resolve => setTimeout(resolve, 500));
await page.mouse.move(300, 400);
// Wait for Cloudflare challenge to complete if present // Wait for Cloudflare challenge to complete if present
// Check if we're on a challenge page and wait for it to resolve // Check if we're on a challenge page and wait for it to resolve
const maxWaitTime = 15000; const maxWaitTime = 20000;
const startTime = Date.now(); const startTime = Date.now();
while (Date.now() - startTime < maxWaitTime) { while (Date.now() - startTime < maxWaitTime) {
@@ -53,11 +60,18 @@ async function scrapeWithBrowser(url: string): Promise<string> {
break; break;
} }
console.log(`[Browser] Waiting for Cloudflare challenge to complete... (${title})`); console.log(`[Browser] Waiting for Cloudflare challenge to complete... (${title})`);
// Move mouse randomly while waiting
await page.mouse.move(
100 + Math.random() * 500,
100 + Math.random() * 400
);
await new Promise(resolve => setTimeout(resolve, 2000)); await new Promise(resolve => setTimeout(resolve, 2000));
} }
// Additional wait for dynamic content // Scroll down a bit like a human would
await new Promise(resolve => setTimeout(resolve, 2000)); // eslint-disable-next-line @typescript-eslint/no-implied-eval
await page.evaluate('window.scrollBy(0, 300)');
await new Promise(resolve => setTimeout(resolve, 1000));
// Get the full HTML content // Get the full HTML content
const html = await page.content(); const html = await page.content();