Add AI-powered price extraction fallback

- Add AI extraction service supporting Anthropic (Claude) and OpenAI
- Add AI settings UI in Settings page with provider selection
- Add database migration for AI settings columns
- Integrate AI fallback into scraper when standard methods fail
- Add API endpoints for AI settings and test extraction

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
clucraft 2026-01-21 21:49:55 -05:00
parent cfca33b4ea
commit d98138fe7c
11 changed files with 887 additions and 10 deletions

View file

@ -8,6 +8,7 @@
"name": "priceghost-backend",
"version": "1.0.0",
"dependencies": {
"@anthropic-ai/sdk": "^0.24.0",
"axios": "^1.6.0",
"bcrypt": "^5.1.1",
"cheerio": "^1.0.0-rc.12",
@ -16,6 +17,7 @@
"express": "^4.18.2",
"jsonwebtoken": "^9.0.2",
"node-cron": "^3.0.3",
"openai": "^4.47.0",
"pg": "^8.11.3",
"puppeteer": "^22.0.0",
"puppeteer-extra": "^3.3.6",
@ -33,6 +35,37 @@
"typescript": "^5.3.2"
}
},
"node_modules/@anthropic-ai/sdk": {
"version": "0.24.3",
"resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.24.3.tgz",
"integrity": "sha512-916wJXO6T6k8R6BAAcLhLPv/pnLGy7YSEBZXZ1XTFbLcTZE8oTy3oDW9WJf9KKZwMvVcePIfoTSvzXHRcGxkQQ==",
"license": "MIT",
"dependencies": {
"@types/node": "^18.11.18",
"@types/node-fetch": "^2.6.4",
"abort-controller": "^3.0.0",
"agentkeepalive": "^4.2.1",
"form-data-encoder": "1.7.2",
"formdata-node": "^4.3.2",
"node-fetch": "^2.6.7",
"web-streams-polyfill": "^3.2.1"
}
},
"node_modules/@anthropic-ai/sdk/node_modules/@types/node": {
"version": "18.19.130",
"resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.130.tgz",
"integrity": "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg==",
"license": "MIT",
"dependencies": {
"undici-types": "~5.26.4"
}
},
"node_modules/@anthropic-ai/sdk/node_modules/undici-types": {
"version": "5.26.5",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
"license": "MIT"
},
"node_modules/@babel/code-frame": {
"version": "7.28.6",
"resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.28.6.tgz",
@ -680,7 +713,6 @@
"version": "20.19.30",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.30.tgz",
"integrity": "sha512-WJtwWJu7UdlvzEAUm484QNg5eAoq5QR08KDNx7g45Usrs2NtOPiX8ugDqmKdXkyL03rBqU5dYNYVQetEpBHq2g==",
"devOptional": true,
"license": "MIT",
"dependencies": {
"undici-types": "~6.21.0"
@ -693,6 +725,16 @@
"dev": true,
"license": "MIT"
},
"node_modules/@types/node-fetch": {
"version": "2.6.13",
"resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.13.tgz",
"integrity": "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==",
"license": "MIT",
"dependencies": {
"@types/node": "*",
"form-data": "^4.0.4"
}
},
"node_modules/@types/pg": {
"version": "8.16.0",
"resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.16.0.tgz",
@ -768,6 +810,18 @@
"integrity": "sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==",
"license": "ISC"
},
"node_modules/abort-controller": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz",
"integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==",
"license": "MIT",
"dependencies": {
"event-target-shim": "^5.0.0"
},
"engines": {
"node": ">=6.5"
}
},
"node_modules/accepts": {
"version": "1.3.8",
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz",
@ -816,6 +870,18 @@
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
"license": "MIT"
},
"node_modules/agentkeepalive": {
"version": "4.6.0",
"resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.6.0.tgz",
"integrity": "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==",
"license": "MIT",
"dependencies": {
"humanize-ms": "^1.2.1"
},
"engines": {
"node": ">= 8.0.0"
}
},
"node_modules/ansi-regex": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
@ -1867,6 +1933,15 @@
"node": ">= 0.6"
}
},
"node_modules/event-target-shim": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz",
"integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==",
"license": "MIT",
"engines": {
"node": ">=6"
}
},
"node_modules/events-universal": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/events-universal/-/events-universal-1.0.1.tgz",
@ -2055,6 +2130,34 @@
"node": ">= 6"
}
},
"node_modules/form-data-encoder": {
"version": "1.7.2",
"resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz",
"integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==",
"license": "MIT"
},
"node_modules/formdata-node": {
"version": "4.4.1",
"resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz",
"integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==",
"license": "MIT",
"dependencies": {
"node-domexception": "1.0.0",
"web-streams-polyfill": "4.0.0-beta.3"
},
"engines": {
"node": ">= 12.20"
}
},
"node_modules/formdata-node/node_modules/web-streams-polyfill": {
"version": "4.0.0-beta.3",
"resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz",
"integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==",
"license": "MIT",
"engines": {
"node": ">= 14"
}
},
"node_modules/forwarded": {
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz",
@ -2489,6 +2592,15 @@
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
"license": "MIT"
},
"node_modules/humanize-ms": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz",
"integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==",
"license": "MIT",
"dependencies": {
"ms": "^2.0.0"
}
},
"node_modules/iconv-lite": {
"version": "0.6.3",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
@ -3021,6 +3133,26 @@
"node": ">=6.0.0"
}
},
"node_modules/node-domexception": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz",
"integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==",
"deprecated": "Use your platform's native DOMException instead",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/jimmywarting"
},
{
"type": "github",
"url": "https://paypal.me/jimmywarting"
}
],
"license": "MIT",
"engines": {
"node": ">=10.5.0"
}
},
"node_modules/node-fetch": {
"version": "2.7.0",
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
@ -3123,6 +3255,51 @@
"wrappy": "1"
}
},
"node_modules/openai": {
"version": "4.104.0",
"resolved": "https://registry.npmjs.org/openai/-/openai-4.104.0.tgz",
"integrity": "sha512-p99EFNsA/yX6UhVO93f5kJsDRLAg+CTA2RBqdHK4RtK8u5IJw32Hyb2dTGKbnnFmnuoBv5r7Z2CURI9sGZpSuA==",
"license": "Apache-2.0",
"dependencies": {
"@types/node": "^18.11.18",
"@types/node-fetch": "^2.6.4",
"abort-controller": "^3.0.0",
"agentkeepalive": "^4.2.1",
"form-data-encoder": "1.7.2",
"formdata-node": "^4.3.2",
"node-fetch": "^2.6.7"
},
"bin": {
"openai": "bin/cli"
},
"peerDependencies": {
"ws": "^8.18.0",
"zod": "^3.23.8"
},
"peerDependenciesMeta": {
"ws": {
"optional": true
},
"zod": {
"optional": true
}
}
},
"node_modules/openai/node_modules/@types/node": {
"version": "18.19.130",
"resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.130.tgz",
"integrity": "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg==",
"license": "MIT",
"dependencies": {
"undici-types": "~5.26.4"
}
},
"node_modules/openai/node_modules/undici-types": {
"version": "5.26.5",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
"license": "MIT"
},
"node_modules/pac-proxy-agent": {
"version": "7.2.0",
"resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
@ -4466,7 +4643,6 @@
"version": "6.21.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
"integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
"devOptional": true,
"license": "MIT"
},
"node_modules/universalify": {
@ -4526,6 +4702,15 @@
"node": ">= 0.8"
}
},
"node_modules/web-streams-polyfill": {
"version": "3.3.3",
"resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz",
"integrity": "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==",
"license": "MIT",
"engines": {
"node": ">= 8"
}
},
"node_modules/webidl-conversions": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",

View file

@ -10,6 +10,7 @@
"db:init": "tsx src/config/init-db.ts"
},
"dependencies": {
"@anthropic-ai/sdk": "^0.24.0",
"axios": "^1.6.0",
"bcrypt": "^5.1.1",
"cheerio": "^1.0.0-rc.12",
@ -18,6 +19,7 @@
"express": "^4.18.2",
"jsonwebtoken": "^9.0.2",
"node-cron": "^3.0.3",
"openai": "^4.47.0",
"pg": "^8.11.3",
"puppeteer": "^22.0.0",
"puppeteer-extra": "^3.3.6",

View file

@ -9,6 +9,37 @@ import settingsRoutes from './routes/settings';
import profileRoutes from './routes/profile';
import adminRoutes from './routes/admin';
import { startScheduler } from './services/scheduler';
import pool from './config/database';
/**
 * Applies the startup schema migrations.
 *
 * Adds the per-user AI settings columns (`ai_enabled`, `ai_provider`,
 * `anthropic_api_key`, `openai_api_key`) to the `users` table. The statement is
 * idempotent, so it is safe to run on every boot.
 *
 * Failures are logged and not rethrown, so a failed migration does not stop
 * the caller.
 */
async function runMigrations(): Promise<void> {
  const client = await pool.connect();
  try {
    // ADD COLUMN IF NOT EXISTS resolves `users` through the connection's
    // search_path, unlike an information_schema.columns lookup by table name
    // alone, which also matches same-named tables in other schemas and could
    // wrongly skip a column. A single ALTER TABLE also applies all four
    // columns atomically.
    await client.query(`
      ALTER TABLE users
        ADD COLUMN IF NOT EXISTS ai_enabled BOOLEAN DEFAULT false,
        ADD COLUMN IF NOT EXISTS ai_provider VARCHAR(20),
        ADD COLUMN IF NOT EXISTS anthropic_api_key TEXT,
        ADD COLUMN IF NOT EXISTS openai_api_key TEXT;
    `);
    console.log('Database migrations completed');
  } catch (error) {
    console.error('Migration error:', error);
  } finally {
    // Always hand the client back to the pool, even when the query failed.
    client.release();
  }
}
// Load environment variables
dotenv.config();
@ -47,9 +78,12 @@ app.use(
);
// Start server
app.listen(PORT, () => {
app.listen(PORT, async () => {
console.log(`PriceGhost API server running on port ${PORT}`);
// Run database migrations
await runMigrations();
// Start the background price checker
if (process.env.NODE_ENV !== 'test') {
startScheduler();

View file

@ -27,6 +27,13 @@ export interface NotificationSettings {
discord_webhook_url: string | null;
}
/**
 * Per-user AI extraction settings.
 *
 * Mirrors the four columns that `userQueries.getAISettings` selects from the
 * `users` table.
 */
export interface AISettings {
// Master switch: callers skip AI extraction entirely when this is falsy.
ai_enabled: boolean;
// Which provider to call; null means none has been chosen.
ai_provider: 'anthropic' | 'openai' | null;
// API key passed to the Anthropic client; null when not set.
anthropic_api_key: string | null;
// API key passed to the OpenAI client; null when not set.
openai_api_key: string | null;
}
export const userQueries = {
findByEmail: async (email: string): Promise<User | null> => {
const result = await pool.query(
@ -155,6 +162,50 @@ export const userQueries = {
);
return (result.rowCount ?? 0) > 0;
},
// Load the AI settings columns for one user; resolves to null when no row has that id.
getAISettings: async (id: number): Promise<AISettings | null> => {
  const { rows } = await pool.query(
    'SELECT ai_enabled, ai_provider, anthropic_api_key, openai_api_key FROM users WHERE id = $1',
    [id]
  );
  const [settingsRow] = rows;
  return settingsRow || null;
},
// Update whichever AI settings are present (not undefined) in `settings` and
// return the stored values. Resolves to null when nothing was supplied or when
// no row matched the id.
updateAISettings: async (
  id: number,
  settings: Partial<AISettings>
): Promise<AISettings | null> => {
  // Fixed column order keeps the generated SQL stable regardless of input key order.
  const columns = [
    'ai_enabled',
    'ai_provider',
    'anthropic_api_key',
    'openai_api_key',
  ] as const;

  const assignments: string[] = [];
  const params: (string | boolean | null)[] = [];

  for (const column of columns) {
    const value = settings[column];
    if (value === undefined) continue;
    params.push(value);
    // Placeholders are 1-based, so the index equals the length after the push.
    assignments.push(`${column} = $${params.length}`);
  }

  if (assignments.length === 0) return null;

  // The id takes the last placeholder.
  params.push(id.toString());

  const result = await pool.query(
    `UPDATE users SET ${assignments.join(', ')} WHERE id = $${params.length}
RETURNING ai_enabled, ai_provider, anthropic_api_key, openai_api_key`,
    params
  );
  return result.rows[0] || null;
},
};
// System settings queries

View file

@ -39,8 +39,8 @@ router.post('/', async (req: AuthRequest, res: Response) => {
return;
}
// Scrape product info
const scrapedData = await scrapeProduct(url);
// Scrape product info (pass userId for AI fallback)
const scrapedData = await scrapeProduct(url, userId);
// Allow adding out-of-stock products, but require a price for in-stock ones
if (!scrapedData.price && scrapedData.stockStatus !== 'out_of_stock') {

View file

@ -127,4 +127,89 @@ router.post('/notifications/test/discord', async (req: AuthRequest, res: Respons
}
});
// Get AI settings
// Responds with the enabled flag, the chosen provider, and one boolean per
// provider saying whether a key is stored.
router.get('/ai', async (req: AuthRequest, res: Response) => {
  try {
    const settings = await userQueries.getAISettings(req.userId!);

    if (settings) {
      const { ai_enabled, ai_provider, anthropic_api_key, openai_api_key } = settings;
      // Key values never leave the server; only their presence is reported.
      res.json({
        ai_enabled: ai_enabled || false,
        ai_provider: ai_provider || null,
        anthropic_configured: Boolean(anthropic_api_key),
        openai_configured: Boolean(openai_api_key),
      });
      return;
    }

    res.status(404).json({ error: 'User not found' });
  } catch (error) {
    console.error('Error fetching AI settings:', error);
    res.status(500).json({ error: 'Failed to fetch AI settings' });
  }
});
// Update AI settings
// Accepts any subset of { ai_enabled, ai_provider, anthropic_api_key,
// openai_api_key } in the JSON body. Fields that are present are validated
// before touching the database, so malformed input yields a 400 instead of
// being stored or surfacing as a database error (500).
router.put('/ai', async (req: AuthRequest, res: Response) => {
  try {
    const userId = req.userId!;
    const body = req.body ?? {};
    const { ai_enabled, anthropic_api_key, openai_api_key } = body;
    // An empty string means "no provider": store it as null. The GET endpoint
    // already reports '' as null, so the visible result is unchanged.
    const ai_provider = body.ai_provider === '' ? null : body.ai_provider;

    if (ai_enabled !== undefined && typeof ai_enabled !== 'boolean') {
      res.status(400).json({ error: 'ai_enabled must be a boolean' });
      return;
    }

    if (
      ai_provider !== undefined &&
      ai_provider !== null &&
      ai_provider !== 'anthropic' &&
      ai_provider !== 'openai'
    ) {
      res.status(400).json({ error: 'ai_provider must be "anthropic", "openai", or null' });
      return;
    }

    // API keys may be set (string) or cleared (null); anything else is rejected.
    for (const [field, value] of Object.entries({ anthropic_api_key, openai_api_key })) {
      if (value !== undefined && value !== null && typeof value !== 'string') {
        res.status(400).json({ error: `${field} must be a string or null` });
        return;
      }
    }

    const settings = await userQueries.updateAISettings(userId, {
      ai_enabled,
      ai_provider,
      anthropic_api_key,
      openai_api_key,
    });

    if (!settings) {
      res.status(400).json({ error: 'No settings to update' });
      return;
    }

    // As in the GET handler, report only whether each key is set.
    res.json({
      ai_enabled: settings.ai_enabled || false,
      ai_provider: settings.ai_provider || null,
      anthropic_configured: !!settings.anthropic_api_key,
      openai_configured: !!settings.openai_api_key,
      message: 'AI settings updated successfully',
    });
  } catch (error) {
    console.error('Error updating AI settings:', error);
    res.status(500).json({ error: 'Failed to update AI settings' });
  }
});
// Test AI extraction
// Runs AI extraction for the authenticated user against a URL from the request
// body and returns the extraction result plus a `success` flag (true when a
// price was found).
router.post('/ai/test', async (req: AuthRequest, res: Response) => {
  try {
    const userId = req.userId!;
    const { url } = req.body;

    if (typeof url !== 'string' || !url) {
      res.status(400).json({ error: 'URL is required' });
      return;
    }

    // The server fetches this URL itself, so reject anything that is not a
    // well-formed http(s) URL up front instead of failing later with a 500.
    let parsedUrl: URL;
    try {
      parsedUrl = new URL(url);
    } catch {
      res.status(400).json({ error: 'Invalid URL' });
      return;
    }
    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
      res.status(400).json({ error: 'URL must use http or https' });
      return;
    }
    // NOTE(review): http(s) URLs pointing at internal/private hosts are still
    // fetched; consider a host allow/deny policy if that is a concern.

    const settings = await userQueries.getAISettings(userId);
    if (!settings?.ai_enabled) {
      res.status(400).json({ error: 'AI extraction is not enabled' });
      return;
    }

    // Loaded lazily, as in the original handler.
    const { extractWithAI } = await import('../services/ai-extractor');
    const result = await extractWithAI(url, settings);

    res.json({
      success: !!result.price,
      ...result,
    });
  } catch (error) {
    console.error('Error testing AI extraction:', error);
    res.status(500).json({ error: 'Failed to test AI extraction' });
  }
});
export default router;

View file

@ -0,0 +1,258 @@
import Anthropic from '@anthropic-ai/sdk';
import OpenAI from 'openai';
import axios from 'axios';
import { load } from 'cheerio';
import { AISettings } from '../models';
import { ParsedPrice } from '../utils/priceParser';
import { StockStatus } from './scraper';
/**
 * Normalized product data produced from a model reply by `parseAIResponse`.
 */
export interface AIExtractionResult {
// Product name, or null when the reply did not contain one.
name: string | null;
// Parsed price with currency, or null when no positive price was found.
price: ParsedPrice | null;
// Main product image URL, or null when the reply did not contain one.
imageUrl: string | null;
// Stock state; 'unknown' when the reply gives no recognizable status.
stockStatus: StockStatus;
// Model-reported confidence (the prompt asks for 0 to 1); the parser falls
// back to 0.5 when it is missing and to 0 when the reply cannot be parsed.
confidence: number;
}
// Instruction preamble sent to the model. The prepared page content is
// appended directly after the trailing "HTML Content:" line by
// extractWithAnthropic / extractWithOpenAI. The JSON field names listed here
// are the ones parseAIResponse reads back, so keep the two in sync.
const EXTRACTION_PROMPT = `You are a price extraction assistant. Analyze the following HTML content from a product page and extract the product information.
Return a JSON object with these fields:
- name: The product name/title (string or null)
- price: The current selling price as a number (not the original/crossed-out price)
- currency: The currency code (USD, EUR, GBP, etc.)
- imageUrl: The main product image URL (string or null)
- stockStatus: One of "in_stock", "out_of_stock", or "unknown"
- confidence: Your confidence in the extraction from 0 to 1
Important:
- Extract the CURRENT/SALE price, not the original price if there's a discount
- If you can't find a price with confidence, set price to null
- Only return valid JSON, no explanation text
HTML Content:
`;
/**
 * Reduces a product page to the text most useful for price extraction and
 * truncates it to fit within the model's input budget.
 *
 * Steps:
 *  1. Collect JSON-LD blocks that mention "price".
 *  2. Strip non-content elements and HTML comments.
 *  3. Prefer the first product-looking section with more than 500 characters
 *     of markup, falling back to the whole body.
 *  4. Prepend the JSON-LD data and truncate to `maxLength` characters.
 *
 * @param html Raw page HTML.
 * @param maxLength Maximum number of characters kept before the truncation
 *   marker is appended (default 15000).
 * @returns The condensed content to append to the extraction prompt.
 */
function prepareHtmlForAI(html: string, maxLength = 15000): string {
  const $ = load(html);

  // JSON-LD must be collected BEFORE the cleanup below: that cleanup removes
  // every <script> element, including application/ld+json ones, so scanning
  // afterwards would never find any structured data.
  const jsonLdScripts: string[] = [];
  $('script[type="application/ld+json"]').each((_, el) => {
    const scriptContent = $(el).html();
    if (scriptContent && scriptContent.includes('price')) {
      jsonLdScripts.push(scriptContent);
    }
  });

  // Remove script, style, and other non-content elements
  $('script, style, noscript, iframe, svg, path, meta, link, comment').remove();

  // HTML comments are nodes, not elements, so the tag selector above does not
  // match them; drop them explicitly to save space.
  $('*')
    .contents()
    .filter((_, node) => node.type === 'comment')
    .remove();

  // Default to the whole body (or the raw input if there is no body)
  let content = $('body').html() || html;

  // Try to focus on product-related sections if possible; the first selector
  // yielding a reasonably large section wins.
  const productSelectors = [
    '[itemtype*="Product"]',
    '[class*="product"]',
    '[id*="product"]',
    '[class*="pdp"]',
    'main',
    '[role="main"]',
  ];

  for (const selector of productSelectors) {
    const sectionHtml = $(selector).first().html();
    if (sectionHtml && sectionHtml.length > 500) {
      content = sectionHtml;
      break;
    }
  }

  // Put JSON-LD first so it survives truncation
  let finalContent = content;
  if (jsonLdScripts.length > 0) {
    finalContent = `JSON-LD Data:\n${jsonLdScripts.join('\n')}\n\nHTML Content:\n${content}`;
  }

  // Truncate to stay within token limits
  if (finalContent.length > maxLength) {
    finalContent = finalContent.substring(0, maxLength) + '\n... [truncated]';
  }

  return finalContent;
}
/**
 * Extracts product data from page HTML using the Anthropic Messages API.
 *
 * @param html Raw page HTML; it is condensed with `prepareHtmlForAI` first.
 * @param apiKey Anthropic API key used to construct the client.
 * @returns The parsed extraction result.
 * @throws Error when the reply has no leading text block; errors raised by
 *   the SDK call propagate unchanged.
 */
async function extractWithAnthropic(
  html: string,
  apiKey: string
): Promise<AIExtractionResult> {
  const anthropic = new Anthropic({ apiKey });

  const preparedHtml = prepareHtmlForAI(html);

  const response = await anthropic.messages.create({
    model: 'claude-3-haiku-20240307',
    max_tokens: 1024,
    messages: [
      {
        role: 'user',
        content: EXTRACTION_PROMPT + preparedHtml,
      },
    ],
  });

  // Guard the empty-content case too: indexing an empty array yields
  // undefined, and reading `.type` on it would throw an opaque TypeError.
  const content = response.content[0];
  if (!content || content.type !== 'text') {
    throw new Error('Unexpected response type from Anthropic');
  }

  return parseAIResponse(content.text);
}
/**
 * Extracts product data from page HTML using the OpenAI Chat Completions API.
 *
 * @param html Raw page HTML; it is condensed with `prepareHtmlForAI` first.
 * @param apiKey OpenAI API key used to construct the client.
 * @returns The parsed extraction result.
 * @throws Error when the first choice carries no message content.
 */
async function extractWithOpenAI(
  html: string,
  apiKey: string
): Promise<AIExtractionResult> {
  const client = new OpenAI({ apiKey });
  const prompt = EXTRACTION_PROMPT + prepareHtmlForAI(html);

  const completion = await client.chat.completions.create({
    model: 'gpt-4o-mini',
    max_tokens: 1024,
    messages: [{ role: 'user', content: prompt }],
  });

  const reply = completion.choices[0]?.message?.content;
  if (reply) {
    return parseAIResponse(reply);
  }

  throw new Error('No response from OpenAI');
}
/**
 * Converts a model reply into a normalized `AIExtractionResult`.
 *
 * The reply may be bare JSON, JSON inside a markdown code fence, or JSON
 * surrounded by prose. Every field is type-checked individually, so one
 * malformed field (for example a non-string `stockStatus`) no longer discards
 * an otherwise valid price.
 *
 * Field rules:
 *  - price: a positive finite number, or a string from which one can be
 *    parsed; anything else gives `null`. Currency defaults to 'USD'.
 *  - stockStatus: normalized to 'in_stock' / 'out_of_stock', else 'unknown'.
 *  - confidence: a number or numeric string, clamped to [0, 1]. A reported 0
 *    is kept as 0; a missing or unusable value falls back to 0.5.
 *
 * @param responseText Raw text returned by the model.
 * @returns The normalized result. When the reply is not a JSON object, every
 *   field is empty and `confidence` is 0.
 */
function parseAIResponse(responseText: string): AIExtractionResult {
  // Try to extract JSON from the response
  let jsonStr = responseText.trim();

  // Handle markdown code blocks
  const fencedMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (fencedMatch) {
    jsonStr = fencedMatch[1].trim();
  }

  // Try to find JSON object in the response (outermost braces)
  const objectMatch = jsonStr.match(/\{[\s\S]*\}/);
  if (objectMatch) {
    jsonStr = objectMatch[0];
  }

  try {
    const parsed: unknown = JSON.parse(jsonStr);
    if (typeof parsed !== 'object' || parsed === null) {
      throw new Error('AI response is not a JSON object');
    }
    // Narrowed by the runtime check above; individual fields stay unknown.
    const data = parsed as Record<string, unknown>;

    // Price: accept numbers, or strings such as "$1,299.00".
    let price: ParsedPrice | null = null;
    const rawPrice = data.price;
    let priceNum = NaN;
    if (typeof rawPrice === 'number') {
      priceNum = rawPrice;
    } else if (typeof rawPrice === 'string') {
      priceNum = parseFloat(rawPrice.replace(/[^0-9.]/g, ''));
    }
    if (Number.isFinite(priceNum) && priceNum > 0) {
      price = {
        price: priceNum,
        currency: typeof data.currency === 'string' && data.currency ? data.currency : 'USD',
      };
    }

    // Stock status: tolerate variants such as "In Stock" or "out-of-stock".
    let stockStatus: StockStatus = 'unknown';
    if (typeof data.stockStatus === 'string') {
      const status = data.stockStatus.toLowerCase().replace(/[^a-z_]/g, '');
      if (status === 'in_stock' || status === 'instock') {
        stockStatus = 'in_stock';
      } else if (status === 'out_of_stock' || status === 'outofstock') {
        stockStatus = 'out_of_stock';
      }
    }

    // Confidence: `||` would turn a reported 0 into 0.5, so check the type.
    const rawConfidence =
      typeof data.confidence === 'string' ? parseFloat(data.confidence) : data.confidence;
    const confidence =
      typeof rawConfidence === 'number' && Number.isFinite(rawConfidence)
        ? Math.min(1, Math.max(0, rawConfidence))
        : 0.5;

    const nonEmptyString = (value: unknown): string | null =>
      typeof value === 'string' && value ? value : null;

    return {
      name: nonEmptyString(data.name),
      price,
      // Some replies use `image` instead of `imageUrl`.
      imageUrl: nonEmptyString(data.imageUrl) ?? nonEmptyString(data.image),
      stockStatus,
      confidence,
    };
  } catch (error) {
    console.error('Failed to parse AI response:', responseText);
    return {
      name: null,
      price: null,
      imageUrl: null,
      stockStatus: 'unknown',
      confidence: 0,
    };
  }
}
/**
 * Fetches a product page and runs AI extraction with the configured provider.
 *
 * The provider configuration is checked BEFORE the page is fetched, so a
 * missing provider or API key fails fast without a pointless outbound request.
 *
 * @param url Product page URL to fetch.
 * @param settings The user's AI settings; `ai_provider` and the matching API
 *   key must both be set.
 * @returns The extraction result from the selected provider.
 * @throws Error 'No valid AI provider configured' when no provider/key pair is
 *   usable; fetch and provider errors propagate unchanged.
 */
export async function extractWithAI(
  url: string,
  settings: AISettings
): Promise<AIExtractionResult> {
  // Pick the configured provider first
  let extract: (html: string) => Promise<AIExtractionResult>;
  if (settings.ai_provider === 'anthropic' && settings.anthropic_api_key) {
    const apiKey = settings.anthropic_api_key;
    extract = (html) => extractWithAnthropic(html, apiKey);
  } else if (settings.ai_provider === 'openai' && settings.openai_api_key) {
    const apiKey = settings.openai_api_key;
    extract = (html) => extractWithOpenAI(html, apiKey);
  } else {
    throw new Error('No valid AI provider configured');
  }

  // Fetch the page HTML with browser-like headers
  const response = await axios.get<string>(url, {
    headers: {
      'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
      Accept:
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    },
    timeout: 20000,
  });

  return extract(response.data);
}
/**
 * Best-effort AI extraction on already-fetched HTML, exported for use in the
 * scraper as a fallback.
 *
 * Never throws: any failure is logged and reported as `null`.
 *
 * @param url Page URL, used only in log messages.
 * @param html Page HTML to analyze.
 * @param userId User whose AI settings decide whether and how to extract.
 * @returns The extraction result, or `null` when AI is disabled, no provider
 *   and key pair is configured, or extraction failed.
 */
export async function tryAIExtraction(
  url: string,
  html: string,
  userId: number
): Promise<AIExtractionResult | null> {
  try {
    // Import dynamically to avoid circular dependencies
    const models = await import('../models');
    const settings = await models.userQueries.getAISettings(userId);

    if (!settings || !settings.ai_enabled) {
      return null;
    }

    const {
      ai_provider: provider,
      anthropic_api_key: anthropicKey,
      openai_api_key: openaiKey,
    } = settings;

    // Dispatch to whichever provider is selected and has a key
    if (provider === 'anthropic' && anthropicKey) {
      console.log(`[AI] Using Anthropic for ${url}`);
      return await extractWithAnthropic(html, anthropicKey);
    }

    if (provider === 'openai' && openaiKey) {
      console.log(`[AI] Using OpenAI for ${url}`);
      return await extractWithOpenAI(html, openaiKey);
    }

    return null;
  } catch (err) {
    console.error(`[AI] Extraction failed for ${url}:`, err);
    return null;
  }
}

View file

@ -23,7 +23,7 @@ async function checkPrices(): Promise<void> {
try {
console.log(`Checking price for product ${product.id}: ${product.url}`);
const scrapedData = await scrapeProduct(product.url);
const scrapedData = await scrapeProduct(product.url, product.user_id);
// Check for back-in-stock notification
const wasOutOfStock = product.stock_status === 'out_of_stock';

View file

@ -664,7 +664,7 @@ const genericImageSelectors = [
'img[class*="product"]',
];
export async function scrapeProduct(url: string): Promise<ScrapedProduct> {
export async function scrapeProduct(url: string, userId?: number): Promise<ScrapedProduct> {
const result: ScrapedProduct = {
name: null,
price: null,
@ -673,8 +673,9 @@ export async function scrapeProduct(url: string): Promise<ScrapedProduct> {
stockStatus: 'unknown',
};
let html: string = '';
try {
let html: string;
let usedBrowser = false;
try {
@ -766,6 +767,26 @@ export async function scrapeProduct(url: string): Promise<ScrapedProduct> {
if (!result.imageUrl) {
result.imageUrl = $('meta[property="og:image"]').attr('content') || null;
}
// If we still don't have a price and userId is provided, try AI extraction
if (!result.price && userId && html) {
try {
const { tryAIExtraction } = await import('./ai-extractor');
const aiResult = await tryAIExtraction(url, html, userId);
if (aiResult && aiResult.price && aiResult.confidence > 0.5) {
console.log(`[AI] Successfully extracted price for ${url}: ${aiResult.price.price} (confidence: ${aiResult.confidence})`);
result.price = aiResult.price;
if (!result.name && aiResult.name) result.name = aiResult.name;
if (!result.imageUrl && aiResult.imageUrl) result.imageUrl = aiResult.imageUrl;
if (result.stockStatus === 'unknown' && aiResult.stockStatus !== 'unknown') {
result.stockStatus = aiResult.stockStatus;
}
}
} catch (aiError) {
console.error(`[AI] Extraction failed for ${url}:`, aiError);
}
}
} catch (error) {
console.error(`Error scraping ${url}:`, error);
}