Add AI-powered price extraction fallback

- Add AI extraction service supporting Anthropic (Claude) and OpenAI
- Add AI settings UI in Settings page with provider selection
- Add database migration for AI settings columns
- Integrate AI fallback into scraper when standard methods fail
- Add API endpoints for AI settings and test extraction

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
clucraft 2026-01-21 21:49:55 -05:00
parent cfca33b4ea
commit d98138fe7c
11 changed files with 887 additions and 10 deletions

View file

@ -9,6 +9,37 @@ import settingsRoutes from './routes/settings';
import profileRoutes from './routes/profile';
import adminRoutes from './routes/admin';
import { startScheduler } from './services/scheduler';
import pool from './config/database';
/**
 * Applies the startup schema migration for the per-user AI settings.
 *
 * Adds `ai_enabled`, `ai_provider`, `anthropic_api_key` and `openai_api_key`
 * to the `users` table when they are missing. The statement is idempotent,
 * so running it on every boot is safe.
 *
 * A failing query is logged and not rethrown, so it does not reject the
 * returned promise. The pooled client is always released.
 */
async function runMigrations(): Promise<void> {
  const client = await pool.connect();
  try {
    // ADD COLUMN IF NOT EXISTS checks the very table being altered. A lookup in
    // information_schema.columns keyed only on table_name can be satisfied by a
    // `users` table living in a different schema, which would skip the ALTER.
    await client.query(`
      ALTER TABLE users
        ADD COLUMN IF NOT EXISTS ai_enabled BOOLEAN DEFAULT false,
        ADD COLUMN IF NOT EXISTS ai_provider VARCHAR(20),
        ADD COLUMN IF NOT EXISTS anthropic_api_key TEXT,
        ADD COLUMN IF NOT EXISTS openai_api_key TEXT;
    `);
    console.log('Database migrations completed');
  } catch (error) {
    console.error('Migration error:', error);
  } finally {
    client.release();
  }
}
// Load environment variables
dotenv.config();
@ -47,9 +78,12 @@ app.use(
);
// Start server
app.listen(PORT, () => {
app.listen(PORT, async () => {
console.log(`PriceGhost API server running on port ${PORT}`);
// Run database migrations
await runMigrations();
// Start the background price checker
if (process.env.NODE_ENV !== 'test') {
startScheduler();

View file

@ -27,6 +27,13 @@ export interface NotificationSettings {
discord_webhook_url: string | null;
}
/** Per-user AI extraction settings, matching the AI columns selected from the users table. */
export interface AISettings {
/** Whether AI extraction is switched on for the user. */
ai_enabled: boolean;
/** Provider to use for extraction, or null when none is selected. */
ai_provider: 'anthropic' | 'openai' | null;
/** Stored Anthropic API key, or null when none is set. */
anthropic_api_key: string | null;
/** Stored OpenAI API key, or null when none is set. */
openai_api_key: string | null;
}
export const userQueries = {
findByEmail: async (email: string): Promise<User | null> => {
const result = await pool.query(
@ -155,6 +162,50 @@ export const userQueries = {
);
return (result.rowCount ?? 0) > 0;
},
/**
 * Reads the four AI settings columns for the user with the given id.
 * Resolves to null when the query returns no row.
 */
getAISettings: async (id: number): Promise<AISettings | null> => {
  const sql =
    'SELECT ai_enabled, ai_provider, anthropic_api_key, openai_api_key FROM users WHERE id = $1';
  const { rows } = await pool.query(sql, [id]);
  return rows[0] || null;
},
/**
 * Updates whichever AI settings are present (not `undefined`) in `settings`
 * for the given user and resolves to the row's resulting AI settings.
 *
 * Resolves to null when `settings` carries nothing to update, or when the
 * UPDATE matches no row.
 */
updateAISettings: async (
  id: number,
  settings: Partial<AISettings>
): Promise<AISettings | null> => {
  const columns = ['ai_enabled', 'ai_provider', 'anthropic_api_key', 'openai_api_key'] as const;
  const assignments: string[] = [];
  const params: (string | boolean | null)[] = [];

  for (const column of columns) {
    const value = settings[column];
    if (value === undefined) {
      continue;
    }
    // The placeholder number is the 1-based position of the value just pushed
    params.push(value);
    assignments.push(`${column} = $${params.length}`);
  }

  if (assignments.length === 0) {
    return null;
  }

  // The user id always takes the last placeholder
  params.push(id.toString());
  const { rows } = await pool.query(
    `UPDATE users SET ${assignments.join(', ')} WHERE id = $${params.length}
RETURNING ai_enabled, ai_provider, anthropic_api_key, openai_api_key`,
    params
  );
  return rows[0] || null;
},
};
// System settings queries

View file

@ -39,8 +39,8 @@ router.post('/', async (req: AuthRequest, res: Response) => {
return;
}
// Scrape product info
const scrapedData = await scrapeProduct(url);
// Scrape product info (pass userId for AI fallback)
const scrapedData = await scrapeProduct(url, userId);
// Allow adding out-of-stock products, but require a price for in-stock ones
if (!scrapedData.price && scrapedData.stockStatus !== 'out_of_stock') {

View file

@ -127,4 +127,89 @@ router.post('/notifications/test/discord', async (req: AuthRequest, res: Respons
}
});
// Get AI settings: returns the enabled flag, the provider, and whether each key is set
router.get('/ai', async (req: AuthRequest, res: Response) => {
  try {
    const aiSettings = await userQueries.getAISettings(req.userId!);

    if (aiSettings) {
      const { ai_enabled, ai_provider, anthropic_api_key, openai_api_key } = aiSettings;
      // Key values stay on the server; the response only says whether each one exists
      res.json({
        ai_enabled: ai_enabled || false,
        ai_provider: ai_provider || null,
        anthropic_configured: Boolean(anthropic_api_key),
        openai_configured: Boolean(openai_api_key),
      });
      return;
    }

    res.status(404).json({ error: 'User not found' });
  } catch (err) {
    console.error('Error fetching AI settings:', err);
    res.status(500).json({ error: 'Failed to fetch AI settings' });
  }
});
// Update AI settings
//
// Accepts any subset of ai_enabled, ai_provider, anthropic_api_key and
// openai_api_key. Responds 400 when a supplied value has the wrong shape or
// when nothing was updated, and 500 on unexpected failures. The response
// reports only whether each API key is set, never the key itself.
router.put('/ai', async (req: AuthRequest, res: Response) => {
  try {
    const userId = req.userId!;
    const { ai_enabled, ai_provider, anthropic_api_key, openai_api_key } = req.body ?? {};

    // The body is untrusted JSON. Reject malformed values up front so they
    // cannot reach the UPDATE and surface as a generic 500, or be stored as a
    // provider name that no extraction branch recognises.
    if (ai_enabled !== undefined && typeof ai_enabled !== 'boolean') {
      res.status(400).json({ error: 'ai_enabled must be a boolean' });
      return;
    }
    if (
      ai_provider !== undefined &&
      ai_provider !== null &&
      ai_provider !== 'anthropic' &&
      ai_provider !== 'openai'
    ) {
      res.status(400).json({ error: 'ai_provider must be "anthropic", "openai", or null' });
      return;
    }
    const isOptionalKey = (value: unknown): boolean =>
      value === undefined || value === null || typeof value === 'string';
    if (!isOptionalKey(anthropic_api_key) || !isOptionalKey(openai_api_key)) {
      res.status(400).json({ error: 'API keys must be strings or null' });
      return;
    }

    const settings = await userQueries.updateAISettings(userId, {
      ai_enabled,
      ai_provider,
      anthropic_api_key,
      openai_api_key,
    });

    if (!settings) {
      res.status(400).json({ error: 'No settings to update' });
      return;
    }

    res.json({
      ai_enabled: settings.ai_enabled || false,
      ai_provider: settings.ai_provider || null,
      anthropic_configured: !!settings.anthropic_api_key,
      openai_configured: !!settings.openai_api_key,
      message: 'AI settings updated successfully',
    });
  } catch (error) {
    console.error('Error updating AI settings:', error);
    res.status(500).json({ error: 'Failed to update AI settings' });
  }
});
// Test AI extraction: runs the user's configured AI provider against a URL
// and returns the extraction result plus a `success` flag (true when a price was found)
router.post('/ai/test', async (req: AuthRequest, res: Response) => {
  try {
    const userId = req.userId!;
    const targetUrl = req.body.url;

    if (!targetUrl) {
      res.status(400).json({ error: 'URL is required' });
      return;
    }

    const aiSettings = await userQueries.getAISettings(userId);
    const aiEnabled = aiSettings?.ai_enabled;
    if (!aiSettings || !aiEnabled) {
      res.status(400).json({ error: 'AI extraction is not enabled' });
      return;
    }

    // Loaded lazily, only once a test is actually going to run
    const extractor = await import('../services/ai-extractor');
    const extraction = await extractor.extractWithAI(targetUrl, aiSettings);

    res.json({ success: Boolean(extraction.price), ...extraction });
  } catch (err) {
    console.error('Error testing AI extraction:', err);
    res.status(500).json({ error: 'Failed to test AI extraction' });
  }
});
export default router;

View file

@ -0,0 +1,258 @@
import Anthropic from '@anthropic-ai/sdk';
import OpenAI from 'openai';
import axios from 'axios';
import { load } from 'cheerio';
import { AISettings } from '../models';
import { ParsedPrice } from '../utils/priceParser';
import { StockStatus } from './scraper';
/** Product data extracted from a page by an AI provider. */
export interface AIExtractionResult {
/** Product name, or null when none was extracted. */
name: string | null;
/** Extracted price with its currency, or null when no usable price was found. */
price: ParsedPrice | null;
/** Product image URL, or null when none was extracted. */
imageUrl: string | null;
/** Availability reported for the product. */
stockStatus: StockStatus;
/** Confidence reported for the extraction; the prompt asks the model for a value from 0 to 1. */
confidence: number;
}
// Instruction text sent to the AI provider. The prepared page content is
// appended directly after it, and the model is asked to answer with a single
// JSON object (name, price, currency, imageUrl, stockStatus, confidence).
const EXTRACTION_PROMPT = `You are a price extraction assistant. Analyze the following HTML content from a product page and extract the product information.
Return a JSON object with these fields:
- name: The product name/title (string or null)
- price: The current selling price as a number (not the original/crossed-out price)
- currency: The currency code (USD, EUR, GBP, etc.)
- imageUrl: The main product image URL (string or null)
- stockStatus: One of "in_stock", "out_of_stock", or "unknown"
- confidence: Your confidence in the extraction from 0 to 1
Important:
- Extract the CURRENT/SALE price, not the original price if there's a discount
- If you can't find a price with confidence, set price to null
- Only return valid JSON, no explanation text
HTML Content:
`;
/**
 * Reduces raw page HTML to a compact payload for the AI prompt.
 *
 * Steps:
 *  1. Collect JSON-LD `<script>` blocks that mention "price".
 *  2. Strip scripts, styles and other non-content elements.
 *  3. Narrow to the first product-looking section with more than 500
 *     characters of markup, falling back to the whole body (or the raw input).
 *  4. Prefix the collected JSON-LD and truncate to `maxLength` characters.
 *
 * @param html - Raw HTML of the product page.
 * @param maxLength - Maximum number of characters kept before the
 *   "[truncated]" marker is appended. Defaults to 15000.
 * @returns The prepared text to append to the extraction prompt.
 */
function prepareHtmlForAI(html: string, maxLength = 15000): string {
  const $ = load(html);

  // JSON-LD must be read BEFORE <script> elements are stripped below;
  // collecting it afterwards would never find anything.
  const jsonLdScripts: string[] = [];
  $('script[type="application/ld+json"]').each((_, el) => {
    const scriptContent = $(el).html();
    if (scriptContent && scriptContent.includes('price')) {
      jsonLdScripts.push(scriptContent);
    }
  });

  // Remove script, style, and other non-content elements
  $('script, style, noscript, iframe, svg, path, meta, link, comment').remove();

  // Get the body content
  let content = $('body').html() || html;

  // Try to focus on product-related sections if possible
  const productSelectors = [
    '[itemtype*="Product"]',
    '[class*="product"]',
    '[id*="product"]',
    '[class*="pdp"]',
    'main',
    '[role="main"]',
  ];
  for (const selector of productSelectors) {
    const sectionHtml = $(selector).first().html();
    // Ignore tiny matches (e.g. a badge whose class merely contains "product")
    if (sectionHtml && sectionHtml.length > 500) {
      content = sectionHtml;
      break;
    }
  }

  // JSON-LD goes first so that truncation cuts page markup, not structured data
  let finalContent = content;
  if (jsonLdScripts.length > 0) {
    finalContent = `JSON-LD Data:\n${jsonLdScripts.join('\n')}\n\nHTML Content:\n${content}`;
  }

  // Truncate to stay within token limits
  if (finalContent.length > maxLength) {
    finalContent = finalContent.substring(0, maxLength) + '\n... [truncated]';
  }
  return finalContent;
}
/**
 * Sends the prepared page content to Anthropic and parses the reply.
 *
 * @param html - Raw page HTML; it is reduced with prepareHtmlForAI before sending.
 * @param apiKey - Anthropic API key used for this request.
 * @returns The extraction result parsed from the first content block of the reply.
 * @throws Error when the reply has no content block or the first block is not text.
 */
async function extractWithAnthropic(
  html: string,
  apiKey: string
): Promise<AIExtractionResult> {
  const anthropic = new Anthropic({ apiKey });
  const preparedHtml = prepareHtmlForAI(html);

  const response = await anthropic.messages.create({
    model: 'claude-3-haiku-20240307',
    max_tokens: 1024,
    messages: [
      {
        role: 'user',
        content: EXTRACTION_PROMPT + preparedHtml,
      },
    ],
  });

  // An empty content array would otherwise fail with an opaque TypeError on
  // `content.type`; report it with the same descriptive error instead.
  const content = response.content[0];
  if (!content || content.type !== 'text') {
    throw new Error('Unexpected response type from Anthropic');
  }
  return parseAIResponse(content.text);
}
/**
 * Sends the prepared page content to OpenAI and parses the reply.
 *
 * @param html - Raw page HTML; it is reduced with prepareHtmlForAI before sending.
 * @param apiKey - OpenAI API key used for this request.
 * @returns The extraction result parsed from the first choice's message content.
 * @throws Error when the first choice carries no message content.
 */
async function extractWithOpenAI(
  html: string,
  apiKey: string
): Promise<AIExtractionResult> {
  const client = new OpenAI({ apiKey });
  const prompt = EXTRACTION_PROMPT + prepareHtmlForAI(html);

  const completion = await client.chat.completions.create({
    model: 'gpt-4o-mini',
    max_tokens: 1024,
    messages: [{ role: 'user', content: prompt }],
  });

  const reply = completion.choices[0]?.message?.content;
  if (reply) {
    return parseAIResponse(reply);
  }
  throw new Error('No response from OpenAI');
}
/** Returns `value` when it is a non-empty string, otherwise null. */
function asNonEmptyString(value: unknown): string | null {
  return typeof value === 'string' && value.length > 0 ? value : null;
}

/**
 * Parses the model's reply into an AIExtractionResult.
 *
 * The reply may be bare JSON, JSON inside a markdown code fence, or JSON
 * surrounded by stray text; the first `{ ... }` span found is parsed.
 * Each field is validated on its own, so one malformed field (for example a
 * non-string `stockStatus`) no longer discards an otherwise valid price.
 *
 * @param responseText - Raw text returned by the AI provider.
 * @returns The extracted fields. When the reply is not a JSON object, every
 *   field is null/'unknown' and `confidence` is 0. A missing or non-numeric
 *   confidence defaults to 0.5; numeric values are clamped to [0, 1].
 */
function parseAIResponse(responseText: string): AIExtractionResult {
  const emptyResult: AIExtractionResult = {
    name: null,
    price: null,
    imageUrl: null,
    stockStatus: 'unknown',
    confidence: 0,
  };

  let jsonStr = responseText.trim();

  // Handle markdown code blocks
  const fencedMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (fencedMatch) {
    jsonStr = fencedMatch[1].trim();
  }

  // Try to find JSON object in the response
  const objectMatch = jsonStr.match(/\{[\s\S]*\}/);
  if (objectMatch) {
    jsonStr = objectMatch[0];
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(jsonStr);
  } catch {
    console.error('Failed to parse AI response:', responseText);
    return emptyResult;
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    console.error('Failed to parse AI response:', responseText);
    return emptyResult;
  }
  const data = parsed as Record<string, unknown>;

  // Price: accept a number, or a string with currency symbols / separators
  const rawPrice = data.price;
  let priceNum = NaN;
  if (typeof rawPrice === 'number') {
    priceNum = rawPrice;
  } else if (typeof rawPrice === 'string') {
    priceNum = parseFloat(rawPrice.replace(/[^0-9.]/g, ''));
  }
  let price: ParsedPrice | null = null;
  if (Number.isFinite(priceNum) && priceNum > 0) {
    price = {
      price: priceNum,
      currency: asNonEmptyString(data.currency) ?? 'USD',
    };
  }

  // Stock status: tolerate variants such as "In Stock" or "outOfStock"
  let stockStatus: StockStatus = 'unknown';
  if (typeof data.stockStatus === 'string') {
    const status = data.stockStatus.toLowerCase().replace(/[^a-z_]/g, '');
    if (status === 'in_stock' || status === 'instock') {
      stockStatus = 'in_stock';
    } else if (status === 'out_of_stock' || status === 'outofstock') {
      stockStatus = 'out_of_stock';
    }
  }

  // Confidence: an explicit 0 must stay 0 rather than fall back to the default
  const rawConfidence = data.confidence;
  const confidence =
    typeof rawConfidence === 'number' && Number.isFinite(rawConfidence)
      ? Math.min(1, Math.max(0, rawConfidence))
      : 0.5;

  return {
    name: asNonEmptyString(data.name),
    price,
    imageUrl: asNonEmptyString(data.imageUrl) ?? asNonEmptyString(data.image),
    stockStatus,
    confidence,
  };
}
/**
 * Fetches a product page and runs AI extraction with the configured provider.
 *
 * The provider and its API key are checked before any network traffic, so a
 * misconfigured account fails immediately instead of after a page download
 * (which can take up to the 20 second request timeout).
 *
 * @param url - Product page to download.
 * @param settings - The user's AI settings; `ai_provider` selects the backend
 *   and the matching API key must be set.
 * @returns The extraction result from the selected provider.
 * @throws Error('No valid AI provider configured') when no provider is selected
 *   or its key is missing. Errors from the HTTP request or provider call propagate.
 */
export async function extractWithAI(
  url: string,
  settings: AISettings
): Promise<AIExtractionResult> {
  // Resolve the provider and key first
  const provider = settings.ai_provider;
  const apiKey =
    provider === 'anthropic'
      ? settings.anthropic_api_key
      : provider === 'openai'
        ? settings.openai_api_key
        : null;
  if (!provider || !apiKey) {
    throw new Error('No valid AI provider configured');
  }

  // Fetch the page HTML
  const response = await axios.get<string>(url, {
    headers: {
      'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
      Accept:
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    },
    timeout: 20000,
  });
  const html = response.data;

  return provider === 'anthropic'
    ? extractWithAnthropic(html, apiKey)
    : extractWithOpenAI(html, apiKey);
}
/**
 * Best-effort AI extraction for the scraper, using HTML that was already fetched.
 *
 * Looks up the user's AI settings and, when AI is enabled and the selected
 * provider has a key, runs that provider on `html`.
 *
 * @param url - Page URL, used only in log messages.
 * @param html - Page HTML to analyse.
 * @param userId - User whose AI settings decide whether and how to extract.
 * @returns The extraction result, or null when AI is disabled, no usable
 *   provider is configured, or anything fails (failures are logged, not thrown).
 */
export async function tryAIExtraction(
  url: string,
  html: string,
  userId: number
): Promise<AIExtractionResult | null> {
  try {
    // Import dynamically to avoid circular dependencies
    const { userQueries } = await import('../models');
    const aiSettings = await userQueries.getAISettings(userId);
    if (!aiSettings?.ai_enabled) {
      return null;
    }

    switch (aiSettings.ai_provider) {
      case 'anthropic':
        if (aiSettings.anthropic_api_key) {
          console.log(`[AI] Using Anthropic for ${url}`);
          return await extractWithAnthropic(html, aiSettings.anthropic_api_key);
        }
        break;
      case 'openai':
        if (aiSettings.openai_api_key) {
          console.log(`[AI] Using OpenAI for ${url}`);
          return await extractWithOpenAI(html, aiSettings.openai_api_key);
        }
        break;
    }
    return null;
  } catch (err) {
    console.error(`[AI] Extraction failed for ${url}:`, err);
    return null;
  }
}

View file

@ -23,7 +23,7 @@ async function checkPrices(): Promise<void> {
try {
console.log(`Checking price for product ${product.id}: ${product.url}`);
const scrapedData = await scrapeProduct(product.url);
const scrapedData = await scrapeProduct(product.url, product.user_id);
// Check for back-in-stock notification
const wasOutOfStock = product.stock_status === 'out_of_stock';

View file

@ -664,7 +664,7 @@ const genericImageSelectors = [
'img[class*="product"]',
];
export async function scrapeProduct(url: string): Promise<ScrapedProduct> {
export async function scrapeProduct(url: string, userId?: number): Promise<ScrapedProduct> {
const result: ScrapedProduct = {
name: null,
price: null,
@ -673,8 +673,9 @@ export async function scrapeProduct(url: string): Promise<ScrapedProduct> {
stockStatus: 'unknown',
};
let html: string = '';
try {
let html: string;
let usedBrowser = false;
try {
@ -766,6 +767,26 @@ export async function scrapeProduct(url: string): Promise<ScrapedProduct> {
if (!result.imageUrl) {
result.imageUrl = $('meta[property="og:image"]').attr('content') || null;
}
// If we still don't have a price and userId is provided, try AI extraction
if (!result.price && userId && html) {
try {
const { tryAIExtraction } = await import('./ai-extractor');
const aiResult = await tryAIExtraction(url, html, userId);
if (aiResult && aiResult.price && aiResult.confidence > 0.5) {
console.log(`[AI] Successfully extracted price for ${url}: ${aiResult.price.price} (confidence: ${aiResult.confidence})`);
result.price = aiResult.price;
if (!result.name && aiResult.name) result.name = aiResult.name;
if (!result.imageUrl && aiResult.imageUrl) result.imageUrl = aiResult.imageUrl;
if (result.stockStatus === 'unknown' && aiResult.stockStatus !== 'unknown') {
result.stockStatus = aiResult.stockStatus;
}
}
} catch (aiError) {
console.error(`[AI] Extraction failed for ${url}:`, aiError);
}
}
} catch (error) {
console.error(`Error scraping ${url}:`, error);
}