mirror of
https://github.com/clucraft/PriceGhost.git
synced 2026-05-15 10:52:36 +02:00
Add AI-powered price extraction fallback
- Add AI extraction service supporting Anthropic (Claude) and OpenAI - Add AI settings UI in Settings page with provider selection - Add database migration for AI settings columns - Integrate AI fallback into scraper when standard methods fail - Add API endpoints for AI settings and test extraction Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
cfca33b4ea
commit
d98138fe7c
11 changed files with 887 additions and 10 deletions
258
backend/src/services/ai-extractor.ts
Normal file
258
backend/src/services/ai-extractor.ts
Normal file
|
|
@ -0,0 +1,258 @@
|
|||
import Anthropic from '@anthropic-ai/sdk';
|
||||
import OpenAI from 'openai';
|
||||
import axios from 'axios';
|
||||
import { load } from 'cheerio';
|
||||
import { AISettings } from '../models';
|
||||
import { ParsedPrice } from '../utils/priceParser';
|
||||
import { StockStatus } from './scraper';
|
||||
|
||||
export interface AIExtractionResult {
|
||||
name: string | null;
|
||||
price: ParsedPrice | null;
|
||||
imageUrl: string | null;
|
||||
stockStatus: StockStatus;
|
||||
confidence: number;
|
||||
}
|
||||
|
||||
const EXTRACTION_PROMPT = `You are a price extraction assistant. Analyze the following HTML content from a product page and extract the product information.
|
||||
|
||||
Return a JSON object with these fields:
|
||||
- name: The product name/title (string or null)
|
||||
- price: The current selling price as a number (not the original/crossed-out price)
|
||||
- currency: The currency code (USD, EUR, GBP, etc.)
|
||||
- imageUrl: The main product image URL (string or null)
|
||||
- stockStatus: One of "in_stock", "out_of_stock", or "unknown"
|
||||
- confidence: Your confidence in the extraction from 0 to 1
|
||||
|
||||
Important:
|
||||
- Extract the CURRENT/SALE price, not the original price if there's a discount
|
||||
- If you can't find a price with confidence, set price to null
|
||||
- Only return valid JSON, no explanation text
|
||||
|
||||
HTML Content:
|
||||
`;
|
||||
|
||||
// Truncate HTML to fit within token limits while preserving important content
|
||||
function prepareHtmlForAI(html: string): string {
|
||||
const $ = load(html);
|
||||
|
||||
// Remove script, style, and other non-content elements
|
||||
$('script, style, noscript, iframe, svg, path, meta, link, comment').remove();
|
||||
|
||||
// Get the body content
|
||||
let content = $('body').html() || html;
|
||||
|
||||
// Try to focus on product-related sections if possible
|
||||
const productSelectors = [
|
||||
'[itemtype*="Product"]',
|
||||
'[class*="product"]',
|
||||
'[id*="product"]',
|
||||
'[class*="pdp"]',
|
||||
'main',
|
||||
'[role="main"]',
|
||||
];
|
||||
|
||||
for (const selector of productSelectors) {
|
||||
const section = $(selector).first();
|
||||
if (section.length && section.html() && section.html()!.length > 500) {
|
||||
content = section.html()!;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Also extract JSON-LD data which often contains product info
|
||||
const jsonLdScripts: string[] = [];
|
||||
$('script[type="application/ld+json"]').each((_, el) => {
|
||||
const scriptContent = $(el).html();
|
||||
if (scriptContent && scriptContent.includes('price')) {
|
||||
jsonLdScripts.push(scriptContent);
|
||||
}
|
||||
});
|
||||
|
||||
// Combine content with JSON-LD data
|
||||
let finalContent = content;
|
||||
if (jsonLdScripts.length > 0) {
|
||||
finalContent = `JSON-LD Data:\n${jsonLdScripts.join('\n')}\n\nHTML Content:\n${content}`;
|
||||
}
|
||||
|
||||
// Truncate to ~15000 characters to stay within token limits
|
||||
if (finalContent.length > 15000) {
|
||||
finalContent = finalContent.substring(0, 15000) + '\n... [truncated]';
|
||||
}
|
||||
|
||||
return finalContent;
|
||||
}
|
||||
|
||||
async function extractWithAnthropic(
|
||||
html: string,
|
||||
apiKey: string
|
||||
): Promise<AIExtractionResult> {
|
||||
const anthropic = new Anthropic({ apiKey });
|
||||
|
||||
const preparedHtml = prepareHtmlForAI(html);
|
||||
|
||||
const response = await anthropic.messages.create({
|
||||
model: 'claude-3-haiku-20240307',
|
||||
max_tokens: 1024,
|
||||
messages: [
|
||||
{
|
||||
role: 'user',
|
||||
content: EXTRACTION_PROMPT + preparedHtml,
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const content = response.content[0];
|
||||
if (content.type !== 'text') {
|
||||
throw new Error('Unexpected response type from Anthropic');
|
||||
}
|
||||
|
||||
return parseAIResponse(content.text);
|
||||
}
|
||||
|
||||
async function extractWithOpenAI(
|
||||
html: string,
|
||||
apiKey: string
|
||||
): Promise<AIExtractionResult> {
|
||||
const openai = new OpenAI({ apiKey });
|
||||
|
||||
const preparedHtml = prepareHtmlForAI(html);
|
||||
|
||||
const response = await openai.chat.completions.create({
|
||||
model: 'gpt-4o-mini',
|
||||
max_tokens: 1024,
|
||||
messages: [
|
||||
{
|
||||
role: 'user',
|
||||
content: EXTRACTION_PROMPT + preparedHtml,
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const content = response.choices[0]?.message?.content;
|
||||
if (!content) {
|
||||
throw new Error('No response from OpenAI');
|
||||
}
|
||||
|
||||
return parseAIResponse(content);
|
||||
}
|
||||
|
||||
function parseAIResponse(responseText: string): AIExtractionResult {
|
||||
// Try to extract JSON from the response
|
||||
let jsonStr = responseText.trim();
|
||||
|
||||
// Handle markdown code blocks
|
||||
const jsonMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
|
||||
if (jsonMatch) {
|
||||
jsonStr = jsonMatch[1].trim();
|
||||
}
|
||||
|
||||
// Try to find JSON object in the response
|
||||
const objectMatch = jsonStr.match(/\{[\s\S]*\}/);
|
||||
if (objectMatch) {
|
||||
jsonStr = objectMatch[0];
|
||||
}
|
||||
|
||||
try {
|
||||
const data = JSON.parse(jsonStr);
|
||||
|
||||
let price: ParsedPrice | null = null;
|
||||
if (data.price !== null && data.price !== undefined) {
|
||||
const priceNum = typeof data.price === 'string'
|
||||
? parseFloat(data.price.replace(/[^0-9.]/g, ''))
|
||||
: data.price;
|
||||
|
||||
if (!isNaN(priceNum) && priceNum > 0) {
|
||||
price = {
|
||||
price: priceNum,
|
||||
currency: data.currency || 'USD',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
let stockStatus: StockStatus = 'unknown';
|
||||
if (data.stockStatus) {
|
||||
const status = data.stockStatus.toLowerCase().replace(/[^a-z_]/g, '');
|
||||
if (status === 'in_stock' || status === 'instock') {
|
||||
stockStatus = 'in_stock';
|
||||
} else if (status === 'out_of_stock' || status === 'outofstock') {
|
||||
stockStatus = 'out_of_stock';
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
name: data.name || null,
|
||||
price,
|
||||
imageUrl: data.imageUrl || data.image || null,
|
||||
stockStatus,
|
||||
confidence: data.confidence || 0.5,
|
||||
};
|
||||
} catch (error) {
|
||||
console.error('Failed to parse AI response:', responseText);
|
||||
return {
|
||||
name: null,
|
||||
price: null,
|
||||
imageUrl: null,
|
||||
stockStatus: 'unknown',
|
||||
confidence: 0,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
export async function extractWithAI(
|
||||
url: string,
|
||||
settings: AISettings
|
||||
): Promise<AIExtractionResult> {
|
||||
// Fetch the page HTML
|
||||
const response = await axios.get<string>(url, {
|
||||
headers: {
|
||||
'User-Agent':
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
Accept:
|
||||
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
||||
},
|
||||
timeout: 20000,
|
||||
});
|
||||
|
||||
const html = response.data;
|
||||
|
||||
// Use the configured provider
|
||||
if (settings.ai_provider === 'anthropic' && settings.anthropic_api_key) {
|
||||
return extractWithAnthropic(html, settings.anthropic_api_key);
|
||||
} else if (settings.ai_provider === 'openai' && settings.openai_api_key) {
|
||||
return extractWithOpenAI(html, settings.openai_api_key);
|
||||
}
|
||||
|
||||
throw new Error('No valid AI provider configured');
|
||||
}
|
||||
|
||||
// Export for use in scraper as fallback
|
||||
export async function tryAIExtraction(
|
||||
url: string,
|
||||
html: string,
|
||||
userId: number
|
||||
): Promise<AIExtractionResult | null> {
|
||||
try {
|
||||
// Import dynamically to avoid circular dependencies
|
||||
const { userQueries } = await import('../models');
|
||||
const settings = await userQueries.getAISettings(userId);
|
||||
|
||||
if (!settings?.ai_enabled) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Use the configured provider
|
||||
if (settings.ai_provider === 'anthropic' && settings.anthropic_api_key) {
|
||||
console.log(`[AI] Using Anthropic for ${url}`);
|
||||
return await extractWithAnthropic(html, settings.anthropic_api_key);
|
||||
} else if (settings.ai_provider === 'openai' && settings.openai_api_key) {
|
||||
console.log(`[AI] Using OpenAI for ${url}`);
|
||||
return await extractWithOpenAI(html, settings.openai_api_key);
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (error) {
|
||||
console.error(`[AI] Extraction failed for ${url}:`, error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue