mirror of
https://github.com/clucraft/PriceGhost.git
synced 2026-04-25 00:36:32 +02:00
feat: Multi-strategy price voting system with user selection
- Add multi-strategy voting: runs JSON-LD, site-specific, generic CSS, and AI extraction methods in parallel
- Implement consensus voting to select the correct price when methods agree
- Add AI arbitration when extraction methods disagree
- Add PriceSelectionModal for users to select the correct price when ambiguous
- Store preferred extraction method per product for faster re-checks
- Add database columns for preferred_extraction_method and needs_price_review

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
40c45b49c8
commit
4fd04cd160
10 changed files with 1259 additions and 12 deletions
|
|
@ -151,6 +151,22 @@ async function runMigrations() {
|
|||
END $$;
|
||||
`);
|
||||
|
||||
// Add multi-strategy voting columns to products table
|
||||
await client.query(`
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'products' AND column_name = 'preferred_extraction_method') THEN
|
||||
ALTER TABLE products ADD COLUMN preferred_extraction_method VARCHAR(20);
|
||||
END IF;
|
||||
IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'products' AND column_name = 'needs_price_review') THEN
|
||||
ALTER TABLE products ADD COLUMN needs_price_review BOOLEAN DEFAULT false;
|
||||
END IF;
|
||||
IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'products' AND column_name = 'price_candidates') THEN
|
||||
ALTER TABLE products ADD COLUMN price_candidates JSONB;
|
||||
END IF;
|
||||
END $$;
|
||||
`);
|
||||
|
||||
// Create notification_history table for tracking all triggered notifications
|
||||
await client.query(`
|
||||
CREATE TABLE IF NOT EXISTS notification_history (
|
||||
|
|
|
|||
|
|
@ -551,6 +551,21 @@ export const productQueries = {
|
|||
);
|
||||
return result.rows;
|
||||
},
|
||||
|
||||
/**
 * Persist the extraction method to use for a product.
 *
 * Writes `method` into `products.preferred_extraction_method` and, in the
 * same statement, resets `needs_price_review` to false for the row whose
 * primary key is `id`.
 *
 * @param id - Primary key of the product row to update.
 * @param method - Extraction method identifier to store.
 */
updateExtractionMethod: async (id: number, method: string): Promise<void> => {
  const sql =
    'UPDATE products SET preferred_extraction_method = $1, needs_price_review = false WHERE id = $2';
  const params = [method, id];
  await pool.query(sql, params);
},
|
||||
|
||||
/**
 * Look up the extraction method stored for a product.
 *
 * @param id - Primary key of the product row.
 * @returns The stored `preferred_extraction_method`, or null when the product
 *          does not exist or the column is empty/NULL.
 */
getPreferredExtractionMethod: async (id: number): Promise<string | null> => {
  const { rows } = await pool.query(
    'SELECT preferred_extraction_method FROM products WHERE id = $1',
    [id]
  );
  const [firstRow] = rows;
  return firstRow?.preferred_extraction_method || null;
},
|
||||
};
|
||||
|
||||
// Price History types and queries
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import { Router, Response } from 'express';
|
||||
import { AuthRequest, authMiddleware } from '../middleware/auth';
|
||||
import { productQueries, priceHistoryQueries, stockStatusHistoryQueries } from '../models';
|
||||
import { scrapeProduct } from '../services/scraper';
|
||||
import { scrapeProduct, scrapeProductWithVoting, ExtractionMethod } from '../services/scraper';
|
||||
|
||||
const router = Router();
|
||||
|
||||
|
|
@ -20,11 +20,11 @@ router.get('/', async (req: AuthRequest, res: Response) => {
|
|||
}
|
||||
});
|
||||
|
||||
// Add a new product to track
|
||||
// Add a new product to track (with multi-strategy voting)
|
||||
router.post('/', async (req: AuthRequest, res: Response) => {
|
||||
try {
|
||||
const userId = req.userId!;
|
||||
const { url, refresh_interval } = req.body;
|
||||
const { url, refresh_interval, selectedPrice, selectedMethod } = req.body;
|
||||
|
||||
if (!url) {
|
||||
res.status(400).json({ error: 'URL is required' });
|
||||
|
|
@ -39,8 +39,47 @@ router.post('/', async (req: AuthRequest, res: Response) => {
|
|||
return;
|
||||
}
|
||||
|
||||
// Scrape product info (pass userId for AI fallback)
|
||||
const scrapedData = await scrapeProduct(url, userId);
|
||||
// If user is confirming a price selection, use the old scraper with their choice
|
||||
if (selectedPrice !== undefined && selectedMethod) {
|
||||
// User has selected a price from candidates - use it directly
|
||||
const scrapedData = await scrapeProduct(url, userId);
|
||||
|
||||
// Create product with the user-selected price
|
||||
const product = await productQueries.create(
|
||||
userId,
|
||||
url,
|
||||
scrapedData.name,
|
||||
scrapedData.imageUrl,
|
||||
refresh_interval || 3600,
|
||||
scrapedData.stockStatus
|
||||
);
|
||||
|
||||
// Store the preferred extraction method and the user-selected price
|
||||
await productQueries.updateExtractionMethod(product.id, selectedMethod);
|
||||
|
||||
// Record the user-selected price
|
||||
await priceHistoryQueries.create(
|
||||
product.id,
|
||||
selectedPrice,
|
||||
'USD', // TODO: Get currency from selection
|
||||
null
|
||||
);
|
||||
|
||||
// Record initial stock status
|
||||
if (scrapedData.stockStatus !== 'unknown') {
|
||||
await stockStatusHistoryQueries.recordChange(product.id, scrapedData.stockStatus);
|
||||
}
|
||||
|
||||
// Update last_checked timestamp
|
||||
await productQueries.updateLastChecked(product.id, product.refresh_interval);
|
||||
|
||||
const productWithPrice = await productQueries.findById(product.id, userId);
|
||||
res.status(201).json(productWithPrice);
|
||||
return;
|
||||
}
|
||||
|
||||
// Use multi-strategy voting scraper
|
||||
const scrapedData = await scrapeProductWithVoting(url, userId);
|
||||
|
||||
// Allow adding out-of-stock products, but require a price for in-stock ones
|
||||
if (!scrapedData.price && scrapedData.stockStatus !== 'out_of_stock') {
|
||||
|
|
@ -50,6 +89,26 @@ router.post('/', async (req: AuthRequest, res: Response) => {
|
|||
return;
|
||||
}
|
||||
|
||||
// If needsReview is true and there are multiple candidates, return them for user selection
|
||||
if (scrapedData.needsReview && scrapedData.priceCandidates.length > 1) {
|
||||
res.status(200).json({
|
||||
needsReview: true,
|
||||
name: scrapedData.name,
|
||||
imageUrl: scrapedData.imageUrl,
|
||||
stockStatus: scrapedData.stockStatus,
|
||||
priceCandidates: scrapedData.priceCandidates.map(c => ({
|
||||
price: c.price,
|
||||
currency: c.currency,
|
||||
method: c.method,
|
||||
context: c.context,
|
||||
confidence: c.confidence,
|
||||
})),
|
||||
suggestedPrice: scrapedData.price,
|
||||
url,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
// Create product with stock status
|
||||
const product = await productQueries.create(
|
||||
userId,
|
||||
|
|
@ -60,6 +119,11 @@ router.post('/', async (req: AuthRequest, res: Response) => {
|
|||
scrapedData.stockStatus
|
||||
);
|
||||
|
||||
// Store the extraction method that worked
|
||||
if (scrapedData.selectedMethod) {
|
||||
await productQueries.updateExtractionMethod(product.id, scrapedData.selectedMethod);
|
||||
}
|
||||
|
||||
// Record initial price if available
|
||||
if (scrapedData.price) {
|
||||
await priceHistoryQueries.create(
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import axios from 'axios';
|
|||
import { load } from 'cheerio';
|
||||
import { AISettings } from '../models';
|
||||
import { ParsedPrice } from '../utils/priceParser';
|
||||
import { StockStatus } from './scraper';
|
||||
import { StockStatus, PriceCandidate } from './scraper';
|
||||
|
||||
export interface AIExtractionResult {
|
||||
name: string | null;
|
||||
|
|
@ -548,3 +548,211 @@ export async function tryAIVerification(
|
|||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// Arbitration prompt for when multiple extraction methods disagree
|
||||
const ARBITRATION_PROMPT = `You are a price arbitration assistant. Multiple price extraction methods found different prices for the same product. Help determine the correct price.
|
||||
|
||||
Found prices:
|
||||
$CANDIDATES$
|
||||
|
||||
Analyze the HTML content below and determine which price is the correct CURRENT selling price for the main product.
|
||||
|
||||
Consider:
|
||||
- JSON-LD structured data is usually highly reliable (schema.org standard)
|
||||
- Site-specific extractors are well-tested for major retailers
|
||||
- Generic CSS selectors might catch wrong prices (shipping, savings, bundles, etc.)
|
||||
- Look for the price that appears in the main product display area
|
||||
- Ignore crossed-out/original prices, shipping costs, subscription prices, or bundle prices
|
||||
|
||||
Return a JSON object with:
|
||||
- selectedIndex: the 0-based index of the correct price from the list above
|
||||
- confidence: your confidence from 0 to 1
|
||||
- reason: brief explanation of why this price is correct
|
||||
|
||||
Only return valid JSON, no explanation text outside the JSON.
|
||||
|
||||
HTML Content:
|
||||
`;
|
||||
|
||||
/** Outcome of asking an AI provider to choose between disagreeing price candidates. */
export interface AIArbitrationResult {
  /** The candidate the AI picked, or null when no usable selection could be read from its reply. */
  selectedPrice: PriceCandidate | null;
  /** Confidence reported for the selection, on a 0 to 1 scale. */
  confidence: number;
  /** Short explanation of why the price was selected (or why no selection was made). */
  reason: string;
}
|
||||
|
||||
/**
 * Ask Anthropic to choose the correct price among disagreeing candidates.
 *
 * Builds the arbitration prompt (numbered candidate list + prepared HTML),
 * sends it as a single user message and parses the textual reply.
 *
 * @param html - Raw page HTML; it is reduced with `prepareHtmlForAI` before sending.
 * @param candidates - Price candidates; their array index is the index the model must return.
 * @param apiKey - Anthropic API key.
 * @returns The parsed arbitration result.
 * @throws Error when the reply has no content or its first content block is not text;
 *         errors raised by the Anthropic client propagate unchanged.
 */
async function arbitrateWithAnthropic(
  html: string,
  candidates: PriceCandidate[],
  apiKey: string
): Promise<AIArbitrationResult> {
  const anthropic = new Anthropic({ apiKey });

  const candidatesList = candidates.map((c, i) =>
    `${i}. ${c.price} ${c.currency} (method: ${c.method}, context: ${c.context || 'none'})`
  ).join('\n');

  const preparedHtml = prepareHtmlForAI(html);
  // A replacer function is used on purpose: candidate context is scraped page
  // text and may contain "$$", "$&", "$`" or "$'", which String.replace would
  // expand if the list were passed as a plain replacement string.
  const prompt = ARBITRATION_PROMPT.replace('$CANDIDATES$', () => candidatesList) + preparedHtml;

  const response = await anthropic.messages.create({
    model: 'claude-3-haiku-20240307',
    max_tokens: 512,
    messages: [{ role: 'user', content: prompt }],
  });

  const content = response.content[0];
  // An empty content array would otherwise surface as an opaque TypeError.
  if (!content || content.type !== 'text') {
    throw new Error('Unexpected response type from Anthropic');
  }

  return parseArbitrationResponse(content.text, candidates);
}
|
||||
|
||||
/**
 * Ask OpenAI to choose the correct price among disagreeing candidates.
 *
 * Builds the arbitration prompt (numbered candidate list + prepared HTML),
 * sends it as a single user message and parses the first choice's content.
 *
 * @param html - Raw page HTML; it is reduced with `prepareHtmlForAI` before sending.
 * @param candidates - Price candidates; their array index is the index the model must return.
 * @param apiKey - OpenAI API key.
 * @returns The parsed arbitration result.
 * @throws Error when the first choice carries no message content;
 *         errors raised by the OpenAI client propagate unchanged.
 */
async function arbitrateWithOpenAI(
  html: string,
  candidates: PriceCandidate[],
  apiKey: string
): Promise<AIArbitrationResult> {
  const openai = new OpenAI({ apiKey });

  const candidatesList = candidates.map((c, i) =>
    `${i}. ${c.price} ${c.currency} (method: ${c.method}, context: ${c.context || 'none'})`
  ).join('\n');

  const preparedHtml = prepareHtmlForAI(html);
  // A replacer function is used on purpose: candidate context is scraped page
  // text and may contain "$$", "$&", "$`" or "$'", which String.replace would
  // expand if the list were passed as a plain replacement string.
  const prompt = ARBITRATION_PROMPT.replace('$CANDIDATES$', () => candidatesList) + preparedHtml;

  const response = await openai.chat.completions.create({
    model: 'gpt-4o-mini',
    max_tokens: 512,
    messages: [{ role: 'user', content: prompt }],
  });

  const content = response.choices[0]?.message?.content;
  if (!content) {
    throw new Error('No response from OpenAI');
  }

  return parseArbitrationResponse(content, candidates);
}
|
||||
|
||||
/**
 * Ask an Ollama server to choose the correct price among disagreeing candidates.
 *
 * Builds the arbitration prompt (numbered candidate list + prepared HTML) and
 * POSTs it, non-streaming, to `<baseUrl>/api/chat` with a 120 second timeout.
 *
 * @param html - Raw page HTML; it is reduced with `prepareHtmlForAI` before sending.
 * @param candidates - Price candidates; their array index is the index the model must return.
 * @param baseUrl - Base URL of the Ollama server; trailing slashes are ignored.
 * @param model - Name of the Ollama model to query.
 * @returns The parsed arbitration result.
 * @throws Error when the response carries no message content;
 *         HTTP/timeout errors from axios propagate unchanged.
 */
async function arbitrateWithOllama(
  html: string,
  candidates: PriceCandidate[],
  baseUrl: string,
  model: string
): Promise<AIArbitrationResult> {
  const candidatesList = candidates.map((c, i) =>
    `${i}. ${c.price} ${c.currency} (method: ${c.method}, context: ${c.context || 'none'})`
  ).join('\n');

  const preparedHtml = prepareHtmlForAI(html);
  // A replacer function is used on purpose: candidate context is scraped page
  // text and may contain "$$", "$&", "$`" or "$'", which String.replace would
  // expand if the list were passed as a plain replacement string.
  const prompt = ARBITRATION_PROMPT.replace('$CANDIDATES$', () => candidatesList) + preparedHtml;

  // Avoid "//api/chat" when the configured base URL ends with a slash.
  const endpoint = `${baseUrl.replace(/\/+$/, '')}/api/chat`;

  const response = await axios.post(
    endpoint,
    {
      model: model,
      messages: [{ role: 'user', content: prompt }],
      stream: false,
    },
    {
      headers: { 'Content-Type': 'application/json' },
      timeout: 120000,
    }
  );

  const content = response.data?.message?.content;
  if (!content) {
    throw new Error('No response from Ollama');
  }

  return parseArbitrationResponse(content, candidates);
}
|
||||
|
||||
/**
 * Turn an AI provider's free-text reply into an arbitration result.
 *
 * The reply is expected to contain a JSON object with `selectedIndex`,
 * `confidence` and `reason`. A surrounding markdown code fence and any text
 * outside the outermost `{ ... }` are stripped before parsing.
 *
 * Validation rules:
 * - `selectedIndex` must be an integer (a string of digits is also accepted)
 *   inside the bounds of `candidates`; anything else yields the fallback result.
 * - `confidence` is clamped to the 0..1 range; a missing or non-numeric value
 *   becomes 0.7.
 * - `reason` must be a non-empty string, otherwise a generic reason is used.
 *
 * @param responseText - Raw text returned by the model.
 * @param candidates - Candidates that were offered to the model, in prompt order.
 * @returns The selected candidate with confidence and reason, or a fallback
 *          result (`selectedPrice: null`, `confidence: 0`) when the reply
 *          cannot be parsed or does not name a valid candidate. Never throws
 *          on malformed replies.
 */
function parseArbitrationResponse(
  responseText: string,
  candidates: PriceCandidate[]
): AIArbitrationResult {
  console.log(`[AI Arbitrate] Raw response: ${responseText.substring(0, 500)}...`);

  const defaultResult: AIArbitrationResult = {
    selectedPrice: null,
    confidence: 0,
    reason: 'Could not parse AI response',
  };

  let jsonStr = responseText.trim();

  // Handle markdown code blocks
  const jsonMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (jsonMatch) {
    jsonStr = jsonMatch[1].trim();
  }

  // Try to find JSON object
  const objectMatch = jsonStr.match(/\{[\s\S]*\}/);
  if (objectMatch) {
    jsonStr = objectMatch[0];
  }

  try {
    const data: unknown = JSON.parse(jsonStr);
    console.log(`[AI Arbitrate] Parsed:`, JSON.stringify(data, null, 2));

    if (typeof data !== 'object' || data === null) {
      return defaultResult;
    }
    const { selectedIndex: rawIndex, confidence, reason } = data as Record<string, unknown>;

    // Models sometimes quote numbers; accept "1" as well as 1.
    const selectedIndex =
      typeof rawIndex === 'string' && /^\d+$/.test(rawIndex.trim()) ? Number(rawIndex) : rawIndex;

    // Number.isInteger rejects fractions (e.g. 0.5), which would otherwise pass
    // the range check and index the array with a non-existent key.
    if (
      typeof selectedIndex !== 'number' ||
      !Number.isInteger(selectedIndex) ||
      selectedIndex < 0 ||
      selectedIndex >= candidates.length
    ) {
      return defaultResult;
    }

    return {
      selectedPrice: candidates[selectedIndex],
      confidence:
        typeof confidence === 'number' && Number.isFinite(confidence)
          ? Math.min(1, Math.max(0, confidence))
          : 0.7,
      reason: typeof reason === 'string' && reason ? reason : 'AI selected this price',
    };
  } catch (_error) {
    console.error('[AI Arbitrate] Failed to parse response:', responseText);
    return defaultResult;
  }
}
|
||||
|
||||
// Export for use in voting scraper to arbitrate between disagreeing methods
|
||||
/**
 * Arbitrate between disagreeing price candidates using the user's configured AI provider.
 *
 * Best-effort: every failure (settings lookup, provider call, parsing) is
 * logged and reported as null so the caller can fall back to user review.
 *
 * @param url - Product URL; used for log messages only.
 * @param html - Page HTML handed to the provider as context.
 * @param candidates - Price candidates to choose from; fewer than two means
 *                     there is nothing to arbitrate.
 * @param userId - User whose AI settings decide whether and how to arbitrate.
 * @returns The provider's arbitration result, or null when there are fewer
 *          than two candidates, AI is disabled for the user, no provider is
 *          fully configured, or arbitration failed.
 */
export async function tryAIArbitration(
  url: string,
  html: string,
  candidates: PriceCandidate[],
  userId: number
): Promise<AIArbitrationResult | null> {
  // Need at least 2 candidates to arbitrate. Checked first so the settings
  // lookup (a database round trip) is skipped when there is nothing to do.
  if (candidates.length < 2) {
    return null;
  }

  try {
    const { userQueries } = await import('../models');
    const settings = await userQueries.getAISettings(userId);

    // Need AI enabled for arbitration
    if (!settings?.ai_enabled && !settings?.ai_verification_enabled) {
      return null;
    }

    // Use the configured provider
    if (settings.ai_provider === 'anthropic' && settings.anthropic_api_key) {
      console.log(`[AI Arbitrate] Using Anthropic to arbitrate ${candidates.length} prices for ${url}`);
      return await arbitrateWithAnthropic(html, candidates, settings.anthropic_api_key);
    } else if (settings.ai_provider === 'openai' && settings.openai_api_key) {
      console.log(`[AI Arbitrate] Using OpenAI to arbitrate ${candidates.length} prices for ${url}`);
      return await arbitrateWithOpenAI(html, candidates, settings.openai_api_key);
    } else if (settings.ai_provider === 'ollama' && settings.ollama_base_url && settings.ollama_model) {
      console.log(`[AI Arbitrate] Using Ollama to arbitrate ${candidates.length} prices for ${url}`);
      return await arbitrateWithOllama(html, candidates, settings.ollama_base_url, settings.ollama_model);
    }

    console.log(`[AI Arbitrate] No provider configured`);
    return null;
  } catch (error) {
    console.error(`[AI Arbitrate] Arbitration failed for ${url}:`, error);
    return null;
  }
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import cron from 'node-cron';
|
||||
import { productQueries, priceHistoryQueries, userQueries, stockStatusHistoryQueries, notificationHistoryQueries, NotificationType } from '../models';
|
||||
import { scrapeProduct } from './scraper';
|
||||
import { scrapeProduct, scrapeProductWithVoting, ExtractionMethod } from './scraper';
|
||||
import { sendNotifications, NotificationPayload } from './notifications';
|
||||
|
||||
let isRunning = false;
|
||||
|
|
@ -23,7 +23,15 @@ async function checkPrices(): Promise<void> {
|
|||
try {
|
||||
console.log(`Checking price for product ${product.id}: ${product.url}`);
|
||||
|
||||
const scrapedData = await scrapeProduct(product.url, product.user_id);
|
||||
// Get preferred extraction method for this product (if user previously selected one)
|
||||
const preferredMethod = await productQueries.getPreferredExtractionMethod(product.id);
|
||||
|
||||
// Use voting scraper with preferred method if available
|
||||
const scrapedData = await scrapeProductWithVoting(
|
||||
product.url,
|
||||
product.user_id,
|
||||
preferredMethod as ExtractionMethod | undefined
|
||||
);
|
||||
|
||||
// Check for back-in-stock notification
|
||||
const wasOutOfStock = product.stock_status === 'out_of_stock';
|
||||
|
|
|
|||
|
|
@ -13,6 +13,220 @@ puppeteer.use(StealthPlugin());
|
|||
|
||||
export type StockStatus = 'in_stock' | 'out_of_stock' | 'unknown';
|
||||
|
||||
// Extraction method types for multi-strategy voting
|
||||
export type ExtractionMethod = 'json-ld' | 'site-specific' | 'generic-css' | 'ai';
|
||||
|
||||
// Price candidate from a single extraction method
|
||||
/** A single price found by one extraction method, used as a vote in price selection. */
export interface PriceCandidate {
  /** Numeric price value. */
  price: number;
  /** Currency code associated with the price. */
  currency: string;
  /** Extraction method that produced this candidate. */
  method: ExtractionMethod;
  /** Text around the price for user context. */
  context?: string;
  /** Confidence score between 0 and 1. */
  confidence: number;
}
|
||||
|
||||
// Extended scrape result with candidates for voting
|
||||
/** Scrape result extended with every price candidate considered during voting. */
export interface ScrapedProductWithCandidates {
  /** Product name, or null when none was found. */
  name: string | null;
  /** Price chosen as the final result, or null when no price was determined. */
  price: ParsedPrice | null;
  /** Product image URL, or null when none was found. */
  imageUrl: string | null;
  /** URL that was scraped. */
  url: string;
  /** Detected availability of the product. */
  stockStatus: StockStatus;
  /** AI involvement in the final price, or null when AI did not take part. */
  aiStatus: 'verified' | 'corrected' | null;
  /** All candidates gathered from the extraction methods. */
  priceCandidates: PriceCandidate[];
  /** True when the result is ambiguous and the user should confirm the price. */
  needsReview: boolean;
  /** Which method was used for the final price. */
  selectedMethod?: ExtractionMethod;
}
|
||||
|
||||
// Check if two prices are "close enough" to be considered the same (within 5%)
|
||||
/**
 * Decide whether two extracted prices should be treated as the same price.
 *
 * Prices match when they are strictly equal, or when their absolute
 * difference is less than 5% of the magnitude of their average.
 *
 * The magnitude (not the signed value) of the average is used as the scale:
 * dividing by a negative average would produce a negative ratio that always
 * passes the `< 0.05` test, making wildly different values "match".
 *
 * @param price1 - First price.
 * @param price2 - Second price.
 * @returns true when the prices are equal or within 5% of each other;
 *          false otherwise, including when either value is NaN or the two
 *          values are unequal and not both finite.
 */
function pricesMatch(price1: number, price2: number): boolean {
  if (price1 === price2) return true;

  // NaN and infinities cannot be "close" to anything they are not equal to.
  if (!Number.isFinite(price1) || !Number.isFinite(price2)) return false;

  const diff = Math.abs(price1 - price2);
  const scale = Math.abs(price1 + price2) / 2;

  // Unequal values averaging to zero have opposite signs: never a match.
  if (scale === 0) return false;

  return diff / scale < 0.05; // Within 5%
}
|
||||
|
||||
// Find consensus among price candidates
|
||||
/**
 * Group price candidates that agree with each other and pick a winner.
 *
 * Candidates are bucketed by comparing each one against the first member of
 * every existing group with `pricesMatch`. Groups are then ordered by size
 * (most votes first) and, for equal sizes, by average confidence.
 *
 * Consensus is reported only when the winning group is unchallenged: either
 * every candidate landed in one group, or the largest group has strictly more
 * votes than the runner-up. A tie for first place (for example two methods
 * reporting two different prices) is NOT a consensus, so that the caller can
 * escalate to arbitration or user review instead of silently picking one.
 *
 * The input array and its candidates are not mutated.
 *
 * @param candidates - Price candidates gathered from all extraction methods.
 * @returns `price`: the highest-confidence candidate of the leading group
 *          (null for empty input); `hasConsensus`: whether that group clearly
 *          won; `groups`: all groups, leading group first.
 */
function findPriceConsensus(candidates: PriceCandidate[]): { price: PriceCandidate | null; hasConsensus: boolean; groups: PriceCandidate[][] } {
  if (candidates.length === 0) return { price: null, hasConsensus: false, groups: [] };
  if (candidates.length === 1) return { price: candidates[0], hasConsensus: true, groups: [[candidates[0]]] };

  // Group prices that match
  const groups: PriceCandidate[][] = [];
  for (const candidate of candidates) {
    const matchingGroup = groups.find((group) => pricesMatch(candidate.price, group[0].price));
    if (matchingGroup) {
      matchingGroup.push(candidate);
    } else {
      groups.push([candidate]);
    }
  }

  const averageConfidence = (group: PriceCandidate[]): number =>
    group.reduce((sum, c) => sum + c.confidence, 0) / group.length;

  // Sort groups by size (most votes first), then by confidence
  groups.sort((a, b) => {
    if (b.length !== a.length) return b.length - a.length;
    return averageConfidence(b) - averageConfidence(a);
  });

  const [largestGroup, runnerUp] = groups;

  // A strict win over the runner-up is required; equal-sized leading groups
  // mean the methods disagree and nobody has a majority.
  const hasConsensus = runnerUp === undefined || largestGroup.length > runnerUp.length;

  // Pick the highest confidence candidate from the winning group (first one
  // wins on equal confidence), without reordering the group itself.
  const winner = largestGroup.reduce((best, c) => (c.confidence > best.confidence ? c : best));

  return { price: winner, hasConsensus, groups };
}
|
||||
|
||||
// Extract price candidates from JSON-LD structured data
|
||||
/**
 * Collect price candidates from JSON-LD (`application/ld+json`) script tags.
 *
 * Every script tag is parsed independently: a tag with malformed JSON, no
 * product, or no usable offer is skipped and the remaining tags are still
 * examined. For a product with several offers only the first offer is used.
 * The price is taken from `lowPrice`, then `price`, then
 * `priceSpecification.price`; the currency falls back to 'USD' when the offer
 * does not state one.
 *
 * @param $ - Cheerio handle for the loaded page.
 * @returns One candidate (method 'json-ld', confidence 0.9) per script tag
 *          that yields a positive numeric price; empty when none do.
 */
function extractJsonLdCandidates($: CheerioAPI): PriceCandidate[] {
  const candidates: PriceCandidate[] = [];
  const scripts = $('script[type="application/ld+json"]');

  for (let i = 0; i < scripts.length; i++) {
    // The try/catch is per script so that one broken JSON-LD block (common on
    // real pages) does not discard valid blocks that come after it.
    try {
      const content = $(scripts[i]).html();
      if (!content) continue;

      const data = JSON.parse(content) as JsonLdProduct | JsonLdProduct[];
      const product = findProduct(data);
      if (!product?.offers) continue;

      const offer = Array.isArray(product.offers) ? product.offers[0] : product.offers;
      // `offers: []` leaves nothing to read a price from.
      if (!offer) continue;

      const priceValue = offer.lowPrice || offer.price || offer.priceSpecification?.price;
      if (!priceValue) continue;

      const currency = offer.priceCurrency || offer.priceSpecification?.priceCurrency || 'USD';
      const price = parseFloat(String(priceValue));
      if (isNaN(price) || price <= 0) continue;

      candidates.push({
        price,
        currency,
        method: 'json-ld',
        context: `Structured data: ${product.name || 'Product'}`,
        confidence: 0.9, // JSON-LD is highly reliable
      });
    } catch (_e) {
      // Malformed or unexpected JSON-LD in this script tag: skip it and keep scanning.
    }
  }

  return candidates;
}
|
||||
|
||||
// Extract price candidates from site-specific scraper
|
||||
/**
 * Run the site-specific scraper registered for `url`, if there is one.
 *
 * @param $ - Cheerio handle for the loaded page.
 * @param url - Product URL; used to select the scraper and to label the candidate.
 * @returns The scraper's price as a single 'site-specific' candidate
 *          (confidence 0.85) when it found one, together with the name, image
 *          URL and stock status it reported. When no scraper matches the URL,
 *          returns no candidates, null metadata and stock status 'unknown'.
 */
function extractSiteSpecificCandidates($: CheerioAPI, url: string): { candidates: PriceCandidate[]; name: string | null; imageUrl: string | null; stockStatus: StockStatus } {
  const matchedScraper = siteScrapers.find((s) => s.match(url));
  if (!matchedScraper) {
    return { candidates: [], name: null, imageUrl: null, stockStatus: 'unknown' };
  }

  const scraped = matchedScraper.scrape($, url);

  const candidates: PriceCandidate[] = scraped.price
    ? [
        {
          price: scraped.price.price,
          currency: scraped.price.currency,
          method: 'site-specific',
          context: `Site-specific extractor for ${new URL(url).hostname}`,
          confidence: 0.85, // Site-specific scrapers are well-tested
        },
      ]
    : [];

  return {
    candidates,
    name: scraped.name || null,
    imageUrl: scraped.imageUrl || null,
    stockStatus: scraped.stockStatus || 'unknown',
  };
}
|
||||
|
||||
// Extract price candidates from generic CSS selectors
|
||||
/**
 * Collect price candidates by probing the generic price CSS selectors.
 *
 * For each element matched by a selector in `genericPriceSelectors`:
 * - elements whose own or parent class looks like an "original"/"was" price
 *   are skipped;
 * - a numeric `data-price-amount` attribute is preferred, with the currency
 *   guessed from nearby text (currency code first, then symbol, else 'USD');
 * - otherwise the `content` attribute, `data-price` attribute or element text
 *   is handed to `parsePrice`.
 *
 * Each distinct positive price is reported once. Collection stops as soon as
 * three candidates have been gathered, including in the middle of a selector's
 * matches, so a selector that hits many elements cannot flood the vote.
 *
 * @param $ - Cheerio handle for the loaded page.
 * @returns Up to three candidates with method 'generic-css' and confidence 0.6.
 */
function extractGenericCssCandidates($: CheerioAPI): PriceCandidate[] {
  const MAX_CANDIDATES = 3;
  const SKIP_CLASS_PATTERN = /original|was|old|regular|compare|strikethrough|line-through/i;
  const CURRENCY_CODE_PATTERN = /\b(CHF|EUR|GBP|USD|CAD|AUD|JPY|INR)\b/i;
  const CURRENCY_SYMBOL_PATTERN = /([$€£¥₹])/;
  const SYMBOL_TO_CURRENCY: Record<string, string> = { '$': 'USD', '€': 'EUR', '£': 'GBP', '¥': 'JPY', '₹': 'INR' };

  const candidates: PriceCandidate[] = [];
  const seen = new Set<number>();

  for (const selector of genericPriceSelectors) {
    // The callback returns true to keep iterating and false to stop early
    // (Cheerio's `.each` breaks out of the loop on an explicit `false`).
    $(selector).each((_, el) => {
      const $el = $(el);

      // Skip if this looks like an "original" or "was" price. The two class
      // strings are joined with a space so that a keyword cannot be formed by
      // the end of one attribute running into the start of the other.
      const classAttr = $el.attr('class') || '';
      const parentClass = $el.parent().attr('class') || '';
      if (SKIP_CLASS_PATTERN.test(`${classAttr} ${parentClass}`)) {
        return true;
      }

      // Check various attributes where price might be stored
      const priceAmount = $el.attr('data-price-amount');
      const dataPrice = $el.attr('data-price');
      const content = $el.attr('content');
      const text = $el.text();

      let parsed: ParsedPrice | null = null;
      let context = selector;

      // Try data-price-amount first (Magento stores numeric value here)
      if (priceAmount) {
        const price = parseFloat(priceAmount);
        if (!isNaN(price) && price > 0) {
          let currency = 'USD';
          const textSources = [text, $el.parent().text(), $el.closest('.price-box').text()];
          for (const source of textSources) {
            if (!source) continue;
            const currencyCodeMatch = source.match(CURRENCY_CODE_PATTERN);
            if (currencyCodeMatch) {
              currency = currencyCodeMatch[1].toUpperCase();
              break;
            }
            const symbolMatch = source.match(CURRENCY_SYMBOL_PATTERN);
            if (symbolMatch) {
              currency = SYMBOL_TO_CURRENCY[symbolMatch[1]] || 'USD';
              break;
            }
          }
          parsed = { price, currency };
          context = `data-price-amount attribute`;
        }
      }

      if (!parsed) {
        const priceStr = content || dataPrice || text;
        parsed = parsePrice(priceStr);
        if (parsed) {
          context = text.trim().slice(0, 50);
        }
      }

      if (parsed && parsed.price > 0 && !seen.has(parsed.price)) {
        seen.add(parsed.price);
        candidates.push({
          price: parsed.price,
          currency: parsed.currency,
          method: 'generic-css',
          context,
          confidence: 0.6, // Generic CSS is less reliable
        });
      }

      // Only take first few generic candidates to avoid noise
      return candidates.length < MAX_CANDIDATES;
    });

    if (candidates.length >= MAX_CANDIDATES) break;
  }

  return candidates;
}
|
||||
|
||||
// Browser-based scraping for sites that block HTTP requests (e.g., Cloudflare)
|
||||
async function scrapeWithBrowser(url: string): Promise<string> {
|
||||
const browser = await puppeteer.launch({
|
||||
|
|
@ -1059,6 +1273,287 @@ export async function scrapeProduct(url: string, userId?: number): Promise<Scrap
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Multi-strategy voting scraper with user review support.
|
||||
* Runs all extraction methods, finds consensus, and flags ambiguous cases for user review.
|
||||
*/
|
||||
export async function scrapeProductWithVoting(
|
||||
url: string,
|
||||
userId?: number,
|
||||
preferredMethod?: ExtractionMethod
|
||||
): Promise<ScrapedProductWithCandidates> {
|
||||
const result: ScrapedProductWithCandidates = {
|
||||
name: null,
|
||||
price: null,
|
||||
imageUrl: null,
|
||||
url,
|
||||
stockStatus: 'unknown',
|
||||
aiStatus: null,
|
||||
priceCandidates: [],
|
||||
needsReview: false,
|
||||
};
|
||||
|
||||
let html: string = '';
|
||||
|
||||
try {
|
||||
let usedBrowser = false;
|
||||
|
||||
// Fetch HTML
|
||||
try {
|
||||
const response = await axios.get<string>(url, {
|
||||
headers: {
|
||||
'User-Agent':
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
Accept:
|
||||
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Cache-Control': 'no-cache',
|
||||
Pragma: 'no-cache',
|
||||
'Sec-Ch-Ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
|
||||
'Sec-Ch-Ua-Mobile': '?0',
|
||||
'Sec-Ch-Ua-Platform': '"Windows"',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'none',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
},
|
||||
timeout: 20000,
|
||||
maxRedirects: 5,
|
||||
});
|
||||
html = response.data;
|
||||
} catch (axiosError) {
|
||||
if (axiosError instanceof AxiosError && axiosError.response?.status === 403) {
|
||||
console.log(`[Voting] HTTP blocked (403) for ${url}, using browser...`);
|
||||
html = await scrapeWithBrowser(url);
|
||||
usedBrowser = true;
|
||||
} else {
|
||||
throw axiosError;
|
||||
}
|
||||
}
|
||||
|
||||
let $ = load(html);
|
||||
|
||||
// Collect candidates from all methods
|
||||
const allCandidates: PriceCandidate[] = [];
|
||||
|
||||
// 1. JSON-LD extraction (highest reliability)
|
||||
const jsonLdCandidates = extractJsonLdCandidates($);
|
||||
allCandidates.push(...jsonLdCandidates);
|
||||
console.log(`[Voting] JSON-LD found ${jsonLdCandidates.length} candidates`);
|
||||
|
||||
// 2. Site-specific extraction
|
||||
const siteResult = extractSiteSpecificCandidates($, url);
|
||||
allCandidates.push(...siteResult.candidates);
|
||||
if (siteResult.name) result.name = siteResult.name;
|
||||
if (siteResult.imageUrl) result.imageUrl = siteResult.imageUrl;
|
||||
if (siteResult.stockStatus !== 'unknown') result.stockStatus = siteResult.stockStatus;
|
||||
console.log(`[Voting] Site-specific found ${siteResult.candidates.length} candidates`);
|
||||
|
||||
// 3. Generic CSS extraction
|
||||
const genericCandidates = extractGenericCssCandidates($);
|
||||
allCandidates.push(...genericCandidates);
|
||||
console.log(`[Voting] Generic CSS found ${genericCandidates.length} candidates`);
|
||||
|
||||
// If no candidates found in static HTML, try browser rendering
|
||||
if (allCandidates.length === 0 && !usedBrowser) {
|
||||
console.log(`[Voting] No candidates in static HTML, trying browser...`);
|
||||
try {
|
||||
html = await scrapeWithBrowser(url);
|
||||
usedBrowser = true;
|
||||
$ = load(html);
|
||||
|
||||
// Re-run all extraction methods
|
||||
allCandidates.push(...extractJsonLdCandidates($));
|
||||
const browserSiteResult = extractSiteSpecificCandidates($, url);
|
||||
allCandidates.push(...browserSiteResult.candidates);
|
||||
if (!result.name && browserSiteResult.name) result.name = browserSiteResult.name;
|
||||
if (!result.imageUrl && browserSiteResult.imageUrl) result.imageUrl = browserSiteResult.imageUrl;
|
||||
if (result.stockStatus === 'unknown' && browserSiteResult.stockStatus !== 'unknown') {
|
||||
result.stockStatus = browserSiteResult.stockStatus;
|
||||
}
|
||||
allCandidates.push(...extractGenericCssCandidates($));
|
||||
console.log(`[Voting] Browser found ${allCandidates.length} total candidates`);
|
||||
} catch (browserError) {
|
||||
console.error(`[Voting] Browser fallback failed:`, browserError);
|
||||
}
|
||||
}
|
||||
|
||||
// Fill in missing metadata
|
||||
if (!result.name) {
|
||||
result.name = extractGenericName($) || $('meta[property="og:title"]').attr('content') || null;
|
||||
}
|
||||
if (!result.imageUrl) {
|
||||
result.imageUrl = extractGenericImage($, url) || $('meta[property="og:image"]').attr('content') || null;
|
||||
}
|
||||
if (result.stockStatus === 'unknown') {
|
||||
result.stockStatus = extractGenericStockStatus($);
|
||||
}
|
||||
|
||||
// Store all candidates
|
||||
result.priceCandidates = allCandidates;
|
||||
|
||||
// If user has a preferred method, try to use it
|
||||
if (preferredMethod && allCandidates.length > 0) {
|
||||
const preferredCandidate = allCandidates.find(c => c.method === preferredMethod);
|
||||
if (preferredCandidate) {
|
||||
console.log(`[Voting] Using preferred method ${preferredMethod}: ${preferredCandidate.price}`);
|
||||
result.price = { price: preferredCandidate.price, currency: preferredCandidate.currency };
|
||||
result.selectedMethod = preferredMethod;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// Find consensus
|
||||
const { price: consensusPrice, hasConsensus, groups } = findPriceConsensus(allCandidates);
|
||||
console.log(`[Voting] Consensus: ${hasConsensus}, Groups: ${groups.length}, Winner: ${consensusPrice?.price}`);
|
||||
|
||||
if (hasConsensus && consensusPrice) {
|
||||
// Clear consensus - use the winning price
|
||||
result.price = { price: consensusPrice.price, currency: consensusPrice.currency };
|
||||
result.selectedMethod = consensusPrice.method;
|
||||
console.log(`[Voting] Consensus price: ${consensusPrice.price} via ${consensusPrice.method}`);
|
||||
} else if (allCandidates.length > 0) {
|
||||
// No consensus - try AI arbitration if available
|
||||
if (userId && html) {
|
||||
console.log(`[Voting] No consensus, trying AI arbitration...`);
|
||||
try {
|
||||
const { tryAIArbitration } = await import('./ai-extractor');
|
||||
const aiResult = await tryAIArbitration(url, html, allCandidates, userId);
|
||||
|
||||
if (aiResult && aiResult.selectedPrice) {
|
||||
console.log(`[Voting] AI selected price: ${aiResult.selectedPrice.price} (reason: ${aiResult.reason})`);
|
||||
result.price = { price: aiResult.selectedPrice.price, currency: aiResult.selectedPrice.currency };
|
||||
result.selectedMethod = aiResult.selectedPrice.method;
|
||||
result.aiStatus = 'verified';
|
||||
|
||||
// Add AI as a candidate for transparency
|
||||
if (!allCandidates.find(c => c.method === 'ai')) {
|
||||
result.priceCandidates.push({
|
||||
price: aiResult.selectedPrice.price,
|
||||
currency: aiResult.selectedPrice.currency,
|
||||
method: 'ai',
|
||||
context: `AI arbitration: ${aiResult.reason}`,
|
||||
confidence: aiResult.confidence || 0.8,
|
||||
});
|
||||
}
|
||||
} else {
|
||||
// AI couldn't decide either - flag for user review
|
||||
console.log(`[Voting] AI couldn't decide, flagging for user review`);
|
||||
result.needsReview = true;
|
||||
// Use the most confident candidate as default
|
||||
const bestCandidate = allCandidates.sort((a, b) => b.confidence - a.confidence)[0];
|
||||
result.price = { price: bestCandidate.price, currency: bestCandidate.currency };
|
||||
result.selectedMethod = bestCandidate.method;
|
||||
}
|
||||
} catch (aiError) {
|
||||
console.error(`[Voting] AI arbitration failed:`, aiError);
|
||||
// Fall back to flagging for user review
|
||||
result.needsReview = true;
|
||||
const bestCandidate = allCandidates.sort((a, b) => b.confidence - a.confidence)[0];
|
||||
result.price = { price: bestCandidate.price, currency: bestCandidate.currency };
|
||||
result.selectedMethod = bestCandidate.method;
|
||||
}
|
||||
} else {
|
||||
// No AI available - flag for user review if multiple prices differ significantly
|
||||
if (groups.length > 1) {
|
||||
result.needsReview = true;
|
||||
console.log(`[Voting] Multiple price groups found, flagging for user review`);
|
||||
}
|
||||
// Use the most confident candidate as default
|
||||
const bestCandidate = allCandidates.sort((a, b) => b.confidence - a.confidence)[0];
|
||||
result.price = { price: bestCandidate.price, currency: bestCandidate.currency };
|
||||
result.selectedMethod = bestCandidate.method;
|
||||
}
|
||||
} else {
|
||||
// No candidates at all - try pure AI extraction
|
||||
if (userId && html) {
|
||||
console.log(`[Voting] No candidates found, trying AI extraction...`);
|
||||
try {
|
||||
const { tryAIExtraction } = await import('./ai-extractor');
|
||||
const aiResult = await tryAIExtraction(url, html, userId);
|
||||
|
||||
if (aiResult && aiResult.price && aiResult.confidence > 0.5) {
|
||||
console.log(`[Voting] AI extracted price: ${aiResult.price.price}`);
|
||||
result.price = aiResult.price;
|
||||
result.selectedMethod = 'ai';
|
||||
result.priceCandidates.push({
|
||||
price: aiResult.price.price,
|
||||
currency: aiResult.price.currency,
|
||||
method: 'ai',
|
||||
context: 'AI extraction (no other methods found price)',
|
||||
confidence: aiResult.confidence,
|
||||
});
|
||||
if (!result.name && aiResult.name) result.name = aiResult.name;
|
||||
if (!result.imageUrl && aiResult.imageUrl) result.imageUrl = aiResult.imageUrl;
|
||||
if (result.stockStatus === 'unknown' && aiResult.stockStatus !== 'unknown') {
|
||||
result.stockStatus = aiResult.stockStatus;
|
||||
}
|
||||
}
|
||||
} catch (aiError) {
|
||||
console.error(`[Voting] AI extraction failed:`, aiError);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we have a price but AI is available, verify it
|
||||
if (result.price && userId && html && !result.aiStatus) {
|
||||
try {
|
||||
const { tryAIVerification } = await import('./ai-extractor');
|
||||
const verifyResult = await tryAIVerification(
|
||||
url,
|
||||
html,
|
||||
result.price.price,
|
||||
result.price.currency,
|
||||
userId
|
||||
);
|
||||
|
||||
if (verifyResult) {
|
||||
if (verifyResult.isCorrect) {
|
||||
result.aiStatus = 'verified';
|
||||
} else if (verifyResult.suggestedPrice && verifyResult.confidence > 0.7) {
|
||||
// AI suggests a different price - this might indicate we need review
|
||||
const existingCandidate = allCandidates.find(c =>
|
||||
pricesMatch(c.price, verifyResult.suggestedPrice!.price)
|
||||
);
|
||||
if (existingCandidate) {
|
||||
// AI agrees with one of our candidates - use that
|
||||
result.price = verifyResult.suggestedPrice;
|
||||
result.selectedMethod = existingCandidate.method;
|
||||
result.aiStatus = 'corrected';
|
||||
} else if (!result.needsReview) {
|
||||
// AI suggests a price we didn't find - flag for review
|
||||
result.needsReview = true;
|
||||
result.priceCandidates.push({
|
||||
price: verifyResult.suggestedPrice.price,
|
||||
currency: verifyResult.suggestedPrice.currency,
|
||||
method: 'ai',
|
||||
context: `AI suggestion: ${verifyResult.reason}`,
|
||||
confidence: verifyResult.confidence,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Update stock status from AI
|
||||
if (verifyResult.stockStatus && verifyResult.stockStatus !== 'unknown') {
|
||||
if (result.stockStatus === 'unknown' || verifyResult.stockStatus === 'out_of_stock') {
|
||||
result.stockStatus = verifyResult.stockStatus;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (verifyError) {
|
||||
console.error(`[Voting] AI verification failed:`, verifyError);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
console.error(`[Voting] Error scraping ${url}:`, error);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
interface JsonLdProduct {
|
||||
'@type'?: string;
|
||||
'@graph'?: JsonLdProduct[];
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue