Fix AI extraction JSON-LD parsing and add debug logging

- Extract JSON-LD scripts BEFORE removing script tags
- Add logging for prepared HTML, AI responses, and parsed data
- Include more detailed error messages in test endpoint

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
clucraft 2026-01-21 21:58:39 -05:00
parent d98138fe7c
commit 906212e6ae
2 changed files with 28 additions and 13 deletions

View file

@@ -199,16 +199,21 @@ router.post('/ai/test', async (req: AuthRequest, res: Response) => {
return; return;
} }
console.log(`[AI Test] Testing URL: ${url} with provider: ${settings.ai_provider}`);
const { extractWithAI } = await import('../services/ai-extractor'); const { extractWithAI } = await import('../services/ai-extractor');
const result = await extractWithAI(url, settings); const result = await extractWithAI(url, settings);
console.log(`[AI Test] Result:`, JSON.stringify(result, null, 2));
res.json({ res.json({
success: !!result.price, success: !!result.price,
...result, ...result,
}); });
} catch (error) { } catch (error) {
console.error('Error testing AI extraction:', error); console.error('Error testing AI extraction:', error);
res.status(500).json({ error: 'Failed to test AI extraction' }); const errorMessage = error instanceof Error ? error.message : 'Unknown error';
res.status(500).json({ error: `Failed to test AI extraction: ${errorMessage}` });
} }
}); });

View file

@@ -36,7 +36,21 @@ HTML Content:
function prepareHtmlForAI(html: string): string { function prepareHtmlForAI(html: string): string {
const $ = load(html); const $ = load(html);
// Remove script, style, and other non-content elements // Extract JSON-LD data BEFORE removing scripts (it often contains product info)
const jsonLdScripts: string[] = [];
$('script[type="application/ld+json"]').each((_, el) => {
const scriptContent = $(el).html();
if (scriptContent) {
// Include any JSON-LD that might be product-related
if (scriptContent.includes('price') ||
scriptContent.includes('Product') ||
scriptContent.includes('Offer')) {
jsonLdScripts.push(scriptContent);
}
}
});
// Now remove script, style, and other non-content elements
$('script, style, noscript, iframe, svg, path, meta, link, comment').remove(); $('script, style, noscript, iframe, svg, path, meta, link, comment').remove();
// Get the body content // Get the body content
@@ -60,19 +74,11 @@ function prepareHtmlForAI(html: string): string {
} }
} }
// Also extract JSON-LD data which often contains product info // Combine JSON-LD data with HTML content
const jsonLdScripts: string[] = [];
$('script[type="application/ld+json"]').each((_, el) => {
const scriptContent = $(el).html();
if (scriptContent && scriptContent.includes('price')) {
jsonLdScripts.push(scriptContent);
}
});
// Combine content with JSON-LD data
let finalContent = content; let finalContent = content;
if (jsonLdScripts.length > 0) { if (jsonLdScripts.length > 0) {
finalContent = `JSON-LD Data:\n${jsonLdScripts.join('\n')}\n\nHTML Content:\n${content}`; finalContent = `JSON-LD Structured Data:\n${jsonLdScripts.join('\n')}\n\nHTML Content:\n${content}`;
console.log(`[AI] Found ${jsonLdScripts.length} JSON-LD scripts with product data`);
} }
// Truncate to ~15000 characters to stay within token limits // Truncate to ~15000 characters to stay within token limits
@@ -80,6 +86,7 @@ function prepareHtmlForAI(html: string): string {
finalContent = finalContent.substring(0, 15000) + '\n... [truncated]'; finalContent = finalContent.substring(0, 15000) + '\n... [truncated]';
} }
console.log(`[AI] Prepared HTML content: ${finalContent.length} characters`);
return finalContent; return finalContent;
} }
@@ -138,6 +145,8 @@ async function extractWithOpenAI(
} }
function parseAIResponse(responseText: string): AIExtractionResult { function parseAIResponse(responseText: string): AIExtractionResult {
console.log(`[AI] Raw response: ${responseText.substring(0, 500)}...`);
// Try to extract JSON from the response // Try to extract JSON from the response
let jsonStr = responseText.trim(); let jsonStr = responseText.trim();
@@ -155,6 +164,7 @@ function parseAIResponse(responseText: string): AIExtractionResult {
try { try {
const data = JSON.parse(jsonStr); const data = JSON.parse(jsonStr);
console.log(`[AI] Parsed data:`, JSON.stringify(data, null, 2));
let price: ParsedPrice | null = null; let price: ParsedPrice | null = null;
if (data.price !== null && data.price !== undefined) { if (data.price !== null && data.price !== undefined) {