diff --git a/apps/x/packages/core/src/application/assistant/instructions.ts b/apps/x/packages/core/src/application/assistant/instructions.ts index 0b0eebc6..e0eb0205 100644 --- a/apps/x/packages/core/src/application/assistant/instructions.ts +++ b/apps/x/packages/core/src/application/assistant/instructions.ts @@ -158,7 +158,8 @@ When a user asks for ANY task that might require external capabilities (web sear - \`workspace-readFile\`, \`workspace-writeFile\`, \`workspace-edit\`, \`workspace-remove\` - File operations - \`workspace-readdir\`, \`workspace-exists\`, \`workspace-stat\`, \`workspace-glob\`, \`workspace-grep\` - Directory exploration and file search - \`workspace-mkdir\`, \`workspace-rename\`, \`workspace-copy\` - File/directory management -- \`parseFile\` - Parse and extract text from files (PDF, Excel, CSV, Word .docx). Accepts absolute paths or workspace-relative paths — no need to copy files into the workspace first. +- \`parseFile\` - Parse and extract text from files (PDF, Excel, CSV, Word .docx). Accepts absolute paths or workspace-relative paths — no need to copy files into the workspace first. Best for well-structured digital documents. +- \`LLMParse\` - Send a file to the configured LLM as a multimodal attachment to extract content as markdown. Use this instead of \`parseFile\` for scanned PDFs, images with text, complex layouts, presentations, or any format where local parsing falls short. Supports documents and images. 
- \`analyzeAgent\` - Agent analysis - \`addMcpServer\`, \`listMcpServers\`, \`listMcpTools\`, \`executeMcpTool\` - MCP server management and execution - \`loadSkill\` - Skill loading diff --git a/apps/x/packages/core/src/application/assistant/skills/organize-files/skill.ts b/apps/x/packages/core/src/application/assistant/skills/organize-files/skill.ts index d0e1ef41..aecf976f 100644 --- a/apps/x/packages/core/src/application/assistant/skills/organize-files/skill.ts +++ b/apps/x/packages/core/src/application/assistant/skills/organize-files/skill.ts @@ -77,6 +77,10 @@ When users want to read or summarize a document's contents (PDF, Excel, CSV, Wor - Accepts absolute paths (e.g., \`~/Downloads/report.pdf\`) or workspace-relative paths — no need to copy files first. - Supported formats: \`.pdf\`, \`.xlsx\`, \`.xls\`, \`.csv\`, \`.docx\` +For scanned PDFs, images with text, complex layouts, or presentations where local parsing falls short, use the \`LLMParse\` builtin tool instead. It sends the file to the configured LLM as a multimodal attachment and returns well-structured markdown. +- Supports everything \`parseFile\` does plus images (\`.png\`, \`.jpg\`, \`.gif\`, \`.webp\`, \`.svg\`, \`.bmp\`, \`.tiff\`), PowerPoint (\`.pptx\`), HTML, and plain text. +- Also accepts an optional \`prompt\` parameter for custom extraction instructions. 
+ ## Organizing Files **Create destination folder:** diff --git a/apps/x/packages/core/src/application/lib/builtin-tools.ts b/apps/x/packages/core/src/application/lib/builtin-tools.ts index b9b8d635..19fbc4e5 100644 --- a/apps/x/packages/core/src/application/lib/builtin-tools.ts +++ b/apps/x/packages/core/src/application/lib/builtin-tools.ts @@ -16,6 +16,9 @@ import { composioAccountsRepo } from "../../composio/repo.js"; import { executeAction as executeComposioAction, isConfigured as isComposioConfigured, listToolkitTools } from "../../composio/client.js"; import { slackToolCatalog } from "../assistant/skills/slack/tool-catalog.js"; import type { ToolContext } from "./exec-tool.js"; +import { generateText } from "ai"; +import { createProvider } from "../../models/models.js"; +import { IModelConfigRepo } from "../../models/repo.js"; // Parser libraries are loaded dynamically inside parseFile.execute() // to avoid pulling pdfjs-dist's DOM polyfills into the main bundle. // Import paths are computed so esbuild cannot statically resolve them. 
@@ -258,6 +261,31 @@ const resolveSlackToolSlug = async (hintKey: keyof typeof slackToolHints) => {
     return allSlug;
 };
 
+/**
+ * Extension-to-MIME lookup for the LLMParse tool. Keys are lower-case
+ * extensions including the leading dot; an extension missing from this
+ * table is rejected as unsupported before the file is read.
+ */
+const LLMPARSE_MIME_TYPES: Record<string, string> = {
+    '.pdf': 'application/pdf',
+    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    '.doc': 'application/msword',
+    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+    '.xls': 'application/vnd.ms-excel',
+    '.csv': 'text/csv',
+    '.txt': 'text/plain',
+    '.html': 'text/html',
+    '.png': 'image/png',
+    '.jpg': 'image/jpeg',
+    '.jpeg': 'image/jpeg',
+    '.gif': 'image/gif',
+    '.webp': 'image/webp',
+    '.svg': 'image/svg+xml',
+    '.bmp': 'image/bmp',
+    '.tiff': 'image/tiff',
+};
+
 export const BuiltinTools: z.infer = {
     loadSkill: {
         description: "Load a Rowboat skill definition into context by fetching its guidance string",
@@ -804,6 +832,87 @@ export const BuiltinTools: z.infer = {
         },
     },
 
+    'LLMParse': {
+        description: 'Send a file to the configured LLM as a multimodal attachment and ask it to extract content as markdown. Best for scanned PDFs, images with text, complex layouts, or any format where local parsing falls short. Supports documents (PDF, Word, Excel, PowerPoint, CSV, TXT, HTML) and images (PNG, JPG, GIF, WebP, SVG, BMP, TIFF).',
+        inputSchema: z.object({
+            path: z.string().min(1).describe('File path to parse. Can be an absolute path or a workspace-relative path.'),
+            prompt: z.string().optional().describe('Custom instruction for the LLM (defaults to "Convert this file to well-structured markdown.")'),
+        }),
+        /**
+         * Reads the file, base64-encodes it, and sends it with the prompt to the
+         * configured model as a single user message (text part + file part).
+         *
+         * Errors raised inside the body (unsupported extension, read failure,
+         * model failure) are returned as `{ success: false, error }`, not thrown.
+         */
+        execute: async ({ path: filePath, prompt }: { path: string; prompt?: string }) => {
+            try {
+                const fileName = path.basename(filePath);
+                const ext = path.extname(filePath).toLowerCase();
+                const mimeType = LLMPARSE_MIME_TYPES[ext];
+
+                if (!mimeType) {
+                    return {
+                        success: false,
+                        error: `Unsupported file format '${ext}'. Supported formats: ${Object.keys(LLMPARSE_MIME_TYPES).join(', ')}`,
+                    };
+                }
+
+                // Absolute paths are read directly from disk; anything else is
+                // fetched through the workspace API as base64 and decoded.
+                // NOTE(review): a leading `~` is not expanded here, so such a path
+                // takes the workspace-relative branch — confirm callers resolve it.
+                let buffer: Buffer;
+                if (path.isAbsolute(filePath)) {
+                    buffer = await fs.readFile(filePath);
+                } else {
+                    const result = await workspace.readFile(filePath, 'base64');
+                    buffer = Buffer.from(result.data, 'base64');
+                }
+
+                const base64 = buffer.toString('base64');
+
+                // Resolve model config from DI container
+                const modelConfigRepo = container.resolve<IModelConfigRepo>('modelConfigRepo');
+                const modelConfig = await modelConfigRepo.getConfig();
+                const provider = createProvider(modelConfig.provider);
+                const model = provider.languageModel(modelConfig.model);
+
+                // A missing or whitespace-only prompt falls back to the default instruction.
+                const userPrompt = prompt?.trim()
+                    ? prompt
+                    : 'Convert this file to well-structured markdown.';
+
+                const response = await generateText({
+                    model,
+                    messages: [
+                        {
+                            role: 'user',
+                            content: [
+                                { type: 'text', text: userPrompt },
+                                { type: 'file', data: base64, mediaType: mimeType },
+                            ],
+                        },
+                    ],
+                });
+
+                return {
+                    success: true,
+                    fileName,
+                    format: ext.slice(1),
+                    mimeType,
+                    content: response.text,
+                    usage: response.usage,
+                };
+            } catch (error) {
+                return {
+                    success: false,
+                    error: error instanceof Error ? error.message : 'Unknown error',
+                };
+            }
+        },
+    },
+
     analyzeAgent: {
         description: 'Read and analyze an agent file to understand its structure, tools, and configuration',
         inputSchema: z.object({