Image upload (#270)

* added upload button

* image shows up when attached

* added automatic parsing of the image

* make generate image accept an input image

* move image description to debug message

* disable message sending if the image processing hasn't completed yet

* move to x icon for dismiss

* image description processing is stopped on image dismiss

* minor changes
This commit is contained in:
arkml 2025-09-19 21:00:25 +05:30 committed by GitHub
parent 109997ca2e
commit 8b38660a68
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 526 additions and 17 deletions

View file

@ -0,0 +1,75 @@
import { NextRequest, NextResponse } from 'next/server';
import { S3Client, GetObjectCommand, HeadObjectCommand } from '@aws-sdk/client-s3';
import { Readable } from 'stream';
/**
 * GET /api/uploaded-images/{id}
 *
 * Streams a previously uploaded image out of S3. The caller supplies only the
 * image id; the object key is rebuilt as
 * `uploaded_images/<a>/<b>/<id><ext>`, where `<a>`/`<b>` are the last two
 * characters of the id and `<ext>` is probed from a fixed list.
 *
 * Responses:
 *  - 400 when the id is missing or is not UUID-shaped
 *  - 500 when RAG_UPLOADS_S3_BUCKET is not configured
 *  - 404 when no object exists under any known extension, or the fetch fails
 *  - 200 with the object body, its stored Content-Type and long-lived caching
 *
 * @param request Incoming request (unused; required by the route signature).
 * @param props   Route props; `params` resolves to `{ id }`.
 */
export async function GET(request: NextRequest, props: { params: Promise<{ id: string }> }) {
    const { id } = await props.params;
    if (!id) {
        return NextResponse.json({ error: 'Missing id' }, { status: 400 });
    }
    // The id is interpolated into both the S3 key and the Content-Disposition
    // header, so only accept the UUID shape this route is documented to serve.
    if (!/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i.test(id)) {
        return NextResponse.json({ error: 'Invalid id' }, { status: 400 });
    }

    const bucket = process.env.RAG_UPLOADS_S3_BUCKET || '';
    if (!bucket) {
        return NextResponse.json({ error: 'S3 bucket not configured' }, { status: 500 });
    }

    const region = process.env.RAG_UPLOADS_S3_REGION || 'us-east-1';
    // Use explicit credentials only when both halves are present; otherwise let
    // the SDK resolve them from its default provider chain.
    const accessKeyId = process.env.AWS_ACCESS_KEY_ID;
    const secretAccessKey = process.env.AWS_SECRET_ACCESS_KEY;
    const s3 = new S3Client({
        region,
        credentials: accessKeyId && secretAccessKey ? { accessKeyId, secretAccessKey } : undefined,
    });

    // Reconstruct directory sharding from last two characters of UUID
    const last2 = id.slice(-2).padStart(2, '0');
    const dirA = last2.charAt(0);
    const dirB = last2.charAt(1);
    const baseKey = `uploaded_images/${dirA}/${dirB}/${id}`;

    // Try known extensions in order
    const exts = ['.png', '.jpg', '.webp', '.bin'];
    let foundExt: string | null = null;
    for (const ext of exts) {
        try {
            await s3.send(new HeadObjectCommand({ Bucket: bucket, Key: `${baseKey}${ext}` }));
            foundExt = ext;
            break;
        } catch (e: unknown) {
            // A missing key is expected while probing. Anything else (auth,
            // permissions, network) is still treated as a miss for this
            // extension, but is logged so misconfiguration is not invisible.
            const err = e as { name?: string; $metadata?: { httpStatusCode?: number } } | null;
            const isMissing = err?.name === 'NotFound' || err?.$metadata?.httpStatusCode === 404;
            if (!isMissing) {
                console.warn('S3 head error', e);
            }
        }
    }
    if (!foundExt) {
        return NextResponse.json({ error: 'Not found' }, { status: 404 });
    }

    const key = `${baseKey}${foundExt}`;
    const filename = `${id}${foundExt}`;
    try {
        const resp = await s3.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
        const contentType = resp.ContentType || 'application/octet-stream';
        const body = resp.Body as any;
        // Prefer the SDK's own web-stream conversion; fall back to Node's
        // Readable.toWeb, and finally hand the body through unchanged.
        const webStream = body?.transformToWebStream
            ? body.transformToWebStream()
            : (Readable as any)?.toWeb
                ? (Readable as any).toWeb(body)
                : body;
        return new NextResponse(webStream, {
            status: 200,
            headers: {
                'Content-Type': contentType,
                'Cache-Control': 'public, max-age=31536000, immutable',
                'Content-Disposition': `inline; filename="${filename}"`,
            },
        });
    } catch (e) {
        console.error('S3 get error', e);
        return NextResponse.json({ error: 'Not found' }, { status: 404 });
    }
}

View file

@ -0,0 +1,85 @@
import { NextRequest, NextResponse } from 'next/server';
import { S3Client, PutObjectCommand } from '@aws-sdk/client-s3';
import crypto from 'crypto';
import { tempBinaryCache } from '@/src/application/services/temp-binary-cache';
import { GoogleGenerativeAI } from '@google/generative-ai';
/**
 * POST /api/uploaded-images
 *
 * Accepts an image file (multipart/form-data, field name: "file").
 * If a Google/Gemini API key is configured, the image is first sent to Gemini
 * for a Markdown description; a failure there is logged and does not fail the
 * upload. The bytes are then stored either in S3 (when RAG_UPLOADS_S3_BUCKET is
 * set) under `uploaded_images/<a>/<b>/<uuid><ext>`, or in the in-memory temp
 * cache with a 10 minute TTL.
 *
 * Responses:
 *  - 400 when the body is not multipart/form-data or has no uploaded file
 *  - 500 when anything else throws
 *  - 200 JSON `{ url, storage, id, mimeType, description, expiresInSec? }`
 *
 * @param request Incoming multipart request.
 */
export async function POST(request: NextRequest) {
    try {
        const contentType = request.headers.get('content-type') || '';
        if (!contentType.includes('multipart/form-data')) {
            return NextResponse.json({ error: 'Expected multipart/form-data' }, { status: 400 });
        }

        const form = await request.formData();
        const file = form.get('file');
        // FormData entries are `File | string`. A plain text field named "file"
        // is not an upload; reject it here instead of failing later in
        // arrayBuffer() and surfacing a 500.
        if (!file || typeof file === 'string') {
            return NextResponse.json({ error: 'Missing file' }, { status: 400 });
        }

        const buf = Buffer.from(await file.arrayBuffer());
        const mime = file.type || 'application/octet-stream';

        // Optionally describe image with Gemini
        let descriptionMarkdown: string | null = null;
        try {
            const apiKey = process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY || '';
            if (apiKey) {
                const genAI = new GoogleGenerativeAI(apiKey);
                const model = genAI.getGenerativeModel({ model: 'gemini-2.5-flash' });
                const prompt = 'Describe this image in concise, high-quality Markdown. Focus on key objects, text, layout, style, colors, and any notable details. Do not include extra commentary or instructions.';
                const result = await model.generateContent([
                    { inlineData: { data: buf.toString('base64'), mimeType: mime } },
                    prompt,
                ]);
                descriptionMarkdown = result.response?.text?.() || null;
            }
        } catch (e) {
            // Best-effort: the upload still succeeds without a description.
            console.warn('Gemini description failed', e);
        }

        // If S3 configured, upload there
        const s3Bucket = process.env.RAG_UPLOADS_S3_BUCKET || '';
        if (s3Bucket) {
            const s3Region = process.env.RAG_UPLOADS_S3_REGION || 'us-east-1';
            // Use explicit credentials only when both halves are present;
            // otherwise let the SDK resolve them from its default chain.
            const accessKeyId = process.env.AWS_ACCESS_KEY_ID;
            const secretAccessKey = process.env.AWS_SECRET_ACCESS_KEY;
            const s3 = new S3Client({
                region: s3Region,
                credentials: accessKeyId && secretAccessKey ? { accessKeyId, secretAccessKey } : undefined,
            });

            // Unknown MIME types are stored with a generic ".bin" extension.
            const extByMime: Record<string, string> = {
                'image/jpeg': '.jpg',
                'image/webp': '.webp',
                'image/png': '.png',
            };
            const ext = extByMime[mime] ?? '.bin';

            const imageId = crypto.randomUUID();
            // Shard objects into two directory levels using the id's last two characters.
            const last2 = imageId.slice(-2).padStart(2, '0');
            const dirA = last2.charAt(0);
            const dirB = last2.charAt(1);
            const key = `uploaded_images/${dirA}/${dirB}/${imageId}${ext}`;
            await s3.send(new PutObjectCommand({
                Bucket: s3Bucket,
                Key: key,
                Body: buf,
                ContentType: mime,
            }));
            const url = `/api/uploaded-images/${imageId}`;
            return NextResponse.json({ url, storage: 's3', id: imageId, mimeType: mime, description: descriptionMarkdown });
        }

        // Otherwise store in temp cache and return temp URL
        const ttlSec = 10 * 60; // 10 minutes
        const id = tempBinaryCache.put(buf, mime, ttlSec * 1000);
        const url = `/api/tmp-images/${id}`;
        return NextResponse.json({ url, storage: 'temp', id, mimeType: mime, expiresInSec: ttlSec, description: descriptionMarkdown });
    } catch (e) {
        console.error('upload image error', e);
        return NextResponse.json({ error: 'Upload failed' }, { status: 500 });
    }
}

View file

@ -152,11 +152,24 @@ export function Chat({
}
}, []);
function handleUserMessage(prompt: string) {
const updatedMessages: z.infer<typeof Message>[] = [...messages, {
role: 'user',
content: prompt,
}];
function handleUserMessage(prompt: string, imageDebug?: { url: string; description?: string | null }) {
// Insert an internal-only debug message with image URL/markdown (if provided),
// then the actual user message last so streaming triggers correctly.
const debugMessages: z.infer<typeof Message>[] = imageDebug ? [{
role: 'assistant',
content: `Image Description\n\nURL: ${imageDebug.url}\n\n${imageDebug.description ? imageDebug.description : ''}`.trim(),
agentName: 'Image Description',
responseType: 'internal',
} as any] : [];
const updatedMessages: z.infer<typeof Message>[] = [
...messages,
...debugMessages,
{
role: 'user',
content: prompt,
} as any,
];
setMessages(updatedMessages);
setError(null);
setIsLastInteracted(true);
@ -229,9 +242,46 @@ export function Chat({
}
// set up a cached turn
// Merge-at-send: if the immediately preceding message is our internal
// Image Description debug message, append its details (URL/markdown)
// to the outgoing user message content, without changing the UI.
const last = messages[messages.length - 1];
let mergedContent = (typeof last?.content === 'string' ? last.content : '') || '';
if (messages.length >= 2) {
const prev = messages[messages.length - 2] as any;
const isImageDebug = prev && prev.role === 'assistant' && prev.responseType === 'internal' && prev.agentName === 'Image Description' && typeof prev.content === 'string';
if (isImageDebug) {
// Expect prev.content to have: "Image Description\n\nURL: <url>\n\n<markdown>"
// Extract URL and markdown blocks for a clean append
const content = prev.content as string;
let url: string | undefined;
let markdown: string | undefined;
const urlMatch = content.match(/URL:\s*(\S+)/i);
if (urlMatch) url = urlMatch[1];
// markdown is whatever comes after the blank line following URL
const parts = content.split(/\n\n/);
if (parts.length >= 3) {
markdown = parts.slice(2).join('\n\n').trim();
}
const appendSections: string[] = [];
if (url) appendSections.push(`The user uploaded an image. URL: ${url}`);
if (markdown) appendSections.push(`Image description (markdown):\n\n${markdown}`);
if (appendSections.length > 0) {
mergedContent = [mergedContent, appendSections.join('\n\n')]
.filter(Boolean)
.join('\n\n');
}
}
}
const messagesToSend: z.infer<typeof Message>[] = [{
role: 'user',
content: mergedContent,
} as any];
const response = await createCachedTurn({
conversationId: conversationId.current,
messages: messages.slice(-1), // only send the last message
messages: messagesToSend, // send merged content only
});
if (ignore) {
return;
@ -500,4 +550,4 @@ export function Chat({
/>
</div>
);
}
}

View file

@ -3,7 +3,7 @@ import { Textarea } from '@/components/ui/textarea';
import { Button, Spinner } from "@heroui/react";
interface ComposeBoxPlaygroundProps {
handleUserMessage: (message: string) => void;
handleUserMessage: (message: string, imageDebug?: { url: string; description?: string | null }) => void;
messages: any[];
loading: boolean;
disabled?: boolean;
@ -22,9 +22,12 @@ export function ComposeBoxPlayground({
onCancel,
}: ComposeBoxPlaygroundProps) {
const [input, setInput] = useState('');
const [uploading, setUploading] = useState(false);
const [pendingImage, setPendingImage] = useState<{ url?: string; previewSrc?: string; mimeType?: string; description?: string | null } | null>(null);
const [isFocused, setIsFocused] = useState(false);
const textareaRef = useRef<HTMLTextAreaElement>(null);
const previousMessagesLength = useRef(messages.length);
const uploadAbortRef = useRef<AbortController | null>(null);
// Handle auto-focus when new messages arrive
useEffect(() => {
@ -35,12 +38,27 @@ export function ComposeBoxPlayground({
}, [messages.length, shouldAutoFocus]);
function handleInput() {
const prompt = input.trim();
if (!prompt) {
// Mirror send-button disable rules to block Enter submits
if (disabled || loading || uploading) return;
if (pendingImage?.url && pendingImage.description === undefined) return;
const text = input.trim();
if (!text && !pendingImage) {
return;
}
// Only include the user's typed text; omit image URL/markdown from user message
const parts: string[] = [];
if (text) parts.push(text);
const prompt = parts.join('\n\n');
// Build optional debug payload to render as internal-only message in debug view
const imageDebug = pendingImage?.url
? { url: pendingImage.url, description: pendingImage.description ?? null }
: undefined;
setInput('');
handleUserMessage(prompt);
if (pendingImage?.previewSrc) {
try { URL.revokeObjectURL(pendingImage.previewSrc); } catch {}
}
setPendingImage(null);
handleUserMessage(prompt, imageDebug);
}
const handleInputKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
@ -55,6 +73,57 @@ export function ComposeBoxPlayground({
onFocus?.();
};
/**
 * Handles a file chosen from the image picker: shows a local preview
 * immediately, uploads the file to /api/uploaded-images, and on success stores
 * the returned URL, MIME type and description in `pendingImage`.
 *
 * Any upload still in flight is aborted first. If this upload is dismissed or
 * superseded while running, its result is discarded and it leaves the newer
 * upload's state untouched. On a real (non-abort) failure the preview is
 * removed so a half-attached image cannot be sent.
 *
 * @param file The image file selected by the user.
 */
async function handleImagePicked(file: File) {
    if (!file) return;

    // Cancel any in-flight request
    if (uploadAbortRef.current) {
        try { uploadAbortRef.current.abort(); } catch {}
        uploadAbortRef.current = null;
    }
    // Declared outside the try so catch/finally can tell whether this request
    // is still the active one.
    const controller = new AbortController();
    uploadAbortRef.current = controller;

    // Release the preview being replaced so its object URL does not leak.
    if (pendingImage?.previewSrc) {
        try { URL.revokeObjectURL(pendingImage.previewSrc); } catch {}
    }

    // Show immediate local preview
    const previewSrc = URL.createObjectURL(file);
    setPendingImage({ previewSrc });
    setUploading(true);

    try {
        const form = new FormData();
        form.append('file', file);
        const res = await fetch('/api/uploaded-images', {
            method: 'POST',
            body: form,
            signal: controller.signal,
        });
        if (!res.ok) {
            throw new Error(`Upload failed: ${res.status}`);
        }
        const data = await res.json();
        const url: string | undefined = data?.url;
        if (!url) throw new Error('No URL returned');
        // Only apply state if request wasn't aborted/dismissed
        if (uploadAbortRef.current === controller) {
            setPendingImage({ url, previewSrc, mimeType: data?.mimeType, description: data?.description ?? null });
        }
    } catch (e: unknown) {
        const errorName = (e as { name?: string } | null)?.name;
        if (errorName === 'AbortError') {
            // Swallow aborts
            console.log('Image upload/description aborted');
        } else {
            console.error('Image upload failed', e);
            // The upload never produced a URL: drop the preview (and its object
            // URL) so the composer does not keep showing an unusable image.
            if (uploadAbortRef.current === controller) {
                try { URL.revokeObjectURL(previewSrc); } catch {}
                setPendingImage(null);
            }
            alert('Image upload failed. Please try again.');
        }
    } finally {
        // Reset only when this request is still the active one, or when it was
        // dismissed (ref already null). If a newer upload owns the ref, its
        // spinner and abort handle must be left alone.
        const active = uploadAbortRef.current;
        if (active === null || active === controller) {
            setUploading(false);
            uploadAbortRef.current = null;
        }
    }
}
return (
<div className="relative group">
{/* Keyboard shortcut hint */}
@ -68,6 +137,33 @@ export function ComposeBoxPlayground({
bg-white dark:bg-[#1e2023] flex items-end gap-2">
{/* Textarea */}
<div className="flex-1">
{pendingImage && (
<div className="mb-2 inline-block relative">
<img
src={pendingImage.previewSrc || pendingImage.url}
alt="Uploaded image preview"
className="w-16 h-16 object-cover rounded border border-gray-200 dark:border-gray-700"
/>
<button
type="button"
aria-label="Remove image"
className="absolute -top-1 -right-1 p-1 rounded-full bg-white dark:bg-zinc-900 border border-gray-200 dark:border-gray-700 shadow hover:bg-gray-50 dark:hover:bg-zinc-800"
onClick={() => {
if (pendingImage?.previewSrc) {
try { URL.revokeObjectURL(pendingImage.previewSrc); } catch {}
}
if (uploadAbortRef.current) {
try { uploadAbortRef.current.abort(); } catch {}
uploadAbortRef.current = null;
}
setUploading(false);
setPendingImage(null);
}}
>
<XIcon size={12} />
</button>
</div>
)}
<Textarea
ref={textareaRef}
value={input}
@ -95,11 +191,37 @@ export function ComposeBoxPlayground({
/>
</div>
{/* Image upload button (moved to the right) */}
<label className={`
flex items-center justify-center w-9 h-9 rounded-lg cursor-pointer
${uploading ? 'bg-gray-100 dark:bg-gray-800 text-gray-400' : 'bg-gray-100 hover:bg-gray-200 dark:bg-gray-800 dark:hover:bg-gray-700 text-gray-600 dark:text-gray-300'}
transition-colors
`}>
<input
type="file"
accept="image/*"
className="hidden"
disabled={disabled || loading || uploading}
onChange={(e) => {
const f = e.target.files?.[0];
if (f) handleImagePicked(f);
e.currentTarget.value = '';
}}
/>
{uploading ? <Spinner size="sm" /> : <ImageIcon size={16} />}
</label>
{/* Send/Stop button */}
<Button
size="sm"
isIconOnly
disabled={disabled || (loading ? false : !input.trim())}
disabled={
disabled
|| uploading
// If an image is selected but description isn't ready yet, keep disabled
|| (pendingImage?.url && pendingImage.description === undefined)
// When not loading a response, require either text or a ready image
|| (loading ? false : (!input.trim() && !pendingImage))
}
onPress={loading ? onCancel : handleInput}
className={`
transition-all duration-200
@ -163,4 +285,43 @@ function StopIcon({ size, className }: { size: number, className?: string }) {
<rect x="6" y="6" width="12" height="12" rx="1" />
</svg>
);
}
}
/**
 * Outline "picture" icon drawn on a 24x24 viewBox: a rounded frame, a small
 * circle and a diagonal ridge line. Rendered as a square of `size` pixels and
 * stroked with the current text colour.
 */
function ImageIcon(props: { size: number, className?: string }) {
    const { size: edge, className: extraClass } = props;
    // Stroke presentation attributes applied to the root <svg>.
    const outline = {
        fill: "none",
        stroke: "currentColor",
        strokeWidth: "2",
        strokeLinecap: "round",
        strokeLinejoin: "round",
    } as const;

    return (
        <svg width={edge} height={edge} viewBox="0 0 24 24" {...outline} className={extraClass}>
            <rect x="3" y="3" width="18" height="18" rx="2" ry="2" />
            <circle cx="8.5" cy="8.5" r="1.5" />
            <path d="M21 15l-5-5L5 21" />
        </svg>
    );
}
/**
 * Outline "X" icon drawn on a 24x24 viewBox as two crossing diagonal lines.
 * Rendered as a square of `size` pixels and stroked with the current text
 * colour.
 */
function XIcon(props: { size: number, className?: string }) {
    const { size: edge, className: extraClass } = props;
    // Stroke presentation attributes applied to the root <svg>.
    const outline = {
        fill: "none",
        stroke: "currentColor",
        strokeWidth: "2",
        strokeLinecap: "round",
        strokeLinejoin: "round",
    } as const;

    return (
        <svg width={edge} height={edge} viewBox="0 0 24 24" {...outline} className={extraClass}>
            <line x1="18" y1="6" x2="6" y2="18" />
            <line x1="6" y1="6" x2="18" y2="18" />
        </svg>
    );
}

View file

@ -8,7 +8,7 @@ import { SignJWT } from "jose";
import crypto from "crypto";
import { GoogleGenerativeAI } from "@google/generative-ai";
import { tempBinaryCache } from "@/src/application/services/temp-binary-cache";
import { S3Client, PutObjectCommand } from "@aws-sdk/client-s3";
import { S3Client, PutObjectCommand, GetObjectCommand, HeadObjectCommand } from "@aws-sdk/client-s3";
// Internal dependencies
import { embeddingModel } from "@/app/lib/embedding";
@ -44,6 +44,7 @@ export async function invokeGenerateImageTool(
prompt: string,
options?: {
modelName?: string;
inputImageUrl?: string;
}
): Promise<{
texts: string[];
@ -62,7 +63,140 @@ export async function invokeGenerateImageTool(
const model = client.getGenerativeModel({ model: modelName });
log.log(`Generating image with model: ${modelName}`);
const result = await model.generateContent(prompt);
let result: any;
const inputImageUrl = options?.inputImageUrl;
if (inputImageUrl) {
try {
// Resolve the image into inlineData for Gemini
let imageBuf: Buffer | null = null;
let imageMime: string = 'image/png';
if (inputImageUrl.startsWith('/api/tmp-images/')) {
const id = inputImageUrl.split('/api/tmp-images/')[1];
const entry = tempBinaryCache.get(id);
if (entry) {
imageBuf = entry.buf;
imageMime = entry.mimeType || imageMime;
}
} else if (inputImageUrl.startsWith('/api/uploaded-images/')) {
const bucket = process.env.RAG_UPLOADS_S3_BUCKET || '';
if (bucket) {
const region = process.env.RAG_UPLOADS_S3_REGION || 'us-east-1';
const s3 = new S3Client({
region,
credentials: process.env.AWS_ACCESS_KEY_ID && process.env.AWS_SECRET_ACCESS_KEY ? {
accessKeyId: process.env.AWS_ACCESS_KEY_ID,
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
} as any : undefined,
});
const id = inputImageUrl.split('/api/uploaded-images/')[1];
const last2 = id.slice(-2).padStart(2, '0');
const dirA = last2.charAt(0);
const dirB = last2.charAt(1);
const baseKey = `uploaded_images/${dirA}/${dirB}/${id}`;
const exts = ['.png', '.jpg', '.webp', '.bin'];
let foundExt: string | null = null;
for (const ext of exts) {
try {
await s3.send(new HeadObjectCommand({ Bucket: bucket, Key: `${baseKey}${ext}` }));
foundExt = ext; break;
} catch {}
}
if (foundExt) {
const key = `${baseKey}${foundExt}`;
const resp = await s3.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
const chunks: Buffer[] = [];
const body = resp.Body as any;
const nodeStream = typeof body?.pipe === 'function' ? body : undefined;
if (nodeStream) {
imageMime = resp.ContentType || imageMime;
await new Promise<void>((resolve, reject) => {
nodeStream.on('data', (c: Buffer) => chunks.push(Buffer.isBuffer(c) ? c : Buffer.from(c)));
nodeStream.on('end', () => resolve());
nodeStream.on('error', reject);
});
imageBuf = Buffer.concat(chunks);
}
}
}
} else if (inputImageUrl.startsWith('/api/generated-images/')) {
const bucket = process.env.RAG_UPLOADS_S3_BUCKET || '';
if (bucket) {
const region = process.env.RAG_UPLOADS_S3_REGION || 'us-east-1';
const s3 = new S3Client({
region,
credentials: process.env.AWS_ACCESS_KEY_ID && process.env.AWS_SECRET_ACCESS_KEY ? {
accessKeyId: process.env.AWS_ACCESS_KEY_ID,
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
} as any : undefined,
});
const id = inputImageUrl.split('/api/generated-images/')[1];
const last2 = id.slice(-2).padStart(2, '0');
const dirA = last2.charAt(0);
const dirB = last2.charAt(1);
const baseKey = `generated_images/${dirA}/${dirB}/${id}`;
const exts = ['.png', '.jpg', '.webp'];
let foundExt: string | null = null;
for (const ext of exts) {
try {
await s3.send(new HeadObjectCommand({ Bucket: bucket, Key: `${baseKey}${ext}` }));
foundExt = ext; break;
} catch {}
}
if (foundExt) {
const key = `${baseKey}${foundExt}`;
const resp = await s3.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
const chunks: Buffer[] = [];
const body = resp.Body as any;
const nodeStream = typeof body?.pipe === 'function' ? body : undefined;
if (nodeStream) {
imageMime = resp.ContentType || imageMime;
await new Promise<void>((resolve, reject) => {
nodeStream.on('data', (c: Buffer) => chunks.push(Buffer.isBuffer(c) ? c : Buffer.from(c)));
nodeStream.on('end', () => resolve());
nodeStream.on('error', reject);
});
imageBuf = Buffer.concat(chunks);
}
}
}
} else if (inputImageUrl.startsWith('data:')) {
// data URL
const m = inputImageUrl.match(/^data:([^;]+);base64,(.*)$/);
if (m) {
imageMime = m[1];
imageBuf = Buffer.from(m[2], 'base64');
}
} else if (/^https?:\/\//.test(inputImageUrl)) {
// Best-effort network fetch (may fail if egress restricted)
try {
const resp = await fetch(inputImageUrl);
const ab = await resp.arrayBuffer();
imageBuf = Buffer.from(ab);
imageMime = resp.headers.get('content-type') || imageMime;
} catch {
// ignore
}
}
if (imageBuf) {
const parts: any[] = [
{ inlineData: { data: imageBuf.toString('base64'), mimeType: imageMime } },
prompt,
];
result = await model.generateContent(parts as any);
} else {
// Fallback to text-only
result = await model.generateContent(prompt);
}
} catch (e) {
log.log('Falling back to text-only generation due to input image error');
result = await model.generateContent(prompt);
}
} else {
result = await model.generateContent(prompt);
}
const response = result.response as any;
// Track usage if available
@ -627,7 +761,10 @@ export function createGenerateImageTool(
strict: false,
parameters: {
type: 'object',
properties: parameters.properties,
properties: {
...parameters.properties,
input_image_url: { type: 'string', description: 'Optional URL of an input image to condition generation.' },
},
required: parameters.required || [],
additionalProperties: true,
},
@ -638,11 +775,12 @@ export function createGenerateImageTool(
return JSON.stringify({ error: "Missing required field: prompt" });
}
const modelName: string | undefined = input?.modelName;
const inputImageUrl: string | undefined = input?.input_image_url;
const result = await invokeGenerateImageTool(
logger,
usageTracker,
prompt,
{ modelName }
{ modelName, inputImageUrl }
);
// If S3 bucket configured, store in S3 under generated_images/<c>/<d>/<filename>
const s3Bucket = process.env.RAG_UPLOADS_S3_BUCKET || '';