Support multimodal chat with pending screen images on web

This commit is contained in:
CREDO23 2026-04-24 19:17:43 +02:00
parent a7d3e4ff18
commit 3f97b77ab6
6 changed files with 285 additions and 36 deletions

View file

@ -6,6 +6,7 @@ import { useTranslations } from "next-intl";
import type React from "react";
import { useCallback, useEffect, useRef, useState } from "react";
import { toast } from "sonner";
import { pendingUserImageDataUrlsAtom } from "@/atoms/chat/pending-user-images.atom";
import { myAccessAtom } from "@/atoms/members/members-query.atoms";
import { updateLLMPreferencesMutationAtom } from "@/atoms/new-llm-config/new-llm-config-mutation.atoms";
import {
@ -33,6 +34,7 @@ export function DashboardClientLayout({
const pathname = usePathname();
const { search_space_id } = useParams();
const setActiveSearchSpaceIdState = useSetAtom(activeSearchSpaceIdAtom);
const setPendingUserImageUrls = useSetAtom(pendingUserImageDataUrlsAtom);
const {
data: preferences = {},
@ -142,6 +144,14 @@ export function DashboardClientLayout({
const electronAPI = useElectronAPI();
useEffect(() => {
if (!electronAPI?.onChatScreenCapture) return;
return electronAPI.onChatScreenCapture((dataUrl: string) => {
if (typeof dataUrl !== "string" || !dataUrl.startsWith("data:image/")) return;
setPendingUserImageUrls((prev) => [...prev, dataUrl]);
});
}, [electronAPI, setPendingUserImageUrls]);
useEffect(() => {
const activeSeacrhSpaceId =
typeof search_space_id === "string"

View file

@ -26,6 +26,7 @@ import {
messageDocumentsMapAtom,
sidebarSelectedDocumentsAtom,
} from "@/atoms/chat/mentioned-documents.atom";
import { pendingUserImageDataUrlsAtom } from "@/atoms/chat/pending-user-images.atom";
import {
clearPlanOwnerRegistry,
// extractWriteTodosFromContent,
@ -45,8 +46,8 @@ import {
} from "@/components/assistant-ui/token-usage-context";
import { useChatSessionStateSync } from "@/hooks/use-chat-session-state";
import { useMessagesSync } from "@/hooks/use-messages-sync";
import { documentsApiService } from "@/lib/apis/documents-api.service";
import { getAgentFilesystemSelection } from "@/lib/agent-filesystem";
import { documentsApiService } from "@/lib/apis/documents-api.service";
import { getBearerToken } from "@/lib/auth-utils";
import { convertToThreadMessage } from "@/lib/chat/message-utils";
import {
@ -76,6 +77,7 @@ import {
type ThreadListResponse,
type ThreadRecord,
} from "@/lib/chat/thread-persistence";
import { extractUserTurnForNewChatApi } from "@/lib/chat/user-turn-api-parts";
import { NotFoundError } from "@/lib/error";
import {
trackChatCreated,
@ -231,6 +233,8 @@ export default function NewChatPage() {
const updateChatTabTitle = useSetAtom(updateChatTabTitleAtom);
const removeChatTab = useSetAtom(removeChatTabAtom);
const setAgentCreatedDocuments = useSetAtom(agentCreatedDocumentsAtom);
const pendingUserImageUrls = useAtomValue(pendingUserImageDataUrlsAtom);
const setPendingUserImageUrls = useSetAtom(pendingUserImageDataUrlsAtom);
// Get current user for author info in shared chats
const { data: currentUser } = useAtomValue(currentUserAtom);
@ -494,18 +498,13 @@ export default function NewChatPage() {
abortControllerRef.current = null;
}
// Extract user query text from content parts
let userQuery = "";
for (const part of message.content) {
if (part.type === "text") {
userQuery += part.text;
}
}
const urlsSnapshot = [...pendingUserImageUrls];
setPendingUserImageUrls([]);
const { userQuery, userImages } = extractUserTurnForNewChatApi(message, urlsSnapshot);
if (!userQuery.trim()) return;
if (!userQuery.trim() && userImages.length === 0) return;
// Check if podcast is already generating
if (isPodcastGenerating() && looksLikePodcastRequest(userQuery)) {
if (userQuery.trim() && isPodcastGenerating() && looksLikePodcastRequest(userQuery)) {
toast.warning("A podcast is already being generated.");
return;
}
@ -560,10 +559,27 @@ export default function NewChatPage() {
}
: undefined;
const existingImageUrls = new Set(
message.content
.filter(
(p): p is { type: "image"; image: string } =>
typeof p === "object" &&
p !== null &&
"type" in p &&
p.type === "image" &&
"image" in p
)
.map((p) => p.image)
);
const extraImageParts = urlsSnapshot
.filter((u) => !existingImageUrls.has(u))
.map((image) => ({ type: "image" as const, image }));
const userDisplayContent = [...message.content, ...extraImageParts];
const userMessage: ThreadMessageLike = {
id: userMsgId,
role: "user",
content: message.content,
content: userDisplayContent,
createdAt: new Date(),
metadata: authorMetadata,
};
@ -571,7 +587,7 @@ export default function NewChatPage() {
// Track message sent
trackChatMessageSent(searchSpaceId, currentThreadId, {
hasAttachments: false,
hasAttachments: userImages.length > 0,
hasMentionedDocuments:
mentionedDocumentIds.surfsense_doc_ids.length > 0 ||
mentionedDocumentIds.document_ids.length > 0,
@ -596,7 +612,7 @@ export default function NewChatPage() {
}));
}
const persistContent: unknown[] = [...message.content];
const persistContent: unknown[] = [...userDisplayContent];
if (allMentionedDocs.length > 0) {
persistContent.push({
@ -661,8 +677,7 @@ export default function NewChatPage() {
const selection = await getAgentFilesystemSelection();
if (
selection.filesystem_mode === "desktop_local_folder" &&
(!selection.local_filesystem_mounts ||
selection.local_filesystem_mounts.length === 0)
(!selection.local_filesystem_mounts || selection.local_filesystem_mounts.length === 0)
) {
toast.error("Select a local folder before using Local Folder mode.");
return;
@ -711,6 +726,7 @@ export default function NewChatPage() {
? mentionedDocumentIds.surfsense_doc_ids
: undefined,
disabled_tools: disabledTools.length > 0 ? disabledTools : undefined,
...(userImages.length > 0 ? { user_images: userImages } : {}),
}),
signal: controller.signal,
});
@ -842,14 +858,7 @@ export default function NewChatPage() {
});
} else {
const tcId = `interrupt-${action.name}`;
addToolCall(
contentPartsState,
toolsWithUI,
tcId,
action.name,
action.args,
true
);
addToolCall(contentPartsState, toolsWithUI, tcId, action.name, action.args, true);
updateToolCall(contentPartsState, tcId, {
result: { __interrupt__: true, ...interruptData },
});
@ -989,6 +998,9 @@ export default function NewChatPage() {
disabledTools,
updateChatTabTitle,
tokenUsageStore,
pendingUserImageUrls,
setPendingUserImageUrls,
toolsWithUI,
]
);
@ -1189,14 +1201,7 @@ export default function NewChatPage() {
});
} else {
const tcId = `interrupt-${action.name}`;
addToolCall(
contentPartsState,
toolsWithUI,
tcId,
action.name,
action.args,
true
);
addToolCall(contentPartsState, toolsWithUI, tcId, action.name, action.args, true);
updateToolCall(contentPartsState, tcId, {
result: {
__interrupt__: true,
@ -1261,7 +1266,7 @@ export default function NewChatPage() {
abortControllerRef.current = null;
}
},
[pendingInterrupt, messages, searchSpaceId, tokenUsageStore]
[pendingInterrupt, messages, searchSpaceId, tokenUsageStore, toolsWithUI]
);
useEffect(() => {
@ -1588,7 +1593,7 @@ export default function NewChatPage() {
abortControllerRef.current = null;
}
},
[threadId, searchSpaceId, messages, disabledTools, tokenUsageStore]
[threadId, searchSpaceId, messages, disabledTools, tokenUsageStore, toolsWithUI]
);
// Handle editing a message - truncates history and regenerates with new query

View file

@ -0,0 +1,3 @@
import { atom } from "jotai";
/**
 * Queue of image data URLs (e.g. screen captures) waiting to go out with the next chat message.
 *
 * Starts empty. Capture handlers append to it, the composer renders it as removable
 * thumbnails, and the chat page snapshots and clears it when a message is sent.
 */
export const pendingUserImageDataUrlsAtom = atom<string[]>([]);

View file

@ -16,6 +16,7 @@ import {
ChevronUp,
Clipboard,
Dot,
Camera,
Globe,
Plus,
Settings2,
@ -40,6 +41,7 @@ import {
mentionedDocumentsAtom,
sidebarSelectedDocumentsAtom,
} from "@/atoms/chat/mentioned-documents.atom";
import { pendingUserImageDataUrlsAtom } from "@/atoms/chat/pending-user-images.atom";
import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms";
import { connectorsAtom } from "@/atoms/connectors/connector-query.atoms";
import { documentsSidebarOpenAtom } from "@/atoms/documents/ui.atoms";
@ -89,6 +91,7 @@ import { useBatchCommentsPreload } from "@/hooks/use-comments";
import { useCommentsSync } from "@/hooks/use-comments-sync";
import { useMediaQuery } from "@/hooks/use-media-query";
import { useElectronAPI } from "@/hooks/use-platform";
import { captureDisplayToPngDataUrl } from "@/lib/chat/display-media-capture";
import { SLIDEOUT_PANEL_OPENED_EVENT } from "@/lib/layout-events";
import { cn } from "@/lib/utils";
@ -295,6 +298,32 @@ const ConnectToolsBanner: FC<{ isThreadEmpty: boolean }> = ({ isThreadEmpty }) =
);
};
/**
 * Thumbnail row for the images queued in `pendingUserImageDataUrlsAtom`.
 *
 * Renders nothing while the queue is empty. Each thumbnail carries a button that removes
 * that entry (by position) from the queue.
 *
 * Keys are built from the position plus the URL length instead of the data URL itself:
 * the same capture can be queued twice, which would produce duplicate React keys, and a
 * data URL can be megabytes long. The tiles hold no local state, so position-based keys
 * are safe when an entry is removed.
 */
const PendingScreenImageStrip: FC = () => {
	const [urls, setUrls] = useAtom(pendingUserImageDataUrlsAtom);
	if (urls.length === 0) return null;
	return (
		<div className="mx-3 mt-2 flex flex-wrap gap-2">
			{urls.map((url, index) => (
				<div
					key={`${index}-${url.length}`}
					className="group relative h-14 w-14 shrink-0 overflow-hidden rounded-md border border-border/50 bg-muted"
				>
					{/* biome-ignore lint/performance/noImgElement: data URL thumbnails from capture */}
					<img src={url} alt="" className="size-full object-cover" draggable={false} />
					<button
						type="button"
						onClick={() => setUrls((prev) => prev.filter((_, i) => i !== index))}
						className="absolute right-0.5 top-0.5 flex size-5 items-center justify-center rounded-full bg-background/90 text-muted-foreground shadow-sm transition-opacity hover:text-foreground sm:opacity-0 sm:group-hover:opacity-100"
						aria-label="Remove screenshot"
					>
						<X className="size-3" />
					</button>
				</div>
			))}
		</div>
	);
};
const ClipboardChip: FC<{ text: string; onDismiss: () => void }> = ({ text, onDismiss }) => {
const [expanded, setExpanded] = useState(false);
const isLong = text.length > 120;
@ -702,6 +731,7 @@ const Composer: FC = () => {
</div>
)}
<div className="aui-composer-attachment-dropzone flex w-full flex-col overflow-hidden rounded-2xl border-input bg-muted pt-2 outline-none transition-shadow">
<PendingScreenImageStrip />
{clipboardInitialText && (
<ClipboardChip
text={clipboardInitialText}
@ -761,11 +791,21 @@ const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false
},
[]
);
const pendingScreenImages = useAtomValue(pendingUserImageDataUrlsAtom);
const setPendingScreenImages = useSetAtom(pendingUserImageDataUrlsAtom);
const electronAPI = useElectronAPI();
const isComposerTextEmpty = useAuiState(({ composer }) => {
const text = composer.text?.trim() || "";
return text.length === 0;
});
const isComposerEmpty = isComposerTextEmpty && mentionedDocuments.length === 0;
const isComposerEmpty =
isComposerTextEmpty && mentionedDocuments.length === 0 && pendingScreenImages.length === 0;
const handleScreenCapture = useCallback(async () => {
const url = await captureDisplayToPngDataUrl();
if (url) setPendingScreenImages((prev) => [...prev, url]);
}, [setPendingScreenImages]);
const { data: userConfigs } = useAtomValue(newLLMConfigsAtom);
const { data: globalConfigs } = useAtomValue(globalNewLLMConfigsAtom);
@ -1201,6 +1241,20 @@ const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false
</div>
)}
<div className="flex items-center gap-2">
{/* Electron: native shortcut → pending images; skip in-webview getDisplayMedia. */}
{!electronAPI && (
<TooltipIconButton
tooltip="Capture screen"
type="button"
variant="ghost"
size="icon"
className="size-8 rounded-full"
aria-label="Capture screen"
onClick={() => void handleScreenCapture()}
>
<Camera className="size-4" />
</TooltipIconButton>
)}
<AuiIf condition={({ thread }) => !thread.isRunning}>
<ComposerPrimitive.Send asChild disabled={isSendDisabled}>
<TooltipIconButton
@ -1210,7 +1264,7 @@ const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false
: !hasModelConfigured
? "Please select a model from the header to start chatting"
: isComposerEmpty
? "Enter a message to send"
? "Enter a message or add a screenshot to send"
: "Send message"
}
side="bottom"

View file

@ -0,0 +1,120 @@
/** `getDisplayMedia` → single PNG frame (data URL). */
/**
 * Look up the `ImageCapture` constructor on `window`.
 *
 * @returns The constructor when `window.ImageCapture` is a function; `undefined` when there
 * is no `window` or the property is missing or not a function.
 */
function getImageCaptureCtor():
	| (new (
			track: MediaStreamTrack
	  ) => { grabFrame: () => Promise<ImageBitmap> })
	| undefined {
	type FrameGrabberCtor = new (
		track: MediaStreamTrack
	) => { grabFrame: () => Promise<ImageBitmap> };

	if (typeof window === "undefined") {
		return undefined;
	}
	// Read through a structural cast so this compiles whether or not the DOM typings declare it.
	const host = window as unknown as { ImageCapture?: FrameGrabberCtor };
	const candidate = host.ImageCapture;
	if (typeof candidate !== "function") {
		return undefined;
	}
	return candidate;
}
/** Call `stop()` on every track returned by `stream.getTracks()`, in order. */
function stopAllTracks(stream: MediaStream): void {
	stream.getTracks().forEach((track) => {
		track.stop();
	});
}
/**
 * Grab one frame from a display-capture stream and encode it as a PNG data URL.
 *
 * Tries `ImageCapture.grabFrame()` first when the constructor is available. If anything in
 * that path throws, falls back to playing the stream in a detached `<video>` element and
 * drawing its current frame to a canvas.
 *
 * Every track of `stream` is stopped before this function settles: on success, on a `null`
 * result, and when an error propagates to the caller.
 *
 * @param track - Video track handed to `ImageCapture`.
 * @param stream - Stream that owns `track`; feeds the `<video>` fallback and is always stopped.
 * @returns A PNG data URL, or `null` when no 2D canvas context is available or the video
 * reports a zero width/height.
 */
async function captureTrackToPngDataUrl(
	track: MediaStreamTrack,
	stream: MediaStream
): Promise<string | null> {
	// Draw `source` onto a fresh canvas of the given size; null when no 2D context exists.
	const drawToPng = (source: CanvasImageSource, width: number, height: number): string | null => {
		const canvas = document.createElement("canvas");
		canvas.width = width;
		canvas.height = height;
		const ctx = canvas.getContext("2d");
		if (!ctx) return null;
		ctx.drawImage(source, 0, 0);
		return canvas.toDataURL("image/png");
	};

	try {
		const ImageCtor = getImageCaptureCtor();
		if (ImageCtor !== undefined) {
			try {
				const bitmap = await new ImageCtor(track).grabFrame();
				try {
					return drawToPng(bitmap, bitmap.width, bitmap.height);
				} finally {
					if ("close" in bitmap && typeof bitmap.close === "function") {
						bitmap.close();
					}
				}
			} catch {
				/* fall through to <video>; the stream is still live because tracks stop only in the outer finally */
			}
		}

		const videoEl = document.createElement("video");
		videoEl.srcObject = stream;
		videoEl.muted = true;
		let waitTimer: ReturnType<typeof setTimeout> | undefined;
		try {
			const haveCurrentData = 2; // HTMLMediaElement.HAVE_CURRENT_DATA
			const dataReady = new Promise<void>((resolve) => {
				if (videoEl.readyState >= haveCurrentData) {
					resolve();
					return;
				}
				videoEl.addEventListener("loadeddata", () => resolve(), { once: true });
			});
			await videoEl.play();
			// Do not wait forever for the first frame; after 500 ms read whatever is there.
			await Promise.race([
				dataReady,
				new Promise<void>((resolve) => {
					waitTimer = setTimeout(resolve, 500);
				}),
			]);
			const w = videoEl.videoWidth;
			const h = videoEl.videoHeight;
			if (!w || !h) return null;
			return drawToPng(videoEl, w, h);
		} finally {
			if (waitTimer !== undefined) clearTimeout(waitTimer);
			// Detach the stream so the throwaway element does not keep it referenced.
			videoEl.pause();
			videoEl.srcObject = null;
		}
	} finally {
		stopAllTracks(stream);
	}
}
/**
 * Ask the browser for a display-capture stream and return one frame as a PNG data URL.
 *
 * @returns The data URL produced by `captureTrackToPngDataUrl`, or `null` when
 * `getDisplayMedia` is unavailable, the stream has no video track, or anything throws.
 * Thrown errors are logged with `console.warn` outside production builds. A stream that
 * is still held at that point has its tracks stopped.
 */
export async function captureDisplayToPngDataUrl(): Promise<string | null> {
	const canCapture =
		typeof navigator !== "undefined" && Boolean(navigator.mediaDevices?.getDisplayMedia);
	if (!canCapture) {
		return null;
	}

	const constraints = {
		video: { frameRate: { ideal: 1, max: 5 } },
		audio: false,
		selfBrowserSurface: "exclude",
	} as Parameters<MediaDevices["getDisplayMedia"]>[0];

	// Held only while this function is responsible for stopping the tracks.
	let liveStream: MediaStream | undefined;
	try {
		liveStream = await navigator.mediaDevices.getDisplayMedia(constraints);
		const [firstVideoTrack] = liveStream.getVideoTracks();
		if (!firstVideoTrack) {
			stopAllTracks(liveStream);
			return null;
		}
		const png = await captureTrackToPngDataUrl(firstVideoTrack, liveStream);
		liveStream = undefined;
		return png;
	} catch (err) {
		const isProduction = !(typeof process !== "undefined" && process.env?.NODE_ENV !== "production");
		if (!isProduction) {
			console.warn("[captureDisplayToPngDataUrl]", err);
		}
		if (liveStream) {
			stopAllTracks(liveStream);
		}
		return null;
	}
}

View file

@ -0,0 +1,57 @@
import type { AppendMessage } from "@assistant-ui/react";
/** Upper bound on the number of images returned for a single user turn. */
const MAX_IMAGES = 4;

/** One inline image for the new-chat request: canonical MIME type plus bare base64 data. */
export type NewChatUserImagePayload = {
	media_type: "image/png" | "image/jpeg" | "image/webp";
	data: string;
};

/** Base64 data URL for an accepted image type (case-insensitive, `image/jpg` tolerated). */
const IMAGE_DATA_URL_PATTERN = /^data:(image\/(?:png|jpeg|webp|jpg));base64,([\s\S]+)$/i;

/** Maps a lower-cased MIME type from a data URL to the canonical payload media type. */
const CANONICAL_MEDIA_TYPES = new Map<string, NewChatUserImagePayload["media_type"]>([
	["image/png", "image/png"],
	["image/jpeg", "image/jpeg"],
	["image/jpg", "image/jpeg"],
	["image/webp", "image/webp"],
]);

/**
 * Parse a base64 image data URL into an API payload.
 *
 * @param dataUrl - Candidate data URL; surrounding whitespace is ignored.
 * @returns The payload with whitespace stripped from the base64 body, or `null` when the
 * input is not a PNG/JPEG/WebP base64 data URL or its body is empty after stripping.
 */
function dataUrlToPayload(dataUrl: string): NewChatUserImagePayload | null {
	const match = IMAGE_DATA_URL_PATTERN.exec(dataUrl.trim());
	if (!match) return null;
	const [, rawType = "", rawData = ""] = match;
	const mediaType = CANONICAL_MEDIA_TYPES.get(rawType.toLowerCase());
	if (mediaType === undefined) return null;
	const data = rawData.replace(/\s/g, "");
	if (!data) return null;
	return { media_type: mediaType, data };
}

/**
 * Collect the string `image` values of every image part in a message.
 *
 * The strings are returned unvalidated; the caller parses each one exactly once.
 */
function collectImageDataUrlsFromParts(parts: AppendMessage["content"]): string[] {
	const urls: string[] = [];
	for (const part of parts) {
		if (typeof part !== "object" || part === null || !("type" in part)) continue;
		if (part.type !== "image") continue;
		if ("image" in part && typeof part.image === "string") urls.push(part.image);
	}
	return urls;
}

/**
 * Split a composer message into the text query and the image payloads for the new-chat API.
 *
 * @param message - Message being appended; its text parts are concatenated in order.
 * @param extraDataUrls - Additional image data URLs (e.g. pending screen captures). They are
 * considered before the images already present in `message.content`.
 * @returns `userQuery` (concatenated text, untrimmed) and `userImages`: at most `MAX_IMAGES`
 * payloads, in input order, skipping entries that are not accepted image data URLs and
 * exact duplicates (same media type and same base64 data).
 */
export function extractUserTurnForNewChatApi(
	message: AppendMessage,
	extraDataUrls: readonly string[]
): { userQuery: string; userImages: NewChatUserImagePayload[] } {
	let userQuery = "";
	for (const part of message.content) {
		if (part.type === "text") {
			userQuery += part.text;
		}
	}

	const candidates = [...extraDataUrls, ...collectImageDataUrlsFromParts(message.content)];
	const userImages: NewChatUserImagePayload[] = [];
	const seen = new Set<string>();
	for (const url of candidates) {
		if (userImages.length >= MAX_IMAGES) break;
		const payload = dataUrlToPayload(url);
		if (!payload) continue;
		// Key on the bytes themselves: keying on the length alone would discard distinct
		// images that merely happen to have base64 payloads of equal size.
		const key = `${payload.media_type}:${payload.data}`;
		if (seen.has(key)) continue;
		seen.add(key);
		userImages.push(payload);
	}
	return { userQuery, userImages };
}