add vision LLM role for screenshot analysis

This commit is contained in:
CREDO23 2026-04-03 17:40:27 +02:00
parent 8ba571566d
commit 482238e5d4
7 changed files with 91 additions and 2 deletions

View file

@ -0,0 +1,39 @@
"""117_add_vision_llm_id_to_search_spaces
Revision ID: 117
Revises: 116
Adds vision_llm_id column to search_spaces for vision/screenshot analysis
LLM role assignment. Defaults to 0 (Auto mode), same convention as
agent_llm_id and document_summary_llm_id.
"""
from __future__ import annotations
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
revision: str = "117"
down_revision: str | None = "116"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
conn = op.get_bind()
existing_columns = [
col["name"] for col in sa.inspect(conn).get_columns("search_spaces")
]
if "vision_llm_id" not in existing_columns:
op.add_column(
"search_spaces",
sa.Column("vision_llm_id", sa.Integer(), nullable=True, server_default="0"),
)
def downgrade() -> None:
op.drop_column("search_spaces", "vision_llm_id")

View file

@ -1329,6 +1329,9 @@ class SearchSpace(BaseModel, TimestampMixin):
image_generation_config_id = Column(
Integer, nullable=True, default=0
) # For image generation, defaults to Auto mode
vision_llm_id = Column(
Integer, nullable=True, default=0
) # For vision/screenshot analysis, defaults to Auto mode
user_id = Column(
UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False

View file

@ -522,14 +522,17 @@ async def get_llm_preferences(
image_generation_config = await _get_image_gen_config_by_id(
session, search_space.image_generation_config_id
)
vision_llm = await _get_llm_config_by_id(session, search_space.vision_llm_id)
return LLMPreferencesRead(
agent_llm_id=search_space.agent_llm_id,
document_summary_llm_id=search_space.document_summary_llm_id,
image_generation_config_id=search_space.image_generation_config_id,
vision_llm_id=search_space.vision_llm_id,
agent_llm=agent_llm,
document_summary_llm=document_summary_llm,
image_generation_config=image_generation_config,
vision_llm=vision_llm,
)
except HTTPException:
@ -589,14 +592,17 @@ async def update_llm_preferences(
image_generation_config = await _get_image_gen_config_by_id(
session, search_space.image_generation_config_id
)
vision_llm = await _get_llm_config_by_id(session, search_space.vision_llm_id)
return LLMPreferencesRead(
agent_llm_id=search_space.agent_llm_id,
document_summary_llm_id=search_space.document_summary_llm_id,
image_generation_config_id=search_space.image_generation_config_id,
vision_llm_id=search_space.vision_llm_id,
agent_llm=agent_llm,
document_summary_llm=document_summary_llm,
image_generation_config=image_generation_config,
vision_llm=vision_llm,
)
except HTTPException:

View file

@ -182,6 +182,9 @@ class LLMPreferencesRead(BaseModel):
image_generation_config_id: int | None = Field(
None, description="ID of the image generation config to use"
)
vision_llm_id: int | None = Field(
None, description="ID of the LLM config to use for vision/screenshot analysis"
)
agent_llm: dict[str, Any] | None = Field(
None, description="Full config for agent LLM"
)
@ -191,6 +194,9 @@ class LLMPreferencesRead(BaseModel):
image_generation_config: dict[str, Any] | None = Field(
None, description="Full config for image generation"
)
vision_llm: dict[str, Any] | None = Field(
None, description="Full config for vision LLM"
)
model_config = ConfigDict(from_attributes=True)
@ -207,3 +213,6 @@ class LLMPreferencesUpdate(BaseModel):
image_generation_config_id: int | None = Field(
None, description="ID of the image generation config to use"
)
vision_llm_id: int | None = Field(
None, description="ID of the LLM config to use for vision/screenshot analysis"
)

View file

@ -32,6 +32,7 @@ logger = logging.getLogger(__name__)
class LLMRole:
AGENT = "agent" # For agent/chat operations
DOCUMENT_SUMMARY = "document_summary" # For document summarization
VISION = "vision" # For vision/screenshot analysis
def get_global_llm_config(llm_config_id: int) -> dict | None:
@ -187,7 +188,7 @@ async def get_search_space_llm_instance(
Args:
session: Database session
search_space_id: Search Space ID
role: LLM role ('agent' or 'document_summary')
role: LLM role ('agent', 'document_summary', or 'vision')
Returns:
ChatLiteLLM or ChatLiteLLMRouter instance, or None if not found
@ -209,6 +210,8 @@ async def get_search_space_llm_instance(
llm_config_id = search_space.agent_llm_id
elif role == LLMRole.DOCUMENT_SUMMARY:
llm_config_id = search_space.document_summary_llm_id
elif role == LLMRole.VISION:
llm_config_id = search_space.vision_llm_id
else:
logger.error(f"Invalid LLM role: {role}")
return None
@ -405,6 +408,13 @@ async def get_document_summary_llm(
)
async def get_vision_llm(
session: AsyncSession, search_space_id: int
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
"""Get the search space's vision LLM instance for screenshot analysis."""
return await get_search_space_llm_instance(session, search_space_id, LLMRole.VISION)
# Backward-compatible alias (LLM preferences are now per-search-space, not per-user)
async def get_user_long_context_llm(
session: AsyncSession,

View file

@ -6,6 +6,7 @@ import {
Bot,
CheckCircle,
CircleDashed,
Eye,
FileText,
ImageIcon,
RefreshCw,
@ -71,6 +72,15 @@ const ROLE_DESCRIPTIONS = {
prefKey: "image_generation_config_id" as const,
configType: "image" as const,
},
vision: {
icon: Eye,
title: "Vision LLM",
description: "Vision-capable model for screenshot analysis and context extraction",
color: "text-amber-600 dark:text-amber-400",
bgColor: "bg-amber-500/10",
prefKey: "vision_llm_id" as const,
configType: "llm" as const,
},
};
interface LLMRoleManagerProps {
@ -116,6 +126,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
agent_llm_id: preferences.agent_llm_id ?? "",
document_summary_llm_id: preferences.document_summary_llm_id ?? "",
image_generation_config_id: preferences.image_generation_config_id ?? "",
vision_llm_id: preferences.vision_llm_id ?? "",
}));
const [hasChanges, setHasChanges] = useState(false);
@ -126,6 +137,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
agent_llm_id: preferences.agent_llm_id ?? "",
document_summary_llm_id: preferences.document_summary_llm_id ?? "",
image_generation_config_id: preferences.image_generation_config_id ?? "",
vision_llm_id: preferences.vision_llm_id ?? "",
};
setAssignments(newAssignments);
setHasChanges(false);
@ -133,6 +145,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
preferences?.agent_llm_id,
preferences?.document_summary_llm_id,
preferences?.image_generation_config_id,
preferences?.vision_llm_id,
]);
const handleRoleAssignment = (prefKey: string, configId: string) => {
@ -147,6 +160,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
agent_llm_id: preferences.agent_llm_id ?? "",
document_summary_llm_id: preferences.document_summary_llm_id ?? "",
image_generation_config_id: preferences.image_generation_config_id ?? "",
vision_llm_id: preferences.vision_llm_id ?? "",
};
const hasChangesNow = Object.keys(newAssignments).some(
@ -168,6 +182,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
agent_llm_id: toNumericOrUndefined(assignments.agent_llm_id),
document_summary_llm_id: toNumericOrUndefined(assignments.document_summary_llm_id),
image_generation_config_id: toNumericOrUndefined(assignments.image_generation_config_id),
vision_llm_id: toNumericOrUndefined(assignments.vision_llm_id),
};
await updatePreferences({
@ -186,6 +201,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
agent_llm_id: preferences.agent_llm_id ?? "",
document_summary_llm_id: preferences.document_summary_llm_id ?? "",
image_generation_config_id: preferences.image_generation_config_id ?? "",
vision_llm_id: preferences.vision_llm_id ?? "",
});
setHasChanges(false);
};
@ -199,7 +215,10 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
assignments.document_summary_llm_id !== undefined &&
assignments.image_generation_config_id !== "" &&
assignments.image_generation_config_id !== null &&
assignments.image_generation_config_id !== undefined;
assignments.image_generation_config_id !== undefined &&
assignments.vision_llm_id !== "" &&
assignments.vision_llm_id !== null &&
assignments.vision_llm_id !== undefined;
// Combine global and custom LLM configs
const allLLMConfigs = [

View file

@ -264,9 +264,11 @@ export const llmPreferences = z.object({
agent_llm_id: z.union([z.number(), z.null()]).optional(),
document_summary_llm_id: z.union([z.number(), z.null()]).optional(),
image_generation_config_id: z.union([z.number(), z.null()]).optional(),
vision_llm_id: z.union([z.number(), z.null()]).optional(),
agent_llm: z.union([z.record(z.string(), z.unknown()), z.null()]).optional(),
document_summary_llm: z.union([z.record(z.string(), z.unknown()), z.null()]).optional(),
image_generation_config: z.union([z.record(z.string(), z.unknown()), z.null()]).optional(),
vision_llm: z.union([z.record(z.string(), z.unknown()), z.null()]).optional(),
});
/**
@ -287,6 +289,7 @@ export const updateLLMPreferencesRequest = z.object({
agent_llm_id: true,
document_summary_llm_id: true,
image_generation_config_id: true,
vision_llm_id: true,
}),
});