feat: add voice selector for Dograh model configs

2026-06-25 08:48:13 +02:00 · 2026-06-23 18:33:04 +05:30 · 2026-06-23 18:33:04 +05:30 · 0956157029
commit 0956157029
parent 40e34994fd
10 changed files with 694 additions and 86 deletions
--- a/api/routes/user.py
+++ b/api/routes/user.py
@ -388,9 +388,18 @@ class VoiceInfo(BaseModel):
    preview_url: Optional[str] = None


+class VoiceFacets(BaseModel):
+    """Distinct selector values across a provider's full voice catalog.
+
+    Returned as the optional ``facets`` field of ``VoicesResponse``. Every
+    list defaults to empty, so a payload that omits a key still validates.
+    """
+
+    genders: List[str] = []
+    accents: List[str] = []
+    languages: List[str] = []
+
+
 class VoicesResponse(BaseModel):
    provider: str
    voices: List[VoiceInfo]
+    facets: Optional[VoiceFacets] = None


@router.get("/configurations/voices/{provider}")
@ -398,6 +407,9 @@ async def get_voices(
    provider: TTSProvider,
    model: Optional[str] = None,
    language: Optional[str] = None,
+    q: Optional[str] = None,
+    gender: Optional[str] = None,
+    accent: Optional[str] = None,
    user: UserModel = Depends(get_user),
 ) -> VoicesResponse:
    """Get available voices for a TTS provider."""
@ -406,12 +418,16 @@ async def get_voices(
            provider=provider,
            model=model,
            language=language,
+            q=q,
+            gender=gender,
+            accent=accent,
            organization_id=user.selected_organization_id,
            created_by=user.provider_id,
        )
        return VoicesResponse(
            provider=result.get("provider", provider),
            voices=[VoiceInfo(**voice) for voice in result.get("voices", [])],
+            facets=result.get("facets"),
        )
    except Exception as e:
        logger.error(f"Failed to fetch voices for {provider}: {e}")
--- a/api/services/mps_service_key_client.py
+++ b/api/services/mps_service_key_client.py
@ -720,6 +720,9 @@ class MPSServiceKeyClient:
        provider: str,
        model: Optional[str] = None,
        language: Optional[str] = None,
+        q: Optional[str] = None,
+        gender: Optional[str] = None,
+        accent: Optional[str] = None,
        organization_id: Optional[int] = None,
        created_by: Optional[str] = None,
    ) -> dict:
@ -745,6 +748,12 @@ class MPSServiceKeyClient:
                params["model"] = model
            if language:
                params["language"] = language
+            if q:
+                params["q"] = q
+            if gender:
+                params["gender"] = gender
+            if accent:
+                params["accent"] = accent
            response = await client.get(
                f"{self.base_url}/api/v1/voice-proxy/{provider}/voices",
                headers=self._get_headers(organization_id, created_by),
--- a/docs/api-reference/openapi.json
+++ b/docs/api-reference/openapi.json
--- a/sdk/python/src/dograh_sdk/_generated_models.py
+++ b/sdk/python/src/dograh_sdk/_generated_models.py
@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
-#   filename:  dograh-openapi-XXXXXX.json.6F33jkClt9
-#   timestamp: 2026-06-19T12:41:10+00:00
+#   filename:  dograh-openapi-XXXXXX.json.rRr9IUrKFk
+#   timestamp: 2026-06-23T13:02:10+00:00

 from __future__ import annotations

--- a/ui/src/client/index.ts
+++ b/ui/src/client/index.ts
--- a/ui/src/client/types.gen.ts
+++ b/ui/src/client/types.gen.ts
@ -6294,6 +6294,26 @@ export type VobizConfigurationResponse = {
    from_numbers: Array<string>;
 };

+/**
+ * VoiceFacets
+ *
+ * Distinct selector values across a provider's full voice catalog.
+ */
+export type VoiceFacets = {
+    /**
+     * Genders
+     */
+    genders?: Array<string>;
+    /**
+     * Accents
+     */
+    accents?: Array<string>;
+    /**
+     * Languages
+     */
+    languages?: Array<string>;
+};
+
 /**
 * VoiceInfo
 */
@ -6340,6 +6360,7 @@ export type VoicesResponse = {
     * Voices
     */
    voices: Array<VoiceInfo>;
+    facets?: VoiceFacets | null;
 };

 /**
@ -9208,6 +9229,18 @@ export type GetVoicesApiV1UserConfigurationsVoicesProviderGetData = {
         * Language
         */
        language?: string | null;
+        /**
+         * Q
+         */
+        q?: string | null;
+        /**
+         * Gender
+         */
+        gender?: string | null;
+        /**
+         * Accent
+         */
+        accent?: string | null;
    };
    url: '/api/v1/user/configurations/voices/{provider}';
 };
--- a/ui/src/components/AIModelConfigurationV2Editor.tsx
+++ b/ui/src/components/AIModelConfigurationV2Editor.tsx
@ -12,11 +12,11 @@ import {
 } from "@/components/ServiceConfigurationForm";
 import { Button } from "@/components/ui/button";
 import { Card, CardContent } from "@/components/ui/card";
-import { Checkbox } from "@/components/ui/checkbox";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
 import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
 import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
+import { VoiceSelectorModal } from "@/components/VoiceSelectorModal";
 import { LANGUAGE_DISPLAY_NAMES } from "@/constants/languages";

 type ModelMode = "realtime" | "dograh" | "byok";
@ -278,7 +278,6 @@ export function AIModelConfigurationV2Editor({
    const [realtimeInitialConfig, setRealtimeInitialConfig] = useState<Record<string, unknown> | null>(null);
    const [pipelineInitialConfig, setPipelineInitialConfig] = useState<Record<string, unknown> | null>(null);
    const [isSavingDograh, setIsSavingDograh] = useState(false);
-    const [isCustomVoice, setIsCustomVoice] = useState(false);
    const [error, setError] = useState<string | null>(null);

    const allowCustomVoice = defaults.dograh.allow_custom_input ?? false;
@ -290,7 +289,6 @@ export function AIModelConfigurationV2Editor({
        setMode(preferredMode(rawConfiguration, rawEffectiveConfiguration));
        const nextDograh = buildDograhState(defaults, rawConfiguration, rawEffectiveConfiguration);
        setDograh(nextDograh);
-        setIsCustomVoice(allowCustomVoice && !defaults.dograh.voices.includes(nextDograh.voice));
        setRealtimeInitialConfig(getByokInitialConfig(rawConfiguration, rawEffectiveConfiguration, true));
        setPipelineInitialConfig(getByokInitialConfig(rawConfiguration, rawEffectiveConfiguration, false));
    }, [configuration, defaults, effectiveConfiguration, allowCustomVoice]);
@ -390,46 +388,30 @@ export function AIModelConfigurationV2Editor({
                    <Card>
                        <CardContent className="pt-6">
                            <div className="grid gap-4 sm:grid-cols-2">
-                                <div className="space-y-2">
+                                <div className="space-y-2 sm:col-span-2">
                                    <Label>Voice</Label>
-                                    {isCustomVoice ? (
-                                        <Input
-                                            placeholder="Enter voice"
-                                            value={dograh.voice}
-                                            onChange={(event) => setDograh({ ...dograh, voice: event.target.value })}
-                                        />
-                                    ) : (
-                                        <Select value={dograh.voice} onValueChange={(voice) => setDograh({ ...dograh, voice })}>
-                                            <SelectTrigger className="w-full">
-                                                <SelectValue placeholder="Select voice" />
-                                            </SelectTrigger>
-                                            <SelectContent>
-                                                {defaults.dograh.voices.map((voice) => (
-                                                    <SelectItem key={voice} value={voice}>
-                                                        {voice}
-                                                    </SelectItem>
-                                                ))}
-                                            </SelectContent>
-                                        </Select>
-                                    )}
-                                    {allowCustomVoice && (
-                                        <div className="flex items-center space-x-2">
-                                            <Checkbox
-                                                id="dograh-custom-voice"
-                                                checked={isCustomVoice}
-                                                onCheckedChange={(checked) => {
-                                                    const custom = checked as boolean;
-                                                    setIsCustomVoice(custom);
-                                                    if (!custom) {
-                                                        setDograh({ ...dograh, voice: defaults.dograh.defaults.voice });
-                                                    }
-                                                }}
-                                            />
-                                            <Label htmlFor="dograh-custom-voice" className="text-sm font-normal cursor-pointer">
-                                                Enter Custom Value
-                                            </Label>
-                                        </div>
-                                    )}
+                                    <VoiceSelectorModal
+                                        provider="dograh"
+                                        value={dograh.voice}
+                                        onChange={(voice) => setDograh({ ...dograh, voice })}
+                                        allowManualInput={allowCustomVoice}
+                                    />
+                                </div>
+
+                                <div className="space-y-2 sm:col-span-2">
+                                    <Label>Language</Label>
+                                    <Select value={dograh.language} onValueChange={(language) => setDograh({ ...dograh, language })}>
+                                        <SelectTrigger className="w-full">
+                                            <SelectValue placeholder="Select language" />
+                                        </SelectTrigger>
+                                        <SelectContent>
+                                            {defaults.dograh.languages.map((language) => (
+                                                <SelectItem key={language} value={language}>
+                                                    {LANGUAGE_DISPLAY_NAMES[language] || language}
+                                                </SelectItem>
+                                            ))}
+                                        </SelectContent>
+                                    </Select>
                                </div>

                                <div className="space-y-2">
@ -451,23 +433,7 @@ export function AIModelConfigurationV2Editor({
                                    />
                                </div>

-                                <div className="space-y-2 sm:col-span-2">
-                                    <Label>Language</Label>
-                                    <Select value={dograh.language} onValueChange={(language) => setDograh({ ...dograh, language })}>
-                                        <SelectTrigger className="w-full">
-                                            <SelectValue placeholder="Select language" />
-                                        </SelectTrigger>
-                                        <SelectContent>
-                                            {defaults.dograh.languages.map((language) => (
-                                                <SelectItem key={language} value={language}>
-                                                    {LANGUAGE_DISPLAY_NAMES[language] || language}
-                                                </SelectItem>
-                                            ))}
-                                        </SelectContent>
-                                    </Select>
-                                </div>
-
-                                <div className="space-y-2 sm:col-span-2">
+                                <div className="space-y-2">
                                    <Label htmlFor="dograh-api-key">API Key</Label>
                                    <div className="relative">
                                        <KeyRound className="pointer-events-none absolute left-3 top-1/2 h-4 w-4 -translate-y-1/2 text-muted-foreground" />
--- a/ui/src/components/VoiceSelector.tsx
+++ b/ui/src/components/VoiceSelector.tsx
@ -10,11 +10,13 @@ import { Checkbox } from "@/components/ui/checkbox";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
 import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
+import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
 import { cn } from "@/lib/utils";

 // Providers that have MPS voice endpoints
 type TTSProviderWithVoices = "elevenlabs" | "deepgram" | "sarvam" | "cartesia" | "dograh" | "rime";
 const MPS_VOICE_PROVIDERS: TTSProviderWithVoices[] = ["elevenlabs", "deepgram", "sarvam", "cartesia", "dograh", "rime"];
+const ALL_FILTER_VALUE = "__all__";

 interface VoiceSelectorProps {
    provider: string;
@ -22,6 +24,8 @@ interface VoiceSelectorProps {
    onChange: (voiceId: string) => void;
    model?: string;
    language?: string;
+    showFilters?: boolean;
+    allowManualInput?: boolean;
    className?: string;
 }

@ -31,10 +35,15 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
    onChange,
    model,
    language,
+    showFilters = false,
+    allowManualInput = true,
    className,
 }) => {
    const [isOpen, setIsOpen] = useState(false);
    const [searchTerm, setSearchTerm] = useState("");
+    const [genderFilter, setGenderFilter] = useState(ALL_FILTER_VALUE);
+    const [languageFilter, setLanguageFilter] = useState(ALL_FILTER_VALUE);
+    const [accentFilter, setAccentFilter] = useState(ALL_FILTER_VALUE);
    const [isManualInput, setIsManualInput] = useState(false);
    const [manualVoiceId, setManualVoiceId] = useState(value || "");
    const [voices, setVoices] = useState<VoiceInfo[]>([]);
@ -102,13 +111,15 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
    useEffect(() => {
        if (value && voices.length > 0) {
            const voiceExists = voices.some((v) => v.voice_id === value);
-            if (!voiceExists) {
+            if (!voiceExists && allowManualInput) {
                // If the value doesn't exist in the list, switch to manual input mode
                setIsManualInput(true);
                setManualVoiceId(value);
+            } else if (voiceExists) {
+                setIsManualInput(false);
            }
        }
-    }, [value, voices]);
+    }, [value, voices, allowManualInput]);

    // Cleanup audio on unmount or when popover closes
    useEffect(() => {
@ -131,7 +142,7 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({

    const filteredVoices = voices.filter((voice) => {
        const searchLower = searchTerm.toLowerCase();
-        return (
+        const matchesSearch = (
            voice.name.toLowerCase().includes(searchLower) ||
            voice.voice_id.toLowerCase().includes(searchLower) ||
            (voice.description?.toLowerCase() || "").includes(searchLower) ||
@ -139,8 +150,23 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
            (voice.gender?.toLowerCase() || "").includes(searchLower) ||
            (voice.language?.toLowerCase() || "").includes(searchLower)
        );
+        if (!matchesSearch) return false;
+        if (genderFilter !== ALL_FILTER_VALUE && (voice.gender || "").toLowerCase() !== genderFilter) return false;
+        if (languageFilter !== ALL_FILTER_VALUE && (voice.language || "").toLowerCase() !== languageFilter) return false;
+        if (accentFilter !== ALL_FILTER_VALUE && (voice.accent || "").toLowerCase() !== accentFilter) return false;
+        return true;
    });

+    const genderOptions = Array.from(
+        new Set(voices.map((voice) => voice.gender?.toLowerCase()).filter(Boolean) as string[]),
+    ).sort();
+    const languageOptions = Array.from(
+        new Set(voices.map((voice) => voice.language?.toLowerCase()).filter(Boolean) as string[]),
+    ).sort();
+    const accentOptions = Array.from(
+        new Set(voices.map((voice) => voice.accent?.toLowerCase()).filter(Boolean) as string[]),
+    ).sort();
+
    const handleSelectVoice = (voiceId: string) => {
        onChange(voiceId);
        setIsOpen(false);
@ -148,6 +174,7 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
    };

    const handleManualInputToggle = (checked: boolean) => {
+        if (!allowManualInput) return;
        setIsManualInput(checked);
        if (checked) {
            setManualVoiceId(value || "");
@ -219,7 +246,7 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
        );
    }

-    if (isManualInput) {
+    if (isManualInput && allowManualInput) {
        return (
            <div className={cn("space-y-2", className)}>
                <Input
@ -281,6 +308,52 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
                            />
                        </div>

+                        {showFilters && (
+                            <div className="grid gap-2 sm:grid-cols-3">
+                                <Select value={genderFilter} onValueChange={setGenderFilter}>
+                                    <SelectTrigger className="h-8">
+                                        <SelectValue placeholder="Gender" />
+                                    </SelectTrigger>
+                                    <SelectContent>
+                                        <SelectItem value={ALL_FILTER_VALUE}>All genders</SelectItem>
+                                        {genderOptions.map((gender) => (
+                                            <SelectItem key={gender} value={gender} className="capitalize">
+                                                {gender}
+                                            </SelectItem>
+                                        ))}
+                                    </SelectContent>
+                                </Select>
+
+                                <Select value={languageFilter} onValueChange={setLanguageFilter}>
+                                    <SelectTrigger className="h-8">
+                                        <SelectValue placeholder="Language" />
+                                    </SelectTrigger>
+                                    <SelectContent>
+                                        <SelectItem value={ALL_FILTER_VALUE}>All languages</SelectItem>
+                                        {languageOptions.map((voiceLanguage) => (
+                                            <SelectItem key={voiceLanguage} value={voiceLanguage} className="uppercase">
+                                                {voiceLanguage}
+                                            </SelectItem>
+                                        ))}
+                                    </SelectContent>
+                                </Select>
+
+                                <Select value={accentFilter} onValueChange={setAccentFilter}>
+                                    <SelectTrigger className="h-8">
+                                        <SelectValue placeholder="Accent" />
+                                    </SelectTrigger>
+                                    <SelectContent>
+                                        <SelectItem value={ALL_FILTER_VALUE}>All accents</SelectItem>
+                                        {accentOptions.map((accent) => (
+                                            <SelectItem key={accent} value={accent} className="uppercase">
+                                                {accent}
+                                            </SelectItem>
+                                        ))}
+                                    </SelectContent>
+                                </Select>
+                            </div>
+                        )}
+
                        <div className="max-h-[300px] overflow-auto space-y-1">
                            {error ? (
                                <p className="text-sm text-red-500 text-center py-4">
@ -358,26 +431,30 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
                        </div>

                        <div className="pt-2 border-t flex items-center justify-between">
-                            <div className="flex items-center space-x-2">
-                                <Checkbox
-                                    id="manual-voice-input-popup"
-                                    checked={isManualInput}
-                                    onCheckedChange={(checked) => {
-                                        handleManualInputToggle(checked as boolean);
-                                        if (checked) {
-                                            setIsOpen(false);
-                                        }
-                                    }}
-                                />
-                                <Label
-                                    htmlFor="manual-voice-input-popup"
-                                    className="text-sm font-normal cursor-pointer"
-                                >
-                                    Add Voice ID Manually
-                                </Label>
-                            </div>
+                            {allowManualInput ? (
+                                <div className="flex items-center space-x-2">
+                                    <Checkbox
+                                        id="manual-voice-input-popup"
+                                        checked={isManualInput}
+                                        onCheckedChange={(checked) => {
+                                            handleManualInputToggle(checked as boolean);
+                                            if (checked) {
+                                                setIsOpen(false);
+                                            }
+                                        }}
+                                    />
+                                    <Label
+                                        htmlFor="manual-voice-input-popup"
+                                        className="text-sm font-normal cursor-pointer"
+                                    >
+                                        Add Voice ID Manually
+                                    </Label>
+                                </div>
+                            ) : (
+                                <span />
+                            )}
                            <p className="text-xs text-muted-foreground">
-                                {voices.length} voices available
+                                {filteredVoices.length} of {voices.length} voices
                            </p>
                        </div>
                    </div>
--- a/ui/src/components/VoiceSelectorModal.tsx
+++ b/ui/src/components/VoiceSelectorModal.tsx
@ -0,0 +1,451 @@
+"use client";
+
+import { Check, ChevronDown, Loader2, Pencil, Play, Square } from "lucide-react";
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+
+import { getVoicesApiV1UserConfigurationsVoicesProviderGet } from "@/client/sdk.gen";
+import { VoiceInfo } from "@/client/types.gen";
+import { Button } from "@/components/ui/button";
+import {
+    Dialog,
+    DialogContent,
+    DialogHeader,
+    DialogTitle,
+} from "@/components/ui/dialog";
+import { Input } from "@/components/ui/input";
+import { Label } from "@/components/ui/label";
+import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
+import { ACCENT_DISPLAY_NAMES } from "@/constants/accents";
+import { LANGUAGE_DISPLAY_NAMES } from "@/constants/languages";
+import { cn } from "@/lib/utils";
+
+const ALL_FILTER_VALUE = "__all__"; // Sentinel for "no filter"; a filter holding it is left out of the query.
+
+// Defaults so the modal opens on a focused set instead of the full catalog.
+const DEFAULT_GENDER = "female";
+const DEFAULT_ACCENT = "us"; // American
+const DEFAULT_LANGUAGE = "en";
+
+const SEARCH_DEBOUNCE_MS = 300; // Delay between the last keystroke and the search value being applied.
+
+interface Facets {
+    genders: string[]; // Source codes for genderOptions.
+    accents: string[]; // Source codes for accentOptions.
+    languages: string[]; // Source codes for languageOptions.
+}
+
+const EMPTY_FACETS: Facets = { genders: [], accents: [], languages: [] }; // Initial state until a response carries facets.
+
+interface VoiceSelectorModalProps {
+    provider: string; // Sent as the `provider` path parameter of the voices request.
+    value: string; // Currently committed voice ID; matched against `voice_id` to label the trigger.
+    onChange: (voiceId: string) => void; // Called with the chosen (or manually typed) voice ID on commit.
+    /** Optional model passed through to the voice catalog query. */
+    model?: string;
+    /** Allow typing a raw voice ID for voices outside the catalog. */
+    allowManualInput?: boolean;
+    className?: string; // Merged into the wrapper element's class list.
+}
+
+const capitalize = (value: string) => value.charAt(0).toUpperCase() + value.slice(1); // Uppercases the first character only; the rest is unchanged.
+
+const accentLabel = (code?: string | null) =>
+    code ? ACCENT_DISPLAY_NAMES[code.toLowerCase()] || capitalize(code) : ""; // Lookup by lowercased code; falls back to the capitalized raw code, "" when absent.
+const languageLabel = (code?: string | null) =>
+    code ? LANGUAGE_DISPLAY_NAMES[code] || code.toUpperCase() : ""; // Lookup by the code as given, else the upper-cased code. NOTE(review): key is not lowercased, unlike accentLabel; confirm casing.
+const genderLabel = (gender?: string | null) => (gender ? capitalize(gender) : ""); // Capitalized gender, "" when absent.
+
+/**
+ * Build the "Accent · Gender · Language" trait line shown under a voice name.
+ *
+ * Each trait is converted with its label helper; traits whose label is empty
+ * (a missing accent, gender, or language) are dropped before joining, so the
+ * result may hold fewer than three parts or be an empty string.
+ */
+function voiceTraits(voice: VoiceInfo): string {
+    return [accentLabel(voice.accent), genderLabel(voice.gender), languageLabel(voice.language)]
+        .filter(Boolean)
+        .join(" · ");
+}
+
+/**
+ * Ensure the active filter value is always an option so the Select can render it.
+ *
+ * @param options - Option values to offer (for example, facet codes).
+ * @param selected - The currently active filter value.
+ * @returns `options` itself when `selected` is the "all" sentinel or is already
+ *   listed; otherwise a new array with `selected` prepended. The input array is
+ *   never mutated.
+ */
+function withSelected(options: string[], selected: string): string[] {
+    if (selected === ALL_FILTER_VALUE || options.includes(selected)) return options;
+    return [selected, ...options];
+}
+
+export const VoiceSelectorModal: React.FC<VoiceSelectorModalProps> = ({
+    provider,
+    value,
+    onChange,
+    model,
+    allowManualInput = false,
+    className,
+}) => {
+    const [isOpen, setIsOpen] = useState(false);
+    const [voices, setVoices] = useState<VoiceInfo[]>([]);
+    const [facets, setFacets] = useState<Facets>(EMPTY_FACETS);
+    const [isLoading, setIsLoading] = useState(false);
+    const [error, setError] = useState<string | null>(null);
+
+    // Filters are sent as server-side query params; the focused defaults keep the initial fetch off the full catalog.
+    const [gender, setGender] = useState(DEFAULT_GENDER);
+    const [accent, setAccent] = useState(DEFAULT_ACCENT);
+    const [language, setLanguage] = useState(DEFAULT_LANGUAGE);
+    const [searchInput, setSearchInput] = useState("");
+    const [debouncedSearch, setDebouncedSearch] = useState("");
+
+    // Pending (in-modal) selection; only committed via "Use this voice".
+    const [pendingVoiceId, setPendingVoiceId] = useState(value);
+    const [selectedVoiceInfo, setSelectedVoiceInfo] = useState<VoiceInfo | null>(null);
+    const [manualMode, setManualMode] = useState(false);
+    const [manualVoiceId, setManualVoiceId] = useState("");
+
+    // Preview playback.
+    const [playingVoiceId, setPlayingVoiceId] = useState<string | null>(null);
+    const audioRef = useRef<HTMLAudioElement | null>(null);
+    const requestId = useRef(0);
+
+    const stopPreview = useCallback(() => {
+        if (audioRef.current) {
+            audioRef.current.pause();
+            audioRef.current = null;
+        }
+        setPlayingVoiceId(null);
+    }, []);
+
+    // Debounce the search box so typing doesn't fire a request per keystroke.
+    useEffect(() => {
+        const timer = setTimeout(() => setDebouncedSearch(searchInput), SEARCH_DEBOUNCE_MS);
+        return () => clearTimeout(timer);
+    }, [searchInput]);
+
+    // Resolve the currently-selected voice (for the trigger label) without
+    // pulling the catalog: search with q=<voice ID>, then keep the exact voice_id match.
+    useEffect(() => {
+        if (!value) {
+            setSelectedVoiceInfo(null);
+            return;
+        }
+        let active = true;
+        (async () => {
+            const response = await getVoicesApiV1UserConfigurationsVoicesProviderGet({
+                path: { provider: provider as never },
+                query: { q: value },
+            });
+            if (!active) return;
+            const found = response.data?.voices?.find((voice) => voice.voice_id === value) ?? null;
+            setSelectedVoiceInfo(found);
+        })();
+        return () => {
+            active = false;
+        };
+    }, [value, provider]);
+
+    // Fetch the filtered voice list (server-side) whenever the modal is open
+    // and a filter changes. A request counter discards out-of-order responses.
+    useEffect(() => {
+        if (!isOpen || manualMode) return;
+        const id = ++requestId.current;
+        setIsLoading(true);
+        setError(null);
+        (async () => {
+            const query: Record<string, string> = {};
+            if (model) query.model = model;
+            if (gender !== ALL_FILTER_VALUE) query.gender = gender;
+            if (accent !== ALL_FILTER_VALUE) query.accent = accent;
+            if (language !== ALL_FILTER_VALUE) query.language = language;
+            const search = debouncedSearch.trim();
+            if (search) query.q = search;
+
+            const response = await getVoicesApiV1UserConfigurationsVoicesProviderGet({
+                path: { provider: provider as never },
+                query,
+            });
+            if (id !== requestId.current) return; // a newer request superseded this one
+
+            if (response.error) {
+                setError("Failed to load voices");
+                setVoices([]);
+            } else {
+                setVoices(response.data?.voices ?? []);
+                if (response.data?.facets) {
+                    setFacets({
+                        genders: response.data.facets.genders ?? [],
+                        accents: response.data.facets.accents ?? [],
+                        languages: response.data.facets.languages ?? [],
+                    });
+                }
+            }
+            setIsLoading(false);
+        })();
+    }, [isOpen, manualMode, provider, model, gender, accent, language, debouncedSearch]);
+
+    // Stop any preview when the modal closes / unmounts.
+    useEffect(() => {
+        if (!isOpen) stopPreview();
+        return () => stopPreview();
+    }, [isOpen, stopPreview]);
+
+    // Facets arrive sorted by raw code; present them sorted by display label so
+    // the dropdowns read alphabetically (e.g. "American" near the top, not "us").
+    const toSortedOptions = (codes: string[], selected: string, label: (code: string) => string) =>
+        withSelected(codes, selected)
+            .map((code) => ({ value: code, label: label(code) }))
+            .sort((a, b) => a.label.localeCompare(b.label));
+
+    const genderOptions = useMemo(
+        () => toSortedOptions(facets.genders, gender, genderLabel),
+        [facets.genders, gender],
+    );
+    const accentOptions = useMemo(
+        () => toSortedOptions(facets.accents, accent, accentLabel),
+        [facets.accents, accent],
+    );
+    const languageOptions = useMemo(
+        () => toSortedOptions(facets.languages, language, languageLabel),
+        [facets.languages, language],
+    );
+
+    // Opens the picker in catalog mode with a clean slate: filters and search
+    // return to their defaults, and both the pending catalog choice and the
+    // manual input are seeded from the currently committed `value`.
+    const openModal = () => {
+        // Seed both selection paths from the committed value.
+        setPendingVoiceId(value);
+        setManualVoiceId(value);
+        setManualMode(false);
+
+        // Clear the search box together with its debounced copy; the voice
+        // fetch is keyed on the debounced value, not on the raw input.
+        setSearchInput("");
+        setDebouncedSearch("");
+
+        // Facet filters back to their defaults.
+        setLanguage(DEFAULT_LANGUAGE);
+        setAccent(DEFAULT_ACCENT);
+        setGender(DEFAULT_GENDER);
+
+        setIsOpen(true);
+    };
+
+    // Toggles the audio preview for `voice`. Whatever is currently playing is
+    // always stopped first; a new clip starts only when the request was not a
+    // "stop" for the voice already playing and the voice has a preview URL.
+    const playPreview = (voice: VoiceInfo) => {
+        const wasPlayingThisVoice = playingVoiceId === voice.voice_id;
+        stopPreview();
+        if (wasPlayingThisVoice || !voice.preview_url) return;
+
+        const player = new Audio(voice.preview_url);
+
+        // Shared teardown for "ended", "error" and a rejected play() promise.
+        // Both checks guard against clobbering a newer preview that may have
+        // replaced this one in the meantime.
+        const release = () => {
+            if (audioRef.current === player) audioRef.current = null;
+            setPlayingVoiceId((active) => (active === voice.voice_id ? null : active));
+        };
+        player.onended = release;
+        player.onerror = release;
+
+        audioRef.current = player;
+        setPlayingVoiceId(voice.voice_id);
+        player.play().catch(release);
+    };
+
+    // Applies the dialog's choice and closes it. In manual mode the trimmed
+    // custom ID is reported (if non-empty); otherwise the pending catalog voice
+    // is reported and, when it is present in the currently loaded list, cached
+    // as the selected voice's metadata.
+    //
+    // NOTE(review): the custom-ID path leaves `selectedVoiceInfo` untouched while
+    // `triggerLabel` prefers `selectedVoiceInfo.name` over `value`. Confirm that
+    // something else resets it, otherwise the trigger can keep showing the
+    // previously chosen catalog voice after a custom ID is committed.
+    const commitSelection = () => {
+        const customId = manualMode ? manualVoiceId.trim() : "";
+        const catalogId = manualMode ? "" : pendingVoiceId;
+
+        if (customId) {
+            onChange(customId);
+        } else if (catalogId) {
+            onChange(catalogId);
+            const match = voices.find(({ voice_id }) => voice_id === catalogId);
+            if (match) setSelectedVoiceInfo(match);
+        }
+
+        setIsOpen(false);
+    };
+
+    const triggerLabel = selectedVoiceInfo?.name || value || "Select a voice";
+    const triggerTraits = selectedVoiceInfo ? voiceTraits(selectedVoiceInfo) : "";
+
+    return (
+        <div className={cn("space-y-2", className)}>
+            <Button
+                type="button"
+                variant="outline"
+                className={cn("w-full justify-between", !value && "text-muted-foreground")}
+                onClick={openModal}
+            >
+                <span className="flex min-w-0 items-center gap-2">
+                    <span className="truncate font-medium">{triggerLabel}</span>
+                    {triggerTraits && (
+                        <span className="truncate text-xs text-muted-foreground">{triggerTraits}</span>
+                    )}
+                </span>
+                <ChevronDown className="ml-2 h-4 w-4 shrink-0 opacity-50" />
+            </Button>
+
+            <Dialog open={isOpen} onOpenChange={setIsOpen}>
+                <DialogContent className="flex max-h-[85vh] flex-col gap-0 overflow-hidden p-0 sm:max-w-3xl">
+                    <DialogHeader className="border-b px-6 py-4">
+                        <DialogTitle>Select Voice</DialogTitle>
+                    </DialogHeader>
+
+                    {/* Filter row: Gender · Accent · Language · Search */}
+                    <div className="flex flex-wrap items-center gap-2 border-b px-6 py-3">
+                        <Select value={gender} onValueChange={setGender} disabled={manualMode}>
+                            <SelectTrigger className="h-9 w-[130px]">
+                                <SelectValue placeholder="Gender" />
+                            </SelectTrigger>
+                            <SelectContent>
+                                <SelectItem value={ALL_FILTER_VALUE}>All genders</SelectItem>
+                                {genderOptions.map((option) => (
+                                    <SelectItem key={option.value} value={option.value}>
+                                        {option.label}
+                                    </SelectItem>
+                                ))}
+                            </SelectContent>
+                        </Select>
+
+                        <Select value={accent} onValueChange={setAccent} disabled={manualMode}>
+                            <SelectTrigger className="h-9 w-[140px]">
+                                <SelectValue placeholder="Accent" />
+                            </SelectTrigger>
+                            <SelectContent>
+                                <SelectItem value={ALL_FILTER_VALUE}>All accents</SelectItem>
+                                {accentOptions.map((option) => (
+                                    <SelectItem key={option.value} value={option.value}>
+                                        {option.label}
+                                    </SelectItem>
+                                ))}
+                            </SelectContent>
+                        </Select>
+
+                        <Select value={language} onValueChange={setLanguage} disabled={manualMode}>
+                            <SelectTrigger className="h-9 w-[150px]">
+                                <SelectValue placeholder="Language" />
+                            </SelectTrigger>
+                            <SelectContent>
+                                <SelectItem value={ALL_FILTER_VALUE}>All languages</SelectItem>
+                                {languageOptions.map((option) => (
+                                    <SelectItem key={option.value} value={option.value}>
+                                        {option.label}
+                                    </SelectItem>
+                                ))}
+                            </SelectContent>
+                        </Select>
+
+                        <Input
+                            placeholder="Search voices..."
+                            value={searchInput}
+                            onChange={(event) => setSearchInput(event.target.value)}
+                            className="h-9 min-w-[160px] flex-1"
+                            disabled={manualMode}
+                        />
+                    </div>
+
+                    {/* Body */}
+                    <div className="min-h-[260px] flex-1 overflow-auto px-6 py-4">
+                        {manualMode ? (
+                            <div className="space-y-2">
+                                <Label htmlFor="manual-voice-id">Custom voice ID</Label>
+                                <Input
+                                    id="manual-voice-id"
+                                    placeholder="Enter voice ID"
+                                    value={manualVoiceId}
+                                    onChange={(event) => setManualVoiceId(event.target.value)}
+                                    autoFocus
+                                />
+                                <p className="text-xs text-muted-foreground">
+                                    Use a voice ID that isn&apos;t in the catalog above.
+                                </p>
+                            </div>
+                        ) : error ? (
+                            <p className="py-10 text-center text-sm text-destructive">{error}</p>
+                        ) : isLoading ? (
+                            <div className="flex items-center justify-center py-10">
+                                <Loader2 className="h-6 w-6 animate-spin text-muted-foreground" />
+                            </div>
+                        ) : voices.length === 0 ? (
+                            <p className="py-10 text-center text-sm text-muted-foreground">
+                                No voices match these filters
+                            </p>
+                        ) : (
+                            <div className="grid gap-2 sm:grid-cols-2">
+                                {voices.map((voice) => {
+                                    const isSelected = pendingVoiceId === voice.voice_id;
+                                    const isPlaying = playingVoiceId === voice.voice_id;
+                                    return (
+                                        <div
+                                            key={voice.voice_id}
+                                            className={cn(
+                                                "relative flex items-center gap-3 rounded-lg border p-3 text-left transition-colors hover:bg-accent",
+                                                isSelected ? "border-primary ring-1 ring-primary" : "border-border",
+                                            )}
+                                        >
+                                            {/*
+                                              * Preview and select are sibling native buttons. A <button> may not
+                                              * contain other interactive content, and a role="button" child inside
+                                              * a button is not reliably exposed to assistive technology.
+                                              * `relative z-10` keeps this control above the select button's
+                                              * stretched hit area; `disabled` takes it out of the tab order and
+                                              * stops it announcing "Play preview" when there is nothing to play.
+                                              */}
+                                            <button
+                                                type="button"
+                                                disabled={!voice.preview_url}
+                                                aria-label={isPlaying ? "Stop preview" : "Play preview"}
+                                                onClick={() => playPreview(voice)}
+                                                className={cn(
+                                                    "relative z-10 flex h-10 w-10 shrink-0 items-center justify-center rounded-full",
+                                                    voice.preview_url
+                                                        ? "bg-primary/10 text-primary hover:bg-primary/20"
+                                                        : "bg-muted text-muted-foreground",
+                                                )}
+                                            >
+                                                {isPlaying ? (
+                                                    <Square className="h-4 w-4 fill-current" />
+                                                ) : (
+                                                    <Play className="h-4 w-4 fill-current" />
+                                                )}
+                                            </button>
+                                            {/*
+                                              * Select control. Its ::after pseudo-element is stretched over the
+                                              * whole card (the nearest positioned ancestor), so clicking anywhere
+                                              * on the card except the preview button still selects the voice.
+                                              * `aria-pressed` exposes the selected state, which was previously
+                                              * conveyed only by the check icon and border.
+                                              */}
+                                            <button
+                                                type="button"
+                                                aria-pressed={isSelected}
+                                                onClick={() => setPendingVoiceId(voice.voice_id)}
+                                                className="flex min-w-0 flex-1 flex-col text-left after:absolute after:inset-0 after:rounded-lg"
+                                            >
+                                                <span className="flex items-center gap-2">
+                                                    <span className="truncate text-sm font-medium">{voice.name}</span>
+                                                    {isSelected && <Check className="h-4 w-4 shrink-0 text-primary" />}
+                                                </span>
+                                                {voiceTraits(voice) && (
+                                                    <span className="truncate text-xs text-muted-foreground">
+                                                        {voiceTraits(voice)}
+                                                    </span>
+                                                )}
+                                                <span className="truncate text-[11px] text-muted-foreground/70">
+                                                    ID: {voice.voice_id}
+                                                </span>
+                                            </button>
+                                        </div>
+                                    );
+                                })}
+                            </div>
+                        )}
+                    </div>
+
+                    {/* Footer */}
+                    <div className="flex items-center justify-between gap-3 border-t px-6 py-3">
+                        {allowManualInput ? (
+                            <Button
+                                type="button"
+                                variant="ghost"
+                                size="sm"
+                                className="text-muted-foreground"
+                                onClick={() => setManualMode((prev) => !prev)}
+                            >
+                                <Pencil className="mr-2 h-4 w-4" />
+                                {manualMode ? "Browse catalog" : "Custom voice ID"}
+                            </Button>
+                        ) : (
+                            <span className="text-xs text-muted-foreground">
+                                {!manualMode && !isLoading && !error ? `${voices.length} voices` : ""}
+                            </span>
+                        )}
+                        <div className="flex items-center gap-2">
+                            <Button type="button" variant="outline" onClick={() => setIsOpen(false)}>
+                                Cancel
+                            </Button>
+                            <Button
+                                type="button"
+                                onClick={commitSelection}
+                                disabled={manualMode ? !manualVoiceId.trim() : !pendingVoiceId}
+                            >
+                                Use this voice
+                            </Button>
+                        </div>
+                    </div>
+                </DialogContent>
+            </Dialog>
+        </div>
+    );
+};
--- a/ui/src/constants/accents.ts
+++ b/ui/src/constants/accents.ts
@ -0,0 +1,56 @@
+/**
+ * Display names for accent codes returned by the voice catalog.
+ *
+ * The catalog derives accent from a voice's locale country (e.g. "en-US" -> "us"),
+ * so the stored/filter value is an ISO 3166-1 alpha-2 country code, written in
+ * lowercase in this table. These are the human-readable accent labels shown in
+ * the UI; the underlying code stays the filter value. Unknown codes fall back
+ * to a capitalized form at the call site.
+ *
+ * Because the table is typed `Record<string, string>`, indexing it with an
+ * unmapped code type-checks as `string` (unless `noUncheckedIndexedAccess` is
+ * enabled) but evaluates to `undefined` at runtime, so callers must always
+ * supply their own fallback.
+ */
+export const ACCENT_DISPLAY_NAMES: Record<string, string> = {
+    us: "American",
+    gb: "British",
+    au: "Australian",
+    ca: "Canadian",
+    ie: "Irish",
+    nz: "New Zealand",
+    za: "South African",
+    in: "Indian",
+    bd: "Bangladeshi",
+    sg: "Singaporean",
+    my: "Malaysian",
+    ph: "Filipino",
+    id: "Indonesian",
+    vn: "Vietnamese",
+    th: "Thai",
+    cn: "Chinese",
+    jp: "Japanese",
+    kr: "Korean",
+    fr: "French",
+    de: "German",
+    ch: "Swiss",
+    nl: "Dutch",
+    it: "Italian",
+    es: "Spanish",
+    mx: "Mexican",
+    co: "Colombian",
+    bo: "Bolivian",
+    br: "Brazilian",
+    pt: "Portuguese",
+    ru: "Russian",
+    ua: "Ukrainian",
+    pl: "Polish",
+    cz: "Czech",
+    sk: "Slovak",
+    hu: "Hungarian",
+    ro: "Romanian",
+    bg: "Bulgarian",
+    hr: "Croatian",
+    gr: "Greek",
+    ge: "Georgian",
+    md: "Moldovan",
+    se: "Swedish",
+    no: "Norwegian",
+    dk: "Danish",
+    fi: "Finnish",
+    tr: "Turkish",
+    il: "Israeli",
+    sa: "Saudi",
+};