mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-25 08:48:13 +02:00
feat: add voice selector for Dograh model configs
This commit is contained in:
parent
40e34994fd
commit
0956157029
10 changed files with 694 additions and 86 deletions
|
|
@ -388,9 +388,18 @@ class VoiceInfo(BaseModel):
|
|||
preview_url: Optional[str] = None
|
||||
|
||||
|
||||
# Filter options that accompany a voice list (see VoicesResponse.facets).
class VoiceFacets(BaseModel):
    """Distinct selector values across a provider's full voice catalog."""

    # Distinct gender values found in the catalog.
    genders: List[str] = []
    # Distinct accent values found in the catalog.
    accents: List[str] = []
    # Distinct language values found in the catalog.
    languages: List[str] = []
|
||||
|
||||
|
||||
# Response body returned by the voices endpoint.
class VoicesResponse(BaseModel):
    # Name of the provider the voices belong to.
    provider: str
    # Voices returned for this request.
    voices: List[VoiceInfo]
    # Selector values for building filter controls; None when the upstream
    # result carries no "facets" entry.
    facets: Optional[VoiceFacets] = None
|
||||
|
||||
|
||||
@router.get("/configurations/voices/{provider}")
|
||||
|
|
@ -398,6 +407,9 @@ async def get_voices(
|
|||
provider: TTSProvider,
|
||||
model: Optional[str] = None,
|
||||
language: Optional[str] = None,
|
||||
q: Optional[str] = None,
|
||||
gender: Optional[str] = None,
|
||||
accent: Optional[str] = None,
|
||||
user: UserModel = Depends(get_user),
|
||||
) -> VoicesResponse:
|
||||
"""Get available voices for a TTS provider."""
|
||||
|
|
@ -406,12 +418,16 @@ async def get_voices(
|
|||
provider=provider,
|
||||
model=model,
|
||||
language=language,
|
||||
q=q,
|
||||
gender=gender,
|
||||
accent=accent,
|
||||
organization_id=user.selected_organization_id,
|
||||
created_by=user.provider_id,
|
||||
)
|
||||
return VoicesResponse(
|
||||
provider=result.get("provider", provider),
|
||||
voices=[VoiceInfo(**voice) for voice in result.get("voices", [])],
|
||||
facets=result.get("facets"),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to fetch voices for {provider}: {e}")
|
||||
|
|
|
|||
|
|
@ -720,6 +720,9 @@ class MPSServiceKeyClient:
|
|||
provider: str,
|
||||
model: Optional[str] = None,
|
||||
language: Optional[str] = None,
|
||||
q: Optional[str] = None,
|
||||
gender: Optional[str] = None,
|
||||
accent: Optional[str] = None,
|
||||
organization_id: Optional[int] = None,
|
||||
created_by: Optional[str] = None,
|
||||
) -> dict:
|
||||
|
|
@ -745,6 +748,12 @@ class MPSServiceKeyClient:
|
|||
params["model"] = model
|
||||
if language:
|
||||
params["language"] = language
|
||||
if q:
|
||||
params["q"] = q
|
||||
if gender:
|
||||
params["gender"] = gender
|
||||
if accent:
|
||||
params["accent"] = accent
|
||||
response = await client.get(
|
||||
f"{self.base_url}/api/v1/voice-proxy/{provider}/voices",
|
||||
headers=self._get_headers(organization_id, created_by),
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -1,6 +1,6 @@
|
|||
# generated by datamodel-codegen:
|
||||
# filename: dograh-openapi-XXXXXX.json.6F33jkClt9
|
||||
# timestamp: 2026-06-19T12:41:10+00:00
|
||||
# filename: dograh-openapi-XXXXXX.json.rRr9IUrKFk
|
||||
# timestamp: 2026-06-23T13:02:10+00:00
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -6294,6 +6294,26 @@ export type VobizConfigurationResponse = {
|
|||
from_numbers: Array<string>;
|
||||
};
|
||||
|
||||
/**
 * VoiceFacets
 *
 * Distinct selector values across a provider's full voice catalog.
 */
export type VoiceFacets = {
    /** Genders */
    genders?: string[];
    /** Accents */
    accents?: string[];
    /** Languages */
    languages?: string[];
};
|
||||
|
||||
/**
|
||||
* VoiceInfo
|
||||
*/
|
||||
|
|
@ -6340,6 +6360,7 @@ export type VoicesResponse = {
|
|||
* Voices
|
||||
*/
|
||||
voices: Array<VoiceInfo>;
|
||||
facets?: VoiceFacets | null;
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
@ -9208,6 +9229,18 @@ export type GetVoicesApiV1UserConfigurationsVoicesProviderGetData = {
|
|||
* Language
|
||||
*/
|
||||
language?: string | null;
|
||||
/**
|
||||
* Q
|
||||
*/
|
||||
q?: string | null;
|
||||
/**
|
||||
* Gender
|
||||
*/
|
||||
gender?: string | null;
|
||||
/**
|
||||
* Accent
|
||||
*/
|
||||
accent?: string | null;
|
||||
};
|
||||
url: '/api/v1/user/configurations/voices/{provider}';
|
||||
};
|
||||
|
|
|
|||
|
|
@ -12,11 +12,11 @@ import {
|
|||
} from "@/components/ServiceConfigurationForm";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Card, CardContent } from "@/components/ui/card";
|
||||
import { Checkbox } from "@/components/ui/checkbox";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
|
||||
import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
|
||||
import { VoiceSelectorModal } from "@/components/VoiceSelectorModal";
|
||||
import { LANGUAGE_DISPLAY_NAMES } from "@/constants/languages";
|
||||
|
||||
type ModelMode = "realtime" | "dograh" | "byok";
|
||||
|
|
@ -278,7 +278,6 @@ export function AIModelConfigurationV2Editor({
|
|||
const [realtimeInitialConfig, setRealtimeInitialConfig] = useState<Record<string, unknown> | null>(null);
|
||||
const [pipelineInitialConfig, setPipelineInitialConfig] = useState<Record<string, unknown> | null>(null);
|
||||
const [isSavingDograh, setIsSavingDograh] = useState(false);
|
||||
const [isCustomVoice, setIsCustomVoice] = useState(false);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
const allowCustomVoice = defaults.dograh.allow_custom_input ?? false;
|
||||
|
|
@ -290,7 +289,6 @@ export function AIModelConfigurationV2Editor({
|
|||
setMode(preferredMode(rawConfiguration, rawEffectiveConfiguration));
|
||||
const nextDograh = buildDograhState(defaults, rawConfiguration, rawEffectiveConfiguration);
|
||||
setDograh(nextDograh);
|
||||
setIsCustomVoice(allowCustomVoice && !defaults.dograh.voices.includes(nextDograh.voice));
|
||||
setRealtimeInitialConfig(getByokInitialConfig(rawConfiguration, rawEffectiveConfiguration, true));
|
||||
setPipelineInitialConfig(getByokInitialConfig(rawConfiguration, rawEffectiveConfiguration, false));
|
||||
}, [configuration, defaults, effectiveConfiguration, allowCustomVoice]);
|
||||
|
|
@ -390,46 +388,30 @@ export function AIModelConfigurationV2Editor({
|
|||
<Card>
|
||||
<CardContent className="pt-6">
|
||||
<div className="grid gap-4 sm:grid-cols-2">
|
||||
<div className="space-y-2">
|
||||
<div className="space-y-2 sm:col-span-2">
|
||||
<Label>Voice</Label>
|
||||
{isCustomVoice ? (
|
||||
<Input
|
||||
placeholder="Enter voice"
|
||||
value={dograh.voice}
|
||||
onChange={(event) => setDograh({ ...dograh, voice: event.target.value })}
|
||||
/>
|
||||
) : (
|
||||
<Select value={dograh.voice} onValueChange={(voice) => setDograh({ ...dograh, voice })}>
|
||||
<SelectTrigger className="w-full">
|
||||
<SelectValue placeholder="Select voice" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
{defaults.dograh.voices.map((voice) => (
|
||||
<SelectItem key={voice} value={voice}>
|
||||
{voice}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
)}
|
||||
{allowCustomVoice && (
|
||||
<div className="flex items-center space-x-2">
|
||||
<Checkbox
|
||||
id="dograh-custom-voice"
|
||||
checked={isCustomVoice}
|
||||
onCheckedChange={(checked) => {
|
||||
const custom = checked as boolean;
|
||||
setIsCustomVoice(custom);
|
||||
if (!custom) {
|
||||
setDograh({ ...dograh, voice: defaults.dograh.defaults.voice });
|
||||
}
|
||||
}}
|
||||
/>
|
||||
<Label htmlFor="dograh-custom-voice" className="text-sm font-normal cursor-pointer">
|
||||
Enter Custom Value
|
||||
</Label>
|
||||
</div>
|
||||
)}
|
||||
<VoiceSelectorModal
|
||||
provider="dograh"
|
||||
value={dograh.voice}
|
||||
onChange={(voice) => setDograh({ ...dograh, voice })}
|
||||
allowManualInput={allowCustomVoice}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="space-y-2 sm:col-span-2">
|
||||
<Label>Language</Label>
|
||||
<Select value={dograh.language} onValueChange={(language) => setDograh({ ...dograh, language })}>
|
||||
<SelectTrigger className="w-full">
|
||||
<SelectValue placeholder="Select language" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
{defaults.dograh.languages.map((language) => (
|
||||
<SelectItem key={language} value={language}>
|
||||
{LANGUAGE_DISPLAY_NAMES[language] || language}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
|
||||
<div className="space-y-2">
|
||||
|
|
@ -451,23 +433,7 @@ export function AIModelConfigurationV2Editor({
|
|||
/>
|
||||
</div>
|
||||
|
||||
<div className="space-y-2 sm:col-span-2">
|
||||
<Label>Language</Label>
|
||||
<Select value={dograh.language} onValueChange={(language) => setDograh({ ...dograh, language })}>
|
||||
<SelectTrigger className="w-full">
|
||||
<SelectValue placeholder="Select language" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
{defaults.dograh.languages.map((language) => (
|
||||
<SelectItem key={language} value={language}>
|
||||
{LANGUAGE_DISPLAY_NAMES[language] || language}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
|
||||
<div className="space-y-2 sm:col-span-2">
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="dograh-api-key">API Key</Label>
|
||||
<div className="relative">
|
||||
<KeyRound className="pointer-events-none absolute left-3 top-1/2 h-4 w-4 -translate-y-1/2 text-muted-foreground" />
|
||||
|
|
|
|||
|
|
@ -10,11 +10,13 @@ import { Checkbox } from "@/components/ui/checkbox";
|
|||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
|
||||
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
|
||||
import { cn } from "@/lib/utils";
|
||||
|
||||
/** TTS providers that have MPS voice endpoints. */
type TTSProviderWithVoices =
  | "elevenlabs"
  | "deepgram"
  | "sarvam"
  | "cartesia"
  | "dograh"
  | "rime";

/** Runtime list of the providers named by `TTSProviderWithVoices`. */
const MPS_VOICE_PROVIDERS: TTSProviderWithVoices[] = [
  "elevenlabs",
  "deepgram",
  "sarvam",
  "cartesia",
  "dograh",
  "rime",
];

/** Sentinel filter value meaning "no restriction" for the gender/language/accent filters. */
const ALL_FILTER_VALUE = "__all__";
|
||||
|
||||
interface VoiceSelectorProps {
|
||||
provider: string;
|
||||
|
|
@ -22,6 +24,8 @@ interface VoiceSelectorProps {
|
|||
onChange: (voiceId: string) => void;
|
||||
model?: string;
|
||||
language?: string;
|
||||
showFilters?: boolean;
|
||||
allowManualInput?: boolean;
|
||||
className?: string;
|
||||
}
|
||||
|
||||
|
|
@ -31,10 +35,15 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
|
|||
onChange,
|
||||
model,
|
||||
language,
|
||||
showFilters = false,
|
||||
allowManualInput = true,
|
||||
className,
|
||||
}) => {
|
||||
const [isOpen, setIsOpen] = useState(false);
|
||||
const [searchTerm, setSearchTerm] = useState("");
|
||||
const [genderFilter, setGenderFilter] = useState(ALL_FILTER_VALUE);
|
||||
const [languageFilter, setLanguageFilter] = useState(ALL_FILTER_VALUE);
|
||||
const [accentFilter, setAccentFilter] = useState(ALL_FILTER_VALUE);
|
||||
const [isManualInput, setIsManualInput] = useState(false);
|
||||
const [manualVoiceId, setManualVoiceId] = useState(value || "");
|
||||
const [voices, setVoices] = useState<VoiceInfo[]>([]);
|
||||
|
|
@ -102,13 +111,15 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
|
|||
useEffect(() => {
|
||||
if (value && voices.length > 0) {
|
||||
const voiceExists = voices.some((v) => v.voice_id === value);
|
||||
if (!voiceExists) {
|
||||
if (!voiceExists && allowManualInput) {
|
||||
// If the value doesn't exist in the list, switch to manual input mode
|
||||
setIsManualInput(true);
|
||||
setManualVoiceId(value);
|
||||
} else if (voiceExists) {
|
||||
setIsManualInput(false);
|
||||
}
|
||||
}
|
||||
}, [value, voices]);
|
||||
}, [value, voices, allowManualInput]);
|
||||
|
||||
// Cleanup audio on unmount or when popover closes
|
||||
useEffect(() => {
|
||||
|
|
@ -131,7 +142,7 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
|
|||
|
||||
const filteredVoices = voices.filter((voice) => {
|
||||
const searchLower = searchTerm.toLowerCase();
|
||||
return (
|
||||
const matchesSearch = (
|
||||
voice.name.toLowerCase().includes(searchLower) ||
|
||||
voice.voice_id.toLowerCase().includes(searchLower) ||
|
||||
(voice.description?.toLowerCase() || "").includes(searchLower) ||
|
||||
|
|
@ -139,8 +150,23 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
|
|||
(voice.gender?.toLowerCase() || "").includes(searchLower) ||
|
||||
(voice.language?.toLowerCase() || "").includes(searchLower)
|
||||
);
|
||||
if (!matchesSearch) return false;
|
||||
if (genderFilter !== ALL_FILTER_VALUE && (voice.gender || "").toLowerCase() !== genderFilter) return false;
|
||||
if (languageFilter !== ALL_FILTER_VALUE && (voice.language || "").toLowerCase() !== languageFilter) return false;
|
||||
if (accentFilter !== ALL_FILTER_VALUE && (voice.accent || "").toLowerCase() !== accentFilter) return false;
|
||||
return true;
|
||||
});
|
||||
|
||||
const genderOptions = Array.from(
|
||||
new Set(voices.map((voice) => voice.gender?.toLowerCase()).filter(Boolean) as string[]),
|
||||
).sort();
|
||||
const languageOptions = Array.from(
|
||||
new Set(voices.map((voice) => voice.language?.toLowerCase()).filter(Boolean) as string[]),
|
||||
).sort();
|
||||
const accentOptions = Array.from(
|
||||
new Set(voices.map((voice) => voice.accent?.toLowerCase()).filter(Boolean) as string[]),
|
||||
).sort();
|
||||
|
||||
const handleSelectVoice = (voiceId: string) => {
|
||||
onChange(voiceId);
|
||||
setIsOpen(false);
|
||||
|
|
@ -148,6 +174,7 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
|
|||
};
|
||||
|
||||
const handleManualInputToggle = (checked: boolean) => {
|
||||
if (!allowManualInput) return;
|
||||
setIsManualInput(checked);
|
||||
if (checked) {
|
||||
setManualVoiceId(value || "");
|
||||
|
|
@ -219,7 +246,7 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
|
|||
);
|
||||
}
|
||||
|
||||
if (isManualInput) {
|
||||
if (isManualInput && allowManualInput) {
|
||||
return (
|
||||
<div className={cn("space-y-2", className)}>
|
||||
<Input
|
||||
|
|
@ -281,6 +308,52 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
|
|||
/>
|
||||
</div>
|
||||
|
||||
{showFilters && (
|
||||
<div className="grid gap-2 sm:grid-cols-3">
|
||||
<Select value={genderFilter} onValueChange={setGenderFilter}>
|
||||
<SelectTrigger className="h-8">
|
||||
<SelectValue placeholder="Gender" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem value={ALL_FILTER_VALUE}>All genders</SelectItem>
|
||||
{genderOptions.map((gender) => (
|
||||
<SelectItem key={gender} value={gender} className="capitalize">
|
||||
{gender}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
|
||||
<Select value={languageFilter} onValueChange={setLanguageFilter}>
|
||||
<SelectTrigger className="h-8">
|
||||
<SelectValue placeholder="Language" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem value={ALL_FILTER_VALUE}>All languages</SelectItem>
|
||||
{languageOptions.map((voiceLanguage) => (
|
||||
<SelectItem key={voiceLanguage} value={voiceLanguage} className="uppercase">
|
||||
{voiceLanguage}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
|
||||
<Select value={accentFilter} onValueChange={setAccentFilter}>
|
||||
<SelectTrigger className="h-8">
|
||||
<SelectValue placeholder="Accent" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem value={ALL_FILTER_VALUE}>All accents</SelectItem>
|
||||
{accentOptions.map((accent) => (
|
||||
<SelectItem key={accent} value={accent} className="uppercase">
|
||||
{accent}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<div className="max-h-[300px] overflow-auto space-y-1">
|
||||
{error ? (
|
||||
<p className="text-sm text-red-500 text-center py-4">
|
||||
|
|
@ -358,26 +431,30 @@ export const VoiceSelector: React.FC<VoiceSelectorProps> = ({
|
|||
</div>
|
||||
|
||||
<div className="pt-2 border-t flex items-center justify-between">
|
||||
<div className="flex items-center space-x-2">
|
||||
<Checkbox
|
||||
id="manual-voice-input-popup"
|
||||
checked={isManualInput}
|
||||
onCheckedChange={(checked) => {
|
||||
handleManualInputToggle(checked as boolean);
|
||||
if (checked) {
|
||||
setIsOpen(false);
|
||||
}
|
||||
}}
|
||||
/>
|
||||
<Label
|
||||
htmlFor="manual-voice-input-popup"
|
||||
className="text-sm font-normal cursor-pointer"
|
||||
>
|
||||
Add Voice ID Manually
|
||||
</Label>
|
||||
</div>
|
||||
{allowManualInput ? (
|
||||
<div className="flex items-center space-x-2">
|
||||
<Checkbox
|
||||
id="manual-voice-input-popup"
|
||||
checked={isManualInput}
|
||||
onCheckedChange={(checked) => {
|
||||
handleManualInputToggle(checked as boolean);
|
||||
if (checked) {
|
||||
setIsOpen(false);
|
||||
}
|
||||
}}
|
||||
/>
|
||||
<Label
|
||||
htmlFor="manual-voice-input-popup"
|
||||
className="text-sm font-normal cursor-pointer"
|
||||
>
|
||||
Add Voice ID Manually
|
||||
</Label>
|
||||
</div>
|
||||
) : (
|
||||
<span />
|
||||
)}
|
||||
<p className="text-xs text-muted-foreground">
|
||||
{voices.length} voices available
|
||||
{filteredVoices.length} of {voices.length} voices
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
|||
451
ui/src/components/VoiceSelectorModal.tsx
Normal file
451
ui/src/components/VoiceSelectorModal.tsx
Normal file
|
|
@ -0,0 +1,451 @@
|
|||
"use client";
|
||||
|
||||
import { Check, ChevronDown, Loader2, Pencil, Play, Square } from "lucide-react";
|
||||
import { useCallback, useEffect, useMemo, useRef, useState } from "react";
|
||||
|
||||
import { getVoicesApiV1UserConfigurationsVoicesProviderGet } from "@/client/sdk.gen";
|
||||
import { VoiceInfo } from "@/client/types.gen";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import {
|
||||
Dialog,
|
||||
DialogContent,
|
||||
DialogHeader,
|
||||
DialogTitle,
|
||||
} from "@/components/ui/dialog";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
|
||||
import { ACCENT_DISPLAY_NAMES } from "@/constants/accents";
|
||||
import { LANGUAGE_DISPLAY_NAMES } from "@/constants/languages";
|
||||
import { cn } from "@/lib/utils";
|
||||
|
||||
/** Sentinel Select value meaning "do not filter on this attribute". */
const ALL_FILTER_VALUE = "__all__";

/*
 * Initial filter values, chosen so the modal opens on a focused set of
 * voices instead of the full catalog.
 */
const DEFAULT_GENDER = "female";
/** "us" is the accent code for American. */
const DEFAULT_ACCENT = "us";
const DEFAULT_LANGUAGE = "en";

/** Delay, in milliseconds, before a change in the search box is applied. */
const SEARCH_DEBOUNCE_MS = 300;
|
||||
|
||||
/** Filter option lists for the three selector dropdowns; every list is always present. */
interface Facets {
  /** Gender values offered by the gender filter. */
  genders: string[];
  /** Accent codes offered by the accent filter. */
  accents: string[];
  /** Language codes offered by the language filter. */
  languages: string[];
}

/** Initial facet state used before any facets have been received. */
const EMPTY_FACETS: Facets = {
  genders: [],
  accents: [],
  languages: [],
};
|
||||
|
||||
/** Props accepted by `VoiceSelectorModal`. */
interface VoiceSelectorModalProps {
  /** Provider whose voice catalog is queried. */
  provider: string;
  /** Currently selected voice ID; an empty string means nothing is selected. */
  value: string;
  /** Called with the chosen voice ID when the user commits a selection. */
  onChange: (voiceId: string) => void;
  /** Optional model passed through to the voice catalog query. */
  model?: string;
  /** Allow typing a raw voice ID for voices outside the catalog. */
  allowManualInput?: boolean;
  /** Extra class names merged onto the component's wrapper element. */
  className?: string;
}
|
||||
|
||||
/** Upper-case the first character of `value`; an empty string comes back unchanged. */
const capitalize = (value: string) => value.charAt(0).toUpperCase() + value.slice(1);

/**
 * Human-readable label for an accent code.
 *
 * Looks the lower-cased code up in `ACCENT_DISPLAY_NAMES` and falls back to
 * the capitalized raw code. Returns "" for a missing or empty code.
 */
const accentLabel = (code?: string | null) =>
  code ? ACCENT_DISPLAY_NAMES[code.toLowerCase()] || capitalize(code) : "";

/**
 * Human-readable label for a language code.
 *
 * Tries the code exactly as given first, then its lower-cased form, so a code
 * that differs only by case (e.g. "EN" vs "en") still resolves to its display
 * name, matching how `accentLabel` normalizes its input. Falls back to the
 * upper-cased raw code. Returns "" for a missing or empty code.
 */
const languageLabel = (code?: string | null) => {
  if (!code) return "";
  return (
    LANGUAGE_DISPLAY_NAMES[code] ||
    LANGUAGE_DISPLAY_NAMES[code.toLowerCase()] ||
    code.toUpperCase()
  );
};

/** Human-readable label for a gender value: the capitalized value, or "" when missing. */
const genderLabel = (gender?: string | null) => (gender ? capitalize(gender) : "");
|
||||
|
||||
/**
 * Compose the trait line shown under a voice name, in the order
 * accent, gender, language. Traits with an empty label are left out and the
 * rest are joined with " · ".
 */
function voiceTraits(voice: VoiceInfo): string {
  const labels = [
    accentLabel(voice.accent),
    genderLabel(voice.gender),
    languageLabel(voice.language),
  ];
  const present: string[] = [];
  for (const label of labels) {
    if (label) present.push(label);
  }
  return present.join(" · ");
}
|
||||
|
||||
/**
 * Guarantee the active filter value appears among the options so the Select
 * can render it. When `selected` is a real value missing from `options`, a
 * new array with `selected` placed first is returned; otherwise `options`
 * itself is returned.
 */
function withSelected(options: string[], selected: string): string[] {
  const needsInsert = selected !== ALL_FILTER_VALUE && !options.includes(selected);
  if (needsInsert) {
    return [selected, ...options];
  }
  return options;
}
|
||||
|
||||
export const VoiceSelectorModal: React.FC<VoiceSelectorModalProps> = ({
|
||||
provider,
|
||||
value,
|
||||
onChange,
|
||||
model,
|
||||
allowManualInput = false,
|
||||
className,
|
||||
}) => {
|
||||
const [isOpen, setIsOpen] = useState(false);
|
||||
const [voices, setVoices] = useState<VoiceInfo[]>([]);
|
||||
const [facets, setFacets] = useState<Facets>(EMPTY_FACETS);
|
||||
const [isLoading, setIsLoading] = useState(false);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
// Filters drive a server-side query (we never fetch the whole catalog).
|
||||
const [gender, setGender] = useState(DEFAULT_GENDER);
|
||||
const [accent, setAccent] = useState(DEFAULT_ACCENT);
|
||||
const [language, setLanguage] = useState(DEFAULT_LANGUAGE);
|
||||
const [searchInput, setSearchInput] = useState("");
|
||||
const [debouncedSearch, setDebouncedSearch] = useState("");
|
||||
|
||||
// Pending (in-modal) selection; only committed via "Use this voice".
|
||||
const [pendingVoiceId, setPendingVoiceId] = useState(value);
|
||||
const [selectedVoiceInfo, setSelectedVoiceInfo] = useState<VoiceInfo | null>(null);
|
||||
const [manualMode, setManualMode] = useState(false);
|
||||
const [manualVoiceId, setManualVoiceId] = useState("");
|
||||
|
||||
// Preview playback.
|
||||
const [playingVoiceId, setPlayingVoiceId] = useState<string | null>(null);
|
||||
const audioRef = useRef<HTMLAudioElement | null>(null);
|
||||
const requestId = useRef(0);
|
||||
|
||||
const stopPreview = useCallback(() => {
|
||||
if (audioRef.current) {
|
||||
audioRef.current.pause();
|
||||
audioRef.current = null;
|
||||
}
|
||||
setPlayingVoiceId(null);
|
||||
}, []);
|
||||
|
||||
// Debounce the search box so typing doesn't fire a request per keystroke.
|
||||
useEffect(() => {
|
||||
const timer = setTimeout(() => setDebouncedSearch(searchInput), SEARCH_DEBOUNCE_MS);
|
||||
return () => clearTimeout(timer);
|
||||
}, [searchInput]);
|
||||
|
||||
// Resolve the currently-selected voice (for the trigger label) without
|
||||
// pulling the catalog: a targeted lookup by voice ID.
|
||||
useEffect(() => {
|
||||
if (!value) {
|
||||
setSelectedVoiceInfo(null);
|
||||
return;
|
||||
}
|
||||
let active = true;
|
||||
(async () => {
|
||||
const response = await getVoicesApiV1UserConfigurationsVoicesProviderGet({
|
||||
path: { provider: provider as never },
|
||||
query: { q: value },
|
||||
});
|
||||
if (!active) return;
|
||||
const found = response.data?.voices?.find((voice) => voice.voice_id === value) ?? null;
|
||||
setSelectedVoiceInfo(found);
|
||||
})();
|
||||
return () => {
|
||||
active = false;
|
||||
};
|
||||
}, [value, provider]);
|
||||
|
||||
// Fetch the filtered voice list (server-side) whenever the modal is open
|
||||
// and a filter changes. A request counter discards out-of-order responses.
|
||||
useEffect(() => {
|
||||
if (!isOpen || manualMode) return;
|
||||
const id = ++requestId.current;
|
||||
setIsLoading(true);
|
||||
setError(null);
|
||||
(async () => {
|
||||
const query: Record<string, string> = {};
|
||||
if (model) query.model = model;
|
||||
if (gender !== ALL_FILTER_VALUE) query.gender = gender;
|
||||
if (accent !== ALL_FILTER_VALUE) query.accent = accent;
|
||||
if (language !== ALL_FILTER_VALUE) query.language = language;
|
||||
const search = debouncedSearch.trim();
|
||||
if (search) query.q = search;
|
||||
|
||||
const response = await getVoicesApiV1UserConfigurationsVoicesProviderGet({
|
||||
path: { provider: provider as never },
|
||||
query,
|
||||
});
|
||||
if (id !== requestId.current) return; // a newer request superseded this one
|
||||
|
||||
if (response.error) {
|
||||
setError("Failed to load voices");
|
||||
setVoices([]);
|
||||
} else {
|
||||
setVoices(response.data?.voices ?? []);
|
||||
if (response.data?.facets) {
|
||||
setFacets({
|
||||
genders: response.data.facets.genders ?? [],
|
||||
accents: response.data.facets.accents ?? [],
|
||||
languages: response.data.facets.languages ?? [],
|
||||
});
|
||||
}
|
||||
}
|
||||
setIsLoading(false);
|
||||
})();
|
||||
}, [isOpen, manualMode, provider, model, gender, accent, language, debouncedSearch]);
|
||||
|
||||
// Stop any preview when the modal closes / unmounts.
|
||||
useEffect(() => {
|
||||
if (!isOpen) stopPreview();
|
||||
return () => stopPreview();
|
||||
}, [isOpen, stopPreview]);
|
||||
|
||||
// Facets arrive sorted by raw code; present them sorted by display label so
|
||||
// the dropdowns read alphabetically (e.g. "American" near the top, not "us").
|
||||
const toSortedOptions = (codes: string[], selected: string, label: (code: string) => string) =>
|
||||
withSelected(codes, selected)
|
||||
.map((code) => ({ value: code, label: label(code) }))
|
||||
.sort((a, b) => a.label.localeCompare(b.label));
|
||||
|
||||
const genderOptions = useMemo(
|
||||
() => toSortedOptions(facets.genders, gender, genderLabel),
|
||||
[facets.genders, gender],
|
||||
);
|
||||
const accentOptions = useMemo(
|
||||
() => toSortedOptions(facets.accents, accent, accentLabel),
|
||||
[facets.accents, accent],
|
||||
);
|
||||
const languageOptions = useMemo(
|
||||
() => toSortedOptions(facets.languages, language, languageLabel),
|
||||
[facets.languages, language],
|
||||
);
|
||||
|
||||
const openModal = () => {
|
||||
setGender(DEFAULT_GENDER);
|
||||
setAccent(DEFAULT_ACCENT);
|
||||
setLanguage(DEFAULT_LANGUAGE);
|
||||
setSearchInput("");
|
||||
setDebouncedSearch("");
|
||||
setManualMode(false);
|
||||
setManualVoiceId(value);
|
||||
setPendingVoiceId(value);
|
||||
setIsOpen(true);
|
||||
};
|
||||
|
||||
const playPreview = (voice: VoiceInfo) => {
|
||||
if (playingVoiceId === voice.voice_id) {
|
||||
stopPreview();
|
||||
return;
|
||||
}
|
||||
stopPreview();
|
||||
if (!voice.preview_url) return;
|
||||
const audio = new Audio(voice.preview_url);
|
||||
audioRef.current = audio;
|
||||
setPlayingVoiceId(voice.voice_id);
|
||||
const clear = () => {
|
||||
if (audioRef.current === audio) audioRef.current = null;
|
||||
setPlayingVoiceId((current) => (current === voice.voice_id ? null : current));
|
||||
};
|
||||
audio.onended = clear;
|
||||
audio.onerror = clear;
|
||||
audio.play().catch(clear);
|
||||
};
|
||||
|
||||
const commitSelection = () => {
|
||||
if (manualMode) {
|
||||
const next = manualVoiceId.trim();
|
||||
if (next) onChange(next);
|
||||
} else if (pendingVoiceId) {
|
||||
onChange(pendingVoiceId);
|
||||
const chosen = voices.find((voice) => voice.voice_id === pendingVoiceId);
|
||||
if (chosen) setSelectedVoiceInfo(chosen);
|
||||
}
|
||||
setIsOpen(false);
|
||||
};
|
||||
|
||||
const triggerLabel = selectedVoiceInfo?.name || value || "Select a voice";
|
||||
const triggerTraits = selectedVoiceInfo ? voiceTraits(selectedVoiceInfo) : "";
|
||||
|
||||
return (
|
||||
<div className={cn("space-y-2", className)}>
|
||||
<Button
|
||||
type="button"
|
||||
variant="outline"
|
||||
className={cn("w-full justify-between", !value && "text-muted-foreground")}
|
||||
onClick={openModal}
|
||||
>
|
||||
<span className="flex min-w-0 items-center gap-2">
|
||||
<span className="truncate font-medium">{triggerLabel}</span>
|
||||
{triggerTraits && (
|
||||
<span className="truncate text-xs text-muted-foreground">{triggerTraits}</span>
|
||||
)}
|
||||
</span>
|
||||
<ChevronDown className="ml-2 h-4 w-4 shrink-0 opacity-50" />
|
||||
</Button>
|
||||
|
||||
<Dialog open={isOpen} onOpenChange={setIsOpen}>
|
||||
<DialogContent className="flex max-h-[85vh] flex-col gap-0 overflow-hidden p-0 sm:max-w-3xl">
|
||||
<DialogHeader className="border-b px-6 py-4">
|
||||
<DialogTitle>Select Voice</DialogTitle>
|
||||
</DialogHeader>
|
||||
|
||||
{/* Filter row: Gender · Accent · Language · Search */}
|
||||
<div className="flex flex-wrap items-center gap-2 border-b px-6 py-3">
|
||||
<Select value={gender} onValueChange={setGender} disabled={manualMode}>
|
||||
<SelectTrigger className="h-9 w-[130px]">
|
||||
<SelectValue placeholder="Gender" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem value={ALL_FILTER_VALUE}>All genders</SelectItem>
|
||||
{genderOptions.map((option) => (
|
||||
<SelectItem key={option.value} value={option.value}>
|
||||
{option.label}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
|
||||
<Select value={accent} onValueChange={setAccent} disabled={manualMode}>
|
||||
<SelectTrigger className="h-9 w-[140px]">
|
||||
<SelectValue placeholder="Accent" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem value={ALL_FILTER_VALUE}>All accents</SelectItem>
|
||||
{accentOptions.map((option) => (
|
||||
<SelectItem key={option.value} value={option.value}>
|
||||
{option.label}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
|
||||
<Select value={language} onValueChange={setLanguage} disabled={manualMode}>
|
||||
<SelectTrigger className="h-9 w-[150px]">
|
||||
<SelectValue placeholder="Language" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem value={ALL_FILTER_VALUE}>All languages</SelectItem>
|
||||
{languageOptions.map((option) => (
|
||||
<SelectItem key={option.value} value={option.value}>
|
||||
{option.label}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
|
||||
<Input
|
||||
placeholder="Search voices..."
|
||||
value={searchInput}
|
||||
onChange={(event) => setSearchInput(event.target.value)}
|
||||
className="h-9 min-w-[160px] flex-1"
|
||||
disabled={manualMode}
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* Body */}
|
||||
<div className="min-h-[260px] flex-1 overflow-auto px-6 py-4">
|
||||
{manualMode ? (
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="manual-voice-id">Custom voice ID</Label>
|
||||
<Input
|
||||
id="manual-voice-id"
|
||||
placeholder="Enter voice ID"
|
||||
value={manualVoiceId}
|
||||
onChange={(event) => setManualVoiceId(event.target.value)}
|
||||
autoFocus
|
||||
/>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
Use a voice ID that isn't in the catalog above.
|
||||
</p>
|
||||
</div>
|
||||
) : error ? (
|
||||
<p className="py-10 text-center text-sm text-destructive">{error}</p>
|
||||
) : isLoading ? (
|
||||
<div className="flex items-center justify-center py-10">
|
||||
<Loader2 className="h-6 w-6 animate-spin text-muted-foreground" />
|
||||
</div>
|
||||
) : voices.length === 0 ? (
|
||||
<p className="py-10 text-center text-sm text-muted-foreground">
|
||||
No voices match these filters
|
||||
</p>
|
||||
) : (
|
||||
<div className="grid gap-2 sm:grid-cols-2">
|
||||
{voices.map((voice) => {
|
||||
const isSelected = pendingVoiceId === voice.voice_id;
|
||||
const isPlaying = playingVoiceId === voice.voice_id;
|
||||
return (
|
||||
<button
|
||||
type="button"
|
||||
key={voice.voice_id}
|
||||
onClick={() => setPendingVoiceId(voice.voice_id)}
|
||||
className={cn(
|
||||
"flex items-center gap-3 rounded-lg border p-3 text-left transition-colors hover:bg-accent",
|
||||
isSelected ? "border-primary ring-1 ring-primary" : "border-border",
|
||||
)}
|
||||
>
|
||||
<span
|
||||
role="button"
|
||||
tabIndex={voice.preview_url ? 0 : -1}
|
||||
aria-label={isPlaying ? "Stop preview" : "Play preview"}
|
||||
onClick={(event) => {
|
||||
event.stopPropagation();
|
||||
playPreview(voice);
|
||||
}}
|
||||
onKeyDown={(event) => {
|
||||
if (event.key === "Enter" || event.key === " ") {
|
||||
event.preventDefault();
|
||||
event.stopPropagation();
|
||||
playPreview(voice);
|
||||
}
|
||||
}}
|
||||
className={cn(
|
||||
"flex h-10 w-10 shrink-0 items-center justify-center rounded-full",
|
||||
voice.preview_url
|
||||
? "bg-primary/10 text-primary hover:bg-primary/20"
|
||||
: "bg-muted text-muted-foreground",
|
||||
)}
|
||||
>
|
||||
{isPlaying ? (
|
||||
<Square className="h-4 w-4 fill-current" />
|
||||
) : (
|
||||
<Play className="h-4 w-4 fill-current" />
|
||||
)}
|
||||
</span>
|
||||
<span className="flex min-w-0 flex-1 flex-col">
|
||||
<span className="flex items-center gap-2">
|
||||
<span className="truncate text-sm font-medium">{voice.name}</span>
|
||||
{isSelected && <Check className="h-4 w-4 shrink-0 text-primary" />}
|
||||
</span>
|
||||
{voiceTraits(voice) && (
|
||||
<span className="truncate text-xs text-muted-foreground">
|
||||
{voiceTraits(voice)}
|
||||
</span>
|
||||
)}
|
||||
<span className="truncate text-[11px] text-muted-foreground/70">
|
||||
ID: {voice.voice_id}
|
||||
</span>
|
||||
</span>
|
||||
</button>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Footer */}
|
||||
<div className="flex items-center justify-between gap-3 border-t px-6 py-3">
|
||||
{allowManualInput ? (
|
||||
<Button
|
||||
type="button"
|
||||
variant="ghost"
|
||||
size="sm"
|
||||
className="text-muted-foreground"
|
||||
onClick={() => setManualMode((prev) => !prev)}
|
||||
>
|
||||
<Pencil className="mr-2 h-4 w-4" />
|
||||
{manualMode ? "Browse catalog" : "Custom voice ID"}
|
||||
</Button>
|
||||
) : (
|
||||
<span className="text-xs text-muted-foreground">
|
||||
{!manualMode && !isLoading && !error ? `${voices.length} voices` : ""}
|
||||
</span>
|
||||
)}
|
||||
<div className="flex items-center gap-2">
|
||||
<Button type="button" variant="outline" onClick={() => setIsOpen(false)}>
|
||||
Cancel
|
||||
</Button>
|
||||
<Button
|
||||
type="button"
|
||||
onClick={commitSelection}
|
||||
disabled={manualMode ? !manualVoiceId.trim() : !pendingVoiceId}
|
||||
>
|
||||
Use this voice
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
</DialogContent>
|
||||
</Dialog>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
56
ui/src/constants/accents.ts
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
/**
 * Display names for accent codes returned by the voice catalog.
 *
 * The catalog derives accent from a voice's locale country (e.g. "en-US" -> "us"),
 * so the stored/filter value is an ISO 3166-1 alpha-2 country code (lowercase).
 * These are the human-readable accent labels shown in the UI; the underlying
 * code stays the filter value.
 *
 * Unknown codes are not listed here and fall back to a capitalized form at the
 * call site. Note that indexing with a code that is missing from this map
 * yields `undefined` at runtime even though the declared value type is
 * `string`, so callers should always apply that fallback.
 */
export const ACCENT_DISPLAY_NAMES: Record<string, string> = {
  // English-speaking regions
  us: "American",
  gb: "British",
  au: "Australian",
  ca: "Canadian",
  ie: "Irish",
  nz: "New Zealand",
  za: "South African",
  // South and Southeast Asia
  in: "Indian",
  bd: "Bangladeshi",
  sg: "Singaporean",
  my: "Malaysian",
  ph: "Filipino",
  id: "Indonesian",
  vn: "Vietnamese",
  th: "Thai",
  // East Asia
  cn: "Chinese",
  jp: "Japanese",
  kr: "Korean",
  // Western and Southern Europe
  fr: "French",
  de: "German",
  ch: "Swiss",
  nl: "Dutch",
  it: "Italian",
  es: "Spanish",
  // Latin America and Portugal
  mx: "Mexican",
  co: "Colombian",
  bo: "Bolivian",
  br: "Brazilian",
  pt: "Portuguese",
  // Central and Eastern Europe
  ru: "Russian",
  ua: "Ukrainian",
  pl: "Polish",
  cz: "Czech",
  sk: "Slovak",
  hu: "Hungarian",
  ro: "Romanian",
  bg: "Bulgarian",
  hr: "Croatian",
  gr: "Greek",
  ge: "Georgian",
  md: "Moldovan",
  // Nordics
  se: "Swedish",
  no: "Norwegian",
  dk: "Danish",
  fi: "Finnish",
  // Middle East
  tr: "Turkish",
  il: "Israeli",
  sa: "Saudi",
};
|
||||
Loading…
Add table
Add a link
Reference in a new issue