mirror of
https://github.com/rowboatlabs/rowboat.git
synced 2026-04-28 09:56:23 +02:00
voice mode with TTS input/output
This commit is contained in:
parent
d150294af1
commit
47d5118448
17 changed files with 937 additions and 15 deletions
|
|
@ -894,11 +894,19 @@ export async function* streamAgent({
|
|||
}
|
||||
|
||||
// get any queued user messages
|
||||
let voiceInput = false;
|
||||
let voiceOutput: 'summary' | 'full' | null = null;
|
||||
while (true) {
|
||||
const msg = await messageQueue.dequeue(runId);
|
||||
if (!msg) {
|
||||
break;
|
||||
}
|
||||
if (msg.voiceInput) {
|
||||
voiceInput = true;
|
||||
}
|
||||
if (msg.voiceOutput) {
|
||||
voiceOutput = msg.voiceOutput;
|
||||
}
|
||||
loopLogger.log('dequeued user message', msg.messageId);
|
||||
yield* processEvent({
|
||||
runId,
|
||||
|
|
@ -938,7 +946,18 @@ export async function* streamAgent({
|
|||
minute: '2-digit',
|
||||
timeZoneName: 'short'
|
||||
});
|
||||
const instructionsWithDateTime = `Current date and time: ${currentDateTime}\n\n${agent.instructions}`;
|
||||
let instructionsWithDateTime = `Current date and time: ${currentDateTime}\n\n${agent.instructions}`;
|
||||
if (voiceInput) {
|
||||
loopLogger.log('voice input enabled, injecting voice input prompt');
|
||||
instructionsWithDateTime += `\n\n# Voice Input\nThe user's message was transcribed from speech. Be aware that:\n- There may be transcription errors. Silently correct obvious ones (e.g. homophones, misheard words). If an error is genuinely ambiguous, briefly mention your interpretation (e.g. "I'm assuming you meant X").\n- Spoken messages are often long-winded. The user may ramble, repeat themselves, or correct something they said earlier in the same message. Focus on their final intent, not every word verbatim.`;
|
||||
}
|
||||
if (voiceOutput === 'summary') {
|
||||
loopLogger.log('voice output enabled (summary mode), injecting voice output prompt');
|
||||
instructionsWithDateTime += `\n\n# Voice Output (MANDATORY)\nThe user has voice output enabled. You MUST start your response with <voice></voice> tags that provide a spoken summary and guide to your written response. This is NOT optional — every response MUST begin with <voice> tags.\n\nRules:\n1. ALWAYS start your response with one or more <voice> tags. Never skip them.\n2. Place ALL <voice> tags at the BEGINNING of your response, before any detailed content. Do NOT intersperse <voice> tags throughout the response.\n3. Wrap EACH spoken sentence in its own separate <voice> tag so it can be spoken incrementally. Do NOT wrap everything in a single <voice> block.\n4. Use voice as a TL;DR and navigation aid — do NOT read the entire response aloud.\n\nExample — if the user asks "what happened in my meeting with Sarah yesterday?":\n<voice>Your meeting with Sarah covered three main things: the Q2 roadmap timeline, hiring for the backend role, and the client demo next week.</voice>\n<voice>I've pulled out the key details and action items below — the demo prep notes are at the end.</voice>\n\n## Meeting with Sarah — March 11\n(Then the full detailed written response follows without any more <voice> tags.)\n\nAny text outside <voice> tags is shown visually but not spoken.`;
|
||||
} else if (voiceOutput === 'full') {
|
||||
loopLogger.log('voice output enabled (full mode), injecting voice output prompt');
|
||||
instructionsWithDateTime += `\n\n# Voice Output — Full Read-Aloud (MANDATORY)\nThe user wants your ENTIRE response spoken aloud. You MUST wrap your full response in <voice></voice> tags. This is NOT optional.\n\nRules:\n1. Wrap EACH sentence in its own separate <voice> tag so it can be spoken incrementally.\n2. Write your response in a natural, conversational style suitable for listening — no markdown headings, bullet points, or formatting symbols. Use plain spoken language.\n3. Structure the content as if you are speaking to the user directly. Use transitions like "first", "also", "one more thing" instead of visual formatting.\n4. Every sentence MUST be inside a <voice> tag. Do not leave any content outside <voice> tags.\n\nExample:\n<voice>Your meeting with Sarah covered three main things.</voice>\n<voice>First, you discussed the Q2 roadmap timeline and agreed to push the launch to April.</voice>\n<voice>Second, you talked about hiring for the backend role — Sarah will send over two candidates by Friday.</voice>\n<voice>And lastly, the client demo is next week on Thursday at 2pm, and you're handling the intro slides.</voice>`;
|
||||
}
|
||||
let streamError: string | null = null;
|
||||
for await (const event of streamLlm(
|
||||
model,
|
||||
|
|
|
|||
|
|
@ -33,6 +33,8 @@ Rowboat is an agentic assistant for everyday work - emails, meetings, projects,
|
|||
|
||||
**Document Collaboration:** When users ask you to work on a document, collaborate on writing, create a new document, edit/refine existing notes, or say things like "let's work on [X]", "help me write [X]", "create a doc for [X]", or "let's draft [X]", you MUST load the \`doc-collab\` skill first. This is required for any document creation or editing task. The skill provides structured guidance for creating, editing, and refining documents in the knowledge base.
|
||||
|
||||
**App Control:** When users ask you to open notes, show the bases or graph view, filter or search notes, or manage saved views, load the \`app-navigation\` skill first. It provides structured guidance for navigating the app UI and controlling the knowledge base view.
|
||||
|
||||
**Slack:** When users ask about Slack messages, want to send messages to teammates, check channel conversations, or find someone on Slack, load the \`slack\` skill. You can send messages, view channel history, search conversations, and find users. Always show message drafts to the user before sending.
|
||||
|
||||
## Memory That Compounds
|
||||
|
|
@ -184,6 +186,7 @@ ${runtimeContextPrompt}
|
|||
- \`loadSkill\` - Skill loading
|
||||
- \`slack-checkConnection\`, \`slack-listAvailableTools\`, \`slack-executeAction\` - Slack integration (requires Slack to be connected via Composio). Use \`slack-listAvailableTools\` first to discover available tool slugs, then \`slack-executeAction\` to execute them.
|
||||
- \`web-search\` and \`research-search\` - Web and research search tools (available when configured). **You MUST load the \`web-search\` skill before using either of these tools.** It tells you which tool to pick and how many searches to do.
|
||||
- \`app-navigation\` - Control the app UI: open notes, switch views, filter/search the knowledge base, manage saved views. **Load the \`app-navigation\` skill before using this tool.**
|
||||
|
||||
**Prefer these tools whenever possible** — they work instantly with zero friction. For file operations inside \`~/.rowboat/\`, always use these instead of \`executeCommand\`.
|
||||
|
||||
|
|
|
|||
|
|
@ -44,6 +44,7 @@ Change filters, columns, sort order, or search in the bases (table) view.
|
|||
- If unsure what categories/values are available, call ` + "`get-base-state`" + ` first.
|
||||
- For "show me X", prefer ` + "`filters.set`" + ` to start fresh rather than ` + "`filters.add`" + `.
|
||||
- Categories come from frontmatter keys (e.g., relationship, status, topic, type).
|
||||
- **CRITICAL: Do NOT pass ` + "`columns`" + ` unless the user explicitly asks to show/hide specific columns.** Omit the ` + "`columns`" + ` parameter entirely when only filtering, sorting, or searching. Passing ` + "`columns`" + ` will override the user's current column layout and can make the view appear empty.
|
||||
|
||||
### get-base-state
|
||||
Retrieve information about what's in the knowledge base — available filter categories, values, and note count.
|
||||
|
|
@ -75,6 +76,7 @@ Save the current view configuration as a named base.
|
|||
- The ` + "`update-base-view`" + ` action will automatically navigate to the bases view if the user isn't already there.
|
||||
- ` + "`open-note`" + ` validates that the file exists before navigating.
|
||||
- Filter categories and values come from frontmatter in knowledge files.
|
||||
- **Never send ` + "`columns`" + ` or ` + "`sort`" + ` with ` + "`update-base-view`" + ` unless the user specifically asks to change them.** Only pass the parameters you intend to change — omitted parameters are left untouched.
|
||||
`;
|
||||
|
||||
export default skill;
|
||||
|
|
|
|||
|
|
@ -884,6 +884,145 @@ export const BuiltinTools: z.infer<typeof BuiltinToolsSchema> = {
|
|||
},
|
||||
},
|
||||
|
||||
// ============================================================================
|
||||
// App Navigation
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Built-in tool that lets the agent drive the app UI.
 *
 * Most actions do not touch the UI themselves: they validate their input and
 * return a `{ success, action, ... }` descriptor. `get-base-state` and
 * `create-base` additionally read from / write to the workspace.
 *
 * Every field other than `action` is optional in the schema, so each action
 * validates the fields it needs and reports a `{ success: false, error }`
 * result instead of throwing when one is missing.
 */
'app-navigation': {
    description: 'Control the app UI - navigate to notes, switch views, filter/search the knowledge base, and manage saved views.',
    inputSchema: z.object({
        action: z.enum(["open-note", "open-view", "update-base-view", "get-base-state", "create-base"]).describe("The navigation action to perform"),
        // open-note
        path: z.string().optional().describe("Knowledge file path for open-note, e.g. knowledge/People/John.md"),
        // open-view
        view: z.enum(["bases", "graph"]).optional().describe("Which view to open (for open-view action)"),
        // update-base-view
        filters: z.object({
            set: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Replace all filters with these"),
            add: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Add these filters"),
            remove: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Remove these filters"),
            clear: z.boolean().optional().describe("Clear all filters"),
        }).optional().describe("Filter modifications (for update-base-view)"),
        columns: z.object({
            set: z.array(z.string()).optional().describe("Replace visible columns with these"),
            add: z.array(z.string()).optional().describe("Add these columns"),
            remove: z.array(z.string()).optional().describe("Remove these columns"),
        }).optional().describe("Column modifications (for update-base-view)"),
        sort: z.object({
            field: z.string(),
            dir: z.enum(["asc", "desc"]),
        }).optional().describe("Sort configuration (for update-base-view)"),
        search: z.string().optional().describe("Search query to filter notes (for update-base-view)"),
        // get-base-state
        base_name: z.string().optional().describe("Name of a saved base to inspect (for get-base-state). Omit for the current/default view."),
        // create-base
        name: z.string().optional().describe("Name for the saved base view (for create-base)"),
    }),
    execute: async (input: {
        action: string;
        [key: string]: unknown;
    }) => {
        switch (input.action) {
            case 'open-note': {
                // `path` is optional in the schema; reject a missing/blank value up front
                // rather than handing `undefined` to the workspace.
                const filePath = input.path;
                if (typeof filePath !== 'string' || !filePath.trim()) {
                    return { success: false, error: 'open-note requires a non-empty "path"' };
                }
                try {
                    const result = await workspace.exists(filePath);
                    if (!result.exists) {
                        return { success: false, error: `File not found: ${filePath}` };
                    }
                    return { success: true, action: 'open-note', path: filePath };
                } catch {
                    return { success: false, error: `Could not access file: ${filePath}` };
                }
            }

            case 'open-view': {
                // Without this check a missing `view` produced a "successful" result
                // carrying `view: undefined`.
                const view = input.view;
                if (view !== 'bases' && view !== 'graph') {
                    return { success: false, error: 'open-view requires "view" to be "bases" or "graph"' };
                }
                return { success: true, action: 'open-view', view };
            }

            case 'update-base-view': {
                // Forward only the parameters that were actually supplied.
                const updates: Record<string, unknown> = {};
                if (input.filters) updates.filters = input.filters;
                if (input.columns) updates.columns = input.columns;
                if (input.sort) updates.sort = input.sort;
                // An empty string is a meaningful value here (clears the search).
                if (input.search !== undefined) updates.search = input.search;
                return { success: true, action: 'update-base-view', updates };
            }

            case 'get-base-state': {
                // Scan knowledge/ files and extract frontmatter properties.
                // NOTE(review): `base_name` is accepted by the schema but is not consulted
                // here; the result always describes the whole knowledge folder.
                try {
                    const { parseFrontmatter } = await import("@x/shared/dist/frontmatter.js");
                    const entries = await workspace.readdir("knowledge", { recursive: true, allowedExtensions: [".md"] });
                    const files = entries.filter(e => e.kind === 'file');
                    const properties = new Map<string, Set<string>>();
                    let noteCount = 0;

                    for (const file of files) {
                        try {
                            const { data } = await workspace.readFile(file.path);
                            const { fields } = parseFrontmatter(data);
                            noteCount++;
                            for (const [key, value] of Object.entries(fields)) {
                                if (!value) continue;
                                let set = properties.get(key);
                                if (!set) { set = new Set(); properties.set(key, set); }
                                const values = Array.isArray(value) ? value : [value];
                                for (const v of values) {
                                    const trimmed = v.trim();
                                    if (trimmed) set.add(trimmed);
                                }
                            }
                        } catch {
                            // skip unreadable files
                        }
                    }

                    const availableProperties: Record<string, string[]> = {};
                    for (const [key, values] of properties) {
                        availableProperties[key] = [...values].sort();
                    }

                    return {
                        success: true,
                        action: 'get-base-state',
                        noteCount,
                        availableProperties,
                    };
                } catch (error) {
                    return {
                        success: false,
                        error: error instanceof Error ? error.message : 'Failed to read knowledge base',
                    };
                }
            }

            case 'create-base': {
                // `name` is optional in the schema; calling `.replace` on `undefined`
                // would throw outside of any try/catch.
                const name = input.name;
                if (typeof name !== 'string') {
                    return { success: false, error: 'create-base requires a "name"' };
                }
                // Keep only characters that are safe in a file name.
                const safeName = name.replace(/[^a-zA-Z0-9_\- ]/g, '').trim();
                if (!safeName) {
                    return { success: false, error: 'Invalid base name' };
                }
                const basePath = `bases/${safeName}.base`;
                try {
                    const config = { name: safeName, filters: [], columns: [] };
                    await workspace.writeFile(basePath, JSON.stringify(config, null, 2), { mkdirp: true });
                    return { success: true, action: 'create-base', name: safeName, path: basePath };
                } catch (error) {
                    return {
                        success: false,
                        error: error instanceof Error ? error.message : 'Failed to create base',
                    };
                }
            }

            default:
                return { success: false, error: `Unknown action: ${input.action}` };
        }
    },
},
|
||||
|
||||
// ============================================================================
|
||||
// Web Search (Brave Search API)
|
||||
// ============================================================================
|
||||
|
|
|
|||
|
|
@ -3,14 +3,17 @@ import { UserMessageContent } from "@x/shared/dist/message.js";
|
|||
import z from "zod";
|
||||
|
||||
export type UserMessageContentType = z.infer<typeof UserMessageContent>;
|
||||
export type VoiceOutputMode = 'summary' | 'full';
|
||||
|
||||
/** A user message waiting in the queue, together with its voice-mode flags. */
type EnqueuedMessage = {
    /** Identifier assigned to the message when it was enqueued. */
    messageId: string;
    /** The user message content itself. */
    message: UserMessageContentType;
    /** Set when the message was transcribed from speech. */
    voiceInput?: boolean;
    /** Requested spoken-response mode; absent when voice output is off. */
    voiceOutput?: VoiceOutputMode;
};
|
||||
|
||||
/** Per-run queue of pending user messages. */
export interface IMessageQueue {
    /**
     * Adds a message to the queue of the given run.
     *
     * The voice parameters are optional, so existing two-argument callers
     * remain valid.
     *
     * @param runId - Run whose queue receives the message.
     * @param message - The user message content.
     * @param voiceInput - Whether the message was transcribed from speech.
     * @param voiceOutput - Requested spoken-response mode, if any.
     * @returns The id assigned to the enqueued message.
     */
    enqueue(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string>;

    /**
     * Removes and returns the next message of the given run.
     *
     * @param runId - Run whose queue is read.
     * @returns The next message, or `null` when none is queued.
     */
    dequeue(runId: string): Promise<EnqueuedMessage | null>;
}
|
||||
|
||||
|
|
@ -26,7 +29,7 @@ export class InMemoryMessageQueue implements IMessageQueue {
|
|||
this.idGenerator = idGenerator;
|
||||
}
|
||||
|
||||
async enqueue(runId: string, message: UserMessageContentType): Promise<string> {
|
||||
async enqueue(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string> {
|
||||
if (!this.store[runId]) {
|
||||
this.store[runId] = [];
|
||||
}
|
||||
|
|
@ -34,6 +37,8 @@ export class InMemoryMessageQueue implements IMessageQueue {
|
|||
this.store[runId].push({
|
||||
messageId: id,
|
||||
message,
|
||||
voiceInput,
|
||||
voiceOutput,
|
||||
});
|
||||
return id;
|
||||
}
|
||||
|
|
@ -44,4 +49,4 @@ export class InMemoryMessageQueue implements IMessageQueue {
|
|||
}
|
||||
return this.store[runId].shift() ?? null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,3 +9,6 @@ export { initConfigs } from './config/initConfigs.js';
|
|||
|
||||
// Knowledge version history
|
||||
export * as versionHistory from './knowledge/version_history.js';
|
||||
|
||||
// Voice mode (config + TTS)
|
||||
export * as voice from './voice/voice.js';
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ export function createProvider(config: z.infer<typeof Provider>): ProviderV2 {
|
|||
apiKey,
|
||||
baseURL,
|
||||
headers,
|
||||
});
|
||||
}) as unknown as ProviderV2;
|
||||
default:
|
||||
throw new Error(`Unsupported provider flavor: ${config.flavor}`);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import z from "zod";
|
||||
import container from "../di/container.js";
|
||||
import { IMessageQueue, UserMessageContentType } from "../application/lib/message-queue.js";
|
||||
import { IMessageQueue, UserMessageContentType, VoiceOutputMode } from "../application/lib/message-queue.js";
|
||||
import { AskHumanResponseEvent, ToolPermissionRequestEvent, ToolPermissionResponseEvent, CreateRunOptions, Run, ListRunsResponse, ToolPermissionAuthorizePayload, AskHumanResponsePayload } from "@x/shared/dist/runs.js";
|
||||
import { IRunsRepo } from "./repo.js";
|
||||
import { IAgentRuntime } from "../agents/runtime.js";
|
||||
|
|
@ -19,9 +19,9 @@ export async function createRun(opts: z.infer<typeof CreateRunOptions>): Promise
|
|||
return run;
|
||||
}
|
||||
|
||||
export async function createMessage(runId: string, message: UserMessageContentType): Promise<string> {
|
||||
export async function createMessage(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string> {
|
||||
const queue = container.resolve<IMessageQueue>('messageQueue');
|
||||
const id = await queue.enqueue(runId, message);
|
||||
const id = await queue.enqueue(runId, message, voiceInput, voiceOutput);
|
||||
const runtime = container.resolve<IAgentRuntime>('agentRuntime');
|
||||
runtime.trigger(runId);
|
||||
return id;
|
||||
|
|
|
|||
70
apps/x/packages/core/src/voice/voice.ts
Normal file
70
apps/x/packages/core/src/voice/voice.ts
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
import * as fs from 'fs/promises';
|
||||
import * as path from 'path';
|
||||
|
||||
// Home directory taken from HOME, falling back to USERPROFILE. When neither is
// set this is '', and path.join() then produces a relative ".rowboat/config/..."
// path instead of one under the user's home directory.
const homedir = process.env.HOME || process.env.USERPROFILE || '';
|
||||
|
||||
/** Voice-provider credentials; a provider is `null` when it is not configured. */
export interface VoiceConfig {
    /** Deepgram credentials, or `null` when not configured. */
    deepgram: { apiKey: string } | null;
    /** ElevenLabs credentials plus an optional voice override, or `null` when not configured. */
    elevenlabs: { apiKey: string; voiceId?: string } | null;
}
|
||||
|
||||
/**
 * Reads and parses `<homedir>/.rowboat/config/<filename>`.
 *
 * Deliberately best-effort: a missing file, unreadable file, or malformed JSON
 * all yield `null` so callers can treat the provider as "not configured".
 * JSON that parses to something other than a plain object (an array, string,
 * number, ...) is rejected as well, so the declared return type holds at runtime.
 *
 * @param filename - File name inside the config directory, e.g. `deepgram.json`.
 * @returns The parsed object, or `null` when it cannot be read as one.
 */
async function readJsonConfig(filename: string): Promise<Record<string, unknown> | null> {
    try {
        const configPath = path.join(homedir, '.rowboat', 'config', filename);
        const raw = await fs.readFile(configPath, 'utf8');
        const parsed: unknown = JSON.parse(raw);
        if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            return null;
        }
        return parsed as Record<string, unknown>;
    } catch {
        return null;
    }
}
|
||||
|
||||
/** Returns `value` when it is a non-empty string, otherwise `undefined`. */
function nonEmptyString(value: unknown): string | undefined {
    return typeof value === 'string' && value.length > 0 ? value : undefined;
}

/**
 * Loads the voice-provider settings from `deepgram.json` and `elevenlabs.json`.
 *
 * A provider is reported as `null` unless its config contains a non-empty
 * string `apiKey`. Values of any other type are ignored rather than being
 * passed along under a `string` type they do not actually have.
 *
 * @returns The configuration for both providers.
 */
export async function getVoiceConfig(): Promise<VoiceConfig> {
    // The two files are independent, so read them concurrently.
    const [dgConfig, elConfig] = await Promise.all([
        readJsonConfig('deepgram.json'),
        readJsonConfig('elevenlabs.json'),
    ]);

    const deepgramKey = nonEmptyString(dgConfig?.apiKey);
    const elevenlabsKey = nonEmptyString(elConfig?.apiKey);

    return {
        deepgram: deepgramKey ? { apiKey: deepgramKey } : null,
        elevenlabs: elevenlabsKey
            ? { apiKey: elevenlabsKey, voiceId: nonEmptyString(elConfig?.voiceId) }
            : null,
    };
}
|
||||
|
||||
/** Base URL of the ElevenLabs text-to-speech endpoint; the voice id is appended as a path segment. */
const ELEVENLABS_TTS_ENDPOINT = 'https://api.elevenlabs.io/v1/text-to-speech';
/** Voice used when the config does not specify a `voiceId`. */
const ELEVENLABS_DEFAULT_VOICE_ID = 'UgBBYS2sOqTuMpoF3BR0';
/** ElevenLabs model requested for synthesis. */
const ELEVENLABS_MODEL_ID = 'eleven_multilingual_v2';

/**
 * Converts text to speech through the ElevenLabs API.
 *
 * @param text - The text to speak; must contain at least one non-whitespace character.
 * @returns The audio as a base64 string together with its MIME type.
 * @throws Error when ElevenLabs is not configured, when `text` is blank, or
 *         when the API responds with a non-2xx status.
 */
export async function synthesizeSpeech(text: string): Promise<{ audioBase64: string; mimeType: string }> {
    const config = await getVoiceConfig();
    if (!config.elevenlabs) {
        throw new Error('ElevenLabs not configured. Create ~/.rowboat/config/elevenlabs.json with { "apiKey": "<your-key>" }');
    }

    // Fail fast instead of spending a network round-trip on a request with nothing to speak.
    if (!text.trim()) {
        throw new Error('Cannot synthesize speech from empty text');
    }

    // `||` rather than `??` on purpose: an empty-string voiceId must also fall back to the default.
    const voiceId = config.elevenlabs.voiceId || ELEVENLABS_DEFAULT_VOICE_ID;
    // The voice id comes from a user-edited file; encode it so it cannot alter the URL structure.
    const url = `${ELEVENLABS_TTS_ENDPOINT}/${encodeURIComponent(voiceId)}`;

    console.log('[voice] synthesizing speech, text length:', text.length, 'voiceId:', voiceId);

    const response = await fetch(url, {
        method: 'POST',
        headers: {
            'xi-api-key': config.elevenlabs.apiKey,
            'Content-Type': 'application/json',
        },
        body: JSON.stringify({
            text,
            model_id: ELEVENLABS_MODEL_ID,
            voice_settings: {
                stability: 0.5,
                similarity_boost: 0.75,
            },
        }),
    });

    if (!response.ok) {
        // Reading the error body is itself best-effort.
        const errText = await response.text().catch(() => 'Unknown error');
        console.error('[voice] ElevenLabs API error:', response.status, errText);
        throw new Error(`ElevenLabs API error ${response.status}: ${errText}`);
    }

    const arrayBuffer = await response.arrayBuffer();
    const audioBase64 = Buffer.from(arrayBuffer).toString('base64');
    console.log('[voice] synthesized audio, base64 length:', audioBase64.length);
    // NOTE(review): assumes the API's default output is MP3, since no output format is requested — confirm.
    return { audioBase64, mimeType: 'audio/mpeg' };
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue