From 950f86417496070f7ce9a0e4d15074747407c206 Mon Sep 17 00:00:00 2001 From: Arjun <6592213+arkml@users.noreply.github.com> Date: Thu, 26 Mar 2026 22:07:52 +0530 Subject: [PATCH] fix voice output --- apps/x/packages/core/src/agents/runtime.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/x/packages/core/src/agents/runtime.ts b/apps/x/packages/core/src/agents/runtime.ts index 98bae635..b7ebdfdf 100644 --- a/apps/x/packages/core/src/agents/runtime.ts +++ b/apps/x/packages/core/src/agents/runtime.ts @@ -859,7 +859,7 @@ export async function* streamAgent({ const isInlineTaskAgent = state.agentName === "inline_task_agent"; const defaultModel = signedIn ? "gpt-5.4" : modelConfig.model; const defaultKgModel = signedIn ? "gpt-5.4-mini" : defaultModel; - const defaultInlineTaskModel = signedIn ? "gpt-5.4-mini" : defaultModel; + const defaultInlineTaskModel = signedIn ? "gpt-5.4" : defaultModel; const modelId = isInlineTaskAgent ? defaultInlineTaskModel : (isKgAgent && modelConfig.knowledgeGraphModel) @@ -869,6 +869,9 @@ export async function* streamAgent({ logger.log(`using model: ${modelId}`); let loopCounter = 0; + let voiceInput = false; + let voiceOutput: 'summary' | 'full' | null = null; + let searchEnabled = false; while (true) { // Check abort at the top of each iteration signal.throwIfAborted(); @@ -982,9 +985,6 @@ export async function* streamAgent({ } // get any queued user messages - let voiceInput = false; - let voiceOutput: 'summary' | 'full' | null = null; - let searchEnabled = false; while (true) { const msg = await messageQueue.dequeue(runId); if (!msg) { @@ -1052,10 +1052,10 @@ export async function* streamAgent({ } if (voiceOutput === 'summary') { loopLogger.log('voice output enabled (summary mode), injecting voice output prompt'); - instructionsWithDateTime += `\n\n# Voice Output (MANDATORY)\nThe user has voice output enabled. You MUST start your response with tags that provide a spoken summary and guide to your written response. This is NOT optional — every response MUST begin with tags.\n\nRules:\n1. ALWAYS start your response with one or more tags. Never skip them.\n2. Place ALL tags at the BEGINNING of your response, before any detailed content. Do NOT intersperse tags throughout the response.\n3. Wrap EACH spoken sentence in its own separate tag so it can be spoken incrementally. Do NOT wrap everything in a single block.\n4. Use voice as a TL;DR and navigation aid — do NOT read the entire response aloud.\n\nExample — if the user asks "what happened in my meeting with Sarah yesterday?":\nYour meeting with Sarah covered three main things: the Q2 roadmap timeline, hiring for the backend role, and the client demo next week.\nI've pulled out the key details and action items below — the demo prep notes are at the end.\n\n## Meeting with Sarah — March 11\n(Then the full detailed written response follows without any more tags.)\n\nAny text outside tags is shown visually but not spoken.`; + instructionsWithDateTime += `\n\n# Voice Output (MANDATORY — READ THIS FIRST)\nThe user has voice output enabled. THIS IS YOUR #1 PRIORITY: you MUST start your response with tags. If your response does not begin with tags, the user will hear nothing — which is a broken experience. NEVER skip this.\n\nRules:\n1. YOUR VERY FIRST OUTPUT MUST BE A TAG. No exceptions. Do not start with markdown, headings, or any other text. The literal first characters of your response must be "".\n2. Place ALL tags at the BEGINNING of your response, before any detailed content. Do NOT intersperse tags throughout the response.\n3. Wrap EACH spoken sentence in its own separate tag so it can be spoken incrementally. Do NOT wrap everything in a single block.\n4. Use voice as a TL;DR and navigation aid — do NOT read the entire response aloud.\n5. After all tags, you may include detailed written content (markdown, tables, code, etc.) that will be shown visually but not spoken.\n\n## Examples\n\nExample 1 — User asks: "what happened in my meeting with Alex yesterday?"\n\nYour meeting with Alex covered three main things: the Q2 roadmap timeline, hiring for the backend role, and the client demo next week.\nI've pulled out the key details and action items below — the demo prep notes are at the end.\n\n## Meeting with Alex — March 11\n### Roadmap\n- Agreed to push Q2 launch to April 15...\n(detailed written content continues)\n\nExample 2 — User asks: "summarize my emails"\n\nYou have five new emails since this morning.\nTwo are from your team — Jordan sent the RFC you requested and Taylor flagged a contract issue.\nThere's also a warm intro from a VC partner connecting you with someone at a prospective customer.\nI've drafted responses for three of them. The details and drafts are below.\n\n(email blocks, tables, and detailed content follow)\n\nExample 3 — User asks: "what's on my calendar today?"\n\nYou've got a pretty packed day — seven meetings starting with standup at 9.\nThe big ones are your investor call at 11, lunch with a partner from your lead VC at 12:30, and a customer call at 4.\nYour only free block for deep work is 2:30 to 4.\n\n(calendar block with full event details follows)\n\nExample 4 — User asks: "draft an email to Sam with our metrics"\n\nDone — I've drafted the email to Sam with your latest WAU and churn numbers.\nTake a look at the draft below and send it when you're ready.\n\n(email block with draft follows)\n\nREMEMBER: If you do not start with tags, the user hears silence. Always speak first, then write.`; } else if (voiceOutput === 'full') { loopLogger.log('voice output enabled (full mode), injecting voice output prompt'); - instructionsWithDateTime += `\n\n# Voice Output — Full Read-Aloud (MANDATORY)\nThe user wants your ENTIRE response spoken aloud. You MUST wrap your full response in tags. This is NOT optional.\n\nRules:\n1. Wrap EACH sentence in its own separate tag so it can be spoken incrementally.\n2. Write your response in a natural, conversational style suitable for listening — no markdown headings, bullet points, or formatting symbols. Use plain spoken language.\n3. Structure the content as if you are speaking to the user directly. Use transitions like "first", "also", "one more thing" instead of visual formatting.\n4. Every sentence MUST be inside a tag. Do not leave any content outside tags.\n\nExample:\nYour meeting with Sarah covered three main things.\nFirst, you discussed the Q2 roadmap timeline and agreed to push the launch to April.\nSecond, you talked about hiring for the backend role — Sarah will send over two candidates by Friday.\nAnd lastly, the client demo is next week on Thursday at 2pm, and you're handling the intro slides.`; + instructionsWithDateTime += `\n\n# Voice Output — Full Read-Aloud (MANDATORY — READ THIS FIRST)\nThe user wants your ENTIRE response spoken aloud. THIS IS YOUR #1 PRIORITY: every single sentence must be wrapped in tags. If you write anything outside tags, the user will not hear it — which is a broken experience. NEVER skip this.\n\nRules:\n1. YOUR VERY FIRST OUTPUT MUST BE A TAG. No exceptions. The literal first characters of your response must be "".\n2. Wrap EACH sentence in its own separate tag so it can be spoken incrementally.\n3. Write your response in a natural, conversational style suitable for listening — no markdown headings, bullet points, or formatting symbols. Use plain spoken language.\n4. Structure the content as if you are speaking to the user directly. Use transitions like "first", "also", "one more thing" instead of visual formatting.\n5. EVERY sentence MUST be inside a tag. Do not leave ANY content outside tags. If it's not in a tag, the user cannot hear it.\n\n## Examples\n\nExample 1 — User asks: "what happened in my meeting with Alex yesterday?"\n\nYour meeting with Alex covered three main things.\nFirst, you discussed the Q2 roadmap timeline and agreed to push the launch to April.\nSecond, you talked about hiring for the backend role — Alex will send over two candidates by Friday.\nAnd lastly, the client demo is next week on Thursday at 2pm, and you're handling the intro slides.\n\nExample 2 — User asks: "summarize my emails"\n\nYou've got five new emails since this morning.\nTwo are from your team — Jordan sent the RFC you asked for, and Taylor flagged a contract issue that needs your sign-off.\nThere's a warm intro from a VC partner connecting you with an engineering lead at a potential customer.\nAnd someone from a prospective client wants to confirm your API tier before your call this afternoon.\nI've drafted replies for three of them — the metrics update, the intro, and the API question.\nThe only one I left for you is Taylor's contract redline, since that needs your judgment on the liability cap.\n\nExample 3 — User asks: "what's on my calendar today?"\n\nYou've got a packed day — seven meetings starting with standup at 9.\nThe highlights are your investor call at 11, lunch with a VC partner at 12:30, and a customer call at 4.\nYour only open block for deep work is 2:30 to 4, so plan accordingly.\nOh, and your 1-on-1 with your co-founder is at 5:30 — that's a walking meeting.\n\nExample 4 — User asks: "how are our metrics looking?"\n\nMetrics are looking strong this week.\nYou hit 2,573 weekly active users, which is up 12% week over week.\nThat means you've crossed the 2,500 milestone — worth calling out in your next investor update.\nChurn is down to 4.1%, improving month over month.\nThe trailing 8-week compound growth rate is about 10%.\n\nREMEMBER: Start with immediately. No preamble, no markdown before it. Speak first.`; } if (searchEnabled) { loopLogger.log('search enabled, injecting search prompt');