From ae362f50f44404b3d43904de165928b416607581 Mon Sep 17 00:00:00 2001 From: Gagancreates Date: Fri, 12 Jun 2026 21:23:32 +0530 Subject: [PATCH] fix(code-mode): surface agent startup failures instead of hanging forever - 60s deadline on adapter initialize / session create+load: an engine that launches but never completes the SDK handshake (e.g. an outdated local CLI) now fails with the adapter's stderr attached instead of leaving the turn (pending...) indefinitely; prompts themselves stay un-timed - dispose the adapter client when startup fails so the spawned process does not leak - set DEBUG_CLAUDE_AGENT_SDK=1 so the SDK logs the exact spawn command and claude's stderr to ~/.claude/debug/sdk-*.txt and startup errors point at that file (engine stderr is otherwise discarded entirely) - graft the user's login-shell PATH onto the adapter env on macOS/Linux: GUI launches inherit launchd's stripped PATH, which breaks node-shebang claude launchers (nvm/npm installs) and the engines' own subprocess spawns --- .../packages/core/src/code-mode/acp/agents.ts | 41 ++++++++++++++----- .../packages/core/src/code-mode/acp/client.ts | 39 ++++++++++++++++-- .../core/src/code-mode/acp/manager.ts | 18 +++++--- .../core/src/code-mode/acp/shell-env.ts | 41 +++++++++++++++++++ 4 files changed, 119 insertions(+), 20 deletions(-) create mode 100644 apps/x/packages/core/src/code-mode/acp/shell-env.ts diff --git a/apps/x/packages/core/src/code-mode/acp/agents.ts b/apps/x/packages/core/src/code-mode/acp/agents.ts index 26a593ee..5b8cf130 100644 --- a/apps/x/packages/core/src/code-mode/acp/agents.ts +++ b/apps/x/packages/core/src/code-mode/acp/agents.ts @@ -4,6 +4,7 @@ import { fileURLToPath } from 'url'; import type { CodingAgent } from './types.js'; import { resolveClaudeExecutable } from './claude-exec.js'; import { resolveCodexExecutable } from './codex-exec.js'; +import { loginShellPath } from './shell-env.js'; const require = createRequire(import.meta.url); @@ -64,23 +65,43 @@ export function getAgentLaunchSpec(agent: CodingAgent): AgentLaunchSpec { const entry = resolveAdapterEntry(ADAPTER_PACKAGE[agent]); const env: NodeJS.ProcessEnv = { ...process.env }; + // macOS/Linux GUI launches inherit launchd's stripped PATH. Resolving the engine + // binary below isn't enough on its own: an npm-installed claude is a + // `#!/usr/bin/env node` script (node must be on the ADAPTER's PATH when it spawns + // it), and the engines spawn git/rg/bash themselves. Graft the user's real + // login-shell PATH onto the adapter env so all of those resolve. + const shellPath = loginShellPath(); + if (shellPath && shellPath !== env.PATH) { + const dirs = [...shellPath.split(path.delimiter), ...(env.PATH ?? '').split(path.delimiter)]; + env.PATH = [...new Set(dirs.filter(Boolean))].join(path.delimiter); + } + // Point each adapter at the user's LOCAL agent executable. We intentionally do not // bundle the agents' native engines (~230 MB each) into packaged builds — the // adapters fall back to a bundled engine only when these are unset, and we strip // those binaries during packaging (see apps/main/forge.config.cjs). So a local // install is required; throw a clear error instead of letting the adapter fail // cryptically on the absent bundled engine. - if (agent === 'claude' && !env.CLAUDE_CODE_EXECUTABLE) { - // On Windows resolving the real .exe is also mandatory: Node can't spawn the - // .cmd shim (EINVAL). On macOS/Linux it doubles as a PATH safety net for GUI - // launches that don't inherit the login shell's PATH. - const exe = resolveClaudeExecutable(); - if (!exe) { - throw new Error( - 'Claude Code CLI not found. Install it (`npm i -g @anthropic-ai/claude-code`) to use Claude in code mode.', - ); + if (agent === 'claude') { + // The claude-agent-sdk discards the engine's stderr unless this is set. With + // it, the SDK logs the exact spawn command + claude's stderr to a debug file + // (~/.claude/debug/sdk-*.txt) and prints "SDK debug logs: " on the + // adapter's stderr — which we capture and attach to startup errors, so a + // failed/hung launch points at the file with the real cause. + env.DEBUG_CLAUDE_AGENT_SDK = '1'; + + if (!env.CLAUDE_CODE_EXECUTABLE) { + // On Windows resolving the real .exe is also mandatory: Node can't spawn + // the .cmd shim (EINVAL). On macOS/Linux it doubles as a PATH safety net + // for GUI launches that don't inherit the login shell's PATH. + const exe = resolveClaudeExecutable(); + if (!exe) { + throw new Error( + 'Claude Code CLI not found. Install it (`npm i -g @anthropic-ai/claude-code`) to use Claude in code mode.', + ); + } + env.CLAUDE_CODE_EXECUTABLE = exe; } - env.CLAUDE_CODE_EXECUTABLE = exe; } if (agent === 'codex' && !env.CODEX_PATH) { diff --git a/apps/x/packages/core/src/code-mode/acp/client.ts b/apps/x/packages/core/src/code-mode/acp/client.ts index 5c2bd1ba..a6848544 100644 --- a/apps/x/packages/core/src/code-mode/acp/client.ts +++ b/apps/x/packages/core/src/code-mode/acp/client.ts @@ -27,6 +27,14 @@ export interface AcpClientOptions { onEvent: (event: CodeRunEvent) => void; } +// Deadline for the startup phases (initialize / session create+load). A healthy cold +// start — adapter boot, engine spawn, SDK handshake, MCP connects — takes seconds; +// only a wedged engine takes this long (e.g. an outdated local CLI that launches but +// never answers the handshake). Without a deadline that failure mode is an infinite +// "(pending...)" with zero feedback. Prompts are intentionally NOT time-limited: +// turns legitimately run for many minutes and may wait on user permission asks. +const STARTUP_TIMEOUT_MS = 60_000; + // Map a raw ACP session/update notification onto our small CodeRunEvent union. function toEvent(update: SessionUpdate): CodeRunEvent { switch (update.sessionUpdate) { @@ -130,10 +138,10 @@ export class AcpClient { this.connection = new ClientSideConnection(() => client, stream); try { - const init = await this.connection.initialize({ + const init = await this.withStartupTimeout(this.connection.initialize({ protocolVersion: PROTOCOL_VERSION, clientCapabilities: { fs: { readTextFile: true, writeTextFile: true } }, - }); + })); this.loadSession_ = init.agentCapabilities?.loadSession === true; } catch (e) { throw this.enrich(e, 'initialize'); @@ -142,7 +150,7 @@ export class AcpClient { async newSession(): Promise { try { - const res = await this.conn().newSession({ cwd: this.cwd, mcpServers: [] }); + const res = await this.withStartupTimeout(this.conn().newSession({ cwd: this.cwd, mcpServers: [] })); return res.sessionId; } catch (e) { throw this.enrich(e, 'newSession'); @@ -151,12 +159,35 @@ export class AcpClient { async loadSession(sessionId: string): Promise { try { - await this.conn().loadSession({ sessionId, cwd: this.cwd, mcpServers: [] }); + await this.withStartupTimeout(this.conn().loadSession({ sessionId, cwd: this.cwd, mcpServers: [] })); } catch (e) { throw this.enrich(e, 'loadSession'); } } + // Race a startup-phase request against the deadline. The timeout error flows + // through enrich(), which appends the adapter's exit info / stderr tail — so a + // hung startup reports WHY (including the "SDK debug logs: " pointer) + // instead of pending forever. Callers dispose the client on failure, which + // kills the spawned adapter. + private async withStartupTimeout(work: Promise): Promise { + let timer: ReturnType | undefined; + const timeout = new Promise((_, reject) => { + timer = setTimeout(() => { + reject(new Error( + `timed out after ${STARTUP_TIMEOUT_MS / 1000}s — the local ${this.agent} CLI may be ` + + `outdated or failing to launch (check \`${this.agent} --version\`)`, + )); + }, STARTUP_TIMEOUT_MS); + timer.unref?.(); + }); + try { + return await Promise.race([work, timeout]); + } finally { + if (timer) clearTimeout(timer); + } + } + async prompt(sessionId: string, text: string): Promise { try { return await this.conn().prompt({ sessionId, prompt: [{ type: 'text', text }] }); diff --git a/apps/x/packages/core/src/code-mode/acp/manager.ts b/apps/x/packages/core/src/code-mode/acp/manager.ts index 04ccdebd..c81bdbbe 100644 --- a/apps/x/packages/core/src/code-mode/acp/manager.ts +++ b/apps/x/packages/core/src/code-mode/acp/manager.ts @@ -158,12 +158,18 @@ export class CodeModeManager { if (existing) this.dispose(runId); // agent/cwd changed — start over const client = new AcpClient({ agent, cwd, broker, onEvent }); - await client.start(); - - const sessionId = await this.openSession(runId, agent, cwd, client); - const run: ActiveRun = { client, sessionId, agent, cwd, inflight: 0 }; - this.runs.set(runId, run); - return run; + try { + await client.start(); + const sessionId = await this.openSession(runId, agent, cwd, client); + const run: ActiveRun = { client, sessionId, agent, cwd, inflight: 0 }; + this.runs.set(runId, run); + return run; + } catch (e) { + // Startup failed (e.g. handshake timeout). The client isn't in `runs` + // yet, so dispose here or the spawned adapter process leaks. + client.dispose(); + throw e; + } } // Resume the persisted session for this chat when possible; else start a new one diff --git a/apps/x/packages/core/src/code-mode/acp/shell-env.ts b/apps/x/packages/core/src/code-mode/acp/shell-env.ts new file mode 100644 index 00000000..acff79b5 --- /dev/null +++ b/apps/x/packages/core/src/code-mode/acp/shell-env.ts @@ -0,0 +1,41 @@ +import { execSync } from 'child_process'; +import * as path from 'path'; + +let cached: string | null = null; + +// The user's login-shell PATH (macOS/Linux; undefined on Windows or probe failure). +// GUI-launched Electron apps inherit launchd's stripped PATH (/usr/bin:/bin:...), so +// anything resolved or spawned off process.env.PATH misses nvm/homebrew/npm-global +// installs. claude-exec/codex-exec already login-shell-probe for their one binary; +// this recovers the WHOLE PATH for transitive spawns the probes can't cover — an +// npm-installed claude is a `#!/usr/bin/env node` script (node must be on the +// spawner's PATH), and the engines spawn git/rg/bash themselves. +export function loginShellPath(): string | undefined { + if (process.platform === 'win32') return undefined; + if (cached !== null) return cached || undefined; + + // Prefer the user's own shell when it's POSIX-flavored, so its login profile + // (~/.zprofile for zsh — macOS default — ~/.profile for bash/sh) is the one that + // builds the PATH. fish et al. are skipped: their `echo $PATH` is space-joined. + const userShell = process.env.SHELL; + const shellOk = userShell && ['sh', 'bash', 'zsh', 'dash', 'ksh'].includes(path.basename(userShell)); + const shells = [...new Set([...(shellOk ? [userShell] : []), '/bin/sh'])]; + + for (const shell of shells) { + try { + const out = execSync(`${shell} -lc 'echo $PATH'`, { timeout: 5000, encoding: 'utf-8' }); + // Profile scripts may echo their own lines; our `echo $PATH` runs last, + // so take the last non-empty line and sanity-check it looks like a PATH. + const lines = out.split('\n').map((l) => l.trim()).filter(Boolean); + const last = lines[lines.length - 1]; + if (last && last.includes('/')) { + cached = last; + return last; + } + } catch { + // probe failed — try the next shell + } + } + cached = ''; // remember the failure so we don't re-pay the probe every spawn + return undefined; +}