fix(code-mode): surface agent startup failures instead of hanging forever

- 60s deadline on adapter initialize / session create+load: an engine that
  launches but never completes the SDK handshake (e.g. an outdated local
  CLI) now fails with the adapter's stderr attached instead of leaving the
  turn (pending...) indefinitely; prompts themselves stay un-timed
- dispose the adapter client when startup fails so the spawned process
  does not leak
- set DEBUG_CLAUDE_AGENT_SDK=1 so the SDK logs the exact spawn command and
  claude's stderr to ~/.claude/debug/sdk-*.txt and startup errors point at
  that file (engine stderr is otherwise discarded entirely)
- graft the user's login-shell PATH onto the adapter env on macOS/Linux:
  GUI launches inherit launchd's stripped PATH, which breaks node-shebang
  claude launchers (nvm/npm installs) and the engines' own subprocess spawns
This commit is contained in:
Gagancreates 2026-06-12 21:23:32 +05:30
parent e6c5a13d1b
commit ae362f50f4
4 changed files with 119 additions and 20 deletions

View file

@ -4,6 +4,7 @@ import { fileURLToPath } from 'url';
import type { CodingAgent } from './types.js';
import { resolveClaudeExecutable } from './claude-exec.js';
import { resolveCodexExecutable } from './codex-exec.js';
import { loginShellPath } from './shell-env.js';
const require = createRequire(import.meta.url);
@ -64,23 +65,43 @@ export function getAgentLaunchSpec(agent: CodingAgent): AgentLaunchSpec {
const entry = resolveAdapterEntry(ADAPTER_PACKAGE[agent]);
const env: NodeJS.ProcessEnv = { ...process.env };
// macOS/Linux GUI launches inherit launchd's stripped PATH. Resolving the engine
// binary below isn't enough on its own: an npm-installed claude is a
// `#!/usr/bin/env node` script (node must be on the ADAPTER's PATH when it spawns
// it), and the engines spawn git/rg/bash themselves. Graft the user's real
// login-shell PATH onto the adapter env so all of those resolve.
const shellPath = loginShellPath();
if (shellPath && shellPath !== env.PATH) {
const dirs = [...shellPath.split(path.delimiter), ...(env.PATH ?? '').split(path.delimiter)];
env.PATH = [...new Set(dirs.filter(Boolean))].join(path.delimiter);
}
// Point each adapter at the user's LOCAL agent executable. We intentionally do not
// bundle the agents' native engines (~230 MB each) into packaged builds — the
// adapters fall back to a bundled engine only when these are unset, and we strip
// those binaries during packaging (see apps/main/forge.config.cjs). So a local
// install is required; throw a clear error instead of letting the adapter fail
// cryptically on the absent bundled engine.
if (agent === 'claude' && !env.CLAUDE_CODE_EXECUTABLE) {
// On Windows resolving the real .exe is also mandatory: Node can't spawn the
// .cmd shim (EINVAL). On macOS/Linux it doubles as a PATH safety net for GUI
// launches that don't inherit the login shell's PATH.
const exe = resolveClaudeExecutable();
if (!exe) {
throw new Error(
'Claude Code CLI not found. Install it (`npm i -g @anthropic-ai/claude-code`) to use Claude in code mode.',
);
if (agent === 'claude') {
// The claude-agent-sdk discards the engine's stderr unless this is set. With
// it, the SDK logs the exact spawn command + claude's stderr to a debug file
// (~/.claude/debug/sdk-*.txt) and prints "SDK debug logs: <path>" on the
// adapter's stderr — which we capture and attach to startup errors, so a
// failed/hung launch points at the file with the real cause.
env.DEBUG_CLAUDE_AGENT_SDK = '1';
if (!env.CLAUDE_CODE_EXECUTABLE) {
// On Windows resolving the real .exe is also mandatory: Node can't spawn
// the .cmd shim (EINVAL). On macOS/Linux it doubles as a PATH safety net
// for GUI launches that don't inherit the login shell's PATH.
const exe = resolveClaudeExecutable();
if (!exe) {
throw new Error(
'Claude Code CLI not found. Install it (`npm i -g @anthropic-ai/claude-code`) to use Claude in code mode.',
);
}
env.CLAUDE_CODE_EXECUTABLE = exe;
}
env.CLAUDE_CODE_EXECUTABLE = exe;
}
if (agent === 'codex' && !env.CODEX_PATH) {

View file

@ -27,6 +27,14 @@ export interface AcpClientOptions {
onEvent: (event: CodeRunEvent) => void;
}
/**
 * Deadline, in milliseconds (60 seconds), for the startup phases
 * (initialize / session create+load).
 *
 * A healthy cold start — adapter boot, engine spawn, SDK handshake, MCP
 * connects — takes seconds; only a wedged engine takes this long (e.g. an
 * outdated local CLI that launches but never answers the handshake). Without a
 * deadline that failure mode is an infinite "(pending...)" with zero feedback.
 *
 * Prompts are intentionally NOT time-limited: turns legitimately run for many
 * minutes and may wait on user permission asks.
 */
const STARTUP_TIMEOUT_MS = 60_000;
// Map a raw ACP session/update notification onto our small CodeRunEvent union.
function toEvent(update: SessionUpdate): CodeRunEvent {
switch (update.sessionUpdate) {
@ -130,10 +138,10 @@ export class AcpClient {
this.connection = new ClientSideConnection(() => client, stream);
try {
const init = await this.connection.initialize({
const init = await this.withStartupTimeout(this.connection.initialize({
protocolVersion: PROTOCOL_VERSION,
clientCapabilities: { fs: { readTextFile: true, writeTextFile: true } },
});
}));
this.loadSession_ = init.agentCapabilities?.loadSession === true;
} catch (e) {
throw this.enrich(e, 'initialize');
@ -142,7 +150,7 @@ export class AcpClient {
async newSession(): Promise<string> {
try {
const res = await this.conn().newSession({ cwd: this.cwd, mcpServers: [] });
const res = await this.withStartupTimeout(this.conn().newSession({ cwd: this.cwd, mcpServers: [] }));
return res.sessionId;
} catch (e) {
throw this.enrich(e, 'newSession');
@ -151,12 +159,35 @@ export class AcpClient {
async loadSession(sessionId: string): Promise<void> {
try {
await this.conn().loadSession({ sessionId, cwd: this.cwd, mcpServers: [] });
await this.withStartupTimeout(this.conn().loadSession({ sessionId, cwd: this.cwd, mcpServers: [] }));
} catch (e) {
throw this.enrich(e, 'loadSession');
}
}
// Bounds a startup-phase request (initialize / newSession / loadSession) by
// STARTUP_TIMEOUT_MS. Whichever settles first wins: the request's own result or
// rejection, or a timeout Error naming the agent CLI as the likely culprit.
// The callers' catch blocks pass that error through enrich(), so a hung startup
// surfaces as a real failure instead of pending forever. The deadline timer is
// unref'd where supported and is always cleared once the race is decided.
private async withStartupTimeout<T>(work: Promise<T>): Promise<T> {
  let pending: ReturnType<typeof setTimeout> | undefined;

  const deadline = new Promise<never>((_resolve, reject) => {
    // Build the message only when the deadline actually fires.
    const expire = (): void => {
      const message =
        `timed out after ${STARTUP_TIMEOUT_MS / 1000}s — the local ${this.agent} CLI may be ` +
        `outdated or failing to launch (check \`${this.agent} --version\`)`;
      reject(new Error(message));
    };
    pending = setTimeout(expire, STARTUP_TIMEOUT_MS);
    // Don't let the deadline alone keep the process alive.
    pending.unref?.();
  });

  try {
    const outcome = await Promise.race([work, deadline]);
    return outcome;
  } finally {
    // Runs on success, failure and timeout alike, so no stray timer is left armed.
    if (pending) clearTimeout(pending);
  }
}
async prompt(sessionId: string, text: string): Promise<PromptResponse> {
try {
return await this.conn().prompt({ sessionId, prompt: [{ type: 'text', text }] });

View file

@ -158,12 +158,18 @@ export class CodeModeManager {
if (existing) this.dispose(runId); // agent/cwd changed — start over
const client = new AcpClient({ agent, cwd, broker, onEvent });
await client.start();
const sessionId = await this.openSession(runId, agent, cwd, client);
const run: ActiveRun = { client, sessionId, agent, cwd, inflight: 0 };
this.runs.set(runId, run);
return run;
try {
await client.start();
const sessionId = await this.openSession(runId, agent, cwd, client);
const run: ActiveRun = { client, sessionId, agent, cwd, inflight: 0 };
this.runs.set(runId, run);
return run;
} catch (e) {
// Startup failed (e.g. handshake timeout). The client isn't in `runs`
// yet, so dispose here or the spawned adapter process leaks.
client.dispose();
throw e;
}
}
// Resume the persisted session for this chat when possible; else start a new one

View file

@ -0,0 +1,41 @@
import { execFileSync, execSync } from 'child_process';
import * as path from 'path';
// Memoized probe result: null = not probed yet, '' = probe failed, anything else
// is the PATH string that was recovered.
let cached: string | null = null;

/**
 * Returns the user's login-shell PATH (macOS/Linux only).
 *
 * GUI-launched Electron apps inherit launchd's stripped PATH (/usr/bin:/bin:...),
 * so anything resolved or spawned off process.env.PATH misses nvm/homebrew/
 * npm-global installs. claude-exec/codex-exec already login-shell-probe for their
 * one binary; this recovers the WHOLE PATH for transitive spawns those probes
 * can't cover — an npm-installed claude is a `#!/usr/bin/env node` script (node
 * must be on the spawner's PATH), and the engines spawn git/rg/bash themselves.
 *
 * The probe runs at most once per process: both a successful result and a
 * failure are cached, so later calls never spawn a shell again.
 *
 * @returns The PATH printed by a login shell, or `undefined` on Windows or when
 *   every probe fails.
 */
export function loginShellPath(): string | undefined {
  if (process.platform === 'win32') return undefined;
  if (cached !== null) return cached || undefined;

  // Prefer the user's own shell when it's POSIX-flavored, so its login profile
  // (~/.zprofile for zsh — macOS default — ~/.profile for bash/sh) is the one that
  // builds the PATH. fish et al. are skipped: their `echo $PATH` is space-joined.
  const posixShells = ['sh', 'bash', 'zsh', 'dash', 'ksh'];
  const candidates: string[] = [];
  const userShell = process.env.SHELL;
  if (userShell && posixShells.includes(path.basename(userShell))) {
    candidates.push(userShell);
  }
  if (!candidates.includes('/bin/sh')) candidates.push('/bin/sh');

  for (const shell of candidates) {
    try {
      // execFileSync passes an argv array straight to the binary. $SHELL is never
      // spliced into a command line, so a path containing spaces or shell
      // metacharacters can neither break the probe nor execute as a command.
      // stdin and stderr are ignored: the probe needs no input, and profile
      // warnings should not leak onto the app's own stderr.
      const out = execFileSync(shell, ['-lc', 'echo $PATH'], {
        timeout: 5000,
        encoding: 'utf-8',
        stdio: ['ignore', 'pipe', 'ignore'],
      });
      // Profile scripts may echo their own lines; our `echo $PATH` runs last,
      // so take the last non-empty line and sanity-check it looks like a PATH.
      const lines = out.split('\n').map((l) => l.trim()).filter(Boolean);
      const last = lines[lines.length - 1];
      if (last && last.includes('/')) {
        cached = last;
        return last;
      }
    } catch {
      // probe failed (missing shell, non-zero exit, timeout) — try the next shell
    }
  }

  cached = ''; // remember the failure so we don't re-pay the probe every spawn
  return undefined;
}