fix(code-mode): surface agent startup failures instead of hanging forever

- 60s deadline on adapter initialize / session create+load: an engine that launches but never completes the SDK handshake (e.g. an outdated local CLI) now fails with the adapter's stderr attached instead of leaving the turn at "(pending...)" indefinitely; prompts themselves stay un-timed
- dispose the adapter client when startup fails so the spawned process does not leak
- set DEBUG_CLAUDE_AGENT_SDK=1 so the SDK logs the exact spawn command and claude's stderr to ~/.claude/debug/sdk-*.txt, and startup errors point at that file (engine stderr is otherwise discarded entirely)
- graft the user's login-shell PATH onto the adapter env on macOS/Linux: GUI launches inherit launchd's stripped PATH, which breaks node-shebang claude launchers (nvm/npm installs) and the engines' own subprocess spawns
2026-06-12 19:55:19 +02:00 · 2026-06-12 21:23:32 +05:30 · 2026-06-12 21:23:32 +05:30 · ae362f50f4
commit ae362f50f4
parent e6c5a13d1b
4 changed files with 119 additions and 20 deletions
--- a/apps/x/packages/core/src/code-mode/acp/agents.ts
+++ b/apps/x/packages/core/src/code-mode/acp/agents.ts
@@ -4,6 +4,7 @@ import { fileURLToPath } from 'url';
 import type { CodingAgent } from './types.js';
 import { resolveClaudeExecutable } from './claude-exec.js';
 import { resolveCodexExecutable } from './codex-exec.js';
+import { loginShellPath } from './shell-env.js';

 const require = createRequire(import.meta.url);

@@ -64,23 +65,43 @@ export function getAgentLaunchSpec(agent: CodingAgent): AgentLaunchSpec {
    const entry = resolveAdapterEntry(ADAPTER_PACKAGE[agent]);
    const env: NodeJS.ProcessEnv = { ...process.env };

+    // macOS/Linux GUI launches inherit launchd's stripped PATH. Resolving the engine
+    // binary below isn't enough on its own: an npm-installed claude is a
+    // `#!/usr/bin/env node` script (node must be on the ADAPTER's PATH when it spawns
+    // it), and the engines spawn git/rg/bash themselves. Graft the user's real
+    // login-shell PATH onto the adapter env so all of those resolve.
+    const shellPath = loginShellPath();
+    if (shellPath && shellPath !== env.PATH) {
+        const dirs = [...shellPath.split(path.delimiter), ...(env.PATH ?? '').split(path.delimiter)];
+        env.PATH = [...new Set(dirs.filter(Boolean))].join(path.delimiter);
+    }
+
    // Point each adapter at the user's LOCAL agent executable. We intentionally do not
    // bundle the agents' native engines (~230 MB each) into packaged builds — the
    // adapters fall back to a bundled engine only when these are unset, and we strip
    // those binaries during packaging (see apps/main/forge.config.cjs). So a local
    // install is required; throw a clear error instead of letting the adapter fail
    // cryptically on the absent bundled engine.
-    if (agent === 'claude' && !env.CLAUDE_CODE_EXECUTABLE) {
-        // On Windows resolving the real .exe is also mandatory: Node can't spawn the
-        // .cmd shim (EINVAL). On macOS/Linux it doubles as a PATH safety net for GUI
-        // launches that don't inherit the login shell's PATH.
-        const exe = resolveClaudeExecutable();
-        if (!exe) {
-            throw new Error(
-                'Claude Code CLI not found. Install it (`npm i -g @anthropic-ai/claude-code`) to use Claude in code mode.',
-            );
+    if (agent === 'claude') {
+        // The claude-agent-sdk discards the engine's stderr unless this is set. With
+        // it, the SDK logs the exact spawn command + claude's stderr to a debug file
+        // (~/.claude/debug/sdk-*.txt) and prints "SDK debug logs: <path>" on the
+        // adapter's stderr — which we capture and attach to startup errors, so a
+        // failed/hung launch points at the file with the real cause.
+        env.DEBUG_CLAUDE_AGENT_SDK = '1';
+
+        if (!env.CLAUDE_CODE_EXECUTABLE) {
+            // On Windows resolving the real .exe is also mandatory: Node can't spawn
+            // the .cmd shim (EINVAL). On macOS/Linux it doubles as a PATH safety net
+            // for GUI launches that don't inherit the login shell's PATH.
+            const exe = resolveClaudeExecutable();
+            if (!exe) {
+                throw new Error(
+                    'Claude Code CLI not found. Install it (`npm i -g @anthropic-ai/claude-code`) to use Claude in code mode.',
+                );
+            }
+            env.CLAUDE_CODE_EXECUTABLE = exe;
        }
-        env.CLAUDE_CODE_EXECUTABLE = exe;
    }

    if (agent === 'codex' && !env.CODEX_PATH) {
--- a/apps/x/packages/core/src/code-mode/acp/client.ts
+++ b/apps/x/packages/core/src/code-mode/acp/client.ts
@@ -27,6 +27,14 @@ export interface AcpClientOptions {
    onEvent: (event: CodeRunEvent) => void;
 }

+// Deadline for the startup phases (initialize / session create+load). A healthy cold
+// start — adapter boot, engine spawn, SDK handshake, MCP connects — takes seconds;
+// only a wedged engine takes this long (e.g. an outdated local CLI that launches but
+// never answers the handshake). Without a deadline that failure mode is an infinite
+// "(pending...)" with zero feedback. Prompts are intentionally NOT time-limited:
+// turns legitimately run for many minutes and may wait on user permission asks.
+const STARTUP_TIMEOUT_MS = 60_000;
+
 // Map a raw ACP session/update notification onto our small CodeRunEvent union.
 function toEvent(update: SessionUpdate): CodeRunEvent {
    switch (update.sessionUpdate) {
@@ -130,10 +138,10 @@ export class AcpClient {
        this.connection = new ClientSideConnection(() => client, stream);

        try {
-            const init = await this.connection.initialize({
+            const init = await this.withStartupTimeout(this.connection.initialize({
                protocolVersion: PROTOCOL_VERSION,
                clientCapabilities: { fs: { readTextFile: true, writeTextFile: true } },
-            });
+            }));
            this.loadSession_ = init.agentCapabilities?.loadSession === true;
        } catch (e) {
            throw this.enrich(e, 'initialize');
@@ -142,7 +150,7 @@ export class AcpClient {

    async newSession(): Promise<string> {
        try {
-            const res = await this.conn().newSession({ cwd: this.cwd, mcpServers: [] });
+            const res = await this.withStartupTimeout(this.conn().newSession({ cwd: this.cwd, mcpServers: [] }));
            return res.sessionId;
        } catch (e) {
            throw this.enrich(e, 'newSession');
@@ -151,12 +159,35 @@ export class AcpClient {

    async loadSession(sessionId: string): Promise<void> {
        try {
-            await this.conn().loadSession({ sessionId, cwd: this.cwd, mcpServers: [] });
+            await this.withStartupTimeout(this.conn().loadSession({ sessionId, cwd: this.cwd, mcpServers: [] }));
        } catch (e) {
            throw this.enrich(e, 'loadSession');
        }
    }

+    // Race a startup-phase request against the deadline. The timeout error flows
+    // through enrich(), which appends the adapter's exit info / stderr tail — so a
+    // hung startup reports WHY (including the "SDK debug logs: <path>" pointer)
+    // instead of pending forever. Callers dispose the client on failure, which
+    // kills the spawned adapter.
+    private async withStartupTimeout<T>(work: Promise<T>): Promise<T> {
+        let timer: ReturnType<typeof setTimeout> | undefined;
+        const timeout = new Promise<never>((_, reject) => {
+            timer = setTimeout(() => {
+                reject(new Error(
+                    `timed out after ${STARTUP_TIMEOUT_MS / 1000}s — the local ${this.agent} CLI may be ` +
+                    `outdated or failing to launch (check \`${this.agent} --version\`)`,
+                ));
+            }, STARTUP_TIMEOUT_MS);
+            timer.unref?.();
+        });
+        try {
+            return await Promise.race([work, timeout]);
+        } finally {
+            if (timer) clearTimeout(timer);
+        }
+    }
+
    async prompt(sessionId: string, text: string): Promise<PromptResponse> {
        try {
            return await this.conn().prompt({ sessionId, prompt: [{ type: 'text', text }] });
--- a/apps/x/packages/core/src/code-mode/acp/manager.ts
+++ b/apps/x/packages/core/src/code-mode/acp/manager.ts
@@ -158,12 +158,18 @@ export class CodeModeManager {
        if (existing) this.dispose(runId); // agent/cwd changed — start over

        const client = new AcpClient({ agent, cwd, broker, onEvent });
-        await client.start();
-
-        const sessionId = await this.openSession(runId, agent, cwd, client);
-        const run: ActiveRun = { client, sessionId, agent, cwd, inflight: 0 };
-        this.runs.set(runId, run);
-        return run;
+        try {
+            await client.start();
+            const sessionId = await this.openSession(runId, agent, cwd, client);
+            const run: ActiveRun = { client, sessionId, agent, cwd, inflight: 0 };
+            this.runs.set(runId, run);
+            return run;
+        } catch (e) {
+            // Startup failed (e.g. handshake timeout). The client isn't in `runs`
+            // yet, so dispose here or the spawned adapter process leaks.
+            client.dispose();
+            throw e;
+        }
    }

    // Resume the persisted session for this chat when possible; else start a new one
--- a/apps/x/packages/core/src/code-mode/acp/shell-env.ts
+++ b/apps/x/packages/core/src/code-mode/acp/shell-env.ts
@@ -0,0 +1,41 @@
+import { execSync } from 'child_process';
+import * as path from 'path';
+
+let cached: string | null = null;
+
+// The user's login-shell PATH (macOS/Linux; undefined on Windows or probe failure).
+// GUI-launched Electron apps inherit launchd's stripped PATH (/usr/bin:/bin:...), so
+// anything resolved or spawned off process.env.PATH misses nvm/homebrew/npm-global
+// installs. claude-exec/codex-exec already login-shell-probe for their one binary;
+// this recovers the WHOLE PATH for transitive spawns the probes can't cover — an
+// npm-installed claude is a `#!/usr/bin/env node` script (node must be on the
+// spawner's PATH), and the engines spawn git/rg/bash themselves.
+export function loginShellPath(): string | undefined {
+    if (process.platform === 'win32') return undefined;
+    if (cached !== null) return cached || undefined;
+
+    // Prefer the user's own shell when it's POSIX-flavored, so its login profile
+    // (~/.zprofile for zsh — macOS default — ~/.profile for bash/sh) is the one that
+    // builds the PATH. fish et al. are skipped: their `echo $PATH` is space-joined.
+    const userShell = process.env.SHELL;
+    const shellOk = userShell && ['sh', 'bash', 'zsh', 'dash', 'ksh'].includes(path.basename(userShell));
+    const shells = [...new Set([...(shellOk ? [userShell] : []), '/bin/sh'])];
+
+    for (const shell of shells) {
+        try {
+            const out = execSync(`${shell} -lc 'echo $PATH'`, { timeout: 5000, encoding: 'utf-8' });
+            // Profile scripts may echo their own lines; our `echo $PATH` runs last,
+            // so take the last non-empty line and sanity-check it looks like a PATH.
+            const lines = out.split('\n').map((l) => l.trim()).filter(Boolean);
+            const last = lines[lines.length - 1];
+            if (last && last.includes('/')) {
+                cached = last;
+                return last;
+            }
+        } catch {
+            // probe failed — try the next shell
+        }
+    }
+    cached = ''; // remember the failure so we don't re-pay the probe every spawn
+    return undefined;
+}