From ae362f50f44404b3d43904de165928b416607581 Mon Sep 17 00:00:00 2001
From: Gagancreates <gaganp000999@gmail.com>
Date: Fri, 12 Jun 2026 21:23:32 +0530
Subject: [PATCH] fix(code-mode): surface agent startup failures instead of
 hanging forever

- 60s deadline on adapter initialize / session create+load: an engine that
  launches but never completes the SDK handshake (e.g. an outdated local
  CLI) now fails with the adapter's stderr attached instead of leaving the
  turn (pending...) indefinitely; prompts themselves stay un-timed
- dispose the adapter client when startup fails so the spawned process
  does not leak
- set DEBUG_CLAUDE_AGENT_SDK=1 so the SDK logs the exact spawn command and
  claude's stderr to ~/.claude/debug/sdk-*.txt and startup errors point at
  that file (engine stderr is otherwise discarded entirely)
- graft the user's login-shell PATH onto the adapter env on macOS/Linux:
  GUI launches inherit launchd's stripped PATH, which breaks node-shebang
  claude launchers (nvm/npm installs) and the engines' own subprocess spawns
---
 .../packages/core/src/code-mode/acp/agents.ts | 41 ++++++++++++++-----
 .../packages/core/src/code-mode/acp/client.ts | 39 ++++++++++++++++--
 .../core/src/code-mode/acp/manager.ts         | 18 +++++---
 .../core/src/code-mode/acp/shell-env.ts       | 41 +++++++++++++++++++
 4 files changed, 119 insertions(+), 20 deletions(-)
 create mode 100644 apps/x/packages/core/src/code-mode/acp/shell-env.ts

diff --git a/apps/x/packages/core/src/code-mode/acp/agents.ts b/apps/x/packages/core/src/code-mode/acp/agents.ts
index 26a593ee..5b8cf130 100644
--- a/apps/x/packages/core/src/code-mode/acp/agents.ts
+++ b/apps/x/packages/core/src/code-mode/acp/agents.ts
@@ -4,6 +4,7 @@ import { fileURLToPath } from 'url';
 import type { CodingAgent } from './types.js';
 import { resolveClaudeExecutable } from './claude-exec.js';
 import { resolveCodexExecutable } from './codex-exec.js';
+import { loginShellPath } from './shell-env.js';
 
 const require = createRequire(import.meta.url);
 
@@ -64,23 +65,43 @@ export function getAgentLaunchSpec(agent: CodingAgent): AgentLaunchSpec {
     const entry = resolveAdapterEntry(ADAPTER_PACKAGE[agent]);
     const env: NodeJS.ProcessEnv = { ...process.env };
 
+    // macOS/Linux GUI launches inherit launchd's stripped PATH. Resolving the engine
+    // binary below isn't enough on its own: an npm-installed claude is a
+    // `#!/usr/bin/env node` script (node must be on the ADAPTER's PATH when it spawns
+    // it), and the engines spawn git/rg/bash themselves. Graft the user's real
+    // login-shell PATH onto the adapter env so all of those resolve.
+    const shellPath = loginShellPath();
+    if (shellPath && shellPath !== env.PATH) {
+        const dirs = [...shellPath.split(path.delimiter), ...(env.PATH ?? '').split(path.delimiter)];
+        env.PATH = [...new Set(dirs.filter(Boolean))].join(path.delimiter);
+    }
+
     // Point each adapter at the user's LOCAL agent executable. We intentionally do not
     // bundle the agents' native engines (~230 MB each) into packaged builds — the
     // adapters fall back to a bundled engine only when these are unset, and we strip
     // those binaries during packaging (see apps/main/forge.config.cjs). So a local
     // install is required; throw a clear error instead of letting the adapter fail
     // cryptically on the absent bundled engine.
-    if (agent === 'claude' && !env.CLAUDE_CODE_EXECUTABLE) {
-        // On Windows resolving the real .exe is also mandatory: Node can't spawn the
-        // .cmd shim (EINVAL). On macOS/Linux it doubles as a PATH safety net for GUI
-        // launches that don't inherit the login shell's PATH.
-        const exe = resolveClaudeExecutable();
-        if (!exe) {
-            throw new Error(
-                'Claude Code CLI not found. Install it (`npm i -g @anthropic-ai/claude-code`) to use Claude in code mode.',
-            );
+    if (agent === 'claude') {
+        // The claude-agent-sdk discards the engine's stderr unless this is set. With
+        // it, the SDK logs the exact spawn command + claude's stderr to a debug file
+        // (~/.claude/debug/sdk-*.txt) and prints "SDK debug logs: <path>" on the
+        // adapter's stderr — which we capture and attach to startup errors, so a
+        // failed/hung launch points at the file with the real cause.
+        env.DEBUG_CLAUDE_AGENT_SDK = '1';
+
+        if (!env.CLAUDE_CODE_EXECUTABLE) {
+            // On Windows resolving the real .exe is also mandatory: Node can't spawn
+            // the .cmd shim (EINVAL). On macOS/Linux it doubles as a PATH safety net
+            // for GUI launches that don't inherit the login shell's PATH.
+            const exe = resolveClaudeExecutable();
+            if (!exe) {
+                throw new Error(
+                    'Claude Code CLI not found. Install it (`npm i -g @anthropic-ai/claude-code`) to use Claude in code mode.',
+                );
+            }
+            env.CLAUDE_CODE_EXECUTABLE = exe;
         }
-        env.CLAUDE_CODE_EXECUTABLE = exe;
     }
 
     if (agent === 'codex' && !env.CODEX_PATH) {
diff --git a/apps/x/packages/core/src/code-mode/acp/client.ts b/apps/x/packages/core/src/code-mode/acp/client.ts
index 5c2bd1ba..a6848544 100644
--- a/apps/x/packages/core/src/code-mode/acp/client.ts
+++ b/apps/x/packages/core/src/code-mode/acp/client.ts
@@ -27,6 +27,14 @@ export interface AcpClientOptions {
     onEvent: (event: CodeRunEvent) => void;
 }
 
+// Deadline for the startup phases (initialize / session create+load). A healthy cold
+// start — adapter boot, engine spawn, SDK handshake, MCP connects — takes seconds;
+// only a wedged engine takes this long (e.g. an outdated local CLI that launches but
+// never answers the handshake). Without a deadline that failure mode is an infinite
+// "(pending...)" with zero feedback. Prompts are intentionally NOT time-limited:
+// turns legitimately run for many minutes and may wait on user permission asks.
+const STARTUP_TIMEOUT_MS = 60_000;
+
 // Map a raw ACP session/update notification onto our small CodeRunEvent union.
 function toEvent(update: SessionUpdate): CodeRunEvent {
     switch (update.sessionUpdate) {
@@ -130,10 +138,10 @@ export class AcpClient {
         this.connection = new ClientSideConnection(() => client, stream);
 
         try {
-            const init = await this.connection.initialize({
+            const init = await this.withStartupTimeout(this.connection.initialize({
                 protocolVersion: PROTOCOL_VERSION,
                 clientCapabilities: { fs: { readTextFile: true, writeTextFile: true } },
-            });
+            }));
             this.loadSession_ = init.agentCapabilities?.loadSession === true;
         } catch (e) {
             throw this.enrich(e, 'initialize');
@@ -142,7 +150,7 @@ export class AcpClient {
 
     async newSession(): Promise<string> {
         try {
-            const res = await this.conn().newSession({ cwd: this.cwd, mcpServers: [] });
+            const res = await this.withStartupTimeout(this.conn().newSession({ cwd: this.cwd, mcpServers: [] }));
             return res.sessionId;
         } catch (e) {
             throw this.enrich(e, 'newSession');
@@ -151,12 +159,35 @@ export class AcpClient {
 
     async loadSession(sessionId: string): Promise<void> {
         try {
-            await this.conn().loadSession({ sessionId, cwd: this.cwd, mcpServers: [] });
+            await this.withStartupTimeout(this.conn().loadSession({ sessionId, cwd: this.cwd, mcpServers: [] }));
         } catch (e) {
             throw this.enrich(e, 'loadSession');
         }
     }
 
+    // Race a startup-phase request against the deadline. The timeout error flows
+    // through enrich(), which appends the adapter's exit info / stderr tail — so a
+    // hung startup reports WHY (including the "SDK debug logs: <path>" pointer)
+    // instead of pending forever. Callers dispose the client on failure, which
+    // kills the spawned adapter.
+    private async withStartupTimeout<T>(work: Promise<T>): Promise<T> {
+        let timer: ReturnType<typeof setTimeout> | undefined;
+        const timeout = new Promise<never>((_, reject) => {
+            timer = setTimeout(() => {
+                reject(new Error(
+                    `timed out after ${STARTUP_TIMEOUT_MS / 1000}s — the local ${this.agent} CLI may be ` +
+                    `outdated or failing to launch (check \`${this.agent} --version\`)`,
+                ));
+            }, STARTUP_TIMEOUT_MS);
+            timer.unref?.();
+        });
+        try {
+            return await Promise.race([work, timeout]);
+        } finally {
+            if (timer) clearTimeout(timer);
+        }
+    }
+
     async prompt(sessionId: string, text: string): Promise<PromptResponse> {
         try {
             return await this.conn().prompt({ sessionId, prompt: [{ type: 'text', text }] });
diff --git a/apps/x/packages/core/src/code-mode/acp/manager.ts b/apps/x/packages/core/src/code-mode/acp/manager.ts
index 04ccdebd..c81bdbbe 100644
--- a/apps/x/packages/core/src/code-mode/acp/manager.ts
+++ b/apps/x/packages/core/src/code-mode/acp/manager.ts
@@ -158,12 +158,18 @@ export class CodeModeManager {
         if (existing) this.dispose(runId); // agent/cwd changed — start over
 
         const client = new AcpClient({ agent, cwd, broker, onEvent });
-        await client.start();
-
-        const sessionId = await this.openSession(runId, agent, cwd, client);
-        const run: ActiveRun = { client, sessionId, agent, cwd, inflight: 0 };
-        this.runs.set(runId, run);
-        return run;
+        try {
+            await client.start();
+            const sessionId = await this.openSession(runId, agent, cwd, client);
+            const run: ActiveRun = { client, sessionId, agent, cwd, inflight: 0 };
+            this.runs.set(runId, run);
+            return run;
+        } catch (e) {
+            // Startup failed (e.g. handshake timeout). The client isn't in `runs`
+            // yet, so dispose here or the spawned adapter process leaks.
+            client.dispose();
+            throw e;
+        }
     }
 
     // Resume the persisted session for this chat when possible; else start a new one
diff --git a/apps/x/packages/core/src/code-mode/acp/shell-env.ts b/apps/x/packages/core/src/code-mode/acp/shell-env.ts
new file mode 100644
index 00000000..acff79b5
--- /dev/null
+++ b/apps/x/packages/core/src/code-mode/acp/shell-env.ts
@@ -0,0 +1,41 @@
+import { execSync } from 'child_process';
+import * as path from 'path';
+
+let cached: string | null = null;
+
+// The user's login-shell PATH (macOS/Linux; undefined on Windows or probe failure).
+// GUI-launched Electron apps inherit launchd's stripped PATH (/usr/bin:/bin:...), so
+// anything resolved or spawned off process.env.PATH misses nvm/homebrew/npm-global
+// installs. claude-exec/codex-exec already login-shell-probe for their one binary;
+// this recovers the WHOLE PATH for transitive spawns the probes can't cover — an
+// npm-installed claude is a `#!/usr/bin/env node` script (node must be on the
+// spawner's PATH), and the engines spawn git/rg/bash themselves.
+export function loginShellPath(): string | undefined {
+    if (process.platform === 'win32') return undefined;
+    if (cached !== null) return cached || undefined;
+
+    // Prefer the user's own shell when it's POSIX-flavored, so its login profile
+    // (~/.zprofile for zsh — macOS default — ~/.profile for bash/sh) is the one that
+    // builds the PATH. fish et al. are skipped: their `echo $PATH` is space-joined.
+    const userShell = process.env.SHELL;
+    const shellOk = userShell && ['sh', 'bash', 'zsh', 'dash', 'ksh'].includes(path.basename(userShell));
+    const shells = [...new Set([...(shellOk ? [userShell] : []), '/bin/sh'])];
+
+    for (const shell of shells) {
+        try {
+            const out = execSync(`${shell} -lc 'echo $PATH'`, { timeout: 5000, encoding: 'utf-8' });
+            // Profile scripts may echo their own lines; our `echo $PATH` runs last,
+            // so take the last non-empty line and sanity-check it looks like a PATH.
+            const lines = out.split('\n').map((l) => l.trim()).filter(Boolean);
+            const last = lines[lines.length - 1];
+            if (last && last.includes('/')) {
+                cached = last;
+                return last;
+            }
+        } catch {
+            // probe failed — try the next shell
+        }
+    }
+    cached = ''; // remember the failure so we don't re-pay the probe every spawn
+    return undefined;
+}