mirror of
https://github.com/rowboatlabs/rowboat.git
synced 2026-06-12 19:55:19 +02:00
fix(code-mode): surface agent startup failures instead of hanging forever
- 60s deadline on adapter initialize / session create+load: an engine that launches but never completes the SDK handshake (e.g. an outdated local CLI) now fails with the adapter's stderr attached instead of leaving the turn (pending...) indefinitely; prompts themselves stay un-timed - dispose the adapter client when startup fails so the spawned process does not leak - set DEBUG_CLAUDE_AGENT_SDK=1 so the SDK logs the exact spawn command and claude's stderr to ~/.claude/debug/sdk-*.txt and startup errors point at that file (engine stderr is otherwise discarded entirely) - graft the user's login-shell PATH onto the adapter env on macOS/Linux: GUI launches inherit launchd's stripped PATH, which breaks node-shebang claude launchers (nvm/npm installs) and the engines' own subprocess spawns
This commit is contained in:
parent
e6c5a13d1b
commit
ae362f50f4
4 changed files with 119 additions and 20 deletions
|
|
@ -4,6 +4,7 @@ import { fileURLToPath } from 'url';
|
|||
import type { CodingAgent } from './types.js';
|
||||
import { resolveClaudeExecutable } from './claude-exec.js';
|
||||
import { resolveCodexExecutable } from './codex-exec.js';
|
||||
import { loginShellPath } from './shell-env.js';
|
||||
|
||||
const require = createRequire(import.meta.url);
|
||||
|
||||
|
|
@ -64,23 +65,43 @@ export function getAgentLaunchSpec(agent: CodingAgent): AgentLaunchSpec {
|
|||
const entry = resolveAdapterEntry(ADAPTER_PACKAGE[agent]);
|
||||
const env: NodeJS.ProcessEnv = { ...process.env };
|
||||
|
||||
// macOS/Linux GUI launches inherit launchd's stripped PATH. Resolving the engine
|
||||
// binary below isn't enough on its own: an npm-installed claude is a
|
||||
// `#!/usr/bin/env node` script (node must be on the ADAPTER's PATH when it spawns
|
||||
// it), and the engines spawn git/rg/bash themselves. Graft the user's real
|
||||
// login-shell PATH onto the adapter env so all of those resolve.
|
||||
const shellPath = loginShellPath();
|
||||
if (shellPath && shellPath !== env.PATH) {
|
||||
const dirs = [...shellPath.split(path.delimiter), ...(env.PATH ?? '').split(path.delimiter)];
|
||||
env.PATH = [...new Set(dirs.filter(Boolean))].join(path.delimiter);
|
||||
}
|
||||
|
||||
// Point each adapter at the user's LOCAL agent executable. We intentionally do not
|
||||
// bundle the agents' native engines (~230 MB each) into packaged builds — the
|
||||
// adapters fall back to a bundled engine only when these are unset, and we strip
|
||||
// those binaries during packaging (see apps/main/forge.config.cjs). So a local
|
||||
// install is required; throw a clear error instead of letting the adapter fail
|
||||
// cryptically on the absent bundled engine.
|
||||
if (agent === 'claude' && !env.CLAUDE_CODE_EXECUTABLE) {
|
||||
// On Windows resolving the real .exe is also mandatory: Node can't spawn the
|
||||
// .cmd shim (EINVAL). On macOS/Linux it doubles as a PATH safety net for GUI
|
||||
// launches that don't inherit the login shell's PATH.
|
||||
const exe = resolveClaudeExecutable();
|
||||
if (!exe) {
|
||||
throw new Error(
|
||||
'Claude Code CLI not found. Install it (`npm i -g @anthropic-ai/claude-code`) to use Claude in code mode.',
|
||||
);
|
||||
if (agent === 'claude') {
|
||||
// The claude-agent-sdk discards the engine's stderr unless this is set. With
|
||||
// it, the SDK logs the exact spawn command + claude's stderr to a debug file
|
||||
// (~/.claude/debug/sdk-*.txt) and prints "SDK debug logs: <path>" on the
|
||||
// adapter's stderr — which we capture and attach to startup errors, so a
|
||||
// failed/hung launch points at the file with the real cause.
|
||||
env.DEBUG_CLAUDE_AGENT_SDK = '1';
|
||||
|
||||
if (!env.CLAUDE_CODE_EXECUTABLE) {
|
||||
// On Windows resolving the real .exe is also mandatory: Node can't spawn
|
||||
// the .cmd shim (EINVAL). On macOS/Linux it doubles as a PATH safety net
|
||||
// for GUI launches that don't inherit the login shell's PATH.
|
||||
const exe = resolveClaudeExecutable();
|
||||
if (!exe) {
|
||||
throw new Error(
|
||||
'Claude Code CLI not found. Install it (`npm i -g @anthropic-ai/claude-code`) to use Claude in code mode.',
|
||||
);
|
||||
}
|
||||
env.CLAUDE_CODE_EXECUTABLE = exe;
|
||||
}
|
||||
env.CLAUDE_CODE_EXECUTABLE = exe;
|
||||
}
|
||||
|
||||
if (agent === 'codex' && !env.CODEX_PATH) {
|
||||
|
|
|
|||
|
|
@ -27,6 +27,14 @@ export interface AcpClientOptions {
|
|||
onEvent: (event: CodeRunEvent) => void;
|
||||
}
|
||||
|
||||
// Deadline for the startup phases (initialize / session create+load). A healthy cold
|
||||
// start — adapter boot, engine spawn, SDK handshake, MCP connects — takes seconds;
|
||||
// only a wedged engine takes this long (e.g. an outdated local CLI that launches but
|
||||
// never answers the handshake). Without a deadline that failure mode is an infinite
|
||||
// "(pending...)" with zero feedback. Prompts are intentionally NOT time-limited:
|
||||
// turns legitimately run for many minutes and may wait on user permission asks.
|
||||
const STARTUP_TIMEOUT_MS = 60_000;
|
||||
|
||||
// Map a raw ACP session/update notification onto our small CodeRunEvent union.
|
||||
function toEvent(update: SessionUpdate): CodeRunEvent {
|
||||
switch (update.sessionUpdate) {
|
||||
|
|
@ -130,10 +138,10 @@ export class AcpClient {
|
|||
this.connection = new ClientSideConnection(() => client, stream);
|
||||
|
||||
try {
|
||||
const init = await this.connection.initialize({
|
||||
const init = await this.withStartupTimeout(this.connection.initialize({
|
||||
protocolVersion: PROTOCOL_VERSION,
|
||||
clientCapabilities: { fs: { readTextFile: true, writeTextFile: true } },
|
||||
});
|
||||
}));
|
||||
this.loadSession_ = init.agentCapabilities?.loadSession === true;
|
||||
} catch (e) {
|
||||
throw this.enrich(e, 'initialize');
|
||||
|
|
@ -142,7 +150,7 @@ export class AcpClient {
|
|||
|
||||
async newSession(): Promise<string> {
|
||||
try {
|
||||
const res = await this.conn().newSession({ cwd: this.cwd, mcpServers: [] });
|
||||
const res = await this.withStartupTimeout(this.conn().newSession({ cwd: this.cwd, mcpServers: [] }));
|
||||
return res.sessionId;
|
||||
} catch (e) {
|
||||
throw this.enrich(e, 'newSession');
|
||||
|
|
@ -151,12 +159,35 @@ export class AcpClient {
|
|||
|
||||
async loadSession(sessionId: string): Promise<void> {
|
||||
try {
|
||||
await this.conn().loadSession({ sessionId, cwd: this.cwd, mcpServers: [] });
|
||||
await this.withStartupTimeout(this.conn().loadSession({ sessionId, cwd: this.cwd, mcpServers: [] }));
|
||||
} catch (e) {
|
||||
throw this.enrich(e, 'loadSession');
|
||||
}
|
||||
}
|
||||
|
||||
// Race a startup-phase request against the deadline. The timeout error flows
|
||||
// through enrich(), which appends the adapter's exit info / stderr tail — so a
|
||||
// hung startup reports WHY (including the "SDK debug logs: <path>" pointer)
|
||||
// instead of pending forever. Callers dispose the client on failure, which
|
||||
// kills the spawned adapter.
|
||||
private async withStartupTimeout<T>(work: Promise<T>): Promise<T> {
|
||||
let timer: ReturnType<typeof setTimeout> | undefined;
|
||||
const timeout = new Promise<never>((_, reject) => {
|
||||
timer = setTimeout(() => {
|
||||
reject(new Error(
|
||||
`timed out after ${STARTUP_TIMEOUT_MS / 1000}s — the local ${this.agent} CLI may be ` +
|
||||
`outdated or failing to launch (check \`${this.agent} --version\`)`,
|
||||
));
|
||||
}, STARTUP_TIMEOUT_MS);
|
||||
timer.unref?.();
|
||||
});
|
||||
try {
|
||||
return await Promise.race([work, timeout]);
|
||||
} finally {
|
||||
if (timer) clearTimeout(timer);
|
||||
}
|
||||
}
|
||||
|
||||
async prompt(sessionId: string, text: string): Promise<PromptResponse> {
|
||||
try {
|
||||
return await this.conn().prompt({ sessionId, prompt: [{ type: 'text', text }] });
|
||||
|
|
|
|||
|
|
@ -158,12 +158,18 @@ export class CodeModeManager {
|
|||
if (existing) this.dispose(runId); // agent/cwd changed — start over
|
||||
|
||||
const client = new AcpClient({ agent, cwd, broker, onEvent });
|
||||
await client.start();
|
||||
|
||||
const sessionId = await this.openSession(runId, agent, cwd, client);
|
||||
const run: ActiveRun = { client, sessionId, agent, cwd, inflight: 0 };
|
||||
this.runs.set(runId, run);
|
||||
return run;
|
||||
try {
|
||||
await client.start();
|
||||
const sessionId = await this.openSession(runId, agent, cwd, client);
|
||||
const run: ActiveRun = { client, sessionId, agent, cwd, inflight: 0 };
|
||||
this.runs.set(runId, run);
|
||||
return run;
|
||||
} catch (e) {
|
||||
// Startup failed (e.g. handshake timeout). The client isn't in `runs`
|
||||
// yet, so dispose here or the spawned adapter process leaks.
|
||||
client.dispose();
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
// Resume the persisted session for this chat when possible; else start a new one
|
||||
|
|
|
|||
41
apps/x/packages/core/src/code-mode/acp/shell-env.ts
Normal file
41
apps/x/packages/core/src/code-mode/acp/shell-env.ts
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
import { execSync } from 'child_process';
|
||||
import * as path from 'path';
|
||||
|
||||
let cached: string | null = null;
|
||||
|
||||
// The user's login-shell PATH (macOS/Linux; undefined on Windows or probe failure).
|
||||
// GUI-launched Electron apps inherit launchd's stripped PATH (/usr/bin:/bin:...), so
|
||||
// anything resolved or spawned off process.env.PATH misses nvm/homebrew/npm-global
|
||||
// installs. claude-exec/codex-exec already login-shell-probe for their one binary;
|
||||
// this recovers the WHOLE PATH for transitive spawns the probes can't cover — an
|
||||
// npm-installed claude is a `#!/usr/bin/env node` script (node must be on the
|
||||
// spawner's PATH), and the engines spawn git/rg/bash themselves.
|
||||
export function loginShellPath(): string | undefined {
|
||||
if (process.platform === 'win32') return undefined;
|
||||
if (cached !== null) return cached || undefined;
|
||||
|
||||
// Prefer the user's own shell when it's POSIX-flavored, so its login profile
|
||||
// (~/.zprofile for zsh — macOS default — ~/.profile for bash/sh) is the one that
|
||||
// builds the PATH. fish et al. are skipped: their `echo $PATH` is space-joined.
|
||||
const userShell = process.env.SHELL;
|
||||
const shellOk = userShell && ['sh', 'bash', 'zsh', 'dash', 'ksh'].includes(path.basename(userShell));
|
||||
const shells = [...new Set([...(shellOk ? [userShell] : []), '/bin/sh'])];
|
||||
|
||||
for (const shell of shells) {
|
||||
try {
|
||||
const out = execSync(`${shell} -lc 'echo $PATH'`, { timeout: 5000, encoding: 'utf-8' });
|
||||
// Profile scripts may echo their own lines; our `echo $PATH` runs last,
|
||||
// so take the last non-empty line and sanity-check it looks like a PATH.
|
||||
const lines = out.split('\n').map((l) => l.trim()).filter(Boolean);
|
||||
const last = lines[lines.length - 1];
|
||||
if (last && last.includes('/')) {
|
||||
cached = last;
|
||||
return last;
|
||||
}
|
||||
} catch {
|
||||
// probe failed — try the next shell
|
||||
}
|
||||
}
|
||||
cached = ''; // remember the failure so we don't re-pay the probe every spawn
|
||||
return undefined;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue