feat(cli): add ingest LLM rate-limit governor with paced retries (#261)

* feat(cli): add ingest rate limit governor * feat(cli): wire ingest rate-limit config * feat(cli): report provider rate-limit signals * feat(cli): show ingest rate-limit waits * fix(cli): complete rate-limit event coverage * fix(cli): abort ingest provider calls cleanly * fix(cli): propagate ingest cancellation * fix(cli): reject pre-aborted ingest rate-limit waits * fix(cli): honor Claude rate-limit reset waits * fix(cli): retry thrown Codex rate-limit failures * fix(cli): type Claude rate-limit result details * fix(cli): emit ingest rate-limit countdowns from rejected signals * fix(cli): report ai sdk rate-limit header utilization * fix(cli): gate LLM rate-limit retries on the governor budget The AI SDK and Codex runtimes retried 429 / opaque rate-limit failures up to 6-7 times with no backoff when constructed without a RateLimitGovernor (scan, memory, setup) or with pacing disabled, ignoring Retry-After and worsening the limit. The outer retry loop only cooperates with the governor's pause, so without active pacing there is no backoff to apply. Route the retry bound through a single source: RateLimitGovernor .maxRetryAttempts(), which returns retry.maxAttempts when enabled and 1 (no outer retry) when absent or disabled. All three runtimes (ai-sdk, codex, claude-code) now use it, so ingest.rateLimit.retry.maxAttempts genuinely controls attempts and the hard-coded 6 (plus Codex's off-by-one extra attempt) is gone. Backend-native retry (e.g. the AI SDK's maxRetries) still handles transient 429s. Also correct the ktx.yaml docs for maxWaitMs (caps each wait, not the whole run) and maxAttempts, and sync uv.lock ktx-sl/ktx-daemon to 0.9.0.
2026-06-13 08:15:14 +02:00 · 2026-06-05 12:10:27 +02:00 · 2026-06-05 12:10:27 +02:00 · c3d8cedb0b
commit c3d8cedb0b
parent 5a8821073b
35 changed files with 2336 additions and 72 deletions
--- a/packages/cli/test/context/llm/codex-runtime.test.ts
+++ b/packages/cli/test/context/llm/codex-runtime.test.ts
@ -130,6 +130,150 @@ describe('CodexKtxLlmRuntime', () => {
    ).rejects.toThrow('Codex structured output failed validation');
  });

+  it('reports Codex rate-limit failures and retries with opaque backoff', async () => {
+    const waitForReady = vi.fn().mockResolvedValue(undefined);
+    const report = vi.fn();
+    const fakeRunner = {
+      runStreamed: vi
+        .fn()
+        .mockResolvedValueOnce(events([{ type: 'turn.failed', error: { message: '429 rate limit exceeded' } }]))
+        .mockResolvedValueOnce(
+          events([
+            { type: 'turn.started' },
+            { type: 'item.completed', item: { type: 'agent_message', text: 'ok' } },
+            { type: 'turn.completed' },
+          ]),
+        ),
+    };
+    const runtime = new CodexKtxLlmRuntime({
+      projectDir: '/tmp/project',
+      modelSlots: { default: 'codex' },
+      runner: fakeRunner,
+      rateLimitGovernor: { waitForReady, report, maxRetryAttempts: () => 6 } as never,
+    });
+
+    await expect(runtime.generateText({ role: 'default', prompt: 'hello' })).resolves.toBe('ok');
+    expect(report).toHaveBeenCalledWith({ provider: 'codex', status: 'rejected', rateLimitType: 'opaque' });
+    expect(waitForReady).toHaveBeenCalledTimes(2);
+    expect(fakeRunner.runStreamed).toHaveBeenCalledTimes(2);
+  });
+
+  it('reports thrown Codex rate-limit failures and retries with opaque backoff', async () => {
+    const waitForReady = vi.fn().mockResolvedValue(undefined);
+    const report = vi.fn();
+    const fakeRunner = {
+      runStreamed: vi
+        .fn()
+        .mockRejectedValueOnce(new Error('ThreadError: 429 rate limit exceeded'))
+        .mockResolvedValueOnce(
+          events([
+            { type: 'turn.started' },
+            { type: 'item.completed', item: { type: 'agent_message', text: 'ok' } },
+            { type: 'turn.completed' },
+          ]),
+        ),
+    };
+    const runtime = new CodexKtxLlmRuntime({
+      projectDir: '/tmp/project',
+      modelSlots: { default: 'codex' },
+      runner: fakeRunner,
+      rateLimitGovernor: { waitForReady, report, maxRetryAttempts: () => 6 } as never,
+    });
+
+    await expect(runtime.generateText({ role: 'default', prompt: 'hello' })).resolves.toBe('ok');
+
+    expect(report).toHaveBeenCalledWith({ provider: 'codex', status: 'rejected', rateLimitType: 'opaque' });
+    expect(waitForReady).toHaveBeenCalledTimes(2);
+    expect(fakeRunner.runStreamed).toHaveBeenCalledTimes(2);
+  });
+
+  it('surfaces Codex rate-limit failures without retrying when no governor is present', async () => {
+    const fakeRunner = runner([{ type: 'turn.failed', error: { message: '429 rate limit exceeded' } }]);
+    const runtime = new CodexKtxLlmRuntime({
+      projectDir: '/tmp/project',
+      modelSlots: { default: 'codex' },
+      runner: fakeRunner,
+    });
+
+    await expect(runtime.generateText({ role: 'default', prompt: 'hello' })).rejects.toThrow(/rate limit/i);
+    expect(fakeRunner.runStreamed).toHaveBeenCalledTimes(1);
+  });
+
+  it('passes abort signals into Codex text generation and governor waits', async () => {
+    const controller = new AbortController();
+    const waitForReady = vi.fn().mockResolvedValue(undefined);
+    let observedSignal: AbortSignal | undefined;
+    const fakeRunner = {
+      runStreamed: vi.fn(async (input: { signal?: AbortSignal }) => {
+        observedSignal = input.signal;
+        return events([
+          { type: 'turn.started' },
+          { type: 'item.completed', item: { type: 'agent_message', text: 'ok' } },
+          { type: 'turn.completed' },
+        ]);
+      }),
+    };
+    const runtime = new CodexKtxLlmRuntime({
+      projectDir: '/tmp/project',
+      modelSlots: { default: 'codex' },
+      runner: fakeRunner,
+      rateLimitGovernor: { waitForReady, report: vi.fn(), maxRetryAttempts: () => 6 } as never,
+    });
+
+    await expect(runtime.generateText({ role: 'default', prompt: 'hello', abortSignal: controller.signal })).resolves.toBe('ok');
+
+    expect(waitForReady).toHaveBeenCalledWith(controller.signal);
+    expect(observedSignal).toBe(controller.signal);
+  });
+
+  it('links the parent abort signal into Codex agent-loop streamed runs', async () => {
+    const controller = new AbortController();
+    let releaseStream!: () => void;
+    const streamRelease = new Promise<void>((resolve) => {
+      releaseStream = resolve;
+    });
+    let markRunnerCalled!: () => void;
+    const runnerCalled = new Promise<void>((resolve) => {
+      markRunnerCalled = resolve;
+    });
+    let observedSignal: AbortSignal | undefined;
+    const fakeRunner = {
+      runStreamed: vi.fn(async (input: { signal?: AbortSignal }) => {
+        observedSignal = input.signal;
+        markRunnerCalled();
+        return (async function* () {
+          await streamRelease;
+          yield { type: 'turn.started' };
+          yield { type: 'item.completed', item: { type: 'agent_message', text: 'ok' } };
+          yield { type: 'turn.completed' };
+        })();
+      }),
+    };
+    const runtime = new CodexKtxLlmRuntime({
+      projectDir: '/tmp/project',
+      modelSlots: { default: 'codex' },
+      runner: fakeRunner,
+    });
+
+    const pending = runtime.runAgentLoop({
+      modelRole: 'default',
+      systemPrompt: '',
+      userPrompt: '',
+      toolSet: {},
+      stepBudget: 10,
+      telemetryTags: {},
+      abortSignal: controller.signal,
+    });
+
+    await runnerCalled;
+    expect(observedSignal).toBeDefined();
+    expect(observedSignal).not.toBe(controller.signal);
+    controller.abort();
+    expect(observedSignal?.aborted).toBe(true);
+    releaseStream();
+    await expect(pending).resolves.toMatchObject({ stopReason: 'natural' });
+  });
+
  it('starts and closes a temporary MCP server for tool-backed agent loops', async () => {
    const close = vi.fn(async () => undefined);
    const startMcpServer = vi.fn(async () => ({