feat(cli): add ingest LLM rate-limit governor with paced retries (#261)

* feat(cli): add ingest rate limit governor * feat(cli): wire ingest rate-limit config * feat(cli): report provider rate-limit signals * feat(cli): show ingest rate-limit waits * fix(cli): complete rate-limit event coverage * fix(cli): abort ingest provider calls cleanly * fix(cli): propagate ingest cancellation * fix(cli): reject pre-aborted ingest rate-limit waits * fix(cli): honor Claude rate-limit reset waits * fix(cli): retry thrown Codex rate-limit failures * fix(cli): type Claude rate-limit result details * fix(cli): emit ingest rate-limit countdowns from rejected signals * fix(cli): report ai sdk rate-limit header utilization * fix(cli): gate LLM rate-limit retries on the governor budget The AI SDK and Codex runtimes retried 429 / opaque rate-limit failures up to 6-7 times with no backoff when constructed without a RateLimitGovernor (scan, memory, setup) or with pacing disabled, ignoring Retry-After and worsening the limit. The outer retry loop only cooperates with the governor's pause, so without active pacing there is no backoff to apply. Route the retry bound through a single source: RateLimitGovernor.maxRetryAttempts(), which returns retry.maxAttempts when enabled and 1 (no outer retry) when absent or disabled. All three runtimes (ai-sdk, codex, claude-code) now use it, so ingest.rateLimit.retry.maxAttempts genuinely controls attempts and the hard-coded 6 (plus Codex's off-by-one extra attempt) is gone. Backend-native retry (e.g. the AI SDK's maxRetries) still handles transient 429s. Also correct the ktx.yaml docs for maxWaitMs (caps each wait, not the whole run) and maxAttempts, and sync uv.lock ktx-sl/ktx-daemon to 0.9.0.
2026-07-25 12:01:03 +02:00 · 2026-06-05 12:10:27 +02:00 · 2026-06-05 12:10:27 +02:00 · c3d8cedb0b
commit c3d8cedb0b
parent 5a8821073b
35 changed files with 2336 additions and 72 deletions
--- a/packages/cli/src/context/core/abort.ts
+++ b/packages/cli/src/context/core/abort.ts
@ -0,0 +1,39 @@
+/**
+ * Builds the exception this module uses to represent a cancelled operation.
+ *
+ * @internal
+ * @param message - Text carried by the exception; defaults to `'Aborted'`.
+ * @returns A `DOMException` whose `name` is `'AbortError'`.
+ */
+export function createAbortError(message = 'Aborted'): DOMException {
+  const abortError = new DOMException(message, 'AbortError');
+  return abortError;
+}
+
+/**
+ * Reports whether a thrown value represents a cancellation.
+ *
+ * A value counts as an abort when it is an object whose `name` is
+ * `'AbortError'` (this covers `DOMException` abort errors) or whose `code` is
+ * `'ABORT_ERR'`. Primitives, `null` and `undefined` are never aborts.
+ *
+ * @param error - Any caught value.
+ * @returns `true` when the value looks like an abort error.
+ */
+export function isAbortError(error: unknown): boolean {
+  if (typeof error !== 'object' || error === null) {
+    return false;
+  }
+  const candidate = error as { name?: unknown; code?: unknown };
+  if (candidate.name === 'AbortError') {
+    return true;
+  }
+  return candidate.code === 'ABORT_ERR';
+}
+
+/**
+ * Throws an abort error when the given signal has already been aborted.
+ *
+ * @internal
+ * @param signal - Optional signal to inspect; a missing signal never throws.
+ * @throws The error produced by `createAbortError()` when `signal.aborted` is set.
+ */
+export function throwIfAborted(signal?: AbortSignal): void {
+  if (!signal || !signal.aborted) {
+    return;
+  }
+  throw createAbortError();
+}
+
+/**
+ * Creates a fresh `AbortController` that aborts whenever `parent` aborts.
+ *
+ * - Without a parent, the controller is independent and `dispose` is a no-op.
+ * - With an already-aborted parent, the controller is aborted immediately.
+ * - Otherwise a one-shot `abort` listener is attached to the parent; call
+ *   `dispose` to detach it once the linked work is finished.
+ *
+ * The controller is always aborted with the error from `createAbortError()`.
+ *
+ * @param parent - Optional upstream signal to follow.
+ * @returns The linked controller and a function that removes the parent listener.
+ */
+export function linkAbortSignal(parent?: AbortSignal): { controller: AbortController; dispose: () => void } {
+  const controller = new AbortController();
+  const noop = (): void => undefined;
+  const forwardAbort = (): void => controller.abort(createAbortError());
+
+  if (!parent) {
+    return { controller, dispose: noop };
+  }
+  if (parent.aborted) {
+    forwardAbort();
+    return { controller, dispose: noop };
+  }
+
+  parent.addEventListener('abort', forwardAbort, { once: true });
+  const dispose = (): void => parent.removeEventListener('abort', forwardAbort);
+  return { controller, dispose };
+}
--- a/packages/cli/src/context/ingest/context-candidates/curator-pagination.service.ts
+++ b/packages/cli/src/context/ingest/context-candidates/curator-pagination.service.ts
@ -40,6 +40,7 @@ export interface CuratorPaginationInput {
  buildToolSet: (passNumber: number) => KtxRuntimeToolSet;
  getReconciliationActions: () => MemoryAction[];
  onStepFinish?: (info: { passNumber: number; stepIndex: number; stepBudget: number }) => void;
+  abortSignal?: AbortSignal;
 }

 interface CuratorPaginationResult extends ReconciliationOutcome {
@ -243,6 +244,7 @@ export class CuratorPaginationService implements CuratorPaginationPort {
      sourceKey: params.input.sourceKey,
      jobId: params.input.jobId,
      forceRun: params.forceRun,
+      abortSignal: params.input.abortSignal,
      onStepFinish: params.input.onStepFinish
        ? ({ stepIndex, stepBudget }) =>
            params.input.onStepFinish?.({ passNumber: params.passNumber, stepIndex, stepBudget })
--- a/packages/cli/src/context/ingest/final-gate-repair.ts
+++ b/packages/cli/src/context/ingest/final-gate-repair.ts
@ -21,6 +21,7 @@ export interface RepairFinalGateFailureInput {
  repairKind: FinalGateRepairKind;
  maxAttempts?: number;
  stepBudget?: number;
+  abortSignal?: AbortSignal;
 }

 const readRepairFileSchema = z.object({
@ -200,6 +201,7 @@ export async function repairFinalGateFailure(
          jobId: input.trace.context.jobId,
          repairKind: input.repairKind,
        },
+        abortSignal: input.abortSignal,
      }),
    );

--- a/packages/cli/src/context/ingest/ingest-bundle.runner.ts
+++ b/packages/cli/src/context/ingest/ingest-bundle.runner.ts
@ -3,6 +3,7 @@ import { dirname, join } from 'node:path';
 import pLimit from 'p-limit';
 import { z } from 'zod';
 import { type KtxLogger, noopLogger } from '../../context/core/config.js';
+import type { RateLimitWaitState } from '../../context/llm/rate-limit-governor.js';
 import { createRuntimeToolDescriptorFromAiTool } from '../../context/llm/runtime-tools.js';
 import type { KtxRuntimeToolSet } from '../../context/llm/runtime-port.js';
 import type { CaptureSession, MemoryAction } from '../../context/memory/types.js';
@ -219,6 +220,10 @@ export class IngestBundleRunner {
  }

  async run(job: IngestBundleJob, ctx?: IngestJobContext): Promise<IngestBundleResult> {
+    const unsubscribeRateLimitGovernor = this.subscribeRateLimitGovernor({
+      trace: this.createTrace(job),
+      memoryFlow: ctx?.memoryFlow,
+    });
    const key = job.connectionId;
    const previous = this.chainByConnection.get(key);
    if (previous) {
@ -241,10 +246,72 @@ export class IngestBundleRunner {
      ctx?.memoryFlow?.finish('error', [sanitizeMemoryFlowError(error)]);
      throw error;
    } finally {
+      unsubscribeRateLimitGovernor();
      await this.maybeEmitIngestProfile(job.jobId);
    }
  }

+  /**
+   * Renders the progress message shown while a rate-limit wait is pending.
+   *
+   * The remaining time is rounded up to whole seconds and printed as `<m>m<ss>s`
+   * once at least one full minute remains, otherwise as `<s>s`. The rate-limit
+   * type is appended after the provider name when the state carries one.
+   *
+   * @param state - A wait-related governor state.
+   * @returns The human-readable status line.
+   */
+  private formatRateLimitWait(
+    state: Extract<RateLimitWaitState, { kind: 'wait_tick' | 'wait_started' | 'wait_finished' }>,
+  ): string {
+    const totalSeconds = Math.ceil(state.remainingMs / 1_000);
+    const wholeMinutes = Math.floor(totalSeconds / 60);
+    let duration: string;
+    if (wholeMinutes > 0) {
+      const leftoverSeconds = totalSeconds % 60;
+      duration = `${wholeMinutes}m${String(leftoverSeconds).padStart(2, '0')}s`;
+    } else {
+      duration = `${totalSeconds}s`;
+    }
+    const limitLabel = state.rateLimitType ? ` ${state.rateLimitType}` : '';
+    return `Rate-limited (${state.provider}${limitLabel}); resuming in ${duration}; Ctrl+C to stop`;
+  }
+
+  /**
+   * Forwards rate-limit governor events to the ingest trace and memory flow.
+   *
+   * Every governor state is written to the trace as an `info` event in the
+   * `rate_limit` phase, named after the state's own `kind`. For `wait_tick`
+   * and `wait_started` states, a `rate_limit_wait` event and a transient
+   * `stage_progress` message are additionally emitted on the memory flow
+   * (when one is provided).
+   *
+   * @param input - The trace writer and optional memory-flow sink to notify.
+   * @returns The value returned by `governor.subscribe` (used by the caller
+   *   to unsubscribe), or a no-op when no governor is configured.
+   */
+  private subscribeRateLimitGovernor(input: {
+    trace: IngestTraceWriter;
+    memoryFlow?: MemoryFlowEventSink;
+  }): () => void {
+    const governor = this.deps.settings.rateLimitGovernor;
+    if (!governor) {
+      return () => undefined;
+    }
+    return governor.subscribe((state: RateLimitWaitState) => {
+      // Trace writes are fire-and-forget; the event name is the state kind.
+      void input.trace.event('info', 'rate_limit', state.kind, { ...state });
+      if (state.kind !== 'wait_tick' && state.kind !== 'wait_started') {
+        return;
+      }
+      const sink = input.memoryFlow;
+      if (!sink) {
+        return;
+      }
+      sink.emit({
+        type: 'rate_limit_wait',
+        provider: state.provider,
+        ...(state.rateLimitType ? { rateLimitType: state.rateLimitType } : {}),
+        resumeAtMs: state.resumeAtMs,
+        remainingMs: state.remainingMs,
+      });
+      sink.emit({
+        type: 'stage_progress',
+        stage: 'integration',
+        percent: 50,
+        message: this.formatRateLimitWait(state),
+        transient: true,
+      });
+    });
+  }
+
+  /**
+   * Runs `fn` while holding a work slot from the rate-limit governor.
+   *
+   * Without a configured governor, `fn` runs directly. Otherwise the slot is
+   * acquired first (passing `abortSignal` to the governor) and released in a
+   * `finally`, so it is returned whether `fn` resolves or rejects.
+   *
+   * @param abortSignal - Signal handed to `acquireWorkSlot`.
+   * @param fn - The work to run inside the slot.
+   * @returns Whatever `fn` resolves to.
+   */
+  private async withRateLimitWorkSlot<T>(abortSignal: AbortSignal | undefined, fn: () => Promise<T>): Promise<T> {
+    const governor = this.deps.settings.rateLimitGovernor;
+    if (governor) {
+      const releaseSlot = await governor.acquireWorkSlot(abortSignal);
+      try {
+        return await fn();
+      } finally {
+        releaseSlot();
+      }
+    }
+    return fn();
+  }
+
  /**
   * When profiling is enabled — via the `KTX_PROFILE_INGEST` env var or the
   * `ingest.profile` config setting — read the job's trace + tool transcripts
@ -877,6 +944,7 @@ export class IngestBundleRunner {
    includeContextEvidenceTools: boolean;
    currentTableExists(tableRef: string): Promise<boolean>;
    memoryFlow?: MemoryFlowEventSink;
+    abortSignal?: AbortSignal;
    wuSkillNames: string[];
    onStepFinish?: (info: { stepIndex: number; stepBudget: number }) => void;
  }): Promise<WorkUnitOutcome> {
@ -1029,6 +1097,7 @@ export class IngestBundleRunner {
        jobId: input.job.jobId,
        toolFailureCount: (unitKey) => input.transcriptSummaries.get(unitKey)?.fatalErrorCount ?? 0,
        onStepFinish: input.onStepFinish,
+        abortSignal: input.abortSignal,
      },
      input.wu,
    );
@ -1524,7 +1593,8 @@ export class IngestBundleRunner {
        try {
          await Promise.all(
            workUnits.map((wu, index) =>
-              limitWorkUnit(async () => {
+              limitWorkUnit(() =>
+                this.withRateLimitWorkSlot(ctx?.abortSignal, async () => {
                const outcome = await runIsolatedWorkUnit({
                  unitIndex: index,
                  ingestionBaseSha,
@ -1532,6 +1602,7 @@ export class IngestBundleRunner {
                  patchDir,
                  trace: runTrace,
                  workUnit: wu,
+                  abortSignal: ctx?.abortSignal,
                  afterSuccess: (child) => copyTransientIngestEvidence(child.workdir, sessionWorktree.workdir),
                  run: async (child) => {
                    const scopedWikiService = this.deps.wikiService.forWorktree(child.workdir);
@ -1565,6 +1636,7 @@ export class IngestBundleRunner {
                      includeContextEvidenceTools: adapter.evidenceIndexing === 'documents' && !!contextReport,
                      currentTableExists: (tableRef) =>
                        this.tableRefExistsInSemanticLayer(scopedSemanticLayerService, slConnectionIds, tableRef),
+                      abortSignal: ctx?.abortSignal,
                      memoryFlow,
                      wuSkillNames,
                      onStepFinish: ({ stepIndex, stepBudget }) => {
@ -1594,7 +1666,8 @@ export class IngestBundleRunner {
                  completedWorkUnits / workUnits.length,
                  `${completedWorkUnits} of ${workUnits.length} work units complete`,
                );
-              }),
+                }),
+              ),
            ),
          );
        } catch (error) {
@ -1693,6 +1766,7 @@ export class IngestBundleRunner {
                reason: context.reason,
                maxAttempts: 1,
                stepBudget: 12,
+                abortSignal: ctx?.abortSignal,
              });
              emitStageProgress(
                'integration',
@ -1714,6 +1788,7 @@ export class IngestBundleRunner {
                repairKind: 'patch_semantic_gate',
                maxAttempts: 1,
                stepBudget: 16,
+                abortSignal: ctx?.abortSignal,
              });
              emitStageProgress(
                'integration',
@ -1993,6 +2068,7 @@ export class IngestBundleRunner {
                );
              }
            : undefined,
+          abortSignal: ctx?.abortSignal,
        });
        curatorReport = curatorOutcome.report;
        curatorWarnings = curatorOutcome.warnings;
@ -2038,6 +2114,7 @@ export class IngestBundleRunner {
          sourceKey: job.sourceKey,
          jobId: job.jobId,
          force: !!overrideReport,
+          abortSignal: ctx?.abortSignal,
          onStepFinish: stage4
            ? ({ stepIndex, stepBudget }) => {
                emitStageProgress('reconciliation', 85, `Reconciling results: step ${stepIndex}/${stepBudget}`, {
@ -2470,6 +2547,7 @@ export class IngestBundleRunner {
          repairKind: 'final_artifact_gate',
          maxAttempts: 1,
          stepBudget: 16,
+          abortSignal: ctx?.abortSignal,
        });

        isolatedDiffSummary.gateRepairAttempts += gateRepair.attempts;
--- a/packages/cli/src/context/ingest/isolated-diff/textual-conflict-resolver.ts
+++ b/packages/cli/src/context/ingest/isolated-diff/textual-conflict-resolver.ts
@ -19,6 +19,7 @@ export interface ResolveTextualConflictInput {
  reason: string;
  maxAttempts?: number;
  stepBudget?: number;
+  abortSignal?: AbortSignal;
 }

 const readIntegrationFileSchema = z.object({
@ -208,6 +209,7 @@ export async function resolveTextualConflict(
          jobId: input.trace.context.jobId,
          unitKey: input.unitKey,
        },
+        abortSignal: input.abortSignal,
      }),
    );

--- a/packages/cli/src/context/ingest/isolated-diff/work-unit-executor.ts
+++ b/packages/cli/src/context/ingest/isolated-diff/work-unit-executor.ts
@ -14,6 +14,7 @@ export interface RunIsolatedWorkUnitInput {
  patchDir: string;
  trace: IngestTraceWriter;
  workUnit: WorkUnit;
+  abortSignal?: AbortSignal;
  run(child: IngestSessionWorktree): Promise<WorkUnitOutcome>;
  afterSuccess?(child: IngestSessionWorktree): Promise<void>;
 }
--- a/packages/cli/src/context/ingest/local-bundle-runtime.ts
+++ b/packages/cli/src/context/ingest/local-bundle-runtime.ts
@ -12,6 +12,7 @@ import type { KtxSemanticLayerComputePort } from '../../context/daemon/semantic-
 import { createRuntimeToolDescriptorFromAiTool } from '../../context/llm/runtime-tools.js';
 import { createLocalKtxLlmRuntimeFromConfig } from '../../context/llm/local-config.js';
 import { KtxIngestEmbeddingPortAdapter } from '../../context/llm/embedding-port.js';
+import { createRateLimitGovernorConfig, RateLimitGovernor } from '../../context/llm/rate-limit-governor.js';
 import { RuntimeAgentRunner, type AgentRunnerPort, type KtxLlmRuntimePort, type KtxRuntimeToolSet } from '../../context/llm/runtime-port.js';
 import type { KtxEmbeddingProvider } from '../../llm/types.js';
 import type { KtxLocalProject } from '../../context/project/project.js';
@ -619,7 +620,7 @@ function localIngestLlmProviderGuardMessage(projectDir: string): string {
  ].join('\n');
 }

-function resolveAgentRunner(options: CreateLocalBundleIngestRuntimeOptions): {
+function resolveAgentRunner(options: CreateLocalBundleIngestRuntimeOptions, rateLimitGovernor: RateLimitGovernor): {
  agentRunner: AgentRunnerPort;
  llmRuntime?: KtxLlmRuntimePort;
 } {
@ -628,6 +629,7 @@ function resolveAgentRunner(options: CreateLocalBundleIngestRuntimeOptions): {
    (options.createLlmRuntime ?? createLocalKtxLlmRuntimeFromConfig)(options.project.config.llm, {
      projectDir: options.project.projectDir,
      env: process.env,
+      rateLimitGovernor,
    }) ??
    undefined;

@ -677,7 +679,13 @@ export function createLocalBundleIngestRuntime(
  const knowledgeIndex = new LocalKnowledgeIndex(options.project, embedding);
  const knowledgeEvents = new NoopKnowledgeEventPort();
  const wikiService = new KnowledgeWikiService(rootFileStore, embedding, knowledgeIndex, options.project.git, logger);
-  const { agentRunner, llmRuntime } = resolveAgentRunner(options);
+  const rateLimitGovernor = new RateLimitGovernor(
+    createRateLimitGovernorConfig({
+      ...options.project.config.ingest.rateLimit,
+      maxConcurrency: options.project.config.ingest.workUnits.maxConcurrency,
+    }),
+  );
+  const { agentRunner, llmRuntime } = resolveAgentRunner(options, rateLimitGovernor);
  const promptService = new PromptService({ promptsDir, partials: [], logger });
  const storage = new LocalIngestStorage(options.project);
  const registry = registerAdapters(options.adapters);
@ -717,6 +725,7 @@ export function createLocalBundleIngestRuntime(
      workUnitMaxConcurrency: options.project.config.ingest.workUnits.maxConcurrency,
      workUnitStepBudget: options.project.config.ingest.workUnits.stepBudget,
      workUnitFailureMode: options.project.config.ingest.workUnits.failureMode,
+      rateLimitGovernor,
      profileIngest: options.project.config.ingest.profile,
      ingestTraceLevel: ingestTraceLevelFromEnv(),
    },
--- a/packages/cli/src/context/ingest/local-ingest.ts
+++ b/packages/cli/src/context/ingest/local-ingest.ts
@ -3,6 +3,7 @@ import { cp, mkdir, rm } from 'node:fs/promises';
 import { isAbsolute, resolve } from 'node:path';
 import type { KtxSqlQueryExecutorPort } from '../../context/connections/query-executor.js';
 import type { KtxLogger } from '../../context/core/config.js';
+import { createAbortError, isAbortError } from '../../context/core/abort.js';
 import type { KtxSemanticLayerComputePort } from '../../context/daemon/semantic-layer-compute.js';
 import type { AgentRunnerPort, KtxLlmRuntimePort } from '../../context/llm/runtime-port.js';
 import type { KtxLocalProject } from '../../context/project/project.js';
@ -36,6 +37,7 @@ export interface RunLocalIngestOptions {
  queryExecutor?: KtxSqlQueryExecutorPort;
  logger?: KtxLogger;
  embeddingProvider?: import('../../llm/types.js').KtxEmbeddingProvider | null;
+  abortSignal?: AbortSignal;
 }

 export interface LocalIngestResult {
@ -123,10 +125,11 @@ function findAdapter(adapters: SourceAdapter[], source: string): SourceAdapter {
  return adapter;
 }

-function localJobContext(jobId: string, memoryFlow?: MemoryFlowEventSink): IngestJobContext {
+function localJobContext(jobId: string, memoryFlow?: MemoryFlowEventSink, abortSignal?: AbortSignal): IngestJobContext {
  return {
    jobId,
    ...(memoryFlow ? { memoryFlow } : {}),
+    ...(abortSignal ? { abortSignal } : {}),
    startPhase() {
      return new LocalIngestPhase();
    },
@ -158,6 +161,7 @@ async function runScheduledPullJob(options: {
  queryExecutor?: KtxSqlQueryExecutorPort;
  logger?: KtxLogger;
  embeddingProvider?: import('../../llm/types.js').KtxEmbeddingProvider | null;
+  abortSignal?: AbortSignal;
 }): Promise<LocalIngestResult> {
  const runtime = createLocalBundleIngestRuntime(options);
  const jobId = options.jobId ?? runtime.nextJobId();
@ -169,7 +173,7 @@ async function runScheduledPullJob(options: {
      trigger: options.trigger ?? 'manual_resync',
      bundleRef: { kind: 'scheduled_pull', config: options.pullConfig },
    },
-    localJobContext(jobId, options.memoryFlow),
+    localJobContext(jobId, options.memoryFlow, options.abortSignal),
  );
  const report = await runtime.store.findByJobId(jobId);
  if (!report) {
@ -212,6 +216,7 @@ export async function runLocalIngest(options: RunLocalIngestOptions): Promise<Lo
      queryExecutor: options.queryExecutor,
      logger: options.logger,
      embeddingProvider: options.embeddingProvider,
+      abortSignal: options.abortSignal,
    });
  }

@ -223,7 +228,7 @@ export async function runLocalIngest(options: RunLocalIngestOptions): Promise<Lo
      trigger: options.trigger ?? (options.sourceDir ? 'upload' : 'manual_resync'),
      bundleRef,
    },
-    localJobContext(jobId, options.memoryFlow),
+    localJobContext(jobId, options.memoryFlow, options.abortSignal),
  );
  const report = await runtime.store.findByJobId(jobId);
  if (!report) {
@ -362,6 +367,9 @@ export async function runLocalMetabaseIngest(

  const children: LocalMetabaseFanoutChild[] = [];
  for (const childPlan of childPlans) {
+    if (options.abortSignal?.aborted) {
+      throw createAbortError();
+    }
    const targetConnectionId = safeSegment('target connection id', childPlan.targetConnectionId);
    if (!options.project.config.connections[targetConnectionId]) {
      throw new Error(`Target connection "${targetConnectionId}" is not configured in ktx.yaml`);
@ -391,8 +399,12 @@ export async function runLocalMetabaseIngest(
        queryExecutor: options.queryExecutor,
        logger: options.logger,
        embeddingProvider: options.embeddingProvider,
+        abortSignal: options.abortSignal,
      });
    } catch (error) {
+      if (isAbortError(error)) {
+        throw error;
+      }
      child = await recordLocalMetabaseChildFailure({
        project: options.project,
        jobId: childJobId,
--- a/packages/cli/src/context/ingest/memory-flow/schema.ts
+++ b/packages/cli/src/context/ingest/memory-flow/schema.ts
@ -70,6 +70,13 @@ const memoryFlowEventSchema = z.discriminatedUnion('type', [
    message: z.string().min(1),
    transient: z.boolean().optional(),
  }),
+  eventSchema({
+    type: z.literal('rate_limit_wait'),
+    provider: z.string(),
+    rateLimitType: z.string().optional(),
+    resumeAtMs: z.number().int().nonnegative(),
+    remainingMs: z.number().int().nonnegative(),
+  }),
  eventSchema({
    type: z.literal('work_unit_started'),
    unitKey: z.string().min(1),
--- a/packages/cli/src/context/ingest/memory-flow/types.ts
+++ b/packages/cli/src/context/ingest/memory-flow/types.ts
@ -60,6 +60,13 @@ type MemoryFlowEventPayload =
      message: string;
      transient?: boolean;
    }
+  | {
+      type: 'rate_limit_wait';
+      provider: string;
+      rateLimitType?: string;
+      resumeAtMs: number;
+      remainingMs: number;
+    }
  | {
      type: 'work_unit_started';
      unitKey: string;
--- a/packages/cli/src/context/ingest/ports.ts
+++ b/packages/cli/src/context/ingest/ports.ts
@ -5,6 +5,7 @@ import type { KtxFileStorePort } from '../../context/core/file-store.js';
 import type { KtxLogger } from '../../context/core/config.js';
 import type { SessionOutcome } from '../../context/core/session-worktree.service.js';
 import type { AgentRunnerPort, KtxLlmRuntimePort, KtxRuntimeToolSet } from '../../context/llm/runtime-port.js';
+import type { RateLimitGovernor } from '../llm/rate-limit-governor.js';
 import type { MemoryAction, MemoryKnowledgeSlRefsPort } from '../../context/memory/types.js';
 import type { PromptService } from '../../context/prompts/prompt.service.js';
 import type { SkillsRegistryService } from '../../context/skills/skills-registry.service.js';
@ -144,6 +145,7 @@ interface IngestSettingsPort {
  workUnitMaxConcurrency?: number;
  workUnitStepBudget?: number;
  workUnitFailureMode?: 'abort' | 'continue';
+  rateLimitGovernor?: RateLimitGovernor;
  /** Print a timing breakdown to stderr at the end of each run (config-driven; see also KTX_PROFILE_INGEST). `'json'` emits the raw structured profile. */
  profileIngest?: boolean | 'json';
  ingestTraceLevel?: IngestTraceLevel;
@ -323,6 +325,7 @@ export interface CuratorPaginationPort {
    buildToolSet: (passNumber: number) => KtxRuntimeToolSet;
    getReconciliationActions: () => MemoryAction[];
    onStepFinish?: (info: { passNumber: number; stepIndex: number; stepBudget: number }) => void;
+    abortSignal?: AbortSignal;
  }): Promise<ReconciliationOutcome & { report: CuratorPaginationReport; warnings: string[] }>;
 }

--- a/packages/cli/src/context/ingest/stages/stage-3-work-units.ts
+++ b/packages/cli/src/context/ingest/stages/stage-3-work-units.ts
@ -1,4 +1,5 @@
 import type { KtxModelRole } from '../../../llm/types.js';
+import { isAbortError } from '../../core/abort.js';
 import type { AgentRunnerPort, KtxRuntimeToolSet, RunLoopMetrics } from '../../../context/llm/runtime-port.js';
 import type { CaptureSession, MemoryAction } from '../../../context/memory/types.js';
 import { listTouchedSlSources, type TouchedSlSource } from '../../../context/tools/touched-sl-sources.js';
@ -28,6 +29,7 @@ export interface WorkUnitExecutionDeps {
  connectionId: string;
  jobId: string;
  onStepFinish?: (info: { stepIndex: number; stepBudget: number }) => void;
+  abortSignal?: AbortSignal;
  toolFailureCount?: (unitKey: string) => number;
 }

@ -106,8 +108,12 @@ export async function executeWorkUnit(deps: WorkUnitExecutionDeps, wu: WorkUnit)
        jobId: deps.jobId,
      },
      onStepFinish: deps.onStepFinish,
+      abortSignal: deps.abortSignal,
    });
  } catch (error) {
+    if (isAbortError(error)) {
+      throw error;
+    }
    return failWithResetFromCurrentHead(error instanceof Error ? error.message : String(error));
  }

--- a/packages/cli/src/context/ingest/stages/stage-4-reconciliation.ts
+++ b/packages/cli/src/context/ingest/stages/stage-4-reconciliation.ts
@ -16,6 +16,7 @@ export interface ReconciliationContext {
  jobId: string;
  force?: boolean;
  onStepFinish?: (info: { stepIndex: number; stepBudget: number }) => void;
+  abortSignal?: AbortSignal;
  forceRun?: boolean;
 }

@ -40,6 +41,7 @@ export async function runReconciliationStage4(ctx: ReconciliationContext): Promi
    stepBudget: ctx.stepBudget,
    telemetryTags: { operationName: 'ingest-bundle-reconcile', source: ctx.sourceKey, jobId: ctx.jobId },
    onStepFinish: ctx.onStepFinish,
+    abortSignal: ctx.abortSignal,
  });
  return { skipped: false, stopReason: run.stopReason, error: run.error, ...(run.metrics ? { metrics: run.metrics } : {}) };
 }
--- a/packages/cli/src/context/ingest/types.ts
+++ b/packages/cli/src/context/ingest/types.ts
@ -220,5 +220,6 @@ export interface IngestJobPhase {
 export interface IngestJobContext {
  jobId: string;
  memoryFlow?: MemoryFlowEventSink;
+  abortSignal?: AbortSignal;
  startPhase(weight: number): IngestJobPhase;
 }
--- a/packages/cli/src/context/llm/ai-sdk-runtime.ts
+++ b/packages/cli/src/context/llm/ai-sdk-runtime.ts
@ -3,7 +3,9 @@ import type { KtxLlmProvider } from '../../llm/types.js';
 import { generateText, Output, stepCountIs, type FlexibleSchema, type TelemetrySettings, type ToolSet } from 'ai';
 import type { z } from 'zod';
 import { noopLogger, type KtxLogger } from '../../context/core/config.js';
+import { isAbortError } from '../core/abort.js';
 import { summarizeKtxLlmDebugRequest, type KtxLlmDebugRequestRecorder } from './debug-request-recorder.js';
+import type { RateLimitGovernor, RateLimitProvider, RateLimitSignal } from './rate-limit-governor.js';
 import { createAiSdkToolSet } from './runtime-tools.js';
 import type {
  KtxGenerateObjectInput,
@ -40,12 +42,129 @@ export interface AiSdkKtxLlmRuntimeDeps {
  telemetry?: AgentTelemetryPort;
  logger?: KtxLogger;
  debugRequestRecorder?: KtxLlmDebugRequestRecorder;
+  rateLimitGovernor?: Pick<RateLimitGovernor, 'waitForReady' | 'report' | 'maxRetryAttempts'>;
 }

 function hasTools(tools: Record<string, unknown>): boolean {
  return Object.keys(tools).length > 0;
 }

+/**
+ * Maps a model object to the rate-limit provider label reported to the governor.
+ *
+ * Reads the model's `provider` string (empty when absent): anything containing
+ * `vertex` or `google` is reported as `'vertex'`, everything else as
+ * `'anthropic-api'`.
+ *
+ * @param model - The model handle returned by the LLM provider.
+ * @returns The provider label used for rate-limit signals.
+ */
+function modelProviderName(model: unknown): RateLimitProvider {
+  const providerId = (model as { provider?: string }).provider ?? '';
+  const isGoogleHosted = ['vertex', 'google'].some((needle) => providerId.includes(needle));
+  return isGoogleHosted ? 'vertex' : 'anthropic-api';
+}
+
+/** Describes one limit/remaining response-header pair and the label reported for it. */
+interface HeaderLimitPair {
+  /** Name of the header whose value is read as the quota limit. */
+  limit: string;
+  /** Name of the header whose value is read as the remaining quota. */
+  remaining: string;
+  /** Label attached to the rate-limit signal derived from this pair. */
+  rateLimitType: string;
+}
+
+/**
+ * Response-header pairs inspected for rate-limit utilization.
+ *
+ * Covers the `anthropic-ratelimit-*` family (requests, tokens, input tokens,
+ * output tokens) and the generic `x-ratelimit-*` family (requests, tokens).
+ * All names are lowercase.
+ * NOTE(review): the `rpm`/`tpm`/`itpm`/`otpm` labels presumably stand for
+ * requests / tokens / input tokens / output tokens per minute — confirm
+ * against the governor's expectations.
+ */
+const RATE_LIMIT_HEADER_PAIRS: HeaderLimitPair[] = [
+  {
+    limit: 'anthropic-ratelimit-requests-limit',
+    remaining: 'anthropic-ratelimit-requests-remaining',
+    rateLimitType: 'rpm',
+  },
+  {
+    limit: 'anthropic-ratelimit-tokens-limit',
+    remaining: 'anthropic-ratelimit-tokens-remaining',
+    rateLimitType: 'tpm',
+  },
+  {
+    limit: 'anthropic-ratelimit-input-tokens-limit',
+    remaining: 'anthropic-ratelimit-input-tokens-remaining',
+    rateLimitType: 'itpm',
+  },
+  {
+    limit: 'anthropic-ratelimit-output-tokens-limit',
+    remaining: 'anthropic-ratelimit-output-tokens-remaining',
+    rateLimitType: 'otpm',
+  },
+  {
+    limit: 'x-ratelimit-limit-requests',
+    remaining: 'x-ratelimit-remaining-requests',
+    rateLimitType: 'rpm',
+  },
+  {
+    limit: 'x-ratelimit-limit-tokens',
+    remaining: 'x-ratelimit-remaining-tokens',
+    rateLimitType: 'tpm',
+  },
+];
+
+/**
+ * Converts a response-headers value into a plain string-to-string record.
+ *
+ * - Non-objects (including `null`/`undefined`) yield an empty record.
+ * - Objects exposing a `get` function (e.g. `Headers`-like) are queried only
+ *   for the names listed in `RATE_LIMIT_HEADER_PAIRS`; string results are kept
+ *   under those names.
+ * - Any other object is copied entry by entry, keeping string and number
+ *   values (stringified) under lowercased keys.
+ *
+ * @param headers - Raw headers value of unknown shape.
+ * @returns The normalized header record.
+ */
+function normalizeHeaders(headers: unknown): Record<string, string> {
+  if (typeof headers !== 'object' || headers === null) {
+    return {};
+  }
+  const getter = (headers as { get?: unknown }).get;
+  if (typeof getter !== 'function') {
+    const plainEntries = Object.entries(headers as Record<string, unknown>).flatMap(
+      ([name, raw]): Array<[string, string]> =>
+        typeof raw === 'string' || typeof raw === 'number' ? [[name.toLowerCase(), String(raw)]] : [],
+    );
+    return Object.fromEntries(plainEntries);
+  }
+  const picked: Record<string, string> = {};
+  for (const { limit: limitName, remaining: remainingName } of RATE_LIMIT_HEADER_PAIRS) {
+    const limitValue = getter.call(headers, limitName);
+    const remainingValue = getter.call(headers, remainingName);
+    if (typeof limitValue === 'string') {
+      picked[limitName] = limitValue;
+    }
+    if (typeof remainingValue === 'string') {
+      picked[remainingName] = remainingValue;
+    }
+  }
+  return picked;
+}
+
+/**
+ * Reads a header as a finite, non-negative number.
+ *
+ * A missing, empty, or whitespace-only value is treated as absent. Without
+ * this guard `Number('')` evaluates to `0`, so a blank "remaining" header
+ * would be read as zero quota left and reported as 100% utilization.
+ *
+ * @param headers - Normalized header record.
+ * @param key - Header name to look up.
+ * @returns The parsed value, or `undefined` when absent, blank, non-numeric,
+ *   non-finite, or negative.
+ */
+function numericHeader(headers: Record<string, string>, key: string): number | undefined {
+  const raw: unknown = headers[key];
+  if (raw === undefined || raw === null) {
+    return undefined;
+  }
+  const text = String(raw).trim();
+  if (text === '') {
+    return undefined;
+  }
+  const value = Number(text);
+  return Number.isFinite(value) && value >= 0 ? value : undefined;
+}
+
+/**
+ * Computes the used fraction of a quota from one limit/remaining header pair.
+ *
+ * @param headers - Normalized header record.
+ * @param pair - The header names to read.
+ * @returns `1 - remaining / limit`, with `remaining` capped at `limit`, or
+ *   `undefined` when either header is unusable or the limit is not positive.
+ */
+function utilizationForPair(headers: Record<string, string>, pair: HeaderLimitPair): number | undefined {
+  const limit = numericHeader(headers, pair.limit);
+  const remaining = numericHeader(headers, pair.remaining);
+  if (limit === undefined || limit <= 0) {
+    return undefined;
+  }
+  if (remaining === undefined) {
+    return undefined;
+  }
+  // Cap remaining at the limit so the result never goes below zero.
+  const available = Math.min(limit, remaining);
+  return 1 - available / limit;
+}
+
+/**
+ * Derives an "allowed" rate-limit signal from a successful call's response headers.
+ *
+ * Each pair in `RATE_LIMIT_HEADER_PAIRS` is evaluated in order and the one
+ * with the highest utilization wins (the earliest pair wins ties). The
+ * reported utilization is rounded to four decimal places.
+ *
+ * @param provider - Provider label to put on the signal.
+ * @param result - The call result; its `response.headers` are inspected.
+ * @returns The signal, or `undefined` when no pair yields a utilization.
+ */
+function aiSdkHeaderRateLimitSignal(provider: RateLimitProvider, result: unknown): RateLimitSignal | undefined {
+  const response = (result as { response?: { headers?: unknown } }).response;
+  const headers = normalizeHeaders(response?.headers);
+
+  let tightestType: string | undefined;
+  let tightestUtilization = 0;
+  for (const pair of RATE_LIMIT_HEADER_PAIRS) {
+    const utilization = utilizationForPair(headers, pair);
+    if (utilization === undefined) {
+      continue;
+    }
+    if (tightestType === undefined || utilization > tightestUtilization) {
+      tightestType = pair.rateLimitType;
+      tightestUtilization = utilization;
+    }
+  }
+
+  if (tightestType === undefined) {
+    return undefined;
+  }
+  return {
+    provider,
+    status: 'allowed',
+    rateLimitType: tightestType,
+    utilization: Number(tightestUtilization.toFixed(4)),
+  };
+}
+
+/**
+ * Parses retry hints from an error's `responseHeaders`.
+ *
+ * `retry-after-ms` (milliseconds) is preferred; otherwise `retry-after` is
+ * read as delta-seconds or, failing that, as an HTTP date relative to now.
+ * Header names are matched case-insensitively.
+ *
+ * @returns A positive delay in whole milliseconds, or `undefined`.
+ */
+function retryAfterMsFromResponseHeaders(headers: unknown): number | undefined {
+  if (headers === null || typeof headers !== 'object') {
+    return undefined;
+  }
+  let millisText: string | undefined;
+  let retryAfterText: string | undefined;
+  for (const [name, value] of Object.entries(headers as Record<string, unknown>)) {
+    if (typeof value !== 'string') {
+      continue;
+    }
+    const lowered = name.toLowerCase();
+    if (lowered === 'retry-after-ms') {
+      millisText = value.trim();
+    } else if (lowered === 'retry-after') {
+      retryAfterText = value.trim();
+    }
+  }
+  if (millisText) {
+    const millis = Number(millisText);
+    if (Number.isFinite(millis) && millis > 0) {
+      return Math.ceil(millis);
+    }
+  }
+  if (retryAfterText) {
+    const seconds = Number(retryAfterText);
+    if (Number.isFinite(seconds)) {
+      return seconds > 0 ? Math.ceil(seconds * 1_000) : undefined;
+    }
+    const resumeAt = Date.parse(retryAfterText);
+    if (Number.isFinite(resumeAt)) {
+      const delta = resumeAt - Date.now();
+      return delta > 0 ? delta : undefined;
+    }
+  }
+  return undefined;
+}
+
+/**
+ * Extracts the provider's requested retry delay from a failed call.
+ *
+ * Sources, in order, for the error itself and then for any error nested under
+ * `lastError` (the AI SDK's `RetryError` wraps the final failure there):
+ * 1. a numeric `retryAfter` property — values below 1000 are taken as seconds
+ *    and converted, larger values are taken as milliseconds;
+ * 2. the `retry-after-ms` / `retry-after` entries of `responseHeaders`, which
+ *    is where AI SDK API-call errors carry the server's hint.
+ *
+ * Non-object inputs (including a thrown `null`/`undefined`) yield `undefined`
+ * instead of raising a `TypeError`.
+ *
+ * @param error - The caught value.
+ * @returns The delay in milliseconds, or `undefined` when no usable hint exists.
+ */
+function retryAfterMs(error: unknown): number | undefined {
+  let current: unknown = error;
+  // Bounded walk so a self-referencing `lastError` cannot loop forever.
+  for (let depth = 0; depth < 3 && current !== null && typeof current === 'object'; depth += 1) {
+    const record = current as { retryAfter?: unknown; responseHeaders?: unknown; lastError?: unknown };
+    const value = record.retryAfter;
+    if (typeof value === 'number' && Number.isFinite(value) && value > 0) {
+      return value < 1_000 ? value * 1_000 : value;
+    }
+    const fromHeaders = retryAfterMsFromResponseHeaders(record.responseHeaders);
+    if (fromHeaders !== undefined) {
+      return fromHeaders;
+    }
+    current = record.lastError;
+  }
+  return undefined;
+}
+
+/**
+ * Reports whether a failed call was rejected for rate limiting.
+ *
+ * An error matches when its `name` is `'TooManyRequestsError'` or its
+ * `statusCode` / `status` is `429`. The same check is applied to any error
+ * nested under `lastError`: once the AI SDK exhausts its own `maxRetries` it
+ * throws a `RetryError` that carries the final 429 there rather than exposing
+ * a status code itself.
+ *
+ * Non-object inputs (including a thrown `null`/`undefined`) return `false`
+ * instead of raising a `TypeError`.
+ *
+ * @param error - The caught value.
+ * @returns `true` when the failure is a rate-limit rejection.
+ */
+function isAiSdkRateLimitError(error: unknown): boolean {
+  let current: unknown = error;
+  // Bounded walk so a self-referencing `lastError` cannot loop forever.
+  for (let depth = 0; depth < 3 && current !== null && typeof current === 'object'; depth += 1) {
+    const record = current as { name?: unknown; statusCode?: unknown; status?: unknown; lastError?: unknown };
+    if (record.name === 'TooManyRequestsError' || record.statusCode === 429 || record.status === 429) {
+      return true;
+    }
+    current = record.lastError;
+  }
+  return false;
+}
+
 export class AiSdkKtxLlmRuntime implements KtxLlmRuntimePort {
  private readonly logger: KtxLogger;

@ -53,6 +172,41 @@ export class AiSdkKtxLlmRuntime implements KtxLlmRuntimePort {
    this.logger = deps.logger ?? noopLogger;
  }

+  /**
+   * Runs a provider call under the rate-limit governor, retrying rate-limit
+   * rejections up to the governor's attempt budget.
+   *
+   * Each iteration first awaits `waitForReady` on the governor (if any), then
+   * invokes `run`. On success, utilization derived from the response headers is
+   * reported to the governor and the result is returned. On failure the error
+   * is rethrown unless it is a rate-limit error, is not an abort, and attempts
+   * remain; in that case a `rejected` signal is reported and the loop repeats.
+   *
+   * NOTE(review): header parsing and `report()` run inside the `try`, so an
+   * exception raised by either is handled by the `catch` like a provider failure.
+   *
+   * @param provider - Provider label attached to reported signals.
+   * @param abortSignal - Passed to the governor's `waitForReady`.
+   * @param run - Performs one provider call.
+   * @returns The result of the first successful `run`.
+   * @throws The original error for aborts, non-rate-limit failures, or once
+   *   the attempt budget is exhausted.
+   */
+  private async generateTextWithRateLimitRetry<T>(
+    provider: RateLimitProvider,
+    abortSignal: AbortSignal | undefined,
+    run: () => Promise<T>,
+  ): Promise<T> {
+    // maxRetryAttempts() returns 1 when no governor is present or pacing is
+    // disabled, so a 429 throws immediately instead of hammering the provider
+    // with no backoff; the AI SDK's own maxRetries still handles transient 429s.
+    const maxAttempts = this.deps.rateLimitGovernor?.maxRetryAttempts() ?? 1;
+    // Counts retries already consumed; the first call is attempt 0.
+    let attempt = 0;
+    while (true) {
+      // Presumably blocks while the governor is pausing after a rejection — confirm in RateLimitGovernor.
+      await this.deps.rateLimitGovernor?.waitForReady(abortSignal);
+      try {
+        const result = await run();
+        const signal = aiSdkHeaderRateLimitSignal(provider, result);
+        if (signal) {
+          this.deps.rateLimitGovernor?.report(signal);
+        }
+        return result;
+      } catch (error) {
+        // Give up on cancellation, on anything that is not a rate limit, or
+        // when this was the last permitted attempt.
+        if (isAbortError(error) || !isAiSdkRateLimitError(error) || attempt >= maxAttempts - 1) {
+          throw error;
+        }
+        attempt += 1;
+        const retryAfter = retryAfterMs(error);
+        // Tell the governor about the rejection, including the server's
+        // retry hint when one could be extracted.
+        this.deps.rateLimitGovernor?.report({
+          provider,
+          status: 'rejected',
+          rateLimitType: 'http_429',
+          ...(retryAfter !== undefined ? { retryAfterMs: retryAfter } : {}),
+        });
+      }
+    }
+  }
+
  async generateText(input: KtxGenerateTextInput): Promise<string> {
    const model = this.deps.llmProvider.getModel(input.role);
    if ((model as { provider?: string }).provider === 'deterministic') {
@ -67,12 +221,13 @@ export class AiSdkKtxLlmRuntime implements KtxLlmRuntimePort {
    });
    const split = splitKtxSystemMessages(built.messages);
    const startedAt = Date.now();
-    const result = await generateText({
+    const request = {
      model,
      temperature: input.temperature ?? 0,
      ...(split.system ? { system: split.system } : {}),
      messages: split.messages,
      tools: built.tools as ToolSet,
+      ...(input.abortSignal ? { abortSignal: input.abortSignal } : {}),
      ...(hasTools(tools)
        ? {
            experimental_repairToolCall: this.deps.llmProvider.repairToolCallHandler({
@ -80,7 +235,8 @@ export class AiSdkKtxLlmRuntime implements KtxLlmRuntimePort {
            }),
          }
        : {}),
-    });
+    };
+    const result = await this.generateTextWithRateLimitRetry(modelProviderName(model), input.abortSignal, () => generateText(request));
    input.onMetrics?.({ totalMs: Date.now() - startedAt, usage: toLlmTokenUsage(result.totalUsage ?? result.usage) });
    if (typeof result.text !== 'string') {
      throw new Error('KTX LLM text generation returned no text');
@ -101,12 +257,13 @@ export class AiSdkKtxLlmRuntime implements KtxLlmRuntimePort {
    });
    const split = splitKtxSystemMessages(built.messages);
    const startedAt = Date.now();
-    const result = await generateText({
+    const request = {
      model,
      temperature: input.temperature ?? 0,
      ...(split.system ? { system: split.system } : {}),
      messages: split.messages,
      tools: built.tools as ToolSet,
+      ...(input.abortSignal ? { abortSignal: input.abortSignal } : {}),
      ...(hasTools(tools)
        ? {
            experimental_repairToolCall: this.deps.llmProvider.repairToolCallHandler({
@ -115,7 +272,8 @@ export class AiSdkKtxLlmRuntime implements KtxLlmRuntimePort {
          }
        : {}),
      output: Output.object({ schema: input.schema as unknown as FlexibleSchema<TOutput> }),
-    });
+    };
+    const result = await this.generateTextWithRateLimitRetry(modelProviderName(model), input.abortSignal, () => generateText(request));
    input.onMetrics?.({ totalMs: Date.now() - startedAt, usage: toLlmTokenUsage(result.totalUsage ?? result.usage) });
    if (result.output == null) {
      throw new Error('KTX LLM object generation returned no output');
@ -152,7 +310,7 @@ export class AiSdkKtxLlmRuntime implements KtxLlmRuntimePort {
        }),
      );

-      const result = await generateText({
+      const request = {
        model,
        temperature: 0,
        stopWhen: stepCountIs(params.stepBudget),
@ -163,6 +321,7 @@ export class AiSdkKtxLlmRuntime implements KtxLlmRuntimePort {
        ...(promptMessages.system ? { system: promptMessages.system } : {}),
        messages: promptMessages.messages,
        tools: built.tools as ToolSet,
+        ...(params.abortSignal ? { abortSignal: params.abortSignal } : {}),
        onStepFinish: async () => {
          stepIndex += 1;
          stepBoundariesMs.push(Date.now() - startedAt);
@ -179,7 +338,8 @@ export class AiSdkKtxLlmRuntime implements KtxLlmRuntimePort {
            );
          }
        },
-      });
+      };
+      const result = await this.generateTextWithRateLimitRetry(modelProviderName(model), params.abortSignal, () => generateText(request));
      return {
        stopReason: 'natural',
        metrics: {
@ -190,6 +350,9 @@ export class AiSdkKtxLlmRuntime implements KtxLlmRuntimePort {
        },
      };
    } catch (error) {
+      if (isAbortError(error)) {
+        throw error;
+      }
      const err = error instanceof Error ? error : new Error(String(error));
      this.logger.warn(`[agent-runner] loop failed: ${err.message}`);
      return {
--- a/packages/cli/src/context/llm/claude-code-runtime.ts
+++ b/packages/cli/src/context/llm/claude-code-runtime.ts
@ -7,8 +7,10 @@ import {
 } from '@anthropic-ai/claude-agent-sdk';
 import { z } from 'zod';
 import { noopLogger, type KtxLogger } from '../../context/core/config.js';
+import { createAbortError, isAbortError, throwIfAborted } from '../core/abort.js';
 import { createKtxClaudeCodeEnv } from './claude-code-env.js';
 import { resolveClaudeCodeModel } from './claude-code-models.js';
+import type { RateLimitGovernor, RateLimitSignal } from './rate-limit-governor.js';
 import { createClaudeSdkTools, mcpToolIds } from './runtime-tools.js';
 import type {
  KtxGenerateObjectInput,
@ -21,7 +23,16 @@ import type {
  RunLoopStopReason,
 } from './runtime-port.js';

-type QueryFn = (params: Parameters<typeof defaultQuery>[0]) => AsyncIterable<SDKMessage>;
+type QueryResult = AsyncIterable<SDKMessage> & {
+  interrupt?: () => void | Promise<void>;
+};
+
+type QueryFn = (params: Parameters<typeof defaultQuery>[0]) => QueryResult;
+
+interface ClaudeQueryOutcome {
+  result: SDKResultMessage;
+  rejectedRateLimitSignal?: RateLimitSignal;
+}

 function claudeTokenUsage(result: SDKResultMessage): LlmTokenUsage {
  const usage = (result as { usage?: { input_tokens?: number; output_tokens?: number } }).usage;
@ -43,6 +54,7 @@ export interface ClaudeCodeKtxLlmRuntimeDeps {
  query?: QueryFn;
  env?: NodeJS.ProcessEnv;
  logger?: KtxLogger;
+  rateLimitGovernor?: Pick<RateLimitGovernor, 'waitForReady' | 'report' | 'maxRetryAttempts'>;
 }

 const BUILTIN_TOOLS = [
@ -157,6 +169,74 @@ function expectedMcpServerNames(tools: KtxRuntimeToolSet | undefined): Set<strin
  return tools && Object.keys(tools).length > 0 ? new Set([KTX_MCP_SERVER_NAME]) : new Set();
 }

+// Case-insensitive markers used to recognise a rate-limit style failure in the
+// free-form error text of a Claude Code result.
+const CLAUDE_RATE_LIMIT_ERROR_MARKERS = /\b429\b|rate limit|too many requests|quota exceeded|overloaded|max_retries/i;
+
+/**
+ * Converts a reset timestamp of unknown shape into a millisecond value.
+ *
+ * Positive finite numbers below 10_000_000_000 are multiplied by 1000
+ * (presumably epoch seconds; larger values are taken to be milliseconds
+ * already) and rounded. Strings are tried as a number first and then with
+ * `Date.parse`.
+ *
+ * @param value - Raw reset value taken from a rate-limit event.
+ * @returns The normalized value, or `undefined` when nothing usable is found.
+ */
+function normalizeClaudeResetAtMs(value: unknown): number | undefined {
+  const fromNumber = (candidate: number): number | undefined => {
+    if (!Number.isFinite(candidate) || !(candidate > 0)) {
+      return undefined;
+    }
+    const scaled = candidate < 10_000_000_000 ? candidate * 1_000 : candidate;
+    return Math.round(scaled);
+  };
+
+  switch (typeof value) {
+    case 'number':
+      return fromNumber(value);
+    case 'string': {
+      const numericResult = fromNumber(Number(value));
+      if (numericResult !== undefined) {
+        return numericResult;
+      }
+      const timestamp = Date.parse(value);
+      return Number.isFinite(timestamp) ? timestamp : undefined;
+    }
+    default:
+      return undefined;
+  }
+}
+
+/**
+ * Decides whether a failed Claude Code result was caused by a rate limit.
+ *
+ * A result without an error is never a rate-limit result. Otherwise a rejected
+ * signal seen during the query settles it; failing that, the error message,
+ * `stop_reason`, `terminal_reason` and any string entries of `errors` are
+ * searched for the rate-limit markers.
+ *
+ * @param result - Final result message of the query.
+ * @param rejectedSignal - Rejected rate-limit signal observed during the query, if any.
+ */
+function isClaudeRateLimitResult(result: SDKResultMessage, rejectedSignal: RateLimitSignal | undefined): boolean {
+  const failure = resultError(result);
+  if (!failure) {
+    return false;
+  }
+  if (rejectedSignal?.status === 'rejected') {
+    return true;
+  }
+
+  const { stop_reason, terminal_reason, errors } = result as {
+    stop_reason?: unknown;
+    terminal_reason?: unknown;
+    errors?: unknown[];
+  };
+  const candidates: unknown[] = [failure.message, stop_reason, terminal_reason, ...(errors ?? [])];
+  const textParts: string[] = [];
+  for (const candidate of candidates) {
+    // Only non-empty strings contribute to the searched text.
+    if (typeof candidate === 'string' && candidate.length > 0) {
+      textParts.push(candidate);
+    }
+  }
+  return CLAUDE_RATE_LIMIT_ERROR_MARKERS.test(textParts.join('\n'));
+}
+
+/**
+ * Translates a Claude SDK stream message into a governor rate-limit signal.
+ *
+ * `rate_limit_event` messages map their `rate_limit_info` status to
+ * `rejected`, `warning` (for `allowed_warning`) or `allowed`, carrying over the
+ * reset time, limit type and utilization when present. `api_retry` messages
+ * become a `warning` with the retry delay when it is numeric.
+ *
+ * @param message - One message yielded by the query.
+ * @returns The signal, or `null` when the message carries no rate-limit information.
+ */
+function claudeRateLimitSignal(message: SDKMessage): RateLimitSignal | null {
+  const fields = message as unknown as Record<string, unknown>;
+
+  if (fields.type === 'rate_limit_event') {
+    const info = fields.rate_limit_info as Record<string, unknown> | undefined;
+    if (!info) {
+      return null;
+    }
+    let status: RateLimitSignal['status'] = 'allowed';
+    if (info.status === 'rejected') {
+      status = 'rejected';
+    } else if (info.status === 'allowed_warning') {
+      status = 'warning';
+    }
+    const eventSignal: RateLimitSignal = { provider: 'claude-subscription', status };
+    const resetAtMs = normalizeClaudeResetAtMs(info.resetsAt);
+    if (resetAtMs !== undefined) {
+      eventSignal.resetAtMs = resetAtMs;
+    }
+    if (typeof info.rateLimitType === 'string') {
+      eventSignal.rateLimitType = info.rateLimitType;
+    }
+    if (typeof info.utilization === 'number') {
+      eventSignal.utilization = info.utilization;
+    }
+    return eventSignal;
+  }
+
+  const isApiRetry = fields.subtype === 'api_retry' || fields.type === 'api_retry';
+  if (!isApiRetry) {
+    return null;
+  }
+  const retrySignal: RateLimitSignal = { provider: 'claude-subscription', status: 'warning' };
+  if (typeof fields.retry_delay_ms === 'number') {
+    retrySignal.retryAfterMs = fields.retry_delay_ms;
+  }
+  retrySignal.rateLimitType = 'api_retry';
+  return retrySignal;
+}
+
 function managedMcpSettings(serverNames: string[]): NonNullable<Options['managedSettings']> {
  return {
    allowManagedMcpServersOnly: true,
@ -217,21 +297,63 @@ async function collectResult(params: {
  allowedToolIds: Set<string>;
  expectedMcpServerNames: Set<string>;
  onAssistantTurn?: () => Promise<void>;
-}): Promise<SDKResultMessage> {
+  rateLimitGovernor?: Pick<RateLimitGovernor, 'waitForReady' | 'report' | 'maxRetryAttempts'>;
+  abortSignal?: AbortSignal;
+}): Promise<ClaudeQueryOutcome> {
  let result: SDKResultMessage | undefined;
-  for await (const message of params.query({ prompt: params.prompt, options: params.options })) {
-    assertInitIsolation(message, params.allowedToolIds, params.expectedMcpServerNames);
-    if (countsAsAssistantTurn(message)) {
-      await params.onAssistantTurn?.();
-    }
-    if (isResult(message)) {
-      result = message;
+  let rejectedRateLimitSignal: RateLimitSignal | undefined;
+  throwIfAborted(params.abortSignal);
+  await params.rateLimitGovernor?.waitForReady(params.abortSignal);
+  throwIfAborted(params.abortSignal);
+  const queryResult = params.query({ prompt: params.prompt, options: params.options });
+  const onAbort = () => {
+    void Promise.resolve(queryResult.interrupt?.()).catch(() => undefined);
+  };
+  params.abortSignal?.addEventListener('abort', onAbort, { once: true });
+  try {
+    for await (const message of queryResult) {
+      throwIfAborted(params.abortSignal);
+      const rateLimitSignal = claudeRateLimitSignal(message);
+      if (rateLimitSignal) {
+        if (rateLimitSignal.status === 'rejected') {
+          rejectedRateLimitSignal = rateLimitSignal;
+        }
+        params.rateLimitGovernor?.report(rateLimitSignal);
+      }
+      assertInitIsolation(message, params.allowedToolIds, params.expectedMcpServerNames);
+      if (countsAsAssistantTurn(message)) {
+        await params.onAssistantTurn?.();
+      }
+      if (isResult(message)) {
+        result = message;
+      }
    }
+  } finally {
+    params.abortSignal?.removeEventListener('abort', onAbort);
+  }
+  if (params.abortSignal?.aborted) {
+    throw createAbortError();
  }
  if (!result) {
    throw new Error('Claude Code query returned no result message');
  }
-  return result;
+  return {
+    result,
+    ...(rejectedRateLimitSignal ? { rejectedRateLimitSignal } : {}),
+  };
+}
+
+/**
+ * Runs `collectResult`, repeating the query while its result looks rate
+ * limited and attempts remain.
+ *
+ * @param params - Same parameters as `collectResult`; its `rateLimitGovernor`
+ *   supplies the attempt budget and receives rejection reports.
+ * @returns The result of the first attempt that is not rate limited, or the
+ *   last attempt's result once the budget is used up.
+ */
+async function collectResultWithRateLimitRetry(params: Parameters<typeof collectResult>[0]): Promise<SDKResultMessage> {
+  // Without a governor the fallback is a single attempt, and the governor
+  // itself returns 1 when pacing is disabled, so a rate-limited result surfaces
+  // without an extra query; the Claude Code SDK applies its own backoff for
+  // transient rejections.
+  const maxAttempts = params.rateLimitGovernor?.maxRetryAttempts() ?? 1;
+  for (let attempt = 0; ; attempt += 1) {
+    const outcome = await collectResult(params);
+    if (!isClaudeRateLimitResult(outcome.result, outcome.rejectedRateLimitSignal) || attempt >= maxAttempts - 1) {
+      return outcome.result;
+    }
+    if (!outcome.rejectedRateLimitSignal) {
+      // The limit was recognised only from the error text, so no rejection has
+      // reached the governor yet. Report one (as the Codex runtime does for
+      // opaque failures) so the next waitForReady paces the retry instead of
+      // re-querying immediately.
+      params.rateLimitGovernor?.report({
+        provider: 'claude-subscription',
+        status: 'rejected',
+        rateLimitType: 'opaque',
+      });
+    }
+  }
 }

 export class ClaudeCodeKtxLlmRuntime implements KtxLlmRuntimePort {
@ -252,12 +374,14 @@ export class ClaudeCodeKtxLlmRuntime implements KtxLlmRuntimePort {
      tools: input.tools,
    });
    const startedAt = Date.now();
-    const result = await collectResult({
+    const result = await collectResultWithRateLimitRetry({
      query: this.runQuery,
      prompt: [input.system, input.prompt].filter(Boolean).join('\n\n'),
      options,
      allowedToolIds: new Set(mcpToolIds(input.tools ?? {})),
      expectedMcpServerNames: expectedMcpServerNames(input.tools),
+      rateLimitGovernor: this.deps.rateLimitGovernor,
+      abortSignal: input.abortSignal,
    });
    input.onMetrics?.({ totalMs: Date.now() - startedAt, usage: claudeTokenUsage(result) });
    const error = resultError(result);
@ -289,12 +413,14 @@ export class ClaudeCodeKtxLlmRuntime implements KtxLlmRuntimePort {
      outputFormat: { type: 'json_schema' as const, schema: jsonSchema(input.schema as z.ZodType) },
    };
    const startedAt = Date.now();
-    const result = await collectResult({
+    const result = await collectResultWithRateLimitRetry({
      query: this.runQuery,
      prompt: [input.system, input.prompt].filter(Boolean).join('\n\n'),
      options,
      allowedToolIds: new Set([...mcpToolIds(input.tools ?? {}), STRUCTURED_OUTPUT_TOOL_NAME]),
      expectedMcpServerNames: expectedMcpServerNames(input.tools),
+      rateLimitGovernor: this.deps.rateLimitGovernor,
+      abortSignal: input.abortSignal,
    });
    input.onMetrics?.({ totalMs: Date.now() - startedAt, usage: claudeTokenUsage(result) });
    const error = resultError(result);
@ -319,12 +445,14 @@ export class ClaudeCodeKtxLlmRuntime implements KtxLlmRuntimePort {
        maxTurns: params.stepBudget,
        tools: params.toolSet,
      });
-      const result = await collectResult({
+      const result = await collectResultWithRateLimitRetry({
        query: this.runQuery,
        prompt: params.userPrompt,
        options: { ...options, systemPrompt: params.systemPrompt },
        allowedToolIds: new Set(mcpToolIds(params.toolSet)),
        expectedMcpServerNames: expectedMcpServerNames(params.toolSet),
+        rateLimitGovernor: this.deps.rateLimitGovernor,
+        abortSignal: params.abortSignal,
        onAssistantTurn: async () => {
          stepIndex += 1;
          stepBoundariesMs.push(Date.now() - startedAt);
@ -355,6 +483,9 @@ export class ClaudeCodeKtxLlmRuntime implements KtxLlmRuntimePort {
        },
      };
    } catch (error) {
+      if (isAbortError(error)) {
+        throw error;
+      }
      const err = error instanceof Error ? error : new Error(String(error));
      return {
        stopReason: 'error',
@ -388,7 +519,7 @@ export async function runClaudeCodeAuthProbe(input: {
      env: input.env,
      maxTurns: 1,
    });
-    const result = await collectResult({
+    const result = await collectResultWithRateLimitRetry({
      query: input.query ?? defaultQuery,
      prompt: 'Reply with exactly: ok',
      options,
--- a/packages/cli/src/context/llm/codex-runtime.ts
+++ b/packages/cli/src/context/llm/codex-runtime.ts
@ -1,5 +1,6 @@
 import { z } from 'zod';
 import { noopLogger, type KtxLogger } from '../core/config.js';
+import { isAbortError, linkAbortSignal } from '../core/abort.js';
 import { isCompletedAgentStep, summarizeCodexExecEvents, type CodexExecEventSummary } from './codex-exec-events.js';
 import {
  startCodexRuntimeMcpServer,
@ -8,6 +9,7 @@ import {
 import { resolveCodexModel } from './codex-models.js';
 import { buildCodexRuntimeConfig } from './codex-runtime-config.js';
 import { CodexSdkCliRunner, type CodexSdkRunner } from './codex-sdk-runner.js';
+import type { RateLimitGovernor } from './rate-limit-governor.js';
 import type {
  KtxGenerateObjectInput,
  KtxGenerateTextInput,
@ -24,6 +26,7 @@ export interface CodexKtxLlmRuntimeDeps {
  runner?: CodexSdkRunner;
  startMcpServer?: (input: { projectDir: string; toolSet: KtxRuntimeToolSet }) => Promise<CodexRuntimeMcpServerHandle>;
  logger?: KtxLogger;
+  rateLimitGovernor?: Pick<RateLimitGovernor, 'waitForReady' | 'report' | 'maxRetryAttempts'>;
 }

 function modelForRole(modelSlots: CodexKtxLlmRuntimeDeps['modelSlots'], role: string): string {
@ -159,6 +162,12 @@ function runtimeToolNames(toolSet: KtxRuntimeToolSet | undefined): string[] {
  return Object.values(toolSet ?? {}).map((descriptor) => descriptor.name);
 }

+// Case-insensitive markers that identify a rate-limit failure in a Codex error message.
+const CODEX_RATE_LIMIT_MARKERS = /\b429\b|rate limit|too many requests|quota exceeded|temporarily overloaded/i;
+
+/**
+ * Tells whether a Codex failure looks like a rate limit, judged by its message.
+ *
+ * @param error - The failure to inspect; `undefined` means there was no failure.
+ * @returns `true` when an error is present and its message matches a marker.
+ */
+function isCodexRateLimitError(error: Error | undefined): boolean {
+  return error ? CODEX_RATE_LIMIT_MARKERS.test(error.message) : false;
+}
+
 export class CodexKtxLlmRuntime implements KtxLlmRuntimePort {
  private readonly runner: CodexSdkRunner;
  private readonly logger: KtxLogger;
@ -168,6 +177,37 @@ export class CodexKtxLlmRuntime implements KtxLlmRuntimePort {
    this.logger = deps.logger ?? noopLogger;
  }

+  /**
+   * Runs a Codex call under the rate-limit governor, repeating it while it
+   * fails with a rate-limit error and attempts remain.
+   *
+   * A failure can arrive two ways: as an error extracted from a returned
+   * result via `getError`, or as a thrown error. Both are classified with
+   * `isCodexRateLimitError`. Abort errors are always rethrown.
+   *
+   * @param abortSignal - Optional signal forwarded to the governor wait.
+   * @param run - Performs the actual call; invoked once per attempt.
+   * @param getError - Extracts the failure, if any, from a returned result.
+   * @returns The first result that is not rate limited, or the last attempt's
+   *   result even when it is.
+   */
+  private async runWithRateLimitRetry<T>(
+    abortSignal: AbortSignal | undefined,
+    run: () => Promise<T>,
+    getError: (result: T) => Error | undefined,
+  ): Promise<T> {
+    // The attempt budget is 1 when no governor is present (the `?? 1`
+    // fallback) or pacing is disabled, so an opaque rate-limit failure
+    // surfaces on the first attempt instead of being retried with no backoff.
+    const maxAttempts = this.deps.rateLimitGovernor?.maxRetryAttempts() ?? 1;
+    for (let attempt = 0; ; attempt += 1) {
+      await this.deps.rateLimitGovernor?.waitForReady(abortSignal);
+      const lastAttempt = attempt >= maxAttempts - 1;
+      try {
+        const result = await run();
+        const error = getError(result);
+        if (!isCodexRateLimitError(error) || lastAttempt) {
+          return result;
+        }
+      } catch (error) {
+        if (isAbortError(error)) {
+          throw error;
+        }
+        // Classify on a normalized Error, but rethrow the original value.
+        const err = error instanceof Error ? error : new Error(String(error));
+        if (!isCodexRateLimitError(err) || lastAttempt) {
+          throw error;
+        }
+      }
+      // Reached only for a rate-limited attempt that will be retried: record
+      // the rejection before the next iteration waits on the governor.
+      this.deps.rateLimitGovernor?.report({ provider: 'codex', status: 'rejected', rateLimitType: 'opaque' });
+    }
+  }
+
  async generateText(input: KtxGenerateTextInput): Promise<string> {
    const startedAt = Date.now();
    const model = modelForRole(this.deps.modelSlots, input.role);
@ -190,18 +230,26 @@ export class CodexKtxLlmRuntime implements KtxLlmRuntimePort {
            }
          : {}),
      });
-      const collected = await collectEvents(
-        await this.runner.runStreamed({
-          projectDir: this.deps.projectDir,
-          model,
-          prompt: promptWithSystem(input.system, input.prompt),
-          configOverrides: config.configOverrides,
-          env: config.env,
-        }),
+      const result = await this.runWithRateLimitRetry(
+        input.abortSignal,
+        async () => {
+          const collected = await collectEvents(
+            await this.runner.runStreamed({
+              projectDir: this.deps.projectDir,
+              model,
+              prompt: promptWithSystem(input.system, input.prompt),
+              configOverrides: config.configOverrides,
+              env: config.env,
+              ...(input.abortSignal ? { signal: input.abortSignal } : {}),
+            }),
+          );
+          const summary = summarizeCodexExecEvents(collected.events, { startedAt });
+          return { collected, summary };
+        },
+        ({ collected, summary }) => summaryError(summary, collected.streamError),
      );
-      const summary = summarizeCodexExecEvents(collected.events, { startedAt });
-      input.onMetrics?.(metrics(summary, startedAt));
-      return assertSuccessfulText(summary, collected.streamError);
+      input.onMetrics?.(metrics(result.summary, startedAt));
+      return assertSuccessfulText(result.summary, result.collected.streamError);
    } finally {
      await mcp?.close();
    }
@ -231,19 +279,27 @@ export class CodexKtxLlmRuntime implements KtxLlmRuntimePort {
            }
          : {}),
      });
-      const collected = await collectEvents(
-        await this.runner.runStreamed({
-          projectDir: this.deps.projectDir,
-          model,
-          prompt: promptWithSystem(input.system, input.prompt),
-          configOverrides: config.configOverrides,
-          env: config.env,
-          outputSchema: z.toJSONSchema(input.schema, { target: 'draft-7' }) as Record<string, unknown>,
-        }),
+      const result = await this.runWithRateLimitRetry(
+        input.abortSignal,
+        async () => {
+          const collected = await collectEvents(
+            await this.runner.runStreamed({
+              projectDir: this.deps.projectDir,
+              model,
+              prompt: promptWithSystem(input.system, input.prompt),
+              configOverrides: config.configOverrides,
+              env: config.env,
+              outputSchema: z.toJSONSchema(input.schema, { target: 'draft-7' }) as Record<string, unknown>,
+              ...(input.abortSignal ? { signal: input.abortSignal } : {}),
+            }),
+          );
+          const summary = summarizeCodexExecEvents(collected.events, { startedAt });
+          return { collected, summary };
+        },
+        ({ collected, summary }) => summaryError(summary, collected.streamError),
      );
-      const summary = summarizeCodexExecEvents(collected.events, { startedAt });
-      input.onMetrics?.(metrics(summary, startedAt));
-      return parseStructuredOutput(input.schema, assertSuccessfulText(summary, collected.streamError));
+      input.onMetrics?.(metrics(result.summary, startedAt));
+      return parseStructuredOutput(input.schema, assertSuccessfulText(result.summary, result.collected.streamError));
    } finally {
      await mcp?.close();
    }
@ -272,7 +328,6 @@ export class CodexKtxLlmRuntime implements KtxLlmRuntimePort {
            }
          : {}),
      });
-      const abortController = new AbortController();
      const onStep = async (stepIndex: number): Promise<void> => {
        try {
          await params.onStepFinish?.({ stepIndex, stepBudget: params.stepBudget });
@ -282,31 +337,50 @@ export class CodexKtxLlmRuntime implements KtxLlmRuntimePort {
          );
        }
      };
-      const collected = await collectEvents(
-        await this.runner.runStreamed({
-          projectDir: this.deps.projectDir,
-          model,
-          prompt: promptWithSystem(params.systemPrompt, params.userPrompt),
-          configOverrides: config.configOverrides,
-          env: config.env,
-          signal: abortController.signal,
-        }),
-        { stepBudget: params.stepBudget, abortController, onStep },
+      const result = await this.runWithRateLimitRetry(
+        params.abortSignal,
+        async () => {
+          const linked = linkAbortSignal(params.abortSignal);
+          const abortController = linked.controller;
+          try {
+            const collected = await collectEvents(
+              await this.runner.runStreamed({
+                projectDir: this.deps.projectDir,
+                model,
+                prompt: promptWithSystem(params.systemPrompt, params.userPrompt),
+                configOverrides: config.configOverrides,
+                env: config.env,
+                signal: abortController.signal,
+              }),
+              { stepBudget: params.stepBudget, abortController, onStep },
+            );
+            const summary = summarizeCodexExecEvents(collected.events, { startedAt });
+            return { collected, summary };
+          } finally {
+            linked.dispose();
+          }
+        },
+        ({ collected, summary }) => summaryError(summary, collected.streamError),
      );
-      const summary = summarizeCodexExecEvents(collected.events, { startedAt });
-      const error = summaryError(summary, collected.streamError);
-      const stopReason = collected.budgetExceeded ? 'budget' : error ? 'error' : summary.stopReason;
+      const error = summaryError(result.summary, result.collected.streamError);
+      if (isAbortError(error)) {
+        throw error;
+      }
+      const stopReason = result.collected.budgetExceeded ? 'budget' : error ? 'error' : result.summary.stopReason;
      return {
        stopReason,
        ...(stopReason === 'error' && error ? { error } : {}),
        metrics: {
          totalMs: Date.now() - startedAt,
-          usage: summary.usage,
-          stepCount: summary.stepCount,
-          stepBoundariesMs: summary.stepBoundariesMs,
+          usage: result.summary.usage,
+          stepCount: result.summary.stepCount,
+          stepBoundariesMs: result.summary.stepBoundariesMs,
        },
      };
    } catch (error) {
+      if (isAbortError(error)) {
+        throw error;
+      }
      const err = error instanceof Error ? error : new Error(String(error));
      return {
        stopReason: 'error',
--- a/packages/cli/src/context/llm/local-config.ts
+++ b/packages/cli/src/context/llm/local-config.ts
@ -6,16 +6,28 @@ import type { KtxProjectEmbeddingConfig, KtxProjectLlmConfig } from '../project/
 import { AiSdkKtxLlmRuntime } from './ai-sdk-runtime.js';
 import { ClaudeCodeKtxLlmRuntime } from './claude-code-runtime.js';
 import { CodexKtxLlmRuntime } from './codex-runtime.js';
+import type { RateLimitGovernor } from './rate-limit-governor.js';
 import type { KtxLlmRuntimePort } from './runtime-port.js';

+// Dependency bags handed to the runtime factories below: each runtime's own
+// constructor argument, with `rateLimitGovernor` typed as the full governor.
+type ClaudeCodeRuntimeDeps = ConstructorParameters<typeof ClaudeCodeKtxLlmRuntime>[0] & {
+  rateLimitGovernor?: RateLimitGovernor;
+};
+type CodexRuntimeDeps = ConstructorParameters<typeof CodexKtxLlmRuntime>[0] & {
+  rateLimitGovernor?: RateLimitGovernor;
+};
+type AiSdkRuntimeDeps = ConstructorParameters<typeof AiSdkKtxLlmRuntime>[0] & {
+  rateLimitGovernor?: RateLimitGovernor;
+};
+
 interface LocalConfigDeps {
  env?: NodeJS.ProcessEnv;
  projectDir?: string;
+  rateLimitGovernor?: RateLimitGovernor;
  createKtxLlmProvider?: typeof createKtxLlmProvider;
  createKtxEmbeddingProvider?: typeof createKtxEmbeddingProvider;
-  createClaudeCodeRuntime?: (deps: ConstructorParameters<typeof ClaudeCodeKtxLlmRuntime>[0]) => KtxLlmRuntimePort;
-  createCodexRuntime?: (deps: ConstructorParameters<typeof CodexKtxLlmRuntime>[0]) => KtxLlmRuntimePort;
-  createAiSdkRuntime?: (deps: { llmProvider: KtxLlmProvider }) => KtxLlmRuntimePort;
+  createClaudeCodeRuntime?: (deps: ClaudeCodeRuntimeDeps) => KtxLlmRuntimePort;
+  createCodexRuntime?: (deps: CodexRuntimeDeps) => KtxLlmRuntimePort;
+  createAiSdkRuntime?: (deps: AiSdkRuntimeDeps) => KtxLlmRuntimePort;
 }

 function resolveOptional(value: string | undefined, env: NodeJS.ProcessEnv): string | undefined {
@ -129,6 +141,7 @@ export function createLocalKtxLlmRuntimeFromConfig(
      projectDir,
      modelSlots: resolved.modelSlots,
      env: deps.env,
+      rateLimitGovernor: deps.rateLimitGovernor,
    });
  }
  if (resolved.backend === 'codex') {
@ -139,10 +152,14 @@ export function createLocalKtxLlmRuntimeFromConfig(
    return (deps.createCodexRuntime ?? ((runtimeDeps) => new CodexKtxLlmRuntime(runtimeDeps)))({
      projectDir,
      modelSlots: resolved.modelSlots,
+      rateLimitGovernor: deps.rateLimitGovernor,
    });
  }
  const llmProvider = (deps.createKtxLlmProvider ?? createKtxLlmProvider)(resolved);
-  return (deps.createAiSdkRuntime ?? ((runtimeDeps) => new AiSdkKtxLlmRuntime(runtimeDeps)))({ llmProvider });
+  return (deps.createAiSdkRuntime ?? ((runtimeDeps) => new AiSdkKtxLlmRuntime(runtimeDeps)))({
+    llmProvider,
+    rateLimitGovernor: deps.rateLimitGovernor,
+  });
 }

 export function resolveLocalKtxEmbeddingConfig(
--- a/packages/cli/src/context/llm/rate-limit-governor.ts
+++ b/packages/cli/src/context/llm/rate-limit-governor.ts
@ -0,0 +1,387 @@
+import { createAbortError, throwIfAborted } from '../core/abort.js';
+
+/** Label of the LLM backend a rate-limit signal is attributed to. */
+export type RateLimitProvider = 'claude-subscription' | 'anthropic-api' | 'vertex' | 'codex';
+/** Severity of a signal: `rejected` pauses work, `warning` lowers concurrency. */
+type RateLimitSignalStatus = 'allowed' | 'warning' | 'rejected';
+
+/** One rate-limit observation reported by a runtime to the governor. */
+export interface RateLimitSignal {
+  provider: RateLimitProvider;
+  status: RateLimitSignalStatus;
+  /** When the limit resets, in milliseconds (presumably epoch time — confirm against the pause logic). */
+  resetAtMs?: number;
+  /** Delay the provider asked for before retrying, in milliseconds. */
+  retryAfterMs?: number;
+  /** Reported usage level; compared against `throttleThreshold`. */
+  utilization?: number;
+  /** Free-form label for the kind of limit, e.g. `http_429`, `api_retry`, `opaque`. */
+  rateLimitType?: string;
+}
+
+/** Retry settings for rate-limited LLM calls. */
+export interface RateLimitRetryConfig {
+  /** Total attempts per call, including the first; surfaced through `maxRetryAttempts()`. */
+  maxAttempts: number;
+  // NOTE(review): the delay fields below are consumed by pause logic outside
+  // this view; presumably backoff bounds in milliseconds and a randomization
+  // switch — confirm there.
+  baseDelayMs: number;
+  maxDelayMs: number;
+  jitter: boolean;
+}
+
+/** Settings that control how the governor paces and throttles LLM work. */
+export interface RateLimitGovernorConfig {
+  /** When false, reports are ignored, waits return immediately and no outer retry is allowed. */
+  enabled: boolean;
+  /** Concurrency limit under normal conditions; the governor never uses less than 1. */
+  maxConcurrency: number;
+  /** Utilization at or above this value counts as provider pressure. */
+  throttleThreshold: number;
+  /** Concurrency limit applied while under pressure (at least 1). */
+  minConcurrencyUnderPressure: number;
+  /** Optional wait cap; per the commit notes it bounds each wait, not the whole run. */
+  maxWaitMs?: number;
+  /** NOTE(review): presumably the interval between `wait_tick` events — confirm in the ticker. */
+  waitStateTickMs: number;
+  retry: RateLimitRetryConfig;
+}
+
+/**
+ * Events delivered to governor subscribers, discriminated by `kind`.
+ *
+ * `rate_limit_observed` mirrors every signal passed to `report` while the
+ * governor is enabled. The `concurrency_adjusted` and `wait_*` variants are
+ * emitted by code outside this view; judging by their fields they describe a
+ * change of the concurrency limit and the progress of a pause.
+ */
+export type RateLimitWaitState =
+  | {
+      kind: 'rate_limit_observed';
+      provider: RateLimitProvider;
+      status: RateLimitSignalStatus;
+      rateLimitType?: string;
+      resetAtMs?: number;
+      retryAfterMs?: number;
+      utilization?: number;
+    }
+  | {
+      kind: 'concurrency_adjusted';
+      provider: RateLimitProvider;
+      from: number;
+      to: number;
+      reason: string;
+      rateLimitType?: string;
+      utilization?: number;
+    }
+  | {
+      kind: 'wait_started' | 'wait_tick' | 'wait_finished';
+      provider: RateLimitProvider;
+      rateLimitType?: string;
+      resumeAtMs: number;
+      remainingMs: number;
+    };
+
+/** Injectable clock, sleep and randomness; each falls back to a built-in default. */
+export interface RateLimitGovernorDeps {
+  /** Defaults to `Date.now`. */
+  now?: () => number;
+  /** Defaults to the module's abortable timer-based sleep. */
+  sleep?: (ms: number, signal?: AbortSignal) => Promise<void>;
+  /** Defaults to `Math.random`. */
+  random?: () => number;
+}
+
+/** Callback that gives back a work slot obtained from the governor. */
+export type RateLimitRelease = () => void;
+/** Listener invoked with each wait-state event. */
+type Subscriber = (state: RateLimitWaitState) => void;
+
+/**
+ * Default abortable sleep used when no `sleep` dependency is injected.
+ *
+ * @param ms - Delay in milliseconds.
+ * @param signal - Optional abort signal; aborting rejects with an abort error
+ *   and cancels the timer.
+ * @returns A promise that resolves after the delay, or rejects if the signal is
+ *   already aborted or aborts while waiting.
+ */
+const defaultSleep = (ms: number, signal?: AbortSignal): Promise<void> =>
+  new Promise((resolve, reject) => {
+    if (signal?.aborted) {
+      reject(createAbortError());
+      return;
+    }
+    const onAbort = (): void => {
+      clearTimeout(timeout);
+      reject(createAbortError());
+    };
+    const timeout = setTimeout(() => {
+      // Detach the listener once the delay has elapsed; otherwise every
+      // completed sleep leaves one more listener on a long-lived signal.
+      signal?.removeEventListener('abort', onAbort);
+      resolve();
+    }, ms);
+    signal?.addEventListener('abort', onAbort, { once: true });
+  });
+
+/**
+ * Builds a complete governor configuration from a partial one.
+ *
+ * Missing fields take these defaults: enabled, a concurrency of 1, a throttle
+ * threshold of 0.8, a pressure concurrency of 1, a 1000 ms tick, and a retry
+ * policy of 6 attempts with 1000 ms base delay, 60000 ms max delay and jitter
+ * on. `maxWaitMs` is only present in the result when it was supplied.
+ *
+ * @param input - Overrides; `retry` may itself be partial.
+ * @returns The fully populated configuration.
+ */
+export function createRateLimitGovernorConfig(
+  input: Partial<RateLimitGovernorConfig> & { retry?: Partial<RateLimitRetryConfig> } = {},
+): RateLimitGovernorConfig {
+  const retryOverrides = input.retry;
+  const retry: RateLimitRetryConfig = {
+    maxAttempts: retryOverrides?.maxAttempts ?? 6,
+    baseDelayMs: retryOverrides?.baseDelayMs ?? 1_000,
+    maxDelayMs: retryOverrides?.maxDelayMs ?? 60_000,
+    jitter: retryOverrides?.jitter ?? true,
+  };
+  // Spread in place so an absent maxWaitMs leaves no key behind.
+  const waitCap = input.maxWaitMs === undefined ? {} : { maxWaitMs: input.maxWaitMs };
+  return {
+    enabled: input.enabled ?? true,
+    maxConcurrency: input.maxConcurrency ?? 1,
+    throttleThreshold: input.throttleThreshold ?? 0.8,
+    minConcurrencyUnderPressure: input.minConcurrencyUnderPressure ?? 1,
+    ...waitCap,
+    waitStateTickMs: input.waitStateTickMs ?? 1_000,
+    retry,
+  };
+}
+
+export class RateLimitGovernor {
+  private readonly now: () => number;
+  private readonly sleep: (ms: number, signal?: AbortSignal) => Promise<void>;
+  private readonly random: () => number;
+  private readonly subscribers = new Set<Subscriber>();
+  private waiters: Array<() => void> = [];
+  private active = 0;
+  private effectiveLimit: number;
+  private pausedUntilMs: number | null = null;
+  private pausedProvider: RateLimitProvider | null = null;
+  private pausedRateLimitType: string | undefined;
+  private pausedTickMs: number | null = null;
+  private opaqueAttempts = new Map<RateLimitProvider, number>();
+  private pauseGeneration = 0;
+  private visibleWaitAbort: AbortController | null = null;
+
+  /**
+   * @param config - Governor settings; kept as given.
+   * @param deps - Optional clock, sleep and random overrides; omitted entries
+   *   fall back to `Date.now`, the module's default sleep and `Math.random`.
+   */
+  constructor(
+    private readonly config: RateLimitGovernorConfig,
+    deps: RateLimitGovernorDeps = {},
+  ) {
+    this.now = deps.now ?? Date.now;
+    this.sleep = deps.sleep ?? defaultSleep;
+    this.random = deps.random ?? Math.random;
+    // Start at the configured concurrency, but never below one slot.
+    this.effectiveLimit = Math.max(1, config.maxConcurrency);
+  }
+
+  /**
+   * Concurrency limit currently in force: the adjusted limit while the
+   * governor is enabled, otherwise the configured `maxConcurrency` as given.
+   */
+  currentLimit(): number {
+    if (!this.config.enabled) {
+      return this.config.maxConcurrency;
+    }
+    return this.effectiveLimit;
+  }
+
+  /**
+   * Number of attempts a runtime may spend on one rate-limited LLM call, the
+   * first try included.
+   *
+   * With pacing disabled the answer is 1, i.e. no outer retry: that retry loop
+   * exists only to cooperate with this governor's pause, so without pacing
+   * there is no backoff to apply and transient rejections are left to the
+   * backend's own retry. With pacing enabled it is `retry.maxAttempts`, but
+   * never less than 1.
+   */
+  maxRetryAttempts(): number {
+    if (!this.config.enabled) {
+      return 1;
+    }
+    return Math.max(1, this.config.retry.maxAttempts);
+  }
+
+  /** Number of work slots currently held and not yet released. */
+  activeSlots(): number {
+    return this.active;
+  }
+
+  /**
+   * Registers a listener for wait-state events.
+   *
+   * If a pause is already in progress, the visible wait ticker is started so
+   * the new listener is covered by it.
+   *
+   * @param cb - Listener to register.
+   * @returns A function that removes the listener. When it removes the last
+   *   one, the visible wait ticker is stopped and pending waiters are woken.
+   */
+  subscribe(cb: Subscriber): () => void {
+    this.subscribers.add(cb);
+    if (this.pausedUntilMs !== null) {
+      this.startVisibleWaitTicker();
+    }
+    return () => {
+      this.subscribers.delete(cb);
+      if (this.subscribers.size === 0) {
+        this.stopVisibleWaitTicker();
+        this.wakeWaiters();
+      }
+    };
+  }
+
+  report(signal: RateLimitSignal): void {
+    if (!this.config.enabled) {
+      return;
+    }
+    this.emit({
+      kind: 'rate_limit_observed',
+      provider: signal.provider,
+      status: signal.status,
+      ...(signal.rateLimitType ? { rateLimitType: signal.rateLimitType } : {}),
+      ...(signal.resetAtMs !== undefined ? { resetAtMs: signal.resetAtMs } : {}),
+      ...(signal.retryAfterMs !== undefined ? { retryAfterMs: signal.retryAfterMs } : {}),
+      ...(signal.utilization !== undefined ? { utilization: signal.utilization } : {}),
+    });
+
+    if (signal.status === 'rejected') {
+      this.applyPause(signal);
+      return;
+    }
+
+    if (signal.status === 'warning' || (signal.utilization ?? 0) >= this.config.throttleThreshold) {
+      this.adjustLimit(Math.max(1, this.config.minConcurrencyUnderPressure), signal, 'provider pressure');
+      return;
+    }
+
+    this.opaqueAttempts.delete(signal.provider);
+    if ((signal.utilization ?? 0) < this.config.throttleThreshold) {
+      this.adjustLimit(Math.max(1, this.config.maxConcurrency), signal, 'provider recovered');
+    }
+  }
+
+  async waitForReady(signal?: AbortSignal): Promise<void> {
+    throwIfAborted(signal);
+    if (!this.config.enabled) {
+      return;
+    }
+    await this.waitForPause(signal);
+    throwIfAborted(signal);
+  }
+
+  async acquireWorkSlot(signal?: AbortSignal): Promise<RateLimitRelease> {
+    throwIfAborted(signal);
+    if (!this.config.enabled) {
+      this.active += 1;
+      return () => {
+        this.active -= 1;
+      };
+    }
+
+    while (true) {
+      throwIfAborted(signal);
+      await this.waitForPause(signal);
+      throwIfAborted(signal);
+      if (this.active < this.effectiveLimit) {
+        this.active += 1;
+        let released = false;
+        return () => {
+          if (released) return;
+          released = true;
+          this.active -= 1;
+          this.wakeWaiters();
+        };
+      }
+      await this.waitForSlot(signal);
+    }
+  }
+
+  private applyPause(signal: RateLimitSignal): void {
+    const resumeAtMs = this.resumeAtMsFor(signal);
+    const boundedResumeAtMs =
+      this.config.maxWaitMs === undefined ? resumeAtMs : Math.min(resumeAtMs, this.now() + this.config.maxWaitMs);
+    if (this.pausedUntilMs === null || boundedResumeAtMs > this.pausedUntilMs) {
+      this.pausedUntilMs = boundedResumeAtMs;
+      this.pausedProvider = signal.provider;
+      this.pausedRateLimitType = signal.rateLimitType;
+      this.pausedTickMs = signal.rateLimitType === 'opaque' ? Math.max(1, boundedResumeAtMs - this.now()) : null;
+      this.emitWait('wait_started');
+      this.startVisibleWaitTicker();
+      this.wakeWaiters();
+    }
+    this.adjustLimit(Math.max(1, this.config.minConcurrencyUnderPressure), signal, 'provider rejected');
+  }
+
+  private resumeAtMsFor(signal: RateLimitSignal): number {
+    if (signal.resetAtMs !== undefined) {
+      return signal.resetAtMs;
+    }
+    if (signal.retryAfterMs !== undefined) {
+      return this.now() + signal.retryAfterMs;
+    }
+    const attempts = this.opaqueAttempts.get(signal.provider) ?? 0;
+    this.opaqueAttempts.set(signal.provider, Math.min(attempts + 1, this.config.retry.maxAttempts));
+    const base = Math.min(
+      this.config.retry.maxDelayMs,
+      this.config.retry.baseDelayMs * 2 ** Math.min(attempts, this.config.retry.maxAttempts - 1),
+    );
+    const jitterMultiplier = this.config.retry.jitter ? 0.75 + this.random() * 0.5 : 1;
+    return this.now() + Math.round(base * jitterMultiplier);
+  }
+
+  private adjustLimit(to: number, signal: RateLimitSignal, reason: string): void {
+    const bounded = Math.max(1, Math.min(this.config.maxConcurrency, to));
+    if (bounded === this.effectiveLimit) {
+      return;
+    }
+    const from = this.effectiveLimit;
+    this.effectiveLimit = bounded;
+    this.emit({
+      kind: 'concurrency_adjusted',
+      provider: signal.provider,
+      from,
+      to: bounded,
+      reason,
+      ...(signal.rateLimitType ? { rateLimitType: signal.rateLimitType } : {}),
+      ...(signal.utilization !== undefined ? { utilization: signal.utilization } : {}),
+    });
+    this.wakeWaiters();
+  }
+
+  private startVisibleWaitTicker(): void {
+    if (this.subscribers.size === 0 || this.pausedUntilMs === null) {
+      return;
+    }
+    this.stopVisibleWaitTicker();
+    const generation = (this.pauseGeneration += 1);
+    const controller = new AbortController();
+    this.visibleWaitAbort = controller;
+    void this.runVisibleWaitTicker(generation, controller.signal).catch(() => undefined);
+  }
+
+  private stopVisibleWaitTicker(): void {
+    this.visibleWaitAbort?.abort();
+    this.visibleWaitAbort = null;
+  }
+
+  private async runVisibleWaitTicker(generation: number, signal: AbortSignal): Promise<void> {
+    while (!signal.aborted && generation === this.pauseGeneration && this.pausedUntilMs !== null) {
+      const remainingMs = this.pausedUntilMs - this.now();
+      if (remainingMs <= 0) {
+        this.finishPause(generation);
+        return;
+      }
+      this.emitWait('wait_tick');
+      await this.sleep(Math.min(this.pausedTickMs ?? this.config.waitStateTickMs, remainingMs), signal);
+    }
+  }
+
+  private finishPause(generation?: number): void {
+    if (generation !== undefined && generation !== this.pauseGeneration) {
+      return;
+    }
+    this.emitWait('wait_finished');
+    this.pausedUntilMs = null;
+    this.pausedProvider = null;
+    this.pausedRateLimitType = undefined;
+    this.pausedTickMs = null;
+    this.stopVisibleWaitTicker();
+    this.wakeWaiters();
+  }
+
+  private async waitForPause(signal?: AbortSignal): Promise<void> {
+    throwIfAborted(signal);
+    while (this.pausedUntilMs !== null) {
+      const remainingMs = this.pausedUntilMs - this.now();
+      if (remainingMs <= 0) {
+        this.finishPause();
+        return;
+      }
+      if (this.visibleWaitAbort !== null) {
+        await this.waitForSlot(signal);
+      } else {
+        await this.sleep(Math.min(this.pausedTickMs ?? this.config.waitStateTickMs, remainingMs), signal);
+      }
+      throwIfAborted(signal);
+    }
+  }
+
+  private waitForSlot(signal?: AbortSignal): Promise<void> {
+    if (signal?.aborted) {
+      return Promise.reject(createAbortError());
+    }
+    return new Promise((resolve, reject) => {
+      const wake = () => {
+        cleanup();
+        resolve();
+      };
+      const onAbort = () => {
+        cleanup();
+        reject(createAbortError());
+      };
+      const cleanup = () => {
+        this.waiters = this.waiters.filter((candidate) => candidate !== wake);
+        signal?.removeEventListener('abort', onAbort);
+      };
+      this.waiters.push(wake);
+      signal?.addEventListener('abort', onAbort, { once: true });
+    });
+  }
+
+  private wakeWaiters(): void {
+    const waiters = this.waiters;
+    this.waiters = [];
+    for (const waiter of waiters) {
+      waiter();
+    }
+  }
+
+  private emitWait(kind: Extract<RateLimitWaitState['kind'], 'wait_started' | 'wait_tick' | 'wait_finished'>): void {
+    if (this.pausedUntilMs === null || this.pausedProvider === null) {
+      return;
+    }
+    this.emit({
+      kind,
+      provider: this.pausedProvider,
+      ...(this.pausedRateLimitType ? { rateLimitType: this.pausedRateLimitType } : {}),
+      resumeAtMs: this.pausedUntilMs,
+      remainingMs: Math.max(0, this.pausedUntilMs - this.now()),
+    });
+  }
+
+  private emit(state: RateLimitWaitState): void {
+    for (const subscriber of this.subscribers) {
+      subscriber(state);
+    }
+  }
+}
--- a/packages/cli/src/context/llm/runtime-port.ts
+++ b/packages/cli/src/context/llm/runtime-port.ts
@ -49,6 +49,7 @@ export interface RunLoopParams {
  stepBudget: number;
  telemetryTags: Record<string, string>;
  onStepFinish?: (info: RunLoopStepInfo) => void | Promise<void>;
+  abortSignal?: AbortSignal;
 }

 export interface RunLoopResult {
@ -64,6 +65,7 @@ export interface KtxGenerateTextInput {
  tools?: KtxRuntimeToolSet;
  temperature?: number;
  onMetrics?: (metrics: { totalMs: number; usage: LlmTokenUsage }) => void;
+  abortSignal?: AbortSignal;
 }

 export interface KtxGenerateObjectInput<TOutput, TSchema extends z.ZodType<TOutput>> {
@ -74,6 +76,7 @@ export interface KtxGenerateObjectInput<TOutput, TSchema extends z.ZodType<TOutp
  temperature?: number;
  schema: TSchema;
  onMetrics?: (metrics: { totalMs: number; usage: LlmTokenUsage }) => void;
+  abortSignal?: AbortSignal;
 }

 export interface KtxLlmRuntimePort {
--- a/packages/cli/src/context/project/config.ts
+++ b/packages/cli/src/context/project/config.ts
@ -100,6 +100,44 @@ const workUnitsSchema = z
  })
  .describe('Concurrency and failure handling for ingest work units.');

+// Retry settings for ingest rate-limit handling. Every field carries a default
+// (6 attempts, 1s base delay, 60s max delay, jitter on), so an empty object
+// parses to a complete policy. All numeric fields must be positive integers.
+const ingestRateLimitRetrySchema = z
+  .strictObject({
+    maxAttempts: z
+      .int()
+      .positive()
+      .default(6)
+      .describe(
+        'Maximum attempts for a single rate-limited LLM call before the failure surfaces, counting the first try. Also bounds how far opaque backoff grows for providers that do not expose a reset time.',
+      ),
+    baseDelayMs: z.int().positive().default(1_000).describe('Initial opaque retry delay in milliseconds.'),
+    maxDelayMs: z.int().positive().default(60_000).describe('Maximum opaque retry delay in milliseconds.'),
+    jitter: z.boolean().default(true).describe('When true, apply bounded jitter to opaque retry delays.'),
+  })
+  .describe('Retry policy for rate-limit responses that do not include a reset time or retry-after value.');
+
+// Ingest rate-limit pacing options. Every field may be omitted from the input:
+// `enabled` defaults to true, `throttleThreshold` to 0.8 (validated to the 0..1
+// range), `minConcurrencyUnderPressure` to 1, and `maxWaitMs` has no default.
+// `retry` is prefaulted with `{}` so the nested retry defaults apply when it is
+// left out.
+const ingestRateLimitSchema = z
+  .strictObject({
+    enabled: z.boolean().default(true).describe('Master switch for ingest LLM rate-limit pacing and visible waits.'),
+    throttleThreshold: z
+      .number()
+      .min(0)
+      .max(1)
+      .default(0.8)
+      .describe('Provider utilization at or above which ingest throttles new work-unit starts.'),
+    minConcurrencyUnderPressure: z
+      .int()
+      .positive()
+      .default(1)
+      .describe('Effective work-unit concurrency while a provider is under rate-limit pressure.'),
+    maxWaitMs: z
+      .int()
+      .positive()
+      .optional()
+      .describe('Optional cap on a single provider reset wait. Omit to wait indefinitely until the provider reset time.'),
+    retry: ingestRateLimitRetrySchema.prefault({}).describe('Opaque retry policy for providers without reset hints.'),
+  })
+  .describe('Rate-limit pacing and wait policy for ingest LLM calls.');
+
 const ingestSchema = z
  .strictObject({
    adapters: z
@ -110,6 +148,7 @@ const ingestSchema = z
      .prefault({ backend: 'none' })
      .describe('Embedding configuration used when ingest adapters need to embed documents.'),
    workUnits: workUnitsSchema.prefault({}).describe('Concurrency and failure handling for ingest work units.'),
+    rateLimit: ingestRateLimitSchema.prefault({}).describe('LLM rate-limit pacing and visible-wait policy for ingest.'),
    profile: z
      .union([z.boolean(), z.literal('json')])
      .default(false)
--- a/packages/cli/src/ingest.ts
+++ b/packages/cli/src/ingest.ts
@ -78,6 +78,7 @@ export interface KtxIngestDeps {
  readReportFile?: typeof readIngestReportSnapshotFile;
  renderStoredMemoryFlow?: typeof renderMemoryFlowTui;
  startLiveMemoryFlow?: typeof startLiveMemoryFlowTui;
+  abortSignal?: AbortSignal;
  env?: NodeJS.ProcessEnv;
 localIngestOptions?: Pick<
   RunLocalIngestOptions,
@ -93,6 +94,23 @@ export interface KtxIngestDeps {
  runtimeIo?: KtxIngestIo;
 }

+/**
+ * Builds an abort signal driven by SIGINT for the lifetime of a CLI run.
+ *
+ * The first SIGINT aborts the signal with an `AbortError` DOMException so the
+ * run can unwind cooperatively. A further SIGINT calls `process.exit(130)`.
+ *
+ * @returns The signal plus a `dispose` function that removes the SIGINT listener.
+ */
+function createCliAbortSignal(): { signal: AbortSignal; dispose: () => void } {
+  const abortController = new AbortController();
+  let sigintSeen = false;
+
+  function handleSigint(): void {
+    // Second interrupt: stop waiting for a graceful shutdown.
+    if (sigintSeen) {
+      process.exit(130);
+    }
+    sigintSeen = true;
+    abortController.abort(new DOMException('Aborted', 'AbortError'));
+  }
+
+  process.on('SIGINT', handleSigint);
+
+  const dispose = () => process.off('SIGINT', handleSigint);
+  return { signal: abortController.signal, dispose };
+}
+
 const REPORT_SOURCE_LABELS = new Map<string, string>([
  ['live-database', 'Database schema'],
  ['historic-sql', 'Query history'],
@ -364,6 +382,12 @@ function plainIngestEventProgress(
        message: event.message,
        ...(event.transient !== undefined ? { transient: event.transient } : {}),
      };
+    case 'rate_limit_wait':
+      return {
+        percent: 50,
+        message: `Rate-limited (${event.provider}${event.rateLimitType ? ` ${event.rateLimitType}` : ''}); resuming in ${Math.ceil(event.remainingMs / 1_000)}s`,
+        transient: true,
+      };
    case 'work_unit_started': {
      const total = plannedWorkUnitCountThrough(snapshot, eventIndex);
      const ordinal = workUnitOrdinalThrough(snapshot, eventIndex, event.unitKey);
@ -750,6 +774,8 @@ export async function runKtxIngest(
              );
        plainProgress?.start();
        structuredProgress?.start();
+        const cliAbort = deps.abortSignal ? null : createCliAbortSignal();
+        const abortSignal = deps.abortSignal ?? cliAbort?.signal;
        let result: LocalMetabaseFanoutResult;
        try {
          result = await executeMetabaseFanout({
@ -763,6 +789,7 @@ export async function runKtxIngest(
            embeddingProvider,
            ...(memoryFlow ? { memoryFlow } : {}),
            ...(progress ? { progress } : {}),
+            ...(abortSignal ? { abortSignal } : {}),
          });
          plainProgress?.flush();
          if (args.outputMode === 'json') {
@ -772,6 +799,7 @@ export async function runKtxIngest(
          }
        } finally {
          plainProgress?.flush();
+          cliAbort?.dispose();
        }
        return result.status === 'all_failed' ? 1 : 0;
      }
@ -820,6 +848,8 @@ export async function runKtxIngest(

      plainProgress?.start();
      structuredProgress?.start();
+      const cliAbort = deps.abortSignal ? null : createCliAbortSignal();
+      const abortSignal = deps.abortSignal ?? cliAbort?.signal;

      try {
        const result = await executeLocalIngest({
@ -836,6 +866,7 @@ export async function runKtxIngest(
          embeddingProvider,
          ...(args.debugLlmRequestFile ? { llmDebugRequestFile: args.debugLlmRequestFile } : {}),
          ...(memoryFlow ? { memoryFlow } : {}),
+          ...(abortSignal ? { abortSignal } : {}),
        });
        if (shouldUseLiveViz && memoryFlow) {
          latestMemoryFlowSnapshot = finalRunMemoryFlowInput(memoryFlow.snapshot(), result.report);
@ -854,6 +885,7 @@ export async function runKtxIngest(
      } finally {
        plainProgress?.flush();
        liveTui?.close();
+        cliAbort?.dispose();
      }
    }