ktx/packages/cli/test/context/scan/local-enrichment.test.ts

import Database from 'better-sqlite3';
import { describe, expect, it, vi } from 'vitest';
import { buildDefaultKtxProjectConfig } from '../../../src/context/project/config.js';
import type {
  KtxScanEnrichmentCompletedStage,
  KtxScanEnrichmentFailedStage,
  KtxScanEnrichmentStageLookup,
  KtxScanEnrichmentStateStore,
} from '../../../src/context/scan/enrichment-state.js';
import {
  createDeterministicLocalScanEnrichmentProviders,
  runLocalScanEnrichment,
  snapshotToKtxEnrichedSchema,
} from '../../../src/context/scan/local-enrichment.js';
import {
  createKtxConnectorCapabilities,
  type KtxQueryResult,
  type KtxReadOnlyQueryInput,
  type KtxEmbeddingPort,
  type KtxScanConnector,
  type KtxScanContext,
  type KtxSchemaSnapshot,
} from '../../../src/context/scan/types.js';

/**
 * Creates a deterministic embedding port for tests.
 *
 * The vector for the text at batch position `t` holds `options.dimensions`
 * numbers, where the entry at position `d` equals `t + d`. The text content
 * itself is ignored. `maxBatchSize` falls back to 64 when not supplied.
 */
function fakeScanEmbedding(options: { dimensions: number; maxBatchSize?: number }): KtxEmbeddingPort {
  return {
    dimensions: options.dimensions,
    maxBatchSize: options.maxBatchSize ?? 64,
    async embedBatch(texts) {
      const vectors: number[][] = [];
      for (let textIndex = 0; textIndex < texts.length; textIndex += 1) {
        // One slot per dimension, then fill each slot with its offset from the text index.
        const slots = Array.from({ length: options.dimensions });
        vectors.push(slots.map((_, dimensionIndex) => textIndex + dimensionIndex));
      }
      return vectors;
    },
  };
}

/**
 * Shared Postgres schema snapshot fixture for connection `warehouse`.
 *
 * Contains two tables in the `public` schema, `customers` and `orders`, with no
 * declared foreign keys. `connector()` returns this object from `introspect`,
 * and several tests compare results against it with `toEqual`.
 */
const snapshot: KtxSchemaSnapshot = {
  connectionId: 'warehouse',
  driver: 'postgres',
  extractedAt: '2026-04-29T12:00:00.000Z',
  scope: { schemas: ['public'] },
  metadata: {},
  tables: [
    // customers: single primary-key column.
    {
      catalog: null,
      db: 'public',
      name: 'customers',
      kind: 'table',
      comment: 'Customer accounts',
      estimatedRows: 2,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'Customer id',
        },
      ],
    },
    // orders: primary key plus a `customer_id` column that is NOT declared as a foreign key.
    {
      catalog: null,
      db: 'public',
      name: 'orders',
      kind: 'table',
      comment: 'Customer orders',
      estimatedRows: 3,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'Order id',
        },
        {
          name: 'customer_id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: false,
          comment: 'Customer id',
        },
      ],
    },
  ],
};

/**
 * Builds a fresh fake Postgres scan connector backed by vitest mocks.
 *
 * Every call creates new mock functions, so call counts never leak between
 * tests. `introspect` resolves to the shared `snapshot` fixture, the listing
 * methods resolve to empty arrays, and the sampling methods resolve to small
 * canned payloads. No `executeReadOnly` member is defined here, even though
 * `readOnlySql` is requested from `createKtxConnectorCapabilities`.
 */
function connector(): KtxScanConnector {
  const capabilities = createKtxConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    readOnlySql: true,
    columnStats: true,
  });

  const fake: KtxScanConnector = {
    id: 'test:warehouse',
    driver: 'postgres',
    capabilities,
    introspect: vi.fn(async () => {
      return snapshot;
    }),
    listSchemas: vi.fn(async () => {
      return [];
    }),
    listTables: vi.fn(async () => {
      return [];
    }),
    sampleTable: vi.fn(async () => {
      return {
        headers: ['id', 'customer_id'],
        rows: [[1, 10]],
        totalRows: 1,
      };
    }),
    sampleColumn: vi.fn(async () => {
      return {
        values: ['10', '11'],
        nullCount: 0,
        distinctCount: 2,
      };
    }),
  };
  return fake;
}

/**
 * Minimal read-only SQL executor backed by an in-memory better-sqlite3
 * database, used to give fake connectors a working `executeReadOnly`.
 *
 * Tests seed data through the public `db` handle and must call `close()` when
 * done.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');

  /**
   * Runs `input.sql` and returns the result in header/row-array form.
   *
   * The method is `async` so that SQL preparation or execution errors surface
   * as a rejected promise instead of a synchronous throw from a
   * Promise-returning method.
   *
   * @param input - Query to run; only `input.sql` is read.
   * @param _ctx - Scan context; unused by this executor.
   * @returns Column headers, positional rows, and the row count (reported as
   *   both `totalRows` and `rowCount`).
   */
  async executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    const statement = this.db.prepare(input.sql);
    // Take headers from statement metadata rather than the first row, so an
    // empty result set still reports its columns.
    const headers = statement.columns().map((column) => column.name);
    // Raw mode yields positional arrays, which keeps values aligned with
    // `headers` even when two result columns share a name.
    const rows = statement.raw(true).all() as unknown[][];
    return {
      headers,
      rows,
      totalRows: rows.length,
      rowCount: rows.length,
    };
  }

  /** Closes the underlying in-memory database. */
  close(): void {
    this.db.close();
  }
}

/**
 * Returns a fresh SQLite snapshot with `accounts(id)` and
 * `orders(id, account_id)` where nothing is declared: no primary keys, no
 * foreign keys, no comments, and no catalog/db qualifiers.
 *
 * A new object graph is built on every call, so tests may freely derive
 * modified copies from it.
 */
function noDeclaredRelationshipSnapshot(): KtxSchemaSnapshot {
  type SnapshotTable = KtxSchemaSnapshot['tables'][number];
  type SnapshotColumn = SnapshotTable['columns'][number];

  // Non-nullable INTEGER column that is not a primary key.
  const integerColumn = (name: string): SnapshotColumn => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  // Unqualified table with integer columns only and no declared foreign keys.
  const bareTable = (name: string, estimatedRows: number, columnNames: string[]): SnapshotTable => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns: columnNames.map((columnName) => integerColumn(columnName)),
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [bareTable('accounts', 2, ['id']), bareTable('orders', 3, ['id', 'account_id'])],
  };
}

/**
 * In-memory `KtxScanEnrichmentStateStore` for tests.
 *
 * Holds at most one record per `runId` + `stage` pair; saving a stage again
 * (completed or failed) overwrites the previous record for that pair.
 */
function memoryEnrichmentStateStore(): KtxScanEnrichmentStateStore {
  type StoredStage = KtxScanEnrichmentCompletedStage | KtxScanEnrichmentFailedStage;
  type StageRef = Pick<KtxScanEnrichmentStageLookup, 'runId' | 'stage'>;

  const stagesByKey = new Map<string, StoredStage>();

  function stageKey(ref: StageRef): string {
    return `${ref.runId}:${ref.stage}`;
  }

  return {
    // Only a completed record whose input hash matches the lookup counts as a hit.
    async findCompletedStage<TOutput>(input: KtxScanEnrichmentStageLookup) {
      const stored = stagesByKey.get(stageKey(input));
      if (stored === undefined) {
        return null;
      }
      if (stored.status !== 'completed') {
        return null;
      }
      if (stored.inputHash !== input.inputHash) {
        return null;
      }
      return stored as KtxScanEnrichmentCompletedStage<TOutput>;
    },
    async saveCompletedStage(input) {
      stagesByKey.set(stageKey(input), {
        ...input,
        status: 'completed',
        errorMessage: null,
      });
    },
    async saveFailedStage(input) {
      stagesByKey.set(stageKey(input), {
        ...input,
        status: 'failed',
        output: null,
      });
    },
    // Returns every stored record for the run, in insertion order of the map.
    async listRunStages(runId) {
      const matches: StoredStage[] = [];
      for (const stored of stagesByKey.values()) {
        if (stored.runId === runId) {
          matches.push(stored);
        }
      }
      return matches;
    },
  };
}

describe('local scan enrichment', () => {
  // Structural mapping of the shared fixture: ids are dotted paths, and sample
  // values / embeddings start out as null.
  it('maps a scan snapshot into relationship detector schema', () => {
    const enriched = snapshotToKtxEnrichedSchema(snapshot);

    expect(enriched.connectionId).toBe('warehouse');
    expect(enriched.tables).toHaveLength(2);

    const ordersTable = enriched.tables[1];
    expect(ordersTable?.columns.map(({ name }) => name)).toEqual(['id', 'customer_id']);
    expect(ordersTable?.columns[1]).toMatchObject({
      id: 'public.orders.customer_id',
      tableId: 'public.orders',
      primaryKey: false,
      sampleValues: null,
      embedding: null,
    });
  });

  // Declares orders.account_id -> accounts.id on a copy of the undeclared
  // snapshot (and marks accounts.id as primary key), then checks the single
  // resulting formal relationship.
  it('maps snapshot foreign keys into formal schema relationships', () => {
    const base = noDeclaredRelationshipSnapshot();
    const withForeignKey = {
      ...base,
      tables: base.tables.map((table) => {
        if (table.name === 'orders') {
          return {
            ...table,
            foreignKeys: [
              {
                fromColumn: 'account_id',
                toCatalog: null,
                toDb: null,
                toTable: 'accounts',
                toColumn: 'id',
                constraintName: 'orders_account_id_fkey',
              },
            ],
          };
        }
        if (table.name === 'accounts') {
          return {
            ...table,
            columns: table.columns.map((column) => (column.name === 'id' ? { ...column, primaryKey: true } : column)),
          };
        }
        return table;
      }),
    };

    const { relationships } = snapshotToKtxEnrichedSchema(withForeignKey);

    expect(relationships).toEqual([
      {
        id: 'orders:(orders.account_id)->accounts:(accounts.id)',
        source: 'formal',
        from: {
          tableId: 'orders',
          columnIds: ['orders.account_id'],
          table: { catalog: null, db: null, name: 'orders' },
          columns: ['account_id'],
        },
        to: {
          tableId: 'accounts',
          columnIds: ['accounts.id'],
          table: { catalog: null, db: null, name: 'accounts' },
          columns: ['id'],
        },
        relationshipType: 'many_to_one',
        confidence: 1,
        isPrimaryKeyReference: true,
      },
    ]);
  });

  // A caller-supplied snapshot is returned as-is and the connector is never
  // asked to introspect.
  it('uses the supplied snapshot without calling connector.introspect', async () => {
    const fakeConnector = connector();
    const introspectMock = vi.mocked(fakeConnector.introspect);

    const { snapshot: returnedSnapshot } = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'structural',
      connector: fakeConnector,
      snapshot,
      context: { runId: 'scan-run-snapshot' },
      providers: null,
    });

    expect(returnedSnapshot).toEqual(snapshot);
    expect(introspectMock).not.toHaveBeenCalled();
  });

  // With no snapshot in the input, the connector is introspected exactly once.
  it('falls back to connector.introspect when no snapshot is supplied', async () => {
    const fakeConnector = connector();

    const { snapshot: introspectedSnapshot } = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'structural',
      connector: fakeConnector,
      context: { runId: 'scan-run-introspect' },
      providers: null,
    });

    expect(introspectedSnapshot).toEqual(snapshot);
    expect(fakeConnector.introspect).toHaveBeenCalledTimes(1);
  });

  // The connector claims `mysql` while the supplied snapshot says `postgres`;
  // the run must reject with the exact mismatch message.
  it('fails when connector driver and snapshot driver differ', async () => {
    const mysqlConnector: KtxScanConnector = { ...connector(), driver: 'mysql' };

    const pendingRun = runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'relationships',
      detectRelationships: true,
      connector: mysqlConnector,
      snapshot,
      context: { runId: 'scan-run-driver-mismatch' },
      providers: null,
    });

    await expect(pendingRun).rejects.toThrow(
      'ktx scan connector driver "mysql" does not match snapshot driver "postgres" for connection "warehouse"',
    );
  });

  // The default fake connector has no `executeReadOnly`, so statistical
  // validation is skipped with a recoverable warning and the one candidate
  // lands in `review`.
  it('runs deterministic relationship detection for relationship scans', async () => {
    const { summary, relationships, warnings } = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'relationships',
      detectRelationships: true,
      connector: connector(),
      context: { runId: 'scan-run-1' },
      providers: null,
    });

    expect(summary).toMatchObject({
      deterministicRelationships: 'completed',
      llmRelationshipValidation: 'skipped',
      embeddings: 'skipped',
    });
    expect(relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
    expect(summary.statisticalValidation).toBe('skipped');
    expect(warnings).toContainEqual({
      code: 'relationship_validation_failed',
      message: 'KTX scan connector advertises readOnlySql but does not expose executeReadOnly',
      recoverable: true,
      metadata: { capability: 'readOnlySql' },
    });
  });

  // Seeds a real in-memory SQLite database so the connector can provide SQL
  // evidence; orders.account_id -> accounts.id should then be accepted.
  it('runs relationship discovery with connector SQL evidence', async () => {
    const sqlite = new InMemorySqliteExecutor();
    try {
      sqlite.db.exec(`
        CREATE TABLE accounts (id INTEGER NOT NULL);
        CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
        INSERT INTO accounts (id) VALUES (1), (2);
        INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
      `);
      const sqliteConnector = {
        ...connector(),
        driver: 'sqlite' as const,
        capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
        introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
        executeReadOnly: sqlite.executeReadOnly.bind(sqlite),
      };
      // Matcher for one side of a resolved relationship.
      const endpoint = (tableName: string, columnName: string) =>
        expect.objectContaining({ table: expect.objectContaining({ name: tableName }), columns: [columnName] });

      const outcome = await runLocalScanEnrichment({
        connectionId: 'warehouse',
        mode: 'relationships',
        detectRelationships: true,
        connector: sqliteConnector,
        context: { runId: 'scan-run-relationship-discovery' },
        providers: null,
      });

      expect(outcome.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
      expect(outcome.summary.statisticalValidation).toBe('completed');
      expect(outcome.relationshipProfile).toMatchObject({ sqlAvailable: true });
      expect(outcome.resolvedRelationships).toEqual([
        expect.objectContaining({
          status: 'accepted',
          from: endpoint('orders', 'account_id'),
          to: endpoint('accounts', 'id'),
        }),
      ]);
      expect(outcome.relationshipUpdate?.accepted).toHaveLength(1);
    } finally {
      sqlite.close();
    }
  });

  // With `llmProposals: false`, the LLM runtime must never be invoked even
  // though providers are configured.
  it('honors scan relationship config when LLM proposals are disabled', async () => {
    const deterministic = createDeterministicLocalScanEnrichmentProviders();
    const generateObjectSpy = vi.fn();

    const { summary } = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'relationships',
      detectRelationships: true,
      connector: connector(),
      context: { runId: 'scan-run-llm-disabled' },
      providers: {
        ...deterministic,
        llmRuntime: {
          ...deterministic.llmRuntime,
          generateObject: generateObjectSpy as never,
        },
      },
      relationshipSettings: {
        ...buildDefaultKtxProjectConfig().scan.relationships,
        llmProposals: false,
        maxLlmTablesPerBatch: 40,
      },
    });

    expect(summary.llmRelationshipValidation).toBe('skipped');
    expect(generateObjectSpy).not.toHaveBeenCalled();
  });

  // `enabled: false` in the relationship settings turns every relationship
  // stage off and leaves all relationship outputs empty or null.
  it('skips relationship detection when scan relationships are disabled', async () => {
    const relationshipsDisabled = {
      ...buildDefaultKtxProjectConfig().scan.relationships,
      enabled: false,
    };

    const outcome = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      connector: connector(),
      context: { runId: 'disabled-relationships' },
      providers: createDeterministicLocalScanEnrichmentProviders(),
      relationshipSettings: relationshipsDisabled,
    });
    const { summary } = outcome;

    expect(summary.deterministicRelationships).toBe('skipped');
    expect(summary.statisticalValidation).toBe('skipped');
    expect(summary.llmRelationshipValidation).toBe('skipped');
    expect(outcome.relationships).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
    expect(outcome.relationshipUpdate).toBeNull();
    expect(outcome.relationshipProfile).toBeNull();
    expect(outcome.resolvedRelationships).toBeNull();
  });

  // Table sampling always throws; the run should still finish, surface
  // warnings, and log through the logger passed in the scan context.
  it('forwards context.logger and emits warnings when sampleTable fails repeatedly', async () => {
    const alwaysFailingSample = vi.fn(async () => {
      throw new Error('pool: ECONNRESET');
    });
    const failingConnector: KtxScanConnector = { ...connector(), sampleTable: alwaysFailingSample };
    const logger = {
      debug: vi.fn(),
      info: vi.fn(),
      warn: vi.fn(),
      error: vi.fn(),
    };

    const { warnings } = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: false,
      connector: failingConnector,
      context: { runId: 'scan-run-warnings', logger },
      providers: createDeterministicLocalScanEnrichmentProviders(),
    });

    const warningCodes = warnings.map(({ code }) => code);
    expect(warningCodes).toContain('sampling_failed');
    expect(warningCodes).toContain('description_fallback_used');
    expect(warnings.some(({ table }) => table === 'customers')).toBe(true);
    expect(logger.warn).toHaveBeenCalled();
    expect(logger.error).toHaveBeenCalled();
    // Two tables, each contributing sampling_failed + description_fallback_used: at least 4 warnings.
    expect(warnings.length).toBeGreaterThanOrEqual(4);
    // Three sampling attempts per table across two tables: 6 calls.
    expect(failingConnector.sampleTable).toHaveBeenCalledTimes(6);
  });

  // The deterministic providers carry no embedding port here, so descriptions
  // complete while embeddings are skipped; the snapshot itself is unchanged.
  it('runs configured deterministic enrichment with descriptions and no embeddings', async () => {
    const { summary, embeddingUpdates, snapshot: resultSnapshot, relationships } = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: true,
      connector: connector(),
      context: { runId: 'scan-run-2' },
      providers: createDeterministicLocalScanEnrichmentProviders(),
    });

    expect(summary).toMatchObject({
      dataDictionary: 'completed',
      tableDescriptions: 'completed',
      columnDescriptions: 'completed',
      embeddings: 'skipped',
      deterministicRelationships: 'completed',
    });
    expect(embeddingUpdates).toEqual([]);
    expect(resultSnapshot).toEqual(snapshot);
    expect(relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
  });

  // Runs enrichment over eight single-column tables and tracks how many
  // `sampleTable` calls are in flight at once; the observed peak must be 4.
  it('generates batched table descriptions with bounded table-level concurrency', async () => {
    // Eight structurally identical tables named table_1 .. table_8.
    const concurrentSnapshot: KtxSchemaSnapshot = {
      ...snapshot,
      tables: Array.from({ length: 8 }, (_, index) => ({
        catalog: null,
        db: 'public',
        name: `table_${index + 1}`,
        kind: 'table' as const,
        comment: null,
        estimatedRows: 2,
        foreignKeys: [],
        columns: [
          {
            name: 'id',
            nativeType: 'integer',
            normalizedType: 'integer',
            dimensionType: 'number' as const,
            nullable: false,
            primaryKey: true,
            comment: null,
          },
        ],
      })),
    };
    // In-flight counter and its high-water mark, updated by the sampleTable mock.
    let activeTableSamples = 0;
    let maxActiveTableSamples = 0;
    const scanConnector = {
      ...connector(),
      introspect: vi.fn(async () => concurrentSnapshot),
      sampleColumn: vi.fn(async () => ({
        values: ['1'],
        nullCount: 0,
        distinctCount: 1,
      })),
      sampleTable: vi.fn(async () => {
        activeTableSamples += 1;
        maxActiveTableSamples = Math.max(maxActiveTableSamples, activeTableSamples);
        // Hold the call open briefly so overlapping calls are counted together.
        await new Promise((resolve) => setTimeout(resolve, 10));
        activeTableSamples -= 1;
        return {
          headers: ['id'],
          rows: [[1]],
          totalRows: 1,
        };
      }),
    };
    // Relationship detection is switched off for this run.
    const settings = {
      ...buildDefaultKtxProjectConfig().scan.relationships,
      enabled: false,
    };

    await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      connector: scanConnector,
      context: { runId: 'scan-run-concurrent-descriptions' },
      providers: createDeterministicLocalScanEnrichmentProviders(),
      relationshipSettings: settings,
    });

    expect(maxActiveTableSamples).toBe(4);
    expect(scanConnector.sampleColumn).not.toHaveBeenCalled();
  });

  // Captures every progress update and checks that the countable stages
  // report "n/total" style transient messages.
  it('reports enrichment progress for countable stages', async () => {
    type ProgressEvent = { progress: number; message?: string; transient?: boolean };
    const recorded: ProgressEvent[] = [];
    const progress = {
      async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
        const transient = options?.transient;
        recorded.push({ progress: progressValue, message, transient });
      },
      // Sub-phases report into the same recorder.
      startPhase() {
        return progress;
      },
    };
    const deterministic = createDeterministicLocalScanEnrichmentProviders();

    await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: true,
      connector: connector(),
      context: { runId: 'scan-run-progress', progress },
      providers: {
        ...deterministic,
        embedding: fakeScanEmbedding({ dimensions: 6 }),
      },
    });

    const expectedEvents = [
      expect.objectContaining({ message: 'Generating descriptions 1/2 tables', transient: true }),
      expect.objectContaining({ message: 'Generating descriptions 2/2 tables', transient: true }),
      expect.objectContaining({ message: 'Building embeddings 1/1 batches', transient: true }),
      expect.objectContaining({ message: 'Detecting relationships' }),
    ];
    expect(recorded).toEqual(expect.arrayContaining(expectedEvents));
  });

  // Checks that the "Loading enrichment schema snapshot" progress message has
  // been emitted by the time the connector is asked to introspect.
  it('reports progress before enrichment connector introspection starts', async () => {
    const events: Array<{ progress: number; message?: string; transient?: boolean }> = [];
    const progress = {
      async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
        events.push({ progress: progressValue, message, transient: options?.transient });
      },
      startPhase() {
        return progress;
      },
    };
    // One entry per introspect call: the progress messages seen so far at that moment.
    const messagesSeenPerIntrospectCall: Array<Array<string | undefined>> = [];
    const scanConnector = {
      ...connector(),
      introspect: vi.fn(async () => {
        // Record instead of asserting here: an assertion error thrown inside
        // the mock would reach the code under test as a connector failure,
        // where it could be caught or wrapped instead of failing the test.
        messagesSeenPerIntrospectCall.push(events.map((event) => event.message));
        return snapshot;
      }),
    };

    await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'relationships',
      detectRelationships: true,
      connector: scanConnector,
      context: { runId: 'scan-run-progress-before-introspection', progress },
      providers: null,
    });

    expect(scanConnector.introspect).toHaveBeenCalled();
    expect(messagesSeenPerIntrospectCall.length).toBeGreaterThan(0);
    for (const messagesSeen of messagesSeenPerIntrospectCall) {
      expect(messagesSeen).toContain('Loading enrichment schema snapshot');
    }
  });

  // One table with five columns and an embedding port capped at two texts per
  // batch: the five embedding inputs must arrive as batches of 2, 2 and 1.
  it('splits enrichment embedding requests by provider batch size', async () => {
    const metricColumns = Array.from({ length: 5 }, (_, index) => ({
      name: `metric_${index + 1}`,
      nativeType: 'integer',
      normalizedType: 'integer',
      dimensionType: 'number' as const,
      nullable: false,
      primaryKey: false,
      comment: `Metric ${index + 1}`,
    }));
    const wideSnapshot: KtxSchemaSnapshot = {
      ...snapshot,
      tables: [
        {
          catalog: null,
          db: 'public',
          name: 'wide_orders',
          kind: 'table',
          comment: 'Wide order facts',
          estimatedRows: 3,
          foreignKeys: [],
          columns: metricColumns,
        },
      ],
    };
    const wideConnector = {
      ...connector(),
      introspect: vi.fn(async () => wideSnapshot),
    };
    const { llmRuntime } = createDeterministicLocalScanEnrichmentProviders();
    // Rejects any oversized batch so a missing split fails loudly.
    const embedBatch = vi.fn(async (texts: string[]) => {
      if (texts.length > 2) {
        throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`);
      }
      return texts.map((_, index) => [index, index + 1, index + 2]);
    });

    const { embeddingUpdates } = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: false,
      connector: wideConnector,
      context: { runId: 'scan-run-batched-embeddings' },
      providers: {
        llmRuntime,
        embedding: {
          dimensions: 3,
          maxBatchSize: 2,
          embedBatch,
        },
      },
    });

    expect(embeddingUpdates).toHaveLength(5);
    const batchSizes = embedBatch.mock.calls.map(([texts]) => texts.length);
    expect(batchSizes).toEqual([2, 2, 1]);
  });

  // Runs the same enrichment twice against one state store. The second run
  // must resume every stage and never touch the LLM runtime or embedding port.
  it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
    const stateStore = memoryEnrichmentStateStore();
    const sharedConnector = connector();
    const providers = {
      ...createDeterministicLocalScanEnrichmentProviders(),
      embedding: fakeScanEmbedding({ dimensions: 6 }),
    };
    const allStages = ['descriptions', 'embeddings', 'relationships'];

    const initialRun = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: true,
      connector: sharedConnector,
      context: { runId: 'scan-run-resume-1' },
      providers,
      stateStore,
      syncId: 'sync-resume-1',
      providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
    });

    // Spies are installed only now, so they observe the second run alone.
    const generateObjectSpy = vi.spyOn(providers.llmRuntime, 'generateObject');
    const embedBatchSpy = vi.spyOn(providers.embedding, 'embedBatch');
    const resumedRun = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: true,
      connector: sharedConnector,
      context: { runId: 'scan-run-resume-1' },
      providers,
      stateStore,
      syncId: 'sync-resume-1',
      providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
    });

    expect(initialRun.state.completedStages).toEqual(allStages);
    expect(initialRun.state.resumedStages).toEqual([]);
    expect(resumedRun.state.resumedStages).toEqual(allStages);
    expect(resumedRun.state.completedStages).toEqual(allStages);
    expect(generateObjectSpy).not.toHaveBeenCalled();
    expect(embedBatchSpy).not.toHaveBeenCalled();
    expect(resumedRun.descriptionUpdates).toEqual(initialRun.descriptionUpdates);
    expect(resumedRun.embeddingUpdates).toEqual(initialRun.embeddingUpdates);
    expect(resumedRun.relationships).toEqual(initialRun.relationships);
  });

  // Same run id and state store, but the second run introspects a different
  // snapshot: nothing may be resumed and the LLM runtime must run again.
  it('does not reuse completed stages when the snapshot changes', async () => {
    const stateStore = memoryEnrichmentStateStore();
    const providers = {
      ...createDeterministicLocalScanEnrichmentProviders(),
      embedding: fakeScanEmbedding({ dimensions: 6 }),
    };
    const originalConnector = connector();

    await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: false,
      connector: originalConnector,
      context: { runId: 'scan-run-resume-hash' },
      providers,
      stateStore,
      syncId: 'sync-resume-hash',
      providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
    });

    const [customersTable] = snapshot.tables;
    if (!customersTable) {
      throw new Error('Expected test snapshot table');
    }
    // NOTE(review): the fixture's first table is already named 'customers', so
    // the `name` override below changes nothing; the snapshot differs from the
    // first run only because the `orders` table is dropped. Confirm whether a
    // rename was intended.
    const changedSnapshot = {
      ...snapshot,
      tables: [{ ...customersTable, name: 'customers' }],
    };
    const changedConnector = {
      ...connector(),
      introspect: vi.fn(async () => changedSnapshot),
    };
    const generateObjectSpy = vi.spyOn(providers.llmRuntime, 'generateObject');

    const { state } = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: false,
      connector: changedConnector,
      context: { runId: 'scan-run-resume-hash' },
      providers,
      stateStore,
      syncId: 'sync-resume-hash',
      providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
    });

    expect(state.resumedStages).toEqual([]);
    expect(state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
    expect(generateObjectSpy).toHaveBeenCalled();
  });

  // `mode: 'enriched'` with `providers: null`: description and embedding
  // stages are skipped with a warning, while relationship discovery still runs
  // against the SQLite-backed connector.
  it('runs providerless enriched scans as relationship-only discovery enrichment', async () => {
    const sqlite = new InMemorySqliteExecutor();
    try {
      sqlite.db.exec(`
        CREATE TABLE accounts (id INTEGER NOT NULL);
        CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
        INSERT INTO accounts (id) VALUES (1), (2);
        INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
      `);
      const sqliteConnector = {
        ...connector(),
        driver: 'sqlite' as const,
        capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
        introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
        executeReadOnly: sqlite.executeReadOnly.bind(sqlite),
      };
      // Matcher for one side of a resolved relationship.
      const endpoint = (tableName: string, columnName: string) =>
        expect.objectContaining({ table: expect.objectContaining({ name: tableName }), columns: [columnName] });

      const outcome = await runLocalScanEnrichment({
        connectionId: 'warehouse',
        mode: 'enriched',
        detectRelationships: false,
        connector: sqliteConnector,
        context: { runId: 'scan-run-providerless-enriched' },
        providers: null,
      });

      expect(outcome.summary).toEqual({
        dataDictionary: 'skipped',
        tableDescriptions: 'skipped',
        columnDescriptions: 'skipped',
        embeddings: 'skipped',
        deterministicRelationships: 'completed',
        llmRelationshipValidation: 'skipped',
        statisticalValidation: 'completed',
      });
      expect(outcome.descriptionUpdates).toEqual([]);
      expect(outcome.embeddingUpdates).toEqual([]);
      expect(outcome.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
      expect(outcome.relationshipUpdate?.accepted).toHaveLength(1);
      expect(outcome.relationshipProfile).toMatchObject({ sqlAvailable: true });
      expect(outcome.resolvedRelationships).toEqual([
        expect.objectContaining({
          status: 'accepted',
          from: endpoint('orders', 'account_id'),
          to: endpoint('accounts', 'id'),
        }),
      ]);
      expect(outcome.warnings).toContainEqual({
        code: 'scan_enrichment_backend_not_configured',
        message:
          'Skipping description and embedding enrichment because scan.enrichment.mode is not configured; relationship discovery still ran.',
        recoverable: true,
        metadata: {
          skippedStages: ['descriptions', 'embeddings'],
          relationshipDetection: true,
        },
      });
    } finally {
      sqlite.close();
    }
  });

});
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								import Database from 'better-sqlite3';
 								import { describe, expect, it, vi } from 'vitest';
-												test: split cli tests from source tree (#216)

* feat(cli): define full warehouse dialect contract

* test(cli): keep dialect edge tests focused

* fix(cli): stabilize dialect contract foundation

* refactor(connectors): own read-only query preparation

* refactor(connectors): resolve dialects through registry

* refactor(connectors): keep concrete dialect classes internal

* chore(workspace): enforce dialect import boundary

* refactor(cli): resolve relationship dialect at scan boundary

* refactor(cli): use dialect display parsing for entity details

* refactor(cli): use dialect display parsing for warehouse catalog

* refactor(cli): use dialect SQL in relationship workflows

* test(cli): verify solid dialect scan workflow closure

* test: split cli tests from source tree

* refactor(cli): standardize BigQuery scope listing

* feat(sqlite): implement connector scope listing

* test(connectors): cover required table listing

* feat(cli): add warehouse driver registry

* refactor(setup): route scope discovery through driver registry

* refactor(cli): route local query execution through driver registry

* refactor(historic-sql): route dialect support through driver registry

* refactor(cli): test warehouse connections through driver registry

* fix(cli): close driver registry type export gaps

* Improve setup daemon diagnostics

* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback

Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.

* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match

The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.

Align the picker boundary with the canonical 3-level KtxTableRef:

- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
  resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
  (resolveEnabledTables already accepts the 3-part shape) and
  schemasFromEnabledTables now goes through parseDottedTableEntry so it
  recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
  reuse.

Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).

* fix(cli): allow debug telemetry under opt-out env
											
										
										
											2026-05-26 08:49:05 +02:00
+								import { buildDefaultKtxProjectConfig } from '../../../src/context/project/config.js';
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								import type {
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								  KtxScanEnrichmentCompletedStage,
 								  KtxScanEnrichmentFailedStage,
 								  KtxScanEnrichmentStageLookup,
 								  KtxScanEnrichmentStateStore,
-												test: split cli tests from source tree (#216)

* feat(cli): define full warehouse dialect contract

* test(cli): keep dialect edge tests focused

* fix(cli): stabilize dialect contract foundation

* refactor(connectors): own read-only query preparation

* refactor(connectors): resolve dialects through registry

* refactor(connectors): keep concrete dialect classes internal

* chore(workspace): enforce dialect import boundary

* refactor(cli): resolve relationship dialect at scan boundary

* refactor(cli): use dialect display parsing for entity details

* refactor(cli): use dialect display parsing for warehouse catalog

* refactor(cli): use dialect SQL in relationship workflows

* test(cli): verify solid dialect scan workflow closure

* test: split cli tests from source tree

* refactor(cli): standardize BigQuery scope listing

* feat(sqlite): implement connector scope listing

* test(connectors): cover required table listing

* feat(cli): add warehouse driver registry

* refactor(setup): route scope discovery through driver registry

* refactor(cli): route local query execution through driver registry

* refactor(historic-sql): route dialect support through driver registry

* refactor(cli): test warehouse connections through driver registry

* fix(cli): close driver registry type export gaps

* Improve setup daemon diagnostics

* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback

Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.

* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match

The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.

Align the picker boundary with the canonical 3-level KtxTableRef:

- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
  resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
  (resolveEnabledTables already accepts the 3-part shape) and
  schemasFromEnabledTables now goes through parseDottedTableEntry so it
  recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
  reuse.

Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).

* fix(cli): allow debug telemetry under opt-out env
											
										
										
											2026-05-26 08:49:05 +02:00
+								} from '../../../src/context/scan/enrichment-state.js';
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								import {
 								  createDeterministicLocalScanEnrichmentProviders,
 								  runLocalScanEnrichment,
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								  snapshotToKtxEnrichedSchema,
-												test: split cli tests from source tree (#216)

* feat(cli): define full warehouse dialect contract

* test(cli): keep dialect edge tests focused

* fix(cli): stabilize dialect contract foundation

* refactor(connectors): own read-only query preparation

* refactor(connectors): resolve dialects through registry

* refactor(connectors): keep concrete dialect classes internal

* chore(workspace): enforce dialect import boundary

* refactor(cli): resolve relationship dialect at scan boundary

* refactor(cli): use dialect display parsing for entity details

* refactor(cli): use dialect display parsing for warehouse catalog

* refactor(cli): use dialect SQL in relationship workflows

* test(cli): verify solid dialect scan workflow closure

* test: split cli tests from source tree

* refactor(cli): standardize BigQuery scope listing

* feat(sqlite): implement connector scope listing

* test(connectors): cover required table listing

* feat(cli): add warehouse driver registry

* refactor(setup): route scope discovery through driver registry

* refactor(cli): route local query execution through driver registry

* refactor(historic-sql): route dialect support through driver registry

* refactor(cli): test warehouse connections through driver registry

* fix(cli): close driver registry type export gaps

* Improve setup daemon diagnostics

* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback

Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.

* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match

The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.

Align the picker boundary with the canonical 3-level KtxTableRef:

- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
  resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
  (resolveEnabledTables already accepts the 3-part shape) and
  schemasFromEnabledTables now goes through parseDottedTableEntry so it
  recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
  reuse.

Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).

* fix(cli): allow debug telemetry under opt-out env
											
										
										
											2026-05-26 08:49:05 +02:00
+								} from '../../../src/context/scan/local-enrichment.js';
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								import {
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								  createKtxConnectorCapabilities,
 								  type KtxQueryResult,
 								  type KtxReadOnlyQueryInput,
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								  type KtxEmbeddingPort,
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								  type KtxScanConnector,
 								  type KtxScanContext,
 								  type KtxSchemaSnapshot,
-												test: split cli tests from source tree (#216)

* feat(cli): define full warehouse dialect contract

* test(cli): keep dialect edge tests focused

* fix(cli): stabilize dialect contract foundation

* refactor(connectors): own read-only query preparation

* refactor(connectors): resolve dialects through registry

* refactor(connectors): keep concrete dialect classes internal

* chore(workspace): enforce dialect import boundary

* refactor(cli): resolve relationship dialect at scan boundary

* refactor(cli): use dialect display parsing for entity details

* refactor(cli): use dialect display parsing for warehouse catalog

* refactor(cli): use dialect SQL in relationship workflows

* test(cli): verify solid dialect scan workflow closure

* test: split cli tests from source tree

* refactor(cli): standardize BigQuery scope listing

* feat(sqlite): implement connector scope listing

* test(connectors): cover required table listing

* feat(cli): add warehouse driver registry

* refactor(setup): route scope discovery through driver registry

* refactor(cli): route local query execution through driver registry

* refactor(historic-sql): route dialect support through driver registry

* refactor(cli): test warehouse connections through driver registry

* fix(cli): close driver registry type export gaps

* Improve setup daemon diagnostics

* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback

Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.

* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match

The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.

Align the picker boundary with the canonical 3-level KtxTableRef:

- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
  resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
  (resolveEnabledTables already accepts the 3-part shape) and
  schemasFromEnabledTables now goes through parseDottedTableEntry so it
  recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
  reuse.

Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).

* fix(cli): allow debug telemetry under opt-out env
											
										
										
											2026-05-26 08:49:05 +02:00
+								} from '../../../src/context/scan/types.js';
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								function fakeScanEmbedding(options: { dimensions: number; maxBatchSize?: number }): KtxEmbeddingPort {
 								  return {
 								    dimensions: options.dimensions,
 								    maxBatchSize: options.maxBatchSize ?? 64,
 								    async embedBatch(texts) {
 								      return texts.map((_, textIndex) =>
 								        Array.from({ length: options.dimensions }, (__, dimensionIndex) => textIndex + dimensionIndex),
 								      );
 								    },
 								  };
 								}
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								const snapshot: KtxSchemaSnapshot = {
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								  connectionId: 'warehouse',
 								  driver: 'postgres',
 								  extractedAt: '2026-04-29T12:00:00.000Z',
 								  scope: { schemas: ['public'] },
 								  metadata: {},
 								  tables: [
 								    {
 								      catalog: null,
 								      db: 'public',
 								      name: 'customers',
 								      kind: 'table',
 								      comment: 'Customer accounts',
 								      estimatedRows: 2,
 								      foreignKeys: [],
 								      columns: [
 								        {
 								          name: 'id',
 								          nativeType: 'integer',
 								          normalizedType: 'integer',
 								          dimensionType: 'number',
 								          nullable: false,
 								          primaryKey: true,
 								          comment: 'Customer id',
 								        },
 								      ],
 								    },
 								    {
 								      catalog: null,
 								      db: 'public',
 								      name: 'orders',
 								      kind: 'table',
 								      comment: 'Customer orders',
 								      estimatedRows: 3,
 								      foreignKeys: [],
 								      columns: [
 								        {
 								          name: 'id',
 								          nativeType: 'integer',
 								          normalizedType: 'integer',
 								          dimensionType: 'number',
 								          nullable: false,
 								          primaryKey: true,
 								          comment: 'Order id',
 								        },
 								        {
 								          name: 'customer_id',
 								          nativeType: 'integer',
 								          normalizedType: 'integer',
 								          dimensionType: 'number',
 								          nullable: false,
 								          primaryKey: false,
 								          comment: 'Customer id',
 								        },
 								      ],
 								    },
 								  ],
 								};
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								function connector(): KtxScanConnector {
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								  return {
 								    id: 'test:warehouse',
 								    driver: 'postgres',
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								    capabilities: createKtxConnectorCapabilities({
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								      tableSampling: true,
 								      columnSampling: true,
 								      readOnlySql: true,
 								      columnStats: true,
 								    }),
 								    introspect: vi.fn(async () => snapshot),
-												test: split cli tests from source tree (#216)

* feat(cli): define full warehouse dialect contract

* test(cli): keep dialect edge tests focused

* fix(cli): stabilize dialect contract foundation

* refactor(connectors): own read-only query preparation

* refactor(connectors): resolve dialects through registry

* refactor(connectors): keep concrete dialect classes internal

* chore(workspace): enforce dialect import boundary

* refactor(cli): resolve relationship dialect at scan boundary

* refactor(cli): use dialect display parsing for entity details

* refactor(cli): use dialect display parsing for warehouse catalog

* refactor(cli): use dialect SQL in relationship workflows

* test(cli): verify solid dialect scan workflow closure

* test: split cli tests from source tree

* refactor(cli): standardize BigQuery scope listing

* feat(sqlite): implement connector scope listing

* test(connectors): cover required table listing

* feat(cli): add warehouse driver registry

* refactor(setup): route scope discovery through driver registry

* refactor(cli): route local query execution through driver registry

* refactor(historic-sql): route dialect support through driver registry

* refactor(cli): test warehouse connections through driver registry

* fix(cli): close driver registry type export gaps

* Improve setup daemon diagnostics

* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback

Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.

* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match

The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.

Align the picker boundary with the canonical 3-level KtxTableRef:

- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
  resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
  (resolveEnabledTables already accepts the 3-part shape) and
  schemasFromEnabledTables now goes through parseDottedTableEntry so it
  recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
  reuse.

Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).

* fix(cli): allow debug telemetry under opt-out env
											
										
										
											2026-05-26 08:49:05 +02:00
+								    listSchemas: vi.fn(async () => []),
 								    listTables: vi.fn(async () => []),
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    sampleTable: vi.fn(async () => ({
 								      headers: ['id', 'customer_id'],
 								      rows: [[1, 10]],
 								      totalRows: 1,
 								    })),
 								    sampleColumn: vi.fn(async () => ({
 								      values: ['10', '11'],
 								      nullCount: 0,
 								      distinctCount: 2,
 								    })),
 								  };
 								}
 								class InMemorySqliteExecutor {
 								  readonly db = new Database(':memory:');
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								  executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    const rows = this.db.prepare(input.sql).all() as Record<string, unknown>[];
 								    const headers = Object.keys(rows[0] ?? {});
 								    return Promise.resolve({
 								      headers,
 								      rows: rows.map((row) => headers.map((header) => row[header])),
 								      totalRows: rows.length,
 								      rowCount: rows.length,
 								    });
 								  }
 								  close(): void {
 								    this.db.close();
 								  }
 								}
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								function noDeclaredRelationshipSnapshot(): KtxSchemaSnapshot {
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								  return {
 								    connectionId: 'warehouse',
 								    driver: 'sqlite',
 								    extractedAt: '2026-05-07T00:00:00.000Z',
 								    scope: {},
 								    metadata: {},
 								    tables: [
 								      {
 								        catalog: null,
 								        db: null,
 								        name: 'accounts',
 								        kind: 'table',
 								        comment: null,
 								        estimatedRows: 2,
 								        foreignKeys: [],
 								        columns: [
 								          {
 								            name: 'id',
 								            nativeType: 'INTEGER',
 								            normalizedType: 'integer',
 								            dimensionType: 'number',
 								            nullable: false,
 								            primaryKey: false,
 								            comment: null,
 								          },
 								        ],
 								      },
 								      {
 								        catalog: null,
 								        db: null,
 								        name: 'orders',
 								        kind: 'table',
 								        comment: null,
 								        estimatedRows: 3,
 								        foreignKeys: [],
 								        columns: [
 								          {
 								            name: 'id',
 								            nativeType: 'INTEGER',
 								            normalizedType: 'integer',
 								            dimensionType: 'number',
 								            nullable: false,
 								            primaryKey: false,
 								            comment: null,
 								          },
 								          {
 								            name: 'account_id',
 								            nativeType: 'INTEGER',
 								            normalizedType: 'integer',
 								            dimensionType: 'number',
 								            nullable: false,
 								            primaryKey: false,
 								            comment: null,
 								          },
 								        ],
 								      },
 								    ],
 								  };
 								}
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								function memoryEnrichmentStateStore(): KtxScanEnrichmentStateStore {
 								  const records = new Map<string, KtxScanEnrichmentCompletedStage | KtxScanEnrichmentFailedStage>();
 								  const key = (input: Pick<KtxScanEnrichmentStageLookup, 'runId' | 'stage'>) => `${input.runId}:${input.stage}`;
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								  return {
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								    async findCompletedStage<TOutput>(input: KtxScanEnrichmentStageLookup) {
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								      const record = records.get(key(input));
 								      if (!record || record.status !== 'completed' || record.inputHash !== input.inputHash) {
 								        return null;
 								      }
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								      return record as KtxScanEnrichmentCompletedStage<TOutput>;
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    },
 								    async saveCompletedStage(input) {
 								      records.set(key(input), {
 								        ...input,
 								        status: 'completed',
 								        errorMessage: null,
 								      });
 								    },
 								    async saveFailedStage(input) {
 								      records.set(key(input), {
 								        ...input,
 								        status: 'failed',
 								        output: null,
 								      });
 								    },
 								    async listRunStages(runId) {
 								      return [...records.values()].filter((record) => record.runId === runId);
 								    },
 								  };
 								}
 								describe('local scan enrichment', () => {
 								  it('maps a scan snapshot into relationship detector schema', () => {
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								    const schema = snapshotToKtxEnrichedSchema(snapshot);
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
 								    expect(schema.connectionId).toBe('warehouse');
 								    expect(schema.tables).toHaveLength(2);
 								    expect(schema.tables[1]?.columns.map((column) => column.name)).toEqual(['id', 'customer_id']);
 								    expect(schema.tables[1]?.columns[1]).toMatchObject({
 								      id: 'public.orders.customer_id',
 								      tableId: 'public.orders',
 								      primaryKey: false,
 								      sampleValues: null,
 								      embedding: null,
 								    });
 								  });
 								  it('maps snapshot foreign keys into formal schema relationships', () => {
 								    const source = noDeclaredRelationshipSnapshot();
 								    const snapshotWithForeignKey = {
 								      ...source,
 								      tables: source.tables.map((table) =>
 								        table.name === 'orders'
 								          ? {
 								              ...table,
 								              foreignKeys: [
 								                {
 								                  fromColumn: 'account_id',
 								                  toCatalog: null,
 								                  toDb: null,
 								                  toTable: 'accounts',
 								                  toColumn: 'id',
 								                  constraintName: 'orders_account_id_fkey',
 								                },
 								              ],
 								            }
 								          : table.name === 'accounts'
 								            ? {
 								                ...table,
 								                columns: table.columns.map((column) =>
 								                  column.name === 'id' ? { ...column, primaryKey: true } : column,
 								                ),
 								              }
 								            : table,
 								      ),
 								    };
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								    const schema = snapshotToKtxEnrichedSchema(snapshotWithForeignKey);
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
 								    expect(schema.relationships).toEqual([
 								      {
 								        id: 'orders:(orders.account_id)->accounts:(accounts.id)',
 								        source: 'formal',
 								        from: {
 								          tableId: 'orders',
 								          columnIds: ['orders.account_id'],
 								          table: { catalog: null, db: null, name: 'orders' },
 								          columns: ['account_id'],
 								        },
 								        to: {
 								          tableId: 'accounts',
 								          columnIds: ['accounts.id'],
 								          table: { catalog: null, db: null, name: 'accounts' },
 								          columns: ['id'],
 								        },
 								        relationshipType: 'many_to_one',
 								        confidence: 1,
 								        isPrimaryKeyReference: true,
 								      },
 								    ]);
 								  });
-												fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)

* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure

Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.

- Remove the free-text Snowflake schema prompt; only pass `schema` to
  snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
  user for a comma-separated list, persist it as `schema_names`, and use
  it as both the table-list filter and the multiselect default. Applies
  to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
  documented single-schema shorthand.

* fix(snowflake): keep introspecting when primary-key discovery is denied

The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.

Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.

* fix(scan): unblock relationship discovery on Snowflake

Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:

- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
  for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
  profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
  Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
  (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
  `connector.sampleColumn` into bare locals, losing the `this` binding when
  the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
  Every sample call threw "Cannot read properties of undefined (reading
  'assertConnection')" and degraded LLM descriptions to metadata-only
  prompts. Call the methods through the connector instead.

Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.

* test(scan): cover table-ref helpers

* feat(scan): plumb tableScope through live-database introspection port

* feat(scan): apply tableScope during metadata fetch

* feat(scan): enforce table scope at fetch boundary

* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded

* feat(scan): dispatch query-history probes by connection driver

Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.

Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.

* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject

The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.

Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.

generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.

* chore(scripts): add ktx-reset.sh project-cleanup helper

Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
											
										
										
											2026-05-23 10:41:30 +02:00
+								  it('uses the supplied snapshot without calling connector.introspect', async () => {
 								    // When a snapshot is passed in, the enrichment result must carry an equal
 								    // snapshot and the connector's introspect mock must stay untouched.
 								    const stubConnector = connector();
 								    const introspectSpy = vi.mocked(stubConnector.introspect);
 								    const { snapshot: returnedSnapshot } = await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'structural',
 								      connector: stubConnector,
 								      snapshot,
 								      context: { runId: 'scan-run-snapshot' },
 								      providers: null,
 								    });
 								    expect(returnedSnapshot).toEqual(snapshot);
 								    expect(introspectSpy).not.toHaveBeenCalled();
 								  });
 								  it('falls back to connector.introspect when no snapshot is supplied', async () => {
 								    // No `snapshot` option is given here: the result's snapshot is still expected
 								    // to equal the shared fixture, and introspect must have run exactly once.
 								    const stubConnector = connector();
 								    const { snapshot: returnedSnapshot } = await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'structural',
 								      connector: stubConnector,
 								      context: { runId: 'scan-run-introspect' },
 								      providers: null,
 								    });
 								    expect(returnedSnapshot).toEqual(snapshot);
 								    expect(stubConnector.introspect).toHaveBeenCalledTimes(1);
 								  });
-												test: split cli tests from source tree (#216)

* feat(cli): define full warehouse dialect contract

* test(cli): keep dialect edge tests focused

* fix(cli): stabilize dialect contract foundation

* refactor(connectors): own read-only query preparation

* refactor(connectors): resolve dialects through registry

* refactor(connectors): keep concrete dialect classes internal

* chore(workspace): enforce dialect import boundary

* refactor(cli): resolve relationship dialect at scan boundary

* refactor(cli): use dialect display parsing for entity details

* refactor(cli): use dialect display parsing for warehouse catalog

* refactor(cli): use dialect SQL in relationship workflows

* test(cli): verify solid dialect scan workflow closure

* test: split cli tests from source tree

* refactor(cli): standardize BigQuery scope listing

* feat(sqlite): implement connector scope listing

* test(connectors): cover required table listing

* feat(cli): add warehouse driver registry

* refactor(setup): route scope discovery through driver registry

* refactor(cli): route local query execution through driver registry

* refactor(historic-sql): route dialect support through driver registry

* refactor(cli): test warehouse connections through driver registry

* fix(cli): close driver registry type export gaps

* Improve setup daemon diagnostics

* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback

Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.

* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match

The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.

Align the picker boundary with the canonical 3-level KtxTableRef:

- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
  resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
  (resolveEnabledTables already accepts the 3-part shape) and
  schemasFromEnabledTables now goes through parseDottedTableEntry so it
  recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
  reuse.

Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).

* fix(cli): allow debug telemetry under opt-out env
											
										
										
											2026-05-26 08:49:05 +02:00
+								  it('fails when connector driver and snapshot driver differ', async () => {
 								    // The base test connector is cloned with its driver overridden to 'mysql';
 								    // the run is expected to reject with the driver-mismatch message below.
 								    const mysqlConnector: KtxScanConnector = {
 								      ...connector(),
 								      driver: 'mysql',
 								    };
 								    const pendingRun = runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'relationships',
 								      detectRelationships: true,
 								      connector: mysqlConnector,
 								      snapshot,
 								      context: { runId: 'scan-run-driver-mismatch' },
 								      providers: null,
 								    });
 								    await expect(pendingRun).rejects.toThrow(
 								      'ktx scan connector driver "mysql" does not match snapshot driver "postgres" for connection "warehouse"',
 								    );
 								  });
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								  it('runs deterministic relationship detection for relationship scans', async () => {
 								    const result = await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'relationships',
 								      detectRelationships: true,
 								      connector: connector(),
 								      context: { runId: 'scan-run-1' },
 								      providers: null,
 								    });
 								    expect(result.summary).toMatchObject({
 								      deterministicRelationships: 'completed',
 								      llmRelationshipValidation: 'skipped',
 								      embeddings: 'skipped',
 								    });
 								    expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
 								    expect(result.summary.statisticalValidation).toBe('skipped');
 								    expect(result.warnings).toContainEqual({
 								      code: 'relationship_validation_failed',
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								      message: 'KTX scan connector advertises readOnlySql but does not expose executeReadOnly',
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								      recoverable: true,
 								      metadata: { capability: 'readOnlySql' },
 								    });
 								  });
 								  it('runs relationship discovery with connector SQL evidence', async () => {
 								    const executor = new InMemorySqliteExecutor();
 								    try {
 								      executor.db.exec(`
 								        CREATE TABLE accounts (id INTEGER NOT NULL);
 								        CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
 								        INSERT INTO accounts (id) VALUES (1), (2);
 								        INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
 								      `);
 								      const scanConnector = {
 								        ...connector(),
 								        driver: 'sqlite' as const,
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								        capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								        introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
 								        executeReadOnly: executor.executeReadOnly.bind(executor),
 								      };
 								      const result = await runLocalScanEnrichment({
 								        connectionId: 'warehouse',
 								        mode: 'relationships',
 								        detectRelationships: true,
 								        connector: scanConnector,
 								        context: { runId: 'scan-run-relationship-discovery' },
 								        providers: null,
 								      });
 								      expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
 								      expect(result.summary.statisticalValidation).toBe('completed');
 								      expect(result.relationshipProfile).toMatchObject({ sqlAvailable: true });
 								      expect(result.resolvedRelationships).toEqual([
 								        expect.objectContaining({
 								          status: 'accepted',
 								          from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
 								          to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
 								        }),
 								      ]);
 								      expect(result.relationshipUpdate?.accepted).toHaveLength(1);
 								    } finally {
 								      executor.close();
 								    }
 								  });
 								  it('honors scan relationship config when LLM proposals are disabled', async () => {
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								    const providers = createDeterministicLocalScanEnrichmentProviders();
-												feat: add claude-code llm backend with runtime port (#115)

* docs: revise claude-code ingest backend spec

* docs: keep claude-code spec focused on ingest

* docs: expand claude-code spec to full llm parity

* Refine claude-code backend spec after adversarial review iteration 1

* Refine claude-code backend spec after adversarial review iteration 2

* Refine claude-code backend spec after adversarial review iteration 3

* feat: recognize claude-code llm backend

* feat: add ktx llm runtime port

* feat: add claude-code llm runtime

* feat: route non-agent llm calls through runtime

* feat: run ingest agents through llm runtime

* feat: support claude-code setup and status

* test: verify claude-code backend runtime

* docs: add claude-code backend v1 runtime plan

* fix: close claude-code runtime isolation checks

* fix: warn on claude-code prompt caching during setup

* chore: verify claude-code v1 closure

* docs: add claude-code backend v1 isolation closure plan

* fix: update claude-code ingest setup guidance

* docs: add claude-code backend v1 ingest guidance closure plan

* docs: align claude-code isolation spec with sdk metadata

* test: cover claude-code host discovery metadata

* fix: tolerate claude-code host discovery metadata

* docs: clarify claude-code host discovery metadata

* docs: add claude-code auth-probe isolation fix plan

* chore: prepare kaelio ktx rc1 release

* chore: add semantic release workflow

* fix: unblock ci checks

* chore(release): 0.1.0-rc.1

* feat: add Claude Code model selection to setup

* fix: keep git maintenance attached in local repos
											
										
										
											2026-05-16 12:06:34 +02:00
+								    const generateObject = vi.fn();
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    const result = await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'relationships',
 								      detectRelationships: true,
 								      connector: connector(),
 								      context: { runId: 'scan-run-llm-disabled' },
 								      providers: {
 								        ...providers,
-												feat: add claude-code llm backend with runtime port (#115)

* docs: revise claude-code ingest backend spec

* docs: keep claude-code spec focused on ingest

* docs: expand claude-code spec to full llm parity

* Refine claude-code backend spec after adversarial review iteration 1

* Refine claude-code backend spec after adversarial review iteration 2

* Refine claude-code backend spec after adversarial review iteration 3

* feat: recognize claude-code llm backend

* feat: add ktx llm runtime port

* feat: add claude-code llm runtime

* feat: route non-agent llm calls through runtime

* feat: run ingest agents through llm runtime

* feat: support claude-code setup and status

* test: verify claude-code backend runtime

* docs: add claude-code backend v1 runtime plan

* fix: close claude-code runtime isolation checks

* fix: warn on claude-code prompt caching during setup

* chore: verify claude-code v1 closure

* docs: add claude-code backend v1 isolation closure plan

* fix: update claude-code ingest setup guidance

* docs: add claude-code backend v1 ingest guidance closure plan

* docs: align claude-code isolation spec with sdk metadata

* test: cover claude-code host discovery metadata

* fix: tolerate claude-code host discovery metadata

* docs: clarify claude-code host discovery metadata

* docs: add claude-code auth-probe isolation fix plan

* chore: prepare kaelio ktx rc1 release

* chore: add semantic release workflow

* fix: unblock ci checks

* chore(release): 0.1.0-rc.1

* feat: add Claude Code model selection to setup

* fix: keep git maintenance attached in local repos
											
										
										
											2026-05-16 12:06:34 +02:00
+								        llmRuntime: {
 								          ...providers.llmRuntime,
 								          generateObject: generateObject as never,
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								        },
 								      },
 								      relationshipSettings: {
-												fix: remove project from ktx config (#95)
											
										
										
											2026-05-14 17:39:31 +02:00
+								        ...buildDefaultKtxProjectConfig().scan.relationships,
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								        llmProposals: false,
 								        maxLlmTablesPerBatch: 40,
 								      },
 								    });
 								    expect(result.summary.llmRelationshipValidation).toBe('skipped');
-												feat: add claude-code llm backend with runtime port (#115)

* docs: revise claude-code ingest backend spec

* docs: keep claude-code spec focused on ingest

* docs: expand claude-code spec to full llm parity

* Refine claude-code backend spec after adversarial review iteration 1

* Refine claude-code backend spec after adversarial review iteration 2

* Refine claude-code backend spec after adversarial review iteration 3

* feat: recognize claude-code llm backend

* feat: add ktx llm runtime port

* feat: add claude-code llm runtime

* feat: route non-agent llm calls through runtime

* feat: run ingest agents through llm runtime

* feat: support claude-code setup and status

* test: verify claude-code backend runtime

* docs: add claude-code backend v1 runtime plan

* fix: close claude-code runtime isolation checks

* fix: warn on claude-code prompt caching during setup

* chore: verify claude-code v1 closure

* docs: add claude-code backend v1 isolation closure plan

* fix: update claude-code ingest setup guidance

* docs: add claude-code backend v1 ingest guidance closure plan

* docs: align claude-code isolation spec with sdk metadata

* test: cover claude-code host discovery metadata

* fix: tolerate claude-code host discovery metadata

* docs: clarify claude-code host discovery metadata

* docs: add claude-code auth-probe isolation fix plan

* chore: prepare kaelio ktx rc1 release

* chore: add semantic release workflow

* fix: unblock ci checks

* chore(release): 0.1.0-rc.1

* feat: add Claude Code model selection to setup

* fix: keep git maintenance attached in local repos
											
										
										
											2026-05-16 12:06:34 +02:00
+								    expect(generateObject).not.toHaveBeenCalled();
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								  });
 								  it('skips relationship detection when scan relationships are disabled', async () => {
 								    const settings = {
-												fix: remove project from ktx config (#95)
											
										
										
											2026-05-14 17:39:31 +02:00
+								      ...buildDefaultKtxProjectConfig().scan.relationships,
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								      enabled: false,
 								    };
 								    const result = await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'enriched',
 								      connector: connector(),
 								      context: { runId: 'disabled-relationships' },
 								      providers: createDeterministicLocalScanEnrichmentProviders(),
 								      relationshipSettings: settings,
 								    });
 								    expect(result.summary.deterministicRelationships).toBe('skipped');
 								    expect(result.summary.statisticalValidation).toBe('skipped');
 								    expect(result.summary.llmRelationshipValidation).toBe('skipped');
 								    expect(result.relationships).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
 								    expect(result.relationshipUpdate).toBeNull();
 								    expect(result.relationshipProfile).toBeNull();
 								    expect(result.resolvedRelationships).toBeNull();
 								  });
-												fix(context): merge overlay columns onto manifest columns by name (#94)

* fix(context): merge overlay columns onto manifest columns by name

composeOverlay was appending overlay columns to the manifest column list,
producing duplicate entries when dbt/metabase overlays declared a column
just to attach descriptions. The duplicates carried no `type`, so the
pydantic SourceDefinition rejected them at semantic-query time and broke
`ktx sl query` for every overlay-backed measure. Now overlay columns
match base columns by name (case-insensitive): same-name entries merge
onto the manifest (overlay fields win, type/role fall back to the base,
descriptions merge per source key) and only new names append.

* refactor(sl): split overlay columns from column_overrides and enforce TS/Python wire contract

Overlay sources now have two distinct collections: `columns:` for computed
columns (requiring `expr` + `type`) and `column_overrides:` for metadata
patches to inherited manifest columns. Composing or loading an overlay that
mixes the two — or references an unknown column — fails with a typed error.

Introduce `ResolvedSemanticLayerSource` / `resolvedSourceSchema` /
`toResolvedWire` as the strict shape sent to the Python engine, and add a
schema contract test that diffs Zod against the Pydantic JSON schema dumped
by `python -m semantic_layer dump-schema`. `SourceDefinition` is now
`extra="forbid"` on the Python side.

`loadAllSources` surfaces per-file load errors instead of swallowing them,
so validation/query paths can report manifest shard parse failures.

* fix(context): make scan description generation resilient and quiet

A transient sampleTable failure during ingest used to take out every
table in a connection: generateTableDescription returned a hardcoded
'Table not found' string into descriptions.ai, and KtxDescriptionGenerator
was constructed without a logger, so the failure left no trail anywhere.

- sampleTable / sampleColumn calls retry 3x with 200/400/800ms backoff,
  honouring KtxScanContext.signal via a new KtxAbortedError.
- On retry exhaustion or missing capability, table generation falls back
  to a metadata-only prompt built from column name / native type / comment
  / rawDescriptions. The column path follows the same rule -- call the
  LLM when any of samples or rawDescriptions are available; skip only
  when both are absent.
- Logger is now threaded from KtxScanContext into the generator. Failures
  emit structured KtxScanWarning entries (new description_fallback_used
  code, plus existing sampling_failed / enrichment_failed /
  connector_capability_missing). ktx scan groups warnings by code so a
  batch of identical failures collapses to one summary line plus sample.
- Returns null on failure instead of the 'Table not found' sentinel; the
  manifest writer's existing guard already skips empty descriptions, so
  schema YAML no longer carries misleading text. SCAN_MANAGED_DESCRIPTION_KEYS
  already strips stale 'ai' on merge, so existing YAML clears on next run.

Also suppress AI SDK v6 'system in messages' warning: pull system messages
out of KtxMessageBuilder.wrapSimple's output via a new splitKtxSystemMessages
helper and pass them top-level to generateText (preserves cacheControl
providerOptions on the SystemModelMessage). Agent-runner's local
splitSystemPromptMessages dedupes onto the shared helper.

* test(docs): align examples-docs assertions with revamped docs

PR #103 (setup/guide doc revamp) reworded several CLI examples and
connection labels; the assertions in scripts/examples-docs.test.mjs
still referenced the pre-revamp wording and were failing in CI on main.
Update the regexes to match the post-revamp content:

- drop the `--json` flag from the sl-query example expectation
- move the `Driver:` / `Status: ok` probe to the connection reference,
  which is where that output now lives (driver id is lowercase
  `postgres`, not the display name `PostgreSQL`)
- drop the obsolete `Install \`uv\`...` troubleshooting line
- accept `<connectionId>` everywhere; the docs no longer use the
  hyphenated `<connection-id>` form
- match the `warehouse` connection id used in the quickstart instead of
  the `postgres-warehouse` id only used in the README and setup ref

* fix(sl): skip TS/Python schema contract test when uv is unavailable

The TypeScript checks CI job does not install uv or Python, so the
module-level `execFileSync('uv', ...)` in schemas.contract.test.ts threw
ENOENT and failed the suite. Wrap the schema dump in a try/catch and
guard the describe block with `describe.skipIf` so the test skips in
environments without uv. Local dev and any CI job that has uv on PATH
still runs the cross-language contract assertion.
											
										
										
											2026-05-15 02:11:04 +02:00
+								  it('forwards context.logger and emits warnings when sampleTable fails repeatedly', async () => {
 								    const failingConnector: KtxScanConnector = {
 								      ...connector(),
 								      sampleTable: vi.fn(async () => {
 								        throw new Error('pool: ECONNRESET');
 								      }),
 								    };
 								    const logger = {
 								      debug: vi.fn(),
 								      info: vi.fn(),
 								      warn: vi.fn(),
 								      error: vi.fn(),
 								    };
 								    const result = await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'enriched',
 								      detectRelationships: false,
 								      connector: failingConnector,
 								      context: { runId: 'scan-run-warnings', logger },
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								      providers: createDeterministicLocalScanEnrichmentProviders(),
-												fix(context): merge overlay columns onto manifest columns by name (#94)

* fix(context): merge overlay columns onto manifest columns by name

composeOverlay was appending overlay columns to the manifest column list,
producing duplicate entries when dbt/metabase overlays declared a column
just to attach descriptions. The duplicates carried no `type`, so the
pydantic SourceDefinition rejected them at semantic-query time and broke
`ktx sl query` for every overlay-backed measure. Now overlay columns
match base columns by name (case-insensitive): same-name entries merge
onto the manifest (overlay fields win, type/role fall back to the base,
descriptions merge per source key) and only new names append.

* refactor(sl): split overlay columns from column_overrides and enforce TS/Python wire contract

Overlay sources now have two distinct collections: `columns:` for computed
columns (requiring `expr` + `type`) and `column_overrides:` for metadata
patches to inherited manifest columns. Composing or loading an overlay that
mixes the two — or references an unknown column — fails with a typed error.

Introduce `ResolvedSemanticLayerSource` / `resolvedSourceSchema` /
`toResolvedWire` as the strict shape sent to the Python engine, and add a
schema contract test that diffs Zod against the Pydantic JSON schema dumped
by `python -m semantic_layer dump-schema`. `SourceDefinition` is now
`extra="forbid"` on the Python side.

`loadAllSources` surfaces per-file load errors instead of swallowing them,
so validation/query paths can report manifest shard parse failures.

* fix(context): make scan description generation resilient and quiet

A transient sampleTable failure during ingest used to take out every
table in a connection: generateTableDescription returned a hardcoded
'Table not found' string into descriptions.ai, and KtxDescriptionGenerator
was constructed without a logger, so the failure left no trail anywhere.

- sampleTable / sampleColumn calls retry 3x with 200/400/800ms backoff,
  honouring KtxScanContext.signal via a new KtxAbortedError.
- On retry exhaustion or missing capability, table generation falls back
  to a metadata-only prompt built from column name / native type / comment
  / rawDescriptions. The column path follows the same rule -- call the
  LLM when any of samples or rawDescriptions are available; skip only
  when both are absent.
- Logger is now threaded from KtxScanContext into the generator. Failures
  emit structured KtxScanWarning entries (new description_fallback_used
  code, plus existing sampling_failed / enrichment_failed /
  connector_capability_missing). ktx scan groups warnings by code so a
  batch of identical failures collapses to one summary line plus sample.
- Returns null on failure instead of the 'Table not found' sentinel; the
  manifest writer's existing guard already skips empty descriptions, so
  schema YAML no longer carries misleading text. SCAN_MANAGED_DESCRIPTION_KEYS
  already strips stale 'ai' on merge, so existing YAML clears on next run.

Also suppress AI SDK v6 'system in messages' warning: pull system messages
out of KtxMessageBuilder.wrapSimple's output via a new splitKtxSystemMessages
helper and pass them top-level to generateText (preserves cacheControl
providerOptions on the SystemModelMessage). Agent-runner's local
splitSystemPromptMessages dedupes onto the shared helper.

* test(docs): align examples-docs assertions with revamped docs

PR #103 (setup/guide doc revamp) reworded several CLI examples and
connection labels; the assertions in scripts/examples-docs.test.mjs
still referenced the pre-revamp wording and were failing in CI on main.
Update the regexes to match the post-revamp content:

- drop the `--json` flag from the sl-query example expectation
- move the `Driver:` / `Status: ok` probe to the connection reference,
  which is where that output now lives (driver id is lowercase
  `postgres`, not the display name `PostgreSQL`)
- drop the obsolete `Install \`uv\`...` troubleshooting line
- accept `<connectionId>` everywhere; the docs no longer use the
  hyphenated `<connection-id>` form
- match the `warehouse` connection id used in the quickstart instead of
  the `postgres-warehouse` id only used in the README and setup ref

* fix(sl): skip TS/Python schema contract test when uv is unavailable

The TypeScript checks CI job does not install uv or Python, so the
module-level `execFileSync('uv', ...)` in schemas.contract.test.ts threw
ENOENT and failed the suite. Wrap the schema dump in a try/catch and
guard the describe block with `describe.skipIf` so the test skips in
environments without uv. Local dev and any CI job that has uv on PATH
still runs the cross-language contract assertion.
											
										
										
											2026-05-15 02:11:04 +02:00
+								    });
 								    const codes = result.warnings.map((warning) => warning.code);
 								    expect(codes).toContain('sampling_failed');
 								    expect(codes).toContain('description_fallback_used');
 								    expect(result.warnings.some((warning) => warning.table === 'customers')).toBe(true);
 								    expect(logger.warn).toHaveBeenCalled();
 								    expect(logger.error).toHaveBeenCalled();
 								    // Each of the two tables produced sampling_failed + description_fallback_used, so 2 + 2 = 4 warnings minimum.
 								    expect(result.warnings.length).toBeGreaterThanOrEqual(4);
 								    // Sampling was retried 3× for each of the 2 tables = 6 calls
 								    expect(failingConnector.sampleTable).toHaveBeenCalledTimes(6);
 								  });
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								  it('runs configured deterministic enrichment with descriptions and no embeddings', async () => {
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    const result = await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'enriched',
 								      detectRelationships: true,
 								      connector: connector(),
 								      context: { runId: 'scan-run-2' },
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								      providers: createDeterministicLocalScanEnrichmentProviders(),
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    });
 								    expect(result.summary).toMatchObject({
 								      dataDictionary: 'completed',
 								      tableDescriptions: 'completed',
 								      columnDescriptions: 'completed',
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								      embeddings: 'skipped',
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								      deterministicRelationships: 'completed',
 								    });
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								    expect(result.embeddingUpdates).toEqual([]);
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    expect(result.snapshot).toEqual(snapshot);
 								    expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
 								  });
-												fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)

* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure

Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.

- Remove the free-text Snowflake schema prompt; only pass `schema` to
  snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
  user for a comma-separated list, persist it as `schema_names`, and use
  it as both the table-list filter and the multiselect default. Applies
  to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
  documented single-schema shorthand.

* fix(snowflake): keep introspecting when primary-key discovery is denied

The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.

Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.

* fix(scan): unblock relationship discovery on Snowflake

Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:

- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
  for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
  profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
  Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
  (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
  `connector.sampleColumn` into bare locals, losing the `this` binding when
  the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
  Every sample call threw "Cannot read properties of undefined (reading
  'assertConnection')" and degraded LLM descriptions to metadata-only
  prompts. Call the methods through the connector instead.

Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.

* test(scan): cover table-ref helpers

* feat(scan): plumb tableScope through live-database introspection port

* feat(scan): apply tableScope during metadata fetch

* feat(scan): enforce table scope at fetch boundary

* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded

* feat(scan): dispatch query-history probes by connection driver

Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.

Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.

* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject

The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.

Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.

generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.

* chore(scripts): add ktx-reset.sh project-cleanup helper

Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
											
										
										
											2026-05-23 10:41:30 +02:00
+								  it('generates batched table descriptions with bounded table-level concurrency', async () => {
-												perf: parallelize scan description generation

											
										
										
											2026-05-12 14:34:59 +02:00
+								    const concurrentSnapshot: KtxSchemaSnapshot = {
 								      ...snapshot,
 								      tables: Array.from({ length: 8 }, (_, index) => ({
 								        catalog: null,
 								        db: 'public',
 								        name: `table_${index + 1}`,
 								        kind: 'table' as const,
 								        comment: null,
 								        estimatedRows: 2,
 								        foreignKeys: [],
 								        columns: [
 								          {
 								            name: 'id',
 								            nativeType: 'integer',
 								            normalizedType: 'integer',
 								            dimensionType: 'number' as const,
 								            nullable: false,
 								            primaryKey: true,
 								            comment: null,
 								          },
 								        ],
 								      })),
 								    };
-												fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)

* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure

Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.

- Remove the free-text Snowflake schema prompt; only pass `schema` to
  snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
  user for a comma-separated list, persist it as `schema_names`, and use
  it as both the table-list filter and the multiselect default. Applies
  to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
  documented single-schema shorthand.

* fix(snowflake): keep introspecting when primary-key discovery is denied

The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.

Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.

* fix(scan): unblock relationship discovery on Snowflake

Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:

- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
  for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
  profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
  Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
  (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
  `connector.sampleColumn` into bare locals, losing the `this` binding when
  the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
  Every sample call threw "Cannot read properties of undefined (reading
  'assertConnection')" and degraded LLM descriptions to metadata-only
  prompts. Call the methods through the connector instead.

Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.

* test(scan): cover table-ref helpers

* feat(scan): plumb tableScope through live-database introspection port

* feat(scan): apply tableScope during metadata fetch

* feat(scan): enforce table scope at fetch boundary

* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded

* feat(scan): dispatch query-history probes by connection driver

Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.

Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.

* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject

The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.

Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.

generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.

* chore(scripts): add ktx-reset.sh project-cleanup helper

Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
											
										
										
											2026-05-23 10:41:30 +02:00
+								    let activeTableSamples = 0;
 								    let maxActiveTableSamples = 0;
-												perf: parallelize scan description generation

											
										
										
											2026-05-12 14:34:59 +02:00
+								    const scanConnector = {
 								      ...connector(),
 								      introspect: vi.fn(async () => concurrentSnapshot),
-												fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)

* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure

Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.

- Remove the free-text Snowflake schema prompt; only pass `schema` to
  snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
  user for a comma-separated list, persist it as `schema_names`, and use
  it as both the table-list filter and the multiselect default. Applies
  to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
  documented single-schema shorthand.

* fix(snowflake): keep introspecting when primary-key discovery is denied

The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.

Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.

* fix(scan): unblock relationship discovery on Snowflake

Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:

- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
  for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
  profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
  Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
  (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
  `connector.sampleColumn` into bare locals, losing the `this` binding when
  the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
  Every sample call threw "Cannot read properties of undefined (reading
  'assertConnection')" and degraded LLM descriptions to metadata-only
  prompts. Call the methods through the connector instead.

Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.

* test(scan): cover table-ref helpers

* feat(scan): plumb tableScope through live-database introspection port

* feat(scan): apply tableScope during metadata fetch

* feat(scan): enforce table scope at fetch boundary

* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded

* feat(scan): dispatch query-history probes by connection driver

Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.

Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.

* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject

The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.

Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.

generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.

* chore(scripts): add ktx-reset.sh project-cleanup helper

Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
											
										
										
											2026-05-23 10:41:30 +02:00
+								      sampleColumn: vi.fn(async () => ({
 								        values: ['1'],
 								        nullCount: 0,
 								        distinctCount: 1,
 								      })),
 								      sampleTable: vi.fn(async () => {
 								        activeTableSamples += 1;
 								        maxActiveTableSamples = Math.max(maxActiveTableSamples, activeTableSamples);
-												perf: parallelize scan description generation

											
										
										
											2026-05-12 14:34:59 +02:00
+								        await new Promise((resolve) => setTimeout(resolve, 10));
-												fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)

* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure

Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.

- Remove the free-text Snowflake schema prompt; only pass `schema` to
  snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
  user for a comma-separated list, persist it as `schema_names`, and use
  it as both the table-list filter and the multiselect default. Applies
  to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
  documented single-schema shorthand.

* fix(snowflake): keep introspecting when primary-key discovery is denied

The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.

Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.

* fix(scan): unblock relationship discovery on Snowflake

Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:

- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
  for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
  profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
  Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
  (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
  `connector.sampleColumn` into bare locals, losing the `this` binding when
  the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
  Every sample call threw "Cannot read properties of undefined (reading
  'assertConnection')" and degraded LLM descriptions to metadata-only
  prompts. Call the methods through the connector instead.

Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.

* test(scan): cover table-ref helpers

* feat(scan): plumb tableScope through live-database introspection port

* feat(scan): apply tableScope during metadata fetch

* feat(scan): enforce table scope at fetch boundary

* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded

* feat(scan): dispatch query-history probes by connection driver

Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.

Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.

* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject

The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.

Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.

generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.

* chore(scripts): add ktx-reset.sh project-cleanup helper

Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
											
										
										
											2026-05-23 10:41:30 +02:00
+								        activeTableSamples -= 1;
-												perf: parallelize scan description generation

											
										
										
											2026-05-12 14:34:59 +02:00
+								        return {
-												fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)

* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure

Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.

- Remove the free-text Snowflake schema prompt; only pass `schema` to
  snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
  user for a comma-separated list, persist it as `schema_names`, and use
  it as both the table-list filter and the multiselect default. Applies
  to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
  documented single-schema shorthand.

* fix(snowflake): keep introspecting when primary-key discovery is denied

The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.

Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.

* fix(scan): unblock relationship discovery on Snowflake

Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:

- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
  for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
  profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
  Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
  (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
  `connector.sampleColumn` into bare locals, losing the `this` binding when
  the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
  Every sample call threw "Cannot read properties of undefined (reading
  'assertConnection')" and degraded LLM descriptions to metadata-only
  prompts. Call the methods through the connector instead.

Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.

* test(scan): cover table-ref helpers

* feat(scan): plumb tableScope through live-database introspection port

* feat(scan): apply tableScope during metadata fetch

* feat(scan): enforce table scope at fetch boundary

* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded

* feat(scan): dispatch query-history probes by connection driver

Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.

Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.

* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject

The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.

Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.

generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.

* chore(scripts): add ktx-reset.sh project-cleanup helper

Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
											
										
										
											2026-05-23 10:41:30 +02:00
+								          headers: ['id'],
 								          rows: [[1]],
 								          totalRows: 1,
-												perf: parallelize scan description generation

											
										
										
											2026-05-12 14:34:59 +02:00
+								        };
 								      }),
 								    };
 								    const settings = {
-												fix: remove project from ktx config (#95)
											
										
										
											2026-05-14 17:39:31 +02:00
+								      ...buildDefaultKtxProjectConfig().scan.relationships,
-												perf: parallelize scan description generation

											
										
										
											2026-05-12 14:34:59 +02:00
+								      enabled: false,
 								    };
 								    await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'enriched',
 								      connector: scanConnector,
 								      context: { runId: 'scan-run-concurrent-descriptions' },
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								      providers: createDeterministicLocalScanEnrichmentProviders(),
-												perf: parallelize scan description generation

											
										
										
											2026-05-12 14:34:59 +02:00
+								      relationshipSettings: settings,
 								    });
-												fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)

* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure

Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.

- Remove the free-text Snowflake schema prompt; only pass `schema` to
  snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
  user for a comma-separated list, persist it as `schema_names`, and use
  it as both the table-list filter and the multiselect default. Applies
  to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
  documented single-schema shorthand.

* fix(snowflake): keep introspecting when primary-key discovery is denied

The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.

Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.

* fix(scan): unblock relationship discovery on Snowflake

Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:

- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
  for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
  profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
  Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
  (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
  `connector.sampleColumn` into bare locals, losing the `this` binding when
  the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
  Every sample call threw "Cannot read properties of undefined (reading
  'assertConnection')" and degraded LLM descriptions to metadata-only
  prompts. Call the methods through the connector instead.

Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.

* test(scan): cover table-ref helpers

* feat(scan): plumb tableScope through live-database introspection port

* feat(scan): apply tableScope during metadata fetch

* feat(scan): enforce table scope at fetch boundary

* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded

* feat(scan): dispatch query-history probes by connection driver

Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.

Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.

* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject

The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.

Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.

generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.

* chore(scripts): add ktx-reset.sh project-cleanup helper

Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
											
										
										
											2026-05-23 10:41:30 +02:00
+								    expect(maxActiveTableSamples).toBe(4);
 								    expect(scanConnector.sampleColumn).not.toHaveBeenCalled();
-												perf: parallelize scan description generation

											
										
										
											2026-05-12 14:34:59 +02:00
+								  });
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								  it('reports enrichment progress for countable stages', async () => {
 								    const events: Array<{ progress: number; message?: string; transient?: boolean }> = [];
 								    const progress = {
 								      async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
 								        events.push({ progress: progressValue, message, transient: options?.transient });
 								      },
 								      startPhase() {
 								        return progress;
 								      },
 								    };
 								    await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'enriched',
 								      detectRelationships: true,
 								      connector: connector(),
 								      context: { runId: 'scan-run-progress', progress },
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								      providers: {
 								        ...createDeterministicLocalScanEnrichmentProviders(),
 								        embedding: fakeScanEmbedding({ dimensions: 6 }),
 								      },
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    });
 								    expect(events).toEqual(
 								      expect.arrayContaining([
 								        expect.objectContaining({ message: 'Generating descriptions 1/2 tables', transient: true }),
 								        expect.objectContaining({ message: 'Generating descriptions 2/2 tables', transient: true }),
 								        expect.objectContaining({ message: 'Building embeddings 1/1 batches', transient: true }),
 								        expect.objectContaining({ message: 'Detecting relationships' }),
 								      ]),
 								    );
 								  });
 								  it('reports progress before enrichment connector introspection starts', async () => {
 								    const events: Array<{ progress: number; message?: string; transient?: boolean }> = [];
 								    const progress = {
 								      async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
 								        events.push({ progress: progressValue, message, transient: options?.transient });
 								      },
 								      startPhase() {
 								        return progress;
 								      },
 								    };
 								    const scanConnector = {
 								      ...connector(),
 								      introspect: vi.fn(async () => {
 								        expect(events).toContainEqual(expect.objectContaining({ message: 'Loading enrichment schema snapshot' }));
 								        return snapshot;
 								      }),
 								    };
 								    await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'relationships',
 								      detectRelationships: true,
 								      connector: scanConnector,
 								      context: { runId: 'scan-run-progress-before-introspection', progress },
 								      providers: null,
 								    });
 								    expect(scanConnector.introspect).toHaveBeenCalled();
 								  });
 								  it('splits enrichment embedding requests by provider batch size', async () => {
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								    const manyColumnSnapshot: KtxSchemaSnapshot = {
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								      ...snapshot,
 								      tables: [
 								        {
 								          catalog: null,
 								          db: 'public',
 								          name: 'wide_orders',
 								          kind: 'table',
 								          comment: 'Wide order facts',
 								          estimatedRows: 3,
 								          foreignKeys: [],
 								          columns: Array.from({ length: 5 }, (_, index) => ({
 								            name: `metric_${index + 1}`,
 								            nativeType: 'integer',
 								            normalizedType: 'integer',
 								            dimensionType: 'number' as const,
 								            nullable: false,
 								            primaryKey: false,
 								            comment: `Metric ${index + 1}`,
 								          })),
 								        },
 								      ],
 								    };
 								    const scanConnector = {
 								      ...connector(),
 								      introspect: vi.fn(async () => manyColumnSnapshot),
 								    };
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								    const deterministicProviders = createDeterministicLocalScanEnrichmentProviders();
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    const embedBatch = vi.fn(async (texts: string[]) => {
 								      if (texts.length > 2) {
 								        throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`);
 								      }
 								      return texts.map((_, index) => [index, index + 1, index + 2]);
 								    });
 								    const result = await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'enriched',
 								      detectRelationships: false,
 								      connector: scanConnector,
 								      context: { runId: 'scan-run-batched-embeddings' },
 								      providers: {
-												feat: add claude-code llm backend with runtime port (#115)

* docs: revise claude-code ingest backend spec

* docs: keep claude-code spec focused on ingest

* docs: expand claude-code spec to full llm parity

* Refine claude-code backend spec after adversarial review iteration 1

* Refine claude-code backend spec after adversarial review iteration 2

* Refine claude-code backend spec after adversarial review iteration 3

* feat: recognize claude-code llm backend

* feat: add ktx llm runtime port

* feat: add claude-code llm runtime

* feat: route non-agent llm calls through runtime

* feat: run ingest agents through llm runtime

* feat: support claude-code setup and status

* test: verify claude-code backend runtime

* docs: add claude-code backend v1 runtime plan

* fix: close claude-code runtime isolation checks

* fix: warn on claude-code prompt caching during setup

* chore: verify claude-code v1 closure

* docs: add claude-code backend v1 isolation closure plan

* fix: update claude-code ingest setup guidance

* docs: add claude-code backend v1 ingest guidance closure plan

* docs: align claude-code isolation spec with sdk metadata

* test: cover claude-code host discovery metadata

* fix: tolerate claude-code host discovery metadata

* docs: clarify claude-code host discovery metadata

* docs: add claude-code auth-probe isolation fix plan

* chore: prepare kaelio ktx rc1 release

* chore: add semantic release workflow

* fix: unblock ci checks

* chore(release): 0.1.0-rc.1

* feat: add Claude Code model selection to setup

* fix: keep git maintenance attached in local repos
											
										
										
											2026-05-16 12:06:34 +02:00
+								        llmRuntime: deterministicProviders.llmRuntime,
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								        embedding: {
 								          dimensions: 3,
 								          maxBatchSize: 2,
 								          embedBatch,
 								        },
 								      },
 								    });
 								    expect(result.embeddingUpdates).toHaveLength(5);
 								    expect(embedBatch.mock.calls.map(([texts]) => texts).map((texts) => texts.length)).toEqual([2, 2, 1]);
 								  });
 								  it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
 								    const stateStore = memoryEnrichmentStateStore();
 								    const scanConnector = connector();
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								    const providers = {
 								      ...createDeterministicLocalScanEnrichmentProviders(),
 								      embedding: fakeScanEmbedding({ dimensions: 6 }),
 								    };
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
 								    const first = await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'enriched',
 								      detectRelationships: true,
 								      connector: scanConnector,
 								      context: { runId: 'scan-run-resume-1' },
 								      providers,
 								      stateStore,
 								      syncId: 'sync-resume-1',
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								      providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    });
-												fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)

* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure

Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.

- Remove the free-text Snowflake schema prompt; only pass `schema` to
  snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
  user for a comma-separated list, persist it as `schema_names`, and use
  it as both the table-list filter and the multiselect default. Applies
  to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
  documented single-schema shorthand.

* fix(snowflake): keep introspecting when primary-key discovery is denied

The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.

Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.

* fix(scan): unblock relationship discovery on Snowflake

Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:

- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
  for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
  profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
  Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
  (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
  `connector.sampleColumn` into bare locals, losing the `this` binding when
  the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
  Every sample call threw "Cannot read properties of undefined (reading
  'assertConnection')" and degraded LLM descriptions to metadata-only
  prompts. Call the methods through the connector instead.

Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.

* test(scan): cover table-ref helpers

* feat(scan): plumb tableScope through live-database introspection port

* feat(scan): apply tableScope during metadata fetch

* feat(scan): enforce table scope at fetch boundary

* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded

* feat(scan): dispatch query-history probes by connection driver

Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.

Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.

* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject

The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.

Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.

generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.

* chore(scripts): add ktx-reset.sh project-cleanup helper

Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
											
										
										
											2026-05-23 10:41:30 +02:00
+								    const generateObject = vi.spyOn(providers.llmRuntime, 'generateObject');
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    const embedBatch = vi.spyOn(providers.embedding, 'embedBatch');
 								    const second = await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'enriched',
 								      detectRelationships: true,
 								      connector: scanConnector,
 								      context: { runId: 'scan-run-resume-1' },
 								      providers,
 								      stateStore,
 								      syncId: 'sync-resume-1',
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								      providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    });
 								    expect(first.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
 								    expect(first.state.resumedStages).toEqual([]);
 								    expect(second.state.resumedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
 								    expect(second.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
-												fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)

* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure

Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.

- Remove the free-text Snowflake schema prompt; only pass `schema` to
  snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
  user for a comma-separated list, persist it as `schema_names`, and use
  it as both the table-list filter and the multiselect default. Applies
  to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
  documented single-schema shorthand.

* fix(snowflake): keep introspecting when primary-key discovery is denied

The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.

Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.

* fix(scan): unblock relationship discovery on Snowflake

Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:

- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
  for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
  profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
  Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
  (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
  `connector.sampleColumn` into bare locals, losing the `this` binding when
  the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
  Every sample call threw "Cannot read properties of undefined (reading
  'assertConnection')" and degraded LLM descriptions to metadata-only
  prompts. Call the methods through the connector instead.

Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.

* test(scan): cover table-ref helpers

* feat(scan): plumb tableScope through live-database introspection port

* feat(scan): apply tableScope during metadata fetch

* feat(scan): enforce table scope at fetch boundary

* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded

* feat(scan): dispatch query-history probes by connection driver

Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.

Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.

* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject

The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.

Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.

generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.

* chore(scripts): add ktx-reset.sh project-cleanup helper

Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
											
										
										
											2026-05-23 10:41:30 +02:00
+								    expect(generateObject).not.toHaveBeenCalled();
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    expect(embedBatch).not.toHaveBeenCalled();
 								    expect(second.descriptionUpdates).toEqual(first.descriptionUpdates);
 								    expect(second.embeddingUpdates).toEqual(first.embeddingUpdates);
 								    expect(second.relationships).toEqual(first.relationships);
 								  });
 								  it('does not reuse completed stages when the snapshot changes', async () => {
 								    const stateStore = memoryEnrichmentStateStore();
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								    const providers = {
 								      ...createDeterministicLocalScanEnrichmentProviders(),
 								      embedding: fakeScanEmbedding({ dimensions: 6 }),
 								    };
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    const scanConnector = connector();
 								    await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'enriched',
 								      detectRelationships: false,
 								      connector: scanConnector,
 								      context: { runId: 'scan-run-resume-hash' },
 								      providers,
 								      stateStore,
 								      syncId: 'sync-resume-hash',
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								      providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    });
 								    const firstTable = snapshot.tables[0];
 								    if (!firstTable) {
 								      throw new Error('Expected test snapshot table');
 								    }
 								    const changedConnector = {
 								      ...connector(),
 								      introspect: vi.fn(async () => ({
 								        ...snapshot,
 								        tables: [{ ...firstTable, name: 'customers' }],
 								      })),
 								    };
-												fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)

* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure

Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.

- Remove the free-text Snowflake schema prompt; only pass `schema` to
  snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
  user for a comma-separated list, persist it as `schema_names`, and use
  it as both the table-list filter and the multiselect default. Applies
  to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
  documented single-schema shorthand.

* fix(snowflake): keep introspecting when primary-key discovery is denied

The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.

Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.

* fix(scan): unblock relationship discovery on Snowflake

Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:

- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
  for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
  profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
  Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
  (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
  `connector.sampleColumn` into bare locals, losing the `this` binding when
  the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
  Every sample call threw "Cannot read properties of undefined (reading
  'assertConnection')" and degraded LLM descriptions to metadata-only
  prompts. Call the methods through the connector instead.

Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.

* test(scan): cover table-ref helpers

* feat(scan): plumb tableScope through live-database introspection port

* feat(scan): apply tableScope during metadata fetch

* feat(scan): enforce table scope at fetch boundary

* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded

* feat(scan): dispatch query-history probes by connection driver

Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.

Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.

* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject

The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.

Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.

generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.

* chore(scripts): add ktx-reset.sh project-cleanup helper

Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
											
										
										
											2026-05-23 10:41:30 +02:00
+								    const generateObject = vi.spyOn(providers.llmRuntime, 'generateObject');
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
 								    const result = await runLocalScanEnrichment({
 								      connectionId: 'warehouse',
 								      mode: 'enriched',
 								      detectRelationships: false,
 								      connector: changedConnector,
 								      context: { runId: 'scan-run-resume-hash' },
 								      providers,
 								      stateStore,
 								      syncId: 'sync-resume-hash',
-												fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
											
										
										
											2026-05-19 16:40:01 +02:00
+								      providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								    });
 								    expect(result.state.resumedStages).toEqual([]);
 								    expect(result.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
-												fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)

* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure

Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.

- Remove the free-text Snowflake schema prompt; only pass `schema` to
  snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
  user for a comma-separated list, persist it as `schema_names`, and use
  it as both the table-list filter and the multiselect default. Applies
  to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
  documented single-schema shorthand.

* fix(snowflake): keep introspecting when primary-key discovery is denied

The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.

Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.

* fix(scan): unblock relationship discovery on Snowflake

Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:

- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
  for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
  profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
  Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
  (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
  `connector.sampleColumn` into bare locals, losing the `this` binding when
  the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
  Every sample call threw "Cannot read properties of undefined (reading
  'assertConnection')" and degraded LLM descriptions to metadata-only
  prompts. Call the methods through the connector instead.

Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.

* test(scan): cover table-ref helpers

* feat(scan): plumb tableScope through live-database introspection port

* feat(scan): apply tableScope during metadata fetch

* feat(scan): enforce table scope at fetch boundary

* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded

* feat(scan): dispatch query-history probes by connection driver

Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.

Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.

* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject

The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.

Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.

generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.

* chore(scripts): add ktx-reset.sh project-cleanup helper

Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
											
										
										
											2026-05-23 10:41:30 +02:00
+								    expect(generateObject).toHaveBeenCalled();
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								  });
 								  it('runs providerless enriched scans as relationship-only discovery enrichment', async () => {
 								    const executor = new InMemorySqliteExecutor();
 								    try {
 								      executor.db.exec(`
 								        CREATE TABLE accounts (id INTEGER NOT NULL);
 								        CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
 								        INSERT INTO accounts (id) VALUES (1), (2);
 								        INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
 								      `);
 								      const scanConnector = {
 								        ...connector(),
 								        driver: 'sqlite' as const,
-												rename klo to ktx

											
										
										
											2026-05-10 23:51:24 +02:00
+								        capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
-												Initial open-source release

											
										
										
											2026-05-10 23:12:26 +02:00
+								        introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
 								        executeReadOnly: executor.executeReadOnly.bind(executor),
 								      };
 								      const result = await runLocalScanEnrichment({
 								        connectionId: 'warehouse',
 								        mode: 'enriched',
 								        detectRelationships: false,
 								        connector: scanConnector,
 								        context: { runId: 'scan-run-providerless-enriched' },
 								        providers: null,
 								      });
 								      expect(result.summary).toEqual({
 								        dataDictionary: 'skipped',
 								        tableDescriptions: 'skipped',
 								        columnDescriptions: 'skipped',
 								        embeddings: 'skipped',
 								        deterministicRelationships: 'completed',
 								        llmRelationshipValidation: 'skipped',
 								        statisticalValidation: 'completed',
 								      });
 								      expect(result.descriptionUpdates).toEqual([]);
 								      expect(result.embeddingUpdates).toEqual([]);
 								      expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
 								      expect(result.relationshipUpdate?.accepted).toHaveLength(1);
 								      expect(result.relationshipProfile).toMatchObject({ sqlAvailable: true });
 								      expect(result.resolvedRelationships).toEqual([
 								        expect.objectContaining({
 								          status: 'accepted',
 								          from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
 								          to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
 								        }),
 								      ]);
 								      expect(result.warnings).toContainEqual({
 								        code: 'scan_enrichment_backend_not_configured',
 								        message:
 								          'Skipping description and embedding enrichment because scan.enrichment.mode is not configured; relationship discovery still ran.',
 								        recoverable: true,
 								        metadata: {
 								          skippedStages: ['descriptions', 'embeddings'],
 								          relationshipDetection: true,
 								        },
 								      });
 								    } finally {
 								      executor.close();
 								    }
 								  });
 								});