ktx/packages/cli/test/context/sl/dictionary-search.test.ts

import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js';
import { createKtxDictionarySearchService } from '../../../src/context/sl/dictionary-search.js';

describe('createKtxDictionarySearchService', () => {
  let tempDir: string;
  let project: KtxLocalProject;

  // Every test gets its own temporary directory holding a freshly initialised project.
  beforeEach(async () => {
    const scratchPrefix = join(tmpdir(), 'ktx-dictionary-search-');
    tempDir = await mkdtemp(scratchPrefix);

    const projectDir = join(tempDir, 'project');
    project = await initKtxProject({ projectDir });

    // Register the two connections the tests search across. The URLs are
    // `env:` references only; nothing in this file connects to a database.
    const { connections } = project.config;
    connections.warehouse = { driver: 'postgres', url: 'env:DATABASE_URL' };
    connections.billing = { driver: 'postgres', url: 'env:BILLING_DATABASE_URL' };
  });

  // Remove the whole temporary tree (project included) once the test is done.
  afterEach(async () => {
    // `force` makes removal tolerate a directory that no longer exists.
    const removal = { recursive: true, force: true };
    await rm(tempDir, removal);
  });

  /**
   * Writes a relationship-profile artifact for one connection sync into the
   * project's file store.
   *
   * The artifact is stored at
   * `raw-sources/<connectionId>/live-database/<syncId>/enrichment/relationship-profile.json`
   * as 2-space-indented JSON followed by a trailing newline. Everything except
   * `connectionId` and `columns` is a fixed placeholder value.
   *
   * @param input.connectionId Connection the profile belongs to (used in the path and the payload).
   * @param input.syncId Sync identifier used as a path segment.
   * @param input.columns Column profiles stored verbatim under the `columns` key.
   */
  async function seedProfile(input: {
    connectionId: string;
    syncId: string;
    columns: Record<string, unknown>;
  }): Promise<void> {
    const { connectionId, syncId, columns } = input;

    const artifactPath = `raw-sources/${connectionId}/live-database/${syncId}/enrichment/relationship-profile.json`;
    const profile = {
      connectionId,
      driver: 'postgres',
      sqlAvailable: true,
      queryCount: 4,
      tables: [],
      columns,
      warnings: [],
    };
    const serialized = `${JSON.stringify(profile, null, 2)}\n`;

    // NOTE(review): the trailing arguments look like author name, author email
    // and a commit-style message — confirm against fileStore.writeFile.
    await project.fileStore.writeFile(
      artifactPath,
      serialized,
      'ktx',
      'ktx@example.com',
      'Seed relationship profile',
    );
  }

  it('returns matches and non-authoritative misses across configured connections', async () => {
    // Warehouse samples order statuses (including 'paid'); billing samples
    // customer names that match neither searched value.
    const warehouseColumns = {
      'orders.status': {
        table: { catalog: null, db: 'public', name: 'orders' },
        column: 'status',
        nativeType: 'text',
        normalizedType: 'string',
        distinctCount: 3,
        sampleValues: ['paid', 'refunded', 'pending'],
      },
    };
    const billingColumns = {
      'customers.name': {
        table: { catalog: null, db: 'public', name: 'customers' },
        column: 'name',
        nativeType: 'text',
        normalizedType: 'string',
        distinctCount: 4,
        sampleValues: ['Acme Corp', 'Globex'],
      },
    };
    await seedProfile({ connectionId: 'warehouse', syncId: 'sync-1', columns: warehouseColumns });
    await seedProfile({ connectionId: 'billing', syncId: 'sync-2', columns: billingColumns });

    const service = createKtxDictionarySearchService(project);
    const outcome = await service.search({ values: ['PAID', 'missing'] });

    // Coverage expected for a profile that contributes exactly one searchable column.
    const coverageFor = (syncId: string) => ({
      sampledRows: null,
      valuesPerColumn: null,
      profiledColumns: 1,
      syncId,
      profiledAt: null,
    });
    const notInSample = (connectionId: string) => ({ connectionId, reason: 'value_not_in_sample' });

    // Entries are expected with billing before warehouse even though warehouse
    // was seeded first (presumably ordered by connection id — confirm in the service).
    const expectedSearched = [
      { connectionId: 'billing', coverage: coverageFor('sync-2'), status: 'ready' },
      { connectionId: 'warehouse', coverage: coverageFor('sync-1'), status: 'ready' },
    ];

    // The upper-case query 'PAID' is expected to hit the lower-case sample 'paid',
    // with the original query echoed back in `value`.
    const expectedPaid = {
      value: 'PAID',
      matches: [
        {
          connectionId: 'warehouse',
          sourceName: 'orders',
          columnName: 'status',
          matchedValue: 'paid',
          cardinality: 3,
        },
      ],
      misses: [notInSample('billing')],
    };
    const expectedMissing = {
      value: 'missing',
      matches: [],
      misses: [notInSample('billing'), notInSample('warehouse')],
    };

    expect(outcome).toEqual({
      searched: expectedSearched,
      results: [expectedPaid, expectedMissing],
    });
  });

  it('distinguishes missing profile artifacts from profiles with no candidate columns', async () => {
    // Billing has a profile, but its only column is an integer id; warehouse
    // is configured but never seeded with a profile at all.
    const integerOnlyColumns = {
      'events.id': {
        table: { catalog: null, db: 'public', name: 'events' },
        column: 'id',
        nativeType: 'integer',
        normalizedType: 'integer',
        distinctCount: 100,
        sampleValues: [1, 2, 3],
      },
    };
    await seedProfile({ connectionId: 'billing', syncId: 'sync-empty', columns: integerOnlyColumns });

    const service = createKtxDictionarySearchService(project);
    const outcome = await service.search({ values: ['Acme'] });

    // Neither connection contributes searchable columns; only the sync id differs.
    const emptyCoverage = (syncId: string | null) => ({
      sampledRows: null,
      valuesPerColumn: null,
      profiledColumns: 0,
      syncId,
      profiledAt: null,
    });

    const expectedSearched = [
      {
        connectionId: 'billing',
        coverage: emptyCoverage('sync-empty'),
        status: 'no_candidate_columns',
      },
      {
        connectionId: 'warehouse',
        coverage: emptyCoverage(null),
        status: 'no_profile_artifact',
      },
    ];

    // Each miss is expected to repeat its connection's status as the reason.
    const expectedResults = [
      {
        value: 'Acme',
        matches: [],
        misses: [
          { connectionId: 'billing', reason: 'no_candidate_columns' },
          { connectionId: 'warehouse', reason: 'no_profile_artifact' },
        ],
      },
    ];

    expect(outcome).toEqual({ searched: expectedSearched, results: expectedResults });
  });

  it('scopes search to the requested connection', async () => {
    // Builds a single text `status` column profile whose only sampled value is 'paid'.
    const paidStatusColumn = (tableName: string, distinctCount: number) => ({
      table: { catalog: null, db: 'public', name: tableName },
      column: 'status',
      nativeType: 'text',
      normalizedType: 'string',
      distinctCount,
      sampleValues: ['paid'],
    });

    // Both connections sample 'paid', so an unscoped search could match either one.
    await seedProfile({
      connectionId: 'warehouse',
      syncId: 'sync-1',
      columns: { 'orders.status': paidStatusColumn('orders', 3) },
    });
    await seedProfile({
      connectionId: 'billing',
      syncId: 'sync-2',
      columns: { 'invoices.status': paidStatusColumn('invoices', 2) },
    });

    const service = createKtxDictionarySearchService(project);
    const outcome = await service.search({ connectionId: 'billing', values: ['paid'] });

    // Partial match: only the listed fields are pinned, and only billing may appear.
    const billingMatch = {
      connectionId: 'billing',
      sourceName: 'invoices',
      columnName: 'status',
      matchedValue: 'paid',
    };
    expect(outcome).toMatchObject({
      searched: [{ connectionId: 'billing', status: 'ready' }],
      results: [{ value: 'paid', matches: [billingMatch], misses: [] }],
    });
  });
});
feat(mcp):added MCP server (#97) * docs(specs): design research-agent MCP tools and ktx mcp daemon Adds the 2026-05-14 design spec for exposing four new MCP tools (discover_data, entity_details, dictionary_search, sql_execution), shipping a ktx-research skill, and introducing an HTTP-only ktx mcp daemon so external agents can use KTX as a research-capable context layer. * Refine research-agent MCP tools spec after adversarial review iteration 1 * Refine research-agent MCP tools spec after adversarial review iteration 2 * Refine research-agent MCP tools spec after adversarial review iteration 3 * Refine spec: drop connectionName compat carve-out and ground summary/snippet provenance per kind * feat(daemon): validate read-only SQL with sqlglot * feat(context): expose read-only SQL validation port * feat(context): register MCP sql execution tool * feat(context): execute MCP SQL through validated connector path * test(context): update SQL analysis port fixtures * docs: add research-agent MCP sql execution foundation plan * feat(context): add scan-backed entity details service * feat(context): register MCP entity details tool * feat(context): expose local MCP entity details * test(context): align entity details scan fixtures * docs: add research-agent MCP entity_details plan * feat(context): add dictionary search service * feat(context): register MCP dictionary search tool * feat(context): expose local MCP dictionary search * docs: add research-agent MCP dictionary_search plan * feat: add MCP discover data service * feat: expose discover data MCP tool * feat: wire local discover data MCP port * docs: add research-agent MCP discover_data plan * feat(cli): add mcp http security helpers * feat(cli): host mcp over streamable http * feat(cli): manage mcp daemon lifecycle * feat(cli): add ktx mcp commands * fix(cli): stabilize mcp daemon verification * docs: add research-agent MCP http daemon plan * feat(cli): install KTX research skill * feat(cli): configure MCP clients in 
setup agents * feat(cli): support Claude local MCP setup scope * docs: add research-agent MCP setup-agents plan * refactor(context): use connectionId in warehouse verification tools * docs(context): update ingest verification prompts for connectionId * docs: add research-agent MCP ingest contract convergence plan * chore: build runtime artifacts in conductor setup --------- Co-authored-by: Andrey Avtomonov <7889985+andreybavt@users.noreply.github.com> 2026-05-15 02:35:09 +02:00			`import { mkdtemp, rm } from 'node:fs/promises';`
			`import { tmpdir } from 'node:os';`
			`import { join } from 'node:path';`
			`import { afterEach, beforeEach, describe, expect, it } from 'vitest';`
test: split cli tests from source tree (#216) * feat(cli): define full warehouse dialect contract * test(cli): keep dialect edge tests focused * fix(cli): stabilize dialect contract foundation * refactor(connectors): own read-only query preparation * refactor(connectors): resolve dialects through registry * refactor(connectors): keep concrete dialect classes internal * chore(workspace): enforce dialect import boundary * refactor(cli): resolve relationship dialect at scan boundary * refactor(cli): use dialect display parsing for entity details * refactor(cli): use dialect display parsing for warehouse catalog * refactor(cli): use dialect SQL in relationship workflows * test(cli): verify solid dialect scan workflow closure * test: split cli tests from source tree * refactor(cli): standardize BigQuery scope listing * feat(sqlite): implement connector scope listing * test(connectors): cover required table listing * feat(cli): add warehouse driver registry * refactor(setup): route scope discovery through driver registry * refactor(cli): route local query execution through driver registry * refactor(historic-sql): route dialect support through driver registry * refactor(cli): test warehouse connections through driver registry * fix(cli): close driver registry type export gaps * Improve setup daemon diagnostics * refactor(setup): centralize rail-prefixed diagnostics + query-history fallback Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput into clack.ts so the setup wizard, managed daemons, and embedding/agent steps share one rail-formatted writer. setup-databases.ts also adds a "disable query history and retry" option when the schema-context build fails and query history is the likely culprit, surfaced via a new failed-query-history-unavailable status. 
* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match The setup picker's KtxTableListEntry was a 2-level { schema, name }, so qualifiedTableId always wrote db.name into enabled_tables. When BigQuery, Snowflake, or SQL Server later ran fast ingest, their introspect step filtered the scope set with scopedTableNames(scope, { catalog: projectId\|database, db }) — catalog was non-null on the introspect side but null in the scope refs, so every entry was rejected, the live-database adapter staged zero table files, and detect() failed with 'Adapter "live-database" did not recognize fetched source output'. Align the picker boundary with the canonical 3-level KtxTableRef: - Add catalog: string \| null to KtxTableListEntry. - BigQuery/Snowflake/SQL Server listTables populate catalog from the resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null. - qualifiedTableId emits catalog.schema.name when catalog is non-null (resolveEnabledTables already accepts the 3-part shape) and schemasFromEnabledTables now goes through parseDottedTableEntry so it recovers the schema correctly from both 2-part and 3-part entries. - Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker reuse. Update listTables expectations in all seven connector tests and the setup / picker test fixtures. Add a picker regression test that covers the catalog-bearing round-trip (save + refine). * fix(cli): allow debug telemetry under opt-out env 2026-05-26 08:49:05 +02:00			`import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js';`
			`import { createKtxDictionarySearchService } from '../../../src/context/sl/dictionary-search.js';`
feat(mcp):added MCP server (#97) * docs(specs): design research-agent MCP tools and ktx mcp daemon Adds the 2026-05-14 design spec for exposing four new MCP tools (discover_data, entity_details, dictionary_search, sql_execution), shipping a ktx-research skill, and introducing an HTTP-only ktx mcp daemon so external agents can use KTX as a research-capable context layer. * Refine research-agent MCP tools spec after adversarial review iteration 1 * Refine research-agent MCP tools spec after adversarial review iteration 2 * Refine research-agent MCP tools spec after adversarial review iteration 3 * Refine spec: drop connectionName compat carve-out and ground summary/snippet provenance per kind * feat(daemon): validate read-only SQL with sqlglot * feat(context): expose read-only SQL validation port * feat(context): register MCP sql execution tool * feat(context): execute MCP SQL through validated connector path * test(context): update SQL analysis port fixtures * docs: add research-agent MCP sql execution foundation plan * feat(context): add scan-backed entity details service * feat(context): register MCP entity details tool * feat(context): expose local MCP entity details * test(context): align entity details scan fixtures * docs: add research-agent MCP entity_details plan * feat(context): add dictionary search service * feat(context): register MCP dictionary search tool * feat(context): expose local MCP dictionary search * docs: add research-agent MCP dictionary_search plan * feat: add MCP discover data service * feat: expose discover data MCP tool * feat: wire local discover data MCP port * docs: add research-agent MCP discover_data plan * feat(cli): add mcp http security helpers * feat(cli): host mcp over streamable http * feat(cli): manage mcp daemon lifecycle * feat(cli): add ktx mcp commands * fix(cli): stabilize mcp daemon verification * docs: add research-agent MCP http daemon plan * feat(cli): install KTX research skill * feat(cli): configure MCP clients in 
setup agents * feat(cli): support Claude local MCP setup scope * docs: add research-agent MCP setup-agents plan * refactor(context): use connectionId in warehouse verification tools * docs(context): update ingest verification prompts for connectionId * docs: add research-agent MCP ingest contract convergence plan * chore: build runtime artifacts in conductor setup --------- Co-authored-by: Andrey Avtomonov <7889985+andreybavt@users.noreply.github.com> 2026-05-15 02:35:09 +02:00
			`describe('createKtxDictionarySearchService', () => {`
			`let tempDir: string;`
			`let project: KtxLocalProject;`

			`beforeEach(async () => {`
			`tempDir = await mkdtemp(join(tmpdir(), 'ktx-dictionary-search-'));`
			`project = await initKtxProject({ projectDir: join(tempDir, 'project') });`
			`project.config.connections.warehouse = { driver: 'postgres', url: 'env:DATABASE_URL' };`
			`project.config.connections.billing = { driver: 'postgres', url: 'env:BILLING_DATABASE_URL' };`
			`});`

			`afterEach(async () => {`
			`await rm(tempDir, { recursive: true, force: true });`
			`});`

			`async function seedProfile(input: {`
			`connectionId: string;`
			`syncId: string;`
			`columns: Record<string, unknown>;`
			`}): Promise<void> {`
			`await project.fileStore.writeFile(`
			`raw-sources/${input.connectionId}/live-database/${input.syncId}/enrichment/relationship-profile.json`,
			`${JSON.stringify(
			`{`
			`connectionId: input.connectionId,`
			`driver: 'postgres',`
			`sqlAvailable: true,`
			`queryCount: 4,`
			`tables: [],`
			`columns: input.columns,`
			`warnings: [],`
			`},`
			`null,`
			`2,`
			)}\n`,
			`'ktx',`
			`'ktx@example.com',`
			`'Seed relationship profile',`
			`);`
			`}`

			`it('returns matches and non-authoritative misses across configured connections', async () => {`
			`await seedProfile({`
			`connectionId: 'warehouse',`
			`syncId: 'sync-1',`
			`columns: {`
			`'orders.status': {`
			`table: { catalog: null, db: 'public', name: 'orders' },`
			`column: 'status',`
			`nativeType: 'text',`
			`normalizedType: 'string',`
			`distinctCount: 3,`
			`sampleValues: ['paid', 'refunded', 'pending'],`
			`},`
			`},`
			`});`
			`await seedProfile({`
			`connectionId: 'billing',`
			`syncId: 'sync-2',`
			`columns: {`
			`'customers.name': {`
			`table: { catalog: null, db: 'public', name: 'customers' },`
			`column: 'name',`
			`nativeType: 'text',`
			`normalizedType: 'string',`
			`distinctCount: 4,`
			`sampleValues: ['Acme Corp', 'Globex'],`
			`},`
			`},`
			`});`
			`const service = createKtxDictionarySearchService(project);`

			`await expect(service.search({ values: ['PAID', 'missing'] })).resolves.toEqual({`
			`searched: [`
			`{`
			`connectionId: 'billing',`
			`coverage: {`
			`sampledRows: null,`
			`valuesPerColumn: null,`
			`profiledColumns: 1,`
			`syncId: 'sync-2',`
			`profiledAt: null,`
			`},`
			`status: 'ready',`
			`},`
			`{`
			`connectionId: 'warehouse',`
			`coverage: {`
			`sampledRows: null,`
			`valuesPerColumn: null,`
			`profiledColumns: 1,`
			`syncId: 'sync-1',`
			`profiledAt: null,`
			`},`
			`status: 'ready',`
			`},`
			`],`
			`results: [`
			`{`
			`value: 'PAID',`
			`matches: [`
			`{`
			`connectionId: 'warehouse',`
			`sourceName: 'orders',`
			`columnName: 'status',`
			`matchedValue: 'paid',`
			`cardinality: 3,`
			`},`
			`],`
			`misses: [{ connectionId: 'billing', reason: 'value_not_in_sample' }],`
			`},`
			`{`
			`value: 'missing',`
			`matches: [],`
			`misses: [`
			`{ connectionId: 'billing', reason: 'value_not_in_sample' },`
			`{ connectionId: 'warehouse', reason: 'value_not_in_sample' },`
			`],`
			`},`
			`],`
			`});`
			`});`

			`it('distinguishes missing profile artifacts from profiles with no candidate columns', async () => {`
			`await seedProfile({`
			`connectionId: 'billing',`
			`syncId: 'sync-empty',`
			`columns: {`
			`'events.id': {`
			`table: { catalog: null, db: 'public', name: 'events' },`
			`column: 'id',`
			`nativeType: 'integer',`
			`normalizedType: 'integer',`
			`distinctCount: 100,`
			`sampleValues: [1, 2, 3],`
			`},`
			`},`
			`});`
			`const service = createKtxDictionarySearchService(project);`

			`await expect(service.search({ values: ['Acme'] })).resolves.toEqual({`
			`searched: [`
			`{`
			`connectionId: 'billing',`
			`coverage: {`
			`sampledRows: null,`
			`valuesPerColumn: null,`
			`profiledColumns: 0,`
			`syncId: 'sync-empty',`
			`profiledAt: null,`
			`},`
			`status: 'no_candidate_columns',`
			`},`
			`{`
			`connectionId: 'warehouse',`
			`coverage: {`
			`sampledRows: null,`
			`valuesPerColumn: null,`
			`profiledColumns: 0,`
			`syncId: null,`
			`profiledAt: null,`
			`},`
			`status: 'no_profile_artifact',`
			`},`
			`],`
			`results: [`
			`{`
			`value: 'Acme',`
			`matches: [],`
			`misses: [`
			`{ connectionId: 'billing', reason: 'no_candidate_columns' },`
			`{ connectionId: 'warehouse', reason: 'no_profile_artifact' },`
			`],`
			`},`
			`],`
			`});`
			`});`

			`it('scopes search to the requested connection', async () => {`
			`await seedProfile({`
			`connectionId: 'warehouse',`
			`syncId: 'sync-1',`
			`columns: {`
			`'orders.status': {`
			`table: { catalog: null, db: 'public', name: 'orders' },`
			`column: 'status',`
			`nativeType: 'text',`
			`normalizedType: 'string',`
			`distinctCount: 3,`
			`sampleValues: ['paid'],`
			`},`
			`},`
			`});`
			`await seedProfile({`
			`connectionId: 'billing',`
			`syncId: 'sync-2',`
			`columns: {`
			`'invoices.status': {`
			`table: { catalog: null, db: 'public', name: 'invoices' },`
			`column: 'status',`
			`nativeType: 'text',`
			`normalizedType: 'string',`
			`distinctCount: 2,`
			`sampleValues: ['paid'],`
			`},`
			`},`
			`});`
			`const service = createKtxDictionarySearchService(project);`

			`await expect(service.search({ connectionId: 'billing', values: ['paid'] })).resolves.toMatchObject({`
			`searched: [{ connectionId: 'billing', status: 'ready' }],`
			`results: [`
			`{`
			`value: 'paid',`
			`matches: [{ connectionId: 'billing', sourceName: 'invoices', columnName: 'status', matchedValue: 'paid' }],`
			`misses: [],`
			`},`
			`],`
			`});`
			`});`
			`});`