feat(mcp): add MCP server (#97)

* docs(specs): design research-agent MCP tools and ktx mcp daemon

Adds the 2026-05-14 design spec for exposing four new MCP tools
(discover_data, entity_details, dictionary_search, sql_execution),
shipping a ktx-research skill, and introducing an HTTP-only ktx mcp
daemon so external agents can use KTX as a research-capable context
layer.

* Refine research-agent MCP tools spec after adversarial review iteration 1

* Refine research-agent MCP tools spec after adversarial review iteration 2

* Refine research-agent MCP tools spec after adversarial review iteration 3

* Refine spec: drop connectionName compat carve-out and ground summary/snippet provenance per kind

* feat(daemon): validate read-only SQL with sqlglot

* feat(context): expose read-only SQL validation port

* feat(context): register MCP sql execution tool

* feat(context): execute MCP SQL through validated connector path

* test(context): update SQL analysis port fixtures

* docs: add research-agent MCP sql execution foundation plan

* feat(context): add scan-backed entity details service

* feat(context): register MCP entity details tool

* feat(context): expose local MCP entity details

* test(context): align entity details scan fixtures

* docs: add research-agent MCP entity_details plan

* feat(context): add dictionary search service

* feat(context): register MCP dictionary search tool

* feat(context): expose local MCP dictionary search

* docs: add research-agent MCP dictionary_search plan

* feat: add MCP discover data service

* feat: expose discover data MCP tool

* feat: wire local discover data MCP port

* docs: add research-agent MCP discover_data plan

* feat(cli): add mcp http security helpers

* feat(cli): host mcp over streamable http

* feat(cli): manage mcp daemon lifecycle

* feat(cli): add ktx mcp commands

* fix(cli): stabilize mcp daemon verification

* docs: add research-agent MCP http daemon plan

* feat(cli): install KTX research skill

* feat(cli): configure MCP clients in setup agents

* feat(cli): support Claude local MCP setup scope

* docs: add research-agent MCP setup-agents plan

* refactor(context): use connectionId in warehouse verification tools

* docs(context): update ingest verification prompts for connectionId

* docs: add research-agent MCP ingest contract convergence plan

* chore: build runtime artifacts in conductor setup

---------

Co-authored-by: Andrey Avtomonov <7889985+andreybavt@users.noreply.github.com>
This commit is contained in:
Andrey Avtomonov 2026-05-15 02:35:09 +02:00 committed by GitHub
parent c7b64379bf
commit b759a4a286
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
78 changed files with 13689 additions and 190 deletions

View file

@ -0,0 +1,291 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { initKtxProject, type KtxLocalProject } from '../project/index.js';
import { createKtxEntityDetailsService } from './entity-details.js';
import type { KtxConnectionDriver, KtxScanReport, KtxSchemaTable } from './types.js';
describe('createKtxEntityDetailsService', () => {
// Scratch directory for the current test and the KTX project initialized inside it.
let tempDir: string;
let project: KtxLocalProject;
// Each test starts from a fresh temp directory with a newly initialized project under `<tempDir>/project`.
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'ktx-entity-details-service-'));
project = await initKtxProject({ projectDir: join(tempDir, 'project') });
});
// Remove the whole temp directory after each test; `force` keeps cleanup from failing if it is already gone.
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
/**
 * Builds a scan-report fixture for one sync of a connection.
 *
 * The report describes a structural, non-dry-run scan triggered via MCP with every
 * enrichment stage skipped. Artifact paths are rooted at
 * `raw-sources/<connectionId>/live-database/<syncId>`.
 *
 * @param input Identifiers for the scan; `driver` defaults to `'postgres'` and
 *   `createdAt` to a fixed timestamp so fixtures stay deterministic.
 * @returns The scan report object.
 */
function scanReport(input: {
  connectionId: string;
  syncId: string;
  runId: string;
  driver?: KtxConnectionDriver;
  createdAt?: string;
}): KtxScanReport {
  const { connectionId, syncId, runId } = input;
  const driver = input.driver ?? 'postgres';
  const createdAt = input.createdAt ?? '2026-05-14T09:00:00.000Z';
  const rawSourcesDir = `raw-sources/${connectionId}/live-database/${syncId}`;
  const reportPath = `${rawSourcesDir}/scan-report.json`;
  // Every enrichment stage is reported with the same status in this fixture.
  const skipped = 'skipped' as const;

  return {
    connectionId,
    driver,
    syncId,
    runId,
    trigger: 'mcp',
    mode: 'structural',
    dryRun: false,
    artifactPaths: {
      rawSourcesDir,
      reportPath,
      manifestShards: [],
      enrichmentArtifacts: [],
    },
    diffSummary: {
      tablesAdded: 0,
      tablesModified: 0,
      tablesDeleted: 0,
      tablesUnchanged: 1,
      columnsAdded: 0,
      columnsModified: 0,
      columnsDeleted: 0,
    },
    manifestShardsWritten: 0,
    structuralSyncStats: {
      tablesCreated: 1,
      tablesUpdated: 0,
      tablesDeleted: 0,
      columnsCreated: 0,
      columnsUpdated: 0,
      columnsDeleted: 0,
    },
    enrichment: {
      dataDictionary: skipped,
      tableDescriptions: skipped,
      columnDescriptions: skipped,
      embeddings: skipped,
      deterministicRelationships: skipped,
      llmRelationshipValidation: skipped,
      statisticalValidation: skipped,
    },
    capabilityGaps: [],
    warnings: [],
    relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
    enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] },
    createdAt,
  };
}
/**
 * Builds an `orders` table fixture with two columns (`id`, `status`) and one foreign key
 * from `customer_id` to `public.customers.id`.
 *
 * @param input Optional overrides. Omitting `db` yields `'public'` and omitting
 *   `estimatedRows` yields `12`. An explicit `null` is preserved, so fixtures for
 *   schema-less tables or unknown row counts can be built.
 * @returns The table definition.
 */
function ordersTable(input: { db?: string | null; estimatedRows?: number | null } = {}): KtxSchemaTable {
  // `??` would also replace an explicit null with the default, which the parameter type allows
  // callers to pass on purpose. Only fall back when the field was left out.
  const db = input.db === undefined ? 'public' : input.db;
  const estimatedRows = input.estimatedRows === undefined ? 12 : input.estimatedRows;
  return {
    catalog: null,
    db,
    name: 'orders',
    kind: 'table',
    comment: 'Customer orders',
    estimatedRows,
    columns: [
      {
        name: 'id',
        nativeType: 'integer',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey: true,
        comment: 'Order id',
      },
      {
        name: 'status',
        nativeType: 'text',
        normalizedType: 'text',
        dimensionType: 'string',
        nullable: false,
        primaryKey: false,
        comment: 'Order status',
      },
    ],
    foreignKeys: [
      {
        fromColumn: 'customer_id',
        toCatalog: null,
        toDb: 'public',
        toTable: 'customers',
        toColumn: 'id',
        constraintName: 'orders_customer_id_fkey',
      },
    ],
  };
}
/**
 * Writes the artifacts of one live-database scan into the test project's file store:
 * `connection.json`, one JSON file per table under `tables/`, and `scan-report.json`,
 * in that order.
 *
 * @param input Scan identifiers and optional overrides. `connectionId` defaults to
 *   `'warehouse'`; `tables` defaults to a single `ordersTable()`; `extractedAt` is used
 *   both as the report's `createdAt` and as the connection artifact's `extractedAt`.
 */
async function seedScan(input: {
  connectionId?: string;
  syncId: string;
  runId: string;
  driver?: KtxConnectionDriver;
  extractedAt?: string;
  tables?: KtxSchemaTable[];
}): Promise<void> {
  const connectionId = input.connectionId ?? 'warehouse';
  const report = scanReport({
    connectionId,
    syncId: input.syncId,
    runId: input.runId,
    driver: input.driver,
    createdAt: input.extractedAt,
  });
  const root = report.artifactPaths.rawSourcesDir;

  // All seeded files share the same serialization and author metadata.
  const commit = (relativePath: string, payload: unknown, message: string) =>
    project.fileStore.writeFile(
      `${root}/${relativePath}`,
      JSON.stringify(payload, null, 2),
      'ktx',
      'ktx@example.com',
      message,
    );

  await commit(
    'connection.json',
    {
      connectionId,
      driver: report.driver,
      extractedAt: input.extractedAt ?? report.createdAt,
      scope: { schemas: ['public'] },
    },
    'seed connection',
  );

  const tables = input.tables ?? [ordersTable()];
  for (const table of tables) {
    // The schema is part of the file name so same-named tables in different schemas do not collide.
    await commit(`tables/${table.db ?? 'default'}-${table.name}.json`, table, `seed ${table.name}`);
  }

  await commit('scan-report.json', report, 'seed scan report');
}
// Seeds two scans for the default "warehouse" connection; the second one overrides estimatedRows to 99.
// The service is expected to answer from the later scan (sync-2 / scan-new).
it('returns the latest scan snapshot table details for a display string', async () => {
await seedScan({ syncId: 'sync-1', runId: 'scan-old', extractedAt: '2026-05-14T08:00:00.000Z' });
await seedScan({
syncId: 'sync-2',
runId: 'scan-new',
extractedAt: '2026-05-14T09:00:00.000Z',
tables: [ordersTable({ estimatedRows: 99 })],
});
const service = createKtxEntityDetailsService(project);
const result = await service.read({
connectionId: 'warehouse',
entities: [{ table: 'public.orders' }],
});
expect(result.results).toHaveLength(1);
expect(result.results[0]).toMatchObject({
ok: true,
connectionId: 'warehouse',
display: 'public.orders',
estimatedRows: 99,
snapshot: {
syncId: 'sync-2',
scanRunId: 'scan-new',
extractedAt: '2026-05-14T09:00:00.000Z',
},
columns: [
{ name: 'id', nativeType: 'integer', primaryKey: true },
{ name: 'status', nativeType: 'text', nullable: false },
],
});
});
// Requests only the `status` column through a structured table ref. The column list is narrowed,
// but the table's foreign key (on `customer_id`, which is not a requested column) is still returned.
it('filters requested columns while keeping full-table foreign keys', async () => {
await seedScan({ syncId: 'sync-1', runId: 'scan-1' });
const service = createKtxEntityDetailsService(project);
const result = await service.read({
connectionId: 'warehouse',
entities: [{ table: { catalog: null, db: 'public', name: 'orders' }, columns: ['status'] }],
});
expect(result.results[0]).toMatchObject({
ok: true,
columns: [{ name: 'status' }],
foreignKeys: [
{
fromColumn: 'customer_id',
toDb: 'public',
toTable: 'customers',
toColumn: 'id',
},
],
});
});
// Nothing is seeded here, so the requested entity must come back as a `scan_missing` error.
// `toEqual` pins the exact shape: the error result carries no `snapshot` field in this case.
it('returns a structured missing-scan error', async () => {
const service = createKtxEntityDetailsService(project);
const result = await service.read({
connectionId: 'warehouse',
entities: [{ table: 'public.orders' }],
});
expect(result.results).toEqual([
{
ok: false,
connectionId: 'warehouse',
table: 'public.orders',
error: {
code: 'scan_missing',
message: 'No live-database scan found for connection "warehouse"; run `ktx ingest warehouse` or `ktx scan warehouse`.',
},
},
]);
});
// Seeds `orders` in both the `public` and `archive` schemas, then asks for the bare name.
// The expected candidates are listed in display order (`archive.orders` before `public.orders`).
it('reports ambiguous bare table names across schemas', async () => {
await seedScan({
syncId: 'sync-1',
runId: 'scan-1',
tables: [ordersTable({ db: 'public' }), ordersTable({ db: 'archive' })],
});
const service = createKtxEntityDetailsService(project);
const result = await service.read({
connectionId: 'warehouse',
entities: [{ table: 'orders' }],
});
expect(result.results[0]).toMatchObject({
ok: false,
error: {
code: 'ambiguous_table',
candidates: [
{ tableRef: { catalog: null, db: 'archive', name: 'orders' }, display: 'archive.orders' },
{ tableRef: { catalog: null, db: 'public', name: 'orders' }, display: 'public.orders' },
],
},
});
});
// Requests one existing column (`status`) and one unknown column (`plan_tier`).
// The whole entity is expected to fail with `column_not_found`; the message names only the
// unknown column, and the candidates list every column available on the table.
it('reports missing requested columns with available column candidates', async () => {
await seedScan({ syncId: 'sync-1', runId: 'scan-1' });
const service = createKtxEntityDetailsService(project);
const result = await service.read({
connectionId: 'warehouse',
entities: [{ table: 'public.orders', columns: ['status', 'plan_tier'] }],
});
expect(result.results[0]).toMatchObject({
ok: false,
error: {
code: 'column_not_found',
message: 'Column(s) not found on public.orders: plan_tier',
candidates: ['id', 'status'],
},
});
});
});

View file

@ -0,0 +1,315 @@
import type { KtxLocalProject } from '../project/index.js';
import { readLocalScanStructuralSnapshot } from './local-structural-artifacts.js';
import type {
KtxConnectionDriver,
KtxScanReport,
KtxSchemaColumn,
KtxSchemaSnapshot,
KtxSchemaTable,
KtxTableRef,
} from './types.js';
/** A table given either as a display string (e.g. `public.orders`) or as a structured table ref. */
export type KtxEntityDetailsTableInput = string | KtxTableRef;
/** Request for details about one or more tables of a single connection. */
export interface KtxEntityDetailsInput {
connectionId: string;
entities: Array<{
table: KtxEntityDetailsTableInput;
/** Optional column filter, matched case-insensitively; when omitted or empty, all columns are returned. */
columns?: string[];
}>;
}
/** Identifies the scan a result was read from. */
export interface KtxEntityDetailsSnapshotInfo {
/** Sync id taken from the scan report. */
syncId: string;
/** Extraction timestamp taken from the structural snapshot. */
extractedAt: string;
/** Run id from the scan report, or `null` when the report has none. */
scanRunId: string | null;
}
/** Column fields exposed by the entity details service; copied one-to-one from the scanned schema column. */
export interface KtxEntityDetailsColumn {
name: string;
nativeType: string;
normalizedType: string;
dimensionType: KtxSchemaColumn['dimensionType'];
nullable: boolean;
primaryKey: boolean;
comment: string | null;
}
/** Successful result for one requested table. */
export interface KtxEntityDetailsRecord {
ok: true;
connectionId: string;
tableRef: KtxTableRef;
/** Driver-specific display name: the bare table name for sqlite, otherwise the non-empty ref parts joined by `.`. */
display: string;
kind: KtxSchemaTable['kind'];
comment: string | null;
estimatedRows: number | null;
/** The requested columns, or all columns when no filter was given. */
columns: KtxEntityDetailsColumn[];
/** All foreign keys of the table; not narrowed by the column filter. */
foreignKeys: KtxSchemaTable['foreignKeys'];
snapshot: KtxEntityDetailsSnapshotInfo;
}
/** Machine-readable reasons a requested entity could not be returned. */
export type KtxEntityDetailsErrorCode = 'scan_missing' | 'table_not_found' | 'ambiguous_table' | 'column_not_found';
/** Failed result for one requested table. */
export interface KtxEntityDetailsErrorResult {
ok: false;
connectionId: string;
/** The table input exactly as the caller supplied it. */
table: KtxEntityDetailsTableInput;
/** Present when a scan was found; left out for `scan_missing`. */
snapshot?: KtxEntityDetailsSnapshotInfo;
error: {
code: KtxEntityDetailsErrorCode;
message: string;
/** Table candidates for table-level errors, or available column names for `column_not_found`. */
candidates?: Array<{ tableRef: KtxTableRef; display: string }> | string[];
};
}
/** Response of the entity details service: one result per requested entity, in request order. */
export interface KtxEntityDetailsResponse {
results: Array<KtxEntityDetailsRecord | KtxEntityDetailsErrorResult>;
}
/** The scan report chosen as latest, together with the structural snapshot loaded for it. */
interface LatestScan {
report: KtxScanReport;
snapshot: KtxSchemaSnapshot;
}
/** Outcome of resolving a table input against a snapshot: `resolveTable` sets `error` whenever `table` is `null`. */
interface ResolveResult {
table: KtxSchemaTable | null;
error?: Omit<KtxEntityDetailsErrorResult['error'], 'message'> & { message: string };
}
/**
 * Lower-cases an identifier so it can be compared case-insensitively.
 *
 * @param value Identifier part; `null` and `undefined` are treated as the empty string.
 * @returns The lower-cased value.
 */
function normalize(value: string | null | undefined): string {
  if (value === null || value === undefined) {
    return '';
  }
  return value.toLowerCase();
}
/**
 * Compares two table refs part by part (catalog, db, name), ignoring case and
 * treating a missing part the same as an empty one.
 */
function refsEqual(left: KtxTableRef, right: KtxTableRef): boolean {
  const parts = ['catalog', 'db', 'name'] as const;
  return parts.every((part) => normalize(left[part]) === normalize(right[part]));
}
/**
 * Trims one identifier segment and strips a single quoting character from each end
 * (double quote, single quote, backtick, or square bracket).
 *
 * @param part One dot-separated segment of a display string.
 * @returns The segment without surrounding whitespace or edge quoting.
 */
function cleanIdentifierPart(part: string): string {
  const trimmed = part.trim();
  const edgeQuote = /^["'`\[]|["'`\]]$/g;
  return trimmed.replace(edgeQuote, '');
}
/**
 * Splits a dotted display string into cleaned identifier segments.
 * Segments that are empty after cleaning are dropped.
 */
function splitDisplay(display: string): string[] {
  const segments: string[] = [];
  for (const raw of display.trim().split('.')) {
    const cleaned = cleanIdentifierPart(raw);
    if (cleaned) {
      segments.push(cleaned);
    }
  }
  return segments;
}
/**
 * Formats a table ref for display.
 *
 * @param driver Connection driver; sqlite tables are shown by bare name.
 * @param table Table ref to format.
 * @returns The table name for sqlite, otherwise the non-empty catalog/db/name parts joined by `.`.
 */
function displayForTable(driver: KtxConnectionDriver, table: KtxTableRef): string {
  if (driver === 'sqlite') {
    return table.name;
  }
  const segments: string[] = [];
  for (const segment of [table.catalog, table.db, table.name]) {
    if (segment) {
      segments.push(segment);
    }
  }
  return segments.join('.');
}
/** Extracts just the identifying parts (catalog, db, name) of a scanned table. */
function tableRef(table: KtxSchemaTable): KtxTableRef {
  const { catalog, db, name } = table;
  return { catalog, db, name };
}
/**
 * Turns tables into error candidates (ref plus display string), ordered by display string.
 *
 * @param driver Driver used to format each display string.
 * @param tables Tables to list; the input array is not modified.
 */
function candidateList(
  driver: KtxConnectionDriver,
  tables: KtxSchemaTable[],
): Array<{ tableRef: KtxTableRef; display: string }> {
  const candidates: Array<{ tableRef: KtxTableRef; display: string }> = [];
  for (const table of tables) {
    candidates.push({
      tableRef: tableRef(table),
      display: displayForTable(driver, table),
    });
  }
  candidates.sort((a, b) => a.display.localeCompare(b.display));
  return candidates;
}
/**
 * Parses a display string into a table ref when its segment count is unambiguous for the driver.
 *
 * - sqlite: exactly one segment (name).
 * - bigquery / snowflake / sqlserver: exactly three segments (catalog.db.name).
 * - any other driver: two segments (db.name) or three (catalog.db.name).
 *
 * @returns The parsed ref, or `null` when the segment count does not fit the driver.
 */
function parseDisplayRef(driver: KtxConnectionDriver, display: string): KtxTableRef | null {
  const parts = splitDisplay(display);
  const isSqlite = driver === 'sqlite';
  const needsCatalog = driver === 'bigquery' || driver === 'snowflake' || driver === 'sqlserver';

  switch (parts.length) {
    case 1:
      return isSqlite ? { catalog: null, db: null, name: parts[0]! } : null;
    case 2:
      return isSqlite || needsCatalog ? null : { catalog: null, db: parts[0]!, name: parts[1]! };
    case 3:
      return isSqlite ? null : { catalog: parts[0]!, db: parts[1]!, name: parts[2]! };
    default:
      return null;
  }
}
/**
 * Resolves a caller-supplied table input against the tables of a snapshot.
 *
 * Resolution order:
 * 1. A structured ref is matched directly (case-insensitively).
 * 2. A display string that parses into a ref for the snapshot's driver is matched the same way.
 * 3. Otherwise the input is treated as a bare table name and matched against table names
 *    across all schemas/catalogs.
 *
 * @param snapshot Snapshot whose tables are searched.
 * @param input Display string or structured ref.
 * @returns `{ table }` on success. On failure `table` is `null` and `error` is
 *   `table_not_found` (candidates: every table in the snapshot) or `ambiguous_table`
 *   (candidates: the tables sharing the bare name).
 */
function resolveTable(snapshot: KtxSchemaSnapshot, input: KtxEntityDetailsTableInput): ResolveResult {
  // Single place that builds the not-found error so all three paths stay in sync.
  const notFound = (label: string): ResolveResult => ({
    table: null,
    error: {
      code: 'table_not_found',
      message: `Table not found in latest scan: ${label}`,
      candidates: candidateList(snapshot.driver, snapshot.tables),
    },
  });

  if (typeof input !== 'string') {
    const table = snapshot.tables.find((candidate) => refsEqual(candidate, input));
    return table ? { table } : notFound(displayForTable(snapshot.driver, input));
  }

  const parsed = parseDisplayRef(snapshot.driver, input);
  if (parsed) {
    const table = snapshot.tables.find((candidate) => refsEqual(candidate, parsed));
    return table ? { table } : notFound(input);
  }

  // Qualified names are trimmed and unquoted while being parsed; give a bare name the same
  // treatment so `"orders"` or ` orders ` resolves like `orders`. Inputs that do not reduce to
  // exactly one segment are compared as given.
  const segments = splitDisplay(input);
  const bareName = segments.length === 1 ? segments[0]! : input;
  const byName = snapshot.tables.filter((candidate) => normalize(candidate.name) === normalize(bareName));
  if (byName.length === 1) {
    return { table: byName[0]! };
  }
  if (byName.length > 1) {
    return {
      table: null,
      error: {
        code: 'ambiguous_table',
        message: `Table name "${input}" is ambiguous across schemas/catalogs; pass a structured table ref.`,
        candidates: candidateList(snapshot.driver, byName),
      },
    };
  }
  return notFound(input);
}
/**
 * Projects a scanned schema column onto the fields exposed by the entity details service.
 * Any other properties on the source column are left out.
 */
function toColumn(column: KtxSchemaColumn): KtxEntityDetailsColumn {
  const { name, nativeType, normalizedType, dimensionType, nullable, primaryKey, comment } = column;
  return { name, nativeType, normalizedType, dimensionType, nullable, primaryKey, comment };
}
/**
 * Builds the snapshot descriptor attached to results.
 *
 * @param report Scan report supplying the sync id and run id.
 * @param snapshot Structural snapshot supplying the extraction timestamp.
 * @returns The descriptor; `scanRunId` is `null` when the report has no run id.
 */
function snapshotInfo(report: KtxScanReport, snapshot: KtxSchemaSnapshot): KtxEntityDetailsSnapshotInfo {
  const { syncId, runId } = report;
  const { extractedAt } = snapshot;
  return { syncId, extractedAt, scanRunId: runId ?? null };
}
/**
 * Reads a file from the project's file store and parses its content as JSON.
 * The parsed value is not validated against `T`.
 */
async function readJson<T>(project: KtxLocalProject, path: string): Promise<T> {
  const file = await project.fileStore.readFile(path);
  return JSON.parse(file.content) as T;
}
/**
 * Loads the most recent live-database scan of a connection.
 *
 * "Most recent" is the `scan-report.json` path that sorts last among the files listed under
 * `raw-sources/<connectionId>/live-database`.
 * NOTE(review): this relies on sync ids sorting in chronological order as plain strings — confirm.
 *
 * @returns The report and its structural snapshot, or `null` when listing fails or no
 *   scan report exists.
 */
async function latestScan(project: KtxLocalProject, connectionId: string): Promise<LatestScan | null> {
  const reportFileSuffix = '/scan-report.json';
  let listing;
  try {
    listing = await project.fileStore.listFiles(`raw-sources/${connectionId}/live-database`);
  } catch {
    // A failed listing is treated the same as "no scan yet".
    return null;
  }

  const reportPaths = listing.files.filter((path) => path.endsWith(reportFileSuffix));
  reportPaths.sort();
  const reportPath = reportPaths[reportPaths.length - 1];
  if (!reportPath) {
    return null;
  }

  const report = await readJson<KtxScanReport>(project, reportPath);
  // Prefer the directory recorded in the report; otherwise derive it from the report's own path.
  const rawSourcesDir = report.artifactPaths.rawSourcesDir ?? reportPath.slice(0, -reportFileSuffix.length);
  const snapshot = await readLocalScanStructuralSnapshot({
    project,
    connectionId,
    driver: report.driver,
    rawSourcesDir,
    extractedAtFallback: report.createdAt,
  });
  return { report, snapshot };
}
/**
 * Creates the entity details service for a local project.
 *
 * The returned `read` method answers from the latest live-database scan of the requested
 * connection and yields one result per requested entity, in request order:
 * - `scan_missing` for every entity when no scan is available;
 * - a table-resolution error (`table_not_found` / `ambiguous_table`) when the table cannot be resolved;
 * - `column_not_found` when at least one requested column does not exist on the table;
 * - otherwise the table details, with columns narrowed to the requested ones (matched
 *   case-insensitively) and the table's full foreign-key list.
 *
 * @param project Local project whose file store holds the scan artifacts.
 */
export function createKtxEntityDetailsService(project: KtxLocalProject) {
  return {
    async read(input: KtxEntityDetailsInput): Promise<KtxEntityDetailsResponse> {
      const scan = await latestScan(project, input.connectionId);
      if (!scan) {
        return {
          results: input.entities.map((entity) => ({
            ok: false,
            connectionId: input.connectionId,
            table: entity.table,
            error: {
              code: 'scan_missing',
              message: `No live-database scan found for connection "${input.connectionId}"; run \`ktx ingest ${input.connectionId}\` or \`ktx scan ${input.connectionId}\`.`,
            },
          })),
        };
      }
      const info = snapshotInfo(scan.report, scan.snapshot);
      const results: KtxEntityDetailsResponse['results'] = [];
      for (const entity of input.entities) {
        const resolved = resolveTable(scan.snapshot, entity.table);
        if (!resolved.table) {
          results.push({
            ok: false,
            connectionId: input.connectionId,
            table: entity.table,
            snapshot: info,
            // resolveTable always supplies an error when it returns no table.
            error: resolved.error!,
          });
          continue;
        }
        const table = resolved.table;
        const requested = new Set((entity.columns ?? []).map((column) => normalize(column)));
        const columns = requested.size
          ? table.columns.filter((column) => requested.has(normalize(column.name)))
          : table.columns;
        // Decide on the set of names that were actually found, not on array lengths: a table
        // may hold several columns whose names differ only by case, which makes the matched
        // column count exceed the requested count even though nothing is missing.
        const found = new Set(columns.map((column) => normalize(column.name)));
        const missing = [...requested].filter((column) => !found.has(column));
        if (missing.length > 0) {
          results.push({
            ok: false,
            connectionId: input.connectionId,
            table: entity.table,
            snapshot: info,
            error: {
              code: 'column_not_found',
              message: `Column(s) not found on ${displayForTable(scan.snapshot.driver, table)}: ${missing.join(', ')}`,
              candidates: table.columns.map((column) => column.name),
            },
          });
          continue;
        }
        results.push({
          ok: true,
          connectionId: input.connectionId,
          tableRef: tableRef(table),
          display: displayForTable(scan.snapshot.driver, table),
          kind: table.kind,
          comment: table.comment,
          estimatedRows: table.estimatedRows,
          columns: columns.map(toColumn),
          foreignKeys: table.foreignKeys,
          snapshot: info,
        });
      }
      return { results };
    },
  };
}

View file

@ -60,6 +60,24 @@ export {
ktxScanErrorMessage,
skippedKtxScanEnrichmentSummary,
} from './enrichment-summary.js';
export type {
KtxEntityDetailsColumn,
KtxEntityDetailsErrorCode,
KtxEntityDetailsErrorResult,
KtxEntityDetailsInput,
KtxEntityDetailsRecord,
KtxEntityDetailsResponse,
KtxEntityDetailsSnapshotInfo,
KtxEntityDetailsTableInput,
} from './entity-details.js';
export { createKtxEntityDetailsService } from './entity-details.js';
export type {
DisplayTargetResolution,
RawSchemaHit,
TableDetail,
WarehouseCatalogServiceDeps,
} from './warehouse-catalog.js';
export { WarehouseCatalogService } from './warehouse-catalog.js';
export type {
KtxColumnSampleUpdate,
KtxDescriptionSource,

View file

@ -0,0 +1,205 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { initKtxProject, type KtxLocalProject } from '../project/index.js';
import { WarehouseCatalogService } from './warehouse-catalog.js';
describe('WarehouseCatalogService', () => {
// Scratch directory for the current test and the KTX project initialized inside it.
let tempDir: string;
let project: KtxLocalProject;
// Each test starts from a fresh temp directory with a newly initialized project under `<tempDir>/project`.
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'ktx-warehouse-catalog-'));
project = await initKtxProject({ projectDir: join(tempDir, 'project') });
});
// Remove the whole temp directory after each test; `force` keeps cleanup from failing if it is already gone.
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
/**
 * Writes a minimal live-database scan into the test project's file store:
 * `connection.json`, a single `orders` table, and a relationship profile carrying
 * row counts and sample values for `orders.status`.
 *
 * The table ref depends on the driver: the catalog is `'analytics'` only for bigquery,
 * and the db is `null` only for sqlite (otherwise `'public'`).
 *
 * @param connectionId Connection the scan belongs to.
 * @param syncId Sync directory name under `live-database/`.
 * @param driver Driver recorded in the connection and profile artifacts.
 */
async function seedLiveDatabaseScan(connectionId = 'warehouse', syncId = 'sync-2', driver = 'postgres') {
  const root = `raw-sources/${connectionId}/live-database/${syncId}`;
  const tableRef = {
    catalog: driver === 'bigquery' ? 'analytics' : null,
    db: driver === 'sqlite' ? null : 'public',
    name: 'orders',
  };

  // All seeded files share the same serialization and author metadata.
  const save = (relativePath: string, payload: unknown, message: string) =>
    project.fileStore.writeFile(
      `${root}/${relativePath}`,
      JSON.stringify(payload, null, 2),
      'ktx',
      'ktx@example.com',
      message,
    );

  await save('connection.json', { connectionId, driver, extractedAt: '2026-05-12T00:00:00.000Z' }, 'seed connection');

  await save(
    'tables/orders.json',
    {
      ...tableRef,
      kind: 'table',
      comment: 'Customer orders',
      estimatedRows: 12,
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'Order id',
        },
        {
          name: 'status',
          nativeType: 'text',
          normalizedType: 'text',
          dimensionType: 'string',
          nullable: false,
          primaryKey: false,
          comment: 'Order status',
        },
      ],
      foreignKeys: [],
    },
    'seed orders',
  );

  await save(
    'enrichment/relationship-profile.json',
    {
      connectionId,
      driver,
      sqlAvailable: true,
      queryCount: 3,
      tables: [{ table: { ...tableRef }, rowCount: 12 }],
      columns: {
        'orders.status': {
          table: { ...tableRef },
          column: 'status',
          nativeType: 'text',
          normalizedType: 'text',
          rowCount: 12,
          nullCount: 0,
          distinctCount: 2,
          uniquenessRatio: 0.1667,
          nullRate: 0,
          sampleValues: ['paid', 'refunded'],
          minTextLength: 4,
          maxTextLength: 8,
        },
      },
      warnings: [],
    },
    'seed profile',
  );
}
// Seeds two syncs and expects `sync-2` to be reported as the latest. The table detail must combine
// schema fields with values from the relationship profile (row count, sample values, distinct count).
// The two `not.toHaveProperty` checks assert that no `connectionName` key is present; the key is
// assembled with `join`, presumably so the literal name does not appear in this file.
it('finds the latest sync and merges table schema with relationship profile values', async () => {
await seedLiveDatabaseScan('warehouse', 'sync-1');
await seedLiveDatabaseScan('warehouse', 'sync-2');
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.getLatestSyncId('warehouse')).resolves.toBe('sync-2');
const detail = await catalog.getTable({ connectionId: 'warehouse', catalog: null, db: 'public', name: 'orders' });
expect(detail).toMatchObject({
connectionId: 'warehouse',
display: 'public.orders',
rowCount: 12,
columns: [
{ name: 'id', nativeType: 'integer', primaryKey: true },
{ name: 'status', nativeType: 'text', sampleValues: ['paid', 'refunded'], distinctCount: 2 },
],
});
expect(detail).not.toHaveProperty(['connection', 'Name'].join(''));
const hits = await catalog.searchByName('warehouse', 'orders', 5);
expect(hits[0]).toMatchObject({
kind: 'table',
connectionId: 'warehouse',
display: 'public.orders',
});
expect(hits[0]).not.toHaveProperty(['connection', 'Name'].join(''));
});
// Nothing is seeded for the `missing` connection: `getTable` must resolve to null and `hasScan` to false.
// NOTE(review): the title mentions `scanAvailable`, but no such field is asserted here — consider renaming.
it('returns scanAvailable=false when no live-database scan exists', async () => {
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.getTable({ connectionId: 'missing', catalog: null, db: 'public', name: 'orders' })).resolves.toBeNull();
await expect(catalog.hasScan('missing')).resolves.toBe(false);
});
// With the default postgres scan, `public.orders` resolves to the seeded table with no candidates,
// while the misspelled `public.orderz` resolves to nothing and offers `orders` as a candidate.
it('resolves postgres display strings and returns closest candidates for missing tables', async () => {
await seedLiveDatabaseScan();
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.resolveDisplay('warehouse', 'public.orders')).resolves.toMatchObject({
resolved: { catalog: null, db: 'public', name: 'orders' },
candidates: [],
dialect: 'postgres',
});
await expect(catalog.resolveDisplay('warehouse', 'public.orderz')).resolves.toMatchObject({
resolved: null,
candidates: [{ name: 'orders' }],
});
});
// For a bigquery scan the seeded table has catalog `analytics`, so the two-part `public.orders`
// must not resolve to it.
it('treats two-part BigQuery identifiers as ambiguous instead of guessing', async () => {
await seedLiveDatabaseScan('warehouse', 'sync-bigquery', 'bigquery');
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.resolveDisplay('warehouse', 'public.orders')).resolves.toMatchObject({
resolved: null,
dialect: 'bigquery',
});
});
// On postgres the three-part `public.orders.status` must be read as schema.table.column,
// not as catalog.schema.table.
it('resolves postgres column display strings without treating the column as a table', async () => {
await seedLiveDatabaseScan();
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.resolveDisplayTarget('warehouse', 'public.orders.status')).resolves.toMatchObject({
resolved: { catalog: null, db: 'public', name: 'orders', column: 'status' },
candidates: [],
dialect: 'postgres',
});
});
// On bigquery a column display string has four parts: catalog.schema.table.column.
it('resolves BigQuery column display strings with four parts', async () => {
await seedLiveDatabaseScan('warehouse', 'sync-bigquery', 'bigquery');
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.resolveDisplayTarget('warehouse', 'analytics.public.orders.status')).resolves.toMatchObject({
resolved: { catalog: 'analytics', db: 'public', name: 'orders', column: 'status' },
candidates: [],
dialect: 'bigquery',
});
});
// Searching for `status` must return a column hit for `public.orders.status` that matched on its name.
// NOTE(review): only the column-name match is asserted; the comment and description matches named
// in the title are not covered by this case.
it('searches table names, column names, comments, and descriptions', async () => {
await seedLiveDatabaseScan();
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.searchByName('warehouse', 'status', 10)).resolves.toEqual(
expect.arrayContaining([
expect.objectContaining({
kind: 'column',
ref: expect.objectContaining({ db: 'public', name: 'orders', column: 'status' }),
matchedOn: 'name',
}),
]),
);
});
});

View file

@ -0,0 +1,448 @@
import { getDialectForDriver } from '../connections/index.js';
import type { KtxFileStorePort } from '../core/index.js';
import type {
KtxConnectionDriver,
KtxSchemaColumn,
KtxSchemaForeignKey,
KtxSchemaTable,
KtxTableRef,
} from './types.js';
/**
 * Driver identifiers understood by the catalog. `'sqlite3'` is accepted in addition to the
 * connection drivers and is handled the same way as `'sqlite'` by the display helpers in this file.
 */
type CatalogDriver = KtxConnectionDriver | 'sqlite3';
/** Dependencies injected into `WarehouseCatalogService`. */
export interface WarehouseCatalogServiceDeps {
fileStore: KtxFileStorePort;
}
/** A schema column combined with its descriptions and relationship-profile statistics. */
interface WarehouseColumnDetail extends KtxSchemaColumn {
descriptions: Record<string, string>;
rowCount: number | null;
nullCount: number | null;
distinctCount: number | null;
nullRate: number | null;
sampleValues: string[];
}
/** Full detail for one table as returned by `WarehouseCatalogService.getTable`. */
export interface TableDetail {
connectionId: string;
catalog: string | null;
db: string | null;
name: string;
/** Driver-specific display name of the table. */
display: string;
kind: string;
comment: string | null;
/** First non-blank entry of the table's descriptions, or `null`. */
description: string | null;
/** Row count from the relationship profile when present, otherwise the schema's estimated row count. */
rowCount: number | null;
columns: WarehouseColumnDetail[];
foreignKeys: KtxSchemaForeignKey[];
}
/** A search hit on either a table or a column; `kind` tells the two variants apart. */
export type RawSchemaHit =
| {
kind: 'table';
connectionId: string;
ref: KtxTableRef;
display: string;
/** Which table field contained the query. */
matchedOn: 'name' | 'db' | 'comment' | 'description';
}
| {
kind: 'column';
connectionId: string;
ref: KtxTableRef & { column: string };
display: string;
/** Which column field contained the query. */
matchedOn: 'name' | 'comment' | 'description';
};
/** Result of resolving a display string to a table, or to a table column when `column` is set. */
export interface DisplayTargetResolution {
resolved: (KtxTableRef & { column?: string }) | null;
/** Candidate tables offered alongside the resolution. */
candidates: KtxTableRef[];
dialect: string;
}
/** The part of the connection artifact this file reads; `driver` is optional in the type. */
interface ConnectionArtifact {
driver?: CatalogDriver;
}
/** Per-column statistics from the relationship profile artifact; every field is optional. */
interface RelationshipProfileColumn {
table?: KtxTableRef;
column?: string;
rowCount?: number;
nullCount?: number;
distinctCount?: number;
nullRate?: number;
sampleValues?: unknown[];
}
/** The part of the relationship profile artifact this file reads. */
interface RelationshipProfileArtifact {
driver?: CatalogDriver;
tables?: Array<{ table?: KtxTableRef; rowCount?: number }>;
/** Column statistics keyed by string; `getTable` also matches entries by their `table` and `column` fields. */
columns?: Record<string, RelationshipProfileColumn>;
}
/** Everything loaded for one connection's sync: its tables and, when available, its relationship profile. */
interface ConnectionCatalog {
connectionId: string;
syncId: string;
driver: CatalogDriver;
tables: KtxSchemaTable[];
profile: RelationshipProfileArtifact | null;
}
/** A scanned table whose table-level and column-level entries may carry a `descriptions` map. */
type TableWithDescriptions = KtxSchemaTable & {
descriptions?: Record<string, string>;
columns: Array<KtxSchemaColumn & { descriptions?: Record<string, string> }>;
};
/**
 * Lower-cases a value for case-insensitive comparison and substring search.
 *
 * @param value Text to normalize; `null` and `undefined` become the empty string.
 */
function normalize(value: string | null | undefined): string {
  return value == null ? '' : value.toLowerCase();
}
/**
 * Tells whether two table refs name the same table, comparing catalog, db, and name
 * case-insensitively and treating a missing part as empty.
 */
function refsEqual(left: KtxTableRef, right: KtxTableRef): boolean {
  if (normalize(left.catalog) !== normalize(right.catalog)) {
    return false;
  }
  if (normalize(left.db) !== normalize(right.db)) {
    return false;
  }
  return normalize(left.name) === normalize(right.name);
}
/**
 * Builds a case-insensitive lookup key `catalog.db.name` for a table ref.
 * Missing parts contribute an empty segment, so the key always has three segments.
 */
function refKey(ref: KtxTableRef): string {
  const { catalog, db, name } = ref;
  return `${normalize(catalog)}.${normalize(db)}.${normalize(name)}`;
}
/** Builds a case-insensitive lookup key for a column: the table's ref key followed by the column name. */
function columnKey(ref: KtxTableRef, column: string): string {
  return [refKey(ref), normalize(column)].join('.');
}
/**
 * Parses JSON text and returns it typed as `T`.
 * The parsed value is not validated against `T`; malformed JSON throws a `SyntaxError`.
 */
function readJson<T>(content: string): T {
  const parsed: unknown = JSON.parse(content);
  return parsed as T;
}
/**
 * Trims one identifier segment and removes a single quoting character from each end
 * (double quote, single quote, backtick, or square bracket).
 */
function cleanIdentifierPart(part: string): string {
  const edgeQuote = /^["'`\[]|["'`\]]$/g;
  const trimmed = part.trim();
  return trimmed.replace(edgeQuote, '');
}
/**
 * Splits a dotted display string into cleaned identifier segments,
 * discarding segments that are empty after cleaning.
 */
function splitDisplay(display: string): string[] {
  return display
    .trim()
    .split('.')
    .flatMap((raw) => {
      const cleaned = cleanIdentifierPart(raw);
      return cleaned ? [cleaned] : [];
    });
}
/**
 * Formats a table ref for display.
 *
 * @param driver Catalog driver; `sqlite` and `sqlite3` tables are shown by bare name.
 * @param table Table ref to format.
 * @returns The table name for sqlite drivers, otherwise the non-empty catalog/db/name parts joined by `.`.
 */
function formatDisplay(driver: CatalogDriver, table: KtxTableRef): string {
  const isSqlite = driver === 'sqlite' || driver === 'sqlite3';
  if (isSqlite) {
    return table.name;
  }
  const segments: string[] = [];
  for (const segment of [table.catalog, table.db, table.name]) {
    if (segment) {
      segments.push(segment);
    }
  }
  return segments.join('.');
}
/**
 * Parses a table display string into a table ref according to the driver's naming depth.
 *
 * - sqlite / sqlite3: exactly one segment (name).
 * - bigquery / snowflake / sqlserver: exactly three segments (catalog.db.name).
 * - any other driver: one segment (name), two (db.name), or three (catalog.db.name).
 *
 * @returns The parsed ref, or `null` when the segment count does not fit the driver.
 */
function parseDisplay(driver: CatalogDriver, display: string): KtxTableRef | null {
  const parts = splitDisplay(display);
  const isSqlite = driver === 'sqlite' || driver === 'sqlite3';
  const needsCatalog = driver === 'bigquery' || driver === 'snowflake' || driver === 'sqlserver';

  switch (parts.length) {
    case 1:
      return needsCatalog ? null : { catalog: null, db: null, name: parts[0]! };
    case 2:
      return isSqlite || needsCatalog ? null : { catalog: null, db: parts[0]!, name: parts[1]! };
    case 3:
      return isSqlite ? null : { catalog: parts[0]!, db: parts[1]!, name: parts[2]! };
    default:
      return null;
  }
}
/**
 * Number of dot-separated segments a fully written table display string has for a driver:
 * 1 for sqlite drivers, 3 for bigquery/snowflake/sqlserver, and 2 for every other driver.
 */
function expectedDisplayPartCount(driver: CatalogDriver): number {
  switch (driver) {
    case 'sqlite':
    case 'sqlite3':
      return 1;
    case 'bigquery':
    case 'snowflake':
    case 'sqlserver':
      return 3;
    default:
      return 2;
  }
}
/**
 * Parses a column display string: the driver's expected table segments followed by one
 * column segment.
 *
 * @returns The table ref with the column name added, or `null` when the segment count is
 *   not exactly one more than the driver's table depth or the table part does not parse.
 */
function parseColumnDisplay(driver: CatalogDriver, display: string): (KtxTableRef & { column: string }) | null {
  const parts = splitDisplay(display);
  if (parts.length !== expectedDisplayPartCount(driver) + 1) {
    return null;
  }
  const column = parts[parts.length - 1];
  if (!column) {
    return null;
  }
  // Re-join the leading segments so table parsing applies the same per-driver rules.
  const tableDisplay = parts.slice(0, parts.length - 1).join('.');
  const table = parseDisplay(driver, tableDisplay);
  if (!table) {
    return null;
  }
  return { ...table, column };
}
/**
 * Suggests tables whose names resemble the last segment of a display string.
 *
 * Scoring per table name (case-insensitive): 100 for an exact match, 80 when one name
 * contains the other, otherwise the share of character positions at which both names
 * agree. Tables scoring 0 are dropped; the rest are ordered by score (highest first),
 * then by table name.
 *
 * @param tables Tables to rank.
 * @param display Display string the caller tried to resolve.
 * @param limit Maximum number of candidates returned.
 */
function bestCandidates(tables: KtxSchemaTable[], display: string, limit = 5): KtxTableRef[] {
  const needle = normalize(splitDisplay(display).at(-1) ?? display);

  const scoreOf = (name: string): number => {
    if (name === needle) {
      return 100;
    }
    if (name.includes(needle) || needle.includes(name)) {
      return 80;
    }
    let matchingPositions = 0;
    let index = 0;
    for (const char of name) {
      if (needle[index] === char) {
        matchingPositions += 1;
      }
      index += 1;
    }
    return matchingPositions / Math.max(name.length, needle.length, 1);
  };

  const scored: Array<{ table: KtxSchemaTable; score: number }> = [];
  for (const table of tables) {
    const score = scoreOf(normalize(table.name));
    if (score > 0) {
      scored.push({ table, score });
    }
  }
  scored.sort((left, right) => right.score - left.score || left.table.name.localeCompare(right.table.name));

  return scored.slice(0, limit).map(({ table }) => ({ catalog: table.catalog, db: table.db, name: table.name }));
}
/**
 * Returns the first description that is not blank, in the map's own key order.
 *
 * @param descriptions Map of descriptions; may be undefined.
 * @returns The first value containing non-whitespace text (returned untrimmed), or `null`.
 */
function firstDescription(descriptions: Record<string, string> | undefined): string | null {
  if (!descriptions) {
    return null;
  }
  for (const value of Object.values(descriptions)) {
    if (value.trim().length > 0) {
      return value;
    }
  }
  return null;
}
/**
 * Finds the first table field that contains the query (case-insensitive substring match).
 * Fields are checked in this order: name, db, comment, first non-blank description.
 *
 * @returns The matching field, or `null` for an empty query or when nothing matches.
 */
function matchedOnTable(table: TableWithDescriptions, query: string): RawSchemaHit['matchedOn'] | null {
  const q = normalize(query);
  if (!q) {
    return null;
  }
  // Each text is produced lazily so later fields are only read when earlier ones did not match.
  const probes: Array<[RawSchemaHit['matchedOn'], () => string | null | undefined]> = [
    ['name', () => table.name],
    ['db', () => table.db],
    ['comment', () => table.comment],
    ['description', () => firstDescription(table.descriptions)],
  ];
  for (const [field, text] of probes) {
    if (normalize(text()).includes(q)) {
      return field;
    }
  }
  return null;
}
/**
 * Finds the first column field that contains the query (case-insensitive substring match).
 * Fields are checked in this order: name, comment, first non-blank description.
 *
 * @returns The matching field, or `null` for an empty query or when nothing matches.
 */
function matchedOnColumn(
  column: KtxSchemaColumn & { descriptions?: Record<string, string> },
  query: string,
): 'name' | 'comment' | 'description' | null {
  const q = normalize(query);
  if (!q) {
    return null;
  }
  // Each text is produced lazily so later fields are only read when earlier ones did not match.
  const probes: Array<['name' | 'comment' | 'description', () => string | null | undefined]> = [
    ['name', () => column.name],
    ['comment', () => column.comment],
    ['description', () => firstDescription(column.descriptions)],
  ];
  for (const [field, text] of probes) {
    if (normalize(text()).includes(q)) {
      return field;
    }
  }
  return null;
}
/**
 * Read-only view over the scan artifacts stored for a connection under
 * `raw-sources/<connectionId>/live-database`, read through `deps.fileStore`.
 *
 * The catalog for each connection is loaded at most once per successful read and
 * then served from an in-memory cache; concurrent callers share a single
 * in-flight read. A failed read is not cached, so the next call retries.
 */
export class WarehouseCatalogService {
  /** In-flight or completed catalog loads, keyed by connection id. */
  private readonly catalogs = new Map<string, Promise<ConnectionCatalog | null>>();

  constructor(private readonly deps: WarehouseCatalogServiceDeps) {}

  /** Returns `true` when a catalog could be loaded for the connection. */
  async hasScan(connectionId: string): Promise<boolean> {
    return (await this.loadCatalog(connectionId)) !== null;
  }

  /** Returns the sync id of the loaded catalog, or `null` when there is none. */
  async getLatestSyncId(connectionId: string): Promise<string | null> {
    return (await this.loadCatalog(connectionId))?.syncId ?? null;
  }

  /** Lists a reference for every table in the catalog; empty when there is no catalog. */
  async listTables(connectionId: string): Promise<KtxTableRef[]> {
    const catalog = await this.loadCatalog(connectionId);
    return catalog?.tables.map((table) => ({ catalog: table.catalog, db: table.db, name: table.name })) ?? [];
  }

  /**
   * Returns the table matching `ref`, with each column merged with its profile
   * statistics when the relationship profile has them.
   *
   * @returns `null` when the connection has no catalog or no table matches `ref`.
   */
  async getTable(ref: { connectionId: string } & KtxTableRef): Promise<TableDetail | null> {
    const catalog = await this.loadCatalog(ref.connectionId);
    if (!catalog) {
      return null;
    }
    const table = catalog.tables.find((candidate) => refsEqual(candidate, ref)) as TableWithDescriptions | undefined;
    if (!table) {
      return null;
    }
    const profileTables = catalog.profile?.tables ?? [];
    const profileTable = profileTables.find((candidate) => candidate.table && refsEqual(candidate.table, table));
    const profileColumns = catalog.profile?.columns ?? {};
    return {
      connectionId: ref.connectionId,
      catalog: table.catalog,
      db: table.db,
      name: table.name,
      display: formatDisplay(catalog.driver, table),
      kind: table.kind,
      comment: table.comment,
      description: firstDescription(table.descriptions),
      // Prefer the profiled row count; fall back to the scan's estimate.
      rowCount: profileTable?.rowCount ?? table.estimatedRows ?? null,
      columns: table.columns.map((rawColumn) => {
        const column = rawColumn as KtxSchemaColumn & { descriptions?: Record<string, string> };
        // Try the exact profile key first, then fall back to a normalized
        // "<table>.<column>" key or to an entry whose table/column refs match.
        const profileColumn =
          profileColumns[columnKey(table, column.name)] ??
          Object.entries(profileColumns).find(
            ([key, value]) =>
              normalize(key) === `${normalize(table.name)}.${normalize(column.name)}` ||
              (value.table && refsEqual(value.table, table) && normalize(value.column) === normalize(column.name)),
          )?.[1];
        return {
          ...column,
          descriptions: column.descriptions ?? {},
          rowCount: profileColumn?.rowCount ?? null,
          nullCount: profileColumn?.nullCount ?? null,
          distinctCount: profileColumn?.distinctCount ?? null,
          nullRate: profileColumn?.nullRate ?? null,
          sampleValues: (profileColumn?.sampleValues ?? []).map((value) => String(value)),
        };
      }),
      foreignKeys: table.foreignKeys,
    };
  }

  /**
   * Resolves a table display string to a table reference.
   *
   * @returns `resolved` when the parsed display matches a catalog table; otherwise
   * `resolved` is `null` and `candidates` holds the closest table names. When the
   * connection has no catalog, `candidates` is empty and `dialect` is `'unknown'`.
   */
  async resolveDisplay(
    connectionId: string,
    display: string,
  ): Promise<{
    resolved: KtxTableRef | null;
    candidates: KtxTableRef[];
    dialect: string;
  }> {
    const catalog = await this.loadCatalog(connectionId);
    if (!catalog) {
      return { resolved: null, candidates: [], dialect: 'unknown' };
    }
    const dialect = getDialectForDriver(catalog.driver).type;
    const parsed = parseDisplay(catalog.driver, display);
    if (!parsed) {
      return { resolved: null, candidates: bestCandidates(catalog.tables, display), dialect };
    }
    const table = catalog.tables.find((candidate) => refsEqual(candidate, parsed));
    if (!table) {
      return { resolved: null, candidates: bestCandidates(catalog.tables, display), dialect };
    }
    return { resolved: { catalog: table.catalog, db: table.db, name: table.name }, candidates: [], dialect };
  }

  /**
   * Resolves a display string that may name either a table or a table column.
   *
   * The string is first tried as a table; only when that fails is it parsed as a
   * column display. For a column, the owning table must exist in the catalog.
   * NOTE(review): the column name itself is taken from the parsed display and is
   * not checked against the table's columns here — confirm callers handle that.
   */
  async resolveDisplayTarget(connectionId: string, display: string): Promise<DisplayTargetResolution> {
    const catalog = await this.loadCatalog(connectionId);
    if (!catalog) {
      return { resolved: null, candidates: [], dialect: 'unknown' };
    }
    const dialect = getDialectForDriver(catalog.driver).type;
    const tableResolution = await this.resolveDisplay(connectionId, display);
    if (tableResolution.resolved) {
      return tableResolution;
    }
    const parsedColumn = parseColumnDisplay(catalog.driver, display);
    if (!parsedColumn) {
      return { resolved: null, candidates: bestCandidates(catalog.tables, display), dialect };
    }
    const table = catalog.tables.find((candidate) => refsEqual(candidate, parsedColumn));
    if (!table) {
      return { resolved: null, candidates: bestCandidates(catalog.tables, display), dialect };
    }
    return {
      resolved: {
        catalog: table.catalog,
        db: table.db,
        name: table.name,
        column: parsedColumn.column,
      },
      candidates: [],
      dialect,
    };
  }

  /**
   * Finds tables and columns whose name, comment or description (and, for
   * tables, db) contains the query.
   *
   * Hits are produced in catalog order: each table's own hit (if any) followed
   * by its matching columns. At most `limit` hits are returned; a non-positive
   * limit yields an empty list.
   */
  async searchByName(connectionId: string, query: string, limit: number): Promise<RawSchemaHit[]> {
    const catalog = await this.loadCatalog(connectionId);
    if (!catalog) {
      return [];
    }
    const cap = Math.max(0, limit);
    const hits: RawSchemaHit[] = [];
    for (const table of catalog.tables as TableWithDescriptions[]) {
      // Stop scanning once enough hits are collected; anything further would be sliced off.
      if (hits.length >= cap) {
        break;
      }
      const tableMatch = matchedOnTable(table, query);
      if (tableMatch) {
        hits.push({
          kind: 'table',
          connectionId,
          ref: { catalog: table.catalog, db: table.db, name: table.name },
          display: formatDisplay(catalog.driver, table),
          matchedOn: tableMatch,
        });
      }
      for (const column of table.columns) {
        if (hits.length >= cap) {
          break;
        }
        const columnMatch = matchedOnColumn(column, query);
        if (!columnMatch) {
          continue;
        }
        hits.push({
          kind: 'column',
          connectionId,
          ref: { catalog: table.catalog, db: table.db, name: table.name, column: column.name },
          display: `${formatDisplay(catalog.driver, table)}.${column.name}`,
          matchedOn: columnMatch,
        });
      }
    }
    // The final slice keeps the result exact for fractional or NaN limits.
    return hits.slice(0, cap);
  }

  /**
   * Returns the cached catalog load for the connection, starting one if needed.
   *
   * The promise is cached immediately so concurrent callers share one read. If
   * the read rejects, the entry is evicted so a later call can retry instead of
   * replaying the same failure forever.
   */
  private loadCatalog(connectionId: string): Promise<ConnectionCatalog | null> {
    const existing = this.catalogs.get(connectionId);
    if (existing) {
      return existing;
    }
    const pending = this.readCatalog(connectionId);
    this.catalogs.set(connectionId, pending);
    void pending.catch(() => {
      // Only evict our own entry; a newer load may already have replaced it.
      if (this.catalogs.get(connectionId) === pending) {
        this.catalogs.delete(connectionId);
      }
    });
    return pending;
  }

  /**
   * Reads the catalog from the file store.
   *
   * The scan used is the one whose `connection.json` path sorts last among the
   * files listed under the connection's root. Tables come from the `.json` files
   * under that scan's `tables` directory, in sorted path order. The relationship
   * profile is optional.
   *
   * @returns `null` when no `connection.json` exists for the connection.
   */
  private async readCatalog(connectionId: string): Promise<ConnectionCatalog | null> {
    const root = `raw-sources/${connectionId}/live-database`;
    const listed = await this.deps.fileStore.listFiles(root);
    const connectionFiles = listed.files.filter((file) => file.endsWith('/connection.json')).sort();
    const latestConnectionPath = connectionFiles.at(-1);
    if (!latestConnectionPath) {
      return null;
    }
    const latestRoot = latestConnectionPath.slice(0, -'/connection.json'.length);
    // The sync id is the last path segment of the scan directory.
    const syncId = latestRoot.split('/').at(-1) ?? '';
    const connection = readJson<ConnectionArtifact>((await this.deps.fileStore.readFile(latestConnectionPath)).content);
    const tablesListing = await this.deps.fileStore.listFiles(`${latestRoot}/tables`);
    const tables: KtxSchemaTable[] = [];
    for (const tablePath of tablesListing.files.filter((file) => file.endsWith('.json')).sort()) {
      tables.push(readJson<KtxSchemaTable>((await this.deps.fileStore.readFile(tablePath)).content));
    }
    let profile: RelationshipProfileArtifact | null = null;
    try {
      profile = readJson<RelationshipProfileArtifact>(
        (await this.deps.fileStore.readFile(`${latestRoot}/enrichment/relationship-profile.json`)).content,
      );
    } catch {
      // Best effort: any failure to read or parse the profile leaves it absent.
      profile = null;
    }
    return {
      connectionId,
      syncId,
      // Driver precedence: connection artifact, then profile, then 'postgres'.
      driver: connection.driver ?? profile?.driver ?? 'postgres',
      tables,
      profile,
    };
  }
}