test: split cli tests from source tree (#216)

* feat(cli): define full warehouse dialect contract

* test(cli): keep dialect edge tests focused

* fix(cli): stabilize dialect contract foundation

* refactor(connectors): own read-only query preparation

* refactor(connectors): resolve dialects through registry

* refactor(connectors): keep concrete dialect classes internal

* chore(workspace): enforce dialect import boundary

* refactor(cli): resolve relationship dialect at scan boundary

* refactor(cli): use dialect display parsing for entity details

* refactor(cli): use dialect display parsing for warehouse catalog

* refactor(cli): use dialect SQL in relationship workflows

* test(cli): verify solid dialect scan workflow closure

* test: split cli tests from source tree

* refactor(cli): standardize BigQuery scope listing

* feat(sqlite): implement connector scope listing

* test(connectors): cover required table listing

* feat(cli): add warehouse driver registry

* refactor(setup): route scope discovery through driver registry

* refactor(cli): route local query execution through driver registry

* refactor(historic-sql): route dialect support through driver registry

* refactor(cli): test warehouse connections through driver registry

* fix(cli): close driver registry type export gaps

* Improve setup daemon diagnostics

* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback

Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.

* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match

The setup picker's KtxTableListEntry was a 2-level { schema, name } shape, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.

Align the picker boundary with the canonical 3-level KtxTableRef:

- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
  resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
  (resolveEnabledTables already accepts the 3-part shape) and
  schemasFromEnabledTables now goes through parseDottedTableEntry so it
  recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
  reuse.

Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).

* fix(cli): allow debug telemetry under opt-out env
This commit is contained in:
Andrey Avtomonov 2026-05-26 08:49:05 +02:00 committed by GitHub
parent 924868841d
commit 56985b7e09
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
548 changed files with 5048 additions and 2228 deletions

View file

@ -0,0 +1,70 @@
import { describe, expect, it } from 'vitest';
import { constraintDiscoveryWarning, tryConstraintQuery } from '../../../src/context/scan/constraint-discovery.js';
// Contract of tryConstraintQuery as pinned here: successful results pass through,
// errors accepted by `isDeniedError` become a recoverable warning, and every other
// error is rethrown as the same instance.
describe('tryConstraintQuery', () => {
  it('returns the query value when the query succeeds', async () => {
    const outcome = await tryConstraintQuery(
      {
        schema: 'public',
        kind: 'primary_key',
        isDeniedError: () => false,
      },
      () => Promise.resolve(['id']),
    );

    expect(outcome).toEqual({ ok: true, value: ['id'] });
  });

  it('returns a recoverable warning when the classifier recognizes denial', async () => {
    const denied = Object.assign(new Error('permission denied'), { code: '42501' });

    // The classifier matches by identity, so only this exact error counts as a denial.
    const outcome = await tryConstraintQuery(
      {
        schema: 'analytics',
        kind: 'foreign_key',
        isDeniedError: (candidate) => candidate === denied,
      },
      () => Promise.reject(denied),
    );

    expect(outcome).toEqual({
      ok: false,
      warning: {
        code: 'constraint_discovery_unauthorized',
        message: 'Skipped foreign-key discovery in analytics (insufficient grants on system catalogs)',
        recoverable: true,
        metadata: { schema: 'analytics', kind: 'foreign_key' },
      },
    });
  });

  it('rethrows non-denial errors unchanged', async () => {
    const reset = Object.assign(new Error('connection reset'), { code: 'ECONNRESET' });

    const pending = tryConstraintQuery(
      {
        schema: 'public',
        kind: 'primary_key',
        isDeniedError: () => false,
      },
      () => Promise.reject(reset),
    );

    // toBe (not toEqual): the very same error object must surface.
    await expect(pending).rejects.toBe(reset);
  });
});
// Pins the exact warning payload produced for a primary-key discovery skip.
describe('constraintDiscoveryWarning', () => {
  it('formats stable primary-key warning text and metadata', () => {
    const warning = constraintDiscoveryWarning({ schema: 'public', kind: 'primary_key' });

    expect(warning).toEqual({
      code: 'constraint_discovery_unauthorized',
      message: 'Skipped primary-key discovery in public (insufficient grants on system catalogs)',
      recoverable: true,
      metadata: { schema: 'public', kind: 'primary_key' },
    });
  });
});

View file

@ -0,0 +1,183 @@
import { describe, expect, it } from 'vitest';
import { REDACTED_KTX_CREDENTIAL_VALUE } from '../../../src/context/core/redaction.js';
import {
redactKtxCredentialEnvelope,
redactKtxCredentialValue,
redactKtxScanMetadata,
redactKtxScanReport,
redactKtxScanWarning,
} from '../../../src/context/scan/credentials.js';
import type { KtxCredentialEnvelope, KtxScanReport, KtxScanWarning } from '../../../src/context/scan/types.js';
// Redaction contract for scan credentials: references stay readable, secret-looking
// fields are replaced with REDACTED_KTX_CREDENTIAL_VALUE, and non-sensitive context
// is left untouched.
describe('KTX scan credential redaction', () => {
  it('keeps credential references inspectable', () => {
    const references: KtxCredentialEnvelope[] = [
      { kind: 'env', name: 'DATABASE_URL' },
      { kind: 'file', path: '~/.config/ktx/warehouse' },
    ];

    for (const reference of references) {
      expect(redactKtxCredentialEnvelope(reference)).toEqual(reference);
    }
  });

  it('redacts resolved credential envelope values recursively', () => {
    const redactedEnvelope = redactKtxCredentialEnvelope({
      kind: 'resolved',
      source: 'host',
      values: {
        username: 'readonly',
        password: 'secret-password', // pragma: allowlist secret
        nested: {
          api_key: 'phx_123', // pragma: allowlist secret
          warehouse: 'compute_wh',
        },
        headers: [{ authorizationToken: 'token-value' }, { label: 'safe' }],
      },
    });

    // Secrets nested in objects and inside arrays are both expected to be replaced.
    expect(redactedEnvelope).toEqual({
      kind: 'resolved',
      source: 'host',
      redacted: true,
      values: {
        username: 'readonly',
        password: REDACTED_KTX_CREDENTIAL_VALUE,
        nested: {
          api_key: REDACTED_KTX_CREDENTIAL_VALUE,
          warehouse: 'compute_wh',
        },
        headers: [{ authorizationToken: REDACTED_KTX_CREDENTIAL_VALUE }, { label: 'safe' }],
      },
    });
  });

  it('redacts scan metadata fields that commonly contain secrets', () => {
    const redactedMetadata = redactKtxScanMetadata({
      driver: 'postgres',
      url: 'postgres://user:pass@example.test/db', // pragma: allowlist secret
      serviceAccountJson: {
        client_email: 'reader@example.test',
        private_key: 'pem-value', // pragma: allowlist secret
      },
      safeCount: 3,
    });

    expect(redactedMetadata).toEqual({
      driver: 'postgres',
      url: REDACTED_KTX_CREDENTIAL_VALUE,
      serviceAccountJson: {
        client_email: 'reader@example.test',
        private_key: REDACTED_KTX_CREDENTIAL_VALUE,
      },
      safeCount: 3,
    });
  });

  it('redacts scan warning messages and metadata without hiding safe context', () => {
    const leakyWarning: KtxScanWarning = {
      code: 'sampling_failed',
      message: 'sample failed for postgres://reader:secret@example.test/db', // pragma: allowlist secret
      recoverable: true,
      metadata: {
        table: 'orders',
        url: 'postgres://reader:secret@example.test/db', // pragma: allowlist secret
        nested: {
          api_key: 'sk_test_123', // pragma: allowlist secret
          schema: 'public',
        },
      },
    };

    const redactedWarning = redactKtxScanWarning(leakyWarning);

    // In the message only the password segment of the URL is masked; the metadata
    // `url` field is replaced wholesale.
    expect(redactedWarning).toEqual({
      code: 'sampling_failed',
      message: 'sample failed for postgres://reader:<redacted>@example.test/db',
      recoverable: true,
      metadata: {
        table: 'orders',
        url: REDACTED_KTX_CREDENTIAL_VALUE,
        nested: {
          api_key: REDACTED_KTX_CREDENTIAL_VALUE,
          schema: 'public',
        },
      },
    });
  });

  it('redacts scan report warning metadata recursively', () => {
    const leakedJson = '{"private_key":"pem-value"}'; // pragma: allowlist secret
    const report: KtxScanReport = {
      connectionId: 'warehouse',
      driver: 'postgres',
      syncId: 'sync-1',
      runId: 'run-1',
      trigger: 'cli',
      mode: 'structural',
      dryRun: false,
      artifactPaths: {
        rawSourcesDir: 'raw-sources/warehouse/live-database/sync-1',
        reportPath: 'raw-sources/warehouse/live-database/sync-1/scan-report.json',
        manifestShards: [],
        enrichmentArtifacts: [],
      },
      diffSummary: {
        tablesAdded: 0,
        tablesModified: 0,
        tablesDeleted: 0,
        tablesUnchanged: 0,
        columnsAdded: 0,
        columnsModified: 0,
        columnsDeleted: 0,
      },
      manifestShardsWritten: 0,
      structuralSyncStats: {
        tablesCreated: 0,
        tablesUpdated: 0,
        tablesDeleted: 0,
        columnsCreated: 0,
        columnsUpdated: 0,
        columnsDeleted: 0,
      },
      enrichment: {
        dataDictionary: 'skipped',
        tableDescriptions: 'skipped',
        columnDescriptions: 'skipped',
        embeddings: 'skipped',
        deterministicRelationships: 'skipped',
        llmRelationshipValidation: 'skipped',
        statisticalValidation: 'skipped',
      },
      capabilityGaps: [],
      warnings: [
        {
          code: 'credential_redacted',
          message: 'metadata redacted',
          recoverable: true,
          metadata: {
            credentials_json: leakedJson,
            safeCount: 2,
          },
        },
      ],
      relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
      enrichmentState: {
        resumedStages: [],
        completedStages: [],
        failedStages: [],
      },
      createdAt: '2026-04-29T00:00:00.000Z',
    };

    const redactedReport = redactKtxScanReport(report);

    expect(redactedReport.warnings[0]?.metadata).toEqual({
      credentials_json: REDACTED_KTX_CREDENTIAL_VALUE,
      safeCount: 2,
    });
    // The input report must still hold the original secret, i.e. it was not mutated.
    expect(report.warnings[0]?.metadata).toEqual({
      credentials_json: leakedJson,
      safeCount: 2,
    });
  });

  it('redacts standalone primitive credential values only when the field key is sensitive', () => {
    const sensitive = redactKtxCredentialValue('password', 'abc');
    expect(sensitive).toBe(REDACTED_KTX_CREDENTIAL_VALUE);

    const harmless = redactKtxCredentialValue('schema', 'public');
    expect(harmless).toBe('public');
  });
});

View file

@ -0,0 +1,114 @@
import { describe, expect, it } from 'vitest';
import {
defaultKtxDataDictionarySettings,
isKtxDataDictionaryCandidate,
shouldKtxSampleColumnForDictionary,
} from '../../../src/context/scan/data-dictionary.js';
const defaultPatterns = defaultKtxDataDictionarySettings.excludePatterns;

// Policy tests for choosing which columns are sampled into the data dictionary.
describe('KTX scan data dictionary policy', () => {
  type ColumnCase = readonly [string, string];

  // Checks every (columnType, columnName) pair, in order, against the default exclusion patterns.
  const expectCandidacy = (cases: readonly ColumnCase[], expected: boolean): void => {
    for (const [columnType, columnName] of cases) {
      expect(isKtxDataDictionaryCandidate(columnType, columnName, defaultPatterns)).toBe(expected);
    }
  };

  it('includes text-like and boolean categorical types', () => {
    expectCandidacy(
      [
        ['varchar(50)', 'status'],
        ['VARCHAR', 'category'],
        ['text', 'region'],
        ['string', 'payment_method'],
        ['nvarchar(100)', 'tier'],
        ['enum', 'status'],
        ['boolean', 'active'],
        ['bool', 'verified'],
        ['character varying(50)', 'region'],
        ['character(1)', 'flag'],
        ['ntext', 'category'],
      ],
      true,
    );
  });

  it('excludes non-categorical primitive types', () => {
    expectCandidacy(
      [
        ['integer', 'count'],
        ['bigint', 'total'],
        ['timestamp', 'created'],
        ['date', 'birth'],
        ['numeric', 'amount'],
        ['decimal(10,2)', 'price'],
        ['float', 'rate'],
      ],
      false,
    );
  });

  it('excludes configured high-cardinality or sensitive name patterns', () => {
    expectCandidacy(
      [
        ['varchar', 'user_id'],
        ['varchar', 'session_uuid'],
        ['varchar', 'api_key'],
        ['varchar', 'password_hash'],
        ['varchar', 'auth_token'],
        ['varchar', 'id'],
        ['varchar', 'created_at'],
        ['varchar', 'birth_date'],
        ['text', 'description'],
        ['text', 'email_body'],
        ['varchar', 'image_url'],
        ['varchar', 'email'],
        ['varchar', 'phone_number'],
        ['varchar', 'street_address'],
      ],
      false,
    );
  });

  it('keeps business categorical names eligible', () => {
    expectCandidacy(
      [
        ['varchar', 'status'],
        ['varchar', 'region'],
        ['varchar', 'country'],
        ['varchar', 'payment_method'],
        ['varchar', 'currency'],
        ['varchar', 'plan'],
        ['varchar', 'category'],
        ['varchar', 'tier'],
        ['varchar', 'gender'],
        ['varchar', 'language'],
        ['varchar', 'order_type'],
        ['varchar', 'order_status'],
      ],
      true,
    );
  });

  it('respects host-provided exclusion patterns and skips invalid regex patterns', () => {
    // Each case carries its own pattern array: [columnName, patterns, expected].
    const hostCases: Array<[string, string[], boolean]> = [
      ['company_size', ['company'], false],
      ['status', ['company'], true],
      ['status', ['[invalid', '(unclosed'], true],
    ];

    for (const [columnName, patterns, expected] of hostCases) {
      expect(isKtxDataDictionaryCandidate('varchar', columnName, patterns)).toBe(expected);
    }
  });

  it('skips columns that already have persisted dictionary state', () => {
    // All cases share the varchar type and the default settings; only the persisted state varies.
    const decide = (column: { columnName: string; sampleValues: string[] | null; cardinality: number | null }) =>
      shouldKtxSampleColumnForDictionary({
        columnType: 'varchar',
        settings: defaultKtxDataDictionarySettings,
        ...column,
      });

    const alreadyPopulated = decide({ columnName: 'status', sampleValues: ['paid'], cardinality: null });
    expect(alreadyPopulated).toEqual({ sample: false, reason: 'already_populated' });

    const emptyColumn = decide({ columnName: 'empty_status', sampleValues: null, cardinality: 0 });
    expect(emptyColumn).toEqual({ sample: false, reason: 'empty_column' });

    const highCardinality = decide({ columnName: 'customer_name', sampleValues: null, cardinality: 300 });
    expect(highCardinality).toEqual({ sample: false, reason: 'high_cardinality' });

    const untouched = decide({ columnName: 'status', sampleValues: null, cardinality: null });
    expect(untouched).toEqual({ sample: true });
  });
});

View file

@ -0,0 +1,784 @@
import { describe, expect, it, vi } from 'vitest';
// Partially mock the `ai` module: every real export is kept via importOriginal and only
// `generateText` is swapped for a bare vi.fn(), so tests can script LLM replies and
// inspect the calls made to it.
// NOTE(review): this call sits above the `import { generateText } from 'ai'` line; vitest
// documents that vi.mock calls are hoisted — confirm before moving or rewriting it.
vi.mock('ai', async (importOriginal) => {
const actual = await importOriginal<typeof import('ai')>();
return { ...actual, generateText: vi.fn() };
});
import { generateText } from 'ai';
import {
buildKtxColumnDescriptionPrompt,
buildKtxDataSourceDescriptionPrompt,
buildKtxTableDescriptionPrompt,
type KtxDescriptionCachePort,
KtxDescriptionGenerator,
} from '../../../src/context/scan/description-generation.js';
import { createKtxConnectorCapabilities, type KtxScanConnector } from '../../../src/context/scan/types.js';
/**
 * Builds an in-memory KtxDescriptionCachePort for tests.
 *
 * Keys are the non-empty parts joined with dots (catalog.db.name[.column]);
 * connection keys are prefixed with `__connection:`. `get` and `set` are vi.fn
 * spies backed by a Map seeded from `initial`.
 */
function createCache(initial: Record<string, string> = {}): KtxDescriptionCachePort {
  const store = new Map(Object.entries(initial));
  // Drops falsy parts (e.g. a null catalog) before joining.
  const dotted = (...parts: unknown[]): string => parts.filter(Boolean).join('.');

  const get = vi.fn(async (key: string) => {
    const hit = store.get(key);
    return hit === undefined ? null : hit;
  });
  const set = vi.fn(async (key: string, value: string) => {
    store.set(key, value);
  });

  return {
    buildTableKey: (table) => dotted(table.catalog, table.db, table.name),
    buildColumnKey: (table, columnName) => dotted(table.catalog, table.db, table.name, columnName),
    buildConnectionKey: (connectionName) => `__connection:${connectionName}`,
    get,
    set,
  };
}
/**
 * Builds the LLM runtime stub shared by createLlmProvider and createFailingLlmProvider.
 *
 * `generateText` forwards to the mocked `generateText` from `ai`, sending the system
 * prompt through the `system` field and the prompt as a single user message, and
 * resolves with the returned text. `generateObject` and `runAgentLoop` are bare spies.
 * The outcome (resolve or reject) is decided by how the caller configured the `ai` mock.
 */
function createLlmRuntimeStub() {
  return {
    generateText: vi.fn(async (input) => {
      const result = await generateText({
        system: input.system ? { role: 'system', content: input.system } : undefined,
        messages: [{ role: 'user', content: input.prompt }],
        temperature: input.temperature,
      } as never);
      return result.text;
    }),
    generateObject: vi.fn(),
    runAgentLoop: vi.fn(),
  } as any;
}

/**
 * Returns an LLM runtime stub whose text generation resolves with `text`.
 *
 * Side effect: reconfigures the shared `ai` `generateText` mock to resolve `{ text }`.
 */
function createLlmProvider(text = 'generated description') {
  vi.mocked(generateText).mockResolvedValue({ text } as never);
  return createLlmRuntimeStub();
}

/**
 * Returns an LLM runtime stub whose text generation rejects with `new Error(message)`.
 *
 * Side effect: reconfigures the shared `ai` `generateText` mock to reject.
 */
function createFailingLlmProvider(message = 'timeout exceeded when trying to connect') {
  vi.mocked(generateText).mockRejectedValue(new Error(message) as never);
  return createLlmRuntimeStub();
}
/**
 * Builds a stub postgres KtxScanConnector for description-generation tests.
 *
 * Sampling spies return small fixed payloads (a fresh object per call),
 * schema/table listings are empty, and `introspect` rejects because these
 * tests are not expected to reach it.
 */
function createConnector(): KtxScanConnector {
  const capabilities = createKtxConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    nestedAnalysis: true,
  });

  const introspect = vi.fn(async () => {
    throw new Error('introspection is not used by description generation');
  });

  const sampleColumn = vi.fn(async () => {
    return {
      values: ['paid', 'refunded', null],
      nullCount: 1,
      distinctCount: 2,
    };
  });

  const sampleTable = vi.fn(async () => {
    return {
      headers: ['id', 'status', 'amount'],
      rows: [
        [1, 'paid', 20],
        [2, 'refunded', 10],
      ],
      totalRows: 2,
    };
  });

  return {
    id: 'test-connector',
    driver: 'postgres',
    capabilities,
    introspect,
    listSchemas: vi.fn(async () => []),
    listTables: vi.fn(async () => []),
    sampleColumn,
    sampleTable,
  };
}
// Checks the text produced by the column, table, and data-source prompt builders.
describe('KTX description prompt builders', () => {
  it('builds column prompts with sample values, source descriptions, and nested BigQuery guidance', () => {
    const prompt = buildKtxColumnDescriptionPrompt({
      columnName: 'payload',
      columnValues: [{ nested: true }, '[1,2]'],
      tableContext: 'Table: events | Columns: payload | Data source: BIGQUERY',
      dataSourceType: 'BIGQUERY',
      supportsNestedAnalysis: true,
      rawDescriptions: { db: 'Raw event payload', ai: 'Old AI text', user: 'User text' },
      maxWords: 12,
    });

    const expectedInUser = [
      '<table_context> Table: events | Columns: payload | Data source: BIGQUERY </table_context>',
      '<column_name> payload </column_name>',
      '<sample_values> [object Object], [1,2] </sample_values>',
      '<db_documentation> Raw event payload </db_documentation>',
    ];
    for (const fragment of expectedInUser) {
      expect(prompt.user).toContain(fragment);
    }

    // Of the raw descriptions, only the `db` one is expected in the user prompt.
    for (const excluded of ['Old AI text', 'User text']) {
      expect(prompt.user).not.toContain(excluded);
    }

    for (const fragment of ['nested/structured data', '12 words or less']) {
      expect(prompt.system).toContain(fragment);
    }

    // The word limit belongs to the system prompt only.
    expect(prompt.user).not.toContain('12 words or less');
  });

  it('builds table and data-source prompts from sampled rows', () => {
    const sampledRows = {
      headers: ['id', 'status'],
      rows: [
        [1, 'paid'],
        [2, 'refunded'],
      ],
      totalRows: 2,
    };

    const tablePrompt = buildKtxTableDescriptionPrompt({
      tableName: 'orders',
      sampleData: sampledRows,
      dataSourceType: 'POSTGRESQL',
      rawDescriptions: { dbt: 'Fact table for commerce orders' },
    });
    expect(tablePrompt.user).toContain('status: paid, refunded');
    expect(tablePrompt.system).toContain('Analyze database tables');

    const dataSourcePrompt = buildKtxDataSourceDescriptionPrompt({
      tableSamples: [['orders', sampledRows]],
      dataSourceType: 'POSTGRESQL',
    });
    expect(dataSourcePrompt.user).toContain('orders (2 columns, 2 sample rows)');
    expect(dataSourcePrompt.system).toContain('Analyze databases');
  });
});
// Exercises KtxDescriptionGenerator end to end against the stubbed LLM runtime, cache and
// connector built by createLlmProvider / createFailingLlmProvider / createCache /
// createConnector. The `generateText` mock imported from `ai` is module-level and no reset
// is visible in this suite, so assertions that read its call history depend on test order.
describe('KtxDescriptionGenerator', () => {
it('generates column descriptions with pre-fetched values, cache hits, and word-limit metadata', async () => {
// Seed the cache for `cached_status` only; `status` has to go through the LLM.
const cache = createCache({ 'warehouse.public.orders.cached_status': 'Cached status description' });
const llmRuntime = createLlmProvider('Payment state');
const connector = createConnector();
const generator = new KtxDescriptionGenerator({
llmRuntime,
cache,
settings: {
columnMaxWords: 12,
tableMaxWords: 18,
dataSourceMaxWords: 24,
temperature: 0.2,
concurrencyLimit: 2,
},
});
const result = await generator.generateColumnDescriptions({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: 'warehouse',
db: 'public',
name: 'orders',
columns: [
{ name: 'status', sampleValues: ['paid', 'refunded'], rawDescriptions: { db: 'Payment lifecycle' } },
{ name: 'cached_status', sampleValues: ['open'] },
],
},
skipExisting: false,
existingDescriptions: {},
});
// The cache hit is reported as skipped; only the LLM-generated column counts as processed.
expect(result).toEqual({
columnDescriptions: [
['status', 'Payment state'],
['cached_status', 'Cached status description'],
],
processedColumns: ['status'],
skippedColumns: ['cached_status'],
});
// Both columns carried sampleValues, so the connector must not have been sampled.
expect(connector.sampleColumn).not.toHaveBeenCalled();
expect(generateText).toHaveBeenCalledWith(
expect.objectContaining({
temperature: 0.2,
system: expect.objectContaining({
role: 'system',
content: expect.stringContaining('Please provide a concise description in 12 words or less.'),
}),
messages: expect.arrayContaining([
expect.objectContaining({
role: 'user',
content: expect.stringContaining('<column_name> status </column_name>'),
}),
]),
}),
);
// The system prompt must travel in the `system` field, never as an entry in `messages`.
const lastCall = vi.mocked(generateText).mock.calls.at(-1)?.[0];
expect(lastCall?.messages?.some((message) => message.role === 'system')).toBe(false);
});
it('samples through the connector when column values are not pre-fetched', async () => {
const connector = createConnector();
const generator = new KtxDescriptionGenerator({
llmRuntime: createLlmProvider('Current order state'),
settings: {
columnMaxWords: 12,
tableMaxWords: 18,
dataSourceMaxWords: 24,
},
});
// The column has no sampleValues, so the generator is expected to call connector.sampleColumn.
const result = await generator.generateColumnDescriptions({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: null,
db: 'public',
name: 'orders',
columns: [{ name: 'status' }],
},
});
// Pins the sampling request shape, including the limit of 50 and the forwarded context.
expect(connector.sampleColumn).toHaveBeenCalledWith(
{
connectionId: 'conn-1',
table: { catalog: null, db: 'public', name: 'orders' },
column: 'status',
limit: 50,
},
{ runId: 'run-1' },
);
expect(result.columnDescriptions).toEqual([['status', 'Current order state']]);
});
it('samples through a description sampling port without requiring structural introspection', async () => {
// A minimal sampler exposing only id, sampleColumn and sampleTable — no introspect/list methods.
const sampler = {
id: 'description-sampler:conn-1',
sampleColumn: vi.fn(async () => ({
values: ['paid', 'refunded'],
nullCount: null,
distinctCount: null,
})),
sampleTable: vi.fn(async () => ({
headers: ['id', 'status'],
rows: [[1, 'paid']],
totalRows: 1,
})),
};
const generator = new KtxDescriptionGenerator({
llmRuntime: createLlmProvider('Generated through sampler'),
settings: {
columnMaxWords: 12,
tableMaxWords: 18,
dataSourceMaxWords: 24,
},
});
const result = await generator.generateColumnDescriptions({
connectionId: 'conn-1',
connector: sampler,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: null,
db: 'public',
name: 'orders',
columns: [{ name: 'status' }],
},
});
expect(result.columnDescriptions).toEqual([['status', 'Generated through sampler']]);
expect(sampler.sampleColumn).toHaveBeenCalledWith(
{
connectionId: 'conn-1',
table: { catalog: null, db: 'public', name: 'orders' },
column: 'status',
limit: 50,
},
{ runId: 'run-1' },
);
// Guards the fixture itself: the sampler really has no introspect method.
expect('introspect' in sampler).toBe(false);
});
it('does not turn LLM failures into generated descriptions', async () => {
const cache = createCache();
const connector = createConnector();
const generator = new KtxDescriptionGenerator({
llmRuntime: createFailingLlmProvider(),
cache,
settings: {
columnMaxWords: 12,
tableMaxWords: 18,
dataSourceMaxWords: 24,
},
});
const columnResult = await generator.generateColumnDescriptions({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: null,
db: 'public',
name: 'orders',
columns: [{ name: 'status' }],
},
});
// Table generation must resolve to null rather than reject when the LLM fails.
await expect(
generator.generateTableDescription({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
table: { catalog: null, db: 'public', name: 'orders' },
}),
).resolves.toBeNull();
// The failed column gets a null description and is counted as neither processed nor skipped.
expect(columnResult).toEqual({
columnDescriptions: [['status', null]],
processedColumns: [],
skippedColumns: [],
});
// Nothing may be written to the cache after a failure.
expect(cache.set).not.toHaveBeenCalled();
});
it('generates and caches table and data-source descriptions', async () => {
const cache = createCache();
const connector = createConnector();
const generator = new KtxDescriptionGenerator({
llmRuntime: createLlmProvider('Commerce orders'),
cache,
settings: {
columnMaxWords: 12,
tableMaxWords: 18,
dataSourceMaxWords: 24,
concurrencyLimit: 2,
},
});
await expect(
generator.generateTableDescription({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
table: { catalog: 'warehouse', db: 'public', name: 'orders', rawDescriptions: { db: 'Raw orders' } },
}),
).resolves.toBe('Commerce orders');
await expect(
generator.generateDataSourceDescription({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
tables: [
{ catalog: 'warehouse', db: 'public', name: 'orders' },
{ catalog: 'warehouse', db: 'public', name: 'customers' },
],
connectionName: 'Warehouse',
}),
).resolves.toBe('Commerce orders');
// Keys follow createCache's builders: dotted table path and `__connection:<name>`.
expect(cache.set).toHaveBeenCalledWith('warehouse.public.orders', 'Commerce orders');
expect(cache.set).toHaveBeenCalledWith('__connection:Warehouse', 'Commerce orders');
});
it('generates one structured table description and reuses table samples for all columns', async () => {
// The text reply is a placeholder; this path is expected to use generateObject only.
const llmRuntime = createLlmProvider('unused');
llmRuntime.generateObject = vi.fn(async () => ({
tableDescription: 'Commerce orders',
columns: [
{ name: 'status', description: 'Current order state' },
{ name: 'amount', description: 'Order amount in dollars' },
],
}));
const connector = createConnector();
const generator = new KtxDescriptionGenerator({
llmRuntime,
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
});
const result = await generator.generateBatchedTableDescriptions({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: null,
db: 'public',
name: 'orders',
rawDescriptions: { db: 'Orders fact table' },
columns: [
{ name: 'status', type: 'text' },
{ name: 'amount', type: 'numeric' },
],
},
});
expect(result.tableDescription).toBe('Commerce orders');
expect(Object.fromEntries(result.columnDescriptions)).toEqual({
status: 'Current order state',
amount: 'Order amount in dollars',
});
// One table sample and one structured LLM call cover the table and both columns.
expect(connector.sampleTable).toHaveBeenCalledTimes(1);
expect(connector.sampleColumn).not.toHaveBeenCalled();
expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1);
expect(llmRuntime.generateText).not.toHaveBeenCalled();
});
it('falls back to one column generateText call for each missing structured column', async () => {
const llmRuntime = createLlmProvider('Fallback status');
// The structured reply omits `status`, which must trigger the per-column fallback.
llmRuntime.generateObject = vi.fn(async () => ({
tableDescription: 'Commerce orders',
columns: [{ name: 'amount', description: 'Order amount in dollars' }],
}));
const connector = createConnector();
const generator = new KtxDescriptionGenerator({
llmRuntime,
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
});
const result = await generator.generateBatchedTableDescriptions({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: null,
db: 'public',
name: 'orders',
columns: [
{ name: 'status', type: 'text' },
{ name: 'amount', type: 'numeric' },
],
},
});
expect(Object.fromEntries(result.columnDescriptions)).toEqual({
status: 'Fallback status',
amount: 'Order amount in dollars',
});
// Exactly one text call (for the single missing column) and no per-column sampling.
expect(connector.sampleColumn).not.toHaveBeenCalled();
expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1);
expect(llmRuntime.generateText).toHaveBeenCalledTimes(1);
});
it('does not run per-column fallback when structured object generation throws', async () => {
const llmRuntime = createLlmProvider('Fallback description');
llmRuntime.generateObject = vi.fn(async () => {
throw new Error('object output unavailable');
});
// Collect only the warning codes emitted through onWarning.
const warnings: string[] = [];
const generator = new KtxDescriptionGenerator({
llmRuntime,
onWarning: (warning) => warnings.push(warning.code),
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
});
const result = await generator.generateBatchedTableDescriptions({
connectionId: 'conn-1',
connector: createConnector(),
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: null,
db: 'public',
name: 'orders',
columns: [{ name: 'status', type: 'text' }],
},
});
// A thrown structured call yields null descriptions plus a warning, with no text fallback.
expect(result.tableDescription).toBeNull();
expect(Object.fromEntries(result.columnDescriptions)).toEqual({ status: null });
expect(warnings).toContain('enrichment_failed');
expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1);
expect(llmRuntime.generateText).not.toHaveBeenCalled();
});
});
describe('KtxDescriptionGenerator resilience', () => {
function createLogger() {
return {
debug: vi.fn(),
info: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
};
}
it('retries sampleTable on transient failure and uses sampled rows when it eventually succeeds', async () => {
const sampleTable = vi
.fn<NonNullable<KtxScanConnector['sampleTable']>>()
.mockRejectedValueOnce(new Error('pool: transient ECONNRESET'))
.mockRejectedValueOnce(new Error('pool: transient ECONNRESET'))
.mockResolvedValue({
headers: ['id', 'status'],
rows: [
[1, 'paid'],
[2, 'refunded'],
],
totalRows: 2,
});
const connector: KtxScanConnector = {
...createConnector(),
sampleTable,
};
const logger = createLogger();
const warnings: Array<{ code: string; table?: string }> = [];
const generator = new KtxDescriptionGenerator({
llmRuntime: createLlmProvider('Commerce orders'),
logger,
onWarning: (warning) => warnings.push({ code: warning.code, ...(warning.table ? { table: warning.table } : {}) }),
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24, concurrencyLimit: 2 },
});
const description = await generator.generateTableDescription({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
table: { catalog: null, db: 'public', name: 'orders' },
});
expect(description).toBe('Commerce orders');
expect(sampleTable).toHaveBeenCalledTimes(3);
expect(logger.warn).toHaveBeenCalledTimes(2);
expect(warnings).toEqual([]);
});
// Every sampling attempt is rejected. The test expects the description to
// still be produced, the warnings `sampling_failed` then
// `description_fallback_used`, and a user prompt built from column metadata.
it('falls back to metadata-only prompt when sampleTable retries exhaust', async () => {
  const sampleTable = vi.fn<NonNullable<KtxScanConnector['sampleTable']>>();
  sampleTable.mockRejectedValue(new Error('pool: connection refused'));
  const connector: KtxScanConnector = { ...createConnector(), sampleTable };
  const logger = createLogger();
  const warnings: Array<{ code: string; table?: string; metadata?: Record<string, unknown> }> = [];
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('Customer reference data'),
    logger,
    // Keep the optional fields only when the warning actually carries them.
    onWarning: ({ code, table, metadata }) =>
      warnings.push({
        code,
        ...(table ? { table } : {}),
        ...(metadata ? { metadata } : {}),
      }),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24, concurrencyLimit: 2 },
  });

  const description = await generator.generateTableDescription({
    connectionId: 'conn-1',
    connector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    table: {
      catalog: null,
      db: 'public',
      name: 'customers',
      columns: [
        { name: 'id', nativeType: 'uuid' },
        { name: 'email', nativeType: 'text', comment: 'Primary contact email' },
      ],
    },
  });

  expect(description).toBe('Customer reference data');
  expect(sampleTable).toHaveBeenCalledTimes(3);
  expect(warnings.map(({ code }) => code)).toEqual(['sampling_failed', 'description_fallback_used']);
  expect(warnings[1]?.metadata?.reason).toBe('sampling_failed');

  // Inspect the user message of the most recent generateText call.
  const lastCallArgs = vi.mocked(generateText).mock.calls.at(-1)?.[0] as {
    messages: Array<{ role: string; content: string }>;
  };
  const userPrompt = lastCallArgs.messages.find((message) => message.role === 'user')?.content;
  expect(userPrompt).toContain('Columns (metadata only, no sample rows)');
  expect(userPrompt).toContain('email (text)');
  expect(userPrompt).toContain('Primary contact email');
});
// Sampling always rejects and the LLM provider is the failing variant. The
// test expects a null description and the warnings `sampling_failed` then
// `enrichment_failed`.
it('emits enrichment_failed and returns null when both sampling and metadata-only LLM fail', async () => {
  const sampleTable = vi.fn<NonNullable<KtxScanConnector['sampleTable']>>();
  sampleTable.mockRejectedValue(new Error('pool: connection refused'));
  const connector: KtxScanConnector = { ...createConnector(), sampleTable };
  const warnings: string[] = [];
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createFailingLlmProvider(),
    onWarning: ({ code }) => warnings.push(code),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const description = await generator.generateTableDescription({
    connectionId: 'conn-1',
    connector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    table: { catalog: null, db: 'public', name: 'orphan', columns: [{ name: 'id' }] },
  });

  expect(description).toBeNull();
  expect(warnings).toEqual(['sampling_failed', 'enrichment_failed']);
});
// The connector has `sampleTable` explicitly set to undefined. The test
// expects the description to be generated anyway, with the warnings
// `connector_capability_missing` then `description_fallback_used`.
it('uses metadata-only fallback when connector has no sampleTable', async () => {
  const connectorWithoutSampling: KtxScanConnector = {
    ...createConnector(),
    sampleTable: undefined,
  };
  const warnings: string[] = [];
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('Orders mart'),
    onWarning: ({ code }) => warnings.push(code),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const description = await generator.generateTableDescription({
    connectionId: 'conn-1',
    connector: connectorWithoutSampling,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    table: {
      catalog: null,
      db: 'public',
      name: 'mart_orders',
      columns: [{ name: 'order_id', nativeType: 'uuid' }],
    },
  });

  expect(description).toBe('Orders mart');
  expect(warnings).toEqual(['connector_capability_missing', 'description_fallback_used']);
});
// The first sampling attempt aborts the scan signal before throwing. The test
// expects the call to reject with an 'aborted' error after exactly one
// sampling attempt and without emitting warnings.
it('aborts retry loop when the scan context signal fires', async () => {
  const abortController = new AbortController();
  const sampleTable = vi.fn<NonNullable<KtxScanConnector['sampleTable']>>();
  sampleTable.mockImplementation(async () => {
    abortController.abort();
    throw new Error('first attempt blew up');
  });
  const connector: KtxScanConnector = { ...createConnector(), sampleTable };
  const warnings: string[] = [];
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('should not be called'),
    onWarning: ({ code }) => warnings.push(code),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const pendingDescription = generator.generateTableDescription({
    connectionId: 'conn-1',
    connector,
    context: { runId: 'run-1', signal: abortController.signal },
    dataSourceType: 'POSTGRESQL',
    table: { catalog: null, db: 'public', name: 'orders' },
  });

  await expect(pendingDescription).rejects.toThrow('aborted');
  expect(sampleTable).toHaveBeenCalledTimes(1);
  expect(warnings).toEqual([]);
});
// The connector has no `sampleColumn`. The test expects the column description
// to be generated from the raw DB documentation, a logger warning, and a
// prompt that marks the sample values as unavailable.
it('generates column descriptions from rawDescriptions when sampleColumn is unavailable', async () => {
  const connectorWithoutColumnSampling: KtxScanConnector = {
    ...createConnector(),
    sampleColumn: undefined,
  };
  const logger = createLogger();
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('Payment lifecycle state'),
    logger,
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const result = await generator.generateColumnDescriptions({
    connectionId: 'conn-1',
    connector: connectorWithoutColumnSampling,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      columns: [{ name: 'status', rawDescriptions: { db: 'order lifecycle state' } }],
    },
  });

  expect(result.columnDescriptions).toEqual([['status', 'Payment lifecycle state']]);
  expect(logger.warn).toHaveBeenCalled();

  // Inspect the user message of the most recent generateText call.
  const lastCallArgs = vi.mocked(generateText).mock.calls.at(-1)?.[0] as {
    messages: Array<{ role: string; content: string }>;
  };
  const userPrompt = lastCallArgs.messages.find((message) => message.role === 'user')?.content;
  expect(userPrompt).toContain('<sample_values> unavailable </sample_values>');
  expect(userPrompt).toContain('<db_documentation> order lifecycle state </db_documentation>');
});
// `sampleColumn` always rejects. The test expects three sampling attempts and
// a column description still generated from the raw DB documentation.
it('generates column descriptions from rawDescriptions when sampleColumn retries exhaust', async () => {
  const sampleColumn = vi.fn<NonNullable<KtxScanConnector['sampleColumn']>>();
  sampleColumn.mockRejectedValue(new Error('pool: connection refused'));
  const flakyConnector: KtxScanConnector = { ...createConnector(), sampleColumn };
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('Customer reference identifier'),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const { columnDescriptions } = await generator.generateColumnDescriptions({
    connectionId: 'conn-1',
    connector: flakyConnector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      columns: [{ name: 'customer_id', rawDescriptions: { db: 'FK to customers.id' } }],
    },
  });

  expect(sampleColumn).toHaveBeenCalledTimes(3);
  expect(columnDescriptions).toEqual([['customer_id', 'Customer reference identifier']]);
});
// Sampling succeeds but yields only nulls, and the column has no raw
// descriptions. The test expects a null description and no generateText call.
it('skips column LLM call only when neither samples nor rawDescriptions are available', async () => {
  const sampleColumn = vi.fn<NonNullable<KtxScanConnector['sampleColumn']>>();
  sampleColumn.mockResolvedValue({ values: [null, null], nullCount: 2, distinctCount: 0 });
  const connector: KtxScanConnector = { ...createConnector(), sampleColumn };
  // Reset recorded calls so the final "not called" assertion covers this test only.
  vi.mocked(generateText).mockClear();
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('should not be called'),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const { columnDescriptions } = await generator.generateColumnDescriptions({
    connectionId: 'conn-1',
    connector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      columns: [{ name: 'opaque_blob' }],
    },
  });

  expect(columnDescriptions).toEqual([['opaque_blob', null]]);
  expect(generateText).not.toHaveBeenCalled();
});
});

View file

@ -0,0 +1,47 @@
import { describe, expect, it } from 'vitest';
import { buildKtxColumnEmbeddingText } from '../../../src/context/scan/embedding-text.js';
// Pins the exact text produced by buildKtxColumnEmbeddingText for a fully
// populated input, a minimal input, and an input without a sample-value cap.
describe('KTX scan embedding text', () => {
  it('builds column embedding text with table, description, FK, and sample-value context', () => {
    const text = buildKtxColumnEmbeddingText({
      tableName: 'orders',
      columnName: 'status',
      columnType: 'varchar',
      resolvedDescription: 'Payment lifecycle state',
      sampleValues: ['paid', 'refunded', 'pending'],
      resolvedTableDescription: 'Customer orders',
      foreignKeys: {
        outgoing: [{ toTable: 'customers', toColumn: 'id' }],
        incoming: [{ fromTable: 'refunds', fromColumn: 'order_status' }],
      },
      // Only the first two of the three sample values are expected in the output.
      maxSampleValues: 2,
    });

    expect(text).toBe(
      'orders.status (varchar). Table: Customer orders. Payment lifecycle state. FK -> customers.id. FK <- refunds.order_status. Values: paid, refunded',
    );
  });

  it('omits optional sections when the scan has no enrichment context yet', () => {
    const text = buildKtxColumnEmbeddingText({
      tableName: 'orders',
      columnName: 'id',
      columnType: 'integer',
      resolvedDescription: null,
    });

    expect(text).toBe('orders.id (integer)');
  });

  it('keeps all available sample values when no explicit max is supplied', () => {
    const text = buildKtxColumnEmbeddingText({
      tableName: 'orders',
      columnName: 'status',
      columnType: 'varchar',
      resolvedDescription: null,
      sampleValues: ['paid', 'refunded'],
    });

    expect(text).toBe('orders.status (varchar). Values: paid, refunded');
  });
});

View file

@ -0,0 +1,175 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import {
completedKtxScanEnrichmentStateSummary,
computeKtxScanEnrichmentInputHash,
summarizeKtxScanEnrichmentState,
} from '../../../src/context/scan/enrichment-state.js';
import { SqliteLocalScanEnrichmentStateStore } from '../../../src/context/scan/sqlite-local-enrichment-state-store.js';
import type { KtxSchemaSnapshot } from '../../../src/context/scan/types.js';
// Minimal schema snapshot fixture shared by the tests below: a single
// `public.orders` table whose only column is a non-nullable integer primary
// key `id`, with no foreign keys and empty metadata.
const snapshot: KtxSchemaSnapshot = {
connectionId: 'warehouse',
driver: 'postgres',
extractedAt: '2026-04-29T12:00:00.000Z',
scope: { schemas: ['public'] },
metadata: {},
tables: [
{
catalog: null,
db: 'public',
name: 'orders',
kind: 'table',
comment: null,
estimatedRows: 1,
foreignKeys: [],
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: null,
},
],
},
],
};
describe('scan enrichment state', () => {
// Per-test scratch directory and a SQLite-backed state store rooted inside it.
let tempDir: string;
let store: SqliteLocalScanEnrichmentStateStore;

beforeEach(async () => {
  const prefix = join(tmpdir(), 'ktx-scan-enrichment-state-');
  tempDir = await mkdtemp(prefix);
  const dbPath = join(tempDir, 'db.sqlite');
  store = new SqliteLocalScanEnrichmentStateStore({ dbPath });
});

// Remove the scratch directory (and the database file in it) after each test.
afterEach(async () => {
  await rm(tempDir, { force: true, recursive: true });
});
// The hash must be a 64-character lowercase hex string, must not change when
// providerIdentity keys are supplied in a different order, and must change
// when a table is renamed.
it('computes stable input hashes without depending on object key order', () => {
  const [firstTable] = snapshot.tables;
  if (!firstTable) {
    throw new Error('Expected test snapshot table');
  }

  const baselineHash = computeKtxScanEnrichmentInputHash({
    snapshot,
    mode: 'enriched',
    detectRelationships: true,
    providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
  });
  // Same content as the baseline, with providerIdentity keys swapped.
  const reorderedHash = computeKtxScanEnrichmentInputHash({
    snapshot: { ...snapshot, metadata: {} },
    mode: 'enriched',
    detectRelationships: true,
    providerIdentity: { llmModel: 'a', provider: 'local-heuristic' },
  });
  const renamedTableHash = computeKtxScanEnrichmentInputHash({
    snapshot: { ...snapshot, tables: [{ ...firstTable, name: 'orders_v2' }] },
    mode: 'enriched',
    detectRelationships: true,
    providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
  });

  expect(baselineHash).toMatch(/^[a-f0-9]{64}$/);
  expect(reorderedHash).toBe(baselineHash);
  expect(renamedTableHash).not.toBe(baselineHash);
});
// A completed stage is saved, then looked up twice: the matching input hash
// must return the stored record and a different hash must return null.
it('persists completed stages and ignores stale hashes', async () => {
  const inputHash = computeKtxScanEnrichmentInputHash({
    snapshot,
    mode: 'enriched',
    detectRelationships: true,
    providerIdentity: { provider: 'local-heuristic' },
  });

  await store.saveCompletedStage({
    runId: 'scan-run-1',
    connectionId: 'warehouse',
    syncId: 'sync-1',
    mode: 'enriched',
    stage: 'descriptions',
    inputHash,
    output: [{ table: { catalog: null, db: 'public', name: 'orders' }, tableDescription: 'Orders' }],
    updatedAt: '2026-04-29T12:01:00.000Z',
  });

  const matchingLookup = store.findCompletedStage({
    runId: 'scan-run-1',
    stage: 'descriptions',
    inputHash,
  });
  await expect(matchingLookup).resolves.toMatchObject({
    runId: 'scan-run-1',
    stage: 'descriptions',
    status: 'completed',
    output: [{ table: { catalog: null, db: 'public', name: 'orders' }, tableDescription: 'Orders' }],
  });

  const staleLookup = store.findCompletedStage({
    runId: 'scan-run-1',
    stage: 'descriptions',
    inputHash: 'different-hash',
  });
  await expect(staleLookup).resolves.toBeNull();
});
// A failed stage must not be returned by findCompletedStage, but must appear
// in the run's stage listing with its status and error message.
it('records failed stages without making them reusable', async () => {
  await store.saveFailedStage({
    runId: 'scan-run-2',
    connectionId: 'warehouse',
    syncId: 'sync-2',
    mode: 'enriched',
    stage: 'embeddings',
    inputHash: 'hash-2',
    errorMessage: 'embedding service timed out',
    updatedAt: '2026-04-29T12:02:00.000Z',
  });

  const completedLookup = store.findCompletedStage({
    runId: 'scan-run-2',
    stage: 'embeddings',
    inputHash: 'hash-2',
  });
  await expect(completedLookup).resolves.toBeNull();

  const expectedFailedStage = expect.objectContaining({
    runId: 'scan-run-2',
    stage: 'embeddings',
    status: 'failed',
    errorMessage: 'embedding service timed out',
  });
  await expect(store.listRunStages('scan-run-2')).resolves.toEqual([expectedFailedStage]);
});
// The summary helper must return the three stage lists as given, and the
// "completed" helper must return three empty lists.
it('summarizes resumed, completed, and failed stages for reports', () => {
  const summary = summarizeKtxScanEnrichmentState({
    resumedStages: ['descriptions'],
    completedStages: ['descriptions', 'embeddings'],
    failedStages: ['relationships'],
  });
  expect(summary).toEqual({
    resumedStages: ['descriptions'],
    completedStages: ['descriptions', 'embeddings'],
    failedStages: ['relationships'],
  });

  const emptySummary = completedKtxScanEnrichmentStateSummary();
  expect(emptySummary).toEqual({
    resumedStages: [],
    completedStages: [],
    failedStages: [],
  });
});
});

View file

@ -0,0 +1,42 @@
import { describe, expect, it } from 'vitest';
import {
failedKtxScanEnrichmentSummary,
ktxScanErrorMessage,
skippedKtxScanEnrichmentSummary,
} from '../../../src/context/scan/enrichment-summary.js';
// Pins the per-stage statuses returned by failedKtxScanEnrichmentSummary for
// each scan mode, and the string form ktxScanErrorMessage gives thrown values.
describe('KTX scan enrichment summaries', () => {
  it('keeps structural scans skipped when no enrichment was requested', () => {
    const summary = failedKtxScanEnrichmentSummary('structural', false);

    expect(summary).toEqual(skippedKtxScanEnrichmentSummary);
  });

  it('marks relationship stages failed when relationship detection fails', () => {
    const summary = failedKtxScanEnrichmentSummary('relationships', true);

    expect(summary).toEqual({
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'failed',
    });
  });

  it('marks every enriched-only stage failed when full enrichment fails', () => {
    const summary = failedKtxScanEnrichmentSummary('enriched', true);

    expect(summary).toEqual({
      dataDictionary: 'failed',
      tableDescriptions: 'failed',
      columnDescriptions: 'failed',
      embeddings: 'failed',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'failed',
      statisticalValidation: 'failed',
    });
  });

  it('formats unknown thrown values for scan warnings', () => {
    const fromError = ktxScanErrorMessage(new Error('gateway timeout'));
    expect(fromError).toBe('gateway timeout');

    const fromString = ktxScanErrorMessage('plain failure');
    expect(fromString).toBe('plain failure');

    const fromObject = ktxScanErrorMessage({ code: 'E_SCAN' });
    expect(fromObject).toBe('{"code":"E_SCAN"}');
  });
});

View file

@ -0,0 +1,159 @@
import { describe, expect, it } from 'vitest';
import type {
KtxColumnSampleUpdate,
KtxDescriptionUpdate,
KtxEmbeddingUpdate,
KtxEnrichedSchema,
KtxJoinUpdate,
KtxRelationshipEndpoint,
KtxRelationshipUpdate,
KtxScanMetadataStore,
KtxStructuralSyncPlan,
} from '../../../src/context/scan/enrichment-types.js';
describe('KTX scan enrichment contracts', () => {
// Type-level contract test: the literal below must be assignable to
// KtxEnrichedSchema. The runtime assertions only read two values back.
it('models an enriched schema with reusable table, column, and relationship metadata', () => {
  const schema: KtxEnrichedSchema = {
    connectionId: 'warehouse',
    tables: [
      {
        id: 'table-orders',
        ref: { catalog: 'analytics', db: 'public', name: 'orders' },
        enabled: true,
        descriptions: { db: 'Raw orders', ai: 'Customer orders' },
        columns: [
          {
            id: 'column-orders-status',
            tableId: 'table-orders',
            tableRef: { catalog: 'analytics', db: 'public', name: 'orders' },
            name: 'status',
            nativeType: 'varchar',
            normalizedType: 'string',
            dimensionType: 'string',
            nullable: false,
            primaryKey: false,
            parentColumnId: null,
            descriptions: { db: 'Status code' },
            embedding: [0.1, 0.2],
            sampleValues: ['paid', 'refunded'],
            cardinality: 2,
          },
        ],
      },
    ],
    relationships: [
      {
        id: 'rel-orders-customers',
        source: 'formal',
        from: {
          tableId: 'table-orders',
          columnIds: ['column-orders-customer-id'],
          table: { catalog: 'analytics', db: 'public', name: 'orders' },
          columns: ['customer_id'],
        },
        to: {
          tableId: 'table-customers',
          columnIds: ['column-customers-id'],
          table: { catalog: 'analytics', db: 'public', name: 'customers' },
          columns: ['id'],
        },
        relationshipType: 'many_to_one',
        confidence: 1,
        isPrimaryKeyReference: true,
      },
    ],
  };

  // Optional chaining keeps the indexed reads safe: a missing element surfaces
  // as a failed assertion rather than a TypeError, and the reads stay valid if
  // indexed access is typed as possibly undefined.
  const firstColumn = schema.tables[0]?.columns[0];
  expect(firstColumn?.sampleValues).toEqual(['paid', 'refunded']);
  expect(schema.relationships[0]?.source).toBe('formal');
});
// Type-level contract test: builds one value of each update type plus an
// inline KtxScanMetadataStore, then calls every store method once. Each stub
// asserts that it received exactly the value passed in.
it('models metadata-store updates without requiring a concrete store implementation', async () => {
  const structuralPlan: KtxStructuralSyncPlan = {
    connectionId: 'warehouse',
    snapshotId: 'snapshot-1',
    operations: [{ kind: 'create_table', table: 'orders' }],
  };
  const descriptionUpdate: KtxDescriptionUpdate = {
    connectionId: 'warehouse',
    table: { catalog: 'analytics', db: 'public', name: 'orders' },
    source: 'ai',
    tableDescription: 'Customer orders',
    columnDescriptions: { status: 'Payment lifecycle state' },
  };
  const sampleUpdate: KtxColumnSampleUpdate = {
    columnId: 'column-orders-status',
    sampleValues: ['paid', 'refunded'],
    cardinality: 2,
  };
  const embeddingUpdate: KtxEmbeddingUpdate = {
    columnId: 'column-orders-status',
    text: 'orders.status (varchar). Values: paid, refunded',
    embedding: [0.25, 0.75],
  };
  const relationshipUpdate: KtxRelationshipUpdate = {
    connectionId: 'warehouse',
    accepted: [],
    rejected: [],
    skipped: [{ reason: 'missing parent table', relationshipId: 'candidate-1' }],
  };

  const store: KtxScanMetadataStore = {
    loadSchema: async () => null,
    // Echo the plan's connection id back in an otherwise empty schema.
    applyStructuralPlan: async ({ connectionId }) => ({
      connectionId,
      tables: [],
      relationships: [],
    }),
    updateDescriptions: async (received) => {
      expect(received).toEqual(descriptionUpdate);
    },
    updateColumnSamples: async (received) => {
      expect(received).toEqual([sampleUpdate]);
    },
    updateColumnEmbeddings: async (received) => {
      expect(received).toEqual([embeddingUpdate]);
    },
    updateInferredRelationships: async (received) => {
      expect(received).toEqual(relationshipUpdate);
    },
  };

  const loadedSchema = store.loadSchema('warehouse');
  await expect(loadedSchema).resolves.toBeNull();

  const appliedSchema = store.applyStructuralPlan(structuralPlan);
  await expect(appliedSchema).resolves.toEqual({
    connectionId: 'warehouse',
    tables: [],
    relationships: [],
  });

  await expect(store.updateDescriptions(descriptionUpdate)).resolves.toBeUndefined();
  await expect(store.updateColumnSamples([sampleUpdate])).resolves.toBeUndefined();
  await expect(store.updateColumnEmbeddings([embeddingUpdate])).resolves.toBeUndefined();
  await expect(store.updateInferredRelationships(relationshipUpdate)).resolves.toBeUndefined();
});
});
// Type-level contract test for composite (multi-column) relationships: both
// the endpoint and the join update carry their columns as ordered arrays.
describe('relationship tuple contracts', () => {
  it('represents relationship endpoints and join updates as ordered column tuples', () => {
    const endpoint: KtxRelationshipEndpoint = {
      tableId: 'public.order_lines',
      columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
      table: { catalog: null, db: 'public', name: 'order_lines' },
      columns: ['order_id', 'line_number'],
    };
    const update: KtxJoinUpdate = {
      connectionId: 'warehouse',
      fromTable: 'order_line_allocations',
      fromColumns: ['order_id', 'line_number'],
      toTable: 'order_lines',
      toColumns: ['order_id', 'line_number'],
      relationship: 'many_to_one',
      author: 'ktx',
      authorEmail: 'ktx@example.com',
    };

    const { columns, columnIds } = endpoint;
    expect(columns).toEqual(['order_id', 'line_number']);
    expect(columnIds).toEqual(['public.order_lines.order_id', 'public.order_lines.line_number']);

    const { fromColumns, toColumns } = update;
    expect(fromColumns).toEqual(['order_id', 'line_number']);
    expect(toColumns).toEqual(['order_id', 'line_number']);
  });
});

View file

@ -0,0 +1,307 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js';
import { createKtxEntityDetailsService } from '../../../src/context/scan/entity-details.js';
import type { KtxConnectionDriver, KtxScanReport, KtxSchemaTable } from '../../../src/context/scan/types.js';
describe('createKtxEntityDetailsService', () => {
// Per-test scratch directory holding a freshly initialised KTX project.
let tempDir: string;
let project: KtxLocalProject;

beforeEach(async () => {
  const prefix = join(tmpdir(), 'ktx-entity-details-service-');
  tempDir = await mkdtemp(prefix);
  const projectDir = join(tempDir, 'project');
  project = await initKtxProject({ projectDir });
});

// Remove the scratch directory, and the project inside it, after each test.
afterEach(async () => {
  await rm(tempDir, { force: true, recursive: true });
});
/**
 * Builds a structural, non-dry-run scan report fixture for the given ids.
 *
 * Artifact paths are derived from `connectionId` and `syncId`; every
 * enrichment stage is reported as skipped.
 *
 * @param input.driver Defaults to 'postgres' when omitted.
 * @param input.createdAt Defaults to '2026-05-14T09:00:00.000Z' when omitted.
 * @returns A complete KtxScanReport literal.
 */
function scanReport(input: {
  connectionId: string;
  syncId: string;
  runId: string;
  driver?: KtxConnectionDriver;
  createdAt?: string;
}): KtxScanReport {
  const { connectionId, syncId, runId } = input;
  const driver = input.driver ?? 'postgres';
  const createdAt = input.createdAt ?? '2026-05-14T09:00:00.000Z';
  const rawSourcesDir = `raw-sources/${connectionId}/live-database/${syncId}`;
  const reportPath = `${rawSourcesDir}/scan-report.json`;

  return {
    connectionId,
    driver,
    syncId,
    runId,
    trigger: 'mcp',
    mode: 'structural',
    dryRun: false,
    artifactPaths: {
      rawSourcesDir,
      reportPath,
      manifestShards: [],
      enrichmentArtifacts: [],
    },
    diffSummary: {
      tablesAdded: 0,
      tablesModified: 0,
      tablesDeleted: 0,
      tablesUnchanged: 1,
      columnsAdded: 0,
      columnsModified: 0,
      columnsDeleted: 0,
    },
    manifestShardsWritten: 0,
    structuralSyncStats: {
      tablesCreated: 1,
      tablesUpdated: 0,
      tablesDeleted: 0,
      columnsCreated: 0,
      columnsUpdated: 0,
      columnsDeleted: 0,
    },
    enrichment: {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'skipped',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'skipped',
    },
    capabilityGaps: [],
    warnings: [],
    relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
    enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] },
    createdAt,
  };
}
/**
 * Builds an `orders` table fixture with an integer primary key `id`, a text
 * `status` column, and a foreign key from `customer_id` to
 * `public.customers.id`.
 *
 * @param input.db Schema name. Defaults to 'public' only when omitted; an
 *   explicit `null` is passed through so a schema-less table can be built.
 * @param input.estimatedRows Row estimate. Defaults to 12 only when omitted;
 *   an explicit `null` is passed through.
 * @returns A KtxSchemaTable literal.
 */
function ordersTable(input: { db?: string | null; estimatedRows?: number | null } = {}): KtxSchemaTable {
  // Default on `undefined` only. `??` would also replace an explicit `null`,
  // which the parameter type allows as a real value.
  const db = input.db === undefined ? 'public' : input.db;
  const estimatedRows = input.estimatedRows === undefined ? 12 : input.estimatedRows;

  return {
    catalog: null,
    db,
    name: 'orders',
    kind: 'table',
    comment: 'Customer orders',
    estimatedRows,
    columns: [
      {
        name: 'id',
        nativeType: 'integer',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey: true,
        comment: 'Order id',
      },
      {
        name: 'status',
        nativeType: 'text',
        normalizedType: 'text',
        dimensionType: 'string',
        nullable: false,
        primaryKey: false,
        comment: 'Order status',
      },
    ],
    foreignKeys: [
      {
        fromColumn: 'customer_id',
        toCatalog: null,
        toDb: 'public',
        toTable: 'customers',
        toColumn: 'id',
        constraintName: 'orders_customer_id_fkey',
      },
    ],
  };
}
/**
 * Writes one scan's artifacts into the project file store: `connection.json`,
 * one JSON file per table under `tables/`, and finally `scan-report.json`.
 *
 * Files are written one at a time, in that order.
 *
 * @param input.connectionId Defaults to 'warehouse' when omitted.
 * @param input.extractedAt Used for both the report's `createdAt` and the
 *   connection file's `extractedAt`.
 * @param input.tables Defaults to a single `ordersTable()` when omitted.
 */
async function seedScan(input: {
  connectionId?: string;
  syncId: string;
  runId: string;
  driver?: KtxConnectionDriver;
  extractedAt?: string;
  tables?: KtxSchemaTable[];
}): Promise<void> {
  // Serialises `value` as pretty-printed JSON and writes it as the seed author.
  const writeJson = (path: string, value: unknown, message: string) =>
    project.fileStore.writeFile(path, JSON.stringify(value, null, 2), 'ktx', 'ktx@example.com', message);

  const connectionId = input.connectionId ?? 'warehouse';
  const report = scanReport({
    connectionId,
    syncId: input.syncId,
    runId: input.runId,
    driver: input.driver,
    createdAt: input.extractedAt,
  });
  const root = report.artifactPaths.rawSourcesDir;

  const connectionDocument = {
    connectionId,
    driver: report.driver,
    extractedAt: input.extractedAt ?? report.createdAt,
    scope: { schemas: ['public'] },
  };
  await writeJson(`${root}/connection.json`, connectionDocument, 'seed connection');

  const tables = input.tables ?? [ordersTable()];
  for (const table of tables) {
    // Tables without a schema are filed under the 'default' prefix.
    const tablePath = `${root}/tables/${table.db ?? 'default'}-${table.name}.json`;
    await writeJson(tablePath, table, `seed ${table.name}`);
  }

  await writeJson(`${root}/scan-report.json`, report, 'seed scan report');
}
// Two scans are seeded. The result must come from the later one (sync-2),
// which is the only one whose table has estimatedRows 99.
it('returns the latest scan snapshot table details for a display string', async () => {
  await seedScan({ syncId: 'sync-1', runId: 'scan-old', extractedAt: '2026-05-14T08:00:00.000Z' });
  await seedScan({
    syncId: 'sync-2',
    runId: 'scan-new',
    extractedAt: '2026-05-14T09:00:00.000Z',
    tables: [ordersTable({ estimatedRows: 99 })],
  });

  const service = createKtxEntityDetailsService(project);
  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: 'public.orders' }],
  });

  expect(results).toHaveLength(1);
  expect(results[0]).toMatchObject({
    ok: true,
    connectionId: 'warehouse',
    display: 'public.orders',
    estimatedRows: 99,
    snapshot: {
      syncId: 'sync-2',
      scanRunId: 'scan-new',
      extractedAt: '2026-05-14T09:00:00.000Z',
    },
    columns: [
      { name: 'id', nativeType: 'integer', primaryKey: true },
      { name: 'status', nativeType: 'text', nullable: false },
    ],
  });
});
// A double-quoted, schema-qualified name must resolve to the same table and
// be reported with the unquoted display form.
it('resolves quoted qualified display strings through the dialect parser', async () => {
  await seedScan({ syncId: 'sync-1', runId: 'scan-1' });

  const service = createKtxEntityDetailsService(project);
  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: '"public"."orders"' }],
  });

  expect(results[0]).toMatchObject({
    ok: true,
    display: 'public.orders',
    tableRef: { catalog: null, db: 'public', name: 'orders' },
  });
});
// Requesting only `status` must narrow the returned columns, while the
// foreign key on `customer_id` is still reported.
it('filters requested columns while keeping full-table foreign keys', async () => {
  await seedScan({ syncId: 'sync-1', runId: 'scan-1' });

  const service = createKtxEntityDetailsService(project);
  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: { catalog: null, db: 'public', name: 'orders' }, columns: ['status'] }],
  });

  expect(results[0]).toMatchObject({
    ok: true,
    columns: [{ name: 'status' }],
    foreignKeys: [
      {
        fromColumn: 'customer_id',
        toDb: 'public',
        toTable: 'customers',
        toColumn: 'id',
      },
    ],
  });
});
// No scan is seeded, so the read must return exactly one `scan_missing`
// error entry with the full remediation message.
it('returns a structured missing-scan error', async () => {
  const service = createKtxEntityDetailsService(project);
  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: 'public.orders' }],
  });

  const expectedEntry = {
    ok: false,
    connectionId: 'warehouse',
    table: 'public.orders',
    error: {
      code: 'scan_missing',
      message: 'No live-database scan found for connection "warehouse"; run `ktx ingest warehouse` or `ktx scan warehouse`.',
    },
  };
  expect(results).toEqual([expectedEntry]);
});
// `orders` exists in both `public` and `archive`, so an unqualified lookup
// must fail with both candidates listed, `archive` before `public`.
it('reports ambiguous bare table names across schemas', async () => {
  const publicOrders = ordersTable({ db: 'public' });
  const archiveOrders = ordersTable({ db: 'archive' });
  await seedScan({
    syncId: 'sync-1',
    runId: 'scan-1',
    tables: [publicOrders, archiveOrders],
  });

  const service = createKtxEntityDetailsService(project);
  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: 'orders' }],
  });

  expect(results[0]).toMatchObject({
    ok: false,
    error: {
      code: 'ambiguous_table',
      candidates: [
        { tableRef: { catalog: null, db: 'archive', name: 'orders' }, display: 'archive.orders' },
        { tableRef: { catalog: null, db: 'public', name: 'orders' }, display: 'public.orders' },
      ],
    },
  });
});
// `plan_tier` does not exist on the seeded table, so the read must fail with
// `column_not_found`, naming the missing column and listing the real ones.
it('reports missing requested columns with available column candidates', async () => {
  await seedScan({ syncId: 'sync-1', runId: 'scan-1' });

  const service = createKtxEntityDetailsService(project);
  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: 'public.orders', columns: ['status', 'plan_tier'] }],
  });

  expect(results[0]).toMatchObject({
    ok: false,
    error: {
      code: 'column_not_found',
      message: 'Column(s) not found on public.orders: plan_tier',
      candidates: ['id', 'status'],
    },
  });
});
});

View file

@ -0,0 +1,911 @@
import { mkdtemp, readFile, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import YAML from 'yaml';
import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js';
import type { KtxLocalScanEnrichmentResult } from '../../../src/context/scan/local-enrichment.js';
import { writeLocalScanEnrichmentArtifacts, writeLocalScanManifestShards } from '../../../src/context/scan/local-enrichment-artifacts.js';
import type { KtxSchemaSnapshot } from '../../../src/context/scan/types.js';
// Two-table schema snapshot fixture: `public.customers` (primary key `id`)
// and `public.orders` (`id`, `customer_id`), with a declared foreign key from
// orders.customer_id to customers.id. Every table and column carries a
// "DB ..." comment.
const snapshot: KtxSchemaSnapshot = {
connectionId: 'warehouse',
driver: 'postgres',
extractedAt: '2026-04-29T12:00:00.000Z',
scope: { schemas: ['public'] },
metadata: {},
tables: [
{
catalog: null,
db: 'public',
name: 'customers',
kind: 'table',
comment: 'DB customer table',
estimatedRows: 2,
foreignKeys: [],
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'DB customer id',
},
],
},
{
catalog: null,
db: 'public',
name: 'orders',
kind: 'table',
comment: 'DB orders table',
estimatedRows: 3,
foreignKeys: [
{
fromColumn: 'customer_id',
toCatalog: null,
toDb: 'public',
toTable: 'customers',
toColumn: 'id',
constraintName: 'orders_customer_id_fkey',
},
],
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'DB order id',
},
{
name: 'customer_id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: 'DB customer id',
},
],
},
],
};
/**
 * Builds a fresh enrichment-result fixture over the module-level `snapshot`.
 *
 * The fixture reports descriptions, embeddings, and deterministic
 * relationships as completed, and contains a single accepted relationship
 * from orders.customer_id to customers.id.
 *
 * @returns A new KtxLocalScanEnrichmentResult literal on every call.
 */
function enrichment(): KtxLocalScanEnrichmentResult {
return {
snapshot,
// Per-stage status; the two validation stages are reported as skipped.
summary: {
dataDictionary: 'completed',
tableDescriptions: 'completed',
columnDescriptions: 'completed',
embeddings: 'completed',
deterministicRelationships: 'completed',
llmRelationshipValidation: 'skipped',
statisticalValidation: 'skipped',
},
relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
state: {
resumedStages: [],
completedStages: ['descriptions', 'embeddings', 'relationships'],
failedStages: [],
},
warnings: [],
// AI-generated descriptions for both tables. Note that `orders` is listed
// before `customers`, the reverse of the table order in `snapshot`.
descriptionUpdates: [
{
table: { catalog: null, db: 'public', name: 'orders' },
tableDescription: 'AI orders table',
columnDescriptions: {
id: 'AI order id',
customer_id: 'AI customer reference',
},
},
{
table: { catalog: null, db: 'public', name: 'customers' },
tableDescription: 'AI customers table',
columnDescriptions: {
id: 'AI customer id',
},
},
],
// Embeddings exist for the two `orders` columns only.
embeddingUpdates: [
{ columnId: 'public.orders.id', text: 'orders id', embedding: [0.1, 0.2] },
{ columnId: 'public.orders.customer_id', text: 'orders customer_id', embedding: [0.3, 0.4] },
],
// One accepted inferred relationship: orders.customer_id -> customers.id.
relationshipUpdate: {
connectionId: 'warehouse',
accepted: [
{
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
source: 'inferred',
from: {
tableId: 'public.orders',
columnIds: ['public.orders.customer_id'],
table: { catalog: null, db: 'public', name: 'orders' },
columns: ['customer_id'],
},
to: {
tableId: 'public.customers',
columnIds: ['public.customers.id'],
table: { catalog: null, db: 'public', name: 'customers' },
columns: ['id'],
},
relationshipType: 'many_to_one',
confidence: 0.95,
isPrimaryKeyReference: true,
},
],
rejected: [],
skipped: [],
},
// Profiling data covers only the `customers` table and its `id` column.
relationshipProfile: {
connectionId: 'warehouse',
driver: 'postgres',
sqlAvailable: true,
queryCount: 6,
tables: [{ table: { catalog: null, db: 'public', name: 'customers' }, rowCount: 2 }],
columns: {
'customers.id': {
table: { catalog: null, db: 'public', name: 'customers' },
column: 'id',
nativeType: 'integer',
normalizedType: 'integer',
rowCount: 2,
nullCount: 0,
distinctCount: 2,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['1', '2'],
minTextLength: 1,
maxTextLength: 1,
},
},
warnings: [],
},
// Resolved record for the same relationship id as the accepted update above,
// but with source 'llm_proposal' and confidence 0.92 instead of
// 'inferred' / 0.95, plus evidence, validation, and graph details.
resolvedRelationships: [
{
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
source: 'llm_proposal',
status: 'accepted',
from: {
tableId: 'public.orders',
columnIds: ['public.orders.customer_id'],
table: { catalog: null, db: 'public', name: 'orders' },
columns: ['customer_id'],
},
to: {
tableId: 'public.customers',
columnIds: ['public.customers.id'],
table: { catalog: null, db: 'public', name: 'customers' },
columns: ['id'],
},
relationshipType: 'many_to_one',
confidence: 0.92,
pkScore: 0.95,
fkScore: 0.91,
score: 0.9,
evidence: {
sourceColumnBase: 'buyer',
targetTableBase: 'customer',
targetColumnBase: 'id',
targetKeyScore: 0.88,
nameScore: 0.45,
reasons: ['llm_proposal', 'llm_pk_proposal'],
llmConfidence: 0.89,
llmRationale: 'Buyer reference values align with customer identifiers.',
},
validation: {
targetUniqueness: 1,
sourceCoverage: 1,
violationCount: 0,
violationRatio: 0,
sourceNullRate: 0,
targetNullRate: 0,
childDistinct: 2,
parentDistinct: 2,
overlap: 2,
checkedValues: 2,
reasons: ['validation_passed'],
},
graph: {
targetPkScore: 0.95,
incomingCandidateCount: 1,
conflictRank: 1,
reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'],
},
},
],
compositeRelationships: null,
};
}
describe('writeLocalScanEnrichmentArtifacts', () => {
// Scratch directory and the KTX project created inside it; both are rebuilt for every test.
let tempDir: string;
let project: KtxLocalProject;

// Create a uniquely named directory under the OS temp dir and initialise a project in its `project` subfolder.
beforeEach(async () => {
  const scratchPrefix = join(tmpdir(), 'ktx-local-enrichment-artifacts-');
  tempDir = await mkdtemp(scratchPrefix);
  const projectDir = join(tempDir, 'project');
  project = await initKtxProject({ projectDir });
});

// Delete the whole scratch directory (recursively, forced) once the test has finished.
afterEach(async () => {
  const removal = { recursive: true, force: true };
  await rm(tempDir, removal);
});
// End-to-end write: seeds an existing manifest shard (user + AI descriptions and a manual join),
// runs a non-dry-run enrichment write, then checks the returned paths, each JSON artifact on disk,
// and the merged manifest shard.
it('writes enrichment artifacts and manifest shards while preserving external descriptions', async () => {
// Seed a pre-existing shard so the write has user-authored content to merge with.
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/public.yaml',
YAML.stringify(
{
tables: {
orders: {
table: 'public.orders',
descriptions: { user: 'Pinned analyst description', ai: 'Old AI description' },
columns: [
{
name: 'id',
type: 'number',
descriptions: { user: 'Pinned id description', ai: 'Old AI id' },
},
{ name: 'customer_id', type: 'number' },
],
joins: [
{
to: 'customers',
on: 'orders.id = customers.id',
relationship: 'many_to_one',
source: 'manual',
},
],
},
},
},
{ indent: 2, lineWidth: 0 },
),
'ktx',
'ktx@example.com',
'Seed manifest shard',
);
const result = await writeLocalScanEnrichmentArtifacts({
project,
connectionId: 'warehouse',
syncId: 'sync-1',
driver: 'postgres',
enrichment: enrichment(),
dryRun: false,
// Distinctive values: the diagnostics assertion further down expects these same numbers back.
relationshipSettings: {
enabled: true,
llmProposals: false,
validationRequiredForManifest: true,
acceptThreshold: 0.91,
reviewThreshold: 0.61,
maxLlmTablesPerBatch: 12,
maxCandidatesPerColumn: 7,
profileSampleRows: 500,
profileConcurrency: 3,
validationConcurrency: 2,
},
});
// The result lists five enrichment artifacts under the sync directory plus the single manifest shard.
expect(result).toEqual({
enrichmentArtifacts: [
'raw-sources/warehouse/live-database/sync-1/enrichment/descriptions.json',
'raw-sources/warehouse/live-database/sync-1/enrichment/embeddings.json',
'raw-sources/warehouse/live-database/sync-1/enrichment/relationships.json',
'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json',
'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-diagnostics.json',
],
manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
manifestShardsWritten: 1,
});
await expect(
readFile(
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/descriptions.json'),
'utf-8',
),
).resolves.toContain('AI orders table');
// relationships.json: one accepted LLM-proposed relationship carrying scores, evidence, validation and graph data.
const relationshipsRaw = await readFile(
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationships.json'),
'utf-8',
);
const relationshipsArtifact = JSON.parse(relationshipsRaw) as {
accepted: Array<{
id: string;
status: string;
source: string;
pkScore: number;
fkScore: number;
evidence: unknown;
reasons: string[];
validation: unknown;
graph: unknown;
}>;
review: unknown[];
rejected: unknown[];
skipped: unknown[];
};
expect(relationshipsArtifact.accepted).toHaveLength(1);
expect(relationshipsArtifact.accepted[0]).toMatchObject({
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
status: 'accepted',
source: 'llm_proposal',
pkScore: 0.95,
fkScore: 0.91,
evidence: expect.objectContaining({
llmConfidence: 0.89,
llmRationale: 'Buyer reference values align with customer identifiers.',
}),
reasons: expect.arrayContaining(['llm_proposal', 'llm_pk_proposal']),
validation: expect.objectContaining({ reasons: ['validation_passed'] }),
graph: expect.objectContaining({ reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'] }),
});
expect(relationshipsArtifact.review).toEqual([]);
expect(relationshipsArtifact.rejected).toEqual([]);
expect(relationshipsArtifact.skipped).toEqual([]);
// relationship-profile.json mirrors the profile fields supplied by the enrichment fixture.
const profileRaw = await readFile(
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json'),
'utf-8',
);
expect(JSON.parse(profileRaw)).toMatchObject({
connectionId: 'warehouse',
driver: 'postgres',
sqlAvailable: true,
queryCount: 6,
warnings: [],
});
// relationship-diagnostics.json: summary counts plus the thresholds and policy values passed in above.
const diagnosticsRaw = await readFile(
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-diagnostics.json'),
'utf-8',
);
expect(JSON.parse(diagnosticsRaw)).toMatchObject({
connectionId: 'warehouse',
summary: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
noAcceptedReason: null,
candidateCountsBySource: { llm_proposal: 1 },
validation: { available: true, sqlAvailable: true, queryCount: 6 },
thresholds: { acceptThreshold: 0.91, reviewThreshold: 0.61 },
policy: {
validationRequiredForManifest: true,
maxCandidatesPerColumn: 7,
profileSampleRows: 500,
profileConcurrency: 3,
validationConcurrency: 2,
},
profileWarnings: [],
});
const manifestRaw = await readFile(
join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'),
'utf-8',
);
const manifest = YAML.parse(manifestRaw) as {
tables: {
orders: {
descriptions: Record<string, string>;
columns: Array<{ name: string; descriptions?: Record<string, string> }>;
joins: Array<{ to: string; on: string; source: string }>;
};
};
};
// The seeded `user` text is kept; the seeded `ai` text is replaced and a `db` entry is present.
expect(manifest.tables.orders.descriptions).toEqual({
user: 'Pinned analyst description',
db: 'DB orders table',
ai: 'AI orders table',
});
expect(manifest.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
user: 'Pinned id description',
db: 'DB order id',
ai: 'AI order id',
});
// Both a formal customer_id join and the seeded manual join must be present after the write.
expect(manifest.tables.orders.joins).toEqual(
expect.arrayContaining([
expect.objectContaining({
to: 'customers',
on: 'orders.customer_id = customers.id',
source: 'formal',
}),
expect.objectContaining({
to: 'customers',
on: 'orders.id = customers.id',
source: 'manual',
}),
]),
);
});
// Overrides the fixture's relationshipUpdate with one `formal` accepted relationship and no resolved
// relationships, then expects relationships.json to record reason 'formal_metadata_accepted' and the
// manifest shard to contain the corresponding formal join.
it('writes formal accepted relationships into relationship artifacts and manifest shards', async () => {
const source = enrichment();
const formalEnrichment: KtxLocalScanEnrichmentResult = {
...source,
relationshipUpdate: {
connectionId: 'warehouse',
accepted: [
{
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
source: 'formal',
from: {
tableId: 'public.orders',
columnIds: ['public.orders.customer_id'],
table: { catalog: null, db: 'public', name: 'orders' },
columns: ['customer_id'],
},
to: {
tableId: 'public.customers',
columnIds: ['public.customers.id'],
table: { catalog: null, db: 'public', name: 'customers' },
columns: ['id'],
},
relationshipType: 'many_to_one',
confidence: 1,
isPrimaryKeyReference: true,
},
],
rejected: [],
skipped: [],
},
// No resolved or composite entries: the accepted relationship above is the only relationship input.
resolvedRelationships: [],
compositeRelationships: null,
};
const result = await writeLocalScanEnrichmentArtifacts({
project,
connectionId: 'warehouse',
driver: 'sqlite',
syncId: 'sync-formal',
enrichment: formalEnrichment,
relationshipSettings: {
enabled: true,
llmProposals: false,
validationRequiredForManifest: true,
acceptThreshold: 0.85,
reviewThreshold: 0.55,
maxLlmTablesPerBatch: 40,
maxCandidatesPerColumn: 25,
profileSampleRows: 10000,
profileConcurrency: 4,
validationConcurrency: 4,
},
dryRun: false,
});
// Read the artifact back through the project's file store rather than the filesystem.
const relationshipsPath = 'raw-sources/warehouse/live-database/sync-formal/enrichment/relationships.json';
const relationships = JSON.parse((await project.fileStore.readFile(relationshipsPath)).content) as {
accepted: Array<{ source: string; reasons: string[] }>;
};
expect(relationships.accepted).toEqual([
expect.objectContaining({
source: 'formal',
reasons: ['formal_metadata_accepted'],
}),
]);
const manifestPath = result.manifestShards[0];
// Fail with a clear message (and narrow the type) if no shard path was returned.
if (!manifestPath) {
throw new Error('Expected manifest shard path');
}
const manifest = YAML.parse((await project.fileStore.readFile(manifestPath)).content) as {
tables: { orders: { joins: Array<{ to: string; on: string; source: string }> } };
};
expect(manifest.tables.orders.joins).toEqual(
expect.arrayContaining([
expect.objectContaining({
to: 'customers',
on: 'orders.customer_id = customers.id',
source: 'formal',
}),
]),
);
});
// Calls writeLocalScanManifestShards directly with a relationshipUpdate whose only accepted entry has
// source 'manual', and checks that the join written to the shard keeps that source.
it('writes manually applied relationship joins with manual source', async () => {
const result = await writeLocalScanManifestShards({
project,
connectionId: 'warehouse',
syncId: 'sync-manual',
driver: 'postgres',
snapshot,
dryRun: false,
relationshipUpdate: {
connectionId: 'warehouse',
accepted: [
{
id: 'public.orders:(public.orders.customer_id)->public.customers:(public.customers.id)',
source: 'manual',
from: {
tableId: 'public.orders',
columnIds: ['public.orders.customer_id'],
table: { catalog: null, db: 'public', name: 'orders' },
columns: ['customer_id'],
},
to: {
tableId: 'public.customers',
columnIds: ['public.customers.id'],
table: { catalog: null, db: 'public', name: 'customers' },
columns: ['id'],
},
relationshipType: 'many_to_one',
confidence: 1,
isPrimaryKeyReference: true,
},
],
rejected: [],
skipped: [],
},
});
expect(result.manifestShardsWritten).toBe(1);
// The shard is read from disk under the `project` subfolder of the per-test temp directory.
const shard = YAML.parse(await readFile(join(tempDir, 'project/semantic-layer/warehouse/_schema/public.yaml'), 'utf8'));
// toContainEqual: the join object must match exactly, with no extra keys.
expect(shard.tables.orders.joins).toContainEqual({
to: 'customers',
on: 'orders.customer_id = customers.id',
relationship: 'many_to_one',
source: 'manual',
});
});
// Supplies description updates in which the table text and the `id` column text both begin with
// 'Error generating description:'. Expects neither to appear as an `ai` description in the shard,
// while the ordinary `customer_id` text is written.
it('does not persist generated error descriptions in manifest shards', async () => {
await writeLocalScanManifestShards({
project,
connectionId: 'warehouse',
syncId: 'sync-error-description',
driver: 'postgres',
snapshot,
descriptionUpdates: [
{
table: { catalog: null, db: 'public', name: 'orders' },
tableDescription: 'Error generating description: timeout exceeded when trying to connect',
columnDescriptions: {
id: 'Error generating description: timeout exceeded when trying to connect',
customer_id: 'AI customer reference',
},
},
],
dryRun: false,
});
const shard = YAML.parse(
await readFile(join(tempDir, 'project/semantic-layer/warehouse/_schema/public.yaml'), 'utf8'),
) as {
tables: {
orders: {
descriptions?: Record<string, string>;
columns: Array<{ name: string; descriptions?: Record<string, string> }>;
};
};
};
// Only the `db` description remains for the table and for `id`: no `ai` key is expected.
expect(shard.tables.orders.descriptions).toEqual({ db: 'DB orders table' });
expect(shard.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
db: 'DB order id',
});
// The non-error AI text for customer_id is kept alongside its `db` description.
expect(shard.tables.orders.columns.find((column) => column.name === 'customer_id')?.descriptions).toEqual({
db: 'DB customer id',
ai: 'AI customer reference',
});
});
// Uses a dedicated two-table snapshot (order_lines / order_line_allocations, no declared keys) with a
// two-column accepted relationship. Expects relationships.json to carry both column pairs with reason
// 'composite_validation_passed' and the manifest join to AND the two column equalities together.
it('writes accepted composite relationships to relationship artifacts and manifest shards', async () => {
const compositeSnapshot: KtxSchemaSnapshot = {
connectionId: 'warehouse',
driver: 'postgres',
extractedAt: '2026-05-07T12:00:00.000Z',
scope: { schemas: ['public'] },
metadata: {},
tables: [
{
catalog: null,
db: 'public',
name: 'order_lines',
kind: 'table',
comment: null,
estimatedRows: 2,
foreignKeys: [],
columns: [
{
name: 'order_id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: null,
},
{
name: 'line_number',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: null,
},
],
},
{
catalog: null,
db: 'public',
name: 'order_line_allocations',
kind: 'table',
comment: null,
estimatedRows: 2,
foreignKeys: [],
columns: [
{
name: 'order_id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: null,
},
{
name: 'line_number',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: null,
},
],
},
],
};
// Start from the shared enrichment fixture and overwrite the fields relevant to composite handling.
const compositeEnrichment: KtxLocalScanEnrichmentResult = Object.assign(enrichment(), {
snapshot: compositeSnapshot,
relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
descriptionUpdates: [],
embeddingUpdates: [],
relationshipUpdate: {
connectionId: 'warehouse',
accepted: [
{
id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
source: 'inferred',
from: {
tableId: 'public.order_line_allocations',
columnIds: ['public.order_line_allocations.order_id', 'public.order_line_allocations.line_number'],
table: { catalog: null, db: 'public', name: 'order_line_allocations' },
columns: ['order_id', 'line_number'],
},
to: {
tableId: 'public.order_lines',
columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
table: { catalog: null, db: 'public', name: 'order_lines' },
columns: ['order_id', 'line_number'],
},
relationshipType: 'many_to_one',
confidence: 0.95,
isPrimaryKeyReference: true,
},
],
rejected: [],
skipped: [],
},
resolvedRelationships: [],
// Same id as the accepted update above, here with status and validation evidence attached.
compositeRelationships: [
{
id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
source: 'composite_profile_match',
status: 'accepted',
from: {
tableId: 'public.order_line_allocations',
columnIds: ['public.order_line_allocations.order_id', 'public.order_line_allocations.line_number'],
table: { catalog: null, db: 'public', name: 'order_line_allocations' },
columns: ['order_id', 'line_number'],
},
to: {
tableId: 'public.order_lines',
columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
table: { catalog: null, db: 'public', name: 'order_lines' },
columns: ['order_id', 'line_number'],
},
relationshipType: 'many_to_one',
confidence: 0.95,
validation: {
targetUniqueness: 1,
sourceCoverage: 1,
violationCount: 0,
violationRatio: 0,
childDistinct: 2,
parentDistinct: 2,
overlap: 2,
reasons: ['composite_validation_passed'],
},
},
],
});
const result = await writeLocalScanEnrichmentArtifacts({
project,
connectionId: 'warehouse',
driver: 'postgres',
syncId: 'sync-composite',
enrichment: compositeEnrichment,
relationshipSettings: {
enabled: true,
llmProposals: false,
validationRequiredForManifest: true,
acceptThreshold: 0.85,
reviewThreshold: 0.55,
maxLlmTablesPerBatch: 40,
maxCandidatesPerColumn: 25,
profileSampleRows: 10000,
profileConcurrency: 4,
validationConcurrency: 4,
},
dryRun: false,
});
const relationships = JSON.parse(
(await project.fileStore.readFile('raw-sources/warehouse/live-database/sync-composite/enrichment/relationships.json'))
.content,
) as { accepted: Array<{ from: { columns: string[] }; to: { columns: string[] }; reasons: string[] }> };
expect(relationships.accepted[0]).toMatchObject({
from: { columns: ['order_id', 'line_number'] },
to: { columns: ['order_id', 'line_number'] },
reasons: ['composite_validation_passed'],
});
const manifestPath = result.manifestShards[0];
// Fail with a clear message (and narrow the type) if no shard path was returned.
if (!manifestPath) {
throw new Error('Expected manifest shard path');
}
const manifest = YAML.parse((await project.fileStore.readFile(manifestPath)).content) as {
tables: { order_line_allocations: { joins: Array<{ to: string; on: string; source: string }> } };
};
// Exact match: a single join whose `on` clause combines both column pairs, with the update's 'inferred' source.
expect(manifest.tables.order_line_allocations.joins).toEqual([
{
to: 'order_lines',
on: 'order_line_allocations.order_id = order_lines.order_id AND order_line_allocations.line_number = order_lines.line_number',
relationship: 'many_to_one',
source: 'inferred',
},
]);
});
// Structural-only write: seeds a shard containing user/AI descriptions, a `usage` block and a manual join,
// then calls writeLocalScanManifestShards without descriptions or relationships. Expects no enrichment
// artifact on disk, the seeded `ai` descriptions to be gone, and user text, usage and manual join to survive.
it('writes structural manifest shards without enrichment artifacts', async () => {
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/public.yaml',
YAML.stringify(
{
tables: {
orders: {
table: 'public.orders',
descriptions: { user: 'Pinned structural description', ai: 'Old generated text' },
usage: {
narrative: 'Orders are commonly filtered by lifecycle status.',
frequencyTier: 'high',
commonFilters: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
ownerNote: 'Preserve analyst note',
},
columns: [
{
name: 'id',
type: 'number',
descriptions: { user: 'Pinned structural id', ai: 'Old generated id' },
},
{ name: 'customer_id', type: 'number' },
],
joins: [
{
to: 'customers',
on: 'orders.id = customers.id',
relationship: 'many_to_one',
source: 'manual',
},
],
},
},
},
{ indent: 2, lineWidth: 0 },
),
'ktx',
'ktx@example.com',
'Seed structural manifest shard',
);
const result = await writeLocalScanManifestShards({
project,
connectionId: 'warehouse',
syncId: 'sync-structural-1',
driver: 'postgres',
snapshot,
dryRun: false,
});
// Only manifest fields are returned here: there is no enrichmentArtifacts key in the expected result.
expect(result).toEqual({
manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
manifestShardsWritten: 1,
});
// The enrichment descriptions artifact must not exist for this sync id.
await expect(
readFile(
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-structural-1/enrichment/descriptions.json'),
'utf-8',
),
).rejects.toMatchObject({ code: 'ENOENT' });
const manifestRaw = await readFile(
join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'),
'utf-8',
);
const manifest = YAML.parse(manifestRaw) as {
tables: {
orders: {
descriptions: Record<string, string>;
usage?: Record<string, unknown>;
columns: Array<{ name: string; descriptions?: Record<string, string> }>;
joins: Array<{ to: string; on: string; source: string }>;
};
};
};
// `user` is kept and `db` is present; the seeded `ai` text is no longer there.
expect(manifest.tables.orders.descriptions).toEqual({
user: 'Pinned structural description',
db: 'DB orders table',
});
// The seeded usage block must come back unchanged.
expect(manifest.tables.orders.usage).toEqual({
narrative: 'Orders are commonly filtered by lifecycle status.',
frequencyTier: 'high',
commonFilters: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
ownerNote: 'Preserve analyst note',
});
expect(manifest.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
user: 'Pinned structural id',
db: 'DB order id',
});
// A formal customer_id join is expected next to the seeded manual join.
expect(manifest.tables.orders.joins).toEqual(
expect.arrayContaining([
expect.objectContaining({
to: 'customers',
on: 'orders.customer_id = customers.id',
source: 'formal',
}),
expect.objectContaining({
to: 'customers',
on: 'orders.id = customers.id',
source: 'manual',
}),
]),
);
});
// Dry-run contract: the call reports empty artifact/shard lists and leaves no descriptions artifact on disk.
it('returns planned empty paths without writing files during dry runs', async () => {
  const dryRunResult = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-dry-run',
    driver: 'postgres',
    enrichment: enrichment(),
    dryRun: true,
  });

  expect(dryRunResult).toEqual({ enrichmentArtifacts: [], manifestShards: [], manifestShardsWritten: 0 });

  // Reading the would-be artifact must fail with ENOENT, proving nothing was written.
  const descriptionsPath = join(
    project.projectDir,
    'raw-sources/warehouse/live-database/sync-dry-run/enrichment/descriptions.json',
  );
  const pendingRead = readFile(descriptionsPath, 'utf-8');
  await expect(pendingRead).rejects.toMatchObject({ code: 'ENOENT' });
});
});

View file

@ -0,0 +1,871 @@
import Database from 'better-sqlite3';
import { describe, expect, it, vi } from 'vitest';
import { buildDefaultKtxProjectConfig } from '../../../src/context/project/config.js';
import type {
KtxScanEnrichmentCompletedStage,
KtxScanEnrichmentFailedStage,
KtxScanEnrichmentStageLookup,
KtxScanEnrichmentStateStore,
} from '../../../src/context/scan/enrichment-state.js';
import {
createDeterministicLocalScanEnrichmentProviders,
runLocalScanEnrichment,
snapshotToKtxEnrichedSchema,
} from '../../../src/context/scan/local-enrichment.js';
import {
createKtxConnectorCapabilities,
type KtxQueryResult,
type KtxReadOnlyQueryInput,
type KtxEmbeddingPort,
type KtxScanConnector,
type KtxScanContext,
type KtxSchemaSnapshot,
} from '../../../src/context/scan/types.js';
/**
 * Builds a deterministic fake embedding port for tests.
 *
 * @param options.dimensions Length of every vector produced.
 * @param options.maxBatchSize Advertised batch limit; falls back to 64 when null or undefined.
 * @returns A port whose `embedBatch` yields, for the text at index r, the vector
 *   `[r, r + 1, ..., r + dimensions - 1]`. The text contents themselves are ignored.
 */
function fakeScanEmbedding(options: { dimensions: number; maxBatchSize?: number }): KtxEmbeddingPort {
  // Vector for batch row `row`: consecutive integers starting at the row index.
  const vectorForRow = (row: number): number[] =>
    Array.from({ length: options.dimensions }, (_slot, offset) => row + offset);

  const port: KtxEmbeddingPort = {
    dimensions: options.dimensions,
    maxBatchSize: options.maxBatchSize ?? 64,
    embedBatch: async (texts) => texts.map((_text, row) => vectorForRow(row)),
  };
  return port;
}
// Shared schema fixture for this file: a 'postgres' snapshot of the `public` schema with two tables,
// `customers` (id) and `orders` (id, customer_id). Both `id` columns are primary keys and neither table
// declares foreign keys, so any orders -> customers link has to come from detection, not metadata.
const snapshot: KtxSchemaSnapshot = {
connectionId: 'warehouse',
driver: 'postgres',
extractedAt: '2026-04-29T12:00:00.000Z',
scope: { schemas: ['public'] },
metadata: {},
tables: [
{
catalog: null,
db: 'public',
name: 'customers',
kind: 'table',
comment: 'Customer accounts',
estimatedRows: 2,
foreignKeys: [],
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'Customer id',
},
],
},
{
catalog: null,
db: 'public',
name: 'orders',
kind: 'table',
comment: 'Customer orders',
estimatedRows: 3,
foreignKeys: [],
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'Order id',
},
{
// Not a primary key and not a declared foreign key; it carries the same comment text as customers.id.
name: 'customer_id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: 'Customer id',
},
],
},
],
};
/**
 * Creates a fake 'postgres' scan connector in which every operation is a fresh vitest mock.
 *
 * `introspect` resolves to the shared `snapshot` fixture, the list operations resolve to empty
 * arrays, and the sampling operations resolve to small canned payloads. The connector advertises
 * `readOnlySql` but defines no `executeReadOnly`; tests that need SQL add their own.
 *
 * @returns A new connector with independent mocks on each call.
 */
function connector(): KtxScanConnector {
  const capabilities = createKtxConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    readOnlySql: true,
    columnStats: true,
  });

  const introspect = vi.fn(async () => snapshot);
  const listSchemas = vi.fn(async () => []);
  const listTables = vi.fn(async () => []);

  // Each call builds a new payload object, so callers never share sample data.
  const sampleTable = vi.fn(async () => ({
    headers: ['id', 'customer_id'],
    rows: [[1, 10]],
    totalRows: 1,
  }));
  const sampleColumn = vi.fn(async () => ({
    values: ['10', '11'],
    nullCount: 0,
    distinctCount: 2,
  }));

  return {
    id: 'test:warehouse',
    driver: 'postgres',
    capabilities,
    introspect,
    listSchemas,
    listTables,
    sampleTable,
    sampleColumn,
  };
}
/**
 * Test double that answers read-only queries from an in-memory SQLite database.
 * Tests populate `db` directly and must call `close()` when finished.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');

  /**
   * Runs `input.sql` and returns the result in tabular form.
   *
   * Headers are taken from the keys of the first returned row, so a query that
   * returns no rows yields an empty header list. The scan context is unused.
   *
   * @param input Query input; only `sql` is read.
   * @param _ctx Ignored.
   * @returns A resolved promise with headers, row arrays ordered like the headers, and row counts.
   */
  executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];

    const firstRecord = records[0];
    const headers = firstRecord == null ? [] : Object.keys(firstRecord);

    // Project every record onto the header order so rows are positional.
    const rows: unknown[][] = [];
    for (const record of records) {
      rows.push(headers.map((header) => record[header]));
    }

    const recordCount = records.length;
    return Promise.resolve({
      headers,
      rows,
      totalRows: recordCount,
      rowCount: recordCount,
    });
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
/**
 * Returns a fresh 'sqlite' snapshot with two tables, `accounts` (id) and `orders` (id, account_id),
 * that carries no relationship metadata at all: no primary keys, no foreign keys, no comments.
 *
 * @returns A newly built snapshot on every call, so callers may spread or modify it freely.
 */
function noDeclaredRelationshipSnapshot(): KtxSchemaSnapshot {
  // Every column in this fixture is a NOT NULL INTEGER without key or comment metadata.
  const integerColumn = (name: string) => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number' as const,
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  // Tables differ only in name, estimated row count and column names.
  const bareTable = (name: string, estimatedRows: number, columnNames: string[]) => ({
    catalog: null,
    db: null,
    name,
    kind: 'table' as const,
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns: columnNames.map((columnName) => integerColumn(columnName)),
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [bareTable('accounts', 2, ['id']), bareTable('orders', 3, ['id', 'account_id'])],
  };
}
/**
 * Creates an in-memory enrichment state store for tests.
 *
 * Stage records are kept in a Map keyed by `runId:stage`, so saving a stage again (completed or
 * failed) replaces the earlier record for that run and stage.
 *
 * @returns A store whose state lives only as long as the returned object.
 */
function memoryEnrichmentStateStore(): KtxScanEnrichmentStateStore {
  type StageRecord = KtxScanEnrichmentCompletedStage | KtxScanEnrichmentFailedStage;

  const stages = new Map<string, StageRecord>();
  const stageKey = ({ runId, stage }: Pick<KtxScanEnrichmentStageLookup, 'runId' | 'stage'>) =>
    `${runId}:${stage}`;

  return {
    // A stored stage is returned only when it completed AND was computed from the same input hash.
    async findCompletedStage<TOutput>(input: KtxScanEnrichmentStageLookup) {
      const stored = stages.get(stageKey(input));
      if (stored?.status === 'completed' && stored.inputHash === input.inputHash) {
        return stored as KtxScanEnrichmentCompletedStage<TOutput>;
      }
      return null;
    },

    // Completed records are stored with a null error message.
    async saveCompletedStage(input) {
      stages.set(stageKey(input), {
        ...input,
        status: 'completed',
        errorMessage: null,
      });
    },

    // Failed records are stored with a null output.
    async saveFailedStage(input) {
      stages.set(stageKey(input), {
        ...input,
        status: 'failed',
        output: null,
      });
    },

    // Returns every stored record of the run, in insertion order of their keys.
    async listRunStages(runId) {
      const matches: StageRecord[] = [];
      for (const stage of stages.values()) {
        if (stage.runId === runId) {
          matches.push(stage);
        }
      }
      return matches;
    },
  };
}
describe('local scan enrichment', () => {
// The shared snapshot must map to a schema with both tables, with `orders` second and its
// customer_id column identified by schema-qualified ids and carrying no samples or embedding.
it('maps a scan snapshot into relationship detector schema', () => {
  const { connectionId, tables } = snapshotToKtxEnrichedSchema(snapshot);
  const ordersTable = tables[1];

  expect(connectionId).toBe('warehouse');
  expect(tables).toHaveLength(2);
  expect(ordersTable?.columns.map(({ name }) => name)).toEqual(['id', 'customer_id']);

  const customerIdColumn = ordersTable?.columns[1];
  expect(customerIdColumn).toMatchObject({
    id: 'public.orders.customer_id',
    tableId: 'public.orders',
    primaryKey: false,
    sampleValues: null,
    embedding: null,
  });
});
// Starts from the key-free SQLite fixture, adds an orders.account_id -> accounts.id foreign key and marks
// accounts.id as a primary key, then expects exactly one 'formal' relationship with confidence 1.
it('maps snapshot foreign keys into formal schema relationships', () => {
const source = noDeclaredRelationshipSnapshot();
const snapshotWithForeignKey = {
...source,
tables: source.tables.map((table) =>
// orders gains the foreign key; accounts gets its id promoted to primary key; other tables pass through.
table.name === 'orders'
? {
...table,
foreignKeys: [
{
fromColumn: 'account_id',
toCatalog: null,
toDb: null,
toTable: 'accounts',
toColumn: 'id',
constraintName: 'orders_account_id_fkey',
},
],
}
: table.name === 'accounts'
? {
...table,
columns: table.columns.map((column) =>
column.name === 'id' ? { ...column, primaryKey: true } : column,
),
}
: table,
),
};
const schema = snapshotToKtxEnrichedSchema(snapshotWithForeignKey);
// Table and column ids are unqualified here because the fixture's tables have null catalog and db.
expect(schema.relationships).toEqual([
{
id: 'orders:(orders.account_id)->accounts:(accounts.id)',
source: 'formal',
from: {
tableId: 'orders',
columnIds: ['orders.account_id'],
table: { catalog: null, db: null, name: 'orders' },
columns: ['account_id'],
},
to: {
tableId: 'accounts',
columnIds: ['accounts.id'],
table: { catalog: null, db: null, name: 'accounts' },
columns: ['id'],
},
relationshipType: 'many_to_one',
confidence: 1,
isPrimaryKeyReference: true,
},
]);
});
// When the caller hands in a snapshot, the run must return it and never ask the connector to introspect.
it('uses the supplied snapshot without calling connector.introspect', async () => {
  const suppliedConnector = connector();
  const introspectSpy = vi.mocked(suppliedConnector.introspect);

  const { snapshot: returnedSnapshot } = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'structural',
    connector: suppliedConnector,
    snapshot,
    context: { runId: 'scan-run-snapshot' },
    providers: null,
  });

  expect(returnedSnapshot).toEqual(snapshot);
  expect(introspectSpy).not.toHaveBeenCalled();
});
// Without a supplied snapshot the run must introspect exactly once and return what the connector produced.
it('falls back to connector.introspect when no snapshot is supplied', async () => {
  const introspectingConnector = connector();

  const { snapshot: returnedSnapshot } = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'structural',
    connector: introspectingConnector,
    context: { runId: 'scan-run-introspect' },
    providers: null,
  });

  expect(returnedSnapshot).toEqual(snapshot);
  expect(introspectingConnector.introspect).toHaveBeenCalledTimes(1);
});
// A 'mysql' connector paired with the 'postgres' snapshot must be rejected with an explicit mismatch error.
it('fails when connector driver and snapshot driver differ', async () => {
  const mismatchedConnector: KtxScanConnector = {
    ...connector(),
    driver: 'mysql',
  };

  const pendingRun = runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: mismatchedConnector,
    snapshot,
    context: { runId: 'scan-run-driver-mismatch' },
    providers: null,
  });

  await expect(pendingRun).rejects.toThrow(
    'ktx scan connector driver "mysql" does not match snapshot driver "postgres" for connection "warehouse"',
  );
});
// With no providers and a connector that lacks executeReadOnly, detection still completes, but the
// single candidate lands in review, statistical validation is skipped, and a recoverable warning is raised.
it('runs deterministic relationship detection for relationship scans', async () => {
  const { summary, relationships, warnings } = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-1' },
    providers: null,
  });

  expect(summary).toMatchObject({
    deterministicRelationships: 'completed',
    llmRelationshipValidation: 'skipped',
    embeddings: 'skipped',
  });
  expect(relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
  expect(summary.statisticalValidation).toBe('skipped');

  const missingExecutorWarning = {
    code: 'relationship_validation_failed',
    message: 'KTX scan connector advertises readOnlySql but does not expose executeReadOnly',
    recoverable: true,
    metadata: { capability: 'readOnlySql' },
  };
  expect(warnings).toContainEqual(missingExecutorWarning);
});
// Backs the connector's executeReadOnly with an in-memory SQLite database in which every
// orders.account_id value (1, 1, 2) exists in accounts.id (1, 2). Expects the
// orders.account_id -> accounts.id relationship to be accepted with statistical validation completed.
it('runs relationship discovery with connector SQL evidence', async () => {
const executor = new InMemorySqliteExecutor();
// try/finally guarantees the database handle is closed even when an assertion fails.
try {
executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
INSERT INTO accounts (id) VALUES (1), (2);
INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
`);
const scanConnector = {
...connector(),
driver: 'sqlite' as const,
capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
// The key-free snapshot means nothing is declared; acceptance has to come from the SQL evidence.
introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
// bind keeps `this` pointing at the executor when the method is called through the connector.
executeReadOnly: executor.executeReadOnly.bind(executor),
};
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'relationships',
detectRelationships: true,
connector: scanConnector,
context: { runId: 'scan-run-relationship-discovery' },
providers: null,
});
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
expect(result.summary.statisticalValidation).toBe('completed');
expect(result.relationshipProfile).toMatchObject({ sqlAvailable: true });
expect(result.resolvedRelationships).toEqual([
expect.objectContaining({
status: 'accepted',
from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
}),
]);
expect(result.relationshipUpdate?.accepted).toHaveLength(1);
} finally {
executor.close();
}
});
// With llmProposals switched off in the relationship settings, the LLM runtime's generateObject
// must never be invoked and the LLM validation stage must report 'skipped'.
it('honors scan relationship config when LLM proposals are disabled', async () => {
  const deterministicProviders = createDeterministicLocalScanEnrichmentProviders();
  const generateObjectSpy = vi.fn();

  // Swap only generateObject so any LLM call would be recorded by the spy.
  const spiedLlmRuntime = {
    ...deterministicProviders.llmRuntime,
    generateObject: generateObjectSpy as never,
  };

  const { summary } = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-llm-disabled' },
    providers: {
      ...deterministicProviders,
      llmRuntime: spiedLlmRuntime,
    },
    relationshipSettings: {
      ...buildDefaultKtxProjectConfig().scan.relationships,
      llmProposals: false,
      maxLlmTablesPerBatch: 40,
    },
  });

  expect(summary.llmRelationshipValidation).toBe('skipped');
  expect(generateObjectSpy).not.toHaveBeenCalled();
});
// Disabling scan relationships must skip all three relationship stages, report zero counts,
// and leave every relationship output null.
it('skips relationship detection when scan relationships are disabled', async () => {
  const disabledRelationshipSettings = {
    ...buildDefaultKtxProjectConfig().scan.relationships,
    enabled: false,
  };

  const outcome = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    connector: connector(),
    context: { runId: 'disabled-relationships' },
    providers: createDeterministicLocalScanEnrichmentProviders(),
    relationshipSettings: disabledRelationshipSettings,
  });

  const { summary } = outcome;
  expect(summary.deterministicRelationships).toBe('skipped');
  expect(summary.statisticalValidation).toBe('skipped');
  expect(summary.llmRelationshipValidation).toBe('skipped');

  expect(outcome.relationships).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
  expect(outcome.relationshipUpdate).toBeNull();
  expect(outcome.relationshipProfile).toBeNull();
  expect(outcome.resolvedRelationships).toBeNull();
});
// Replaces sampleTable with a mock that always throws and passes a logger made of vitest mocks through
// the scan context. Expects sampling_failed and description_fallback_used warnings, calls to
// logger.warn and logger.error, and six sampleTable attempts in total.
it('forwards context.logger and emits warnings when sampleTable fails repeatedly', async () => {
const failingConnector: KtxScanConnector = {
...connector(),
sampleTable: vi.fn(async () => {
throw new Error('pool: ECONNRESET');
}),
};
const logger = {
debug: vi.fn(),
info: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
};
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: false,
connector: failingConnector,
context: { runId: 'scan-run-warnings', logger },
providers: createDeterministicLocalScanEnrichmentProviders(),
});
const codes = result.warnings.map((warning) => warning.code);
expect(codes).toContain('sampling_failed');
expect(codes).toContain('description_fallback_used');
// At least one warning must be attributed to the `customers` table.
expect(result.warnings.some((warning) => warning.table === 'customers')).toBe(true);
expect(logger.warn).toHaveBeenCalled();
expect(logger.error).toHaveBeenCalled();
// Each of the two tables produced sampling_failed + description_fallback_used, so 2 + 2 = 4 warnings minimum.
expect(result.warnings.length).toBeGreaterThanOrEqual(4);
// Sampling was retried 3× for each of the 2 tables = 6 calls
expect(failingConnector.sampleTable).toHaveBeenCalledTimes(6);
});
// Deterministic providers produce dictionary and description stages, but no embedding updates;
// the snapshot is returned unchanged and the one relationship candidate is left for review.
it('runs configured deterministic enrichment with descriptions and no embeddings', async () => {
  const { summary, embeddingUpdates, snapshot: returnedSnapshot, relationships } = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-2' },
    providers: createDeterministicLocalScanEnrichmentProviders(),
  });

  expect(summary).toMatchObject({
    dataDictionary: 'completed',
    tableDescriptions: 'completed',
    columnDescriptions: 'completed',
    embeddings: 'skipped',
    deterministicRelationships: 'completed',
  });
  expect(embeddingUpdates).toEqual([]);
  expect(returnedSnapshot).toEqual(snapshot);
  expect(relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
});
// Concurrency probe: eight single-column tables, with each sampleTable call held open for 10 ms while
// the number of overlapping calls is tracked. Expects a peak overlap of exactly 4 and no sampleColumn calls.
it('generates batched table descriptions with bounded table-level concurrency', async () => {
const concurrentSnapshot: KtxSchemaSnapshot = {
...snapshot,
tables: Array.from({ length: 8 }, (_, index) => ({
catalog: null,
db: 'public',
name: `table_${index + 1}`,
kind: 'table' as const,
comment: null,
estimatedRows: 2,
foreignKeys: [],
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number' as const,
nullable: false,
primaryKey: true,
comment: null,
},
],
})),
};
// Counters shared with the sampleTable mock: calls currently in flight and the highest value seen.
let activeTableSamples = 0;
let maxActiveTableSamples = 0;
const scanConnector = {
...connector(),
introspect: vi.fn(async () => concurrentSnapshot),
sampleColumn: vi.fn(async () => ({
values: ['1'],
nullCount: 0,
distinctCount: 1,
})),
sampleTable: vi.fn(async () => {
activeTableSamples += 1;
maxActiveTableSamples = Math.max(maxActiveTableSamples, activeTableSamples);
// Keep the call pending briefly so concurrently scheduled calls overlap and get counted.
await new Promise((resolve) => setTimeout(resolve, 10));
activeTableSamples -= 1;
return {
headers: ['id'],
rows: [[1]],
totalRows: 1,
};
}),
};
// Relationship detection is disabled for this run via the settings below.
const settings = {
...buildDefaultKtxProjectConfig().scan.relationships,
enabled: false,
};
await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
connector: scanConnector,
context: { runId: 'scan-run-concurrent-descriptions' },
providers: createDeterministicLocalScanEnrichmentProviders(),
relationshipSettings: settings,
});
// NOTE(review): 4 is presumably the table-level concurrency limit of the code under test — confirm where it is defined.
expect(maxActiveTableSamples).toBe(4);
expect(scanConnector.sampleColumn).not.toHaveBeenCalled();
});
it('reports enrichment progress for countable stages', async () => {
  // Recording progress sink: every update is captured; startPhase hands back the same sink.
  const recorded: Array<{ progress: number; message?: string; transient?: boolean }> = [];
  const tracker = {
    async update(value: number, message?: string, options?: { transient?: boolean }) {
      recorded.push({ progress: value, message, transient: options?.transient });
    },
    startPhase() {
      return tracker;
    },
  };

  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-progress', progress: tracker },
    providers: {
      ...createDeterministicLocalScanEnrichmentProviders(),
      embedding: fakeScanEmbedding({ dimensions: 6 }),
    },
  });

  // Only the presence of these messages is checked, not their order or exclusivity.
  const expectedMessages = [
    expect.objectContaining({ message: 'Generating descriptions 1/2 tables', transient: true }),
    expect.objectContaining({ message: 'Generating descriptions 2/2 tables', transient: true }),
    expect.objectContaining({ message: 'Building embeddings 1/1 batches', transient: true }),
    expect.objectContaining({ message: 'Detecting relationships' }),
  ];
  expect(recorded).toEqual(expect.arrayContaining(expectedMessages));
});
it('reports progress before enrichment connector introspection starts', async () => {
  // Recording progress sink shared by the run and by the assertion inside introspect().
  const recorded: Array<{ progress: number; message?: string; transient?: boolean }> = [];
  const tracker = {
    async update(value: number, message?: string, options?: { transient?: boolean }) {
      recorded.push({ progress: value, message, transient: options?.transient });
    },
    startPhase() {
      return tracker;
    },
  };

  // The assertion runs at the moment introspection is invoked, so it fails unless the
  // "loading" progress message was emitted beforehand.
  const introspect = vi.fn(async () => {
    expect(recorded).toContainEqual(expect.objectContaining({ message: 'Loading enrichment schema snapshot' }));
    return snapshot;
  });
  const scanConnector = {
    ...connector(),
    introspect,
  };

  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: scanConnector,
    context: { runId: 'scan-run-progress-before-introspection', progress: tracker },
    providers: null,
  });

  // Guards against the in-callback assertion being silently skipped.
  expect(scanConnector.introspect).toHaveBeenCalled();
});
it('splits enrichment embedding requests by provider batch size', async () => {
  // One table with five commented integer columns, i.e. five embedding inputs.
  const metricColumns = Array.from({ length: 5 }, (_, index) => ({
    name: `metric_${index + 1}`,
    nativeType: 'integer',
    normalizedType: 'integer',
    dimensionType: 'number' as const,
    nullable: false,
    primaryKey: false,
    comment: `Metric ${index + 1}`,
  }));
  const manyColumnSnapshot: KtxSchemaSnapshot = {
    ...snapshot,
    tables: [
      {
        catalog: null,
        db: 'public',
        name: 'wide_orders',
        kind: 'table',
        comment: 'Wide order facts',
        estimatedRows: 3,
        foreignKeys: [],
        columns: metricColumns,
      },
    ],
  };
  const scanConnector = {
    ...connector(),
    introspect: vi.fn(async () => manyColumnSnapshot),
  };

  // Fake embedding backend that rejects any batch larger than its advertised maximum of two.
  const embedBatchSpy = vi.fn(async (batch: string[]) => {
    if (batch.length > 2) {
      throw new Error(`Embedding batch size ${batch.length} exceeds maximum 2`);
    }
    return batch.map((_, index) => [index, index + 1, index + 2]);
  });
  const { llmRuntime } = createDeterministicLocalScanEnrichmentProviders();

  const result = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: scanConnector,
    context: { runId: 'scan-run-batched-embeddings' },
    providers: {
      llmRuntime,
      embedding: {
        dimensions: 3,
        maxBatchSize: 2,
        embedBatch: embedBatchSpy,
      },
    },
  });

  // Five inputs with a cap of two per request must arrive as batches of 2, 2 and 1.
  expect(result.embeddingUpdates).toHaveLength(5);
  const batchSizes = embedBatchSpy.mock.calls.map(([batch]) => batch.length);
  expect(batchSizes).toEqual([2, 2, 1]);
});
it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
  const stateStore = memoryEnrichmentStateStore();
  const scanConnector = connector();
  const providers = {
    ...createDeterministicLocalScanEnrichmentProviders(),
    embedding: fakeScanEmbedding({ dimensions: 6 }),
  };

  // Both runs use identical inputs (same run id, sync id, connector, providers and state store);
  // fresh option objects are built on every call.
  const runOnce = () =>
    runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: true,
      connector: scanConnector,
      context: { runId: 'scan-run-resume-1' },
      providers,
      stateStore,
      syncId: 'sync-resume-1',
      providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
    });

  const first = await runOnce();

  // Spies are installed only after the first run so they observe the second run alone.
  const generateObject = vi.spyOn(providers.llmRuntime, 'generateObject');
  const embedBatch = vi.spyOn(providers.embedding, 'embedBatch');

  const second = await runOnce();

  const allStages = ['descriptions', 'embeddings', 'relationships'];
  expect(first.state.completedStages).toEqual(allStages);
  expect(first.state.resumedStages).toEqual([]);
  expect(second.state.resumedStages).toEqual(allStages);
  expect(second.state.completedStages).toEqual(allStages);

  // A resumed run must not call the LLM or the embedding backend again.
  expect(generateObject).not.toHaveBeenCalled();
  expect(embedBatch).not.toHaveBeenCalled();

  // The resumed run returns the same outputs as the original run.
  expect(second.descriptionUpdates).toEqual(first.descriptionUpdates);
  expect(second.embeddingUpdates).toEqual(first.embeddingUpdates);
  expect(second.relationships).toEqual(first.relationships);
});
it('does not reuse completed stages when the snapshot changes', async () => {
  const stateStore = memoryEnrichmentStateStore();
  const providers = {
    ...createDeterministicLocalScanEnrichmentProviders(),
    embedding: fakeScanEmbedding({ dimensions: 6 }),
  };
  const providerIdentity = { provider: 'fake', embeddingDimensions: 6 };

  // First run: populate the state store against the unmodified fixture snapshot.
  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: connector(),
    context: { runId: 'scan-run-resume-hash' },
    providers,
    stateStore,
    syncId: 'sync-resume-hash',
    providerIdentity,
  });

  const [firstTable] = snapshot.tables;
  if (!firstTable) {
    throw new Error('Expected test snapshot table');
  }

  // Second connector reports a snapshot whose table list is replaced by a single table.
  const changedConnector = {
    ...connector(),
    introspect: vi.fn(async () => ({
      ...snapshot,
      tables: [{ ...firstTable, name: 'customers' }],
    })),
  };

  // Spy is attached after the first run so only second-run LLM calls are counted.
  const generateObject = vi.spyOn(providers.llmRuntime, 'generateObject');

  // Second run: same run id and sync id, different snapshot.
  const rerun = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: changedConnector,
    context: { runId: 'scan-run-resume-hash' },
    providers,
    stateStore,
    syncId: 'sync-resume-hash',
    providerIdentity,
  });

  // Nothing is resumed; every stage is executed again, including the LLM-backed descriptions.
  expect(rerun.state.resumedStages).toEqual([]);
  expect(rerun.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
  expect(generateObject).toHaveBeenCalled();
});
it('runs providerless enriched scans as relationship-only discovery enrichment', async () => {
  const executor = new InMemorySqliteExecutor();
  try {
    // Seed two tables without declared keys: every orders.account_id (1, 1, 2)
    // has a matching accounts.id (1, 2).
    executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
INSERT INTO accounts (id) VALUES (1), (2);
INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
`);

    // SQLite-flavoured connector whose read-only queries hit the in-memory database above.
    const scanConnector = {
      ...connector(),
      driver: 'sqlite' as const,
      capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
      introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
      executeReadOnly: executor.executeReadOnly.bind(executor),
    };

    // No providers are configured even though the mode asks for enrichment.
    const result = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: false,
      connector: scanConnector,
      context: { runId: 'scan-run-providerless-enriched' },
      providers: null,
    });

    // Description/embedding stages are skipped; relationship stages still complete.
    expect(result.summary).toEqual({
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'completed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'completed',
    });
    expect(result.descriptionUpdates).toEqual([]);
    expect(result.embeddingUpdates).toEqual([]);

    // Exactly one relationship is accepted: orders.account_id -> accounts.id.
    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.relationshipUpdate?.accepted).toHaveLength(1);
    expect(result.relationshipProfile).toMatchObject({ sqlAvailable: true });
    const ordersAccountId = expect.objectContaining({
      table: expect.objectContaining({ name: 'orders' }),
      columns: ['account_id'],
    });
    const accountsId = expect.objectContaining({
      table: expect.objectContaining({ name: 'accounts' }),
      columns: ['id'],
    });
    expect(result.resolvedRelationships).toEqual([
      expect.objectContaining({
        status: 'accepted',
        from: ordersAccountId,
        to: accountsId,
      }),
    ]);

    // The skipped stages are surfaced to the caller as a recoverable warning.
    expect(result.warnings).toContainEqual({
      code: 'scan_enrichment_backend_not_configured',
      message:
        'Skipping description and embedding enrichment because scan.enrichment.mode is not configured; relationship discovery still ran.',
      recoverable: true,
      metadata: {
        skippedStages: ['descriptions', 'embeddings'],
        relationshipDetection: true,
      },
    });
  } finally {
    // Always release the in-memory database, even when an assertion throws.
    executor.close();
  }
});
});

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,278 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js';
import { readLocalScanStructuralSnapshot } from '../../../src/context/scan/local-structural-artifacts.js';
describe('readLocalScanStructuralSnapshot', () => {
  let tempDir: string;
  let project: KtxLocalProject;

  // Fallback timestamp passed to every read in this suite.
  const extractedAtFallback = '2026-04-29T13:00:00.000Z';
  // connection.json payload that carries no extractedAt field.
  const minimalConnectionJson = '{"connectionId":"warehouse","metadata":{}}\n';
  // Single-column `orders` table artifact with a null db and a primary-key id.
  const ordersWithoutDbJson =
    '{"name":"orders","catalog":null,"db":null,"kind":"table","comment":null,"estimatedRows":null,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":true,"comment":null}],"foreignKeys":[]}\n';

  /** Renders a fixture as two-space-indented JSON followed by a newline. */
  const prettyJson = (value: unknown): string => `${JSON.stringify(value, null, 2)}\n`;

  /** Writes one seed file through the project's file store using the fixed 'ktx' identity arguments. */
  const seedArtifact = (path: string, content: string, message: string) =>
    project.fileStore.writeFile(path, content, 'ktx', 'ktx@example.com', message);

  /** Reads the structural snapshot for the `warehouse` connection from the given raw directory. */
  const readSnapshot = (rawSourcesDir: string, driver: 'sqlite' | 'postgres') =>
    readLocalScanStructuralSnapshot({
      project,
      connectionId: 'warehouse',
      driver,
      rawSourcesDir,
      extractedAtFallback,
    });

  beforeEach(async () => {
    // Each test gets a fresh project inside its own temporary directory.
    tempDir = await mkdtemp(join(tmpdir(), 'ktx-local-structural-artifacts-'));
    project = await initKtxProject({
      projectDir: join(tempDir, 'project'),
    });
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('rebuilds a canonical snapshot from persisted live-database raw files', async () => {
    const rawRoot = 'raw-sources/warehouse/live-database/sync-1';

    await seedArtifact(
      `${rawRoot}/connection.json`,
      prettyJson({
        connectionId: 'warehouse',
        extractedAt: '2026-04-29T12:00:00.000Z',
        metadata: { source: 'sqlite-smoke' },
        tableCount: 2,
      }),
      'Seed connection artifact',
    );
    await seedArtifact(
      `${rawRoot}/tables/customers.json`,
      prettyJson({
        name: 'customers',
        catalog: null,
        db: 'public',
        kind: 'table',
        comment: 'Customer table',
        estimatedRows: 12,
        columns: [
          {
            name: 'id',
            nativeType: 'INTEGER',
            normalizedType: 'integer',
            dimensionType: 'number',
            nullable: false,
            primaryKey: true,
            comment: 'Customer id',
          },
        ],
        foreignKeys: [],
      }),
      'Seed customers artifact',
    );
    await seedArtifact(
      `${rawRoot}/tables/orders.json`,
      prettyJson({
        name: 'orders',
        catalog: null,
        db: 'public',
        kind: 'table',
        comment: null,
        estimatedRows: 20,
        columns: [
          {
            name: 'id',
            nativeType: 'INTEGER',
            normalizedType: 'integer',
            dimensionType: 'number',
            nullable: false,
            primaryKey: true,
            comment: null,
          },
          {
            name: 'customer_id',
            nativeType: 'INTEGER',
            normalizedType: 'integer',
            dimensionType: 'number',
            nullable: false,
            primaryKey: false,
            comment: null,
          },
        ],
        foreignKeys: [
          {
            fromColumn: 'customer_id',
            toCatalog: null,
            toDb: 'public',
            toTable: 'customers',
            toColumn: 'id',
            constraintName: null,
          },
        ],
      }),
      'Seed orders artifact',
    );

    const rebuilt = await readSnapshot(rawRoot, 'sqlite');

    // extractedAt comes from connection.json here, not from the fallback.
    expect(rebuilt).toMatchObject({
      connectionId: 'warehouse',
      driver: 'sqlite',
      extractedAt: '2026-04-29T12:00:00.000Z',
      metadata: { source: 'sqlite-smoke' },
      tables: [
        {
          db: 'public',
          name: 'customers',
          comment: 'Customer table',
          columns: [
            {
              name: 'id',
              nativeType: 'INTEGER',
              normalizedType: 'integer',
              dimensionType: 'number',
              nullable: false,
              primaryKey: true,
              comment: 'Customer id',
            },
          ],
        },
        {
          db: 'public',
          name: 'orders',
          foreignKeys: [
            {
              fromColumn: 'customer_id',
              toCatalog: null,
              toDb: 'public',
              toTable: 'customers',
              toColumn: 'id',
              constraintName: null,
            },
          ],
        },
      ],
    });
  });

  it('rebuilds scan warnings from persisted live-database warning files', async () => {
    const rawRoot = 'raw-sources/warehouse/live-database/sync-warnings';
    const persistedWarning = {
      code: 'constraint_discovery_unauthorized',
      message: 'Skipped foreign-key discovery in public (insufficient grants on system catalogs)',
      recoverable: true,
      metadata: { schema: 'public', kind: 'foreign_key' },
    };

    await seedArtifact(`${rawRoot}/connection.json`, minimalConnectionJson, 'Seed connection artifact');
    await seedArtifact(
      `${rawRoot}/warnings.json`,
      prettyJson({ warnings: [persistedWarning] }),
      'Seed warning artifact',
    );
    await seedArtifact(
      `${rawRoot}/tables/orders.json`,
      '{"name":"orders","catalog":null,"db":"public","kind":"table","comment":null,"estimatedRows":null,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":false,"comment":null}],"foreignKeys":[]}\n',
      'Seed orders artifact',
    );

    const rebuilt = await readSnapshot(rawRoot, 'postgres');

    // The warning round-trips unchanged from warnings.json.
    expect(rebuilt.warnings).toEqual([
      {
        code: 'constraint_discovery_unauthorized',
        message: 'Skipped foreign-key discovery in public (insufficient grants on system catalogs)',
        recoverable: true,
        metadata: { schema: 'public', kind: 'foreign_key' },
      },
    ]);
  });

  it('uses the scan report timestamp when connection.json omits extractedAt', async () => {
    const rawRoot = 'raw-sources/warehouse/live-database/sync-2';

    await seedArtifact(
      `${rawRoot}/connection.json`,
      minimalConnectionJson,
      'Seed connection artifact without extractedAt',
    );
    await seedArtifact(`${rawRoot}/tables/orders.json`, ordersWithoutDbJson, 'Seed orders artifact');

    const rebuilt = await readSnapshot(rawRoot, 'postgres');

    expect(rebuilt.extractedAt).toBe('2026-04-29T13:00:00.000Z');
  });

  it('tolerates older live-database staged directories without warnings.json', async () => {
    const rawRoot = 'raw-sources/warehouse/live-database/sync-no-warnings';

    // Deliberately no warnings.json in this directory.
    await seedArtifact(`${rawRoot}/connection.json`, minimalConnectionJson, 'Seed connection artifact');
    await seedArtifact(`${rawRoot}/tables/orders.json`, ordersWithoutDbJson, 'Seed orders artifact');

    const rebuilt = await readSnapshot(rawRoot, 'postgres');

    expect(rebuilt.warnings).toEqual([]);
  });
});

View file

@ -0,0 +1,451 @@
import { describe, expect, it } from 'vitest';
import {
buildKtxRelationshipBenchmarkReport,
formatKtxRelationshipBenchmarkReportMarkdown,
} from '../../../src/context/scan/relationship-benchmark-report.js';
import type {
KtxRelationshipBenchmarkCaseResult,
KtxRelationshipBenchmarkFixture,
KtxRelationshipBenchmarkSuiteResult,
} from '../../../src/context/scan/relationship-benchmarks.js';
/**
 * Overrides accepted by `caseResult`: any subset of the benchmark case-result
 * fields, where `metrics` may itself be partial so that individual metric
 * values can be overridden one at a time.
 */
type CaseResultOverrides = Omit<Partial<KtxRelationshipBenchmarkCaseResult>, 'metrics'> & {
metrics?: Partial<KtxRelationshipBenchmarkCaseResult['metrics']>;
};
/**
 * Builds a benchmark case result for tests.
 *
 * Every field falls back to a fixed default when the override is null or
 * undefined; `metrics` overrides are merged key-by-key over the default metrics.
 *
 * @param overrides - Fields to replace in the default case result.
 * @returns A complete case result.
 */
function caseResult(overrides: CaseResultOverrides = {}): KtxRelationshipBenchmarkCaseResult {
  const baselineMetrics: KtxRelationshipBenchmarkCaseResult['metrics'] = {
    pkPrecision: 1,
    pkRecall: 0.5,
    pkF1: 0.6666666666666666,
    fkPrecision: 1,
    fkRecall: 1,
    fkF1: 1,
    acceptedFalsePositiveCount: 0,
    reviewRecall: 0,
    acceptedOrReviewRecall: 1,
    runtimeSeconds: 0.012345,
    sqlQueries: 14,
    llmCalls: 0,
  };
  const {
    fixtureId,
    mode,
    metrics,
    expected,
    predicted,
    falsePositives,
    falseNegatives,
    skippedComposite,
    validationBlocked,
  } = overrides;

  return {
    fixtureId: fixtureId ?? 'demo_b2b_no_declared_constraints',
    mode: mode ?? 'declared_pks_and_declared_fks_removed',
    // Later keys win, so any provided metric replaces its baseline value.
    metrics: { ...baselineMetrics, ...metrics },
    expected: expected ?? {
      pk: ['accounts.(id)', 'users.(id)'],
      fk: ['users.(account_id)->accounts.(id)'],
    },
    predicted: predicted ?? {
      pk: ['accounts.(id)'],
      fk: ['users.(account_id)->accounts.(id)'],
      acceptedFk: ['users.(account_id)->accounts.(id)'],
      reviewFk: [],
    },
    falsePositives: falsePositives ?? { pk: [], fk: [] },
    falseNegatives: falseNegatives ?? { pk: ['users.(id)'], fk: [] },
    skippedComposite: skippedComposite ?? { pk: [], fk: [] },
    validationBlocked: validationBlocked ?? false,
  };
}
/**
 * Builds a benchmark fixture for tests.
 *
 * `thresholdEligible` and `validationBudget` are passed through as given
 * (possibly undefined); every other field falls back to a fixed default when
 * the override is null or undefined.
 *
 * @param overrides - Fields to replace in the default fixture.
 * @returns A complete fixture.
 */
function fixture(overrides: Partial<KtxRelationshipBenchmarkFixture> = {}): KtxRelationshipBenchmarkFixture {
  const {
    id,
    name,
    tier,
    origin,
    thresholdEligible,
    validationBudget,
    snapshot,
    expected,
    defaultModes,
    dataPath,
    columnEmbeddings,
  } = overrides;

  return {
    id: id ?? 'demo_b2b_no_declared_constraints',
    name: name ?? 'Packaged B2B demo with declared PK and FK metadata masked',
    tier: tier ?? 'smoke',
    origin: origin ?? 'synthetic',
    thresholdEligible,
    validationBudget,
    snapshot: snapshot ?? {
      connectionId: 'demo_b2b',
      driver: 'sqlite',
      extractedAt: '2026-05-07T00:00:00.000Z',
      scope: {},
      metadata: {},
      tables: [],
    },
    expected: expected ?? { expectedPks: [], expectedLinks: [] },
    defaultModes: defaultModes ?? ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
    dataPath: dataPath ?? '/tmp/demo.sqlite',
    columnEmbeddings: columnEmbeddings ?? {},
  };
}
describe('relationship benchmark report', () => {
it('classifies run, validation-blocked, and not-run benchmark cases', () => {
  // Same fixture under validation_disabled: its only FK sits in review rather than accepted.
  const blockedCase = caseResult({
    mode: 'validation_disabled',
    validationBlocked: true,
    metrics: { fkRecall: 0, acceptedOrReviewRecall: 1, sqlQueries: 0 },
    predicted: {
      pk: ['accounts.(id)'],
      fk: ['users.(account_id)->accounts.(id)'],
      acceptedFk: [],
      reviewFk: ['users.(account_id)->accounts.(id)'],
    },
  });
  const suiteResult: KtxRelationshipBenchmarkSuiteResult = {
    cases: [caseResult(), blockedCase],
    validationBlockedCases: ['demo_b2b_no_declared_constraints:validation_disabled'],
    aggregate: {
      caseCount: 2,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 1,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 0.5,
      meanFkRecall: 0.5,
      meanAcceptedOrReviewRecall: 1,
    },
  };

  // A third mode is requested that the fixture's defaultModes do not include.
  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [fixture()],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed', 'validation_disabled', 'profiling_disabled'],
  });

  expect(report.headline).toEqual({
    caseCount: 2,
    headlineCaseCount: 1,
    headlinePkRecall: 0.5,
    headlineFkRecall: 1,
    headlineAcceptedOrReviewRecall: 1,
    acceptedFalsePositiveCount: 0,
    validationBlockedCount: 1,
  });

  const statusKeys = report.cases.map(({ fixtureId, mode, status }) => `${fixtureId}:${mode}:${status}`);
  expect(statusKeys).toEqual([
    'demo_b2b_no_declared_constraints:declared_pks_and_declared_fks_removed:run',
    'demo_b2b_no_declared_constraints:validation_disabled:validation_blocked',
    'demo_b2b_no_declared_constraints:profiling_disabled:not_run',
  ]);
  expect(report.cases[2]?.reason).toBe('mode not selected by fixture defaultModes');
});
it('surfaces validation budget review candidates in the report reason', () => {
  const fixtureId = 'scale_stress_no_declared_constraints';
  const acceptedLink = 'fact_activity_000.(entity_00_key)->dim_entity_00.(entity_00_key)';
  const reviewLink = 'fact_activity_001.(entity_00_key)->dim_entity_00.(entity_00_key)';

  // One FK was accepted; the other is left as a review candidate.
  const suiteResult: KtxRelationshipBenchmarkSuiteResult = {
    cases: [
      caseResult({
        fixtureId,
        metrics: { fkRecall: 0.5, acceptedOrReviewRecall: 1 },
        predicted: {
          pk: ['dim_entity_00.(entity_00_key)'],
          fk: [acceptedLink, reviewLink],
          acceptedFk: [acceptedLink],
          reviewFk: [reviewLink],
        },
      }),
    ],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 0,
      headlinePkRecall: 1,
      headlineFkRecall: 0.5,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 1,
      meanFkRecall: 0.5,
      meanAcceptedOrReviewRecall: 1,
    },
  };
  const stressFixture = fixture({
    id: fixtureId,
    name: 'Scale stress fixture',
    tier: 'row_bearing',
    validationBudget: 800,
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [stressFixture],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed'],
  });

  // The single review candidate is explained in the case reason and in the Markdown output.
  expect(report.cases[0]?.reason).toBe('review candidate validation reasons: validation_unattempted (1)');
  const rendered = formatKtxRelationshipBenchmarkReportMarkdown(report);
  expect(rendered).toContain('validation_unattempted');
});
it('uses benchmark suite eligibility for product and smoke report rows', () => {
  // Both fixtures are flagged thresholdEligible; they differ in tier (product vs smoke).
  const productFixture = fixture({
    id: 'product_curated',
    name: 'Curated product fixture',
    tier: 'product',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
  });
  const smokeFixture = fixture({
    id: 'smoke_even_if_marked',
    name: 'Marked smoke fixture',
    tier: 'smoke',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const productCase = caseResult({ fixtureId: 'product_curated' });
  const productBlocked = caseResult({
    fixtureId: 'product_curated',
    mode: 'validation_disabled',
    validationBlocked: true,
    metrics: { fkRecall: 0, acceptedOrReviewRecall: 1, sqlQueries: 0 },
  });
  const smokeCase = caseResult({ fixtureId: 'smoke_even_if_marked' });

  const suiteResult: KtxRelationshipBenchmarkSuiteResult = {
    cases: [productCase, productBlocked, smokeCase],
    validationBlockedCases: ['product_curated:validation_disabled'],
    aggregate: {
      caseCount: 3,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 1,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 0.5,
      meanFkRecall: 0.6666666666666666,
      meanAcceptedOrReviewRecall: 1,
    },
  };

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [productFixture, smokeFixture],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
  });

  // Only the unblocked product row is expected to be tuning-eligible.
  const eligibilityKeys = report.cases.map(
    ({ fixtureId, mode, tuningEligible }) => `${fixtureId}:${mode}:${tuningEligible}`,
  );
  expect(eligibilityKeys).toEqual([
    'product_curated:declared_pks_and_declared_fks_removed:true',
    'product_curated:validation_disabled:false',
    'smoke_even_if_marked:declared_pks_and_declared_fks_removed:false',
    'smoke_even_if_marked:validation_disabled:false',
  ]);

  const rendered = formatKtxRelationshipBenchmarkReportMarkdown(report);
  expect(rendered).toContain('| product_curated | product | declared_pks_and_declared_fks_removed | run | yes |');
});
it('formats a compact Markdown report with false negatives and blocked modes', () => {
  // Single case that misses one PK and its only FK.
  const missedCase = caseResult({
    metrics: { fkRecall: 0, acceptedOrReviewRecall: 0 },
    falseNegatives: { pk: ['users.(id)'], fk: ['users.(account_id)->accounts.(id)'] },
  });
  const suiteResult: KtxRelationshipBenchmarkSuiteResult = {
    cases: [missedCase],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 0,
      headlineAcceptedOrReviewRecall: 0,
      meanPkRecall: 0.5,
      meanFkRecall: 0,
      meanAcceptedOrReviewRecall: 0,
    },
  };

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [fixture()],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed'],
  });
  const rendered = formatKtxRelationshipBenchmarkReportMarkdown(report);

  // Title, the summary table row, and one bullet per false negative must all be present.
  const expectedFragments = [
    '# KTX Relationship Discovery Benchmark Evidence',
    '| demo_b2b_no_declared_constraints | smoke | declared_pks_and_declared_fks_removed | run | no | 0.500 | 0.000 | 0.000 | 0 |',
    '- `demo_b2b_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: users.(id)',
    '- `demo_b2b_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: users.(account_id)->accounts.(id)',
  ];
  for (const fragment of expectedFragments) {
    expect(rendered).toContain(fragment);
  }
});
it('keeps headline failures separate from non-headline failure details', () => {
  // The default-mode case is perfect; only the embeddings_disabled case has misses.
  const cleanCase = caseResult({
    fixtureId: 'product_curated',
    falseNegatives: { pk: [], fk: [] },
    metrics: { pkRecall: 1, fkRecall: 1, acceptedOrReviewRecall: 1 },
  });
  const degradedCase = caseResult({
    fixtureId: 'product_curated',
    mode: 'embeddings_disabled',
    falseNegatives: {
      pk: ['customers.(id)'],
      fk: ['orders.(buyer_ref)->customers.(id)'],
    },
    metrics: { pkRecall: 0.5, fkRecall: 0, acceptedOrReviewRecall: 0 },
  });
  const suiteResult: KtxRelationshipBenchmarkSuiteResult = {
    cases: [cleanCase, degradedCase],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 2,
      headlineCaseCount: 1,
      headlinePkRecall: 1,
      headlineFkRecall: 1,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 0.75,
      meanFkRecall: 0.5,
      meanAcceptedOrReviewRecall: 0.5,
    },
  };
  const productFixture = fixture({
    id: 'product_curated',
    name: 'Curated product fixture',
    tier: 'product',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed', 'embeddings_disabled'],
  });

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [productFixture],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed', 'embeddings_disabled'],
  });
  const rendered = formatKtxRelationshipBenchmarkReportMarkdown(report);

  expect(rendered).toContain('## Failure Details');
  // The headline FK section is empty, while the embeddings_disabled misses are still listed.
  expect(rendered).toContain('### Headline False Negative FKs\n\n- none');
  expect(rendered).toContain(
    '- `product_curated` / `embeddings_disabled` / `run`: orders.(buyer_ref)->customers.(id)',
  );
  expect(rendered).toContain('- `product_curated` / `embeddings_disabled` / `run`: customers.(id)');
});
it('formats headline failure context from remaining headline false negatives', () => {
  // One case that misses exactly one PK and one FK.
  const headlineCase = caseResult({
    fixtureId: 'public_headline_fixture',
    metrics: { pkRecall: 0.5, fkRecall: 0, acceptedOrReviewRecall: 0 },
    falseNegatives: {
      pk: ['parent_table.(opaque_key)'],
      fk: ['child_table.(parent_table_id)->parent_table.(opaque_key)'],
    },
  });
  const suiteResult: KtxRelationshipBenchmarkSuiteResult = {
    cases: [headlineCase],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 0,
      headlineAcceptedOrReviewRecall: 0,
      meanPkRecall: 0.5,
      meanFkRecall: 0,
      meanAcceptedOrReviewRecall: 0,
    },
  };
  const headlineFixture = fixture({
    id: 'public_headline_fixture',
    name: 'Public headline fixture',
    tier: 'row_bearing',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [headlineFixture],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed'],
  });
  const rendered = formatKtxRelationshipBenchmarkReportMarkdown(report);

  // Section heading, the two remaining-miss counters, and one bullet per miss.
  const expectedFragments = [
    '## Headline Failure Context',
    '- Remaining headline false-negative PKs: 1',
    '- Remaining headline false-negative FKs: 1',
    '- `public_headline_fixture` / `declared_pks_and_declared_fks_removed` / `run`: parent_table.(opaque_key)',
    '- `public_headline_fixture` / `declared_pks_and_declared_fks_removed` / `run`: child_table.(parent_table_id)->parent_table.(opaque_key)',
  ];
  for (const fragment of expectedFragments) {
    expect(rendered).toContain(fragment);
  }
});
it('formats skipped composite ground truth separately from false-negative details', () => {
  const compositePk = 'order_lines.(order_id,line_number)';
  const compositeFk = 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)';
  // Builds a fresh { pk, fk } pair each time so no array is shared between fields.
  const compositePair = () => ({ pk: [compositePk], fk: [compositeFk] });

  // Nothing is predicted; the composite keys are both false negatives and skipped.
  const suiteResult: KtxRelationshipBenchmarkSuiteResult = {
    cases: [
      caseResult({
        fixtureId: 'composite_keys_no_declared_constraints',
        metrics: { pkRecall: 0, fkRecall: 0, acceptedOrReviewRecall: 0 },
        expected: compositePair(),
        predicted: {
          pk: [],
          fk: [],
          acceptedFk: [],
          reviewFk: [],
        },
        falseNegatives: compositePair(),
        skippedComposite: compositePair(),
      }),
    ],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 1,
      headlinePkRecall: 0,
      headlineFkRecall: 0,
      headlineAcceptedOrReviewRecall: 0,
      meanPkRecall: 0,
      meanFkRecall: 0,
      meanAcceptedOrReviewRecall: 0,
    },
  };
  const compositeFixture = fixture({
    id: 'composite_keys_no_declared_constraints',
    name: 'Composite key fixture with no declared constraints',
    tier: 'row_bearing',
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [compositeFixture],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed'],
  });

  expect(report.cases[0]?.skippedComposite).toEqual({
    pk: [compositePk],
    fk: [compositeFk],
  });

  const rendered = formatKtxRelationshipBenchmarkReportMarkdown(report);
  expect(rendered).toContain('## Composite Ground Truth Skips');
  expect(rendered).toContain(
    '### Skipped Composite PKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_lines.(order_id,line_number)',
  );
  expect(rendered).toContain(
    '### Skipped Composite FKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
  );
  // The same FK is still reported under the headline false negatives.
  expect(rendered).toContain(
    '### Headline False Negative FKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
  );
});
});

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,86 @@
import { describe, expect, it } from 'vitest';
import { applyKtxRelationshipValidationBudget, defaultKtxRelationshipValidationBudget } from '../../../src/context/scan/relationship-budget.js';
/** Minimal candidate shape for these tests: an identifier plus the confidence used as its score. */
interface Candidate {
id: string;
confidence: number;
}
describe('relationship validation budget', () => {
  /** Score function shared by every test: rank a candidate by its confidence. */
  const byConfidence = (candidate: Candidate) => candidate.confidence;

  /** Projects budget entries onto their candidate ids, preserving order. */
  const candidateIds = (entries: ReadonlyArray<{ candidate: Candidate }>) =>
    entries.map((entry) => entry.candidate.id);

  it('computes the default validation budget from table count', () => {
    // [tableCount, expectedBudget] pairs. NOTE(review): the values are consistent with
    // 2 x floor(tableCount) clamped to the range 0..1000; confirm against the implementation.
    const expectations: Array<[number, number]> = [
      [0, 0],
      [3, 6],
      [400, 800],
      [900, 1000],
      [-4, 0],
      [3.8, 6],
    ];
    for (const [tableCount, expectedBudget] of expectations) {
      expect(defaultKtxRelationshipValidationBudget(tableCount)).toBe(expectedBudget);
    }
  });

  it('splits candidates by descending score with stable tie ordering', () => {
    const candidates: Candidate[] = [
      { id: 'first', confidence: 0.8 },
      { id: 'second', confidence: 0.9 },
      { id: 'third', confidence: 0.9 },
      { id: 'fourth', confidence: 0.2 },
    ];

    const outcome = applyKtxRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 100,
      budget: 2,
      score: byConfidence,
    });

    expect(outcome.effectiveBudget).toBe(2);
    // The two 0.9 candidates tie; they keep their input order.
    expect(candidateIds(outcome.toValidate)).toEqual(['second', 'third']);
    expect(candidateIds(outcome.deferred)).toEqual(['first', 'fourth']);
    expect(outcome.toValidate.map((entry) => entry.originalIndex)).toEqual([1, 2]);
  });

  it('uses the default budget when the budget is omitted', () => {
    // Eight candidates with strictly decreasing confidence: 1.0, 0.9, ... 0.3.
    const candidates = Array.from({ length: 8 }, (_, index) => ({
      id: `candidate-${index}`,
      confidence: 1 - index / 10,
    }));

    const outcome = applyKtxRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 2,
      score: byConfidence,
    });

    expect(outcome.effectiveBudget).toBe(4);
    expect(outcome.toValidate).toHaveLength(4);
    expect(outcome.deferred).toHaveLength(4);
  });

  it('treats budget zero as disabling SQL validation', () => {
    const outcome = applyKtxRelationshipValidationBudget<Candidate>({
      candidates: [
        { id: 'first', confidence: 1 },
        { id: 'second', confidence: 0.5 },
      ],
      tableCount: 10,
      budget: 0,
      score: byConfidence,
    });

    // Nothing is validated; every candidate is deferred.
    expect(outcome.effectiveBudget).toBe(0);
    expect(outcome.toValidate).toEqual([]);
    expect(candidateIds(outcome.deferred)).toEqual(['first', 'second']);
  });

  it('treats budget all as validating every candidate', () => {
    const outcome = applyKtxRelationshipValidationBudget<Candidate>({
      candidates: [
        { id: 'first', confidence: 0.1 },
        { id: 'second', confidence: 0.9 },
      ],
      tableCount: 1,
      budget: 'all',
      score: byConfidence,
    });

    // Every candidate is validated and is expected back in input order.
    expect(outcome.effectiveBudget).toBe('all');
    expect(candidateIds(outcome.toValidate)).toEqual(['first', 'second']);
    expect(outcome.deferred).toEqual([]);
  });
});

View file

@ -0,0 +1,881 @@
import { describe, expect, it } from 'vitest';
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from '../../../src/context/scan/enrichment-types.js';
import { normalizeKtxRelationshipName } from '../../../src/context/scan/relationship-name-similarity.js';
import {
generateKtxRelationshipDiscoveryCandidates,
inferKtxRelationshipTargetPks,
mergeKtxRelationshipDiscoveryCandidates,
} from '../../../src/context/scan/relationship-candidates.js';
import type { KtxRelationshipProfileArtifact } from '../../../src/context/scan/relationship-profiling.js';
/**
 * Builds an enriched column fixture.
 *
 * Unset options fall back to a nullable, non-key INTEGER/number column with no
 * descriptions, embedding, samples or cardinality. `tableRef` defaults to
 * `public.<tableId>` with a null catalog.
 */
function column(
  tableId: string,
  id: string,
  name: string,
  options: Partial<KtxEnrichedColumn> = {},
): KtxEnrichedColumn {
  const {
    tableRef,
    nativeType,
    normalizedType,
    dimensionType,
    nullable,
    primaryKey,
    parentColumnId,
    descriptions,
    embedding,
    sampleValues,
    cardinality,
  } = options;
  return {
    id,
    tableId,
    tableRef: tableRef ?? { catalog: null, db: 'public', name: tableId },
    name,
    nativeType: nativeType ?? 'INTEGER',
    normalizedType: normalizedType ?? 'integer',
    dimensionType: dimensionType ?? 'number',
    nullable: nullable ?? true,
    primaryKey: primaryKey ?? false,
    parentColumnId: parentColumnId ?? null,
    descriptions: descriptions ?? {},
    embedding: embedding ?? null,
    sampleValues: sampleValues ?? null,
    cardinality: cardinality ?? null,
  };
}
/**
 * Builds an enabled enriched table fixture named `name` in the `public` db.
 * Each supplied column is copied and re-pointed at this table: its `tableId`
 * and `tableRef` are overwritten with this table's id and ref.
 */
function table(id: string, name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: 'public', name };
  const ownedColumns = columns.map((item) => ({ ...item, tableId: id, tableRef: ref }));
  return {
    id,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
/** Wraps the given tables in a schema fixture for the `warehouse` connection with no relationships. */
function schema(tables: KtxEnrichedTable[]): KtxEnrichedSchema {
  const fixture: KtxEnrichedSchema = { connectionId: 'warehouse', tables, relationships: [] };
  return fixture;
}
/**
 * Profile artifact fixture for the plan-code tables (`stg_plans`,
 * `mart_account_segments`, `stg_plan_segment_mapping`).
 *
 * Every profiled column is a fully populated, fully unique TEXT column over
 * four rows; only the table, column name, sample values and text-length bounds
 * differ between entries.
 */
function planCodeProfiles(): KtxRelationshipProfileArtifact {
  const tableRef = (name: string) => ({ catalog: null, db: 'public', name });
  // Fresh array per call so no two profiles share a `sampleValues` instance.
  const planCodes = (): string[] => ['basic', 'enterprise', 'free', 'pro'];
  // Text-length bounds are passed explicitly: the fixture values are not derived from the samples.
  const textProfile = (
    tableName: string,
    columnName: string,
    sampleValues: string[],
    minTextLength: number,
    maxTextLength: number,
  ): KtxRelationshipProfileArtifact['columns'][string] => ({
    table: tableRef(tableName),
    column: columnName,
    nativeType: 'TEXT',
    normalizedType: 'text',
    rowCount: 4,
    nullCount: 0,
    distinctCount: 4,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues,
    minTextLength,
    maxTextLength,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [
      { table: tableRef('stg_plans'), rowCount: 4 },
      { table: tableRef('mart_account_segments'), rowCount: 4 },
      { table: tableRef('stg_plan_segment_mapping'), rowCount: 4 },
    ],
    warnings: [],
    columns: {
      'stg_plans.plan_code': textProfile('stg_plans', 'plan_code', planCodes(), 4, 10),
      'stg_plans.created_at': textProfile(
        'stg_plans',
        'created_at',
        ['2026-05-01', '2026-05-02', '2026-05-03', '2026-05-04'],
        10,
        10,
      ),
      'stg_plans.email': textProfile(
        'stg_plans',
        'email',
        ['a@example.test', 'b@example.test', 'c@example.test', 'd@example.test'],
        14,
        14,
      ),
      'stg_plans.is_deleted': textProfile(
        'stg_plans',
        'is_deleted',
        ['deleted-a', 'deleted-b', 'deleted-c', 'deleted-d'],
        9,
        9,
      ),
      'mart_account_segments.current_plan_code': textProfile(
        'mart_account_segments',
        'current_plan_code',
        planCodes(),
        4,
        10,
      ),
      'mart_account_segments.normalized_plan_code': textProfile(
        'mart_account_segments',
        'normalized_plan_code',
        planCodes(),
        4,
        10,
      ),
      'stg_plan_segment_mapping.canonical_plan_code': textProfile(
        'stg_plan_segment_mapping',
        'canonical_plan_code',
        planCodes(),
        4,
        10,
      ),
      'stg_plans.canonical_plan_code': textProfile('stg_plans', 'canonical_plan_code', planCodes(), 4, 10),
    },
  };
}
describe('relationship discovery candidates', () => {
// `dim_`/`fct_` tables with no declared keys: `fct_invoices.account_id` should link to `dim_accounts.id`.
it('normalizes warehouse prefixes and emits review candidates without declared primary keys', () => {
  const accountsTable = table('accounts-id', 'dim_accounts', [
    column('accounts-id', 'accounts-id-col', 'id', { primaryKey: false }),
    column('accounts-id', 'accounts-name-col', 'account_name', { nativeType: 'TEXT', normalizedType: 'text' }),
  ]);
  const invoicesTable = table('invoices-id', 'fct_invoices', [
    column('invoices-id', 'invoice-id-col', 'id', { primaryKey: false }),
    column('invoices-id', 'account-id-col', 'account_id', { primaryKey: false }),
  ]);

  const found = generateKtxRelationshipDiscoveryCandidates(schema([accountsTable, invoicesTable]));
  const [first] = found;

  expect(found).toHaveLength(1);
  expect(first).toMatchObject({
    from: { tableId: 'invoices-id', columnIds: ['account-id-col'], columns: ['account_id'] },
    to: { tableId: 'accounts-id', columnIds: ['accounts-id-col'], columns: ['id'] },
    relationshipType: 'many_to_one',
    status: 'review',
    source: 'normalized_table_match',
    evidence: {
      sourceColumnBase: 'account',
      targetTableBase: 'account',
      targetKeyScore: 0.92,
    },
  });
  expect(first?.confidence).toBeGreaterThanOrEqual(0.8);
  expect(first?.evidence.signalVector).toMatchObject({
    nameSimilarity: 0.92,
    typeCompatibility: 1,
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: 0.92,
  });
  // The reported confidence must be the breakdown's own score.
  expect(first?.evidence.scoreBreakdown?.score).toBe(first?.confidence);
  expect(first?.evidence.scoreBreakdown?.contributions.nameSimilarity).toBeGreaterThan(0);
  const expectedReasons = ['foreign_key_suffix', 'normalized_table_name', 'target_key_like'];
  expect(first?.evidence.reasons).toEqual(expect.arrayContaining(expectedReasons));
});
// PascalCase `...Id` columns with no declared keys should still yield Album.ArtistId -> Artist.ArtistId.
it('generates candidates for PascalCase ID columns without declared keys', () => {
  const artistTable = table('artist-id', 'Artist', [
    column('artist-id', 'artist-id-col', 'ArtistId', { primaryKey: false }),
    column('artist-id', 'artist-name-col', 'Name', { nativeType: 'TEXT', normalizedType: 'text' }),
  ]);
  const albumTable = table('album-id', 'Album', [
    column('album-id', 'album-id-col', 'AlbumId', { primaryKey: false }),
    column('album-id', 'artist-id-fk-col', 'ArtistId', { primaryKey: false }),
  ]);

  const found = generateKtxRelationshipDiscoveryCandidates(schema([artistTable, albumTable]));
  const links = found.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  const [first] = found;

  expect(links).toEqual(['Album.ArtistId->Artist.ArtistId']);
  expect(first).toMatchObject({
    source: 'normalized_table_match',
    evidence: {
      sourceColumnBase: 'artist',
      targetTableBase: 'artist',
      targetColumnBase: 'artist_id',
      targetKeyScore: 0.9,
      reasons: expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like']),
    },
  });
  expect(first?.confidence).toBeGreaterThanOrEqual(0.9);
});
// With the parent-table cap at zero, the otherwise matching accounts/invoices pair yields no candidates.
it('uses the locality cap before scanning parent tables', () => {
  const parentTable = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
  const childTable = table('invoices-id', 'invoices', [
    column('invoices-id', 'invoice-id-col', 'id'),
    column('invoices-id', 'account-id-col', 'account_id'),
  ]);
  const options = { maxCandidateParentTables: 0 };

  const found = generateKtxRelationshipDiscoveryCandidates(schema([parentTable, childTable]), options);

  expect(found).toEqual([]);
});
// 25 unrelated tables sit between Album and Artist in schema order; a cap of one must still keep Artist.
it('keeps the nearest parent when the locality cap is one', () => {
  const artistTable = table('artist-id', 'Artist', [
    column('artist-id', 'artist-id-col', 'ArtistId', { primaryKey: false }),
    column('artist-id', 'artist-name-col', 'Name', { nativeType: 'TEXT', normalizedType: 'text' }),
  ]);
  const albumTable = table('album-id', 'Album', [
    column('album-id', 'album-id-col', 'AlbumId', { primaryKey: false }),
    column('album-id', 'artist-id-fk-col', 'ArtistId', { primaryKey: false }),
  ]);
  const buildFiller = (index: number): KtxEnrichedTable => {
    const fillerId = `filler-${index}`;
    return table(fillerId, `WarehouseFiller${index}`, [
      column(fillerId, `filler-${index}-id`, 'WarehouseFillerId', { primaryKey: false }),
    ]);
  };
  const fillerTables = Array.from({ length: 25 }, (_, index) => buildFiller(index));

  const found = generateKtxRelationshipDiscoveryCandidates(schema([albumTable, ...fillerTables, artistTable]), {
    maxCandidateParentTables: 1,
  });
  const links = found.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );

  expect(links).toEqual(['Album.ArtistId->Artist.ArtistId']);
});
// Table names carry a dotted `SalesLT.` qualifier; `CustomerID` should still resolve to `SalesLT.Customer`.
it('uses final table tokens from dotted parent table names', () => {
  const customerTable = table('customer-id', 'SalesLT.Customer', [
    column('customer-id', 'customer-id-col', 'CustomerID', { primaryKey: false }),
    column('customer-id', 'customer-name-col', 'CustomerName', { nativeType: 'TEXT', normalizedType: 'text' }),
  ]);
  const orderTable = table('order-id', 'SalesLT.SalesOrderHeader', [
    column('order-id', 'order-id-col', 'SalesOrderID', { primaryKey: false }),
    column('order-id', 'customer-id-fk-col', 'CustomerID', { primaryKey: false }),
  ]);

  const found = generateKtxRelationshipDiscoveryCandidates(schema([customerTable, orderTable]));
  const links = found.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  const [first] = found;

  expect(links).toEqual(['SalesLT.SalesOrderHeader.CustomerID->SalesLT.Customer.CustomerID']);
  expect(first).toMatchObject({
    evidence: {
      sourceColumnBase: 'customer',
      targetTableBase: 'sales_lt_customer',
      targetColumnBase: 'customer_id',
      targetKeyScore: 0.9,
      reasons: expect.arrayContaining(['foreign_key_suffix', 'inflection', 'target_key_like']),
    },
  });
});
// The parent's declared key is `BusinessEntityID`, which does not echo the table name, so the
// expected link comes from the child column `CustomerAccountID` matching the parent table
// name `crm.CustomerAccount` (source `parent_table_name_match`).
it('emits lower-confidence parent-table-name candidates when the target key name differs from the table name', () => {
const customerAccounts = table('customer-account-id', 'crm.CustomerAccount', [
column('customer-account-id', 'business-entity-id-col', 'BusinessEntityID', { primaryKey: true }),
column('customer-account-id', 'account-name-col', 'AccountName', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const subscriptions = table('subscriptions-id', 'fct_subscriptions', [
column('subscriptions-id', 'subscription-id-col', 'SubscriptionID', { primaryKey: false }),
column('subscriptions-id', 'customer-account-id-col', 'CustomerAccountID', { primaryKey: false }),
]);
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([customerAccounts, subscriptions]));
// Render each candidate as "fromTable.fromColumn->toTable.toColumn" for a compact comparison.
expect(
candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
),
).toEqual(['fct_subscriptions.CustomerAccountID->crm.CustomerAccount.BusinessEntityID']);
expect(candidates[0]).toMatchObject({
source: 'parent_table_name_match',
relationshipType: 'many_to_one',
status: 'review',
evidence: {
sourceColumnBase: 'customer_account',
targetTableBase: 'crm_customer_account',
targetColumnBase: 'business_entity_id',
targetKeyScore: 1,
nameScore: 0.82,
reasons: expect.arrayContaining(['foreign_key_suffix', 'parent_table_name_match', 'target_key_like']),
},
});
expect(candidates[0]?.evidence.signalVector).toMatchObject({
nameSimilarity: 0.82,
typeCompatibility: 1,
});
// The reported confidence must be the breakdown's own score.
expect(candidates[0]?.evidence.scoreBreakdown?.score).toBe(candidates[0]?.confidence);
});
// Same naming setup as a parent-table-name match, but the parent key is TEXT and the child column is INTEGER.
it('does not emit parent-table-name candidates when the target key type is incompatible', () => {
  const parentTable = table('customer-account-id', 'crm.CustomerAccount', [
    column('customer-account-id', 'business-entity-id-col', 'BusinessEntityID', {
      primaryKey: true,
      nativeType: 'TEXT',
      normalizedType: 'text',
      dimensionType: 'string',
    }),
  ]);
  const childTable = table('subscriptions-id', 'fct_subscriptions', [
    column('subscriptions-id', 'customer-account-id-col', 'CustomerAccountID', {
      primaryKey: false,
      nativeType: 'INTEGER',
      normalizedType: 'integer',
      dimensionType: 'number',
    }),
  ]);

  const found = generateKtxRelationshipDiscoveryCandidates(schema([parentTable, childTable]));
  const links = found.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );

  expect(links).not.toContain('fct_subscriptions.CustomerAccountID->crm.CustomerAccount.BusinessEntityID');
});
// A single table whose column echoes its own name must not be linked to itself through that same column.
it('does not use parent-table-name matching to create same-table same-column self-links', () => {
  const onlyTable = table('customer-account-id', 'crm.CustomerAccount', [
    column('customer-account-id', 'customer-account-id-col', 'CustomerAccountID', { primaryKey: false }),
    column('customer-account-id', 'account-name-col', 'AccountName', {
      nativeType: 'TEXT',
      normalizedType: 'text',
      dimensionType: 'string',
    }),
  ]);

  const found = generateKtxRelationshipDiscoveryCandidates(schema([onlyTable]));
  const links = found.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );

  expect(links).not.toContain('crm.CustomerAccount.CustomerAccountID->crm.CustomerAccount.CustomerAccountID');
});
// The target column `iso_code` is not id-like, so the expected candidate has to be justified by
// the supplied profiles: `iso_code` is fully unique (3 distinct over 3 rows) and every sample
// value of `country_code` ('FR', 'US') also appears among the `iso_code` samples.
it('uses profile evidence to generate natural-key candidates without id-like target names', () => {
const countries = table('countries-id', 'dim_countries', [
column('countries-id', 'countries-code-col', 'iso_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('countries-id', 'countries-name-col', 'name', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const accounts = table('accounts-id', 'fct_accounts', [
column('accounts-id', 'account-id-col', 'id', { primaryKey: false }),
column('accounts-id', 'country-code-col', 'country_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
// Only the two columns involved in the expected link are profiled.
const profiles = {
connectionId: 'warehouse',
driver: 'sqlite',
sqlAvailable: true,
queryCount: 0,
tables: [],
warnings: [],
columns: {
'dim_countries.iso_code': {
table: { catalog: null, db: 'public', name: 'dim_countries' },
column: 'iso_code',
nativeType: 'TEXT',
normalizedType: 'text',
rowCount: 3,
nullCount: 0,
distinctCount: 3,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['DE', 'FR', 'US'],
minTextLength: 2,
maxTextLength: 2,
},
'fct_accounts.country_code': {
table: { catalog: null, db: 'public', name: 'fct_accounts' },
column: 'country_code',
nativeType: 'TEXT',
normalizedType: 'text',
rowCount: 4,
nullCount: 0,
distinctCount: 3,
uniquenessRatio: 0.75,
nullRate: 0,
sampleValues: ['FR', 'US'],
minTextLength: 2,
maxTextLength: 2,
},
},
} satisfies KtxRelationshipProfileArtifact;
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([countries, accounts]), { profiles });
expect(candidates).toHaveLength(1);
expect(candidates[0]).toMatchObject({
source: 'profile_match',
from: { tableId: 'accounts-id', columnIds: ['country-code-col'], columns: ['country_code'] },
to: { tableId: 'countries-id', columnIds: ['countries-code-col'], columns: ['iso_code'] },
evidence: {
sourceColumnBase: 'country',
targetTableBase: 'country',
targetColumnBase: 'iso_code',
targetKeyScore: 0.86,
},
});
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.78);
expect(candidates[0]?.evidence.reasons).toEqual(
expect.arrayContaining([
'foreign_key_code_suffix',
'normalized_table_name',
'profile_unique_target',
'profile_sample_overlap',
]),
);
});
// `stg_accounts.account_id` names its own table; it must not be proposed as a link to itself.
it('drops same-table same-column self-links using ordered endpoint equality', () => {
  const onlyTable = table('accounts-id', 'stg_accounts', [
    column('accounts-id', 'accounts-account-id-col', 'account_id', { primaryKey: false }),
    column('accounts-id', 'accounts-name-col', 'account_name', {
      nativeType: 'TEXT',
      normalizedType: 'text',
      dimensionType: 'string',
    }),
  ]);

  const found = generateKtxRelationshipDiscoveryCandidates(schema([onlyTable]));
  const links = found.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );

  expect(links).not.toContain('stg_accounts.account_id->stg_accounts.account_id');
});
// `employees.parent_id -> employees.id` connects two different columns of one table and must survive.
it('keeps legitimate same-table different-column self-references', () => {
  const employeesTable = table('employees-id', 'employees', [
    column('employees-id', 'employees-id-col', 'id', { primaryKey: false }),
    column('employees-id', 'employees-parent-id-col', 'parent_id', { primaryKey: false }),
  ]);

  const found = generateKtxRelationshipDiscoveryCandidates(schema([employeesTable]));
  const links = found.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  const [first] = found;

  expect(links).toContain('employees.parent_id->employees.id');
  expect(first).toMatchObject({
    source: 'self_reference',
    evidence: {
      reasons: expect.arrayContaining(['self_reference']),
    },
  });
});
// Source columns such as `current_plan_code` end with the target column name `plan_code`.
// Profiles for the plan-code columns come from planCodeProfiles(). The fixture also contains
// `created_at`, `email` and `is_deleted` pairs that share a trailing name in the same way;
// the assertions below require that none of those is linked.
it('emits column_suffix_match candidates for relationship-key-shaped trailing target columns', () => {
const plans = table('plans-id', 'stg_plans', [
column('plans-id', 'plans-plan-code-col', 'plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('plans-id', 'plans-canonical-plan-code-col', 'canonical_plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('plans-id', 'plans-created-at-col', 'created_at', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('plans-id', 'plans-email-col', 'email', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('plans-id', 'plans-is-deleted-col', 'is_deleted', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const accountSegments = table('account-segments-id', 'mart_account_segments', [
column('account-segments-id', 'current-plan-code-col', 'current_plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('account-segments-id', 'normalized-plan-code-col', 'normalized_plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('account-segments-id', 'source-created-at-col', 'source_created_at', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('account-segments-id', 'billing-email-col', 'billing_email', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('account-segments-id', 'source-is-deleted-col', 'source_is_deleted', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const mapping = table('mapping-id', 'stg_plan_segment_mapping', [
column('mapping-id', 'mapping-canonical-plan-code-col', 'canonical_plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([plans, accountSegments, mapping]), {
profiles: planCodeProfiles(),
});
// Render each candidate as "fromTable.fromColumn->toTable.toColumn".
const candidateKeys = candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
);
// toEqual pins both the exact set and the order of the emitted candidates.
expect(candidateKeys).toEqual([
'mart_account_segments.current_plan_code->stg_plans.plan_code',
'mart_account_segments.normalized_plan_code->stg_plans.plan_code',
'stg_plan_segment_mapping.canonical_plan_code->stg_plans.plan_code',
'stg_plans.canonical_plan_code->stg_plans.plan_code',
]);
expect(candidates).toEqual(
expect.arrayContaining([
expect.objectContaining({
source: 'column_suffix_match',
confidence: expect.any(Number),
evidence: expect.objectContaining({
nameScore: 0.78,
targetKeyScore: 0.86,
reasons: expect.arrayContaining(['column_suffix_match', 'profile_unique_target']),
}),
}),
]),
);
expect(candidateKeys).not.toContain('mart_account_segments.source_created_at->stg_plans.created_at');
expect(candidateKeys).not.toContain('mart_account_segments.billing_email->stg_plans.email');
expect(candidateKeys).not.toContain('mart_account_segments.source_is_deleted->stg_plans.is_deleted');
const suffixCandidate = candidates.find(
(candidate) => candidate.from.table.name === 'mart_account_segments' && candidate.from.columns[0] === 'current_plan_code',
);
// The reported confidence must be the breakdown's own score.
expect(suffixCandidate?.confidence).toBe(suffixCandidate?.evidence.scoreBreakdown?.score);
expect(suffixCandidate?.evidence.signalVector).toMatchObject({
nameSimilarity: 0.78,
typeCompatibility: 1,
valueOverlap: 1,
profileUniqueness: 1,
profileNullRate: 1,
});
});
// Two negative cases in one schema:
//  - `users.account_id` must not be linked to `users.id` (a bare single-token target), and
//  - text `accounts.current_plan_code` must not be linked to integer `plans.plan_code`.
it('does not suffix-match bare single-token targets or incompatible target types', () => {
const users = table('users-id', 'users', [
column('users-id', 'users-id-col', 'id', { primaryKey: false }),
column('users-id', 'users-account-id-col', 'account_id', { primaryKey: false }),
]);
const plans = table('plans-id', 'plans', [
column('plans-id', 'plans-plan-code-col', 'plan_code', {
nativeType: 'INTEGER',
normalizedType: 'integer',
dimensionType: 'number',
}),
]);
const accounts = table('accounts-id', 'accounts', [
column('accounts-id', 'current-plan-code-col', 'current_plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
// Start from the shared plan-code profiles and add fully unique integer profiles for both
// would-be targets.
const profiles = {
...planCodeProfiles(),
columns: {
...planCodeProfiles().columns,
'users.id': {
table: { catalog: null, db: 'public', name: 'users' },
column: 'id',
nativeType: 'INTEGER',
normalizedType: 'integer',
rowCount: 2,
nullCount: 0,
distinctCount: 2,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['1', '2'],
minTextLength: 1,
maxTextLength: 1,
},
'plans.plan_code': {
table: { catalog: null, db: 'public', name: 'plans' },
column: 'plan_code',
nativeType: 'INTEGER',
normalizedType: 'integer',
rowCount: 2,
nullCount: 0,
distinctCount: 2,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['1', '2'],
minTextLength: 1,
maxTextLength: 1,
},
},
} satisfies KtxRelationshipProfileArtifact;
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([users, plans, accounts]), { profiles });
// Render each candidate as "fromTable.fromColumn->toTable.toColumn".
const candidateKeys = candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
);
expect(candidateKeys).not.toContain('users.account_id->users.id');
expect(candidateKeys).not.toContain('accounts.current_plan_code->plans.plan_code');
});
// `orders.buyer_ref` does not name the customers table, so the expected link has to come from
// the embeddings: `buyer_ref` ([0.995, 0.005, 0]) points almost exactly along `customers.id`
// ([1, 0, 0]), while the other columns use orthogonal unit vectors.
it('uses column embeddings as a recall source for non-standard source names', () => {
const customers = table('customers-id', 'customers', [
column('customers-id', 'customers-id-col', 'id', {
primaryKey: false,
embedding: [1, 0, 0],
}),
column('customers-id', 'customers-name-col', 'name', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
embedding: [0, 1, 0],
}),
]);
const orders = table('orders-id', 'orders', [
column('orders-id', 'orders-id-col', 'id', {
primaryKey: false,
embedding: [0, 0, 1],
}),
column('orders-id', 'buyer-ref-col', 'buyer_ref', {
primaryKey: false,
embedding: [0.995, 0.005, 0],
}),
]);
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([customers, orders]), {
embeddingSimilarityThreshold: 0.95,
});
expect(candidates).toHaveLength(1);
expect(candidates[0]).toMatchObject({
source: 'embedding_similarity',
from: { tableId: 'orders-id', columnIds: ['buyer-ref-col'], columns: ['buyer_ref'] },
to: { tableId: 'customers-id', columnIds: ['customers-id-col'], columns: ['id'] },
relationshipType: 'many_to_one',
status: 'review',
evidence: {
sourceColumnBase: 'buyer_ref',
targetTableBase: 'customer',
targetColumnBase: 'id',
targetKeyScore: 0.92,
embeddingSimilarity: expect.any(Number),
},
});
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.9);
expect(candidates[0]?.evidence.reasons).toEqual(
expect.arrayContaining(['embedding_similarity', 'target_key_like']),
);
});
// Both `accounts` and `accounts_archive` expose an `id`; with a per-column cap of one only `accounts` may remain.
it('singularizes names and caps candidates per source column deterministically', () => {
  const accountsTable = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
  const archiveTable = table('archived-accounts-id', 'accounts_archive', [
    column('archived-accounts-id', 'archived-accounts-id-col', 'id'),
  ]);
  const eventsTable = table('events-id', 'product_events', [
    column('events-id', 'event-id-col', 'id'),
    column('events-id', 'account-id-col', 'account_id'),
  ]);
  // The archive table is listed before `accounts` in the schema.
  const fixture = schema([eventsTable, archiveTable, accountsTable]);

  const found = generateKtxRelationshipDiscoveryCandidates(fixture, { maxCandidatesPerColumn: 1 });
  const links = found.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );

  expect(links).toEqual(['product_events.account_id->accounts.id']);
});
// `product_events` has `account_id` and `user_id`; each expected target `id` gets one incoming candidate.
it('infers target primary-key candidates from incoming review links', () => {
  const accountsTable = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
  const usersTable = table('users-id', 'users', [column('users-id', 'users-id-col', 'id')]);
  const eventsTable = table('events-id', 'product_events', [
    column('events-id', 'event-id-col', 'id'),
    column('events-id', 'account-id-col', 'account_id'),
    column('events-id', 'user-id-col', 'user_id'),
  ]);
  const expectedPk = (tableName: string) => ({
    table: tableName,
    columns: ['id'],
    score: expect.any(Number),
    status: 'review',
    incomingCandidateCount: 1,
  });

  const found = generateKtxRelationshipDiscoveryCandidates(schema([accountsTable, usersTable, eventsTable]));
  const inferredPks = inferKtxRelationshipTargetPks(found);

  expect(inferredPks).toEqual([expectedPk('accounts'), expectedPk('users')]);
  expect(inferredPks.every((pk) => pk.score >= 0.8)).toBe(true);
});
// `invoices.id` is a declared primary key and `invoices.account_id` (INTEGER) faces a TEXT `accounts.id`.
it('does not generate candidates from primary-key source columns or incompatible target types', () => {
  const accountsTable = table('accounts-id', 'accounts', [
    column('accounts-id', 'accounts-id-col', 'id', { nativeType: 'TEXT', normalizedType: 'text' }),
  ]);
  const invoicesTable = table('invoices-id', 'invoices', [
    column('invoices-id', 'invoice-id-col', 'id', { primaryKey: true }),
    column('invoices-id', 'account-id-col', 'account_id', { nativeType: 'INTEGER', normalizedType: 'integer' }),
  ]);

  const found = generateKtxRelationshipDiscoveryCandidates(schema([accountsTable, invoicesTable]));

  expect(found).toEqual([]);
});
// Checks normalizeKtxRelationshipName directly on three inputs: a `mart__`-prefixed mixed-case
// name, a `dim_`-prefixed plural, and `Address`, whose trailing "s" is not a plural marker.
it('normalizes layer prefixes, punctuation, plural forms, and non-plural trailing s words', () => {
expect(normalizeKtxRelationshipName('mart__Sales_Accounts')).toMatchObject({
normalized: 'sales_accounts',
singular: 'sales_account',
tokens: ['sales', 'accounts'],
});
expect(normalizeKtxRelationshipName('dim_users')).toMatchObject({
normalized: 'users',
singular: 'user',
tokens: ['users'],
});
// `address` must stay unchanged when singularized and pluralize to `addresses`.
expect(normalizeKtxRelationshipName('Address')).toMatchObject({
normalized: 'address',
singular: 'address',
plural: 'addresses',
tokens: ['address'],
});
});
// Merges a generated candidate with an LLM-sourced copy of the same candidate and checks that the
// result keeps the deterministic id/source, takes the higher confidence, and carries both the
// deterministic reasons and the LLM evidence fields.
it('merges duplicate deterministic and LLM proposal candidates without losing LLM rationale', () => {
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
const invoices = table('invoices-id', 'invoices', [column('invoices-id', 'account-id-col', 'account_id')]);
const [deterministic] = generateKtxRelationshipDiscoveryCandidates(schema([accounts, invoices]));
// Fail with a clear message (and narrow the type) if the generator produced nothing.
if (!deterministic) {
throw new Error('Expected deterministic relationship candidate');
}
// Same endpoints and id as the deterministic candidate, but attributed to the LLM.
const llmCandidate = {
...deterministic,
confidence: 0.99,
source: 'llm_proposal' as const,
evidence: {
...deterministic.evidence,
reasons: ['llm_proposal', 'llm_pk_proposal'],
llmConfidence: 0.89,
llmRationale: 'Invoices point at the owning account dimension.',
},
};
const merged = mergeKtxRelationshipDiscoveryCandidates([deterministic, llmCandidate]);
expect(merged).toHaveLength(1);
expect(merged[0]).toMatchObject({
id: deterministic.id,
source: 'normalized_table_match',
confidence: 0.99,
evidence: {
llmConfidence: 0.89,
llmRationale: 'Invoices point at the owning account dimension.',
},
});
expect(merged[0]?.evidence.reasons).toEqual(
expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like', 'llm_proposal']),
);
});
});

View file

@ -0,0 +1,85 @@
import Database from 'better-sqlite3';
import { join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { describe, expect, it } from 'vitest';
import { getDialectForDriver } from '../../../src/context/connections/dialects.js';
import { snapshotToKtxEnrichedSchema } from '../../../src/context/scan/local-enrichment.js';
import { loadKtxRelationshipBenchmarkFixture, maskKtxRelationshipBenchmarkSnapshot } from '../../../src/context/scan/relationship-benchmarks.js';
import { discoverKtxCompositeRelationships } from '../../../src/context/scan/relationship-composite-candidates.js';
import { profileKtxRelationshipSchema, type KtxRelationshipReadOnlyExecutor } from '../../../src/context/scan/relationship-profiling.js';
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanContext } from '../../../src/context/scan/types.js';
/**
 * Read-only executor for tests, backed by an existing SQLite file opened via better-sqlite3.
 * Call `close()` when finished to release the database handle.
 */
class TestSqliteExecutor implements KtxRelationshipReadOnlyExecutor {
  private readonly db: Database.Database;

  /** Opens `dataPath` read-only; the file must already exist. */
  constructor(dataPath: string) {
    const openOptions = { readonly: true, fileMustExist: true };
    this.db = new Database(dataPath, openOptions);
  }

  /**
   * Runs `input.sql` and returns the rows as arrays ordered by `headers`.
   * Headers are taken from the first row's keys, so an empty result has no headers.
   */
  async executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    const headers = Object.keys(records[0] ?? {});
    const matrix = records.map((record) => headers.map((header) => record[header]));
    const total = records.length;
    return {
      headers,
      rows: matrix,
      totalRows: total,
      rowCount: total,
    };
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
// End-to-end check of composite key discovery against the SQLite-backed
// `composite_keys_no_declared_constraints` benchmark fixture.
describe('composite relationship discovery detector', () => {
  it('infers composite primary keys and validates composite foreign keys from row evidence', async () => {
    // Convert the file: URL with fileURLToPath rather than reading URL#pathname: pathname keeps
    // percent-escapes (e.g. spaces as %20) and is not a valid path on Windows.
    const fixtureRoot = fileURLToPath(new URL('../../fixtures/relationship-benchmarks', import.meta.url));
    const fixture = await loadKtxRelationshipBenchmarkFixture(
      join(fixtureRoot, 'composite_keys_no_declared_constraints'),
    );
    // Fail with a clear message instead of handing an empty path to the SQLite executor.
    if (!fixture.dataPath) {
      throw new Error('Expected composite key benchmark fixture to include a SQLite data file');
    }
    // Strip declared PKs/FKs from the snapshot so they have to be inferred.
    const snapshot = maskKtxRelationshipBenchmarkSnapshot(fixture.snapshot, 'declared_pks_and_declared_fks_removed');
    const schema = snapshotToKtxEnrichedSchema(snapshot, new Map());
    const dialect = getDialectForDriver(snapshot.driver);
    const executor = new TestSqliteExecutor(fixture.dataPath);
    try {
      const profiles = await profileKtxRelationshipSchema({
        connectionId: snapshot.connectionId,
        dialect,
        schema,
        executor,
        ctx: { runId: 'test:composite-profile' },
      });
      const result = await discoverKtxCompositeRelationships({
        connectionId: snapshot.connectionId,
        dialect,
        schema,
        profiles,
        executor,
        ctx: { runId: 'test:composite-detect' },
      });
      expect(result.primaryKeys.map((item) => `${item.table.name}.(${item.columns.join(',')})`)).toEqual([
        'order_line_allocations.(order_id,line_number,warehouse_code)',
        'order_lines.(order_id,line_number)',
      ]);
      expect(
        result.relationships.map(
          (item) =>
            `${item.from.table.name}.(${item.from.columns.join(',')})->${item.to.table.name}.(${item.to.columns.join(',')})`,
        ),
      ).toEqual(['order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)']);
      expect(result.relationships[0]).toMatchObject({
        relationshipType: 'many_to_one',
        status: 'accepted',
        confidence: 0.95,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          reasons: ['composite_validation_passed'],
        },
      });
      expect(result.queryCount).toBeGreaterThan(0);
    } finally {
      // Release the SQLite handle even when profiling, discovery or an assertion throws.
      executor.close();
    }
  });
});

View file

@ -0,0 +1,373 @@
import { describe, expect, it } from 'vitest';
import type { KtxEnrichedRelationship, KtxRelationshipEndpoint } from '../../../src/context/scan/enrichment-types.js';
import type { KtxResolvedRelationshipDiscoveryCandidate } from '../../../src/context/scan/relationship-graph-resolver.js';
import {
buildKtxRelationshipArtifacts,
buildKtxRelationshipDiagnostics,
emptyKtxRelationshipProfileArtifact,
} from '../../../src/context/scan/relationship-diagnostics.js';
/**
 * Builds a single-column relationship endpoint for `table.column`, with the
 * table reference left unqualified (null catalog and db).
 */
function endpoint(table: string, column: string): KtxRelationshipEndpoint {
  const tableRef = { catalog: null, db: null, name: table };
  const columnId = `${table}.${column}`;
  return { tableId: table, columnIds: [columnId], table: tableRef, columns: [column] };
}
/**
 * Builds an inferred many-to-one relationship between two single-column
 * endpoints. Confidence falls back to 0.92 when the caller does not supply it.
 */
function enrichedRelationship(input: {
  id: string;
  fromTable: string;
  fromColumn: string;
  toTable: string;
  toColumn: string;
  confidence?: number;
}): KtxEnrichedRelationship {
  const { id, fromTable, fromColumn, toTable, toColumn } = input;
  const from = endpoint(fromTable, fromColumn);
  const to = endpoint(toTable, toColumn);
  return {
    id,
    source: 'inferred',
    from,
    to,
    relationshipType: 'many_to_one',
    confidence: input.confidence ?? 0.92,
    isPrimaryKeyReference: true,
  };
}
/**
 * Builds a graph-resolved `orders.customer_id -> customers.id` candidate.
 *
 * - `source` defaults to `normalized_table_match`; `llm_proposal` switches the
 *   evidence payload to the LLM-flavoured variant.
 * - A `rejected` status produces failing validation numbers (low coverage,
 *   violations); any other status produces fully passing numbers.
 * - `pkScore` / `fkScore` default to 0.97 / 0.94, and `pkScore` is mirrored
 *   into `graph.targetPkScore`.
 */
function resolvedRelationship(input: {
  id: string;
  status: 'accepted' | 'review' | 'rejected';
  source?: 'normalized_table_match' | 'exact_column_match' | 'inflection' | 'self_reference' | 'llm_proposal';
  fkScore?: number;
  pkScore?: number;
  validationReasons?: string[];
  graphReasons?: string[];
}): KtxResolvedRelationshipDiscoveryCandidate {
  const isRejected = input.status === 'rejected';
  const targetPkScore = input.pkScore ?? 0.97;

  let evidence: KtxResolvedRelationshipDiscoveryCandidate['evidence'];
  if (input.source === 'llm_proposal') {
    evidence = {
      sourceColumnBase: 'buyer',
      targetTableBase: 'customer',
      targetColumnBase: 'id',
      targetKeyScore: 0.88,
      nameScore: 0.45,
      reasons: ['llm_proposal', 'llm_pk_proposal'],
      llmConfidence: 0.89,
      llmRationale: 'Buyer reference values align with customer identifiers.',
    };
  } else {
    evidence = {
      sourceColumnBase: 'customer',
      targetTableBase: 'customer',
      targetColumnBase: 'id',
      targetKeyScore: 0.9,
      nameScore: 0.85,
      reasons: ['table_name_matches_source_column'],
    };
  }

  return {
    id: input.id,
    from: endpoint('orders', 'customer_id'),
    to: endpoint('customers', 'id'),
    relationshipType: 'many_to_one',
    confidence: 0.88,
    source: input.source ?? 'normalized_table_match',
    status: input.status,
    evidence,
    score: 0.91,
    validation: {
      targetUniqueness: 1,
      sourceCoverage: isRejected ? 0.2 : 1,
      violationCount: isRejected ? 8 : 0,
      violationRatio: isRejected ? 0.8 : 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 10,
      parentDistinct: 10,
      overlap: isRejected ? 2 : 10,
      checkedValues: 10,
      reasons: input.validationReasons ?? ['validation_passed'],
    },
    pkScore: targetPkScore,
    fkScore: input.fkScore ?? 0.94,
    graph: {
      targetPkScore,
      incomingCandidateCount: 1,
      conflictRank: 1,
      reasons: input.graphReasons ?? ['target_pk_score_passed', 'fk_score_passed'],
    },
  };
}
// Tests for buildKtxRelationshipArtifacts / buildKtxRelationshipDiagnostics:
// how resolved, formal-update and composite relationships are bucketed into
// accepted / review / rejected / skipped artifacts, and how the diagnostics
// summary is derived from those artifacts.
describe('relationship diagnostics artifacts', () => {
// One accepted (LLM-proposed), one review and one rejected candidate go in;
// each is expected in its own bucket, with `reasons` containing the evidence,
// validation and graph reasons of the candidate.
it('groups graph-resolved relationships and preserves evidence reasons', () => {
const artifacts = buildKtxRelationshipArtifacts({
connectionId: 'warehouse',
resolvedRelationships: [
resolvedRelationship({ id: 'accepted-edge', status: 'accepted', source: 'llm_proposal' }),
resolvedRelationship({
id: 'review-edge',
status: 'review',
validationReasons: ['validation_unavailable'],
graphReasons: ['validation_unavailable_review_only', 'fk_score_review'],
}),
resolvedRelationship({
id: 'rejected-edge',
status: 'rejected',
validationReasons: ['low_source_coverage'],
graphReasons: ['fk_score_rejected'],
}),
],
});
expect(artifacts.accepted).toHaveLength(1);
expect(artifacts.accepted[0]).toMatchObject({
source: 'llm_proposal',
evidence: {
llmConfidence: 0.89,
llmRationale: 'Buyer reference values align with customer identifiers.',
},
reasons: expect.arrayContaining(['llm_proposal', 'llm_pk_proposal']),
});
expect(artifacts.review).toHaveLength(1);
expect(artifacts.rejected).toHaveLength(1);
expect(artifacts.review[0]).toMatchObject({
id: 'review-edge',
status: 'review',
source: 'normalized_table_match',
fkScore: 0.94,
reasons: expect.arrayContaining(['validation_unavailable', 'validation_unavailable_review_only']),
});
expect(artifacts.rejected[0]?.reasons).toEqual(
expect.arrayContaining(['table_name_matches_source_column', 'low_source_coverage', 'fk_score_rejected']),
);
});
// Only a relationshipUpdate is supplied (no resolved candidates): accepted and
// rejected entries are expected to carry a synthetic `*_relationship_update`
// reason, and skipped entries are expected to pass through unchanged.
it('adapts relationship updates into the artifact shape', () => {
const artifacts = buildKtxRelationshipArtifacts({
connectionId: 'warehouse',
relationshipUpdate: {
connectionId: 'warehouse',
accepted: [
enrichedRelationship({
id: 'orders-customer',
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
}),
],
rejected: [
enrichedRelationship({
id: 'orders-account',
fromTable: 'orders',
fromColumn: 'account_id',
toTable: 'accounts',
toColumn: 'id',
confidence: 0.4,
}),
],
skipped: [{ relationshipId: 'orders-region', reason: 'validation_port_unavailable' }],
},
});
expect(artifacts.accepted[0]).toMatchObject({
id: 'orders-customer',
status: 'accepted',
source: 'inferred',
reasons: ['accepted_relationship_update'],
});
expect(artifacts.rejected[0]).toMatchObject({
id: 'orders-account',
status: 'rejected',
reasons: ['rejected_relationship_update'],
});
expect(artifacts.skipped).toEqual([{ relationshipId: 'orders-region', reason: 'validation_port_unavailable' }]);
});
// The same edge id appears both as a resolved candidate and as a formal
// accepted update; a single artifact is expected, keeping the resolved
// candidate's source and reasons.
it('deduplicates resolved and formal relationship update artifacts by edge id', () => {
const artifacts = buildKtxRelationshipArtifacts({
connectionId: 'warehouse',
resolvedRelationships: [
{
id: 'orders:orders.account_id->accounts:accounts.id',
from: endpoint('orders', 'account_id'),
to: endpoint('accounts', 'id'),
relationshipType: 'many_to_one',
source: 'normalized_table_match',
status: 'accepted',
confidence: 0.92,
score: 0.9,
pkScore: 0.92,
fkScore: 0.9,
evidence: {
sourceColumnBase: 'account',
targetTableBase: 'account',
targetColumnBase: 'id',
targetKeyScore: 0.92,
nameScore: 0.92,
reasons: ['foreign_key_suffix'],
},
validation: {
targetUniqueness: 1,
sourceCoverage: 1,
violationCount: 0,
violationRatio: 0,
sourceNullRate: 0,
targetNullRate: 0,
childDistinct: 2,
parentDistinct: 2,
overlap: 2,
checkedValues: 2,
reasons: ['validation_passed'],
},
graph: {
targetPkScore: 0.92,
incomingCandidateCount: 1,
conflictRank: 1,
reasons: ['fk_score_passed'],
},
},
],
relationshipUpdate: {
connectionId: 'warehouse',
accepted: [
{
id: 'orders:orders.account_id->accounts:accounts.id',
source: 'formal',
from: endpoint('orders', 'account_id'),
to: endpoint('accounts', 'id'),
relationshipType: 'many_to_one',
confidence: 1,
isPrimaryKeyReference: true,
},
],
rejected: [],
skipped: [],
},
});
expect(artifacts.accepted).toHaveLength(1);
expect(artifacts.accepted[0]).toMatchObject({
id: 'orders:orders.account_id->accounts:accounts.id',
source: 'normalized_table_match',
reasons: expect.arrayContaining(['foreign_key_suffix', 'validation_passed', 'fk_score_passed']),
});
});
// A single review candidate plus an empty profile (reason
// 'read_only_sql_unavailable'): diagnostics are expected to report validation
// as unavailable and explain why nothing was accepted.
it('explains validation-unavailable review candidates', () => {
const artifacts = buildKtxRelationshipArtifacts({
connectionId: 'warehouse',
resolvedRelationships: [
resolvedRelationship({
id: 'review-edge',
status: 'review',
validationReasons: ['validation_unavailable'],
graphReasons: ['validation_unavailable_review_only'],
}),
],
});
const profile = emptyKtxRelationshipProfileArtifact({
connectionId: 'warehouse',
driver: 'sqlite',
reason: 'read_only_sql_unavailable',
});
const diagnostics = buildKtxRelationshipDiagnostics({
connectionId: 'warehouse',
generatedAt: '2026-05-07T12:00:00.000Z',
artifacts,
profile,
warnings: [
{
code: 'connector_capability_missing',
message: 'KTX scan connector cannot run standalone statistical relationship validation',
recoverable: true,
metadata: { capability: 'readOnlySql' },
},
],
thresholds: { acceptThreshold: 0.85, reviewThreshold: 0.55 },
});
expect(diagnostics.summary).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
expect(diagnostics.noAcceptedReason).toBe('validation unavailable; review candidates written');
expect(diagnostics.candidateCountsBySource).toEqual({ normalized_table_match: 1 });
expect(diagnostics.validation).toEqual({
available: false,
sqlAvailable: false,
queryCount: 0,
});
expect(diagnostics.profileWarnings).toEqual(['read_only_sql_unavailable']);
expect(diagnostics.warnings[0]).toMatchObject({ code: 'connector_capability_missing' });
});
// No relationships of any kind are supplied: all summary counts are expected to
// be zero, with the "no candidate pairs" explanation.
it('explains empty relationship output as a no-candidate outcome', () => {
const artifacts = buildKtxRelationshipArtifacts({ connectionId: 'warehouse' });
const diagnostics = buildKtxRelationshipDiagnostics({
connectionId: 'warehouse',
generatedAt: '2026-05-07T12:00:00.000Z',
artifacts,
profile: emptyKtxRelationshipProfileArtifact({
connectionId: 'warehouse',
driver: 'sqlite',
reason: 'relationship_profiling_not_run',
}),
});
expect(diagnostics.summary).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
expect(diagnostics.noAcceptedReason).toBe('no candidate pairs passed type compatibility');
expect(diagnostics.candidateCountsBySource).toEqual({});
});
// An accepted two-column composite relationship goes in: the artifact is
// expected to keep both multi-column endpoints and to take its reasons from
// the composite validation.
it('records composite relationship endpoints in relationship artifacts', () => {
const artifacts = buildKtxRelationshipArtifacts({
connectionId: 'warehouse',
compositeRelationships: [
{
id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
source: 'composite_profile_match',
status: 'accepted',
from: {
tableId: 'order_line_allocations',
columnIds: ['order_line_allocations.order_id', 'order_line_allocations.line_number'],
table: { catalog: null, db: null, name: 'order_line_allocations' },
columns: ['order_id', 'line_number'],
},
to: {
tableId: 'order_lines',
columnIds: ['order_lines.order_id', 'order_lines.line_number'],
table: { catalog: null, db: null, name: 'order_lines' },
columns: ['order_id', 'line_number'],
},
relationshipType: 'many_to_one',
confidence: 0.95,
validation: {
targetUniqueness: 1,
sourceCoverage: 1,
violationCount: 0,
violationRatio: 0,
childDistinct: 2,
parentDistinct: 2,
overlap: 2,
reasons: ['composite_validation_passed'],
},
},
],
});
expect(artifacts.accepted).toEqual([
expect.objectContaining({
id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
source: 'composite_profile_match',
from: expect.objectContaining({
columnIds: ['order_line_allocations.order_id', 'order_line_allocations.line_number'],
columns: ['order_id', 'line_number'],
}),
to: expect.objectContaining({
columnIds: ['order_lines.order_id', 'order_lines.line_number'],
columns: ['order_id', 'line_number'],
}),
reasons: ['composite_validation_passed'],
validation: expect.objectContaining({ sourceCoverage: 1 }),
}),
]);
});
});

View file

@ -0,0 +1,683 @@
import Database from 'better-sqlite3';
import { afterEach, describe, expect, it, vi } from 'vitest';
import type { KtxLlmRuntimePort } from '../../../src/context/llm/runtime-port.js';
import { getDialectForDriver } from '../../../src/context/connections/dialects.js';
import { buildDefaultKtxProjectConfig } from '../../../src/context/project/config.js';
import { snapshotToKtxEnrichedSchema } from '../../../src/context/scan/local-enrichment.js';
import {
loadKtxRelationshipBenchmarkFixture,
maskKtxRelationshipBenchmarkSnapshot,
} from '../../../src/context/scan/relationship-benchmarks.js';
import { discoverKtxRelationships } from '../../../src/context/scan/relationship-discovery.js';
import { createKtxConnectorCapabilities } from '../../../src/context/scan/types.js';
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanConnector, KtxScanContext, KtxSchemaSnapshot } from '../../../src/context/scan/types.js';
/**
 * Executor over a fresh in-memory SQLite database. Tests seed tables through
 * `db` directly and read `queryCount` to see how many statements were run via
 * `executeReadOnly`.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  queryCount = 0;

  /**
   * Runs `input.sql`, bumps `queryCount`, and resolves with headers plus
   * positional rows. Headers come from the first row's keys, so an empty
   * result has no headers. Not `async`: a failing statement throws
   * synchronously rather than rejecting.
   */
  executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    this.queryCount += 1;
    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const headers = Object.keys(records[0] ?? {});
    const rows: unknown[][] = [];
    for (const record of records) {
      rows.push(headers.map((header) => record[header]));
    }
    return Promise.resolve({ headers, rows, totalRows: records.length, rowCount: records.length });
  }

  /** Releases the in-memory database. */
  close(): void {
    this.db.close();
  }
}
/**
 * Returns a fresh two-table SQLite snapshot (`accounts`, `orders`) with no
 * declared primary keys or foreign keys. A new object graph is built on every
 * call, so callers may mutate the result.
 */
function snapshot(): KtxSchemaSnapshot {
  type Column = KtxSchemaSnapshot['tables'][number]['columns'][number];

  // Every column in this fixture is non-nullable, undeclared as PK, and uncommented.
  const column = (
    name: Column['name'],
    nativeType: Column['nativeType'],
    normalizedType: Column['normalizedType'],
    dimensionType: Column['dimensionType'],
  ): Column => ({
    name,
    nativeType,
    normalizedType,
    dimensionType,
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  const integerColumn = (name: Column['name']): Column => column(name, 'INTEGER', 'integer', 'number');
  const textColumn = (name: Column['name']): Column => column(name, 'TEXT', 'text', 'string');

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      {
        catalog: null,
        db: null,
        name: 'accounts',
        kind: 'table',
        comment: null,
        estimatedRows: 2,
        foreignKeys: [],
        columns: [integerColumn('id'), textColumn('name')],
      },
      {
        catalog: null,
        db: null,
        name: 'orders',
        kind: 'table',
        comment: null,
        estimatedRows: 3,
        foreignKeys: [],
        columns: [integerColumn('id'), integerColumn('account_id')],
      },
    ],
  };
}
/**
 * Variant of `snapshot()` with declared metadata: `accounts.id` is flagged as
 * a primary key and `orders` declares a foreign key from `account_id` to
 * `accounts.id`. Other tables are passed through untouched.
 */
function declaredForeignKeySnapshot(): KtxSchemaSnapshot {
  const base = snapshot();
  const tables = base.tables.map((table) => {
    if (table.name === 'accounts') {
      const columns = table.columns.map((column) => (column.name === 'id' ? { ...column, primaryKey: true } : column));
      return { ...table, columns };
    }
    if (table.name === 'orders') {
      return {
        ...table,
        foreignKeys: [
          {
            fromColumn: 'account_id',
            toCatalog: null,
            toDb: null,
            toTable: 'accounts',
            toColumn: 'id',
            constraintName: 'orders_account_id_fkey',
          },
        ],
      };
    }
    return table;
  });
  return { ...base, tables };
}
/**
 * Returns a fresh SQLite snapshot where the join is on a natural text key:
 * `fct_accounts.country_code` lines up with `dim_countries.iso_code`. No
 * primary keys or foreign keys are declared.
 */
function naturalKeySnapshot(): KtxSchemaSnapshot {
  type Column = KtxSchemaSnapshot['tables'][number]['columns'][number];

  // Every column in this fixture is non-nullable, undeclared as PK, and uncommented.
  const column = (
    name: Column['name'],
    nativeType: Column['nativeType'],
    normalizedType: Column['normalizedType'],
    dimensionType: Column['dimensionType'],
  ): Column => ({
    name,
    nativeType,
    normalizedType,
    dimensionType,
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  const integerColumn = (name: Column['name']): Column => column(name, 'INTEGER', 'integer', 'number');
  const textColumn = (name: Column['name']): Column => column(name, 'TEXT', 'text', 'string');

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      {
        catalog: null,
        db: null,
        name: 'dim_countries',
        kind: 'table',
        comment: null,
        estimatedRows: 3,
        foreignKeys: [],
        columns: [textColumn('iso_code'), textColumn('name')],
      },
      {
        catalog: null,
        db: null,
        name: 'fct_accounts',
        kind: 'table',
        comment: null,
        estimatedRows: 4,
        foreignKeys: [],
        columns: [integerColumn('id'), textColumn('country_code')],
      },
    ],
  };
}
/**
 * Builds a SQLite scan connector stub. With an executor, read-only SQL and
 * column stats are advertised and queries are delegated to it; with `null`,
 * both capabilities are off and `executeReadOnly` is left undefined. Sampling
 * capabilities are always off, and `introspect` serves `snapshot()`.
 */
function connector(executor: InMemorySqliteExecutor | null): KtxScanConnector {
  const sqlAvailable = executor !== null;
  const capabilities = createKtxConnectorCapabilities({
    readOnlySql: sqlAvailable,
    columnStats: sqlAvailable,
    tableSampling: false,
    columnSampling: false,
  });
  return {
    id: 'sqlite:test',
    driver: 'sqlite',
    capabilities,
    introspect: async () => snapshot(),
    listSchemas: async () => [],
    listTables: async () => [],
    // Bound so the method keeps its `this` once detached from the executor.
    executeReadOnly: executor ? executor.executeReadOnly.bind(executor) : undefined,
  };
}
/**
 * Builds an LLM runtime stub whose `generateObject` always resolves with
 * `output`; the text and agent-loop entry points are bare mocks.
 */
function llmRuntime(output: unknown): KtxLlmRuntimePort {
  const generateObject = vi.fn(async () => output) as KtxLlmRuntimePort['generateObject'];
  return {
    generateText: vi.fn(),
    generateObject,
    runAgentLoop: vi.fn(),
  };
}
/** Returns the relationship-scan settings from a freshly built default project config. */
function relationshipSettings() {
  const { scan } = buildDefaultKtxProjectConfig();
  return scan.relationships;
}
/**
 * Returns a fresh SQLite snapshot whose foreign-key column name
 * (`orders.buyer_ref`) does not mention its target table (`customers`).
 * No primary keys or foreign keys are declared.
 */
function llmOnlyRelationshipSnapshot(): KtxSchemaSnapshot {
  type Column = KtxSchemaSnapshot['tables'][number]['columns'][number];

  // All columns in this fixture are non-nullable integers without PK flags or comments.
  const integerColumn = (name: Column['name']): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      {
        catalog: null,
        db: null,
        name: 'customers',
        kind: 'table',
        comment: null,
        estimatedRows: 2,
        foreignKeys: [],
        columns: [integerColumn('id')],
      },
      {
        catalog: null,
        db: null,
        name: 'orders',
        kind: 'table',
        comment: null,
        estimatedRows: 2,
        foreignKeys: [],
        columns: [integerColumn('id'), integerColumn('buyer_ref')],
      },
    ],
  };
}
// End-to-end tests for discoverKtxRelationships against small SQLite
// databases: name-, profile-, embedding- and LLM-driven candidates, formal
// metadata, configured thresholds, candidate caps and composite keys.
describe('production relationship discovery', () => {
  // Shared handle: every test that opens an in-memory database stores it here
  // so afterEach releases it even when an assertion fails mid-test.
  let executor: InMemorySqliteExecutor | null = null;

  afterEach(() => {
    executor?.close();
    executor = null;
  });

  it('accepts a validated relationship without declared PK or FK metadata', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
`);
    const result = await discoverKtxRelationships({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      connector: connector(executor),
      schema: snapshotToKtxEnrichedSchema(snapshot()),
      context: { runId: 'relationship-run-1' },
      settings: relationshipSettings(),
    });
    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.statisticalValidation).toBe('completed');
    expect(result.profile.sqlAvailable).toBe(true);
    expect(result.profile.queryCount).toBeGreaterThan(0);
    expect(result.relationshipUpdate.accepted).toEqual([
      expect.objectContaining({
        from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
        to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
        relationshipType: 'many_to_one',
        source: 'inferred',
        isPrimaryKeyReference: true,
      }),
    ]);
    expect(result.resolvedRelationships[0]).toMatchObject({
      status: 'accepted',
      validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
      graph: expect.objectContaining({ reasons: expect.arrayContaining(['fk_score_passed']) }),
    });
  });

  it('accepts a profile-driven natural-key relationship without declared metadata', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
CREATE TABLE dim_countries (iso_code TEXT NOT NULL, name TEXT NOT NULL);
CREATE TABLE fct_accounts (id INTEGER NOT NULL, country_code TEXT NOT NULL);
INSERT INTO dim_countries (iso_code, name) VALUES ('US', 'United States'), ('FR', 'France'), ('DE', 'Germany');
INSERT INTO fct_accounts (id, country_code) VALUES (1, 'US'), (2, 'FR'), (3, 'US'), (4, 'DE');
`);
    const schema = naturalKeySnapshot();
    const result = await discoverKtxRelationships({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      connector: {
        ...connector(executor),
        introspect: async () => schema,
      },
      schema: snapshotToKtxEnrichedSchema(schema),
      context: { runId: 'natural-key-relationship-run' },
      settings: relationshipSettings(),
    });
    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.relationshipUpdate.accepted).toEqual([
      expect.objectContaining({
        from: expect.objectContaining({ table: expect.objectContaining({ name: 'fct_accounts' }), columns: ['country_code'] }),
        to: expect.objectContaining({ table: expect.objectContaining({ name: 'dim_countries' }), columns: ['iso_code'] }),
        relationshipType: 'many_to_one',
        source: 'inferred',
        isPrimaryKeyReference: true,
      }),
    ]);
    expect(result.resolvedRelationships[0]).toMatchObject({
      source: 'profile_match',
      status: 'accepted',
      validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
      graph: expect.objectContaining({ reasons: expect.arrayContaining(['fk_score_passed']) }),
    });
  });

  it('accepts an embedding-driven relationship without declared metadata or LLM proposals', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
CREATE TABLE customers (id INTEGER NOT NULL, name TEXT NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, buyer_ref INTEGER NOT NULL);
INSERT INTO customers (id, name) VALUES (1, 'Acme'), (2, 'Orbit'), (3, 'Globex');
INSERT INTO orders (id, buyer_ref) VALUES (10, 1), (11, 2), (12, 2), (13, 3);
`);
    const sourceSnapshot = llmOnlyRelationshipSnapshot();
    // orders.buyer_ref gets a vector almost parallel to customers.id, so
    // embedding similarity is the only signal linking the two columns.
    const schema = snapshotToKtxEnrichedSchema(
      sourceSnapshot,
      new Map([
        ['customers.id', [1, 0, 0]],
        ['customers.name', [0, 1, 0]],
        ['orders.id', [0, 0, 1]],
        ['orders.buyer_ref', [0.995, 0.005, 0]],
      ]),
    );
    const result = await discoverKtxRelationships({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      connector: {
        ...connector(executor),
        introspect: async () => sourceSnapshot,
      },
      schema,
      context: { runId: 'embedding-relationship-run' },
      settings: {
        ...relationshipSettings(),
        llmProposals: false,
      },
    });
    expect(result.llmRelationshipValidation).toBe('skipped');
    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.relationshipUpdate.accepted[0]).toMatchObject({
      from: { table: { name: 'orders' }, columns: ['buyer_ref'] },
      to: { table: { name: 'customers' }, columns: ['id'] },
    });
    expect(result.resolvedRelationships[0]).toMatchObject({
      source: 'embedding_similarity',
      status: 'accepted',
      validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
      evidence: expect.objectContaining({
        reasons: expect.arrayContaining(['embedding_similarity', 'target_key_like']),
        embeddingSimilarity: expect.any(Number),
      }),
    });
  });

  it('keeps candidates review-only when read-only SQL is unavailable', async () => {
    const result = await discoverKtxRelationships({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      connector: connector(null),
      schema: snapshotToKtxEnrichedSchema(snapshot()),
      context: { runId: 'relationship-run-no-sql' },
      settings: relationshipSettings(),
    });
    expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
    expect(result.statisticalValidation).toBe('skipped');
    expect(result.relationshipUpdate.accepted).toEqual([]);
    expect(result.resolvedRelationships[0]).toMatchObject({
      status: 'review',
      validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_unavailable']) }),
    });
    expect(result.warnings).toContainEqual({
      code: 'connector_capability_missing',
      message: 'KTX scan connector cannot run read-only SQL relationship validation',
      recoverable: true,
      metadata: { capability: 'readOnlySql' },
    });
  });

  it('accepts formal metadata relationships when read-only SQL is unavailable', async () => {
    const sourceSnapshot = declaredForeignKeySnapshot();
    const result = await discoverKtxRelationships({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      connector: connector(null),
      schema: snapshotToKtxEnrichedSchema(sourceSnapshot),
      context: { runId: 'formal-metadata-no-sql' },
      settings: relationshipSettings(),
    });
    expect(result.statisticalValidation).toBe('skipped');
    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.resolvedRelationships).toEqual([]);
    expect(result.relationshipUpdate.accepted).toEqual([
      expect.objectContaining({
        id: 'orders:(orders.account_id)->accounts:(accounts.id)',
        source: 'formal',
        confidence: 1,
        from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
        to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
      }),
    ]);
    expect(result.relationshipUpdate.rejected).toEqual([]);
    expect(result.relationshipUpdate.skipped).toEqual([]);
  });

  it('accepts LLM-only relationship proposals only after SQL validation and graph resolution pass', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
CREATE TABLE customers (id INTEGER);
CREATE TABLE orders (id INTEGER, buyer_ref INTEGER);
INSERT INTO customers (id) VALUES (1), (2);
INSERT INTO orders (id, buyer_ref) VALUES (10, 1), (11, 2);
`);
    const llmOutput = {
      pkCandidates: [{ table: 'customers', column: 'id', confidence: 0.91, rationale: 'Unique customer key.' }],
      fkCandidates: [
        {
          fromTable: 'orders',
          fromColumn: 'buyer_ref',
          toTable: 'customers',
          toColumn: 'id',
          confidence: 0.89,
          rationale: 'Buyer reference values align with customer identifiers.',
        },
      ],
    };
    const result = await discoverKtxRelationships({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      connector: connector(executor),
      schema: snapshotToKtxEnrichedSchema(llmOnlyRelationshipSnapshot()),
      context: { runId: 'llm-relationship-orchestrator' },
      settings: relationshipSettings(),
      llmRuntime: llmRuntime(llmOutput),
    });
    expect(result.llmRelationshipValidation).toBe('completed');
    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.resolvedRelationships[0]).toMatchObject({
      source: 'llm_proposal',
      status: 'accepted',
      evidence: {
        llmRationale: 'Buyer reference values align with customer identifiers.',
      },
    });
    expect(result.relationshipUpdate.accepted[0]).toMatchObject({
      from: { table: { name: 'orders' }, columns: ['buyer_ref'] },
      to: { table: { name: 'customers' }, columns: ['id'] },
    });
  });

  it('uses configured acceptance thresholds when resolving graph relationships', async () => {
    // Stored in the shared handle (not a shadowing local) so afterEach closes
    // it even if an expectation below throws.
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
INSERT INTO accounts VALUES (1, 'Acme'), (2, 'Orbit');
INSERT INTO orders VALUES (10, 1), (11, 1), (12, 2);
`);
    // An accept threshold of 0.99 is expected to push the otherwise-accepted
    // edge down into review.
    const settings = {
      ...relationshipSettings(),
      acceptThreshold: 0.99,
      reviewThreshold: 0.55,
    };
    const result = await discoverKtxRelationships({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      connector: connector(executor),
      schema: snapshotToKtxEnrichedSchema(snapshot()),
      context: { runId: 'configured-thresholds' },
      settings,
    });
    expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
    expect(result.relationshipUpdate.accepted).toEqual([]);
    expect(result.resolvedRelationships[0]).toMatchObject({
      status: 'review',
      graph: { reasons: expect.arrayContaining(['fk_score_review']) },
    });
  });

  it('passes maxCandidatesPerColumn into broad deterministic candidate generation', async () => {
    // Stored in the shared handle (not a shadowing local) so afterEach closes
    // it even if an expectation below throws.
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
CREATE TABLE account_archive (id INTEGER NOT NULL, name TEXT NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
INSERT INTO accounts VALUES (1, 'Acme'), (2, 'Orbit');
INSERT INTO account_archive VALUES (99, 'Archive');
INSERT INTO orders VALUES (10, 1), (11, 1), (12, 2);
`);
    // Insert a second plausible target (account_archive) between accounts and orders.
    const richSnapshot = snapshot();
    richSnapshot.tables.splice(1, 0, {
      catalog: null,
      db: null,
      name: 'account_archive',
      kind: 'table',
      comment: null,
      estimatedRows: 1,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'INTEGER',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: false,
          comment: null,
        },
        {
          name: 'name',
          nativeType: 'TEXT',
          normalizedType: 'text',
          dimensionType: 'string',
          nullable: false,
          primaryKey: false,
          comment: null,
        },
      ],
    });
    const result = await discoverKtxRelationships({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      connector: {
        ...connector(executor),
        introspect: async () => richSnapshot,
      },
      schema: snapshotToKtxEnrichedSchema(richSnapshot),
      context: { runId: 'candidate-cap' },
      settings: {
        ...relationshipSettings(),
        maxCandidatesPerColumn: 1,
      },
    });
    const sourceTargets = result.resolvedRelationships
      .filter((relationship) => relationship.from.columns[0] === 'account_id')
      .map((relationship) => `${relationship.to.table.name}.${relationship.to.columns[0]}`);
    expect(sourceTargets).toHaveLength(1);
    expect(sourceTargets).toEqual(['accounts.id']);
  });

  it('accepts SQL-validated composite relationships in production relationship-discovery detection', async () => {
    const fixtureRoot = new URL(
      '../../fixtures/relationship-benchmarks/composite_keys_no_declared_constraints',
      import.meta.url,
    );
    const fixture = await loadKtxRelationshipBenchmarkFixture(fixtureRoot.pathname);
    const maskedSnapshot = maskKtxRelationshipBenchmarkSnapshot(fixture.snapshot, 'declared_pks_and_declared_fks_removed');
    const database = new Database(fixture.dataPath ?? '', { readonly: true, fileMustExist: true });

    let result: Awaited<ReturnType<typeof discoverKtxRelationships>>;
    try {
      const testConnector: KtxScanConnector = {
        id: 'sqlite:composite',
        driver: 'sqlite',
        capabilities: createKtxConnectorCapabilities({
          readOnlySql: true,
          columnStats: true,
          tableSampling: false,
          columnSampling: false,
        }),
        introspect: async () => maskedSnapshot,
        listSchemas: async () => [],
        listTables: async () => [],
        executeReadOnly: async (input) => {
          const rows = database.prepare(input.sql).all() as Record<string, unknown>[];
          const headers = Object.keys(rows[0] ?? {});
          return {
            headers,
            rows: rows.map((row) => headers.map((header) => row[header])),
            totalRows: rows.length,
            rowCount: rows.length,
          };
        },
      };
      result = await discoverKtxRelationships({
        connectionId: maskedSnapshot.connectionId,
        dialect: getDialectForDriver(maskedSnapshot.driver),
        connector: testConnector,
        schema: snapshotToKtxEnrichedSchema(maskedSnapshot, new Map()),
        context: { runId: 'test:production-composite' },
        settings: relationshipSettings(),
      });
    } finally {
      // Release the fixture database even when discovery throws.
      database.close();
    }

    expect(
      result.relationshipUpdate.accepted.map(
        (relationship) =>
          `${relationship.from.table.name}.(${relationship.from.columns.join(',')})->${relationship.to.table.name}.(${relationship.to.columns.join(',')})`,
      ),
    ).toContain('order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)');
    expect(result.relationships.accepted).toBeGreaterThanOrEqual(1);
    expect(result.compositeRelationships.map((relationship) => relationship.status)).toContain('accepted');
  });
});

View file

@ -0,0 +1,134 @@
import { describe, expect, it } from 'vitest';
import type { KtxEnrichedRelationship, KtxEnrichedSchema } from '../../../src/context/scan/enrichment-types.js';
import { collectKtxFormalMetadataRelationships } from '../../../src/context/scan/relationship-formal-metadata.js';
/**
 * Builds an enriched schema for connection `warehouse` with two one-column
 * integer tables — `accounts(id)` flagged as primary key and
 * `orders(account_id)` — carrying the given relationships as-is.
 */
function schema(relationships: KtxEnrichedRelationship[]): KtxEnrichedSchema {
  type Table = KtxEnrichedSchema['tables'][number];

  // Table and column each get their own ref object, as separate instances.
  const singleIntegerColumnTable = (tableName: string, columnName: string, primaryKey: boolean): Table => ({
    id: tableName,
    ref: { catalog: null, db: null, name: tableName },
    enabled: true,
    descriptions: {},
    columns: [
      {
        id: `${tableName}.${columnName}`,
        tableId: tableName,
        tableRef: { catalog: null, db: null, name: tableName },
        name: columnName,
        nativeType: 'INTEGER',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey,
        parentColumnId: null,
        descriptions: {},
        embedding: null,
        sampleValues: null,
        cardinality: null,
      },
    ],
  });

  return {
    connectionId: 'warehouse',
    tables: [
      singleIntegerColumnTable('accounts', 'id', true),
      singleIntegerColumnTable('orders', 'account_id', false),
    ],
    relationships,
  };
}
/**
 * Builds a `formal` many-to-one relationship from `orders.account_id` to
 * `accounts.id`, defaulting to confidence 0.6 and `isPrimaryKeyReference: false`.
 *
 * @param overrides - Top-level fields to replace; applied after the defaults,
 *   so any key present here wins.
 * @returns The default relationship merged with `overrides`.
 */
function formalRelationship(overrides: Partial<KtxEnrichedRelationship> = {}): KtxEnrichedRelationship {
  const ordersAccountId = {
    tableId: 'orders',
    columnIds: ['orders.account_id'],
    table: { catalog: null, db: null, name: 'orders' },
    columns: ['account_id'],
  };
  const accountsId = {
    tableId: 'accounts',
    columnIds: ['accounts.id'],
    table: { catalog: null, db: null, name: 'accounts' },
    columns: ['id'],
  };
  const defaults: KtxEnrichedRelationship = {
    id: 'orders:orders.account_id->accounts:accounts.id',
    source: 'formal',
    from: ordersAccountId,
    to: accountsId,
    relationshipType: 'many_to_one',
    confidence: 0.6,
    isPrimaryKeyReference: false,
  };
  return { ...defaults, ...overrides };
}
describe('formal metadata relationship collection', () => {
  // Ids referenced by both the inputs and the expectations below.
  const validEdgeId = 'orders:orders.account_id->accounts:accounts.id';
  const missingEndpointEdgeId = 'orders:orders.missing_account_id->accounts:accounts.id';

  it('accepts valid formal relationships with ground-truth confidence', () => {
    const collected = collectKtxFormalMetadataRelationships(schema([formalRelationship()]));

    // The fixture edge starts at confidence 0.6 / isPrimaryKeyReference false;
    // the accepted edge is expected to come back with 1 / true.
    const upgradedEdge = expect.objectContaining({
      id: validEdgeId,
      source: 'formal',
      confidence: 1,
      isPrimaryKeyReference: true,
    });
    expect(collected.accepted).toEqual([upgradedEdge]);
    expect(collected.skipped).toEqual([]);
    expect(collected.acceptedIds).toEqual(new Set([validEdgeId]));
  });

  it('skips duplicate and invalid formal relationships with reasons', () => {
    // Source endpoint that points at a column the fixture schema does not contain.
    const danglingSource = {
      tableId: 'orders',
      columnIds: ['orders.missing_account_id'],
      table: { catalog: null, db: null, name: 'orders' },
      columns: ['missing_account_id'],
    };
    const inputs = [
      formalRelationship(),
      formalRelationship(),
      formalRelationship({ id: missingEndpointEdgeId, from: danglingSource }),
      formalRelationship({ id: 'manual-edge', source: 'manual' }),
    ];

    const collected = collectKtxFormalMetadataRelationships(schema(inputs));

    expect(collected.accepted).toHaveLength(1);
    // `skipped` is expected to list only the repeated edge and the dangling
    // edge; there is no entry for the `manual` edge.
    expect(collected.skipped).toEqual([
      { relationshipId: validEdgeId, reason: 'formal_metadata_duplicate' },
      { relationshipId: missingEndpointEdgeId, reason: 'formal_metadata_endpoint_not_found' },
    ]);
  });
});

View file

@ -0,0 +1,649 @@
import { describe, expect, it } from 'vitest';
import type {
KtxEnrichedColumn,
KtxEnrichedSchema,
KtxEnrichedTable,
KtxRelationshipEndpoint,
} from '../../../src/context/scan/enrichment-types.js';
import type { KtxRelationshipProfileArtifact } from '../../../src/context/scan/relationship-profiling.js';
import type { KtxValidatedRelationshipDiscoveryCandidate } from '../../../src/context/scan/relationship-validation.js';
import { resolveKtxRelationshipGraph } from '../../../src/context/scan/relationship-graph-resolver.js';
/**
 * Creates an enriched column fixture whose id is `<tableId>.<name>`.
 *
 * Defaults describe a nullable, non-key INTEGER column with empty metadata.
 * `overrides` is spread last, so every key it contains replaces the default.
 *
 * @param tableId - Id of the owning table; also used as the default table ref name.
 * @param name - Column name.
 * @param overrides - Fields to replace on the default column.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  const defaults: KtxEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...defaults, ...overrides };
}
/**
 * Creates an enabled table fixture that uses `name` as both id and ref name.
 *
 * Each supplied column is copied and re-pointed at this table (`tableId` and
 * `tableRef` are overwritten), so the inputs are not mutated.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KtxEnrichedColumn[] = [];
  for (const source of columns) {
    ownedColumns.push({ ...source, tableId: name, tableRef: ref });
  }
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
/**
 * Builds the three-table schema fixture (`accounts`, `account_archive`,
 * `users`) with no relationships.
 *
 * @param overrides.accountsPrimaryKey - Whether `accounts.id` is declared as a
 *   primary key; defaults to `false`.
 */
function schema(overrides: { accountsPrimaryKey?: boolean } = {}): KtxEnrichedSchema {
  const accountsPrimaryKey = overrides.accountsPrimaryKey ?? false;

  const accounts = table('accounts', [
    column('accounts', 'id', { nullable: false, primaryKey: accountsPrimaryKey }),
    column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  const accountArchive = table('account_archive', [column('account_archive', 'id', { nullable: false })]);
  const users = table('users', [
    column('users', 'id', { nullable: false }),
    column('users', 'account_id', { nullable: false }),
  ]);

  return {
    connectionId: 'warehouse',
    tables: [accounts, accountArchive, users],
    relationships: [],
  };
}
/**
 * Builds a single-column relationship endpoint whose table id is the table
 * name and whose column id is `<tableName>.<columnName>`.
 */
function endpoint(tableName: string, columnName: string): KtxRelationshipEndpoint {
  const columnId = `${tableName}.${columnName}`;
  const tableRef = { catalog: null, db: null, name: tableName };
  return {
    tableId: tableName,
    columnIds: [columnId],
    table: tableRef,
    columns: [columnName],
  };
}
/**
 * Builds the profile artifact that accompanies the three-table schema fixture.
 *
 * Each table reports 3 rows, and `accounts.id`, `account_archive.id`, and
 * `users.account_id` are each profiled as a fully unique, never-null INTEGER
 * column with sample values '1', '2', '3'. A new object graph is returned on
 * every call.
 */
function profiles(): KtxRelationshipProfileArtifact {
  // [table, column] pairs in the order they appear in `tables` and `columns`.
  const profiledColumns: ReadonlyArray<readonly [string, string]> = [
    ['accounts', 'id'],
    ['account_archive', 'id'],
    ['users', 'account_id'],
  ];

  const artifact: KtxRelationshipProfileArtifact = {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [],
    columns: {},
    warnings: [],
  };

  for (const [tableName, columnName] of profiledColumns) {
    artifact.tables.push({ table: { catalog: null, db: null, name: tableName }, rowCount: 3 });
    artifact.columns[`${tableName}.${columnName}`] = {
      table: { catalog: null, db: null, name: tableName },
      column: columnName,
      nativeType: 'INTEGER',
      normalizedType: 'integer',
      rowCount: 3,
      nullCount: 0,
      distinctCount: 3,
      uniquenessRatio: 1,
      nullRate: 0,
      sampleValues: ['1', '2', '3'],
      minTextLength: 1,
      maxTextLength: 1,
    };
  }

  return artifact;
}
/**
 * Builds a validated relationship-discovery candidate, by default an accepted
 * `users.account_id -> accounts.id` many-to-one match with passing validation.
 *
 * The id and the default evidence are derived from the effective `from`/`to`
 * endpoints. `overrides` is applied last, so any key it contains replaces the
 * default outright; in particular a supplied `evidence` or `validation` object
 * replaces the default one wholesale rather than being merged into it.
 *
 * @param overrides - Top-level candidate fields to replace.
 */
function validatedCandidate(
  overrides: Partial<KtxValidatedRelationshipDiscoveryCandidate> = {},
): KtxValidatedRelationshipDiscoveryCandidate {
  const from = overrides.from ?? endpoint('users', 'account_id');
  const to = overrides.to ?? endpoint('accounts', 'id');

  const defaults: KtxValidatedRelationshipDiscoveryCandidate = {
    id: `${from.tableId}:(${from.columnIds.join(',')})->${to.tableId}:(${to.columnIds.join(',')})`,
    from,
    to,
    relationshipType: 'many_to_one',
    confidence: 0.95,
    source: 'normalized_table_match',
    status: 'accepted',
    score: 0.96,
    evidence: {
      sourceColumnBase: 'account',
      targetTableBase: to.table.name,
      targetColumnBase: to.columns[0] ?? '',
      targetKeyScore: 0.92,
      nameScore: 0.92,
      reasons: ['foreign_key_suffix', 'normalized_table_name', 'target_key_like'],
    },
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationCount: 0,
      violationRatio: 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 3,
      parentDistinct: 3,
      overlap: 3,
      checkedValues: 3,
      reasons: ['validation_passed'],
    },
  };

  return { ...defaults, ...overrides };
}
describe('relationship graph resolver', () => {
it('promotes validated relationship discovery references to accepted relationships and inferred PKs', () => {
  const { pks, relationships } = resolveKtxRelationshipGraph({
    schema: schema(),
    profiles: profiles(),
    candidates: [validatedCandidate()],
  });

  // The default fixture does not declare accounts.id as a primary key, so the
  // expected entry is an inferred one (declaredPrimaryKey: false).
  const expectedAccountsPk = {
    table: 'accounts',
    columns: ['id'],
    pkScore: expect.any(Number),
    status: 'accepted',
    incomingCandidateCount: 1,
    evidence: {
      declaredPrimaryKey: false,
      targetUniqueness: 1,
      incomingAcceptedCount: 1,
      incomingReviewCount: 0,
      reasons: expect.arrayContaining(['unique_target_column', 'incoming_validated_reference']),
    },
  };
  expect(pks).toContainEqual(expectedAccountsPk);

  const accountsPk = pks.find((pk) => pk.table === 'accounts');
  expect(accountsPk?.pkScore).toBeGreaterThanOrEqual(0.85);

  expect(relationships).toHaveLength(1);
  const promoted = relationships[0];
  expect(promoted).toMatchObject({
    from: { table: { name: 'users' }, columns: ['account_id'] },
    to: { table: { name: 'accounts' }, columns: ['id'] },
    status: 'accepted',
    pkScore: expect.any(Number),
    fkScore: expect.any(Number),
    graph: {
      reasons: expect.arrayContaining(['target_pk_score_passed', 'fk_score_passed']),
    },
  });
  expect(promoted?.fkScore).toBeGreaterThanOrEqual(0.85);
});
it('keeps validation-unavailable candidates in review even when name evidence is strong', () => {
  // Profile artifact with SQL marked unavailable, no column profiles, and the
  // `read_only_sql_unavailable` warning.
  const unprofiled: KtxRelationshipProfileArtifact = {
    ...profiles(),
    sqlAvailable: false,
    columns: {},
    warnings: ['read_only_sql_unavailable'],
  };

  // Candidate whose validation block records that nothing could be checked.
  const unvalidated = validatedCandidate({
    status: 'review',
    score: 0.57,
    validation: {
      targetUniqueness: 0,
      sourceCoverage: 0,
      violationCount: 0,
      violationRatio: 1,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 0,
      parentDistinct: 0,
      overlap: 0,
      checkedValues: 0,
      reasons: ['validation_unavailable'],
    },
  });

  const { relationships } = resolveKtxRelationshipGraph({
    schema: schema(),
    profiles: unprofiled,
    candidates: [unvalidated],
  });

  expect(relationships).toHaveLength(1);
  const reviewed = relationships[0];
  expect(reviewed).toMatchObject({
    status: 'review',
    graph: {
      reasons: expect.arrayContaining(['validation_unavailable_review_only']),
    },
  });
  expect(reviewed?.fkScore).toBeGreaterThanOrEqual(0.55);
});
it('accepts at most one target per source column and rejects the lower-scored conflict loser', () => {
  // Both candidates start from users.account_id; they differ in target table and scores.
  const strongerCandidate = validatedCandidate({ confidence: 0.95, score: 0.96 });
  const weakerCandidate = validatedCandidate({
    from: endpoint('users', 'account_id'),
    to: endpoint('account_archive', 'id'),
    confidence: 0.85,
    score: 0.9,
    evidence: {
      sourceColumnBase: 'account',
      targetTableBase: 'account_archive',
      targetColumnBase: 'id',
      targetKeyScore: 0.92,
      nameScore: 0.78,
      reasons: ['foreign_key_suffix', 'inflection', 'target_key_like'],
    },
  });

  // The weaker candidate is passed first; the expectations below still put the
  // accepted `accounts` edge at index 0.
  const { relationships } = resolveKtxRelationshipGraph({
    schema: schema(),
    profiles: profiles(),
    candidates: [weakerCandidate, strongerCandidate],
  });

  const statuses = relationships.map((relationship) => relationship.status);
  expect(statuses).toEqual(['accepted', 'rejected']);
  expect(relationships[0]?.to.table.name).toBe('accounts');
  expect(relationships[1]).toMatchObject({
    to: { table: { name: 'account_archive' }, columns: ['id'] },
    status: 'rejected',
    graph: {
      reasons: expect.arrayContaining(['conflict_lost']),
    },
  });
});
it('preserves declared primary keys as accepted even without incoming candidates', () => {
  const { relationships, pks } = resolveKtxRelationshipGraph({
    schema: schema({ accountsPrimaryKey: true }),
    profiles: profiles(),
    candidates: [],
  });

  expect(relationships).toEqual([]);

  // With no candidates supplied, the only expected reason is the declaration itself.
  const declaredAccountsPk = {
    table: 'accounts',
    columns: ['id'],
    pkScore: 1,
    status: 'accepted',
    incomingCandidateCount: 0,
    evidence: {
      declaredPrimaryKey: true,
      targetUniqueness: 1,
      incomingAcceptedCount: 0,
      incomingReviewCount: 0,
      reasons: ['declared_primary_key'],
    },
  };
  expect(pks).toContainEqual(declaredAccountsPk);
});
it('infers profile-only key-like columns without incoming relationship candidates', () => {
  const invoices = table('invoices', [
    column('invoices', 'id', { nullable: false }),
    column('invoices', 'invoice_number', {
      nativeType: 'TEXT',
      normalizedType: 'text',
      dimensionType: 'string',
      nullable: false,
    }),
    column('invoices', 'amount', {
      nativeType: 'INTEGER',
      normalizedType: 'integer',
      dimensionType: 'number',
      nullable: false,
    }),
  ]);

  const baseSchema = schema();
  const schemaWithInvoices: KtxEnrichedSchema = {
    ...baseSchema,
    tables: [...baseSchema.tables, invoices],
  };

  // `id` and `invoice_number` are unique across the 3 rows; `amount` has only
  // 2 distinct values.
  const invoiceColumnProfiles: KtxRelationshipProfileArtifact['columns'] = {
    'invoices.id': {
      table: invoices.ref,
      column: 'id',
      nativeType: 'INTEGER',
      normalizedType: 'integer',
      rowCount: 3,
      nullCount: 0,
      distinctCount: 3,
      uniquenessRatio: 1,
      nullRate: 0,
      sampleValues: ['1', '2', '3'],
      minTextLength: 1,
      maxTextLength: 1,
    },
    'invoices.invoice_number': {
      table: invoices.ref,
      column: 'invoice_number',
      nativeType: 'TEXT',
      normalizedType: 'text',
      rowCount: 3,
      nullCount: 0,
      distinctCount: 3,
      uniquenessRatio: 1,
      nullRate: 0,
      sampleValues: ['INV-1', 'INV-2', 'INV-3'],
      minTextLength: 5,
      maxTextLength: 5,
    },
    'invoices.amount': {
      table: invoices.ref,
      column: 'amount',
      nativeType: 'INTEGER',
      normalizedType: 'integer',
      rowCount: 3,
      nullCount: 0,
      distinctCount: 2,
      uniquenessRatio: 2 / 3,
      nullRate: 0,
      sampleValues: ['100', '200'],
      minTextLength: 3,
      maxTextLength: 3,
    },
  };

  const baseProfiles = profiles();
  const profilesWithInvoices: KtxRelationshipProfileArtifact = {
    ...baseProfiles,
    tables: [...baseProfiles.tables, { table: invoices.ref, rowCount: 3 }],
    columns: { ...baseProfiles.columns, ...invoiceColumnProfiles },
  };

  const { relationships, pks } = resolveKtxRelationshipGraph({
    schema: schemaWithInvoices,
    profiles: profilesWithInvoices,
    candidates: [],
  });

  expect(relationships).toEqual([]);

  // `id` is expected to be accepted outright with a score of 1.
  expect(pks).toContainEqual({
    table: 'invoices',
    columns: ['id'],
    pkScore: 1,
    status: 'accepted',
    incomingCandidateCount: 0,
    evidence: {
      declaredPrimaryKey: false,
      targetUniqueness: 1,
      incomingAcceptedCount: 0,
      incomingReviewCount: 0,
      reasons: expect.arrayContaining([
        'unique_target_column',
        'profile_key_name',
        'not_null_profile',
        'profile_only_primary_key',
        'no_incoming_references',
      ]),
    },
  });

  // `invoice_number` is expected to be held for review with a weak-name reason.
  expect(pks).toContainEqual(
    expect.objectContaining({
      table: 'invoices',
      columns: ['invoice_number'],
      status: 'review',
      evidence: expect.objectContaining({
        reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
      }),
    }),
  );

  // `amount` must not be reported as a key at all.
  const amountReported = pks.some((pk) => pk.table === 'invoices' && pk.columns[0] === 'amount');
  expect(amountReported).toBe(false);
});
it('pins single-incoming column_suffix_match resolver scores', () => {
  // Factories so the schema, profile, and candidate each get their own ref object.
  const plansRef = () => ({ catalog: null, db: null, name: 'stg_plans' });
  const segmentsRef = () => ({ catalog: null, db: null, name: 'mart_account_segments' });

  // Both columns in this scenario are non-nullable, non-key TEXT columns.
  const textColumn = (
    id: string,
    tableId: string,
    tableRef: KtxEnrichedColumn['tableRef'],
    name: string,
  ): KtxEnrichedColumn => ({
    id,
    tableId,
    tableRef,
    name,
    nativeType: 'TEXT',
    normalizedType: 'text',
    dimensionType: 'string',
    nullable: false,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  });

  // Table and column ids here are opaque strings ('plans-id', 'plan-code-col')
  // rather than `<table>.<column>`; the profile below is keyed by table name
  // and column name.
  const pinnedSchema: KtxEnrichedSchema = {
    connectionId: 'warehouse',
    relationships: [],
    tables: [
      {
        id: 'plans-id',
        ref: plansRef(),
        enabled: true,
        descriptions: {},
        columns: [textColumn('plan-code-col', 'plans-id', plansRef(), 'plan_code')],
      },
      {
        id: 'segments-id',
        ref: segmentsRef(),
        enabled: true,
        descriptions: {},
        columns: [textColumn('current-plan-code-col', 'segments-id', segmentsRef(), 'current_plan_code')],
      },
    ],
  };

  // Only the target column is profiled.
  const pinnedProfiles: KtxRelationshipProfileArtifact = {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [],
    warnings: [],
    columns: {
      'stg_plans.plan_code': {
        table: plansRef(),
        column: 'plan_code',
        nativeType: 'TEXT',
        normalizedType: 'text',
        rowCount: 4,
        nullCount: 0,
        distinctCount: 4,
        uniquenessRatio: 1,
        nullRate: 0,
        sampleValues: ['basic', 'enterprise', 'free', 'pro'],
        minTextLength: 4,
        maxTextLength: 10,
      },
    },
  };

  const pinnedCandidate: KtxValidatedRelationshipDiscoveryCandidate = {
    id: 'segments:(current_plan_code)->plans:(plan_code)',
    from: {
      tableId: 'segments-id',
      columnIds: ['current-plan-code-col'],
      table: segmentsRef(),
      columns: ['current_plan_code'],
    },
    to: {
      tableId: 'plans-id',
      columnIds: ['plan-code-col'],
      table: plansRef(),
      columns: ['plan_code'],
    },
    relationshipType: 'many_to_one',
    confidence: 0.902,
    source: 'column_suffix_match',
    evidence: {
      sourceColumnBase: 'current_plan',
      targetTableBase: 'plan',
      targetColumnBase: 'plan_code',
      targetKeyScore: 0.86,
      nameScore: 0.78,
      reasons: ['column_suffix_match', 'profile_unique_target'],
    },
    status: 'accepted',
    score: 0.98,
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationCount: 0,
      violationRatio: 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 4,
      parentDistinct: 4,
      overlap: 4,
      checkedValues: 4,
      reasons: ['validation_passed'],
    },
  };

  const { pks, relationships } = resolveKtxRelationshipGraph({
    schema: pinnedSchema,
    profiles: pinnedProfiles,
    candidates: [pinnedCandidate],
  });

  // Exact score values are asserted on purpose: this test pins them.
  expect(pks).toEqual([
    expect.objectContaining({
      table: 'stg_plans',
      columns: ['plan_code'],
      pkScore: 0.922,
      status: 'accepted',
    }),
  ]);
  expect(relationships).toEqual([
    expect.objectContaining({
      source: 'column_suffix_match',
      status: 'accepted',
      pkScore: 0.922,
      fkScore: 0.953,
    }),
  ]);
});
it('keeps strong profile-only primary key evidence when name evidence is weak', () => {
  // Extra table whose only column is a unique, non-null integer named `warehouse_key`.
  const events = table('events', [
    column('events', 'warehouse_key', {
      nullable: false,
      primaryKey: false,
      nativeType: 'INTEGER',
      normalizedType: 'integer',
    }),
  ]);
  const eventsRef = () => ({ catalog: null, db: null, name: 'events' });

  const baseSchema = schema();
  const baseProfiles = profiles();

  const { pks } = resolveKtxRelationshipGraph({
    schema: { ...baseSchema, tables: [...baseSchema.tables, events] },
    profiles: {
      ...baseProfiles,
      tables: [...baseProfiles.tables, { table: eventsRef(), rowCount: 3 }],
      columns: {
        ...baseProfiles.columns,
        'events.warehouse_key': {
          table: eventsRef(),
          column: 'warehouse_key',
          nativeType: 'INTEGER',
          normalizedType: 'integer',
          rowCount: 3,
          nullCount: 0,
          distinctCount: 3,
          uniquenessRatio: 1,
          nullRate: 0,
          sampleValues: ['100', '101', '102'],
          minTextLength: 3,
          maxTextLength: 3,
        },
      },
    },
    candidates: [],
  });

  // The column should still surface as a PK candidate, but only for review.
  const reviewOnlyEventsPk = expect.objectContaining({
    table: 'events',
    columns: ['warehouse_key'],
    status: 'review',
    evidence: expect.objectContaining({
      reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
    }),
  });
  expect(pks).toEqual(expect.arrayContaining([reviewOnlyEventsPk]));
});
it('keeps strong profile-only primary key evidence when the column is not key-shaped', () => {
  // Extra table whose only column is a unique, non-null integer named `opaque_reference`.
  const events = table('events', [
    column('events', 'opaque_reference', {
      nullable: false,
      primaryKey: false,
      nativeType: 'INTEGER',
      normalizedType: 'integer',
    }),
  ]);
  const eventsRef = () => ({ catalog: null, db: null, name: 'events' });

  const baseSchema = schema();
  const baseProfiles = profiles();

  const { pks } = resolveKtxRelationshipGraph({
    schema: { ...baseSchema, tables: [...baseSchema.tables, events] },
    profiles: {
      ...baseProfiles,
      tables: [...baseProfiles.tables, { table: eventsRef(), rowCount: 3 }],
      columns: {
        ...baseProfiles.columns,
        'events.opaque_reference': {
          table: eventsRef(),
          column: 'opaque_reference',
          nativeType: 'INTEGER',
          normalizedType: 'integer',
          rowCount: 3,
          nullCount: 0,
          distinctCount: 3,
          uniquenessRatio: 1,
          nullRate: 0,
          sampleValues: ['100', '101', '102'],
          minTextLength: 3,
          maxTextLength: 3,
        },
      },
    },
    candidates: [],
  });

  const eventsPk = pks.find((candidate) => candidate.table === 'events');
  expect(eventsPk).toMatchObject({
    table: 'events',
    columns: ['opaque_reference'],
    status: 'review',
    evidence: expect.objectContaining({
      reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
    }),
  });
  // Review status must not come with a score below the 0.55 floor asserted here.
  expect(eventsPk?.pkScore).toBeGreaterThanOrEqual(0.55);
});
});

View file

@ -0,0 +1,214 @@
import { describe, expect, it, vi } from 'vitest';
import type { KtxLlmRuntimePort } from '../../../src/context/llm/runtime-port.js';
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from '../../../src/context/scan/enrichment-types.js';
import type { KtxRelationshipProfileArtifact } from '../../../src/context/scan/relationship-profiling.js';
import { proposeKtxRelationshipCandidatesWithLlm } from '../../../src/context/scan/relationship-llm-proposal.js';
/**
 * Creates a mock LLM runtime whose `generateObject` resolves to `output`.
 * `generateText` and `runAgentLoop` are bare mocks with no implementation.
 */
function llmRuntime(output?: unknown): KtxLlmRuntimePort {
  const generateObject = vi.fn(async () => output) as KtxLlmRuntimePort['generateObject'];
  return {
    generateText: vi.fn(),
    generateObject,
    runAgentLoop: vi.fn(),
  };
}
/**
 * Creates an enriched column fixture whose id is `<tableId>.<name>`.
 *
 * Defaults describe a nullable, non-key INTEGER column with empty metadata.
 * `overrides` is spread last, so every key it contains replaces the default.
 *
 * @param tableId - Id of the owning table; also used as the default table ref name.
 * @param name - Column name.
 * @param overrides - Fields to replace on the default column.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  const defaults: KtxEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...defaults, ...overrides };
}
/**
 * Creates an enabled table fixture that uses `name` as both id and ref name.
 *
 * Each supplied column is copied and re-pointed at this table (`tableId` and
 * `tableRef` are overwritten), so the inputs are not mutated.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KtxEnrichedColumn[] = [];
  for (const source of columns) {
    ownedColumns.push({ ...source, tableId: name, tableRef: ref });
  }
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
/**
 * Builds the two-table schema fixture for the LLM proposal tests:
 * `customers` (id, email) and `orders` (id, buyer_ref), with no relationships.
 */
function schema(): KtxEnrichedSchema {
  const customers = table('customers', [
    column('customers', 'id', { nullable: false }),
    column('customers', 'email', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  // `buyer_ref` keeps every column default.
  const orders = table('orders', [column('orders', 'id', { nullable: false }), column('orders', 'buyer_ref')]);

  return {
    connectionId: 'warehouse',
    relationships: [],
    tables: [customers, orders],
  };
}
/**
 * Builds the profile artifact matching the LLM-proposal schema fixture.
 *
 * Both tables report 2 rows; `customers.id` and `orders.buyer_ref` are each
 * profiled as a fully unique, never-null INTEGER column with samples '1', '2'.
 */
function profile(): KtxRelationshipProfileArtifact {
  // The two profiled columns differ only in table and column name.
  const uniqueIntegerColumn = (
    tableName: string,
    columnName: string,
  ): KtxRelationshipProfileArtifact['columns'][string] => ({
    table: { catalog: null, db: null, name: tableName },
    column: columnName,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 2,
    nullCount: 0,
    distinctCount: 2,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['1', '2'],
    minTextLength: 1,
    maxTextLength: 1,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 4,
    warnings: [],
    tables: [
      { table: { catalog: null, db: null, name: 'customers' }, rowCount: 2 },
      { table: { catalog: null, db: null, name: 'orders' }, rowCount: 2 },
    ],
    columns: {
      'customers.id': uniqueIntegerColumn('customers', 'id'),
      'orders.buyer_ref': uniqueIntegerColumn('orders', 'buyer_ref'),
    },
  };
}
describe('relationship LLM proposals', () => {
  // Leading sentence of the system prompt; it must not leak into the user prompt.
  const systemPromptLead = 'You are helping KTX review possible SQL relationships';

  it('maps valid structured FK proposals into review candidates with rationale evidence', async () => {
    const buyerRationale = 'Buyer reference values match customer identifiers.';
    const runtime = llmRuntime({
      pkCandidates: [{ table: 'customers', column: 'id', confidence: 0.94, rationale: 'Unique customer identifier.' }],
      fkCandidates: [
        {
          fromTable: 'orders',
          fromColumn: 'buyer_ref',
          toTable: 'customers',
          toColumn: 'id',
          confidence: 0.88,
          rationale: buyerRationale,
        },
      ],
    });

    const proposal = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: runtime,
    });

    expect(proposal.summary).toBe('completed');
    expect(proposal.llmCalls).toBe(1);
    expect(proposal.warnings).toEqual([]);
    expect(proposal.candidates).toHaveLength(1);
    expect(proposal.candidates[0]).toMatchObject({
      from: { tableId: 'orders', columnIds: ['orders.buyer_ref'], columns: ['buyer_ref'] },
      to: { tableId: 'customers', columnIds: ['customers.id'], columns: ['id'] },
      source: 'llm_proposal',
      status: 'review',
      evidence: {
        llmConfidence: 0.88,
        llmRationale: buyerRationale,
        reasons: ['llm_proposal', 'llm_pk_proposal'],
      },
    });

    // The instructions go in `system`; `prompt` carries the serialized tables only.
    expect(runtime.generateObject).toHaveBeenCalledWith(
      expect.objectContaining({
        role: 'candidateExtraction',
        system: expect.stringContaining(systemPromptLead),
        prompt: expect.stringContaining('"tables"'),
      }),
    );
    const firstCallArgs = vi.mocked(runtime.generateObject).mock.calls[0]?.[0];
    expect(firstCallArgs?.prompt).not.toContain(systemPromptLead);
  });

  it('skips when no runtime is configured', async () => {
    const proposal = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: null,
    });

    expect(proposal).toMatchObject({ candidates: [], llmCalls: 0, summary: 'skipped' });
    expect(proposal.warnings).toEqual([]);
  });

  it('returns recoverable warnings for invalid references and generation failures', async () => {
    // Scenario 1: the model names a source column that is not in the schema.
    const runtimeWithUnknownColumn = llmRuntime({
      pkCandidates: [],
      fkCandidates: [
        {
          fromTable: 'orders',
          fromColumn: 'missing_column',
          toTable: 'customers',
          toColumn: 'id',
          confidence: 0.7,
          rationale: 'Invalid source column.',
        },
      ],
    });
    const invalidReference = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: runtimeWithUnknownColumn,
    });

    expect(invalidReference.candidates).toEqual([]);
    expect(invalidReference.summary).toBe('completed');
    expect(invalidReference.warnings[0]).toMatchObject({
      code: 'relationship_llm_invalid_reference',
      recoverable: true,
    });

    // Scenario 2: structured generation itself rejects.
    const failingRuntime: KtxLlmRuntimePort = {
      generateText: vi.fn(),
      generateObject: vi.fn(async () => {
        throw new Error('model unavailable');
      }),
      runAgentLoop: vi.fn(),
    };
    const failed = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: failingRuntime,
    });

    expect(failed).toMatchObject({ candidates: [], llmCalls: 1, summary: 'failed' });
    expect(failed.warnings[0]).toMatchObject({
      code: 'relationship_llm_proposal_failed',
      message: 'KTX relationship LLM proposal failed: model unavailable',
      recoverable: true,
    });
  });
});

View file

@ -0,0 +1,151 @@
import { describe, expect, it } from 'vitest';
import type { KtxEnrichedColumn, KtxEnrichedTable } from '../../../src/context/scan/enrichment-types.js';
import { localCandidateTables } from '../../../src/context/scan/relationship-locality.js';
/**
 * Creates an enriched column fixture with an explicit id.
 *
 * Any field missing (or nullish) in `options` falls back to a nullable,
 * non-key INTEGER column in the `public` db with empty metadata.
 *
 * @param tableId - Id of the owning table; also the default table ref name.
 * @param id - Column id, used as given.
 * @param name - Column name.
 * @param options - Optional field values; only the fields read below are used.
 */
function column(
  tableId: string,
  id: string,
  name: string,
  options: Partial<KtxEnrichedColumn> = {},
): KtxEnrichedColumn {
  const {
    tableRef,
    nativeType,
    normalizedType,
    dimensionType,
    nullable,
    primaryKey,
    parentColumnId,
    descriptions,
    embedding,
    sampleValues,
    cardinality,
  } = options;

  return {
    id,
    tableId,
    tableRef: tableRef ?? { catalog: null, db: 'public', name: tableId },
    name,
    nativeType: nativeType ?? 'INTEGER',
    normalizedType: normalizedType ?? 'integer',
    dimensionType: dimensionType ?? 'number',
    nullable: nullable ?? true,
    primaryKey: primaryKey ?? false,
    parentColumnId: parentColumnId ?? null,
    descriptions: descriptions ?? {},
    embedding: embedding ?? null,
    sampleValues: sampleValues ?? null,
    cardinality: cardinality ?? null,
  };
}
/**
 * Creates an enabled table fixture in the `public` db with separate id and name.
 *
 * Each supplied column is copied and re-pointed at this table (`tableId` and
 * `tableRef` are overwritten), so the inputs are not mutated.
 */
function table(id: string, name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: 'public', name };
  const ownedColumns: KtxEnrichedColumn[] = [];
  for (const source of columns) {
    ownedColumns.push({ ...source, tableId: id, tableRef: ref });
  }
  return {
    id,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
describe('relationship locality', () => {
  it('ranks the referenced parent table ahead of the child table for id-like source columns', () => {
    const artistTable = table('artist-id', 'Artist', [column('artist-id', 'artist-pk', 'ArtistId')]);
    const albumTable = table('album-id', 'Album', [
      column('album-id', 'album-pk', 'AlbumId'),
      column('album-id', 'artist-fk', 'ArtistId'),
    ]);
    const invoiceTable = table('invoice-id', 'Invoice', [column('invoice-id', 'invoice-pk', 'InvoiceId')]);

    // Album.ArtistId is the child column; the child table itself is among the parents.
    const ranking = localCandidateTables({
      childTable: albumTable,
      childColumn: albumTable.columns[1]!,
      parentTables: [albumTable, invoiceTable, artistTable],
      maxParentTables: 1,
    });

    const rankedNames = ranking.map((entry) => entry.table.ref.name);
    expect(rankedNames).toEqual(['Artist']);
    expect(ranking[0]).toMatchObject({
      score: expect.any(Number),
      tokenScore: expect.any(Number),
      embeddingScore: 0,
      reasons: expect.arrayContaining(['column_table_token_overlap']),
    });
  });

  it('uses singular and plural variants so plan_code can rank stg_plans', () => {
    const planTable = table('plans-id', 'stg_plans', [column('plans-id', 'plan-code', 'plan_code')]);
    const segmentTable = table('segments-id', 'mart_account_segments', [
      column('segments-id', 'current-plan-code', 'current_plan_code', {
        nativeType: 'TEXT',
        normalizedType: 'text',
        dimensionType: 'string',
      }),
    ]);
    const accountTable = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);

    const ranking = localCandidateTables({
      childTable: segmentTable,
      childColumn: segmentTable.columns[0]!,
      parentTables: [accountTable, segmentTable, planTable],
      maxParentTables: 1,
    });

    const rankedNames = ranking.map((entry) => entry.table.ref.name);
    expect(rankedNames).toEqual(['stg_plans']);
    expect(ranking[0]?.tokenScore).toBeGreaterThan(0);
  });

  it('returns all tables when the schema is smaller than the default locality cap', () => {
    const accountTable = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);
    const invoiceTable = table('invoices-id', 'invoices', [
      column('invoices-id', 'invoice-id', 'id'),
      column('invoices-id', 'account-id', 'account_id'),
    ]);

    // No `maxParentTables` here, so the default cap applies.
    const ranking = localCandidateTables({
      childTable: invoiceTable,
      childColumn: invoiceTable.columns[1]!,
      parentTables: [invoiceTable, accountTable],
    });

    // Sorted before comparing, so this asserts membership rather than rank order.
    const sortedNames = ranking.map((entry) => entry.table.ref.name).sort();
    expect(sortedNames).toEqual(['accounts', 'invoices']);
  });

  it('supports an explicit zero cap for deterministic tests', () => {
    const accountTable = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);
    const invoiceTable = table('invoices-id', 'invoices', [
      column('invoices-id', 'invoice-id', 'id'),
      column('invoices-id', 'account-id', 'account_id'),
    ]);

    const ranking = localCandidateTables({
      childTable: invoiceTable,
      childColumn: invoiceTable.columns[1]!,
      parentTables: [invoiceTable, accountTable],
      maxParentTables: 0,
    });

    expect(ranking).toEqual([]);
  });

  it('uses parent-column embeddings when token locality is weak', () => {
    // orders.buyer_ref's embedding is almost parallel to customers.id's.
    const customerTable = table('customers-id', 'customers', [
      column('customers-id', 'customers-id-col', 'id', { embedding: [1, 0, 0] }),
      column('customers-id', 'customers-name-col', 'name', {
        nativeType: 'TEXT',
        normalizedType: 'text',
        dimensionType: 'string',
        embedding: [0, 1, 0],
      }),
    ]);
    const orderTable = table('orders-id', 'orders', [
      column('orders-id', 'orders-id-col', 'id', { embedding: [0, 0, 1] }),
      column('orders-id', 'buyer-ref-col', 'buyer_ref', { embedding: [0.995, 0.005, 0] }),
    ]);
    // This parent has no embeddings at all.
    const invoiceTable = table('invoices-id', 'invoices', [column('invoices-id', 'invoice-id', 'id')]);

    const ranking = localCandidateTables({
      childTable: orderTable,
      childColumn: orderTable.columns[1]!,
      parentTables: [invoiceTable, customerTable],
      maxParentTables: 1,
    });

    const rankedNames = ranking.map((entry) => entry.table.ref.name);
    expect(rankedNames).toEqual(['customers']);
    expect(ranking[0]).toMatchObject({
      embeddingScore: expect.any(Number),
      reasons: expect.arrayContaining(['embedding_similarity']),
    });
    expect(ranking[0]!.embeddingScore).toBeGreaterThan(0.99);
  });
});

View file

@ -0,0 +1,81 @@
import { describe, expect, it } from 'vitest';
import {
normalizeKtxRelationshipName,
pluralizeKtxRelationshipToken,
singularizeKtxRelationshipToken,
tokenSimilarity,
tokenizeKtxRelationshipName,
} from '../../../src/context/scan/relationship-name-similarity.js';
describe('relationship name similarity', () => {
  // [raw name, fields the normalized result must contain]
  type NormalizationCase = [string, Record<string, unknown>];

  // Runs every case through the normalizer and checks the expected subset.
  const expectNormalized = (cases: NormalizationCase[]): void => {
    for (const [rawName, expected] of cases) {
      expect(normalizeKtxRelationshipName(rawName)).toMatchObject(expected);
    }
  };

  it('tokenizes common warehouse naming styles', () => {
    expectNormalized([
      [
        'AlbumId',
        { normalized: 'album_id', singular: 'album_id', plural: 'album_ids', tokens: ['album', 'id'] },
      ],
      ['artistID', { normalized: 'artist_id', tokens: ['artist', 'id'] }],
      [
        'SalesLT.CustomerID',
        {
          normalized: 'sales_lt_customer_id',
          singular: 'sales_lt_customer_id',
          tokens: ['sales', 'lt', 'customer', 'id'],
        },
      ],
      [
        'SCREAMING_CUSTOMER_UUID',
        { normalized: 'screaming_customer_uuid', tokens: ['screaming', 'customer', 'uuid'] },
      ],
      ['billing-account-key', { normalized: 'billing_account_key', tokens: ['billing', 'account', 'key'] }],
    ]);
  });

  it('removes only leading warehouse layer prefixes', () => {
    expectNormalized([
      [
        'mart__Sales_Accounts',
        {
          normalized: 'sales_accounts',
          singular: 'sales_account',
          plural: 'sales_accounts',
          tokens: ['sales', 'accounts'],
        },
      ],
      ['dim_users', { normalized: 'users', singular: 'user', plural: 'users', tokens: ['users'] }],
      // `dim` in the middle of a name is kept.
      ['customer_dim_id', { normalized: 'customer_dim_id', tokens: ['customer', 'dim', 'id'] }],
    ]);
  });

  it('folds accents and preserves non-suffix trailing s words', () => {
    expectNormalized([['KundénID', { normalized: 'kunden_id', tokens: ['kunden', 'id'] }]]);

    const singularCases: Array<[string, string]> = [
      ['address', 'address'],
      ['addresses', 'address'],
      ['status', 'status'],
    ];
    for (const [word, singular] of singularCases) {
      expect(singularizeKtxRelationshipToken(word)).toBe(singular);
    }

    const pluralCases: Array<[string, string]> = [
      ['address', 'addresses'],
      ['company', 'companies'],
    ];
    for (const [word, plural] of pluralCases) {
      expect(pluralizeKtxRelationshipToken(word)).toBe(plural);
    }
  });

  it('returns deterministic tokens for direct tokenization calls', () => {
    const tokenCases: Array<[string, string[]]> = [
      ['HTTPResponseCode', ['http', 'response', 'code']],
      ['customer2AddressID', ['customer', '2', 'address', 'id']],
    ];
    for (const [rawName, tokens] of tokenCases) {
      expect(tokenizeKtxRelationshipName(rawName)).toEqual(tokens);
    }
  });

  it('scores token overlap and ordered suffix similarity', () => {
    expect(tokenSimilarity('artist_id', 'artist_id')).toBe(1);
    expect(tokenSimilarity('Album.ArtistId', 'ArtistID')).toBeGreaterThanOrEqual(0.74);

    // A shared `account_id` suffix must outrank a name sharing only `id`.
    const sharedSuffixScore = tokenSimilarity('customer_account_id', 'account_id');
    const idOnlyScore = tokenSimilarity('customer_account_id', 'invoice_id');
    expect(sharedSuffixScore).toBeGreaterThan(idOnlyScore);

    expect(tokenSimilarity('', 'artist')).toBe(0);
  });
});

View file

@ -0,0 +1,430 @@
import { readFile } from 'node:fs/promises';
import { join } from 'node:path';
import { fileURLToPath } from 'node:url';
import Database from 'better-sqlite3';
import { afterEach, describe, expect, it, vi } from 'vitest';
import { getDialectForDriver } from '../../../src/context/connections/dialects.js';
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from '../../../src/context/scan/enrichment-types.js';
import { snapshotToKtxEnrichedSchema } from '../../../src/context/scan/local-enrichment.js';
import { loadKtxRelationshipBenchmarkFixture, maskKtxRelationshipBenchmarkSnapshot } from '../../../src/context/scan/relationship-benchmarks.js';
import { createKtxRelationshipProfileCache, profileKtxRelationshipSchema } from '../../../src/context/scan/relationship-profiling.js';
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanContext } from '../../../src/context/scan/types.js';
/**
 * Test executor backed by a throwaway in-memory SQLite database.
 *
 * Every call to `executeReadOnly` bumps `queryCount`, which lets tests assert
 * how many statements the code under test issued.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  queryCount = 0;

  /**
   * Runs `input.sql` and converts the row objects into a header list plus
   * positional row arrays. Headers are taken from the first row's keys, so an
   * empty result set yields an empty header list.
   */
  executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    this.queryCount += 1;
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    const firstRecord = records[0] ?? {};
    const headers = Object.keys(firstRecord);
    const total = records.length;
    return Promise.resolve({
      headers,
      rows: records.map((record) => headers.map((key) => record[key])),
      totalRows: total,
      rowCount: total,
    });
  }

  /** Releases the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
/**
 * Test executor that opens an existing SQLite file in read-only mode.
 *
 * The file must already exist (`fileMustExist: true`); each executed statement
 * increments `queryCount`.
 */
class FileSqliteExecutor {
  readonly db: Database.Database;
  queryCount = 0;

  /** @param dataPath Filesystem path of the SQLite database to open. */
  constructor(dataPath: string) {
    const openOptions = { readonly: true, fileMustExist: true };
    this.db = new Database(dataPath, openOptions);
  }

  /**
   * Runs `input.sql` and converts the row objects into a header list plus
   * positional row arrays. Headers are taken from the first row's keys, so an
   * empty result set yields an empty header list.
   */
  executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    this.queryCount += 1;
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    const firstRecord = records[0] ?? {};
    const headers = Object.keys(firstRecord);
    const total = records.length;
    return Promise.resolve({
      headers,
      rows: records.map((record) => headers.map((key) => record[key])),
      totalRows: total,
      rowCount: total,
    });
  }

  /** Releases the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
/**
 * Builds an enriched-column fixture for `tableId`.`name`.
 *
 * Defaults describe a nullable, non-key INTEGER column with no descriptions,
 * embedding, sample values, or cardinality. Any key present in `overrides`
 * replaces the corresponding default.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  const defaults: KtxEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...defaults, ...overrides };
}
/**
 * Builds an enabled enriched-table fixture called `name` with a bare
 * (catalog-less, db-less) table reference.
 *
 * Each column is copied and re-pointed at this table, so the caller's column
 * objects are left untouched; all copies share the same `ref` object.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns = columns.map((item) => ({ ...item, tableId: name, tableRef: ref }));
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
/** Wraps `tables` in an enriched schema for the fixed `warehouse` connection with no relationships. */
function schema(tables: KtxEnrichedTable[]): KtxEnrichedSchema {
  const wrapped: KtxEnrichedSchema = {
    connectionId: 'warehouse',
    tables,
    relationships: [],
  };
  return wrapped;
}
/**
 * Tests for `profileKtxRelationshipSchema`: per-table batching, the computed
 * column statistics, sampling bounds, caching, concurrency, and error isolation.
 */
describe('relationship profiling', () => {
  // Shared handle: every test that opens an in-memory database stores it here so
  // afterEach releases it even when an assertion throws midway through the test.
  let executor: InMemorySqliteExecutor | null = null;

  afterEach(() => {
    executor?.close();
    executor = null;
  });

  // Guards the implementation's shape by inspecting its source text.
  it('keeps profiling on the batched table path', async () => {
    const source = await readFile(new URL('../../../src/context/scan/relationship-profiling.ts', import.meta.url), 'utf-8');
    // NOTE(review): the identifier is assembled from two pieces, presumably so the
    // forbidden name never appears verbatim in this test file - confirm intent.
    expect(source).not.toMatch(new RegExp('queryColumn' + 'Profile'));
    expect(source).not.toMatch(/for \(const column of table\.columns\)[\s\S]*executeReadOnly/);
    expect(source).toMatch(/queryTableProfile/);
    expect(source).toMatch(/UNION ALL/);
  });

  it('profiles row count, null rate, uniqueness, sample values, and text lengths', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER, code TEXT, parent_id INTEGER);
      INSERT INTO accounts (id, code, parent_id) VALUES
      (1, 'A-1', NULL),
      (2, 'B-2', 1),
      (3, 'C-3', 1),
      (4, 'C-3', 2);
    `);
    const result = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: schema([
        table('accounts', [
          column('accounts', 'id', { primaryKey: false, nullable: false }),
          column('accounts', 'code', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
          column('accounts', 'parent_id'),
        ]),
      ]),
      executor,
      ctx: { runId: 'profile-test' },
      sampleValuesPerColumn: 3,
    });
    expect(result.sqlAvailable).toBe(true);
    // One table in the schema, so exactly one profiling query is expected.
    expect(result.queryCount).toBe(1);
    expect(executor.queryCount).toBe(1);
    expect(result.tables).toHaveLength(1);
    expect(result.tables[0]).toMatchObject({ table: { name: 'accounts' }, rowCount: 4 });
    expect(result.columns['accounts.id']).toMatchObject({
      table: { name: 'accounts' },
      column: 'id',
      rowCount: 4,
      nullCount: 0,
      distinctCount: 4,
      uniquenessRatio: 1,
      nullRate: 0,
      minTextLength: 1,
      maxTextLength: 1,
    });
    expect(result.columns['accounts.code']).toMatchObject({
      distinctCount: 3,
      uniquenessRatio: 0.75,
      sampleValues: ['C-3', 'A-1', 'B-2'],
      minTextLength: 3,
      maxTextLength: 3,
    });
    expect(result.columns['accounts.parent_id']).toMatchObject({
      nullCount: 1,
      distinctCount: 2,
      uniquenessRatio: 0.5,
      nullRate: 0.25,
    });
  });

  it('profiles each enabled table with one read-only SQL query', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER, code TEXT, parent_id INTEGER);
      CREATE TABLE users (id INTEGER, account_id INTEGER);
      INSERT INTO accounts (id, code, parent_id) VALUES
      (1, 'A-1', NULL),
      (2, 'B-2', 1),
      (3, 'C-3', 1),
      (4, 'C-3', 2);
      INSERT INTO users (id, account_id) VALUES
      (10, 1),
      (11, 1),
      (12, 2);
    `);
    const result = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: schema([
        table('accounts', [
          column('accounts', 'id', { nullable: false }),
          column('accounts', 'code', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
          column('accounts', 'parent_id'),
        ]),
        table('users', [column('users', 'id', { nullable: false }), column('users', 'account_id')]),
      ]),
      executor,
      ctx: { runId: 'profile-batched-query-count' },
      sampleValuesPerColumn: 3,
    });
    expect(result.sqlAvailable).toBe(true);
    // Two tables, two queries - regardless of how many columns each table has.
    expect(result.queryCount).toBe(2);
    expect(executor.queryCount).toBe(2);
    expect(result.tables).toEqual([
      { table: { catalog: null, db: null, name: 'accounts' }, rowCount: 4 },
      { table: { catalog: null, db: null, name: 'users' }, rowCount: 3 },
    ]);
    expect(result.columns['accounts.code']).toMatchObject({
      distinctCount: 3,
      uniquenessRatio: 0.75,
      sampleValues: ['C-3', 'A-1', 'B-2'],
    });
    expect(result.columns['users.account_id']).toMatchObject({
      rowCount: 3,
      nullCount: 0,
      distinctCount: 2,
      uniquenessRatio: 2 / 3,
    });
  });

  it('bounds column profile statistics with profileSampleRows', async () => {
    // Use the shared handle (not a shadowing local closed on the last line) so the
    // database is released by afterEach even if one of the assertions below fails.
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL, account_code TEXT NOT NULL);
      INSERT INTO accounts VALUES (1, 'a1'), (2, 'a2'), (3, 'a3'), (4, 'a4');
    `);
    const profiles = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: schema([
        table('accounts', [
          column('accounts', 'id', { nullable: false }),
          column('accounts', 'account_code', {
            nativeType: 'TEXT',
            normalizedType: 'text',
            dimensionType: 'string',
            nullable: false,
          }),
        ]),
      ]),
      executor,
      ctx: { runId: 'profile-sample-rows' },
      profileSampleRows: 2,
    });
    expect(profiles.queryCount).toBe(1);
    expect(executor.queryCount).toBe(1);
    // The table-level row count still reflects all 4 rows ...
    expect(profiles.tables).toEqual([{ table: { catalog: null, db: null, name: 'accounts' }, rowCount: 4 }]);
    // ... while the column statistics only cover the 2 sampled rows.
    expect(profiles.columns['accounts.id']).toMatchObject({
      rowCount: 2,
      distinctCount: 2,
      uniquenessRatio: 1,
    });
    expect(profiles.columns['accounts.account_code']?.sampleValues).toEqual(['a1', 'a2']);
  });

  it('reuses a profile cache inside one scan run but re-queries with a fresh cache', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL, account_code TEXT NOT NULL);
      INSERT INTO accounts VALUES (1, 'a1'), (2, 'a2'), (3, 'a2');
    `);
    const relationshipSchema = schema([
      table('accounts', [
        column('accounts', 'id', { nullable: false }),
        column('accounts', 'account_code', {
          nativeType: 'TEXT',
          normalizedType: 'text',
          dimensionType: 'string',
          nullable: false,
        }),
      ]),
    ]);
    const cache = createKtxRelationshipProfileCache();
    const first = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: relationshipSchema,
      executor,
      ctx: { runId: 'profile-cache-run' },
      cache,
    });
    // Same cache instance: the second call must be served without touching SQL.
    const second = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: relationshipSchema,
      executor,
      ctx: { runId: 'profile-cache-run' },
      cache,
    });
    // Fresh cache instance: profiling has to query again.
    const third = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: relationshipSchema,
      executor,
      ctx: { runId: 'profile-cache-fresh-run' },
      cache: createKtxRelationshipProfileCache(),
    });
    expect(first.queryCount).toBe(1);
    expect(second.queryCount).toBe(0);
    expect(third.queryCount).toBe(1);
    expect(executor.queryCount).toBe(2);
    expect(second.tables).toEqual(first.tables);
    expect(second.columns).toEqual(first.columns);
  });

  it('profiles the checked-in scale stress fixture with one query per table', async () => {
    // fileURLToPath decodes percent-escapes and yields a platform-correct path;
    // URL.pathname would break on checkouts containing spaces and on Windows.
    const fixtureRoot = fileURLToPath(new URL('../../fixtures/relationship-benchmarks', import.meta.url));
    const fixture = await loadKtxRelationshipBenchmarkFixture(join(fixtureRoot, 'scale_stress_no_declared_constraints'));
    if (!fixture.dataPath) {
      throw new Error('scale_stress_no_declared_constraints is missing data.sqlite');
    }
    const maskedSnapshot = maskKtxRelationshipBenchmarkSnapshot(
      fixture.snapshot,
      'declared_pks_and_declared_fks_removed',
    );
    const scaleExecutor = new FileSqliteExecutor(fixture.dataPath);
    try {
      const result = await profileKtxRelationshipSchema({
        connectionId: fixture.snapshot.connectionId,
        dialect: getDialectForDriver(fixture.snapshot.driver),
        schema: snapshotToKtxEnrichedSchema(maskedSnapshot, new Map()),
        executor: scaleExecutor,
        ctx: { runId: 'scale-stress-profile-query-count' },
        profileSampleRows: 3,
      });
      expect(fixture.snapshot.tables).toHaveLength(400);
      expect(result.queryCount).toBe(400);
      expect(result.queryCount).toBeLessThanOrEqual(2 * fixture.snapshot.tables.length);
      expect(scaleExecutor.queryCount).toBe(400);
    } finally {
      scaleExecutor.close();
    }
  });

  it('profiles tables concurrently up to profileConcurrency', async () => {
    let inFlight = 0;
    let maxInFlight = 0;
    // Mock executor that holds each query open for 10ms and records the peak
    // number of simultaneously running queries.
    const slowExecutor = {
      executeReadOnly: vi.fn(async (input: KtxReadOnlyQueryInput) => {
        inFlight += 1;
        maxInFlight = Math.max(maxInFlight, inFlight);
        await new Promise((resolve) => setTimeout(resolve, 10));
        inFlight -= 1;
        return {
          headers: [
            'column_name',
            'table_row_count',
            'row_count',
            'null_count',
            'distinct_count',
            'min_text_length',
            'max_text_length',
            'sample_values',
          ],
          rows: [[input.sql.includes('accounts') ? 'id' : 'account_id', 2, 2, 0, 2, 1, 2, '1\u001f2']],
          totalRows: 1,
          rowCount: 1,
        };
      }),
    };
    await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: schemaWithTables(['accounts', 'orders', 'payments', 'refunds']),
      executor: slowExecutor,
      ctx: { runId: 'profile-concurrency' },
      profileConcurrency: 4,
    });
    expect(maxInFlight).toBe(4);
  });

  it('keeps profiling other tables when one table profile fails', async () => {
    // Mock executor that rejects any query mentioning the quoted "orders" table.
    const flakyExecutor = {
      executeReadOnly: vi.fn(async (input: KtxReadOnlyQueryInput) => {
        if (input.sql.includes('"orders"')) {
          throw new Error('orders unavailable');
        }
        return {
          headers: [
            'column_name',
            'table_row_count',
            'row_count',
            'null_count',
            'distinct_count',
            'min_text_length',
            'max_text_length',
            'sample_values',
          ],
          rows: [['id', 2, 2, 0, 2, 1, 2, '1\u001f2']],
          totalRows: 1,
          rowCount: 1,
        };
      }),
    };
    const result = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: schemaWithTables(['accounts', 'orders']),
      executor: flakyExecutor,
      ctx: { runId: 'profile-error-isolated' },
      profileConcurrency: 2,
    });
    expect(result.warnings).toContain('profile_failed:orders:orders unavailable');
    expect(result.tables).toHaveLength(2);
    expect(Object.keys(result.columns)).toContain('accounts.id');
  });
});
/**
 * Builds a schema with one single-column table per entry in `names`.
 *
 * The `orders` table gets a non-key `account_id` column; every other table gets
 * a primary-key `id` column. All columns are non-nullable.
 */
function schemaWithTables(names: string[]): KtxEnrichedSchema {
  const tables = names.map((name) => {
    const isOrders = name === 'orders';
    const onlyColumn = column(name, isOrders ? 'account_id' : 'id', {
      nullable: false,
      primaryKey: !isOrders,
    });
    return table(name, [onlyColumn]);
  });
  return schema(tables);
}

View file

@ -0,0 +1,108 @@
import { describe, expect, it } from 'vitest';
import {
calibrateWeightsFromSyntheticFixtures,
defaultKtxRelationshipScoreWeights,
normalizeKtxRelationshipScoreWeights,
scoreKtxRelationshipCandidate,
type KtxRelationshipSignalVector,
} from '../../../src/context/scan/relationship-scoring.js';
/**
 * Builds a relationship signal vector for scoring tests.
 *
 * The baseline is a fully type-compatible candidate with middling name,
 * uniqueness, null-rate, and structural signals and no value or embedding
 * evidence; `overrides` replaces individual signals.
 */
function signals(overrides: Partial<KtxRelationshipSignalVector> = {}): KtxRelationshipSignalVector {
  const baseline: KtxRelationshipSignalVector = {
    nameSimilarity: 0.5,
    typeCompatibility: 1,
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: 0.5,
    profileNullRate: 0.5,
    structuralPrior: 0.5,
  };
  return { ...baseline, ...overrides };
}
/**
 * Tests for relationship candidate scoring, weight normalization, default
 * weights, and calibration from synthetic fixtures.
 */
describe('relationship scoring', () => {
  it('scores stronger evidence higher without hard-gating on names', () => {
    // Poor name match, but strong value-overlap and profile evidence.
    const profileDriven = scoreKtxRelationshipCandidate(
      signals({
        nameSimilarity: 0.05,
        typeCompatibility: 1,
        valueOverlap: 0.7,
        profileUniqueness: 1,
        profileNullRate: 1,
        structuralPrior: 0.7,
      }),
    );
    // Near-perfect name match with weak supporting evidence.
    const nameDriven = scoreKtxRelationshipCandidate(
      signals({
        nameSimilarity: 0.95,
        typeCompatibility: 1,
        valueOverlap: 0,
        profileUniqueness: 0.3,
        profileNullRate: 0.4,
        structuralPrior: 0.5,
      }),
    );

    expect(profileDriven.score).toBeGreaterThan(nameDriven.score);
    const { contributions } = profileDriven;
    expect(contributions.profileUniqueness).toBeGreaterThan(0);
    expect(contributions.nameSimilarity).toBeLessThan(0.02);
  });

  it('normalizes partial and invalid weights into a usable vector', () => {
    // Negative and infinite inputs are expected to collapse to zero weight.
    const normalized = normalizeKtxRelationshipScoreWeights({
      nameSimilarity: 3,
      typeCompatibility: -1,
      valueOverlap: Number.POSITIVE_INFINITY,
      profileUniqueness: 1,
    });
    const weightSum = Object.values(normalized).reduce((acc, weight) => acc + weight, 0);

    expect(weightSum).toBeCloseTo(1, 6);
    expect(normalized.nameSimilarity).toBeGreaterThan(normalized.profileUniqueness);
    expect(normalized.typeCompatibility).toBe(0);
    expect(normalized.valueOverlap).toBe(0);
  });

  it('returns deterministic defaults as a defensive copy', () => {
    const firstCall = defaultKtxRelationshipScoreWeights();
    const secondCall = defaultKtxRelationshipScoreWeights();

    // Equal by value, but never the same object instance.
    expect(firstCall).toEqual(secondCall);
    expect(firstCall).not.toBe(secondCall);
    const weightSum = Object.values(firstCall).reduce((acc, weight) => acc + weight, 0);
    expect(weightSum).toBeCloseTo(1, 6);
  });

  it('calibrates only from synthetic observations', () => {
    const calibrateFromPublicFixture = () =>
      calibrateWeightsFromSyntheticFixtures([
        {
          fixtureId: 'chinook_with_declared_metadata',
          origin: 'public',
          expectedRelationship: true,
          signals: signals({ nameSimilarity: 1 }),
        },
      ]);

    expect(calibrateFromPublicFixture).toThrow(/synthetic/i);
  });

  it('calibrates deterministic weights from positive and negative synthetic observations', () => {
    const calibrated = calibrateWeightsFromSyntheticFixtures([
      {
        fixtureId: 'synthetic_positive',
        origin: 'synthetic',
        expectedRelationship: true,
        signals: signals({ nameSimilarity: 0.8, valueOverlap: 0.9, profileUniqueness: 1, profileNullRate: 1 }),
      },
      {
        fixtureId: 'synthetic_negative',
        origin: 'synthetic',
        expectedRelationship: false,
        signals: signals({ nameSimilarity: 0.2, valueOverlap: 0.1, profileUniqueness: 0.4, profileNullRate: 0.5 }),
      },
    ]);
    const weightSum = Object.values(calibrated).reduce((acc, weight) => acc + weight, 0);

    expect(weightSum).toBeCloseTo(1, 6);
    expect(calibrated.valueOverlap).toBeGreaterThan(calibrated.structuralPrior);
    expect(calibrated.profileUniqueness).toBeGreaterThan(calibrated.embeddingSimilarity);
  });
});

View file

@ -0,0 +1,498 @@
import Database from 'better-sqlite3';
import { afterEach, describe, expect, it } from 'vitest';
import { getDialectForDriver } from '../../../src/context/connections/dialects.js';
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from '../../../src/context/scan/enrichment-types.js';
import { generateKtxRelationshipDiscoveryCandidates } from '../../../src/context/scan/relationship-candidates.js';
import type { KtxRelationshipProfileArtifact } from '../../../src/context/scan/relationship-profiling.js';
import { profileKtxRelationshipSchema } from '../../../src/context/scan/relationship-profiling.js';
import { validateKtxRelationshipDiscoveryCandidates } from '../../../src/context/scan/relationship-validation.js';
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanContext } from '../../../src/context/scan/types.js';
/**
 * In-memory SQLite stand-in for the scan executor used by validation tests.
 *
 * `queryCount` records how many statements were executed, so tests can check
 * whether validation SQL ran at all.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  queryCount = 0;

  /**
   * Executes `input.sql` and reshapes the returned row objects into headers
   * plus positional rows. Headers derive from the first row, so a query with no
   * rows reports no headers.
   */
  executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    this.queryCount += 1;
    const resultRows = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const sampleRow = resultRows[0] ?? {};
    const headers = Object.keys(sampleRow);
    const toPositional = (row: Record<string, unknown>) => headers.map((header) => row[header]);
    return Promise.resolve({
      headers,
      rows: resultRows.map((row) => toPositional(row)),
      totalRows: resultRows.length,
      rowCount: resultRows.length,
    });
  }

  /** Closes the in-memory database. */
  close(): void {
    this.db.close();
  }
}
/**
 * Creates an enriched-column fixture identified as `tableId.name`.
 *
 * Unless overridden, the column is a nullable, non-primary-key INTEGER with
 * empty descriptions and no embedding, samples, or cardinality. Keys present in
 * `overrides` take precedence over these defaults.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  const base: KtxEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...base, ...overrides };
}
/**
 * Creates an enabled enriched-table fixture named `name` whose reference has no
 * catalog or db.
 *
 * Columns are shallow-copied and rebound to this table (id and shared `ref`),
 * leaving the caller's column objects unchanged.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const rebound = columns.map((item) => ({ ...item, tableId: name, tableRef: ref }));
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: rebound,
  };
}
/**
 * Builds an enriched schema for the `warehouse` connection with no declared
 * relationships.
 *
 * When `tables` is omitted, a three-table fixture is used: `accounts` (id,
 * name) plus `users` and `invoices`, each with a non-nullable `id` and
 * `account_id`.
 */
function schema(tables?: KtxEnrichedTable[]): KtxEnrichedSchema {
  // Built lazily so the default fixture is only created when actually needed.
  const defaultTables = (): KtxEnrichedTable[] => [
    table('accounts', [
      column('accounts', 'id', { nullable: false }),
      column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
    ]),
    table('users', [column('users', 'id', { nullable: false }), column('users', 'account_id', { nullable: false })]),
    table('invoices', [
      column('invoices', 'id', { nullable: false }),
      column('invoices', 'account_id', { nullable: false }),
    ]),
  ];

  return {
    connectionId: 'warehouse',
    tables: tables ?? defaultTables(),
    relationships: [],
  };
}
/**
 * Tests for `validateKtxRelationshipDiscoveryCandidates`: acceptance and
 * rejection against real SQLite data, validation budgets, LLM-proposed
 * candidates, concurrency limits, and a pinned score for suffix-match candidates.
 */
describe('relationship validation', () => {
  // Shared handle: tests that open an in-memory database store it here so
  // afterEach releases it even when an assertion throws midway through the test.
  let executor: InMemorySqliteExecutor | null = null;

  afterEach(() => {
    executor?.close();
    executor = null;
  });

  it('accepts a relationship-discovery candidate with unique parent values and full source coverage', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER, name TEXT);
      CREATE TABLE users (id INTEGER, account_id INTEGER);
      CREATE TABLE invoices (id INTEGER, account_id INTEGER);
      INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex'), (3, 'Initech');
      INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2), (12, 3);
      INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 2), (22, 999);
    `);
    const testSchema = schema();
    const profiles = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: testSchema,
      executor,
      ctx: { runId: 'validate-test' },
    });
    // Only the users -> accounts candidate is under test here.
    const candidates = generateKtxRelationshipDiscoveryCandidates(testSchema).filter(
      (candidate) => candidate.from.table.name === 'users',
    );
    const validated = await validateKtxRelationshipDiscoveryCandidates({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      candidates,
      profiles,
      executor,
      ctx: { runId: 'validate-test' },
      tableCount: testSchema.tables.length,
    });
    expect(validated).toHaveLength(1);
    expect(validated[0]).toMatchObject({
      from: { table: { name: 'users' }, columns: ['account_id'] },
      to: { table: { name: 'accounts' }, columns: ['id'] },
      status: 'accepted',
      score: expect.any(Number),
      validation: {
        targetUniqueness: 1,
        sourceCoverage: 1,
        violationCount: 0,
        violationRatio: 0,
        reasons: expect.arrayContaining(['validation_passed']),
      },
    });
    expect(validated[0]?.score).toBeGreaterThanOrEqual(0.85);
  });

  it('rejects a candidate with missing parent values and records the deterministic reason', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER, name TEXT);
      CREATE TABLE users (id INTEGER, account_id INTEGER);
      CREATE TABLE invoices (id INTEGER, account_id INTEGER);
      INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
      INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2);
      INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 999), (22, 1000);
    `);
    const testSchema = schema();
    const profiles = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: testSchema,
      executor,
      ctx: { runId: 'validate-test' },
    });
    // Two of the three invoice account_ids (999, 1000) have no matching account.
    const candidates = generateKtxRelationshipDiscoveryCandidates(testSchema).filter(
      (candidate) => candidate.from.table.name === 'invoices',
    );
    const validated = await validateKtxRelationshipDiscoveryCandidates({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      candidates,
      profiles,
      executor,
      ctx: { runId: 'validate-test' },
      tableCount: testSchema.tables.length,
      settings: {
        minSourceCoverage: 0.9,
        maxViolationRatio: 0.01,
      },
    });
    expect(validated).toHaveLength(1);
    expect(validated[0]).toMatchObject({
      from: { table: { name: 'invoices' }, columns: ['account_id'] },
      to: { table: { name: 'accounts' }, columns: ['id'] },
      status: 'rejected',
      validation: {
        sourceCoverage: 1 / 3,
        violationCount: 2,
        violationRatio: 2 / 3,
        reasons: expect.arrayContaining(['low_source_coverage', 'excessive_violations']),
      },
    });
  });

  it('keeps over-budget candidates review-only without executing coverage SQL for them', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER, name TEXT);
      CREATE TABLE users (id INTEGER, account_id INTEGER);
      CREATE TABLE invoices (id INTEGER, account_id INTEGER);
      INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex'), (3, 'Initech');
      INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2), (12, 3);
      INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 2), (22, 3);
    `);
    const testSchema = schema();
    const profiles = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: testSchema,
      executor,
      ctx: { runId: 'validate-budget-profile' },
    });
    // Reset so only validation queries are counted from here on.
    executor.queryCount = 0;
    // Give the users candidate the higher confidence so it is the one that
    // consumes the single unit of validation budget.
    const candidates = generateKtxRelationshipDiscoveryCandidates(testSchema).map((candidate) => ({
      ...candidate,
      confidence: candidate.from.table.name === 'users' ? 0.99 : 0.5,
    }));
    const validated = await validateKtxRelationshipDiscoveryCandidates({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      candidates,
      profiles,
      executor,
      ctx: { runId: 'validate-budget' },
      tableCount: testSchema.tables.length,
      settings: {
        validationBudget: 1,
      },
    });
    expect(executor.queryCount).toBe(1);
    expect(validated).toHaveLength(2);
    expect(validated.find((candidate) => candidate.from.table.name === 'users')).toMatchObject({
      status: 'accepted',
      validation: { reasons: expect.arrayContaining(['validation_passed']) },
    });
    expect(validated.find((candidate) => candidate.from.table.name === 'invoices')).toMatchObject({
      status: 'review',
      validation: {
        reasons: ['validation_unattempted'],
      },
    });
  });

  it('treats validation budget zero as review-only validation without coverage SQL', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER, name TEXT);
      CREATE TABLE users (id INTEGER, account_id INTEGER);
      INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
      INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2);
    `);
    const testSchema = schema([
      table('accounts', [
        column('accounts', 'id', { nullable: false }),
        column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
      ]),
      table('users', [column('users', 'id', { nullable: false }), column('users', 'account_id', { nullable: false })]),
    ]);
    const profiles = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: testSchema,
      executor,
      ctx: { runId: 'validate-zero-budget-profile' },
    });
    // Reset so only validation queries are counted from here on.
    executor.queryCount = 0;
    const candidates = generateKtxRelationshipDiscoveryCandidates(testSchema);
    const validated = await validateKtxRelationshipDiscoveryCandidates({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      candidates,
      profiles,
      executor,
      ctx: { runId: 'validate-zero-budget' },
      tableCount: testSchema.tables.length,
      settings: {
        validationBudget: 0,
      },
    });
    expect(executor.queryCount).toBe(0);
    expect(validated).toHaveLength(1);
    expect(validated[0]).toMatchObject({
      status: 'review',
      score: expect.any(Number),
      validation: {
        checkedValues: 0,
        reasons: ['validation_unattempted'],
      },
    });
  });

  it('marks rejected LLM proposals with the spec rejection reason', async () => {
    executor = new InMemorySqliteExecutor();
    // orders.buyer_ref values (98, 99) never match customers.id (1, 2).
    executor.db.exec(`
      CREATE TABLE customers (id INTEGER);
      CREATE TABLE orders (buyer_ref INTEGER);
      INSERT INTO customers (id) VALUES (1), (2);
      INSERT INTO orders (buyer_ref) VALUES (98), (99);
    `);
    const testSchema = schema([
      table('customers', [column('customers', 'id', { nullable: false })]),
      table('orders', [column('orders', 'buyer_ref')]),
    ]);
    const profiles = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: testSchema,
      executor,
      ctx: { runId: 'llm-rejected-validation' },
    });
    // Generate a base candidate from a schema whose column name (`customer_id`)
    // the generator recognizes, then retarget it at `buyer_ref` below.
    const [candidate] = generateKtxRelationshipDiscoveryCandidates(
      schema([
        table('customers', [column('customers', 'id', { nullable: false })]),
        table('orders', [column('orders', 'customer_id')]),
      ]),
    );
    if (!candidate) {
      throw new Error('Expected base candidate');
    }
    const llmCandidate = {
      ...candidate,
      id: 'orders:(orders.buyer_ref)->customers:(customers.id)',
      from: { ...candidate.from, columnIds: ['orders.buyer_ref'], columns: ['buyer_ref'] },
      source: 'llm_proposal' as const,
      evidence: {
        ...candidate.evidence,
        reasons: ['llm_proposal'],
        llmConfidence: 0.84,
        llmRationale: 'Buyer references should map to customers.',
      },
    };
    const [validated] = await validateKtxRelationshipDiscoveryCandidates({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      candidates: [llmCandidate],
      profiles,
      executor,
      ctx: { runId: 'llm-rejected-validation' },
      tableCount: testSchema.tables.length,
    });
    expect(validated?.status).toBe('rejected');
    expect(validated?.validation.reasons).toEqual(
      expect.arrayContaining(['low_source_coverage', 'llm_proposed_but_validation_failed']),
    );
  });

  it('limits validation query concurrency', async () => {
    // Register the database with the shared handle (instead of closing it on the
    // last line) so afterEach releases it even if the assertion below fails. The
    // local const keeps a non-null reference for use inside the closure.
    const sqlite = new InMemorySqliteExecutor();
    executor = sqlite;
    sqlite.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      CREATE TABLE invoices (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts VALUES (1), (2);
      INSERT INTO orders VALUES (10, 1), (11, 2);
      INSERT INTO invoices VALUES (20, 1), (21, 2);
    `);
    let active = 0;
    let maxActive = 0;
    // Wrapper that tracks the peak number of overlapping queries. Queries
    // containing 'WITH child_values' are delayed by 10ms so overlapping calls
    // would be observable.
    const throttled = {
      executeReadOnly: async (input: KtxReadOnlyQueryInput, ctx: KtxScanContext) => {
        active += 1;
        maxActive = Math.max(maxActive, active);
        try {
          await new Promise((resolve) => setTimeout(resolve, input.sql.includes('WITH child_values') ? 10 : 0));
          return await sqlite.executeReadOnly(input, ctx);
        } finally {
          // Always release the slot, even when the query throws, so a failure
          // cannot inflate the counter for later queries.
          active -= 1;
        }
      },
    };
    const testSchema = schema([
      table('accounts', [column('accounts', 'id', { nullable: false })]),
      table('orders', [column('orders', 'id', { nullable: false }), column('orders', 'account_id')]),
      table('invoices', [column('invoices', 'id', { nullable: false }), column('invoices', 'account_id')]),
    ]);
    const profiles = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: testSchema,
      executor: sqlite,
      ctx: { runId: 'validation-concurrency-profile' },
    });
    const candidates = generateKtxRelationshipDiscoveryCandidates(testSchema);
    await validateKtxRelationshipDiscoveryCandidates({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      candidates,
      profiles,
      executor: throttled,
      ctx: { runId: 'validation-concurrency' },
      tableCount: testSchema.tables.length,
      settings: { concurrency: 1 },
    });
    expect(maxActive).toBe(1);
  });

  it('pins column_suffix_match validation scoring for plan-code suffix candidates', async () => {
    // Hand-built candidate and profiles: no database is involved in this test.
    const candidate = {
      id: 'mart:(current_plan_code)->plans:(plan_code)',
      from: {
        tableId: 'mart-account-segments-id',
        columnIds: ['current-plan-code-col'],
        table: { catalog: null, db: null, name: 'mart_account_segments' },
        columns: ['current_plan_code'],
      },
      to: {
        tableId: 'plans-id',
        columnIds: ['plan-code-col'],
        table: { catalog: null, db: null, name: 'stg_plans' },
        columns: ['plan_code'],
      },
      relationshipType: 'many_to_one' as const,
      confidence: 0.902,
      source: 'column_suffix_match' as const,
      status: 'review' as const,
      evidence: {
        sourceColumnBase: 'current_plan',
        targetTableBase: 'plan',
        targetColumnBase: 'plan_code',
        targetKeyScore: 0.86,
        nameScore: 0.78,
        reasons: ['column_suffix_match', 'profile_unique_target'],
      },
    };
    const profiles = {
      connectionId: 'warehouse',
      driver: 'sqlite',
      sqlAvailable: true,
      queryCount: 0,
      tables: [],
      warnings: [],
      columns: {
        'mart_account_segments.current_plan_code': {
          table: { catalog: null, db: null, name: 'mart_account_segments' },
          column: 'current_plan_code',
          nativeType: 'TEXT',
          normalizedType: 'text',
          rowCount: 4,
          nullCount: 0,
          distinctCount: 4,
          uniquenessRatio: 1,
          nullRate: 0,
          sampleValues: ['basic', 'enterprise', 'free', 'pro'],
          minTextLength: 4,
          maxTextLength: 10,
        },
        'stg_plans.plan_code': {
          table: { catalog: null, db: null, name: 'stg_plans' },
          column: 'plan_code',
          nativeType: 'TEXT',
          normalizedType: 'text',
          rowCount: 4,
          nullCount: 0,
          distinctCount: 4,
          uniquenessRatio: 1,
          nullRate: 0,
          sampleValues: ['basic', 'enterprise', 'free', 'pro'],
          minTextLength: 4,
          maxTextLength: 10,
        },
      },
    } satisfies KtxRelationshipProfileArtifact;
    // Stub returning a canned coverage row: all 4 child values found in the
    // parent, zero violations. Named distinctly so it does not shadow the shared
    // database handle.
    const stubExecutor = {
      async executeReadOnly() {
        return {
          headers: ['child_distinct', 'parent_distinct', 'overlap', 'violation_count'],
          rows: [[4, 4, 4, 0]],
          rowCount: 1,
          totalRows: 1,
        };
      },
    };
    const [validated] = await validateKtxRelationshipDiscoveryCandidates({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      candidates: [candidate],
      profiles,
      executor: stubExecutor,
      ctx: { runId: 'rule-b-validation-score' },
      tableCount: 2,
    });
    expect(validated).toMatchObject({
      status: 'accepted',
      score: 0.98,
      validation: {
        targetUniqueness: 1,
        sourceCoverage: 1,
        violationRatio: 0,
        reasons: ['validation_passed'],
      },
    });
  });
});

View file

@ -0,0 +1,67 @@
import { describe, expect, it } from 'vitest';
import {
scopedTableNames,
tableRefFromKey,
tableRefKey,
tableRefSet,
type KtxTableRefKey,
} from '../../../src/context/scan/table-ref.js';
/** Encoding a table ref to its key and decoding it again must return an equal ref. */
describe('tableRefKey roundtrip', () => {
  it('encodes and decodes a three-part ref', () => {
    const original = { catalog: 'ANALYTICS', db: 'MARTS', name: 'LISTINGS' };
    const decoded = tableRefFromKey(tableRefKey(original));
    expect(decoded).toEqual(original);
  });

  it('treats null catalog/db as the empty segment', () => {
    const original = { catalog: null, db: 'public', name: 'users' };
    const decoded = tableRefFromKey(tableRefKey(original));
    expect(decoded).toEqual(original);
  });

  it('roundtrips a bare-name ref', () => {
    const original = { catalog: null, db: null, name: 'orders' };
    const decoded = tableRefFromKey(tableRefKey(original));
    expect(decoded).toEqual(original);
  });
});
/** `tableRefSet` membership is checked through the keys produced by `tableRefKey`. */
describe('tableRefSet', () => {
  it('produces a set with member-equality on canonical keys', () => {
    const listings = { catalog: 'ANALYTICS', db: 'MARTS', name: 'LISTINGS' };
    const items = { catalog: 'ANALYTICS', db: 'MARTS', name: 'ITEMS' };
    const refs = tableRefSet([listings, items]);

    expect(refs.size).toBe(2);

    // A structurally equal ref built separately must still be found.
    const presentKey = tableRefKey({ catalog: 'ANALYTICS', db: 'MARTS', name: 'LISTINGS' });
    expect(refs.has(presentKey)).toBe(true);

    const absentKey = tableRefKey({ catalog: 'ANALYTICS', db: 'MARTS', name: 'OTHER' });
    expect(refs.has(absentKey)).toBe(false);
  });
});
/** `scopedTableNames` lists the table names of a scope that sit in one exact (catalog, db) namespace. */
describe('scopedTableNames', () => {
  it('projects to the requested (catalog, db) namespace', () => {
    const refs = tableRefSet([
      { catalog: 'ANALYTICS', db: 'MARTS', name: 'LISTINGS' },
      { catalog: 'ANALYTICS', db: 'MARTS', name: 'ITEMS' },
      { catalog: 'ANALYTICS', db: 'STAGING', name: 'LISTINGS' },
    ]);

    // Sorted because only membership, not order, is asserted for this namespace.
    const martsNames = scopedTableNames(refs, { catalog: 'ANALYTICS', db: 'MARTS' }).sort();
    expect(martsNames).toEqual(['ITEMS', 'LISTINGS']);

    const stagingNames = scopedTableNames(refs, { catalog: 'ANALYTICS', db: 'STAGING' });
    expect(stagingNames).toEqual(['LISTINGS']);
  });

  it('requires non-null scope segments to match the namespace', () => {
    // A scope entry with a null catalog does not match a concrete catalog.
    const refs = tableRefSet([{ catalog: null, db: 'public', name: 'users' }]);
    const names = scopedTableNames(refs, { catalog: 'any-catalog', db: 'public' });
    expect(names).toEqual([]);
  });

  it('returns empty when no scope entry matches the namespace', () => {
    const refs = tableRefSet([{ catalog: 'A', db: 'B', name: 'C' }]);
    const names = scopedTableNames(refs, { catalog: 'X', db: 'Y' });
    expect(names).toEqual([]);
  });

  it('dedupes exact namespace matches only', () => {
    const refs: ReadonlySet<KtxTableRefKey> = tableRefSet([
      { catalog: null, db: 'public', name: 'users' },
      { catalog: 'A', db: 'public', name: 'users' },
    ]);
    const names = scopedTableNames(refs, { catalog: 'A', db: 'public' });
    expect(names).toEqual(['users']);
  });
});

View file

@ -0,0 +1,24 @@
import { describe, expect, it } from 'vitest';
import { inferKtxDimensionType, ktxColumnTypeMappingFromNative, normalizeKtxNativeType } from '../../../src/context/scan/type-normalization.js';
describe('KTX scan type normalization', () => {
  it('normalizes native database type strings', () => {
    // [raw native type, expected normalized type]
    const cases: ReadonlyArray<readonly [string, string]> = [
      [' NUMERIC(12, 2) ', 'numeric'],
      ['TIMESTAMP WITH TIME ZONE', 'timestamp with time zone'],
      ['', 'unknown'],
    ];
    for (const [nativeType, normalized] of cases) {
      expect(normalizeKtxNativeType(nativeType)).toBe(normalized);
    }
  });

  it('infers dimension types from native types', () => {
    // [native type, expected dimension type]
    const cases: ReadonlyArray<readonly [string, string]> = [
      ['BOOLEAN', 'boolean'],
      ['timestamp with time zone', 'time'],
      ['decimal(10,2)', 'number'],
      ['varchar(255)', 'string'],
    ];
    for (const [nativeType, dimensionType] of cases) {
      expect(inferKtxDimensionType(nativeType)).toBe(dimensionType);
    }
  });

  it('builds a complete column type mapping', () => {
    const mapping = ktxColumnTypeMappingFromNative('BIGINT');
    expect(mapping).toEqual({ normalizedType: 'bigint', dimensionType: 'number' });
  });
});

View file

@ -0,0 +1,262 @@
import { describe, expect, it } from 'vitest';
import {
createKtxConnectorCapabilities,
type KtxEventPropertyDiscovery,
type KtxEventPropertyDiscoveryInput,
type KtxEventPropertyValuesInput,
type KtxEventPropertyValuesResult,
type KtxEventStreamDiscoveryPort,
type KtxEventTypeDiscovery,
type KtxEventTypeDiscoveryInput,
type KtxNetworkEndpoint,
type KtxNetworkTunnelPort,
type KtxQueryResult,
type KtxScanConnector,
type KtxScanContext,
type KtxScanInput,
type KtxSchemaSnapshot,
} from '../../../src/context/scan/types.js';
describe('KTX scan contract types', () => {
it('defaults to structural-only connector capabilities', () => {
  // Called with no overrides at all.
  const defaults = createKtxConnectorCapabilities();
  const structuralOnly = {
    structuralIntrospection: true,
    tableSampling: false,
    columnSampling: false,
    columnStats: false,
    readOnlySql: false,
    nestedAnalysis: false,
    eventStreamDiscovery: false,
    formalForeignKeys: false,
    estimatedRowCounts: false,
  };
  expect(defaults).toEqual(structuralOnly);
});
it('keeps structural introspection mandatory when optional capabilities are enabled', () => {
  // Four optional flags are switched on; structuralIntrospection is not passed.
  const capabilities = createKtxConnectorCapabilities({
    tableSampling: true,
    readOnlySql: true,
    eventStreamDiscovery: true,
    estimatedRowCounts: true,
  });
  const expected = {
    structuralIntrospection: true,
    tableSampling: true,
    columnSampling: false,
    columnStats: false,
    readOnlySql: true,
    nestedAnalysis: false,
    eventStreamDiscovery: true,
    formalForeignKeys: false,
    estimatedRowCounts: true,
  };
  expect(capabilities).toEqual(expected);
});
it('describes the connector surface without requiring enrichment methods', async () => {
  // Fixture snapshot: a single `public.orders` table with one primary-key column.
  const snapshot: KtxSchemaSnapshot = {
    connectionId: 'warehouse',
    driver: 'postgres',
    extractedAt: '2026-04-29T00:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: { source: 'unit-test' },
    tables: [
      {
        catalog: null,
        db: 'public',
        name: 'orders',
        kind: 'table',
        comment: 'Customer orders',
        estimatedRows: 42,
        columns: [
          {
            name: 'id',
            nativeType: 'integer',
            normalizedType: 'integer',
            dimensionType: 'number',
            nullable: false,
            primaryKey: true,
            comment: 'Primary key',
          },
        ],
        foreignKeys: [],
      },
    ],
  };
  const scanInput: KtxScanInput = {
    connectionId: 'warehouse',
    driver: 'postgres',
    scope: { schemas: ['public'] },
    mode: 'structural',
  };
  const scanContext: KtxScanContext = { runId: 'scan-run-1' };

  // This literal supplies only id, driver, capabilities, introspect,
  // listSchemas and listTables.
  const connector: KtxScanConnector = {
    id: 'test-postgres',
    driver: 'postgres',
    capabilities: createKtxConnectorCapabilities({ estimatedRowCounts: true }),
    async introspect(input: KtxScanInput, ctx: KtxScanContext) {
      // Check that both arguments arrive as passed before handing back the fixture.
      expect(input.connectionId).toBe('warehouse');
      expect(ctx.runId).toBe('scan-run-1');
      return snapshot;
    },
    async listSchemas() {
      return [];
    },
    async listTables() {
      return [];
    },
  };

  const introspection = connector.introspect(scanInput, scanContext);
  await expect(introspection).resolves.toEqual(snapshot);
});
it('models optional event-stream discovery as a connector capability and port', async () => {
  // Table ref and run context shared by all three discovery calls.
  const eventsTable = { catalog: '157881', db: null, name: 'events' };
  const scanContext = { runId: 'scan-run-1' };

  // One input per port method; each is both passed to the port and used as the
  // value the stub expects to receive.
  const eventTypesInput: KtxEventTypeDiscoveryInput = {
    connectionId: 'product',
    table: eventsTable,
    eventColumn: 'event',
    limit: 2,
    minCount: 30,
    lookbackDays: 14,
  };
  const propertyKeysInput: KtxEventPropertyDiscoveryInput = {
    connectionId: 'product',
    table: eventsTable,
    jsonColumn: 'properties',
    sampleSize: 1000,
    limit: 5,
    lookbackDays: 7,
  };
  const propertyValuesInput: KtxEventPropertyValuesInput = {
    connectionId: 'product',
    table: eventsTable,
    jsonColumn: 'properties',
    propertyKey: '$browser',
    limit: 3,
    maxCardinality: 1000,
    lookbackDays: 30,
  };

  // Canned answers returned by the stub port.
  const eventTypes: KtxEventTypeDiscovery[] = [{ value: '$pageview', count: 42 }];
  const propertyKeys: KtxEventPropertyDiscovery[] = [{ key: '$browser', count: 31 }];
  const propertyValues: KtxEventPropertyValuesResult = { values: ['Chrome', 'Safari'], cardinality: 2 };

  const discovery: KtxEventStreamDiscoveryPort = {
    async listEventTypes(input: KtxEventTypeDiscoveryInput) {
      expect(input).toEqual(eventTypesInput);
      return eventTypes;
    },
    async listPropertyKeys(input: KtxEventPropertyDiscoveryInput) {
      expect(input).toEqual(propertyKeysInput);
      return propertyKeys;
    },
    async listPropertyValues(input: KtxEventPropertyValuesInput) {
      expect(input).toEqual(propertyValuesInput);
      return propertyValues;
    },
  };

  const connector: KtxScanConnector = {
    id: 'clickhouse:product',
    driver: 'clickhouse',
    capabilities: createKtxConnectorCapabilities({ eventStreamDiscovery: true }),
    eventStreamDiscovery: discovery,
    // Never invoked in this test; present to complete the connector literal.
    async introspect() {
      return {
        connectionId: 'product',
        driver: 'clickhouse',
        extractedAt: '2026-04-29T00:00:00.000Z',
        scope: { catalogs: ['157881'] },
        metadata: {},
        tables: [],
      };
    },
    async listSchemas() {
      return [];
    },
    async listTables() {
      return [];
    },
  };

  // The port is reached through the connector's optional property each time.
  const listedEventTypes = connector.eventStreamDiscovery?.listEventTypes(eventTypesInput, scanContext);
  await expect(listedEventTypes).resolves.toEqual([{ value: '$pageview', count: 42 }]);

  const listedPropertyKeys = connector.eventStreamDiscovery?.listPropertyKeys(propertyKeysInput, scanContext);
  await expect(listedPropertyKeys).resolves.toEqual([{ key: '$browser', count: 31 }]);

  const listedPropertyValues = connector.eventStreamDiscovery?.listPropertyValues(propertyValuesInput, scanContext);
  await expect(listedPropertyValues).resolves.toEqual({ values: ['Chrome', 'Safari'], cardinality: 2 });
});
it('keeps read-only query results separate from schema snapshots', () => {
  // NOTE(review): presumably the KtxQueryResult annotation is what this test
  // really exercises; the runtime check compares the literal to an equal one.
  const queryResult: KtxQueryResult = {
    headers: ['id', 'amount'],
    headerTypes: ['integer', 'numeric'],
    rows: [[1, 10.5]],
    totalRows: 1,
    rowCount: 1,
  };
  const expectedShape = {
    headers: ['id', 'amount'],
    headerTypes: ['integer', 'numeric'],
    rows: [[1, 10.5]],
    totalRows: 1,
    rowCount: 1,
  };
  expect(queryResult).toEqual(expectedShape);
});
it('models host-provided network tunnel endpoint resolution without app imports', async () => {
  // Connection shape used to parameterize the tunnel port in this test.
  type ProxiedConnection = { networkProxy?: { type: 'ssh_tunnel' } };

  const localEndpoint: KtxNetworkEndpoint = {
    host: '127.0.0.1',
    port: 15432,
    close: async () => undefined,
  };

  const tunnelPort: KtxNetworkTunnelPort<ProxiedConnection> = {
    async resolveEndpoint(request) {
      // The stub checks it received the full request before answering.
      expect(request).toEqual({
        connectionId: 'warehouse',
        driver: 'postgres',
        host: 'db.internal',
        port: 5432,
        connection: { networkProxy: { type: 'ssh_tunnel' } },
      });
      return localEndpoint;
    },
  };

  const resolution = tunnelPort.resolveEndpoint({
    connectionId: 'warehouse',
    driver: 'postgres',
    host: 'db.internal',
    port: 5432,
    connection: { networkProxy: { type: 'ssh_tunnel' } },
  });
  // toBe: the very same endpoint object must come back, not just an equal one.
  await expect(resolution).resolves.toBe(localEndpoint);
});
});

View file

@ -0,0 +1,216 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js';
import { WarehouseCatalogService } from '../../../src/context/scan/warehouse-catalog.js';
describe('WarehouseCatalogService', () => {
// Per-test fixtures: a scratch directory and a KTX project created inside it.
let tempDir: string;
let project: KtxLocalProject;

beforeEach(async () => {
  const scratchPrefix = join(tmpdir(), 'ktx-warehouse-catalog-');
  tempDir = await mkdtemp(scratchPrefix);
  const projectDir = join(tempDir, 'project');
  project = await initKtxProject({ projectDir });
});

afterEach(async () => {
  // Remove the whole scratch directory, project included.
  await rm(tempDir, { recursive: true, force: true });
});
/**
 * Writes a minimal live-database scan for one connection into the project's
 * file store: connection.json, tables/orders.json and
 * enrichment/relationship-profile.json, all under
 * raw-sources/<connectionId>/live-database/<syncId>.
 *
 * The `orders` table ref gets catalog 'analytics' only for the 'bigquery'
 * driver and a null db only for the 'sqlite' driver; otherwise catalog is
 * null and db is 'public'.
 */
async function seedLiveDatabaseScan(connectionId = 'warehouse', syncId = 'sync-2', driver = 'postgres') {
  const root = `raw-sources/${connectionId}/live-database/${syncId}`;
  const tableRef = {
    catalog: driver === 'bigquery' ? 'analytics' : null,
    db: driver === 'sqlite' ? null : 'public',
    name: 'orders',
  };

  // Serializes `payload` as two-space-indented JSON and writes it below `root`.
  // The trailing writeFile arguments are presumably author name, author email
  // and a message — confirm against the file store API.
  const writeSeedJson = (relativePath: string, payload: unknown, message: string) =>
    project.fileStore.writeFile(
      `${root}/${relativePath}`,
      JSON.stringify(payload, null, 2),
      'ktx',
      'ktx@example.com',
      message,
    );

  await writeSeedJson(
    'connection.json',
    { connectionId, driver, extractedAt: '2026-05-12T00:00:00.000Z' },
    'seed connection',
  );

  const idColumn = {
    name: 'id',
    nativeType: 'integer',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: true,
    comment: 'Order id',
  };
  const statusColumn = {
    name: 'status',
    nativeType: 'text',
    normalizedType: 'text',
    dimensionType: 'string',
    nullable: false,
    primaryKey: false,
    comment: 'Order status',
  };
  await writeSeedJson(
    'tables/orders.json',
    {
      // Spread first so catalog, db, name stay the leading keys of the JSON.
      ...tableRef,
      kind: 'table',
      comment: 'Customer orders',
      estimatedRows: 12,
      columns: [idColumn, statusColumn],
      foreignKeys: [],
    },
    'seed orders',
  );

  const statusProfile = {
    table: { ...tableRef },
    column: 'status',
    nativeType: 'text',
    normalizedType: 'text',
    rowCount: 12,
    nullCount: 0,
    distinctCount: 2,
    uniquenessRatio: 0.1667,
    nullRate: 0,
    sampleValues: ['paid', 'refunded'],
    minTextLength: 4,
    maxTextLength: 8,
  };
  await writeSeedJson(
    'enrichment/relationship-profile.json',
    {
      connectionId,
      driver,
      sqlAvailable: true,
      queryCount: 3,
      tables: [{ table: { ...tableRef }, rowCount: 12 }],
      columns: { 'orders.status': statusProfile },
      warnings: [],
    },
    'seed profile',
  );
}
it('finds the latest sync and merges table schema with relationship profile values', async () => {
  // Two scans for the same connection, written one after the other.
  for (const syncId of ['sync-1', 'sync-2']) {
    await seedLiveDatabaseScan('warehouse', syncId);
  }
  const service = new WarehouseCatalogService({ fileStore: project.fileStore });

  // Property name that must NOT appear on results. It is assembled from two
  // pieces, as the original did — presumably on purpose; confirm before inlining.
  const forbiddenKey = ['connection', 'Name'].join('');

  await expect(service.getLatestSyncId('warehouse')).resolves.toBe('sync-2');

  const ordersDetail = await service.getTable({
    connectionId: 'warehouse',
    catalog: null,
    db: 'public',
    name: 'orders',
  });
  // `status` carries sampleValues/distinctCount, which the seed data only
  // provides in the relationship profile.
  expect(ordersDetail).toMatchObject({
    connectionId: 'warehouse',
    display: 'public.orders',
    rowCount: 12,
    columns: [
      { name: 'id', nativeType: 'integer', primaryKey: true },
      { name: 'status', nativeType: 'text', sampleValues: ['paid', 'refunded'], distinctCount: 2 },
    ],
  });
  expect(ordersDetail).not.toHaveProperty(forbiddenKey);

  const hits = await service.searchByName('warehouse', 'orders', 5);
  const topHit = hits[0];
  expect(topHit).toMatchObject({
    kind: 'table',
    connectionId: 'warehouse',
    display: 'public.orders',
  });
  expect(topHit).not.toHaveProperty(forbiddenKey);
});
it('returns scanAvailable=false when no live-database scan exists', async () => {
  // NOTE(review): the title mentions scanAvailable, but the assertions only
  // cover getTable and hasScan — confirm the title is still current.
  const service = new WarehouseCatalogService({ fileStore: project.fileStore });

  // Nothing has been seeded for the 'missing' connection.
  const missingTable = service.getTable({ connectionId: 'missing', catalog: null, db: 'public', name: 'orders' });
  await expect(missingTable).resolves.toBeNull();

  const scanPresent = service.hasScan('missing');
  await expect(scanPresent).resolves.toBe(false);
});
it('resolves postgres display strings and returns closest candidates for missing tables', async () => {
  await seedLiveDatabaseScan();
  const service = new WarehouseCatalogService({ fileStore: project.fileStore });

  // Exact display string: resolves with no candidates.
  const exactMatch = service.resolveDisplay('warehouse', 'public.orders');
  await expect(exactMatch).resolves.toMatchObject({
    resolved: { catalog: null, db: 'public', name: 'orders' },
    candidates: [],
    dialect: 'postgres',
  });

  // Misspelled table name: nothing resolves, `orders` comes back as a candidate.
  const nearMiss = service.resolveDisplay('warehouse', 'public.orderz');
  await expect(nearMiss).resolves.toMatchObject({
    resolved: null,
    candidates: [{ name: 'orders' }],
  });
});
it('keeps one-part table display fallback for loose catalog resolution', async () => {
  await seedLiveDatabaseScan();
  const service = new WarehouseCatalogService({ fileStore: project.fileStore });

  // Only the bare table name is given; the seeded table lives in `public`.
  const bareNameLookup = service.resolveDisplay('warehouse', 'orders');
  await expect(bareNameLookup).resolves.toMatchObject({
    resolved: { catalog: null, db: 'public', name: 'orders' },
    candidates: [],
    dialect: 'postgres',
  });
});
it('treats two-part BigQuery identifiers as ambiguous instead of guessing', async () => {
  // The bigquery seed stores the table with catalog 'analytics' and db 'public'.
  await seedLiveDatabaseScan('warehouse', 'sync-bigquery', 'bigquery');
  const service = new WarehouseCatalogService({ fileStore: project.fileStore });

  const twoPartLookup = service.resolveDisplay('warehouse', 'public.orders');
  await expect(twoPartLookup).resolves.toMatchObject({
    resolved: null,
    dialect: 'bigquery',
  });
});
it('resolves postgres column display strings without treating the column as a table', async () => {
  await seedLiveDatabaseScan();
  const service = new WarehouseCatalogService({ fileStore: project.fileStore });

  // Three parts on postgres: the last segment is expected back as `column`.
  const columnLookup = service.resolveDisplayTarget('warehouse', 'public.orders.status');
  await expect(columnLookup).resolves.toMatchObject({
    resolved: { catalog: null, db: 'public', name: 'orders', column: 'status' },
    candidates: [],
    dialect: 'postgres',
  });
});
it('resolves BigQuery column display strings with four parts', async () => {
  await seedLiveDatabaseScan('warehouse', 'sync-bigquery', 'bigquery');
  const service = new WarehouseCatalogService({ fileStore: project.fileStore });

  // Four parts on bigquery: catalog, db, table name and column are all expected back.
  const columnLookup = service.resolveDisplayTarget('warehouse', 'analytics.public.orders.status');
  await expect(columnLookup).resolves.toMatchObject({
    resolved: { catalog: 'analytics', db: 'public', name: 'orders', column: 'status' },
    candidates: [],
    dialect: 'bigquery',
  });
});
it('searches table names, column names, comments, and descriptions', async () => {
  await seedLiveDatabaseScan();
  const service = new WarehouseCatalogService({ fileStore: project.fileStore });

  const search = service.searchByName('warehouse', 'status', 10);

  // Only requires that a name-matched hit for the `status` column is somewhere
  // in the results; other hits and extra fields are allowed.
  const statusColumnHit = expect.objectContaining({
    kind: 'column',
    ref: expect.objectContaining({ db: 'public', name: 'orders', column: 'status' }),
    matchedOn: 'name',
  });
  await expect(search).resolves.toEqual(expect.arrayContaining([statusColumnHit]));
});
});