mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-28 08:49:38 +02:00
test: split cli tests from source tree (#216)
* feat(cli): define full warehouse dialect contract
* test(cli): keep dialect edge tests focused
* fix(cli): stabilize dialect contract foundation
* refactor(connectors): own read-only query preparation
* refactor(connectors): resolve dialects through registry
* refactor(connectors): keep concrete dialect classes internal
* chore(workspace): enforce dialect import boundary
* refactor(cli): resolve relationship dialect at scan boundary
* refactor(cli): use dialect display parsing for entity details
* refactor(cli): use dialect display parsing for warehouse catalog
* refactor(cli): use dialect SQL in relationship workflows
* test(cli): verify solid dialect scan workflow closure
* test: split cli tests from source tree
* refactor(cli): standardize BigQuery scope listing
* feat(sqlite): implement connector scope listing
* test(connectors): cover required table listing
* feat(cli): add warehouse driver registry
* refactor(setup): route scope discovery through driver registry
* refactor(cli): route local query execution through driver registry
* refactor(historic-sql): route dialect support through driver registry
* refactor(cli): test warehouse connections through driver registry
* fix(cli): close driver registry type export gaps
* Improve setup daemon diagnostics
* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback
Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.
* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match
The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.
Align the picker boundary with the canonical 3-level KtxTableRef:
- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
(resolveEnabledTables already accepts the 3-part shape) and
schemasFromEnabledTables now goes through parseDottedTableEntry so it
recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
reuse.
Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).
* fix(cli): allow debug telemetry under opt-out env
This commit is contained in:
parent
924868841d
commit
56985b7e09
548 changed files with 5048 additions and 2228 deletions
70
packages/cli/test/context/scan/constraint-discovery.test.ts
Normal file
70
packages/cli/test/context/scan/constraint-discovery.test.ts
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { constraintDiscoveryWarning, tryConstraintQuery } from '../../../src/context/scan/constraint-discovery.js';
|
||||
|
||||
// Suite for tryConstraintQuery. The cases below assert three outcomes:
//   1. a successful query yields `{ ok: true, value }`,
//   2. an error the `isDeniedError` classifier accepts yields a recoverable warning,
//   3. any other error is rethrown as the very same object.
describe('tryConstraintQuery', () => {
  it('returns the query value when the query succeeds', async () => {
    const outcome = await tryConstraintQuery(
      {
        schema: 'public',
        kind: 'primary_key',
        isDeniedError: () => false,
      },
      async () => ['id'],
    );

    expect(outcome).toEqual({ ok: true, value: ['id'] });
  });

  it('returns a recoverable warning when the classifier recognizes denial', async () => {
    const denial = Object.assign(new Error('permission denied'), { code: '42501' });

    const outcome = await tryConstraintQuery(
      {
        schema: 'analytics',
        kind: 'foreign_key',
        // Identity comparison: only this exact error instance is classified as a denial.
        isDeniedError: (candidate) => candidate === denial,
      },
      async () => {
        throw denial;
      },
    );

    expect(outcome).toEqual({
      ok: false,
      warning: {
        code: 'constraint_discovery_unauthorized',
        message: 'Skipped foreign-key discovery in analytics (insufficient grants on system catalogs)',
        recoverable: true,
        metadata: { schema: 'analytics', kind: 'foreign_key' },
      },
    });
  });

  it('rethrows non-denial errors unchanged', async () => {
    const failure = Object.assign(new Error('connection reset'), { code: 'ECONNRESET' });

    const pending = tryConstraintQuery(
      {
        schema: 'public',
        kind: 'primary_key',
        isDeniedError: () => false,
      },
      async () => {
        throw failure;
      },
    );

    // `toBe` checks reference identity, so the error must not be wrapped or cloned.
    await expect(pending).rejects.toBe(failure);
  });
});
|
||||
|
||||
// Pins the exact warning payload produced for a primary-key discovery skip.
describe('constraintDiscoveryWarning', () => {
  it('formats stable primary-key warning text and metadata', () => {
    const warning = constraintDiscoveryWarning({ schema: 'public', kind: 'primary_key' });

    expect(warning).toEqual({
      code: 'constraint_discovery_unauthorized',
      message: 'Skipped primary-key discovery in public (insufficient grants on system catalogs)',
      recoverable: true,
      metadata: { schema: 'public', kind: 'primary_key' },
    });
  });
});
|
||||
183
packages/cli/test/context/scan/credentials.test.ts
Normal file
183
packages/cli/test/context/scan/credentials.test.ts
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { REDACTED_KTX_CREDENTIAL_VALUE } from '../../../src/context/core/redaction.js';
|
||||
import {
|
||||
redactKtxCredentialEnvelope,
|
||||
redactKtxCredentialValue,
|
||||
redactKtxScanMetadata,
|
||||
redactKtxScanReport,
|
||||
redactKtxScanWarning,
|
||||
} from '../../../src/context/scan/credentials.js';
|
||||
import type { KtxCredentialEnvelope, KtxScanReport, KtxScanWarning } from '../../../src/context/scan/types.js';
|
||||
|
||||
// Suite for the scan credential redaction helpers. Each case feeds a value
// containing both secret-looking and harmless fields and asserts that only the
// secret-looking ones are replaced by REDACTED_KTX_CREDENTIAL_VALUE.
describe('KTX scan credential redaction', () => {
  it('keeps credential references inspectable', () => {
    // References (env var name / file path) are expected to come back unchanged.
    const references: KtxCredentialEnvelope[] = [
      { kind: 'env', name: 'DATABASE_URL' },
      { kind: 'file', path: '~/.config/ktx/warehouse' },
    ];

    for (const reference of references) {
      expect(redactKtxCredentialEnvelope(reference)).toEqual(reference);
    }
  });

  it('redacts resolved credential envelope values recursively', () => {
    const redacted = redactKtxCredentialEnvelope({
      kind: 'resolved',
      source: 'host',
      values: {
        username: 'readonly',
        password: 'secret-password', // pragma: allowlist secret
        nested: {
          api_key: 'phx_123', // pragma: allowlist secret
          warehouse: 'compute_wh',
        },
        headers: [{ authorizationToken: 'token-value' }, { label: 'safe' }],
      },
    });

    expect(redacted).toEqual({
      kind: 'resolved',
      source: 'host',
      redacted: true,
      values: {
        username: 'readonly',
        password: REDACTED_KTX_CREDENTIAL_VALUE,
        nested: {
          api_key: REDACTED_KTX_CREDENTIAL_VALUE,
          warehouse: 'compute_wh',
        },
        headers: [{ authorizationToken: REDACTED_KTX_CREDENTIAL_VALUE }, { label: 'safe' }],
      },
    });
  });

  it('redacts scan metadata fields that commonly contain secrets', () => {
    const redacted = redactKtxScanMetadata({
      driver: 'postgres',
      url: 'postgres://user:pass@example.test/db', // pragma: allowlist secret
      serviceAccountJson: {
        client_email: 'reader@example.test',
        private_key: 'pem-value', // pragma: allowlist secret
      },
      safeCount: 3,
    });

    expect(redacted).toEqual({
      driver: 'postgres',
      url: REDACTED_KTX_CREDENTIAL_VALUE,
      serviceAccountJson: {
        client_email: 'reader@example.test',
        private_key: REDACTED_KTX_CREDENTIAL_VALUE,
      },
      safeCount: 3,
    });
  });

  it('redacts scan warning messages and metadata without hiding safe context', () => {
    const leakyWarning: KtxScanWarning = {
      code: 'sampling_failed',
      message: 'sample failed for postgres://reader:secret@example.test/db', // pragma: allowlist secret
      recoverable: true,
      metadata: {
        table: 'orders',
        url: 'postgres://reader:secret@example.test/db', // pragma: allowlist secret
        nested: {
          api_key: 'sk_test_123', // pragma: allowlist secret
          schema: 'public',
        },
      },
    };

    const redacted = redactKtxScanWarning(leakyWarning);

    // In the message only the password portion of the URL is masked; the
    // metadata `url` field is replaced wholesale.
    expect(redacted).toEqual({
      code: 'sampling_failed',
      message: 'sample failed for postgres://reader:<redacted>@example.test/db',
      recoverable: true,
      metadata: {
        table: 'orders',
        url: REDACTED_KTX_CREDENTIAL_VALUE,
        nested: {
          api_key: REDACTED_KTX_CREDENTIAL_VALUE,
          schema: 'public',
        },
      },
    });
  });

  it('redacts scan report warning metadata recursively', () => {
    const report: KtxScanReport = {
      connectionId: 'warehouse',
      driver: 'postgres',
      syncId: 'sync-1',
      runId: 'run-1',
      trigger: 'cli',
      mode: 'structural',
      dryRun: false,
      artifactPaths: {
        rawSourcesDir: 'raw-sources/warehouse/live-database/sync-1',
        reportPath: 'raw-sources/warehouse/live-database/sync-1/scan-report.json',
        manifestShards: [],
        enrichmentArtifacts: [],
      },
      diffSummary: {
        tablesAdded: 0,
        tablesModified: 0,
        tablesDeleted: 0,
        tablesUnchanged: 0,
        columnsAdded: 0,
        columnsModified: 0,
        columnsDeleted: 0,
      },
      manifestShardsWritten: 0,
      structuralSyncStats: {
        tablesCreated: 0,
        tablesUpdated: 0,
        tablesDeleted: 0,
        columnsCreated: 0,
        columnsUpdated: 0,
        columnsDeleted: 0,
      },
      enrichment: {
        dataDictionary: 'skipped',
        tableDescriptions: 'skipped',
        columnDescriptions: 'skipped',
        embeddings: 'skipped',
        deterministicRelationships: 'skipped',
        llmRelationshipValidation: 'skipped',
        statisticalValidation: 'skipped',
      },
      capabilityGaps: [],
      warnings: [
        {
          code: 'credential_redacted',
          message: 'metadata redacted',
          recoverable: true,
          metadata: {
            credentials_json: '{"private_key":"pem-value"}', // pragma: allowlist secret
            safeCount: 2,
          },
        },
      ],
      relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
      enrichmentState: {
        resumedStages: [],
        completedStages: [],
        failedStages: [],
      },
      createdAt: '2026-04-29T00:00:00.000Z',
    };

    const redactedReport = redactKtxScanReport(report);
    const redactedMetadata = redactedReport.warnings[0]?.metadata;
    const inputMetadata = report.warnings[0]?.metadata;

    expect(redactedMetadata).toEqual({
      credentials_json: REDACTED_KTX_CREDENTIAL_VALUE,
      safeCount: 2,
    });
    // The input report must still carry the original value, i.e. redaction
    // must not mutate its argument. The expected literal is deliberately a
    // fresh object rather than a reference shared with `report`.
    expect(inputMetadata).toEqual({
      credentials_json: '{"private_key":"pem-value"}', // pragma: allowlist secret
      safeCount: 2,
    });
  });

  it('redacts standalone primitive credential values only when the field key is sensitive', () => {
    const sensitive = redactKtxCredentialValue('password', 'abc');
    const harmless = redactKtxCredentialValue('schema', 'public');

    expect(sensitive).toBe(REDACTED_KTX_CREDENTIAL_VALUE);
    expect(harmless).toBe('public');
  });
});
|
||||
114
packages/cli/test/context/scan/data-dictionary.test.ts
Normal file
114
packages/cli/test/context/scan/data-dictionary.test.ts
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
defaultKtxDataDictionarySettings,
|
||||
isKtxDataDictionaryCandidate,
|
||||
shouldKtxSampleColumnForDictionary,
|
||||
} from '../../../src/context/scan/data-dictionary.js';
|
||||
|
||||
const defaultPatterns = defaultKtxDataDictionarySettings.excludePatterns;
|
||||
|
||||
// Suite for the data-dictionary sampling policy: which (type, name) pairs are
// candidates, and when a column should actually be sampled.
describe('KTX scan data dictionary policy', () => {
  type CandidateCase = readonly [columnType: string, columnName: string];

  // Asserts that every (type, name) pair evaluates to `expected` under the default exclusion patterns.
  const expectCandidacy = (cases: readonly CandidateCase[], expected: boolean): void => {
    for (const [columnType, columnName] of cases) {
      expect(isKtxDataDictionaryCandidate(columnType, columnName, defaultPatterns)).toBe(expected);
    }
  };

  // Runs the sampling decision for a varchar column with the default settings.
  const decide = (columnName: string, sampleValues: string[] | null, cardinality: number | null) =>
    shouldKtxSampleColumnForDictionary({
      columnType: 'varchar',
      columnName,
      sampleValues,
      cardinality,
      settings: defaultKtxDataDictionarySettings,
    });

  it('includes text-like and boolean categorical types', () => {
    expectCandidacy(
      [
        ['varchar(50)', 'status'],
        ['VARCHAR', 'category'],
        ['text', 'region'],
        ['string', 'payment_method'],
        ['nvarchar(100)', 'tier'],
        ['enum', 'status'],
        ['boolean', 'active'],
        ['bool', 'verified'],
        ['character varying(50)', 'region'],
        ['character(1)', 'flag'],
        ['ntext', 'category'],
      ],
      true,
    );
  });

  it('excludes non-categorical primitive types', () => {
    expectCandidacy(
      [
        ['integer', 'count'],
        ['bigint', 'total'],
        ['timestamp', 'created'],
        ['date', 'birth'],
        ['numeric', 'amount'],
        ['decimal(10,2)', 'price'],
        ['float', 'rate'],
      ],
      false,
    );
  });

  it('excludes configured high-cardinality or sensitive name patterns', () => {
    expectCandidacy(
      [
        ['varchar', 'user_id'],
        ['varchar', 'session_uuid'],
        ['varchar', 'api_key'],
        ['varchar', 'password_hash'],
        ['varchar', 'auth_token'],
        ['varchar', 'id'],
        ['varchar', 'created_at'],
        ['varchar', 'birth_date'],
        ['text', 'description'],
        ['text', 'email_body'],
        ['varchar', 'image_url'],
        ['varchar', 'email'],
        ['varchar', 'phone_number'],
        ['varchar', 'street_address'],
      ],
      false,
    );
  });

  it('keeps business categorical names eligible', () => {
    const businessNames = [
      'status',
      'region',
      'country',
      'payment_method',
      'currency',
      'plan',
      'category',
      'tier',
      'gender',
      'language',
      'order_type',
      'order_status',
    ];

    expectCandidacy(
      businessNames.map((columnName): CandidateCase => ['varchar', columnName]),
      true,
    );
  });

  it('respects host-provided exclusion patterns and skips invalid regex patterns', () => {
    const hostPatterns = ['company'];
    // Neither entry is a valid regular expression; the assertion below expects them to be ignored.
    const brokenPatterns = ['[invalid', '(unclosed'];

    expect(isKtxDataDictionaryCandidate('varchar', 'company_size', hostPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'status', hostPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'status', brokenPatterns)).toBe(true);
  });

  it('skips columns that already have persisted dictionary state', () => {
    const alreadyPopulated = decide('status', ['paid'], null);
    expect(alreadyPopulated).toEqual({ sample: false, reason: 'already_populated' });

    const emptyColumn = decide('empty_status', null, 0);
    expect(emptyColumn).toEqual({ sample: false, reason: 'empty_column' });

    const highCardinality = decide('customer_name', null, 300);
    expect(highCardinality).toEqual({ sample: false, reason: 'high_cardinality' });

    const untouched = decide('status', null, null);
    expect(untouched).toEqual({ sample: true });
  });
});
|
||||
784
packages/cli/test/context/scan/description-generation.test.ts
Normal file
784
packages/cli/test/context/scan/description-generation.test.ts
Normal file
|
|
@ -0,0 +1,784 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
|
||||
// Partially mock the `ai` package: keep every real export but replace
// `generateText` with a vitest mock so tests can script its result.
vi.mock('ai', async (importOriginal) => ({
  ...(await importOriginal<typeof import('ai')>()),
  generateText: vi.fn(),
}));
|
||||
|
||||
import { generateText } from 'ai';
|
||||
import {
|
||||
buildKtxColumnDescriptionPrompt,
|
||||
buildKtxDataSourceDescriptionPrompt,
|
||||
buildKtxTableDescriptionPrompt,
|
||||
type KtxDescriptionCachePort,
|
||||
KtxDescriptionGenerator,
|
||||
} from '../../../src/context/scan/description-generation.js';
|
||||
import { createKtxConnectorCapabilities, type KtxScanConnector } from '../../../src/context/scan/types.js';
|
||||
|
||||
/**
 * Builds an in-memory description cache for tests.
 *
 * Keys are the non-empty identifier parts joined with '.', and connection keys
 * are prefixed with `__connection:`. `get` and `set` are vitest mocks, so tests
 * can assert on how the cache was used.
 *
 * @param initial entries the cache starts with (key -> description)
 * @returns a KtxDescriptionCachePort backed by a private Map
 */
function createCache(initial: Record<string, string> = {}): KtxDescriptionCachePort {
  const store = new Map<string, string>();
  for (const [key, description] of Object.entries(initial)) {
    store.set(key, description);
  }

  // Drops falsy parts (e.g. a null catalog) before joining.
  const dotted = (parts: Array<string | null | undefined>): string => parts.filter(Boolean).join('.');

  return {
    buildTableKey: (table) => dotted([table.catalog, table.db, table.name]),
    buildColumnKey: (table, columnName) => dotted([table.catalog, table.db, table.name, columnName]),
    buildConnectionKey: (connectionName) => `__connection:${connectionName}`,
    get: vi.fn(async (key: string) => {
      const hit = store.get(key);
      return hit === undefined ? null : hit;
    }),
    set: vi.fn(async (key: string, value: string) => {
      store.set(key, value);
    }),
  };
}
|
||||
|
||||
/**
 * Creates a fake LLM runtime whose `generateText` forwards to the mocked
 * `generateText` from the `ai` package and resolves with `text`.
 *
 * Calling this also re-programs the shared `generateText` mock to resolve with
 * `{ text }`.
 *
 * @param text the description every generateText call resolves to
 * @returns an untyped (`any`) runtime with mock `generateText`,
 *          `generateObject` and `runAgentLoop` members
 */
function createLlmProvider(text = 'generated description') {
  vi.mocked(generateText).mockResolvedValue({ text } as never);

  const runtime: any = {
    generateText: vi.fn(async ({ system, prompt, temperature }) => {
      // The system prompt travels in the dedicated `system` field, never inside `messages`.
      const systemMessage = system ? { role: 'system', content: system } : undefined;
      const { text: generated } = await generateText({
        system: systemMessage,
        messages: [{ role: 'user', content: prompt }],
        temperature,
      } as never);
      return generated;
    }),
    generateObject: vi.fn(),
    runAgentLoop: vi.fn(),
  };
  return runtime;
}
|
||||
|
||||
/**
 * Creates a fake LLM runtime whose `generateText` forwards to the mocked
 * `generateText` from the `ai` package, which is programmed to reject.
 *
 * Calling this also re-programs the shared `generateText` mock to reject with
 * `new Error(message)`.
 *
 * @param message the message of the error every generateText call rejects with
 * @returns an untyped (`any`) runtime with mock `generateText`,
 *          `generateObject` and `runAgentLoop` members
 */
function createFailingLlmProvider(message = 'timeout exceeded when trying to connect') {
  vi.mocked(generateText).mockRejectedValue(new Error(message) as never);

  const runtime: any = {
    generateText: vi.fn(async ({ system, prompt, temperature }) => {
      const systemMessage = system ? { role: 'system', content: system } : undefined;
      // With the rejection programmed above this await throws, so the lines after it are not reached.
      const { text: generated } = await generateText({
        system: systemMessage,
        messages: [{ role: 'user', content: prompt }],
        temperature,
      } as never);
      return generated;
    }),
    generateObject: vi.fn(),
    runAgentLoop: vi.fn(),
  };
  return runtime;
}
|
||||
|
||||
/**
 * Builds a stub scan connector for description-generation tests.
 *
 * Sampling methods return fixed fixtures, schema/table listing returns empty
 * arrays, and `introspect` rejects. All methods are vitest mocks so call
 * counts and arguments can be asserted.
 *
 * @returns a KtxScanConnector with id 'test-connector' and driver 'postgres'
 */
function createConnector(): KtxScanConnector {
  const capabilities = createKtxConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    nestedAnalysis: true,
  });

  return {
    id: 'test-connector',
    driver: 'postgres',
    capabilities,
    introspect: vi.fn(async () => {
      throw new Error('introspection is not used by description generation');
    }),
    listSchemas: vi.fn(async () => {
      return [];
    }),
    listTables: vi.fn(async () => {
      return [];
    }),
    sampleColumn: vi.fn(async () => {
      return {
        values: ['paid', 'refunded', null],
        nullCount: 1,
        distinctCount: 2,
      };
    }),
    sampleTable: vi.fn(async () => {
      return {
        headers: ['id', 'status', 'amount'],
        rows: [
          [1, 'paid', 20],
          [2, 'refunded', 10],
        ],
        totalRows: 2,
      };
    }),
  };
}
|
||||
|
||||
// Suite for the pure prompt builders: checks which fragments end up in the
// system prompt versus the user prompt.
describe('KTX description prompt builders', () => {
  it('builds column prompts with sample values, source descriptions, and nested BigQuery guidance', () => {
    const prompt = buildKtxColumnDescriptionPrompt({
      columnName: 'payload',
      columnValues: [{ nested: true }, '[1,2]'],
      tableContext: 'Table: events | Columns: payload | Data source: BIGQUERY',
      dataSourceType: 'BIGQUERY',
      supportsNestedAnalysis: true,
      rawDescriptions: { db: 'Raw event payload', ai: 'Old AI text', user: 'User text' },
      maxWords: 12,
    });

    const expectedInUser = [
      '<table_context> Table: events | Columns: payload | Data source: BIGQUERY </table_context>',
      '<column_name> payload </column_name>',
      '<sample_values> [object Object], [1,2] </sample_values>',
      '<db_documentation> Raw event payload </db_documentation>',
    ];
    for (const fragment of expectedInUser) {
      expect(prompt.user).toContain(fragment);
    }

    // Of the three raw descriptions supplied, only the `db` one is expected in the user prompt.
    for (const excluded of ['Old AI text', 'User text']) {
      expect(prompt.user).not.toContain(excluded);
    }

    for (const fragment of ['nested/structured data', '12 words or less']) {
      expect(prompt.system).toContain(fragment);
    }
    // The word limit belongs to the system prompt only.
    expect(prompt.user).not.toContain('12 words or less');
  });

  it('builds table and data-source prompts from sampled rows', () => {
    const sample = {
      headers: ['id', 'status'],
      rows: [
        [1, 'paid'],
        [2, 'refunded'],
      ],
      totalRows: 2,
    };

    const tablePrompt = buildKtxTableDescriptionPrompt({
      tableName: 'orders',
      sampleData: sample,
      dataSourceType: 'POSTGRESQL',
      rawDescriptions: { dbt: 'Fact table for commerce orders' },
    });
    const dataSourcePrompt = buildKtxDataSourceDescriptionPrompt({
      tableSamples: [['orders', sample]],
      dataSourceType: 'POSTGRESQL',
    });

    expect(tablePrompt.user).toContain('status: paid, refunded');
    expect(tablePrompt.system).toContain('Analyze database tables');

    expect(dataSourcePrompt.user).toContain('orders (2 columns, 2 sample rows)');
    expect(dataSourcePrompt.system).toContain('Analyze databases');
  });
});
|
||||
|
||||
describe('KtxDescriptionGenerator', () => {
|
||||
it('generates column descriptions with pre-fetched values, cache hits, and word-limit metadata', async () => {
|
||||
const cache = createCache({ 'warehouse.public.orders.cached_status': 'Cached status description' });
|
||||
const llmRuntime = createLlmProvider('Payment state');
|
||||
const connector = createConnector();
|
||||
const generator = new KtxDescriptionGenerator({
|
||||
llmRuntime,
|
||||
cache,
|
||||
settings: {
|
||||
columnMaxWords: 12,
|
||||
tableMaxWords: 18,
|
||||
dataSourceMaxWords: 24,
|
||||
temperature: 0.2,
|
||||
concurrencyLimit: 2,
|
||||
},
|
||||
});
|
||||
|
||||
const result = await generator.generateColumnDescriptions({
|
||||
connectionId: 'conn-1',
|
||||
connector,
|
||||
context: { runId: 'run-1' },
|
||||
dataSourceType: 'POSTGRESQL',
|
||||
supportsNestedAnalysis: false,
|
||||
table: {
|
||||
catalog: 'warehouse',
|
||||
db: 'public',
|
||||
name: 'orders',
|
||||
columns: [
|
||||
{ name: 'status', sampleValues: ['paid', 'refunded'], rawDescriptions: { db: 'Payment lifecycle' } },
|
||||
{ name: 'cached_status', sampleValues: ['open'] },
|
||||
],
|
||||
},
|
||||
skipExisting: false,
|
||||
existingDescriptions: {},
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
columnDescriptions: [
|
||||
['status', 'Payment state'],
|
||||
['cached_status', 'Cached status description'],
|
||||
],
|
||||
processedColumns: ['status'],
|
||||
skippedColumns: ['cached_status'],
|
||||
});
|
||||
expect(connector.sampleColumn).not.toHaveBeenCalled();
|
||||
expect(generateText).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
temperature: 0.2,
|
||||
system: expect.objectContaining({
|
||||
role: 'system',
|
||||
content: expect.stringContaining('Please provide a concise description in 12 words or less.'),
|
||||
}),
|
||||
messages: expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
role: 'user',
|
||||
content: expect.stringContaining('<column_name> status </column_name>'),
|
||||
}),
|
||||
]),
|
||||
}),
|
||||
);
|
||||
const lastCall = vi.mocked(generateText).mock.calls.at(-1)?.[0];
|
||||
expect(lastCall?.messages?.some((message) => message.role === 'system')).toBe(false);
|
||||
});
|
||||
|
||||
it('samples through the connector when column values are not pre-fetched', async () => {
|
||||
const connector = createConnector();
|
||||
const generator = new KtxDescriptionGenerator({
|
||||
llmRuntime: createLlmProvider('Current order state'),
|
||||
settings: {
|
||||
columnMaxWords: 12,
|
||||
tableMaxWords: 18,
|
||||
dataSourceMaxWords: 24,
|
||||
},
|
||||
});
|
||||
|
||||
const result = await generator.generateColumnDescriptions({
|
||||
connectionId: 'conn-1',
|
||||
connector,
|
||||
context: { runId: 'run-1' },
|
||||
dataSourceType: 'POSTGRESQL',
|
||||
supportsNestedAnalysis: false,
|
||||
table: {
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'orders',
|
||||
columns: [{ name: 'status' }],
|
||||
},
|
||||
});
|
||||
|
||||
expect(connector.sampleColumn).toHaveBeenCalledWith(
|
||||
{
|
||||
connectionId: 'conn-1',
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
column: 'status',
|
||||
limit: 50,
|
||||
},
|
||||
{ runId: 'run-1' },
|
||||
);
|
||||
expect(result.columnDescriptions).toEqual([['status', 'Current order state']]);
|
||||
});
|
||||
|
||||
it('samples through a description sampling port without requiring structural introspection', async () => {
|
||||
const sampler = {
|
||||
id: 'description-sampler:conn-1',
|
||||
sampleColumn: vi.fn(async () => ({
|
||||
values: ['paid', 'refunded'],
|
||||
nullCount: null,
|
||||
distinctCount: null,
|
||||
})),
|
||||
sampleTable: vi.fn(async () => ({
|
||||
headers: ['id', 'status'],
|
||||
rows: [[1, 'paid']],
|
||||
totalRows: 1,
|
||||
})),
|
||||
};
|
||||
const generator = new KtxDescriptionGenerator({
|
||||
llmRuntime: createLlmProvider('Generated through sampler'),
|
||||
settings: {
|
||||
columnMaxWords: 12,
|
||||
tableMaxWords: 18,
|
||||
dataSourceMaxWords: 24,
|
||||
},
|
||||
});
|
||||
|
||||
const result = await generator.generateColumnDescriptions({
|
||||
connectionId: 'conn-1',
|
||||
connector: sampler,
|
||||
context: { runId: 'run-1' },
|
||||
dataSourceType: 'POSTGRESQL',
|
||||
supportsNestedAnalysis: false,
|
||||
table: {
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'orders',
|
||||
columns: [{ name: 'status' }],
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.columnDescriptions).toEqual([['status', 'Generated through sampler']]);
|
||||
expect(sampler.sampleColumn).toHaveBeenCalledWith(
|
||||
{
|
||||
connectionId: 'conn-1',
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
column: 'status',
|
||||
limit: 50,
|
||||
},
|
||||
{ runId: 'run-1' },
|
||||
);
|
||||
expect('introspect' in sampler).toBe(false);
|
||||
});
|
||||
|
||||
it('does not turn LLM failures into generated descriptions', async () => {
|
||||
const cache = createCache();
|
||||
const connector = createConnector();
|
||||
const generator = new KtxDescriptionGenerator({
|
||||
llmRuntime: createFailingLlmProvider(),
|
||||
cache,
|
||||
settings: {
|
||||
columnMaxWords: 12,
|
||||
tableMaxWords: 18,
|
||||
dataSourceMaxWords: 24,
|
||||
},
|
||||
});
|
||||
|
||||
const columnResult = await generator.generateColumnDescriptions({
|
||||
connectionId: 'conn-1',
|
||||
connector,
|
||||
context: { runId: 'run-1' },
|
||||
dataSourceType: 'POSTGRESQL',
|
||||
supportsNestedAnalysis: false,
|
||||
table: {
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'orders',
|
||||
columns: [{ name: 'status' }],
|
||||
},
|
||||
});
|
||||
|
||||
await expect(
|
||||
generator.generateTableDescription({
|
||||
connectionId: 'conn-1',
|
||||
connector,
|
||||
context: { runId: 'run-1' },
|
||||
dataSourceType: 'POSTGRESQL',
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
}),
|
||||
).resolves.toBeNull();
|
||||
|
||||
expect(columnResult).toEqual({
|
||||
columnDescriptions: [['status', null]],
|
||||
processedColumns: [],
|
||||
skippedColumns: [],
|
||||
});
|
||||
expect(cache.set).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('generates and caches table and data-source descriptions', async () => {
|
||||
const cache = createCache();
|
||||
const connector = createConnector();
|
||||
const generator = new KtxDescriptionGenerator({
|
||||
llmRuntime: createLlmProvider('Commerce orders'),
|
||||
cache,
|
||||
settings: {
|
||||
columnMaxWords: 12,
|
||||
tableMaxWords: 18,
|
||||
dataSourceMaxWords: 24,
|
||||
concurrencyLimit: 2,
|
||||
},
|
||||
});
|
||||
|
||||
await expect(
|
||||
generator.generateTableDescription({
|
||||
connectionId: 'conn-1',
|
||||
connector,
|
||||
context: { runId: 'run-1' },
|
||||
dataSourceType: 'POSTGRESQL',
|
||||
table: { catalog: 'warehouse', db: 'public', name: 'orders', rawDescriptions: { db: 'Raw orders' } },
|
||||
}),
|
||||
).resolves.toBe('Commerce orders');
|
||||
|
||||
await expect(
|
||||
generator.generateDataSourceDescription({
|
||||
connectionId: 'conn-1',
|
||||
connector,
|
||||
context: { runId: 'run-1' },
|
||||
dataSourceType: 'POSTGRESQL',
|
||||
tables: [
|
||||
{ catalog: 'warehouse', db: 'public', name: 'orders' },
|
||||
{ catalog: 'warehouse', db: 'public', name: 'customers' },
|
||||
],
|
||||
connectionName: 'Warehouse',
|
||||
}),
|
||||
).resolves.toBe('Commerce orders');
|
||||
|
||||
expect(cache.set).toHaveBeenCalledWith('warehouse.public.orders', 'Commerce orders');
|
||||
expect(cache.set).toHaveBeenCalledWith('__connection:Warehouse', 'Commerce orders');
|
||||
});
|
||||
|
||||
it('generates one structured table description and reuses table samples for all columns', async () => {
  // The structured (generateObject) answer covers the table and both columns.
  const structuredAnswer = {
    tableDescription: 'Commerce orders',
    columns: [
      { name: 'status', description: 'Current order state' },
      { name: 'amount', description: 'Order amount in dollars' },
    ],
  };
  const llmRuntime = createLlmProvider('unused');
  llmRuntime.generateObject = vi.fn(async () => structuredAnswer);

  const scanConnector = createConnector();
  const generator = new KtxDescriptionGenerator({
    llmRuntime,
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const batched = await generator.generateBatchedTableDescriptions({
    connectionId: 'conn-1',
    connector: scanConnector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      rawDescriptions: { db: 'Orders fact table' },
      columns: [
        { name: 'status', type: 'text' },
        { name: 'amount', type: 'numeric' },
      ],
    },
  });

  // Descriptions come straight from the structured answer.
  const describedColumns = Object.fromEntries(batched.columnDescriptions);
  expect(batched.tableDescription).toBe('Commerce orders');
  expect(describedColumns).toEqual({
    status: 'Current order state',
    amount: 'Order amount in dollars',
  });

  // One table sample and one structured LLM call; no per-column work at all.
  expect(scanConnector.sampleTable).toHaveBeenCalledTimes(1);
  expect(scanConnector.sampleColumn).not.toHaveBeenCalled();
  expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1);
  expect(llmRuntime.generateText).not.toHaveBeenCalled();
});
|
||||
|
||||
it('falls back to one column generateText call for each missing structured column', async () => {
  // The structured answer describes only `amount`; `status` is left out.
  const llmRuntime = createLlmProvider('Fallback status');
  llmRuntime.generateObject = vi.fn(async () => ({
    tableDescription: 'Commerce orders',
    columns: [{ name: 'amount', description: 'Order amount in dollars' }],
  }));

  const scanConnector = createConnector();
  const generator = new KtxDescriptionGenerator({
    llmRuntime,
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const batched = await generator.generateBatchedTableDescriptions({
    connectionId: 'conn-1',
    connector: scanConnector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      columns: [
        { name: 'status', type: 'text' },
        { name: 'amount', type: 'numeric' },
      ],
    },
  });

  // `status` takes the generateText fallback text; `amount` keeps the structured one.
  const describedColumns = Object.fromEntries(batched.columnDescriptions);
  expect(describedColumns).toEqual({
    status: 'Fallback status',
    amount: 'Order amount in dollars',
  });

  // Exactly one structured call plus one text call, and no column sampling.
  expect(scanConnector.sampleColumn).not.toHaveBeenCalled();
  expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1);
  expect(llmRuntime.generateText).toHaveBeenCalledTimes(1);
});
|
||||
|
||||
it('does not run per-column fallback when structured object generation throws', async () => {
  // Structured generation always rejects; the text path must stay untouched.
  const llmRuntime = createLlmProvider('Fallback description');
  llmRuntime.generateObject = vi.fn(() => Promise.reject(new Error('object output unavailable')));

  const warningCodes: string[] = [];
  const generator = new KtxDescriptionGenerator({
    llmRuntime,
    onWarning: (warning) => warningCodes.push(warning.code),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const batched = await generator.generateBatchedTableDescriptions({
    connectionId: 'conn-1',
    connector: createConnector(),
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      columns: [{ name: 'status', type: 'text' }],
    },
  });

  // Nothing is described, and the failure is reported as a warning.
  const describedColumns = Object.fromEntries(batched.columnDescriptions);
  expect(batched.tableDescription).toBeNull();
  expect(describedColumns).toEqual({ status: null });
  expect(warningCodes).toContain('enrichment_failed');

  // One structured attempt, and no generateText fallback.
  expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1);
  expect(llmRuntime.generateText).not.toHaveBeenCalled();
});
|
||||
});
|
||||
|
||||
describe('KtxDescriptionGenerator resilience', () => {
|
||||
/**
 * Builds a logger test double with one independent `vi.fn()` spy per level
 * (`debug`, `info`, `warn`, `error`) so tests can assert on each separately.
 */
function createLogger() {
  const debug = vi.fn();
  const info = vi.fn();
  const warn = vi.fn();
  const error = vi.fn();
  return { debug, info, warn, error };
}
|
||||
|
||||
it('retries sampleTable on transient failure and uses sampled rows when it eventually succeeds', async () => {
  // The first two attempts reject; every later attempt resolves with two rows.
  const sampledRows = {
    headers: ['id', 'status'],
    rows: [
      [1, 'paid'],
      [2, 'refunded'],
    ],
    totalRows: 2,
  };
  const sampleTable = vi.fn<NonNullable<KtxScanConnector['sampleTable']>>();
  sampleTable.mockRejectedValueOnce(new Error('pool: transient ECONNRESET'));
  sampleTable.mockRejectedValueOnce(new Error('pool: transient ECONNRESET'));
  sampleTable.mockResolvedValue(sampledRows);

  const connector: KtxScanConnector = { ...createConnector(), sampleTable };
  const logger = createLogger();
  const warnings: Array<{ code: string; table?: string }> = [];
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('Commerce orders'),
    logger,
    // Record the warning code, plus the table only when one is present.
    onWarning: (warning) => {
      const tablePart = warning.table ? { table: warning.table } : {};
      return warnings.push({ code: warning.code, ...tablePart });
    },
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24, concurrencyLimit: 2 },
  });

  const description = await generator.generateTableDescription({
    connectionId: 'conn-1',
    connector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    table: { catalog: null, db: 'public', name: 'orders' },
  });

  // Three attempts in total, two logged warnings, and no warnings surfaced to the caller.
  expect(description).toBe('Commerce orders');
  expect(sampleTable).toHaveBeenCalledTimes(3);
  expect(logger.warn).toHaveBeenCalledTimes(2);
  expect(warnings).toEqual([]);
});
|
||||
|
||||
it('falls back to metadata-only prompt when sampleTable retries exhaust', async () => {
  // Sampling rejects on every attempt.
  const sampleTable = vi.fn<NonNullable<KtxScanConnector['sampleTable']>>();
  sampleTable.mockRejectedValue(new Error('pool: connection refused'));

  const connector: KtxScanConnector = { ...createConnector(), sampleTable };
  const logger = createLogger();
  const warnings: Array<{ code: string; table?: string; metadata?: Record<string, unknown> }> = [];
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('Customer reference data'),
    logger,
    // Keep the code, and copy table/metadata only when the warning carries them.
    onWarning: (warning) => {
      const tablePart = warning.table ? { table: warning.table } : {};
      const metadataPart = warning.metadata ? { metadata: warning.metadata } : {};
      return warnings.push({ code: warning.code, ...tablePart, ...metadataPart });
    },
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24, concurrencyLimit: 2 },
  });

  const description = await generator.generateTableDescription({
    connectionId: 'conn-1',
    connector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    table: {
      catalog: null,
      db: 'public',
      name: 'customers',
      columns: [
        { name: 'id', nativeType: 'uuid' },
        { name: 'email', nativeType: 'text', comment: 'Primary contact email' },
      ],
    },
  });

  expect(description).toBe('Customer reference data');
  expect(sampleTable).toHaveBeenCalledTimes(3);

  // Sampling failure is reported first, then the fallback, tagged with its reason.
  const warningCodes = warnings.map((warning) => warning.code);
  expect(warningCodes).toEqual(['sampling_failed', 'description_fallback_used']);
  expect(warnings[1]?.metadata?.reason).toBe('sampling_failed');

  // The last LLM request must carry the metadata-only column listing.
  const lastRequest = vi.mocked(generateText).mock.calls.at(-1)?.[0] as {
    messages: Array<{ role: string; content: string }>;
  };
  const userPrompt = lastRequest.messages.find((message) => message.role === 'user')?.content;
  expect(userPrompt).toContain('Columns (metadata only, no sample rows)');
  expect(userPrompt).toContain('email (text)');
  expect(userPrompt).toContain('Primary contact email');
});
|
||||
|
||||
it('emits enrichment_failed and returns null when both sampling and metadata-only LLM fail', async () => {
  // Sampling always rejects and the generator is wired to the failing LLM provider.
  const sampleTable = vi.fn<NonNullable<KtxScanConnector['sampleTable']>>();
  sampleTable.mockRejectedValue(new Error('pool: connection refused'));

  const connector: KtxScanConnector = { ...createConnector(), sampleTable };
  const warningCodes: string[] = [];
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createFailingLlmProvider(),
    onWarning: (warning) => warningCodes.push(warning.code),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const description = await generator.generateTableDescription({
    connectionId: 'conn-1',
    connector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    table: {
      catalog: null,
      db: 'public',
      name: 'orphan',
      columns: [{ name: 'id' }],
    },
  });

  // No description is produced; both failures are reported, in order.
  expect(description).toBeNull();
  expect(warningCodes).toEqual(['sampling_failed', 'enrichment_failed']);
});
|
||||
|
||||
it('uses metadata-only fallback when connector has no sampleTable', async () => {
  // Same connector as usual, but with the sampleTable capability removed.
  const samplerWithoutTable: KtxScanConnector = { ...createConnector(), sampleTable: undefined };

  const warningCodes: string[] = [];
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('Orders mart'),
    onWarning: (warning) => warningCodes.push(warning.code),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const description = await generator.generateTableDescription({
    connectionId: 'conn-1',
    connector: samplerWithoutTable,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    table: {
      catalog: null,
      db: 'public',
      name: 'mart_orders',
      columns: [{ name: 'order_id', nativeType: 'uuid' }],
    },
  });

  // A description is still produced; the missing capability and the fallback are both reported.
  expect(description).toBe('Orders mart');
  expect(warningCodes).toEqual(['connector_capability_missing', 'description_fallback_used']);
});
|
||||
|
||||
it('aborts retry loop when the scan context signal fires', async () => {
  // The sampler aborts the scan signal itself and then fails its attempt.
  const abortController = new AbortController();
  const sampleTable = vi.fn<NonNullable<KtxScanConnector['sampleTable']>>();
  sampleTable.mockImplementation(async () => {
    abortController.abort();
    throw new Error('first attempt blew up');
  });

  const connector: KtxScanConnector = { ...createConnector(), sampleTable };
  const warningCodes: string[] = [];
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('should not be called'),
    onWarning: (warning) => warningCodes.push(warning.code),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const pendingDescription = generator.generateTableDescription({
    connectionId: 'conn-1',
    connector,
    context: { runId: 'run-1', signal: abortController.signal },
    dataSourceType: 'POSTGRESQL',
    table: { catalog: null, db: 'public', name: 'orders' },
  });
  await expect(pendingDescription).rejects.toThrow('aborted');

  // Only the first attempt ran, and no warnings were emitted.
  expect(sampleTable).toHaveBeenCalledTimes(1);
  expect(warningCodes).toEqual([]);
});
|
||||
|
||||
it('generates column descriptions from rawDescriptions when sampleColumn is unavailable', async () => {
  // Connector without the sampleColumn capability.
  const samplerWithoutColumn: KtxScanConnector = { ...createConnector(), sampleColumn: undefined };
  const logger = createLogger();
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('Payment lifecycle state'),
    logger,
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const outcome = await generator.generateColumnDescriptions({
    connectionId: 'conn-1',
    connector: samplerWithoutColumn,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      columns: [{ name: 'status', rawDescriptions: { db: 'order lifecycle state' } }],
    },
  });

  expect(outcome.columnDescriptions).toEqual([['status', 'Payment lifecycle state']]);
  expect(logger.warn).toHaveBeenCalled();

  // The last LLM request marks samples as unavailable but still includes the raw documentation.
  const lastRequest = vi.mocked(generateText).mock.calls.at(-1)?.[0] as {
    messages: Array<{ role: string; content: string }>;
  };
  const userPrompt = lastRequest.messages.find((message) => message.role === 'user')?.content;
  expect(userPrompt).toContain('<sample_values> unavailable </sample_values>');
  expect(userPrompt).toContain('<db_documentation> order lifecycle state </db_documentation>');
});
|
||||
|
||||
it('generates column descriptions from rawDescriptions when sampleColumn retries exhaust', async () => {
  // Column sampling rejects on every attempt.
  const sampleColumn = vi.fn<NonNullable<KtxScanConnector['sampleColumn']>>();
  sampleColumn.mockRejectedValue(new Error('pool: connection refused'));

  const flakyConnector: KtxScanConnector = { ...createConnector(), sampleColumn };
  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('Customer reference identifier'),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const outcome = await generator.generateColumnDescriptions({
    connectionId: 'conn-1',
    connector: flakyConnector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      columns: [{ name: 'customer_id', rawDescriptions: { db: 'FK to customers.id' } }],
    },
  });

  // Three sampling attempts, then a description is still generated.
  expect(sampleColumn).toHaveBeenCalledTimes(3);
  expect(outcome.columnDescriptions).toEqual([['customer_id', 'Customer reference identifier']]);
});
|
||||
|
||||
it('skips column LLM call only when neither samples nor rawDescriptions are available', async () => {
  // Sampling succeeds but returns only nulls, and the column has no rawDescriptions.
  const sampleColumn = vi.fn<NonNullable<KtxScanConnector['sampleColumn']>>();
  sampleColumn.mockResolvedValue({ values: [null, null], nullCount: 2, distinctCount: 0 });
  const connector: KtxScanConnector = { ...createConnector(), sampleColumn };

  // Reset recorded calls so the final "not called" assertion covers this test only.
  vi.mocked(generateText).mockClear();

  const generator = new KtxDescriptionGenerator({
    llmRuntime: createLlmProvider('should not be called'),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const outcome = await generator.generateColumnDescriptions({
    connectionId: 'conn-1',
    connector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      columns: [{ name: 'opaque_blob' }],
    },
  });

  expect(outcome.columnDescriptions).toEqual([['opaque_blob', null]]);
  expect(generateText).not.toHaveBeenCalled();
});
|
||||
});
|
||||
47
packages/cli/test/context/scan/embedding-text.test.ts
Normal file
47
packages/cli/test/context/scan/embedding-text.test.ts
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { buildKtxColumnEmbeddingText } from '../../../src/context/scan/embedding-text.js';
|
||||
|
||||
describe('KTX scan embedding text', () => {
  it('builds column embedding text with table, description, FK, and sample-value context', () => {
    // Three sample values are supplied but maxSampleValues is 2; the expected
    // text lists only the first two.
    const text = buildKtxColumnEmbeddingText({
      tableName: 'orders',
      columnName: 'status',
      columnType: 'varchar',
      resolvedDescription: 'Payment lifecycle state',
      sampleValues: ['paid', 'refunded', 'pending'],
      resolvedTableDescription: 'Customer orders',
      foreignKeys: {
        outgoing: [{ toTable: 'customers', toColumn: 'id' }],
        incoming: [{ fromTable: 'refunds', fromColumn: 'order_status' }],
      },
      maxSampleValues: 2,
    });

    expect(text).toBe(
      'orders.status (varchar). Table: Customer orders. Payment lifecycle state. FK -> customers.id. FK <- refunds.order_status. Values: paid, refunded',
    );
  });

  it('omits optional sections when the scan has no enrichment context yet', () => {
    // Only the required fields are given, with a null description.
    const text = buildKtxColumnEmbeddingText({
      tableName: 'orders',
      columnName: 'id',
      columnType: 'integer',
      resolvedDescription: null,
    });

    expect(text).toBe('orders.id (integer)');
  });

  it('keeps all available sample values when no explicit max is supplied', () => {
    // No maxSampleValues here: both sample values are expected in the text.
    const text = buildKtxColumnEmbeddingText({
      tableName: 'orders',
      columnName: 'status',
      columnType: 'varchar',
      resolvedDescription: null,
      sampleValues: ['paid', 'refunded'],
    });

    expect(text).toBe('orders.status (varchar). Values: paid, refunded');
  });
});
|
||||
175
packages/cli/test/context/scan/enrichment-state.test.ts
Normal file
175
packages/cli/test/context/scan/enrichment-state.test.ts
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import {
|
||||
completedKtxScanEnrichmentStateSummary,
|
||||
computeKtxScanEnrichmentInputHash,
|
||||
summarizeKtxScanEnrichmentState,
|
||||
} from '../../../src/context/scan/enrichment-state.js';
|
||||
import { SqliteLocalScanEnrichmentStateStore } from '../../../src/context/scan/sqlite-local-enrichment-state-store.js';
|
||||
import type { KtxSchemaSnapshot } from '../../../src/context/scan/types.js';
|
||||
|
||||
// The only table in the fixture: `public.orders` with a single integer primary-key column.
const ordersTable: KtxSchemaSnapshot['tables'][number] = {
  catalog: null,
  db: 'public',
  name: 'orders',
  kind: 'table',
  comment: null,
  estimatedRows: 1,
  foreignKeys: [],
  columns: [
    {
      name: 'id',
      nativeType: 'integer',
      normalizedType: 'integer',
      dimensionType: 'number',
      nullable: false,
      primaryKey: true,
      comment: null,
    },
  ],
};

// Minimal schema snapshot shared by the tests below as hashing input.
const snapshot: KtxSchemaSnapshot = {
  connectionId: 'warehouse',
  driver: 'postgres',
  extractedAt: '2026-04-29T12:00:00.000Z',
  scope: { schemas: ['public'] },
  metadata: {},
  tables: [ordersTable],
};
|
||||
|
||||
describe('scan enrichment state', () => {
  let tempDir: string;
  let store: SqliteLocalScanEnrichmentStateStore;

  // Every test gets its own temp directory and a store pointed at a SQLite file inside it.
  beforeEach(async () => {
    const prefix = join(tmpdir(), 'ktx-scan-enrichment-state-');
    tempDir = await mkdtemp(prefix);
    const dbPath = join(tempDir, 'db.sqlite');
    store = new SqliteLocalScanEnrichmentStateStore({ dbPath });
  });

  // Remove the temp directory and everything in it.
  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('computes stable input hashes without depending on object key order', () => {
    const baseline = computeKtxScanEnrichmentInputHash({
      snapshot,
      mode: 'enriched',
      detectRelationships: true,
      providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
    });

    // Same content: a shallow copy of the snapshot with a fresh `metadata`
    // object and the providerIdentity keys written in the opposite order.
    const reordered = computeKtxScanEnrichmentInputHash({
      snapshot: { ...snapshot, metadata: {} },
      mode: 'enriched',
      detectRelationships: true,
      providerIdentity: { llmModel: 'a', provider: 'local-heuristic' },
    });

    const [firstTable] = snapshot.tables;
    if (!firstTable) {
      throw new Error('Expected test snapshot table');
    }

    // Different content: the table is renamed.
    const renamed = computeKtxScanEnrichmentInputHash({
      snapshot: { ...snapshot, tables: [{ ...firstTable, name: 'orders_v2' }] },
      mode: 'enriched',
      detectRelationships: true,
      providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
    });

    expect(baseline).toMatch(/^[a-f0-9]{64}$/);
    expect(reordered).toBe(baseline);
    expect(renamed).not.toBe(baseline);
  });

  it('persists completed stages and ignores stale hashes', async () => {
    // Fresh copy per use so the saved input and the expected value never share a reference.
    const ordersOutput = () => [
      { table: { catalog: null, db: 'public', name: 'orders' }, tableDescription: 'Orders' },
    ];
    const inputHash = computeKtxScanEnrichmentInputHash({
      snapshot,
      mode: 'enriched',
      detectRelationships: true,
      providerIdentity: { provider: 'local-heuristic' },
    });

    await store.saveCompletedStage({
      runId: 'scan-run-1',
      connectionId: 'warehouse',
      syncId: 'sync-1',
      mode: 'enriched',
      stage: 'descriptions',
      inputHash,
      output: ordersOutput(),
      updatedAt: '2026-04-29T12:01:00.000Z',
    });

    // Lookup with the matching hash returns the saved stage.
    const matching = await store.findCompletedStage({
      runId: 'scan-run-1',
      stage: 'descriptions',
      inputHash,
    });
    expect(matching).toMatchObject({
      runId: 'scan-run-1',
      stage: 'descriptions',
      status: 'completed',
      output: ordersOutput(),
    });

    // Lookup with any other hash finds nothing.
    const stale = await store.findCompletedStage({
      runId: 'scan-run-1',
      stage: 'descriptions',
      inputHash: 'different-hash',
    });
    expect(stale).toBeNull();
  });

  it('records failed stages without making them reusable', async () => {
    await store.saveFailedStage({
      runId: 'scan-run-2',
      connectionId: 'warehouse',
      syncId: 'sync-2',
      mode: 'enriched',
      stage: 'embeddings',
      inputHash: 'hash-2',
      errorMessage: 'embedding service timed out',
      updatedAt: '2026-04-29T12:02:00.000Z',
    });

    // A failed stage is never returned as a completed one, even with the same hash.
    const reusable = await store.findCompletedStage({
      runId: 'scan-run-2',
      stage: 'embeddings',
      inputHash: 'hash-2',
    });
    expect(reusable).toBeNull();

    // It is still listed for the run, with its failure details.
    const runStages = await store.listRunStages('scan-run-2');
    expect(runStages).toEqual([
      expect.objectContaining({
        runId: 'scan-run-2',
        stage: 'embeddings',
        status: 'failed',
        errorMessage: 'embedding service timed out',
      }),
    ]);
  });

  it('summarizes resumed, completed, and failed stages for reports', () => {
    const summary = summarizeKtxScanEnrichmentState({
      resumedStages: ['descriptions'],
      completedStages: ['descriptions', 'embeddings'],
      failedStages: ['relationships'],
    });
    expect(summary).toEqual({
      resumedStages: ['descriptions'],
      completedStages: ['descriptions', 'embeddings'],
      failedStages: ['relationships'],
    });

    // The no-argument helper yields three empty stage lists.
    const emptySummary = completedKtxScanEnrichmentStateSummary();
    expect(emptySummary).toEqual({
      resumedStages: [],
      completedStages: [],
      failedStages: [],
    });
  });
});
|
||||
42
packages/cli/test/context/scan/enrichment-summary.test.ts
Normal file
42
packages/cli/test/context/scan/enrichment-summary.test.ts
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
failedKtxScanEnrichmentSummary,
|
||||
ktxScanErrorMessage,
|
||||
skippedKtxScanEnrichmentSummary,
|
||||
} from '../../../src/context/scan/enrichment-summary.js';
|
||||
|
||||
describe('KTX scan enrichment summaries', () => {
  it('keeps structural scans skipped when no enrichment was requested', () => {
    const summary = failedKtxScanEnrichmentSummary('structural', false);
    expect(summary).toEqual(skippedKtxScanEnrichmentSummary);
  });

  it('marks relationship stages failed when relationship detection fails', () => {
    // Only the deterministic and statistical relationship stages are expected to fail.
    const expected = {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'failed',
    };
    const summary = failedKtxScanEnrichmentSummary('relationships', true);
    expect(summary).toEqual(expected);
  });

  it('marks every enriched-only stage failed when full enrichment fails', () => {
    const expected = {
      dataDictionary: 'failed',
      tableDescriptions: 'failed',
      columnDescriptions: 'failed',
      embeddings: 'failed',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'failed',
      statisticalValidation: 'failed',
    };
    const summary = failedKtxScanEnrichmentSummary('enriched', true);
    expect(summary).toEqual(expected);
  });

  it('formats unknown thrown values for scan warnings', () => {
    // Error instance, plain string, and arbitrary object, in that order.
    const fromError = ktxScanErrorMessage(new Error('gateway timeout'));
    expect(fromError).toBe('gateway timeout');

    const fromString = ktxScanErrorMessage('plain failure');
    expect(fromString).toBe('plain failure');

    const fromObject = ktxScanErrorMessage({ code: 'E_SCAN' });
    expect(fromObject).toBe('{"code":"E_SCAN"}');
  });
});
|
||||
159
packages/cli/test/context/scan/enrichment-types.test.ts
Normal file
159
packages/cli/test/context/scan/enrichment-types.test.ts
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type {
|
||||
KtxColumnSampleUpdate,
|
||||
KtxDescriptionUpdate,
|
||||
KtxEmbeddingUpdate,
|
||||
KtxEnrichedSchema,
|
||||
KtxJoinUpdate,
|
||||
KtxRelationshipEndpoint,
|
||||
KtxRelationshipUpdate,
|
||||
KtxScanMetadataStore,
|
||||
KtxStructuralSyncPlan,
|
||||
} from '../../../src/context/scan/enrichment-types.js';
|
||||
|
||||
describe('KTX scan enrichment contracts', () => {
|
||||
it('models an enriched schema with reusable table, column, and relationship metadata', () => {
  // Compile-time contract check: the literal below must satisfy
  // KtxEnrichedSchema. It holds one table with one fully populated column,
  // and one formal many-to-one relationship from orders to customers.
  const schema: KtxEnrichedSchema = {
    connectionId: 'warehouse',
    tables: [
      {
        id: 'table-orders',
        ref: { catalog: 'analytics', db: 'public', name: 'orders' },
        enabled: true,
        descriptions: { db: 'Raw orders', ai: 'Customer orders' },
        columns: [
          {
            id: 'column-orders-status',
            tableId: 'table-orders',
            tableRef: { catalog: 'analytics', db: 'public', name: 'orders' },
            name: 'status',
            nativeType: 'varchar',
            normalizedType: 'string',
            dimensionType: 'string',
            nullable: false,
            primaryKey: false,
            parentColumnId: null,
            descriptions: { db: 'Status code' },
            embedding: [0.1, 0.2],
            sampleValues: ['paid', 'refunded'],
            cardinality: 2,
          },
        ],
      },
    ],
    relationships: [
      {
        id: 'rel-orders-customers',
        source: 'formal',
        from: {
          tableId: 'table-orders',
          columnIds: ['column-orders-customer-id'],
          table: { catalog: 'analytics', db: 'public', name: 'orders' },
          columns: ['customer_id'],
        },
        to: {
          tableId: 'table-customers',
          columnIds: ['column-customers-id'],
          table: { catalog: 'analytics', db: 'public', name: 'customers' },
          columns: ['id'],
        },
        relationshipType: 'many_to_one',
        confidence: 1,
        isPrimaryKeyReference: true,
      },
    ],
  };

  // Indexed access is optional-chained so that a missing element shows up as
  // an assertion mismatch rather than a TypeError, and so the lookups remain
  // valid if the compiler treats array indexing as possibly undefined.
  expect(schema.tables[0]?.columns[0]?.sampleValues).toEqual(['paid', 'refunded']);
  expect(schema.relationships[0]?.source).toBe('formal');
});
|
||||
|
||||
it('models metadata-store updates without requiring a concrete store implementation', async () => {
  // One sample payload per update contract.
  const structuralPlan: KtxStructuralSyncPlan = {
    connectionId: 'warehouse',
    snapshotId: 'snapshot-1',
    operations: [{ kind: 'create_table', table: 'orders' }],
  };
  const descriptionUpdate: KtxDescriptionUpdate = {
    connectionId: 'warehouse',
    table: { catalog: 'analytics', db: 'public', name: 'orders' },
    source: 'ai',
    tableDescription: 'Customer orders',
    columnDescriptions: { status: 'Payment lifecycle state' },
  };
  const sampleUpdate: KtxColumnSampleUpdate = {
    columnId: 'column-orders-status',
    sampleValues: ['paid', 'refunded'],
    cardinality: 2,
  };
  const embeddingUpdate: KtxEmbeddingUpdate = {
    columnId: 'column-orders-status',
    text: 'orders.status (varchar). Values: paid, refunded',
    embedding: [0.25, 0.75],
  };
  const relationshipUpdate: KtxRelationshipUpdate = {
    connectionId: 'warehouse',
    accepted: [],
    rejected: [],
    skipped: [{ reason: 'missing parent table', relationshipId: 'candidate-1' }],
  };

  // Inline store: each update method asserts it received the payload passed in below.
  const store: KtxScanMetadataStore = {
    loadSchema: async () => null,
    applyStructuralPlan: async (plan) => {
      return { connectionId: plan.connectionId, tables: [], relationships: [] };
    },
    updateDescriptions: async (received) => {
      expect(received).toEqual(descriptionUpdate);
    },
    updateColumnSamples: async (received) => {
      expect(received).toEqual([sampleUpdate]);
    },
    updateColumnEmbeddings: async (received) => {
      expect(received).toEqual([embeddingUpdate]);
    },
    updateInferredRelationships: async (received) => {
      expect(received).toEqual(relationshipUpdate);
    },
  };

  const loadedSchema = await store.loadSchema('warehouse');
  expect(loadedSchema).toBeNull();

  const appliedSchema = await store.applyStructuralPlan(structuralPlan);
  expect(appliedSchema).toEqual({
    connectionId: 'warehouse',
    tables: [],
    relationships: [],
  });

  // The update methods resolve with no value.
  await expect(store.updateDescriptions(descriptionUpdate)).resolves.toBeUndefined();
  await expect(store.updateColumnSamples([sampleUpdate])).resolves.toBeUndefined();
  await expect(store.updateColumnEmbeddings([embeddingUpdate])).resolves.toBeUndefined();
  await expect(store.updateInferredRelationships(relationshipUpdate)).resolves.toBeUndefined();
});
|
||||
});
|
||||
|
||||
describe('relationship tuple contracts', () => {
  it('represents relationship endpoints and join updates as ordered column tuples', () => {
    // Two-column key, listed in the same order on both sides of the join.
    const endpoint: KtxRelationshipEndpoint = {
      tableId: 'public.order_lines',
      columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
      table: { catalog: null, db: 'public', name: 'order_lines' },
      columns: ['order_id', 'line_number'],
    };
    const joinUpdate: KtxJoinUpdate = {
      connectionId: 'warehouse',
      fromTable: 'order_line_allocations',
      fromColumns: ['order_id', 'line_number'],
      toTable: 'order_lines',
      toColumns: ['order_id', 'line_number'],
      relationship: 'many_to_one',
      author: 'ktx',
      authorEmail: 'ktx@example.com',
    };

    // Endpoint: column names and column ids keep their declared order.
    const { columns, columnIds } = endpoint;
    expect(columns).toEqual(['order_id', 'line_number']);
    expect(columnIds).toEqual(['public.order_lines.order_id', 'public.order_lines.line_number']);

    // Join update: both column tuples keep their declared order.
    const { fromColumns, toColumns } = joinUpdate;
    expect(fromColumns).toEqual(['order_id', 'line_number']);
    expect(toColumns).toEqual(['order_id', 'line_number']);
  });
});
|
||||
307
packages/cli/test/context/scan/entity-details.test.ts
Normal file
307
packages/cli/test/context/scan/entity-details.test.ts
Normal file
|
|
@ -0,0 +1,307 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js';
|
||||
import { createKtxEntityDetailsService } from '../../../src/context/scan/entity-details.js';
|
||||
import type { KtxConnectionDriver, KtxScanReport, KtxSchemaTable } from '../../../src/context/scan/types.js';
|
||||
|
||||
describe('createKtxEntityDetailsService', () => {
|
||||
let tempDir: string;
|
||||
let project: KtxLocalProject;
|
||||
|
||||
// Every test runs against a fresh project created under its own temp directory.
beforeEach(async () => {
  const tempPrefix = join(tmpdir(), 'ktx-entity-details-service-');
  tempDir = await mkdtemp(tempPrefix);

  const projectDir = join(tempDir, 'project');
  project = await initKtxProject({ projectDir });
});

// Remove the whole temp directory; `force` keeps cleanup from failing if it is already gone.
afterEach(async () => {
  const removal = { recursive: true, force: true };
  await rm(tempDir, removal);
});
|
||||
|
||||
/**
 * Builds a scan report fixture for a structural, non-dry-run scan triggered via 'mcp'.
 *
 * Artifact paths are derived from the connection and sync ids; every
 * enrichment stage is reported as 'skipped' and all relationship counters
 * are zero.
 *
 * @param input.driver    Defaults to 'postgres' when omitted.
 * @param input.createdAt Defaults to '2026-05-14T09:00:00.000Z' when omitted.
 */
function scanReport(input: {
  connectionId: string;
  syncId: string;
  runId: string;
  driver?: KtxConnectionDriver;
  createdAt?: string;
}): KtxScanReport {
  const { connectionId, syncId, runId } = input;
  const rawSourcesDir = `raw-sources/${connectionId}/live-database/${syncId}`;
  const reportPath = `${rawSourcesDir}/scan-report.json`;

  // Property order is kept stable because callers serialize this object to JSON.
  const report: KtxScanReport = {
    connectionId,
    driver: input.driver ?? 'postgres',
    syncId,
    runId,
    trigger: 'mcp',
    mode: 'structural',
    dryRun: false,
    artifactPaths: {
      rawSourcesDir,
      reportPath,
      manifestShards: [],
      enrichmentArtifacts: [],
    },
    diffSummary: {
      tablesAdded: 0,
      tablesModified: 0,
      tablesDeleted: 0,
      tablesUnchanged: 1,
      columnsAdded: 0,
      columnsModified: 0,
      columnsDeleted: 0,
    },
    manifestShardsWritten: 0,
    structuralSyncStats: {
      tablesCreated: 1,
      tablesUpdated: 0,
      tablesDeleted: 0,
      columnsCreated: 0,
      columnsUpdated: 0,
      columnsDeleted: 0,
    },
    enrichment: {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'skipped',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'skipped',
    },
    capabilityGaps: [],
    warnings: [],
    relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
    enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] },
    createdAt: input.createdAt ?? '2026-05-14T09:00:00.000Z',
  };

  return report;
}
|
||||
|
||||
/**
 * Builds an `orders` table fixture with two columns (`id`, `status`) and one
 * foreign key from `customer_id` to `public.customers.id`.
 *
 * @param input.db            Schema name. Defaults to 'public' when omitted;
 *                            an explicit `null` is passed through unchanged.
 * @param input.estimatedRows Row estimate. Defaults to 12 when omitted; an
 *                            explicit `null` is passed through unchanged.
 */
function ordersTable(input: { db?: string | null; estimatedRows?: number | null } = {}): KtxSchemaTable {
  // Default only on `undefined`. The signature accepts `null`, and `??` would
  // silently turn an explicit null back into the default value.
  const db = input.db === undefined ? 'public' : input.db;
  const estimatedRows = input.estimatedRows === undefined ? 12 : input.estimatedRows;

  return {
    catalog: null,
    db,
    name: 'orders',
    kind: 'table',
    comment: 'Customer orders',
    estimatedRows,
    columns: [
      {
        name: 'id',
        nativeType: 'integer',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey: true,
        comment: 'Order id',
      },
      {
        name: 'status',
        nativeType: 'text',
        normalizedType: 'text',
        dimensionType: 'string',
        nullable: false,
        primaryKey: false,
        comment: 'Order status',
      },
    ],
    // The FK source column is intentionally not part of `columns` above; the
    // tests in this suite assert on exactly the two listed columns.
    foreignKeys: [
      {
        fromColumn: 'customer_id',
        toCatalog: null,
        toDb: 'public',
        toTable: 'customers',
        toColumn: 'id',
        constraintName: 'orders_customer_id_fkey',
      },
    ],
  };
}
|
||||
|
||||
/**
 * Seeds one scan snapshot into the project file store beneath the report's
 * `rawSourcesDir`: `connection.json`, then one JSON file per table under
 * `tables/`, then `scan-report.json`. The writes happen in that order.
 *
 * @param input.connectionId Defaults to 'warehouse'.
 * @param input.extractedAt  Used as the report's `createdAt` and as the
 *                           connection's `extractedAt` (falling back to the
 *                           report's `createdAt`).
 * @param input.tables       Defaults to a single `ordersTable()`.
 */
async function seedScan(input: {
  connectionId?: string;
  syncId: string;
  runId: string;
  driver?: KtxConnectionDriver;
  extractedAt?: string;
  tables?: KtxSchemaTable[];
}): Promise<void> {
  const { syncId, runId, driver, extractedAt } = input;
  const connectionId = input.connectionId ?? 'warehouse';

  const report = scanReport({ connectionId, syncId, runId, driver, createdAt: extractedAt });
  const root = report.artifactPaths.rawSourcesDir;

  // All seed files share the same trailing 'ktx' / 'ktx@example.com' arguments
  // (presumably author name and email — confirm against the file store API).
  const writeSeed = (path: string, content: string, message: string) =>
    project.fileStore.writeFile(path, content, 'ktx', 'ktx@example.com', message);

  const connection = {
    connectionId,
    driver: report.driver,
    extractedAt: extractedAt ?? report.createdAt,
    scope: { schemas: ['public'] },
  };
  await writeSeed(`${root}/connection.json`, JSON.stringify(connection, null, 2), 'seed connection');

  for (const table of input.tables ?? [ordersTable()]) {
    // Tables without a schema are filed under the 'default' prefix.
    const tablePath = `${root}/tables/${table.db ?? 'default'}-${table.name}.json`;
    await writeSeed(tablePath, JSON.stringify(table, null, 2), `seed ${table.name}`);
  }

  await writeSeed(`${root}/scan-report.json`, JSON.stringify(report, null, 2), 'seed scan report');
}
|
||||
|
||||
// Two snapshots are seeded; the service must answer from the newer one (sync-2).
it('returns the latest scan snapshot table details for a display string', async () => {
  await seedScan({ syncId: 'sync-1', runId: 'scan-old', extractedAt: '2026-05-14T08:00:00.000Z' });

  const newerTables = [ordersTable({ estimatedRows: 99 })];
  await seedScan({
    syncId: 'sync-2',
    runId: 'scan-new',
    extractedAt: '2026-05-14T09:00:00.000Z',
    tables: newerTables,
  });

  const service = createKtxEntityDetailsService(project);
  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: 'public.orders' }],
  });

  expect(results).toHaveLength(1);

  const expectedSnapshot = {
    syncId: 'sync-2',
    scanRunId: 'scan-new',
    extractedAt: '2026-05-14T09:00:00.000Z',
  };
  const expectedColumns = [
    { name: 'id', nativeType: 'integer', primaryKey: true },
    { name: 'status', nativeType: 'text', nullable: false },
  ];
  expect(results[0]).toMatchObject({
    ok: true,
    connectionId: 'warehouse',
    display: 'public.orders',
    estimatedRows: 99,
    snapshot: expectedSnapshot,
    columns: expectedColumns,
  });
});
|
||||
|
||||
// A double-quoted, schema-qualified name must resolve to the same table as the bare form.
it('resolves quoted qualified display strings through the dialect parser', async () => {
  await seedScan({ syncId: 'sync-1', runId: 'scan-1' });

  const quotedName = '"public"."orders"';
  const service = createKtxEntityDetailsService(project);
  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: quotedName }],
  });

  const expectedRef = { catalog: null, db: 'public', name: 'orders' };
  expect(results[0]).toMatchObject({
    ok: true,
    display: 'public.orders',
    tableRef: expectedRef,
  });
});
|
||||
|
||||
// Requesting a column subset narrows `columns` but leaves the table's foreign keys intact.
it('filters requested columns while keeping full-table foreign keys', async () => {
  await seedScan({ syncId: 'sync-1', runId: 'scan-1' });

  const ordersRef = { catalog: null, db: 'public', name: 'orders' };
  const service = createKtxEntityDetailsService(project);
  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: ordersRef, columns: ['status'] }],
  });

  const expectedForeignKey = {
    fromColumn: 'customer_id',
    toDb: 'public',
    toTable: 'customers',
    toColumn: 'id',
  };
  expect(results[0]).toMatchObject({
    ok: true,
    columns: [{ name: 'status' }],
    foreignKeys: [expectedForeignKey],
  });
});
|
||||
|
||||
// Nothing is seeded here, so the lookup must fail with a `scan_missing` error entry.
it('returns a structured missing-scan error', async () => {
  const service = createKtxEntityDetailsService(project);

  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: 'public.orders' }],
  });

  const expectedError = {
    code: 'scan_missing',
    message: 'No live-database scan found for connection "warehouse"; run `ktx ingest warehouse` or `ktx scan warehouse`.',
  };
  expect(results).toEqual([
    {
      ok: false,
      connectionId: 'warehouse',
      table: 'public.orders',
      error: expectedError,
    },
  ]);
});
|
||||
|
||||
// `orders` exists in two schemas, so an unqualified lookup must list both candidates.
it('reports ambiguous bare table names across schemas', async () => {
  const sameNameTables = [ordersTable({ db: 'public' }), ordersTable({ db: 'archive' })];
  await seedScan({ syncId: 'sync-1', runId: 'scan-1', tables: sameNameTables });

  const service = createKtxEntityDetailsService(project);
  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: 'orders' }],
  });

  // The expected candidate order is archive first, then public.
  const expectedCandidates = [
    { tableRef: { catalog: null, db: 'archive', name: 'orders' }, display: 'archive.orders' },
    { tableRef: { catalog: null, db: 'public', name: 'orders' }, display: 'public.orders' },
  ];
  expect(results[0]).toMatchObject({
    ok: false,
    error: {
      code: 'ambiguous_table',
      candidates: expectedCandidates,
    },
  });
});
|
||||
|
||||
// `plan_tier` is not a column of the seeded table; the error names it and lists the real columns.
it('reports missing requested columns with available column candidates', async () => {
  await seedScan({ syncId: 'sync-1', runId: 'scan-1' });

  const requestedColumns = ['status', 'plan_tier'];
  const service = createKtxEntityDetailsService(project);
  const { results } = await service.read({
    connectionId: 'warehouse',
    entities: [{ table: 'public.orders', columns: requestedColumns }],
  });

  const expectedError = {
    code: 'column_not_found',
    message: 'Column(s) not found on public.orders: plan_tier',
    candidates: ['id', 'status'],
  };
  expect(results[0]).toMatchObject({ ok: false, error: expectedError });
});
|
||||
});
|
||||
|
|
@ -0,0 +1,911 @@
|
|||
import { mkdtemp, readFile, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import YAML from 'yaml';
|
||||
import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js';
|
||||
import type { KtxLocalScanEnrichmentResult } from '../../../src/context/scan/local-enrichment.js';
|
||||
import { writeLocalScanEnrichmentArtifacts, writeLocalScanManifestShards } from '../../../src/context/scan/local-enrichment-artifacts.js';
|
||||
import type { KtxSchemaSnapshot } from '../../../src/context/scan/types.js';
|
||||
|
||||
type SnapshotTableFixture = KtxSchemaSnapshot['tables'][number];

// public.customers: one integer primary-key column and no foreign keys.
const customersSnapshotTable: SnapshotTableFixture = {
  catalog: null,
  db: 'public',
  name: 'customers',
  kind: 'table',
  comment: 'DB customer table',
  estimatedRows: 2,
  foreignKeys: [],
  columns: [
    {
      name: 'id',
      nativeType: 'integer',
      normalizedType: 'integer',
      dimensionType: 'number',
      nullable: false,
      primaryKey: true,
      comment: 'DB customer id',
    },
  ],
};

// public.orders: `id` plus `customer_id`, with a declared FK to public.customers.id.
const ordersSnapshotTable: SnapshotTableFixture = {
  catalog: null,
  db: 'public',
  name: 'orders',
  kind: 'table',
  comment: 'DB orders table',
  estimatedRows: 3,
  foreignKeys: [
    {
      fromColumn: 'customer_id',
      toCatalog: null,
      toDb: 'public',
      toTable: 'customers',
      toColumn: 'id',
      constraintName: 'orders_customer_id_fkey',
    },
  ],
  columns: [
    {
      name: 'id',
      nativeType: 'integer',
      normalizedType: 'integer',
      dimensionType: 'number',
      nullable: false,
      primaryKey: true,
      comment: 'DB order id',
    },
    {
      name: 'customer_id',
      nativeType: 'integer',
      normalizedType: 'integer',
      dimensionType: 'number',
      nullable: false,
      primaryKey: false,
      comment: 'DB customer id',
    },
  ],
};

// Shared schema snapshot fixture: a postgres `warehouse` connection scoped to
// the `public` schema, containing the customers and orders tables (in that order).
const snapshot: KtxSchemaSnapshot = {
  connectionId: 'warehouse',
  driver: 'postgres',
  extractedAt: '2026-04-29T12:00:00.000Z',
  scope: { schemas: ['public'] },
  metadata: {},
  tables: [customersSnapshotTable, ordersSnapshotTable],
};
|
||||
|
||||
/**
 * Builds a fresh enrichment result fixture on top of the shared `snapshot`.
 *
 * It contains AI descriptions for orders and customers, two embeddings for
 * the orders columns, one accepted orders.customer_id -> customers.id
 * relationship (source 'inferred' in `relationshipUpdate`, source
 * 'llm_proposal' with full evidence/validation/graph data in
 * `resolvedRelationships`), a relationship profile for customers.id, and no
 * composite relationships.
 */
function enrichment(): KtxLocalScanEnrichmentResult {
  type Result = KtxLocalScanEnrichmentResult;

  // Both the relationship update and the resolved relationship use this id.
  const relationshipId = 'public.orders:public.orders.customer_id->public.customers:public.customers.id';

  const summary: Result['summary'] = {
    dataDictionary: 'completed',
    tableDescriptions: 'completed',
    columnDescriptions: 'completed',
    embeddings: 'completed',
    deterministicRelationships: 'completed',
    llmRelationshipValidation: 'skipped',
    statisticalValidation: 'skipped',
  };

  const state: Result['state'] = {
    resumedStages: [],
    completedStages: ['descriptions', 'embeddings', 'relationships'],
    failedStages: [],
  };

  const descriptionUpdates: Result['descriptionUpdates'] = [
    {
      table: { catalog: null, db: 'public', name: 'orders' },
      tableDescription: 'AI orders table',
      columnDescriptions: {
        id: 'AI order id',
        customer_id: 'AI customer reference',
      },
    },
    {
      table: { catalog: null, db: 'public', name: 'customers' },
      tableDescription: 'AI customers table',
      columnDescriptions: {
        id: 'AI customer id',
      },
    },
  ];

  const embeddingUpdates: Result['embeddingUpdates'] = [
    { columnId: 'public.orders.id', text: 'orders id', embedding: [0.1, 0.2] },
    { columnId: 'public.orders.customer_id', text: 'orders customer_id', embedding: [0.3, 0.4] },
  ];

  const relationshipUpdate: Result['relationshipUpdate'] = {
    connectionId: 'warehouse',
    accepted: [
      {
        id: relationshipId,
        source: 'inferred',
        from: {
          tableId: 'public.orders',
          columnIds: ['public.orders.customer_id'],
          table: { catalog: null, db: 'public', name: 'orders' },
          columns: ['customer_id'],
        },
        to: {
          tableId: 'public.customers',
          columnIds: ['public.customers.id'],
          table: { catalog: null, db: 'public', name: 'customers' },
          columns: ['id'],
        },
        relationshipType: 'many_to_one',
        confidence: 0.95,
        isPrimaryKeyReference: true,
      },
    ],
    rejected: [],
    skipped: [],
  };

  const relationshipProfile: Result['relationshipProfile'] = {
    connectionId: 'warehouse',
    driver: 'postgres',
    sqlAvailable: true,
    queryCount: 6,
    tables: [{ table: { catalog: null, db: 'public', name: 'customers' }, rowCount: 2 }],
    columns: {
      'customers.id': {
        table: { catalog: null, db: 'public', name: 'customers' },
        column: 'id',
        nativeType: 'integer',
        normalizedType: 'integer',
        rowCount: 2,
        nullCount: 0,
        distinctCount: 2,
        uniquenessRatio: 1,
        nullRate: 0,
        sampleValues: ['1', '2'],
        minTextLength: 1,
        maxTextLength: 1,
      },
    },
    warnings: [],
  };

  const resolvedRelationships: Result['resolvedRelationships'] = [
    {
      id: relationshipId,
      source: 'llm_proposal',
      status: 'accepted',
      from: {
        tableId: 'public.orders',
        columnIds: ['public.orders.customer_id'],
        table: { catalog: null, db: 'public', name: 'orders' },
        columns: ['customer_id'],
      },
      to: {
        tableId: 'public.customers',
        columnIds: ['public.customers.id'],
        table: { catalog: null, db: 'public', name: 'customers' },
        columns: ['id'],
      },
      relationshipType: 'many_to_one',
      confidence: 0.92,
      pkScore: 0.95,
      fkScore: 0.91,
      score: 0.9,
      evidence: {
        sourceColumnBase: 'buyer',
        targetTableBase: 'customer',
        targetColumnBase: 'id',
        targetKeyScore: 0.88,
        nameScore: 0.45,
        reasons: ['llm_proposal', 'llm_pk_proposal'],
        llmConfidence: 0.89,
        llmRationale: 'Buyer reference values align with customer identifiers.',
      },
      validation: {
        targetUniqueness: 1,
        sourceCoverage: 1,
        violationCount: 0,
        violationRatio: 0,
        sourceNullRate: 0,
        targetNullRate: 0,
        childDistinct: 2,
        parentDistinct: 2,
        overlap: 2,
        checkedValues: 2,
        reasons: ['validation_passed'],
      },
      graph: {
        targetPkScore: 0.95,
        incomingCandidateCount: 1,
        conflictRank: 1,
        reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'],
      },
    },
  ];

  return {
    snapshot,
    summary,
    relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
    state,
    warnings: [],
    descriptionUpdates,
    embeddingUpdates,
    relationshipUpdate,
    relationshipProfile,
    resolvedRelationships,
    compositeRelationships: null,
  };
}
|
||||
|
||||
describe('writeLocalScanEnrichmentArtifacts', () => {
|
||||
let tempDir: string;
|
||||
let project: KtxLocalProject;
|
||||
|
||||
// Each test writes into a brand-new project rooted in its own temp directory.
beforeEach(async () => {
  const tempPrefix = join(tmpdir(), 'ktx-local-enrichment-artifacts-');
  tempDir = await mkdtemp(tempPrefix);

  const projectDir = join(tempDir, 'project');
  project = await initKtxProject({ projectDir });
});

// Delete the temp directory tree; `force` avoids an error if it no longer exists.
afterEach(async () => {
  const removal = { recursive: true, force: true };
  await rm(tempDir, removal);
});
|
||||
|
||||
// End-to-end check: a pre-existing shard with user descriptions and a manual
// join is seeded, enrichment artifacts are written, and then every artifact
// plus the merged manifest shard is read back and verified.
it('writes enrichment artifacts and manifest shards while preserving external descriptions', async () => {
  type AcceptedRelationship = {
    id: string;
    status: string;
    source: string;
    pkScore: number;
    fkScore: number;
    evidence: unknown;
    reasons: string[];
    validation: unknown;
    graph: unknown;
  };
  type RelationshipsArtifact = {
    accepted: AcceptedRelationship[];
    review: unknown[];
    rejected: unknown[];
    skipped: unknown[];
  };
  type ManifestShard = {
    tables: {
      orders: {
        descriptions: Record<string, string>;
        columns: Array<{ name: string; descriptions?: Record<string, string> }>;
        joins: Array<{ to: string; on: string; source: string }>;
      };
    };
  };

  const manifestShardPath = 'semantic-layer/warehouse/_schema/public.yaml';
  const artifactPath = (fileName: string) => `raw-sources/warehouse/live-database/sync-1/enrichment/${fileName}`;
  const readProjectFile = (relativePath: string) => readFile(join(project.projectDir, relativePath), 'utf-8');

  // Existing shard content that the writer is expected to merge with.
  const existingShard = {
    tables: {
      orders: {
        table: 'public.orders',
        descriptions: { user: 'Pinned analyst description', ai: 'Old AI description' },
        columns: [
          {
            name: 'id',
            type: 'number',
            descriptions: { user: 'Pinned id description', ai: 'Old AI id' },
          },
          { name: 'customer_id', type: 'number' },
        ],
        joins: [
          {
            to: 'customers',
            on: 'orders.id = customers.id',
            relationship: 'many_to_one',
            source: 'manual',
          },
        ],
      },
    },
  };
  await project.fileStore.writeFile(
    manifestShardPath,
    YAML.stringify(existingShard, { indent: 2, lineWidth: 0 }),
    'ktx',
    'ktx@example.com',
    'Seed manifest shard',
  );

  const result = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-1',
    driver: 'postgres',
    enrichment: enrichment(),
    dryRun: false,
    relationshipSettings: {
      enabled: true,
      llmProposals: false,
      validationRequiredForManifest: true,
      acceptThreshold: 0.91,
      reviewThreshold: 0.61,
      maxLlmTablesPerBatch: 12,
      maxCandidatesPerColumn: 7,
      profileSampleRows: 500,
      profileConcurrency: 3,
      validationConcurrency: 2,
    },
  });

  expect(result).toEqual({
    enrichmentArtifacts: [
      artifactPath('descriptions.json'),
      artifactPath('embeddings.json'),
      artifactPath('relationships.json'),
      artifactPath('relationship-profile.json'),
      artifactPath('relationship-diagnostics.json'),
    ],
    manifestShards: [manifestShardPath],
    manifestShardsWritten: 1,
  });

  // descriptions.json
  await expect(readProjectFile(artifactPath('descriptions.json'))).resolves.toContain('AI orders table');

  // relationships.json
  const relationshipsArtifact = JSON.parse(
    await readProjectFile(artifactPath('relationships.json')),
  ) as RelationshipsArtifact;
  const { accepted, review, rejected, skipped } = relationshipsArtifact;
  expect(accepted).toHaveLength(1);
  expect(accepted[0]).toMatchObject({
    id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
    status: 'accepted',
    source: 'llm_proposal',
    pkScore: 0.95,
    fkScore: 0.91,
    evidence: expect.objectContaining({
      llmConfidence: 0.89,
      llmRationale: 'Buyer reference values align with customer identifiers.',
    }),
    reasons: expect.arrayContaining(['llm_proposal', 'llm_pk_proposal']),
    validation: expect.objectContaining({ reasons: ['validation_passed'] }),
    graph: expect.objectContaining({ reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'] }),
  });
  expect(review).toEqual([]);
  expect(rejected).toEqual([]);
  expect(skipped).toEqual([]);

  // relationship-profile.json
  const profile = JSON.parse(await readProjectFile(artifactPath('relationship-profile.json')));
  expect(profile).toMatchObject({
    connectionId: 'warehouse',
    driver: 'postgres',
    sqlAvailable: true,
    queryCount: 6,
    warnings: [],
  });

  // relationship-diagnostics.json echoes the thresholds and policy passed in above.
  const diagnostics = JSON.parse(await readProjectFile(artifactPath('relationship-diagnostics.json')));
  expect(diagnostics).toMatchObject({
    connectionId: 'warehouse',
    summary: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
    noAcceptedReason: null,
    candidateCountsBySource: { llm_proposal: 1 },
    validation: { available: true, sqlAvailable: true, queryCount: 6 },
    thresholds: { acceptThreshold: 0.91, reviewThreshold: 0.61 },
    policy: {
      validationRequiredForManifest: true,
      maxCandidatesPerColumn: 7,
      profileSampleRows: 500,
      profileConcurrency: 3,
      validationConcurrency: 2,
    },
    profileWarnings: [],
  });

  // Manifest shard: user text stays pinned, db/ai descriptions are refreshed.
  const manifest = YAML.parse(await readProjectFile(manifestShardPath)) as ManifestShard;
  const { orders } = manifest.tables;

  expect(orders.descriptions).toEqual({
    user: 'Pinned analyst description',
    db: 'DB orders table',
    ai: 'AI orders table',
  });

  const idColumn = orders.columns.find((column) => column.name === 'id');
  expect(idColumn?.descriptions).toEqual({
    user: 'Pinned id description',
    db: 'DB order id',
    ai: 'AI order id',
  });

  // Both the formal join and the previously seeded manual join must be present.
  const formalJoin = expect.objectContaining({
    to: 'customers',
    on: 'orders.customer_id = customers.id',
    source: 'formal',
  });
  const manualJoin = expect.objectContaining({
    to: 'customers',
    on: 'orders.id = customers.id',
    source: 'manual',
  });
  expect(orders.joins).toEqual(expect.arrayContaining([formalJoin, manualJoin]));
});
|
||||
|
||||
// A relationship with source 'formal' and no resolved/composite entries must
// still reach relationships.json and the manifest joins.
it('writes formal accepted relationships into relationship artifacts and manifest shards', async () => {
  const formalEnrichment: KtxLocalScanEnrichmentResult = {
    ...enrichment(),
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [
        {
          id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
          source: 'formal',
          from: {
            tableId: 'public.orders',
            columnIds: ['public.orders.customer_id'],
            table: { catalog: null, db: 'public', name: 'orders' },
            columns: ['customer_id'],
          },
          to: {
            tableId: 'public.customers',
            columnIds: ['public.customers.id'],
            table: { catalog: null, db: 'public', name: 'customers' },
            columns: ['id'],
          },
          relationshipType: 'many_to_one',
          confidence: 1,
          isPrimaryKeyReference: true,
        },
      ],
      rejected: [],
      skipped: [],
    },
    resolvedRelationships: [],
    compositeRelationships: null,
  };

  const { manifestShards } = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    driver: 'sqlite',
    syncId: 'sync-formal',
    enrichment: formalEnrichment,
    relationshipSettings: {
      enabled: true,
      llmProposals: false,
      validationRequiredForManifest: true,
      acceptThreshold: 0.85,
      reviewThreshold: 0.55,
      maxLlmTablesPerBatch: 40,
      maxCandidatesPerColumn: 25,
      profileSampleRows: 10000,
      profileConcurrency: 4,
      validationConcurrency: 4,
    },
    dryRun: false,
  });

  // Relationship artifact, read back through the project file store.
  const relationshipsFile = await project.fileStore.readFile(
    'raw-sources/warehouse/live-database/sync-formal/enrichment/relationships.json',
  );
  const relationships = JSON.parse(relationshipsFile.content) as {
    accepted: Array<{ source: string; reasons: string[] }>;
  };
  expect(relationships.accepted).toEqual([
    expect.objectContaining({
      source: 'formal',
      reasons: ['formal_metadata_accepted'],
    }),
  ]);

  // Manifest shard: the first reported shard must contain the formal join.
  const [manifestPath] = manifestShards;
  if (!manifestPath) {
    throw new Error('Expected manifest shard path');
  }
  const manifestFile = await project.fileStore.readFile(manifestPath);
  const manifest = YAML.parse(manifestFile.content) as {
    tables: { orders: { joins: Array<{ to: string; on: string; source: string }> } };
  };
  const formalJoin = expect.objectContaining({
    to: 'customers',
    on: 'orders.customer_id = customers.id',
    source: 'formal',
  });
  expect(manifest.tables.orders.joins).toEqual(expect.arrayContaining([formalJoin]));
});
|
||||
|
||||
// Writing shards directly with a 'manual' relationship must produce a join tagged `source: 'manual'`.
it('writes manually applied relationship joins with manual source', async () => {
  const { manifestShardsWritten } = await writeLocalScanManifestShards({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-manual',
    driver: 'postgres',
    snapshot,
    dryRun: false,
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [
        {
          id: 'public.orders:(public.orders.customer_id)->public.customers:(public.customers.id)',
          source: 'manual',
          from: {
            tableId: 'public.orders',
            columnIds: ['public.orders.customer_id'],
            table: { catalog: null, db: 'public', name: 'orders' },
            columns: ['customer_id'],
          },
          to: {
            tableId: 'public.customers',
            columnIds: ['public.customers.id'],
            table: { catalog: null, db: 'public', name: 'customers' },
            columns: ['id'],
          },
          relationshipType: 'many_to_one',
          confidence: 1,
          isPrimaryKeyReference: true,
        },
      ],
      rejected: [],
      skipped: [],
    },
  });

  expect(manifestShardsWritten).toBe(1);

  // Read the shard straight from disk, below the temp project directory.
  const shardFile = join(tempDir, 'project/semantic-layer/warehouse/_schema/public.yaml');
  const shard = YAML.parse(await readFile(shardFile, 'utf8'));
  const expectedJoin = {
    to: 'customers',
    on: 'orders.customer_id = customers.id',
    relationship: 'many_to_one',
    source: 'manual',
  };
  expect(shard.tables.orders.joins).toContainEqual(expectedJoin);
});
|
||||
|
||||
// Descriptions that begin with the generator's error text must not be stored
// as `ai` descriptions; genuine AI text for other columns is still written.
it('does not persist generated error descriptions in manifest shards', async () => {
  type ShardContent = {
    tables: {
      orders: {
        descriptions?: Record<string, string>;
        columns: Array<{ name: string; descriptions?: Record<string, string> }>;
      };
    };
  };

  await writeLocalScanManifestShards({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-error-description',
    driver: 'postgres',
    snapshot,
    descriptionUpdates: [
      {
        table: { catalog: null, db: 'public', name: 'orders' },
        tableDescription: 'Error generating description: timeout exceeded when trying to connect',
        columnDescriptions: {
          id: 'Error generating description: timeout exceeded when trying to connect',
          customer_id: 'AI customer reference',
        },
      },
    ],
    dryRun: false,
  });

  const shardFile = join(tempDir, 'project/semantic-layer/warehouse/_schema/public.yaml');
  const shard = YAML.parse(await readFile(shardFile, 'utf8')) as ShardContent;
  const { orders } = shard.tables;
  const descriptionsOf = (columnName: string) =>
    orders.columns.find((column) => column.name === columnName)?.descriptions;

  // Only the database comment survives where the AI text was an error message.
  expect(orders.descriptions).toEqual({ db: 'DB orders table' });
  expect(descriptionsOf('id')).toEqual({
    db: 'DB order id',
  });
  expect(descriptionsOf('customer_id')).toEqual({
    db: 'DB customer id',
    ai: 'AI customer reference',
  });
});
|
||||
|
||||
// A two-column (order_id, line_number) relationship must be written to
// relationships.json and rendered as a single AND-joined manifest join.
it('writes accepted composite relationships to relationship artifacts and manifest shards', async () => {
  type SnapshotTable = KtxSchemaSnapshot['tables'][number];
  type SnapshotColumn = SnapshotTable['columns'][number];

  // Non-nullable, non-PK integer column without a comment.
  const integerColumn = (name: string): SnapshotColumn => ({
    name,
    nativeType: 'integer',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  // Both tables have the same shape: (order_id, line_number), no declared FKs.
  const keyedTable = (name: string): SnapshotTable => ({
    catalog: null,
    db: 'public',
    name,
    kind: 'table',
    comment: null,
    estimatedRows: 2,
    foreignKeys: [],
    columns: [integerColumn('order_id'), integerColumn('line_number')],
  });

  const compositeSnapshot: KtxSchemaSnapshot = {
    connectionId: 'warehouse',
    driver: 'postgres',
    extractedAt: '2026-05-07T12:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: {},
    tables: [keyedTable('order_lines'), keyedTable('order_line_allocations')],
  };

  const compositeId = 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)';

  // Object.assign (rather than a spread under the type annotation) is kept
  // deliberately so the override object is typed exactly as before.
  const compositeEnrichment: KtxLocalScanEnrichmentResult = Object.assign(enrichment(), {
    snapshot: compositeSnapshot,
    relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
    descriptionUpdates: [],
    embeddingUpdates: [],
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [
        {
          id: compositeId,
          source: 'inferred',
          from: {
            tableId: 'public.order_line_allocations',
            columnIds: ['public.order_line_allocations.order_id', 'public.order_line_allocations.line_number'],
            table: { catalog: null, db: 'public', name: 'order_line_allocations' },
            columns: ['order_id', 'line_number'],
          },
          to: {
            tableId: 'public.order_lines',
            columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
            table: { catalog: null, db: 'public', name: 'order_lines' },
            columns: ['order_id', 'line_number'],
          },
          relationshipType: 'many_to_one',
          confidence: 0.95,
          isPrimaryKeyReference: true,
        },
      ],
      rejected: [],
      skipped: [],
    },
    resolvedRelationships: [],
    compositeRelationships: [
      {
        id: compositeId,
        source: 'composite_profile_match',
        status: 'accepted',
        from: {
          tableId: 'public.order_line_allocations',
          columnIds: ['public.order_line_allocations.order_id', 'public.order_line_allocations.line_number'],
          table: { catalog: null, db: 'public', name: 'order_line_allocations' },
          columns: ['order_id', 'line_number'],
        },
        to: {
          tableId: 'public.order_lines',
          columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
          table: { catalog: null, db: 'public', name: 'order_lines' },
          columns: ['order_id', 'line_number'],
        },
        relationshipType: 'many_to_one',
        confidence: 0.95,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          childDistinct: 2,
          parentDistinct: 2,
          overlap: 2,
          reasons: ['composite_validation_passed'],
        },
      },
    ],
  });

  const { manifestShards } = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    driver: 'postgres',
    syncId: 'sync-composite',
    enrichment: compositeEnrichment,
    relationshipSettings: {
      enabled: true,
      llmProposals: false,
      validationRequiredForManifest: true,
      acceptThreshold: 0.85,
      reviewThreshold: 0.55,
      maxLlmTablesPerBatch: 40,
      maxCandidatesPerColumn: 25,
      profileSampleRows: 10000,
      profileConcurrency: 4,
      validationConcurrency: 4,
    },
    dryRun: false,
  });

  // Relationship artifact keeps both column tuples in order.
  const relationshipsFile = await project.fileStore.readFile(
    'raw-sources/warehouse/live-database/sync-composite/enrichment/relationships.json',
  );
  const relationships = JSON.parse(relationshipsFile.content) as {
    accepted: Array<{ from: { columns: string[] }; to: { columns: string[] }; reasons: string[] }>;
  };
  expect(relationships.accepted[0]).toMatchObject({
    from: { columns: ['order_id', 'line_number'] },
    to: { columns: ['order_id', 'line_number'] },
    reasons: ['composite_validation_passed'],
  });

  // Manifest shard: exactly one join, with one equality per column pair.
  const [manifestPath] = manifestShards;
  if (!manifestPath) {
    throw new Error('Expected manifest shard path');
  }
  const manifestFile = await project.fileStore.readFile(manifestPath);
  const manifest = YAML.parse(manifestFile.content) as {
    tables: { order_line_allocations: { joins: Array<{ to: string; on: string; source: string }> } };
  };
  expect(manifest.tables.order_line_allocations.joins).toEqual([
    {
      to: 'order_lines',
      on: 'order_line_allocations.order_id = order_lines.order_id AND order_line_allocations.line_number = order_lines.line_number',
      relationship: 'many_to_one',
      source: 'inferred',
    },
  ]);
});
|
||||
|
||||
it('writes structural manifest shards without enrichment artifacts', async () => {
  const shardPath = 'semantic-layer/warehouse/_schema/public.yaml';

  // Seed an existing shard that carries user + ai descriptions, usage notes and a
  // manual join, so the assertions below can tell which parts survive the rewrite.
  const seededShard = {
    tables: {
      orders: {
        table: 'public.orders',
        descriptions: { user: 'Pinned structural description', ai: 'Old generated text' },
        usage: {
          narrative: 'Orders are commonly filtered by lifecycle status.',
          frequencyTier: 'high',
          commonFilters: ['status'],
          commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
          ownerNote: 'Preserve analyst note',
        },
        columns: [
          {
            name: 'id',
            type: 'number',
            descriptions: { user: 'Pinned structural id', ai: 'Old generated id' },
          },
          { name: 'customer_id', type: 'number' },
        ],
        joins: [
          {
            to: 'customers',
            on: 'orders.id = customers.id',
            relationship: 'many_to_one',
            source: 'manual',
          },
        ],
      },
    },
  };
  await project.fileStore.writeFile(
    shardPath,
    YAML.stringify(seededShard, { indent: 2, lineWidth: 0 }),
    'ktx',
    'ktx@example.com',
    'Seed structural manifest shard',
  );

  const writeResult = await writeLocalScanManifestShards({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-structural-1',
    driver: 'postgres',
    snapshot,
    dryRun: false,
  });

  expect(writeResult).toEqual({
    manifestShards: [shardPath],
    manifestShardsWritten: 1,
  });

  // A structural write must not leave a descriptions artifact behind for this sync.
  const descriptionsArtifactPath = join(
    project.projectDir,
    'raw-sources/warehouse/live-database/sync-structural-1/enrichment/descriptions.json',
  );
  await expect(readFile(descriptionsArtifactPath, 'utf-8')).rejects.toMatchObject({ code: 'ENOENT' });

  const rewrittenYaml = await readFile(join(project.projectDir, shardPath), 'utf-8');
  const rewritten = YAML.parse(rewrittenYaml) as {
    tables: {
      orders: {
        descriptions: Record<string, string>;
        usage?: Record<string, unknown>;
        columns: Array<{ name: string; descriptions?: Record<string, string> }>;
        joins: Array<{ to: string; on: string; source: string }>;
      };
    };
  };
  const orders = rewritten.tables.orders;

  // Expected table descriptions: the user text plus a db text; no ai entry.
  expect(orders.descriptions).toEqual({
    user: 'Pinned structural description',
    db: 'DB orders table',
  });
  // Usage block is expected back exactly as seeded.
  expect(orders.usage).toEqual({
    narrative: 'Orders are commonly filtered by lifecycle status.',
    frequencyTier: 'high',
    commonFilters: ['status'],
    commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
    ownerNote: 'Preserve analyst note',
  });
  const idColumn = orders.columns.find((column) => column.name === 'id');
  expect(idColumn?.descriptions).toEqual({
    user: 'Pinned structural id',
    db: 'DB order id',
  });
  // Both the formal join and the seeded manual join must be present.
  const formalJoin = expect.objectContaining({
    to: 'customers',
    on: 'orders.customer_id = customers.id',
    source: 'formal',
  });
  const manualJoin = expect.objectContaining({
    to: 'customers',
    on: 'orders.id = customers.id',
    source: 'manual',
  });
  expect(orders.joins).toEqual(expect.arrayContaining([formalJoin, manualJoin]));
});
|
||||
|
||||
it('returns planned empty paths without writing files during dry runs', async () => {
  const dryRunResult = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-dry-run',
    driver: 'postgres',
    enrichment: enrichment(),
    dryRun: true,
  });

  // A dry run reports nothing written...
  expect(dryRunResult).toEqual({
    enrichmentArtifacts: [],
    manifestShards: [],
    manifestShardsWritten: 0,
  });

  // ...and the descriptions artifact must not exist on disk.
  const descriptionsArtifactPath = join(
    project.projectDir,
    'raw-sources/warehouse/live-database/sync-dry-run/enrichment/descriptions.json',
  );
  await expect(readFile(descriptionsArtifactPath, 'utf-8')).rejects.toMatchObject({ code: 'ENOENT' });
});
|
||||
});
|
||||
871
packages/cli/test/context/scan/local-enrichment.test.ts
Normal file
871
packages/cli/test/context/scan/local-enrichment.test.ts
Normal file
|
|
@ -0,0 +1,871 @@
|
|||
import Database from 'better-sqlite3';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { buildDefaultKtxProjectConfig } from '../../../src/context/project/config.js';
|
||||
import type {
|
||||
KtxScanEnrichmentCompletedStage,
|
||||
KtxScanEnrichmentFailedStage,
|
||||
KtxScanEnrichmentStageLookup,
|
||||
KtxScanEnrichmentStateStore,
|
||||
} from '../../../src/context/scan/enrichment-state.js';
|
||||
import {
|
||||
createDeterministicLocalScanEnrichmentProviders,
|
||||
runLocalScanEnrichment,
|
||||
snapshotToKtxEnrichedSchema,
|
||||
} from '../../../src/context/scan/local-enrichment.js';
|
||||
import {
|
||||
createKtxConnectorCapabilities,
|
||||
type KtxQueryResult,
|
||||
type KtxReadOnlyQueryInput,
|
||||
type KtxEmbeddingPort,
|
||||
type KtxScanConnector,
|
||||
type KtxScanContext,
|
||||
type KtxSchemaSnapshot,
|
||||
} from '../../../src/context/scan/types.js';
|
||||
|
||||
/**
 * Builds a deterministic fake embedding port for scan tests.
 *
 * The text content is ignored: the vector for the text at batch position `t`
 * holds `t + d` at dimension index `d`.
 *
 * @param options.dimensions Length of every returned vector.
 * @param options.maxBatchSize Advertised batch limit; falls back to 64 when nullish.
 * @returns An embedding port whose `embedBatch` resolves to one vector per input text.
 */
function fakeScanEmbedding(options: { dimensions: number; maxBatchSize?: number }): KtxEmbeddingPort {
  // `options.dimensions` is read on every call, exactly as the vectors are built lazily.
  const vectorForTextAt = (textIndex: number): number[] =>
    Array.from({ length: options.dimensions }, (_unused, dimensionIndex) => textIndex + dimensionIndex);

  const port: KtxEmbeddingPort = {
    dimensions: options.dimensions,
    maxBatchSize: options.maxBatchSize ?? 64,
    async embedBatch(texts) {
      return texts.map((_text, textIndex) => vectorForTextAt(textIndex));
    },
  };
  return port;
}
|
||||
|
||||
/**
 * Shared Postgres-flavoured schema snapshot used by most tests in this file:
 * two `public` tables (`customers`, `orders`), neither declaring a foreign key.
 */
const snapshot: KtxSchemaSnapshot = {
  connectionId: 'warehouse',
  driver: 'postgres',
  extractedAt: '2026-04-29T12:00:00.000Z',
  scope: { schemas: ['public'] },
  metadata: {},
  tables: [
    // public.customers — a single primary-key column.
    {
      catalog: null,
      db: 'public',
      name: 'customers',
      kind: 'table',
      comment: 'Customer accounts',
      estimatedRows: 2,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'Customer id',
        },
      ],
    },
    // public.orders — primary key plus a customer_id column with no declared FK.
    {
      catalog: null,
      db: 'public',
      name: 'orders',
      kind: 'table',
      comment: 'Customer orders',
      estimatedRows: 3,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'Order id',
        },
        {
          name: 'customer_id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: false,
          comment: 'Customer id',
        },
      ],
    },
  ],
};
|
||||
|
||||
/**
 * Creates a fresh mock scan connector for the shared `snapshot` fixture.
 *
 * Every method is a new `vi.fn`, so call counts never leak between tests. The
 * connector advertises `readOnlySql` but defines no `executeReadOnly`; tests
 * that need SQL execution spread this object and add their own executor.
 */
function connector(): KtxScanConnector {
  const capabilities = createKtxConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    readOnlySql: true,
    columnStats: true,
  });

  return {
    id: 'test:warehouse',
    driver: 'postgres',
    capabilities,
    introspect: vi.fn(async () => snapshot),
    listSchemas: vi.fn(async () => []),
    listTables: vi.fn(async () => []),
    // One canned row shaped like the `orders` table.
    sampleTable: vi.fn(async () => ({
      headers: ['id', 'customer_id'],
      rows: [[1, 10]],
      totalRows: 1,
    })),
    sampleColumn: vi.fn(async () => ({
      values: ['10', '11'],
      nullCount: 0,
      distinctCount: 2,
    })),
  };
}
|
||||
|
||||
/**
 * Test double that executes read-only SQL against an in-memory better-sqlite3
 * database. Tests populate `db` directly and bind `executeReadOnly` onto a
 * connector to supply SQL evidence.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');

  /**
   * Runs `input.sql` and reshapes the row objects into `headers` + positional `rows`.
   *
   * Declared `async` so that a failure inside `prepare`/`all` (for example
   * invalid SQL) surfaces as a rejected promise, matching the declared
   * `Promise` return type, instead of being thrown synchronously at the caller.
   *
   * @param input Query input; only `input.sql` is read.
   * @param _ctx Scan context, unused by this executor.
   * @returns Headers, positional rows, and `totalRows`/`rowCount` both set to the row count.
   */
  async executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    const rows = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    // Headers come from the first row's keys, so an empty result yields no headers.
    const headers = Object.keys(rows[0] ?? {});
    return {
      headers,
      rows: rows.map((row) => headers.map((header) => row[header])),
      totalRows: rows.length,
      rowCount: rows.length,
    };
  }

  /** Closes the underlying in-memory database. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Returns a fresh SQLite-flavoured snapshot with `accounts(id)` and
 * `orders(id, account_id)`. No column is flagged as a primary key and no
 * foreign keys are declared.
 */
function noDeclaredRelationshipSnapshot(): KtxSchemaSnapshot {
  type SnapshotTable = KtxSchemaSnapshot['tables'][number];
  type SnapshotColumn = SnapshotTable['columns'][number];

  // Every column in this fixture is a non-null, non-key INTEGER without a comment.
  const integerColumn = (name: string): SnapshotColumn => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  // Every table is a plain table with no catalog/db qualifier, comment or foreign keys.
  const plainTable = (name: string, estimatedRows: number, columnNames: string[]): SnapshotTable => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns: columnNames.map((columnName) => integerColumn(columnName)),
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [plainTable('accounts', 2, ['id']), plainTable('orders', 3, ['id', 'account_id'])],
  };
}
|
||||
|
||||
/**
 * In-memory enrichment state store for resume tests.
 *
 * Records are keyed by `runId:stage`, so saving a stage again (completed or
 * failed) replaces the previous record for that run and stage.
 */
function memoryEnrichmentStateStore(): KtxScanEnrichmentStateStore {
  type StageRecord = KtxScanEnrichmentCompletedStage | KtxScanEnrichmentFailedStage;

  const recordsByKey = new Map<string, StageRecord>();
  const keyFor = ({ runId, stage }: Pick<KtxScanEnrichmentStageLookup, 'runId' | 'stage'>) => `${runId}:${stage}`;

  return {
    // A stage is reusable only when it completed AND was computed from the same input hash.
    async findCompletedStage<TOutput>(input: KtxScanEnrichmentStageLookup) {
      const stored = recordsByKey.get(keyFor(input));
      if (stored?.status !== 'completed') {
        return null;
      }
      if (stored.inputHash !== input.inputHash) {
        return null;
      }
      return stored as KtxScanEnrichmentCompletedStage<TOutput>;
    },
    async saveCompletedStage(input) {
      const completed: StageRecord = {
        ...input,
        status: 'completed',
        errorMessage: null,
      };
      recordsByKey.set(keyFor(input), completed);
    },
    async saveFailedStage(input) {
      const failed: StageRecord = {
        ...input,
        status: 'failed',
        output: null,
      };
      recordsByKey.set(keyFor(input), failed);
    },
    async listRunStages(runId) {
      return Array.from(recordsByKey.values()).filter((stored) => stored.runId === runId);
    },
  };
}
|
||||
|
||||
describe('local scan enrichment', () => {
|
||||
it('maps a scan snapshot into relationship detector schema', () => {
  const enrichedSchema = snapshotToKtxEnrichedSchema(snapshot);

  expect(enrichedSchema.connectionId).toBe('warehouse');
  expect(enrichedSchema.tables).toHaveLength(2);

  // The second table is `orders`; its columns get schema-qualified ids.
  const ordersTable = enrichedSchema.tables[1];
  expect(ordersTable?.columns.map((column) => column.name)).toEqual(['id', 'customer_id']);
  expect(ordersTable?.columns[1]).toMatchObject({
    id: 'public.orders.customer_id',
    tableId: 'public.orders',
    primaryKey: false,
    sampleValues: null,
    embedding: null,
  });
});
|
||||
|
||||
it('maps snapshot foreign keys into formal schema relationships', () => {
  const baseSnapshot = noDeclaredRelationshipSnapshot();

  // Declare orders.account_id -> accounts.id and flag accounts.id as a primary key.
  const snapshotWithForeignKey = {
    ...baseSnapshot,
    tables: baseSnapshot.tables.map((table) => {
      if (table.name === 'orders') {
        return {
          ...table,
          foreignKeys: [
            {
              fromColumn: 'account_id',
              toCatalog: null,
              toDb: null,
              toTable: 'accounts',
              toColumn: 'id',
              constraintName: 'orders_account_id_fkey',
            },
          ],
        };
      }
      if (table.name === 'accounts') {
        return {
          ...table,
          columns: table.columns.map((column) =>
            column.name === 'id' ? { ...column, primaryKey: true } : column,
          ),
        };
      }
      return table;
    }),
  };

  const enrichedSchema = snapshotToKtxEnrichedSchema(snapshotWithForeignKey);

  expect(enrichedSchema.relationships).toEqual([
    {
      id: 'orders:(orders.account_id)->accounts:(accounts.id)',
      source: 'formal',
      from: {
        tableId: 'orders',
        columnIds: ['orders.account_id'],
        table: { catalog: null, db: null, name: 'orders' },
        columns: ['account_id'],
      },
      to: {
        tableId: 'accounts',
        columnIds: ['accounts.id'],
        table: { catalog: null, db: null, name: 'accounts' },
        columns: ['id'],
      },
      relationshipType: 'many_to_one',
      confidence: 1,
      isPrimaryKeyReference: true,
    },
  ]);
});
|
||||
|
||||
it('uses the supplied snapshot without calling connector.introspect', async () => {
  const mockConnector = connector();
  const introspectMock = vi.mocked(mockConnector.introspect);

  const outcome = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'structural',
    connector: mockConnector,
    snapshot,
    context: { runId: 'scan-run-snapshot' },
    providers: null,
  });

  // The caller-provided snapshot is returned as-is and the connector is never asked for one.
  expect(outcome.snapshot).toEqual(snapshot);
  expect(introspectMock).not.toHaveBeenCalled();
});
|
||||
|
||||
it('falls back to connector.introspect when no snapshot is supplied', async () => {
  const mockConnector = connector();

  // No `snapshot` key in the input: the run has to obtain one from the connector.
  const outcome = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'structural',
    connector: mockConnector,
    context: { runId: 'scan-run-introspect' },
    providers: null,
  });

  expect(outcome.snapshot).toEqual(snapshot);
  expect(mockConnector.introspect).toHaveBeenCalledTimes(1);
});
|
||||
|
||||
it('fails when connector driver and snapshot driver differ', async () => {
  // The shared snapshot is 'postgres'; this connector claims 'mysql'.
  const mysqlConnector: KtxScanConnector = {
    ...connector(),
    driver: 'mysql',
  };

  const pendingRun = runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: mysqlConnector,
    snapshot,
    context: { runId: 'scan-run-driver-mismatch' },
    providers: null,
  });

  await expect(pendingRun).rejects.toThrow(
    'ktx scan connector driver "mysql" does not match snapshot driver "postgres" for connection "warehouse"',
  );
});
|
||||
|
||||
it('runs deterministic relationship detection for relationship scans', async () => {
  const outcome = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-1' },
    providers: null,
  });

  expect(outcome.summary).toMatchObject({
    deterministicRelationships: 'completed',
    llmRelationshipValidation: 'skipped',
    embeddings: 'skipped',
  });
  expect(outcome.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
  expect(outcome.summary.statisticalValidation).toBe('skipped');

  // The mock connector advertises readOnlySql without defining executeReadOnly,
  // which is expected to surface as a recoverable warning.
  const missingExecutorWarning = {
    code: 'relationship_validation_failed',
    message: 'KTX scan connector advertises readOnlySql but does not expose executeReadOnly',
    recoverable: true,
    metadata: { capability: 'readOnlySql' },
  };
  expect(outcome.warnings).toContainEqual(missingExecutorWarning);
});
|
||||
|
||||
it('runs relationship discovery with connector SQL evidence', async () => {
  const sqlite = new InMemorySqliteExecutor();
  try {
    // Every orders.account_id value exists in accounts.id.
    sqlite.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts (id) VALUES (1), (2);
      INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
    `);
    const sqliteConnector = {
      ...connector(),
      driver: 'sqlite' as const,
      capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
      introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
      executeReadOnly: sqlite.executeReadOnly.bind(sqlite),
    };

    const outcome = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'relationships',
      detectRelationships: true,
      connector: sqliteConnector,
      context: { runId: 'scan-run-relationship-discovery' },
      providers: null,
    });

    expect(outcome.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(outcome.summary.statisticalValidation).toBe('completed');
    expect(outcome.relationshipProfile).toMatchObject({ sqlAvailable: true });

    const ordersAccountId = expect.objectContaining({
      table: expect.objectContaining({ name: 'orders' }),
      columns: ['account_id'],
    });
    const accountsId = expect.objectContaining({
      table: expect.objectContaining({ name: 'accounts' }),
      columns: ['id'],
    });
    expect(outcome.resolvedRelationships).toEqual([
      expect.objectContaining({
        status: 'accepted',
        from: ordersAccountId,
        to: accountsId,
      }),
    ]);
    expect(outcome.relationshipUpdate?.accepted).toHaveLength(1);
  } finally {
    // Always release the in-memory database, even when an assertion fails.
    sqlite.close();
  }
});
|
||||
|
||||
it('honors scan relationship config when LLM proposals are disabled', async () => {
  const deterministicProviders = createDeterministicLocalScanEnrichmentProviders();
  // Stand-in for the LLM entry point so any call to it can be detected.
  const generateObjectMock = vi.fn();

  const outcome = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-llm-disabled' },
    providers: {
      ...deterministicProviders,
      llmRuntime: {
        ...deterministicProviders.llmRuntime,
        generateObject: generateObjectMock as never,
      },
    },
    relationshipSettings: {
      ...buildDefaultKtxProjectConfig().scan.relationships,
      llmProposals: false,
      maxLlmTablesPerBatch: 40,
    },
  });

  expect(outcome.summary.llmRelationshipValidation).toBe('skipped');
  expect(generateObjectMock).not.toHaveBeenCalled();
});
|
||||
|
||||
it('skips relationship detection when scan relationships are disabled', async () => {
  const relationshipsDisabled = {
    ...buildDefaultKtxProjectConfig().scan.relationships,
    enabled: false,
  };

  const outcome = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    connector: connector(),
    context: { runId: 'disabled-relationships' },
    providers: createDeterministicLocalScanEnrichmentProviders(),
    relationshipSettings: relationshipsDisabled,
  });

  // Every relationship-related stage is reported as skipped...
  const { summary } = outcome;
  expect(summary.deterministicRelationships).toBe('skipped');
  expect(summary.statisticalValidation).toBe('skipped');
  expect(summary.llmRelationshipValidation).toBe('skipped');

  // ...and no relationship output of any kind is produced.
  expect(outcome.relationships).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
  expect(outcome.relationshipUpdate).toBeNull();
  expect(outcome.relationshipProfile).toBeNull();
  expect(outcome.resolvedRelationships).toBeNull();
});
|
||||
|
||||
it('forwards context.logger and emits warnings when sampleTable fails repeatedly', async () => {
  // Every table sample attempt throws.
  const alwaysFailingSampleTable = vi.fn(async () => {
    throw new Error('pool: ECONNRESET');
  });
  const failingConnector: KtxScanConnector = {
    ...connector(),
    sampleTable: alwaysFailingSampleTable,
  };
  const logger = {
    debug: vi.fn(),
    info: vi.fn(),
    warn: vi.fn(),
    error: vi.fn(),
  };

  const outcome = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: failingConnector,
    context: { runId: 'scan-run-warnings', logger },
    providers: createDeterministicLocalScanEnrichmentProviders(),
  });

  const warningCodes = outcome.warnings.map((warning) => warning.code);
  expect(warningCodes).toContain('sampling_failed');
  expect(warningCodes).toContain('description_fallback_used');
  expect(outcome.warnings.some((warning) => warning.table === 'customers')).toBe(true);
  expect(logger.warn).toHaveBeenCalled();
  expect(logger.error).toHaveBeenCalled();
  // Two tables, each yielding sampling_failed + description_fallback_used: at least 4 warnings.
  expect(outcome.warnings.length).toBeGreaterThanOrEqual(4);
  // Three sampling attempts per table across two tables: 6 calls.
  expect(failingConnector.sampleTable).toHaveBeenCalledTimes(6);
});
|
||||
|
||||
it('runs configured deterministic enrichment with descriptions and no embeddings', async () => {
  // The deterministic providers are passed unmodified (no embedding port is added here).
  const outcome = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-2' },
    providers: createDeterministicLocalScanEnrichmentProviders(),
  });

  const expectedStageStatuses = {
    dataDictionary: 'completed',
    tableDescriptions: 'completed',
    columnDescriptions: 'completed',
    embeddings: 'skipped',
    deterministicRelationships: 'completed',
  };
  expect(outcome.summary).toMatchObject(expectedStageStatuses);
  expect(outcome.embeddingUpdates).toEqual([]);
  expect(outcome.snapshot).toEqual(snapshot);
  expect(outcome.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
});
|
||||
|
||||
it('generates batched table descriptions with bounded table-level concurrency', async () => {
  // Eight single-column tables, so there is more work than the expected peak of 4.
  const eightTableSnapshot: KtxSchemaSnapshot = {
    ...snapshot,
    tables: Array.from({ length: 8 }, (_, index) => ({
      catalog: null,
      db: 'public',
      name: `table_${index + 1}`,
      kind: 'table' as const,
      comment: null,
      estimatedRows: 2,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number' as const,
          nullable: false,
          primaryKey: true,
          comment: null,
        },
      ],
    })),
  };

  // Track how many sampleTable calls overlap in time.
  let inFlightSamples = 0;
  let peakInFlightSamples = 0;
  const trackingConnector = {
    ...connector(),
    introspect: vi.fn(async () => eightTableSnapshot),
    sampleColumn: vi.fn(async () => ({
      values: ['1'],
      nullCount: 0,
      distinctCount: 1,
    })),
    sampleTable: vi.fn(async () => {
      inFlightSamples += 1;
      peakInFlightSamples = Math.max(peakInFlightSamples, inFlightSamples);
      // Hold the slot briefly so concurrent calls actually overlap.
      await new Promise((resolve) => setTimeout(resolve, 10));
      inFlightSamples -= 1;
      return {
        headers: ['id'],
        rows: [[1]],
        totalRows: 1,
      };
    }),
  };
  const relationshipsDisabled = {
    ...buildDefaultKtxProjectConfig().scan.relationships,
    enabled: false,
  };

  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    connector: trackingConnector,
    context: { runId: 'scan-run-concurrent-descriptions' },
    providers: createDeterministicLocalScanEnrichmentProviders(),
    relationshipSettings: relationshipsDisabled,
  });

  expect(peakInFlightSamples).toBe(4);
  expect(trackingConnector.sampleColumn).not.toHaveBeenCalled();
});
|
||||
|
||||
it('reports enrichment progress for countable stages', async () => {
  // Recording progress port: every update is captured, and sub-phases reuse the same recorder.
  const recorded: Array<{ progress: number; message?: string; transient?: boolean }> = [];
  const progress = {
    async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
      recorded.push({ progress: progressValue, message, transient: options?.transient });
    },
    startPhase() {
      return progress;
    },
  };

  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-progress', progress },
    providers: {
      ...createDeterministicLocalScanEnrichmentProviders(),
      embedding: fakeScanEmbedding({ dimensions: 6 }),
    },
  });

  const expectedEvents = [
    expect.objectContaining({ message: 'Generating descriptions 1/2 tables', transient: true }),
    expect.objectContaining({ message: 'Generating descriptions 2/2 tables', transient: true }),
    expect.objectContaining({ message: 'Building embeddings 1/1 batches', transient: true }),
    expect.objectContaining({ message: 'Detecting relationships' }),
  ];
  expect(recorded).toEqual(expect.arrayContaining(expectedEvents));
});
|
||||
|
||||
it('reports progress before enrichment connector introspection starts', async () => {
  const recorded: Array<{ progress: number; message?: string; transient?: boolean }> = [];
  const progress = {
    async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
      recorded.push({ progress: progressValue, message, transient: options?.transient });
    },
    startPhase() {
      return progress;
    },
  };

  // The ordering check runs inside introspect: by the time the connector is asked
  // for a snapshot, the "loading" progress message must already be recorded.
  const introspectAfterProgress = vi.fn(async () => {
    expect(recorded).toContainEqual(expect.objectContaining({ message: 'Loading enrichment schema snapshot' }));
    return snapshot;
  });
  const orderCheckingConnector = {
    ...connector(),
    introspect: introspectAfterProgress,
  };

  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: orderCheckingConnector,
    context: { runId: 'scan-run-progress-before-introspection', progress },
    providers: null,
  });

  // Guards against the in-mock assertion never executing.
  expect(orderCheckingConnector.introspect).toHaveBeenCalled();
});
|
||||
|
||||
it('splits enrichment embedding requests by provider batch size', async () => {
  // One table with five columns.
  const fiveColumnSnapshot: KtxSchemaSnapshot = {
    ...snapshot,
    tables: [
      {
        catalog: null,
        db: 'public',
        name: 'wide_orders',
        kind: 'table',
        comment: 'Wide order facts',
        estimatedRows: 3,
        foreignKeys: [],
        columns: Array.from({ length: 5 }, (_, index) => ({
          name: `metric_${index + 1}`,
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number' as const,
          nullable: false,
          primaryKey: false,
          comment: `Metric ${index + 1}`,
        })),
      },
    ],
  };
  const wideConnector = {
    ...connector(),
    introspect: vi.fn(async () => fiveColumnSnapshot),
  };
  const { llmRuntime } = createDeterministicLocalScanEnrichmentProviders();

  // Embedding provider that rejects any batch larger than its advertised limit of 2.
  const embedBatch = vi.fn(async (texts: string[]) => {
    if (texts.length > 2) {
      throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`);
    }
    return texts.map((_, index) => [index, index + 1, index + 2]);
  });

  const outcome = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: wideConnector,
    context: { runId: 'scan-run-batched-embeddings' },
    providers: {
      llmRuntime,
      embedding: {
        dimensions: 3,
        maxBatchSize: 2,
        embedBatch,
      },
    },
  });

  expect(outcome.embeddingUpdates).toHaveLength(5);
  // Five texts with a limit of 2 must arrive as batches of 2, 2 and 1.
  const batchSizes = embedBatch.mock.calls.map(([texts]) => texts.length);
  expect(batchSizes).toEqual([2, 2, 1]);
});
|
||||
|
||||
it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
  const stateStore = memoryEnrichmentStateStore();
  const sharedConnector = connector();
  const providers = {
    ...createDeterministicLocalScanEnrichmentProviders(),
    embedding: fakeScanEmbedding({ dimensions: 6 }),
  };

  const firstRun = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: true,
    connector: sharedConnector,
    context: { runId: 'scan-run-resume-1' },
    providers,
    stateStore,
    syncId: 'sync-resume-1',
    providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
  });

  // Spies are installed only after the first run, so they observe the second run alone.
  const generateObjectSpy = vi.spyOn(providers.llmRuntime, 'generateObject');
  const embedBatchSpy = vi.spyOn(providers.embedding, 'embedBatch');

  // Same run id, sync id, provider identity and state store as the first run.
  const secondRun = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: true,
    connector: sharedConnector,
    context: { runId: 'scan-run-resume-1' },
    providers,
    stateStore,
    syncId: 'sync-resume-1',
    providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
  });

  const allStages = ['descriptions', 'embeddings', 'relationships'];
  expect(firstRun.state.completedStages).toEqual(allStages);
  expect(firstRun.state.resumedStages).toEqual([]);
  expect(secondRun.state.resumedStages).toEqual(allStages);
  expect(secondRun.state.completedStages).toEqual(allStages);
  expect(generateObjectSpy).not.toHaveBeenCalled();
  expect(embedBatchSpy).not.toHaveBeenCalled();
  expect(secondRun.descriptionUpdates).toEqual(firstRun.descriptionUpdates);
  expect(secondRun.embeddingUpdates).toEqual(firstRun.embeddingUpdates);
  expect(secondRun.relationships).toEqual(firstRun.relationships);
});
|
||||
|
||||
it('does not reuse completed stages when the snapshot changes', async () => {
  const stateStore = memoryEnrichmentStateStore();
  const providers = {
    ...createDeterministicLocalScanEnrichmentProviders(),
    embedding: fakeScanEmbedding({ dimensions: 6 }),
  };
  const originalConnector = connector();

  // First run populates the state store from the full two-table snapshot.
  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: originalConnector,
    context: { runId: 'scan-run-resume-hash' },
    providers,
    stateStore,
    syncId: 'sync-resume-hash',
    providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
  });

  const [customersTable] = snapshot.tables;
  if (!customersTable) {
    throw new Error('Expected test snapshot table');
  }
  // Second run sees a different snapshot: only the first table is returned.
  const changedConnector = {
    ...connector(),
    introspect: vi.fn(async () => ({
      ...snapshot,
      tables: [{ ...customersTable, name: 'customers' }],
    })),
  };
  const generateObjectSpy = vi.spyOn(providers.llmRuntime, 'generateObject');

  const rerun = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: changedConnector,
    context: { runId: 'scan-run-resume-hash' },
    providers,
    stateStore,
    syncId: 'sync-resume-hash',
    providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
  });

  // Nothing is resumed; all stages are completed again and the LLM runtime is invoked.
  expect(rerun.state.resumedStages).toEqual([]);
  expect(rerun.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
  expect(generateObjectSpy).toHaveBeenCalled();
});
|
||||
|
||||
// With `providers: null` the run is expected to skip descriptions and embeddings,
// still resolve the orders.account_id -> accounts.id relationship against an
// in-memory SQLite database, and emit a recoverable "backend not configured" warning.
it('runs providerless enriched scans as relationship-only discovery enrichment', async () => {
  const expectedSummary = {
    dataDictionary: 'skipped',
    tableDescriptions: 'skipped',
    columnDescriptions: 'skipped',
    embeddings: 'skipped',
    deterministicRelationships: 'completed',
    llmRelationshipValidation: 'skipped',
    statisticalValidation: 'completed',
  };
  const backendNotConfiguredWarning = {
    code: 'scan_enrichment_backend_not_configured',
    message:
      'Skipping description and embedding enrichment because scan.enrichment.mode is not configured; relationship discovery still ran.',
    recoverable: true,
    metadata: {
      skippedStages: ['descriptions', 'embeddings'],
      relationshipDetection: true,
    },
  };
  // Matcher for one side of a resolved relationship: a table name plus a single column.
  const endpoint = (tableName: string, column: string) =>
    expect.objectContaining({
      table: expect.objectContaining({ name: tableName }),
      columns: [column],
    });

  const sqlite = new InMemorySqliteExecutor();
  try {
    sqlite.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts (id) VALUES (1), (2);
      INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
    `);

    // Base test connector, overridden to behave as a SQL-capable SQLite source
    // whose introspection result carries no declared relationships.
    const scanConnector = {
      ...connector(),
      driver: 'sqlite' as const,
      capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
      introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
      executeReadOnly: sqlite.executeReadOnly.bind(sqlite),
    };

    const result = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: false,
      connector: scanConnector,
      context: { runId: 'scan-run-providerless-enriched' },
      providers: null,
    });

    expect(result.summary).toEqual(expectedSummary);
    expect(result.descriptionUpdates).toEqual([]);
    expect(result.embeddingUpdates).toEqual([]);
    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.relationshipUpdate?.accepted).toHaveLength(1);
    expect(result.relationshipProfile).toMatchObject({ sqlAvailable: true });
    expect(result.resolvedRelationships).toEqual([
      expect.objectContaining({
        status: 'accepted',
        from: endpoint('orders', 'account_id'),
        to: endpoint('accounts', 'id'),
      }),
    ]);
    expect(result.warnings).toContainEqual(backendNotConfiguredWarning);
  } finally {
    // Always release the in-memory database, even when an assertion throws.
    sqlite.close();
  }
});
|
||||
|
||||
});
|
||||
1965
packages/cli/test/context/scan/local-scan.test.ts
Normal file
1965
packages/cli/test/context/scan/local-scan.test.ts
Normal file
File diff suppressed because it is too large
Load diff
|
|
@@ -0,0 +1,278 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js';
|
||||
import { readLocalScanStructuralSnapshot } from '../../../src/context/scan/local-structural-artifacts.js';
|
||||
|
||||
/**
 * Tests for readLocalScanStructuralSnapshot. Each test seeds raw live-database
 * artifacts into a freshly initialised project inside a temporary directory and
 * checks the snapshot rebuilt from them.
 */
describe('readLocalScanStructuralSnapshot', () => {
  const FALLBACK_EXTRACTED_AT = '2026-04-29T13:00:00.000Z';
  // connection.json payload that carries neither extractedAt nor any metadata entries.
  const BARE_CONNECTION_JSON = '{"connectionId":"warehouse","metadata":{}}\n';
  // Single-column orders table with no schema ("db": null) and a primary-key id.
  const SCHEMALESS_ORDERS_JSON =
    '{"name":"orders","catalog":null,"db":null,"kind":"table","comment":null,"estimatedRows":null,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":true,"comment":null}],"foreignKeys":[]}\n';

  let tempDir: string;
  let project: KtxLocalProject;

  /** Writes one raw artifact through the project's file store using the fixed test author. */
  const seed = (path: string, content: string, message: string) =>
    project.fileStore.writeFile(path, content, 'ktx', 'ktx@example.com', message);

  /** Serialises a value as 2-space-indented JSON followed by a trailing newline. */
  const prettyJson = (value: unknown): string => `${JSON.stringify(value, null, 2)}\n`;

  /** Rebuilds the snapshot for the `warehouse` connection from the given raw directory. */
  const rebuild = (rawSourcesDir: string, driver: 'sqlite' | 'postgres') =>
    readLocalScanStructuralSnapshot({
      project,
      connectionId: 'warehouse',
      driver,
      rawSourcesDir,
      extractedAtFallback: FALLBACK_EXTRACTED_AT,
    });

  /** Describes a non-nullable INTEGER column in the persisted raw-table format. */
  const integerColumn = (name: string, primaryKey: boolean, comment: string | null) => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey,
    comment,
  });

  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'ktx-local-structural-artifacts-'));
    project = await initKtxProject({ projectDir: join(tempDir, 'project') });
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('rebuilds a canonical snapshot from persisted live-database raw files', async () => {
    const rawRoot = 'raw-sources/warehouse/live-database/sync-1';

    const customerIdColumn = integerColumn('id', true, 'Customer id');
    const ordersToCustomers = {
      fromColumn: 'customer_id',
      toCatalog: null,
      toDb: 'public',
      toTable: 'customers',
      toColumn: 'id',
      constraintName: null,
    };

    const connectionArtifact = {
      connectionId: 'warehouse',
      extractedAt: '2026-04-29T12:00:00.000Z',
      metadata: { source: 'sqlite-smoke' },
      tableCount: 2,
    };
    const customersArtifact = {
      name: 'customers',
      catalog: null,
      db: 'public',
      kind: 'table',
      comment: 'Customer table',
      estimatedRows: 12,
      columns: [customerIdColumn],
      foreignKeys: [],
    };
    const ordersArtifact = {
      name: 'orders',
      catalog: null,
      db: 'public',
      kind: 'table',
      comment: null,
      estimatedRows: 20,
      columns: [integerColumn('id', true, null), integerColumn('customer_id', false, null)],
      foreignKeys: [ordersToCustomers],
    };

    // Seed sequentially, in the same order every time: connection, customers, orders.
    const artifacts: Array<[path: string, payload: unknown, message: string]> = [
      [`${rawRoot}/connection.json`, connectionArtifact, 'Seed connection artifact'],
      [`${rawRoot}/tables/customers.json`, customersArtifact, 'Seed customers artifact'],
      [`${rawRoot}/tables/orders.json`, ordersArtifact, 'Seed orders artifact'],
    ];
    for (const [path, payload, message] of artifacts) {
      await seed(path, prettyJson(payload), message);
    }

    const snapshot = await rebuild(rawRoot, 'sqlite');

    expect(snapshot).toMatchObject({
      connectionId: 'warehouse',
      driver: 'sqlite',
      extractedAt: '2026-04-29T12:00:00.000Z',
      metadata: { source: 'sqlite-smoke' },
      tables: [
        {
          db: 'public',
          name: 'customers',
          comment: 'Customer table',
          columns: [customerIdColumn],
        },
        {
          db: 'public',
          name: 'orders',
          foreignKeys: [ordersToCustomers],
        },
      ],
    });
  });

  it('rebuilds scan warnings from persisted live-database warning files', async () => {
    const rawRoot = 'raw-sources/warehouse/live-database/sync-warnings';
    const unauthorizedWarning = {
      code: 'constraint_discovery_unauthorized',
      message: 'Skipped foreign-key discovery in public (insufficient grants on system catalogs)',
      recoverable: true,
      metadata: { schema: 'public', kind: 'foreign_key' },
    };

    await seed(`${rawRoot}/connection.json`, BARE_CONNECTION_JSON, 'Seed connection artifact');
    await seed(
      `${rawRoot}/warnings.json`,
      prettyJson({ warnings: [unauthorizedWarning] }),
      'Seed warning artifact',
    );
    await seed(
      `${rawRoot}/tables/orders.json`,
      '{"name":"orders","catalog":null,"db":"public","kind":"table","comment":null,"estimatedRows":null,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":false,"comment":null}],"foreignKeys":[]}\n',
      'Seed orders artifact',
    );

    const snapshot = await rebuild(rawRoot, 'postgres');

    expect(snapshot.warnings).toEqual([unauthorizedWarning]);
  });

  it('uses the scan report timestamp when connection.json omits extractedAt', async () => {
    const rawRoot = 'raw-sources/warehouse/live-database/sync-2';

    await seed(
      `${rawRoot}/connection.json`,
      BARE_CONNECTION_JSON,
      'Seed connection artifact without extractedAt',
    );
    await seed(`${rawRoot}/tables/orders.json`, SCHEMALESS_ORDERS_JSON, 'Seed orders artifact');

    const snapshot = await rebuild(rawRoot, 'postgres');

    expect(snapshot.extractedAt).toBe('2026-04-29T13:00:00.000Z');
  });

  it('tolerates older live-database staged directories without warnings.json', async () => {
    const rawRoot = 'raw-sources/warehouse/live-database/sync-no-warnings';

    await seed(`${rawRoot}/connection.json`, BARE_CONNECTION_JSON, 'Seed connection artifact');
    await seed(`${rawRoot}/tables/orders.json`, SCHEMALESS_ORDERS_JSON, 'Seed orders artifact');

    const snapshot = await rebuild(rawRoot, 'postgres');

    expect(snapshot.warnings).toEqual([]);
  });
});
|
||||
|
|
@@ -0,0 +1,451 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
buildKtxRelationshipBenchmarkReport,
|
||||
formatKtxRelationshipBenchmarkReportMarkdown,
|
||||
} from '../../../src/context/scan/relationship-benchmark-report.js';
|
||||
import type {
|
||||
KtxRelationshipBenchmarkCaseResult,
|
||||
KtxRelationshipBenchmarkFixture,
|
||||
KtxRelationshipBenchmarkSuiteResult,
|
||||
} from '../../../src/context/scan/relationship-benchmarks.js';
|
||||
|
||||
/**
 * Overrides accepted by `caseResult`: every top-level field of a benchmark case
 * result is optional, and `metrics` may itself be supplied only in part.
 */
type CaseResultOverrides = Partial<Omit<KtxRelationshipBenchmarkCaseResult, 'metrics'>> & {
  metrics?: Partial<KtxRelationshipBenchmarkCaseResult['metrics']>;
};
|
||||
|
||||
/**
 * Builds a benchmark case result for tests.
 *
 * Any field missing from `overrides` falls back to the demo B2B defaults below.
 * `overrides.metrics` is merged key-by-key over the default metrics, so a test
 * only has to list the metrics it cares about.
 *
 * @param overrides - Partial case result; `metrics` may be partial as well.
 * @returns A fully populated case result.
 */
function caseResult(overrides: CaseResultOverrides = {}): KtxRelationshipBenchmarkCaseResult {
  const defaultMetrics = {
    pkPrecision: 1,
    pkRecall: 0.5,
    pkF1: 0.6666666666666666,
    fkPrecision: 1,
    fkRecall: 1,
    fkF1: 1,
    acceptedFalsePositiveCount: 0,
    reviewRecall: 0,
    acceptedOrReviewRecall: 1,
    runtimeSeconds: 0.012345,
    sqlQueries: 14,
    llmCalls: 0,
  };
  const accountLink = 'users.(account_id)->accounts.(id)';

  const fixtureId = overrides.fixtureId ?? 'demo_b2b_no_declared_constraints';
  const mode = overrides.mode ?? 'declared_pks_and_declared_fks_removed';
  // Later spread wins; spreading an absent `metrics` contributes nothing.
  const metrics = { ...defaultMetrics, ...overrides.metrics };
  const expected = overrides.expected ?? {
    pk: ['accounts.(id)', 'users.(id)'],
    fk: [accountLink],
  };
  const predicted = overrides.predicted ?? {
    pk: ['accounts.(id)'],
    fk: [accountLink],
    acceptedFk: [accountLink],
    reviewFk: [],
  };
  const falsePositives = overrides.falsePositives ?? { pk: [], fk: [] };
  const falseNegatives = overrides.falseNegatives ?? { pk: ['users.(id)'], fk: [] };
  const skippedComposite = overrides.skippedComposite ?? { pk: [], fk: [] };
  const validationBlocked = overrides.validationBlocked ?? false;

  return {
    fixtureId,
    mode,
    metrics,
    expected,
    predicted,
    falsePositives,
    falseNegatives,
    skippedComposite,
    validationBlocked,
  };
}
|
||||
|
||||
/**
 * Builds a benchmark fixture for tests, defaulting to an empty SQLite demo snapshot.
 *
 * `thresholdEligible` and `validationBudget` have no default: they are copied
 * from `overrides` as-is and are therefore `undefined` unless the caller sets them.
 *
 * @param overrides - Fields that replace the defaults.
 * @returns A fully populated fixture.
 */
function fixture(overrides: Partial<KtxRelationshipBenchmarkFixture> = {}): KtxRelationshipBenchmarkFixture {
  const { thresholdEligible, validationBudget } = overrides;

  const snapshot = overrides.snapshot ?? {
    connectionId: 'demo_b2b',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [],
  };

  return {
    id: overrides.id ?? 'demo_b2b_no_declared_constraints',
    name: overrides.name ?? 'Packaged B2B demo with declared PK and FK metadata masked',
    tier: overrides.tier ?? 'smoke',
    origin: overrides.origin ?? 'synthetic',
    thresholdEligible,
    validationBudget,
    snapshot,
    expected: overrides.expected ?? { expectedPks: [], expectedLinks: [] },
    defaultModes: overrides.defaultModes ?? ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
    dataPath: overrides.dataPath ?? '/tmp/demo.sqlite',
    columnEmbeddings: overrides.columnEmbeddings ?? {},
  };
}
|
||||
|
||||
/**
 * Tests for buildKtxRelationshipBenchmarkReport and its Markdown formatter.
 * Suites are assembled by hand from `caseResult()` / `fixture()` so each test
 * controls exactly which cases, aggregates and fixture attributes the report sees.
 */
describe('relationship benchmark report', () => {
  // The mode every fixture in these tests runs by default.
  const HEADLINE_MODE = 'declared_pks_and_declared_fks_removed';

  type ReportInput = Parameters<typeof buildKtxRelationshipBenchmarkReport>[0];

  /** Builds the report and renders it to Markdown in one step. */
  const renderMarkdown = (input: ReportInput) =>
    formatKtxRelationshipBenchmarkReportMarkdown(buildKtxRelationshipBenchmarkReport(input));

  /** Asserts, in order, that every fragment occurs in the rendered Markdown. */
  const expectMarkdownToContain = (markdown: string, fragments: readonly string[]): void => {
    for (const fragment of fragments) {
      expect(markdown).toContain(fragment);
    }
  };

  it('classifies run, validation-blocked, and not-run benchmark cases', () => {
    const runCase = caseResult();
    const blockedCase = caseResult({
      mode: 'validation_disabled',
      validationBlocked: true,
      metrics: { fkRecall: 0, acceptedOrReviewRecall: 1, sqlQueries: 0 },
      predicted: {
        pk: ['accounts.(id)'],
        fk: ['users.(account_id)->accounts.(id)'],
        acceptedFk: [],
        reviewFk: ['users.(account_id)->accounts.(id)'],
      },
    });
    const suite: KtxRelationshipBenchmarkSuiteResult = {
      cases: [runCase, blockedCase],
      validationBlockedCases: ['demo_b2b_no_declared_constraints:validation_disabled'],
      aggregate: {
        caseCount: 2,
        headlineCaseCount: 1,
        headlinePkRecall: 0.5,
        headlineFkRecall: 1,
        headlineAcceptedOrReviewRecall: 1,
        meanPkRecall: 0.5,
        meanFkRecall: 0.5,
        meanAcceptedOrReviewRecall: 1,
      },
    };

    // 'profiling_disabled' is requested but is not among the fixture's default modes.
    const report = buildKtxRelationshipBenchmarkReport({
      fixtures: [fixture()],
      suite,
      modes: [HEADLINE_MODE, 'validation_disabled', 'profiling_disabled'],
    });

    expect(report.headline).toEqual({
      caseCount: 2,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 1,
      headlineAcceptedOrReviewRecall: 1,
      acceptedFalsePositiveCount: 0,
      validationBlockedCount: 1,
    });
    const statuses = report.cases.map((item) => `${item.fixtureId}:${item.mode}:${item.status}`);
    expect(statuses).toEqual([
      'demo_b2b_no_declared_constraints:declared_pks_and_declared_fks_removed:run',
      'demo_b2b_no_declared_constraints:validation_disabled:validation_blocked',
      'demo_b2b_no_declared_constraints:profiling_disabled:not_run',
    ]);
    expect(report.cases[2]?.reason).toBe('mode not selected by fixture defaultModes');
  });

  it('surfaces validation budget review candidates in the report reason', () => {
    const acceptedLink = 'fact_activity_000.(entity_00_key)->dim_entity_00.(entity_00_key)';
    const reviewLink = 'fact_activity_001.(entity_00_key)->dim_entity_00.(entity_00_key)';
    const suite: KtxRelationshipBenchmarkSuiteResult = {
      cases: [
        caseResult({
          fixtureId: 'scale_stress_no_declared_constraints',
          metrics: { fkRecall: 0.5, acceptedOrReviewRecall: 1 },
          predicted: {
            pk: ['dim_entity_00.(entity_00_key)'],
            fk: [acceptedLink, reviewLink],
            acceptedFk: [acceptedLink],
            reviewFk: [reviewLink],
          },
        }),
      ],
      validationBlockedCases: [],
      aggregate: {
        caseCount: 1,
        headlineCaseCount: 0,
        headlinePkRecall: 1,
        headlineFkRecall: 0.5,
        headlineAcceptedOrReviewRecall: 1,
        meanPkRecall: 1,
        meanFkRecall: 0.5,
        meanAcceptedOrReviewRecall: 1,
      },
    };

    const report = buildKtxRelationshipBenchmarkReport({
      fixtures: [
        fixture({
          id: 'scale_stress_no_declared_constraints',
          name: 'Scale stress fixture',
          tier: 'row_bearing',
          validationBudget: 800,
          defaultModes: [HEADLINE_MODE],
        }),
      ],
      suite,
      modes: [HEADLINE_MODE],
    });

    expect(report.cases[0]?.reason).toBe('review candidate validation reasons: validation_unattempted (1)');
    expect(formatKtxRelationshipBenchmarkReportMarkdown(report)).toContain('validation_unattempted');
  });

  it('uses benchmark suite eligibility for product and smoke report rows', () => {
    const productCase = caseResult({ fixtureId: 'product_curated' });
    const productBlocked = caseResult({
      fixtureId: 'product_curated',
      mode: 'validation_disabled',
      validationBlocked: true,
      metrics: { fkRecall: 0, acceptedOrReviewRecall: 1, sqlQueries: 0 },
    });
    const smokeCase = caseResult({ fixtureId: 'smoke_even_if_marked' });
    const suite: KtxRelationshipBenchmarkSuiteResult = {
      cases: [productCase, productBlocked, smokeCase],
      validationBlockedCases: ['product_curated:validation_disabled'],
      aggregate: {
        caseCount: 3,
        headlineCaseCount: 1,
        headlinePkRecall: 0.5,
        headlineFkRecall: 1,
        headlineAcceptedOrReviewRecall: 1,
        meanPkRecall: 0.5,
        meanFkRecall: 0.6666666666666666,
        meanAcceptedOrReviewRecall: 1,
      },
    };

    // Both fixtures are marked thresholdEligible; only their tiers differ.
    const report = buildKtxRelationshipBenchmarkReport({
      fixtures: [
        fixture({
          id: 'product_curated',
          name: 'Curated product fixture',
          tier: 'product',
          thresholdEligible: true,
          defaultModes: [HEADLINE_MODE, 'validation_disabled'],
        }),
        fixture({
          id: 'smoke_even_if_marked',
          name: 'Marked smoke fixture',
          tier: 'smoke',
          thresholdEligible: true,
          defaultModes: [HEADLINE_MODE],
        }),
      ],
      suite,
      modes: [HEADLINE_MODE, 'validation_disabled'],
    });

    const eligibility = report.cases.map((item) => `${item.fixtureId}:${item.mode}:${item.tuningEligible}`);
    expect(eligibility).toEqual([
      'product_curated:declared_pks_and_declared_fks_removed:true',
      'product_curated:validation_disabled:false',
      'smoke_even_if_marked:declared_pks_and_declared_fks_removed:false',
      'smoke_even_if_marked:validation_disabled:false',
    ]);
    expect(formatKtxRelationshipBenchmarkReportMarkdown(report)).toContain(
      '| product_curated | product | declared_pks_and_declared_fks_removed | run | yes |',
    );
  });

  it('formats a compact Markdown report with false negatives and blocked modes', () => {
    const suite: KtxRelationshipBenchmarkSuiteResult = {
      cases: [
        caseResult({
          metrics: { fkRecall: 0, acceptedOrReviewRecall: 0 },
          falseNegatives: { pk: ['users.(id)'], fk: ['users.(account_id)->accounts.(id)'] },
        }),
      ],
      validationBlockedCases: [],
      aggregate: {
        caseCount: 1,
        headlineCaseCount: 1,
        headlinePkRecall: 0.5,
        headlineFkRecall: 0,
        headlineAcceptedOrReviewRecall: 0,
        meanPkRecall: 0.5,
        meanFkRecall: 0,
        meanAcceptedOrReviewRecall: 0,
      },
    };

    const markdown = renderMarkdown({
      fixtures: [fixture()],
      suite,
      modes: [HEADLINE_MODE],
    });

    expectMarkdownToContain(markdown, [
      '# KTX Relationship Discovery Benchmark Evidence',
      '| demo_b2b_no_declared_constraints | smoke | declared_pks_and_declared_fks_removed | run | no | 0.500 | 0.000 | 0.000 | 0 |',
      '- `demo_b2b_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: users.(id)',
      '- `demo_b2b_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: users.(account_id)->accounts.(id)',
    ]);
  });

  it('keeps headline failures separate from non-headline failure details', () => {
    const cleanHeadlineCase = caseResult({
      fixtureId: 'product_curated',
      falseNegatives: { pk: [], fk: [] },
      metrics: { pkRecall: 1, fkRecall: 1, acceptedOrReviewRecall: 1 },
    });
    const failingAblationCase = caseResult({
      fixtureId: 'product_curated',
      mode: 'embeddings_disabled',
      falseNegatives: {
        pk: ['customers.(id)'],
        fk: ['orders.(buyer_ref)->customers.(id)'],
      },
      metrics: { pkRecall: 0.5, fkRecall: 0, acceptedOrReviewRecall: 0 },
    });
    const suite: KtxRelationshipBenchmarkSuiteResult = {
      cases: [cleanHeadlineCase, failingAblationCase],
      validationBlockedCases: [],
      aggregate: {
        caseCount: 2,
        headlineCaseCount: 1,
        headlinePkRecall: 1,
        headlineFkRecall: 1,
        headlineAcceptedOrReviewRecall: 1,
        meanPkRecall: 0.75,
        meanFkRecall: 0.5,
        meanAcceptedOrReviewRecall: 0.5,
      },
    };

    const markdown = renderMarkdown({
      fixtures: [
        fixture({
          id: 'product_curated',
          name: 'Curated product fixture',
          tier: 'product',
          thresholdEligible: true,
          defaultModes: [HEADLINE_MODE, 'embeddings_disabled'],
        }),
      ],
      suite,
      modes: [HEADLINE_MODE, 'embeddings_disabled'],
    });

    expectMarkdownToContain(markdown, [
      '## Failure Details',
      '### Headline False Negative FKs\n\n- none',
      '- `product_curated` / `embeddings_disabled` / `run`: orders.(buyer_ref)->customers.(id)',
      '- `product_curated` / `embeddings_disabled` / `run`: customers.(id)',
    ]);
  });

  it('formats headline failure context from remaining headline false negatives', () => {
    const suite: KtxRelationshipBenchmarkSuiteResult = {
      cases: [
        caseResult({
          fixtureId: 'public_headline_fixture',
          metrics: { pkRecall: 0.5, fkRecall: 0, acceptedOrReviewRecall: 0 },
          falseNegatives: {
            pk: ['parent_table.(opaque_key)'],
            fk: ['child_table.(parent_table_id)->parent_table.(opaque_key)'],
          },
        }),
      ],
      validationBlockedCases: [],
      aggregate: {
        caseCount: 1,
        headlineCaseCount: 1,
        headlinePkRecall: 0.5,
        headlineFkRecall: 0,
        headlineAcceptedOrReviewRecall: 0,
        meanPkRecall: 0.5,
        meanFkRecall: 0,
        meanAcceptedOrReviewRecall: 0,
      },
    };

    const markdown = renderMarkdown({
      fixtures: [
        fixture({
          id: 'public_headline_fixture',
          name: 'Public headline fixture',
          tier: 'row_bearing',
          thresholdEligible: true,
          defaultModes: [HEADLINE_MODE],
        }),
      ],
      suite,
      modes: [HEADLINE_MODE],
    });

    expectMarkdownToContain(markdown, [
      '## Headline Failure Context',
      '- Remaining headline false-negative PKs: 1',
      '- Remaining headline false-negative FKs: 1',
      '- `public_headline_fixture` / `declared_pks_and_declared_fks_removed` / `run`: parent_table.(opaque_key)',
      '- `public_headline_fixture` / `declared_pks_and_declared_fks_removed` / `run`: child_table.(parent_table_id)->parent_table.(opaque_key)',
    ]);
  });

  it('formats skipped composite ground truth separately from false-negative details', () => {
    const compositePk = 'order_lines.(order_id,line_number)';
    const compositeFk = 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)';
    // Fresh object per use so expected / falseNegatives / skippedComposite never alias.
    const compositeKeys = () => ({ pk: [compositePk], fk: [compositeFk] });

    const suite: KtxRelationshipBenchmarkSuiteResult = {
      cases: [
        caseResult({
          fixtureId: 'composite_keys_no_declared_constraints',
          metrics: { pkRecall: 0, fkRecall: 0, acceptedOrReviewRecall: 0 },
          expected: compositeKeys(),
          predicted: {
            pk: [],
            fk: [],
            acceptedFk: [],
            reviewFk: [],
          },
          falseNegatives: compositeKeys(),
          skippedComposite: compositeKeys(),
        }),
      ],
      validationBlockedCases: [],
      aggregate: {
        caseCount: 1,
        headlineCaseCount: 1,
        headlinePkRecall: 0,
        headlineFkRecall: 0,
        headlineAcceptedOrReviewRecall: 0,
        meanPkRecall: 0,
        meanFkRecall: 0,
        meanAcceptedOrReviewRecall: 0,
      },
    };

    const report = buildKtxRelationshipBenchmarkReport({
      fixtures: [
        fixture({
          id: 'composite_keys_no_declared_constraints',
          name: 'Composite key fixture with no declared constraints',
          tier: 'row_bearing',
          defaultModes: [HEADLINE_MODE],
        }),
      ],
      suite,
      modes: [HEADLINE_MODE],
    });

    expect(report.cases[0]?.skippedComposite).toEqual({
      pk: [compositePk],
      fk: [compositeFk],
    });

    expectMarkdownToContain(formatKtxRelationshipBenchmarkReportMarkdown(report), [
      '## Composite Ground Truth Skips',
      '### Skipped Composite PKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_lines.(order_id,line_number)',
      '### Skipped Composite FKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
      '### Headline False Negative FKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
    ]);
  });
});
|
||||
1275
packages/cli/test/context/scan/relationship-benchmarks.test.ts
Normal file
1275
packages/cli/test/context/scan/relationship-benchmarks.test.ts
Normal file
File diff suppressed because it is too large
Load diff
86
packages/cli/test/context/scan/relationship-budget.test.ts
Normal file
86
packages/cli/test/context/scan/relationship-budget.test.ts
Normal file
|
|
@@ -0,0 +1,86 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { applyKtxRelationshipValidationBudget, defaultKtxRelationshipValidationBudget } from '../../../src/context/scan/relationship-budget.js';
|
||||
|
||||
/** Minimal candidate shape used to exercise the validation budget helpers. */
interface Candidate {
  /** Label the assertions use to identify a candidate. */
  id: string;
  /** Value the tests hand to the budget's `score` callback. */
  confidence: number;
}
|
||||
|
||||
/**
 * Tests for defaultKtxRelationshipValidationBudget and
 * applyKtxRelationshipValidationBudget: default sizing, score ordering with
 * stable ties, and the special budgets `0` and `'all'`.
 */
describe('relationship validation budget', () => {
  /** Score callback shared by every test: rank candidates by their confidence. */
  const byConfidence = (candidate: Candidate): number => candidate.confidence;

  /** Projects budget entries onto their candidate ids, preserving order. */
  const candidateIds = (entries: ReadonlyArray<{ candidate: Candidate }>): string[] =>
    entries.map(({ candidate }) => candidate.id);

  it('computes the default validation budget from table count', () => {
    // [tableCount, expected default budget]
    const expectations: Array<[number, number]> = [
      [0, 0],
      [3, 6],
      [400, 800],
      [900, 1000],
      [-4, 0],
      [3.8, 6],
    ];

    for (const [tableCount, expectedBudget] of expectations) {
      expect(defaultKtxRelationshipValidationBudget(tableCount)).toBe(expectedBudget);
    }
  });

  it('splits candidates by descending score with stable tie ordering', () => {
    const candidates: Candidate[] = [
      { id: 'first', confidence: 0.8 },
      { id: 'second', confidence: 0.9 },
      { id: 'third', confidence: 0.9 },
      { id: 'fourth', confidence: 0.2 },
    ];

    const { effectiveBudget, toValidate, deferred } = applyKtxRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 100,
      budget: 2,
      score: byConfidence,
    });

    expect(effectiveBudget).toBe(2);
    // 'second' and 'third' tie at 0.9 and keep their input order.
    expect(candidateIds(toValidate)).toEqual(['second', 'third']);
    expect(candidateIds(deferred)).toEqual(['first', 'fourth']);
    expect(toValidate.map((entry) => entry.originalIndex)).toEqual([1, 2]);
  });

  it('uses the default budget when the budget is omitted', () => {
    // Eight candidates with confidence 1, 0.9, ... down to 0.3.
    const candidates = [...Array(8).keys()].map((index) => ({
      id: `candidate-${index}`,
      confidence: 1 - index / 10,
    }));

    const { effectiveBudget, toValidate, deferred } = applyKtxRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 2,
      score: byConfidence,
    });

    expect(effectiveBudget).toBe(4);
    expect(toValidate).toHaveLength(4);
    expect(deferred).toHaveLength(4);
  });

  it('treats budget zero as disabling SQL validation', () => {
    const { effectiveBudget, toValidate, deferred } = applyKtxRelationshipValidationBudget<Candidate>({
      candidates: [
        { id: 'first', confidence: 1 },
        { id: 'second', confidence: 0.5 },
      ],
      tableCount: 10,
      budget: 0,
      score: byConfidence,
    });

    expect(effectiveBudget).toBe(0);
    expect(toValidate).toEqual([]);
    expect(candidateIds(deferred)).toEqual(['first', 'second']);
  });

  it('treats budget all as validating every candidate', () => {
    const { effectiveBudget, toValidate, deferred } = applyKtxRelationshipValidationBudget<Candidate>({
      candidates: [
        { id: 'first', confidence: 0.1 },
        { id: 'second', confidence: 0.9 },
      ],
      tableCount: 1,
      budget: 'all',
      score: byConfidence,
    });

    expect(effectiveBudget).toBe('all');
    // With 'all', the expected order is the input order, not the score order.
    expect(candidateIds(toValidate)).toEqual(['first', 'second']);
    expect(deferred).toEqual([]);
  });
});
|
||||
881
packages/cli/test/context/scan/relationship-candidates.test.ts
Normal file
881
packages/cli/test/context/scan/relationship-candidates.test.ts
Normal file
|
|
@@ -0,0 +1,881 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from '../../../src/context/scan/enrichment-types.js';
|
||||
import { normalizeKtxRelationshipName } from '../../../src/context/scan/relationship-name-similarity.js';
|
||||
import {
|
||||
generateKtxRelationshipDiscoveryCandidates,
|
||||
inferKtxRelationshipTargetPks,
|
||||
mergeKtxRelationshipDiscoveryCandidates,
|
||||
} from '../../../src/context/scan/relationship-candidates.js';
|
||||
import type { KtxRelationshipProfileArtifact } from '../../../src/context/scan/relationship-profiling.js';
|
||||
|
||||
/**
 * Builds an enriched-column fixture.
 *
 * Any field missing (null or undefined) from `options` falls back to the
 * default shown below: a nullable, non-primary-key INTEGER column whose table
 * reference is `public.<tableId>` with no catalog.
 *
 * @param tableId - id of the owning table; also the default table-ref name.
 * @param id - column id.
 * @param name - column name.
 * @param options - optional field overrides.
 * @returns the assembled column object.
 */
function column(
  tableId: string,
  id: string,
  name: string,
  options: Partial<KtxEnrichedColumn> = {},
): KtxEnrichedColumn {
  const {
    tableRef,
    nativeType,
    normalizedType,
    dimensionType,
    nullable,
    primaryKey,
    parentColumnId,
    descriptions,
    embedding,
    sampleValues,
    cardinality,
  } = options;

  return {
    id,
    tableId,
    tableRef: tableRef ?? { catalog: null, db: 'public', name: tableId },
    name,
    nativeType: nativeType ?? 'INTEGER',
    normalizedType: normalizedType ?? 'integer',
    dimensionType: dimensionType ?? 'number',
    nullable: nullable ?? true,
    primaryKey: primaryKey ?? false,
    parentColumnId: parentColumnId ?? null,
    descriptions: descriptions ?? {},
    embedding: embedding ?? null,
    sampleValues: sampleValues ?? null,
    cardinality: cardinality ?? null,
  };
}
|
||||
|
||||
/**
 * Builds an enabled enriched-table fixture located at `public.<name>`.
 *
 * Each supplied column is shallow-copied with its `tableId` and `tableRef`
 * overwritten, so the values a column was created with for those two fields
 * are ignored. All copies share the table's own ref object.
 *
 * @param id - table id, stamped onto every column as `tableId`.
 * @param name - table name used in the table ref.
 * @param columns - columns to attach to the table.
 * @returns the assembled table object.
 */
function table(id: string, name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: 'public', name };
  const ownedColumns = columns.map((item) => {
    return { ...item, tableId: id, tableRef: ref };
  });

  return {
    id,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/**
 * Wraps tables in an enriched-schema fixture for the 'warehouse' connection.
 *
 * @param tables - tables to expose, stored by reference.
 * @returns a schema with the given tables and an empty relationship list.
 */
function schema(tables: KtxEnrichedTable[]): KtxEnrichedSchema {
  const fixture: KtxEnrichedSchema = { connectionId: 'warehouse', tables, relationships: [] };
  return fixture;
}
|
||||
|
||||
/**
 * Builds the profile artifact shared by the plan-code suffix-matching tests.
 *
 * Every profiled column is a TEXT column with 4 rows, 4 distinct values and no
 * nulls; only the table, column name, sample values and text lengths vary.
 * Each call returns freshly allocated objects and arrays.
 *
 * @returns a profile artifact for the 'warehouse' connection on the sqlite driver.
 */
function planCodeProfiles(): KtxRelationshipProfileArtifact {
  type ColumnProfile = KtxRelationshipProfileArtifact['columns'][string];

  const publicRef = (name: string) => ({ catalog: null, db: 'public', name });
  // A function rather than a constant so that no two profiles share one array.
  const planCodes = (): string[] => ['basic', 'enterprise', 'free', 'pro'];

  const textProfile = (
    tableName: string,
    columnName: string,
    sampleValues: string[],
    minTextLength: number,
    maxTextLength: number,
  ): ColumnProfile => ({
    table: publicRef(tableName),
    column: columnName,
    nativeType: 'TEXT',
    normalizedType: 'text',
    rowCount: 4,
    nullCount: 0,
    distinctCount: 4,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues,
    minTextLength,
    maxTextLength,
  });

  // [table, column, sample values, min text length, max text length]; order sets the key order of `columns`.
  const specs: Array<[string, string, string[], number, number]> = [
    ['stg_plans', 'plan_code', planCodes(), 4, 10],
    ['stg_plans', 'created_at', ['2026-05-01', '2026-05-02', '2026-05-03', '2026-05-04'], 10, 10],
    ['stg_plans', 'email', ['a@example.test', 'b@example.test', 'c@example.test', 'd@example.test'], 14, 14],
    ['stg_plans', 'is_deleted', ['deleted-a', 'deleted-b', 'deleted-c', 'deleted-d'], 9, 9],
    ['mart_account_segments', 'current_plan_code', planCodes(), 4, 10],
    ['mart_account_segments', 'normalized_plan_code', planCodes(), 4, 10],
    ['stg_plan_segment_mapping', 'canonical_plan_code', planCodes(), 4, 10],
    ['stg_plans', 'canonical_plan_code', planCodes(), 4, 10],
  ];

  const columns: KtxRelationshipProfileArtifact['columns'] = {};
  for (const [tableName, columnName, sampleValues, minTextLength, maxTextLength] of specs) {
    columns[`${tableName}.${columnName}`] = textProfile(
      tableName,
      columnName,
      sampleValues,
      minTextLength,
      maxTextLength,
    );
  }

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [
      { table: publicRef('stg_plans'), rowCount: 4 },
      { table: publicRef('mart_account_segments'), rowCount: 4 },
      { table: publicRef('stg_plan_segment_mapping'), rowCount: 4 },
    ],
    warnings: [],
    columns,
  };
}
|
||||
|
||||
describe('relationship discovery candidates', () => {
|
||||
it('normalizes warehouse prefixes and emits review candidates without declared primary keys', () => {
  // No column on either table is flagged as a primary key.
  const dimAccounts = table('accounts-id', 'dim_accounts', [
    column('accounts-id', 'accounts-id-col', 'id', { primaryKey: false }),
    column('accounts-id', 'accounts-name-col', 'account_name', { nativeType: 'TEXT', normalizedType: 'text' }),
  ]);
  const fctInvoices = table('invoices-id', 'fct_invoices', [
    column('invoices-id', 'invoice-id-col', 'id', { primaryKey: false }),
    column('invoices-id', 'account-id-col', 'account_id', { primaryKey: false }),
  ]);

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([dimAccounts, fctInvoices]));
  expect(discovered).toHaveLength(1);

  const link = discovered[0];
  expect(link).toMatchObject({
    from: { tableId: 'invoices-id', columnIds: ['account-id-col'], columns: ['account_id'] },
    to: { tableId: 'accounts-id', columnIds: ['accounts-id-col'], columns: ['id'] },
    relationshipType: 'many_to_one',
    status: 'review',
    source: 'normalized_table_match',
    evidence: {
      sourceColumnBase: 'account',
      targetTableBase: 'account',
      targetKeyScore: 0.92,
    },
  });
  expect(link?.confidence).toBeGreaterThanOrEqual(0.8);
  expect(link?.evidence.signalVector).toMatchObject({
    nameSimilarity: 0.92,
    typeCompatibility: 1,
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: 0.92,
  });
  // The reported confidence must equal the score recorded in the breakdown.
  expect(link?.evidence.scoreBreakdown?.score).toBe(link?.confidence);
  expect(link?.evidence.scoreBreakdown?.contributions.nameSimilarity).toBeGreaterThan(0);
  expect(link?.evidence.reasons).toEqual(
    expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like']),
  );
});
|
||||
|
||||
it('generates candidates for PascalCase ID columns without declared keys', () => {
  const artistTable = table('artist-id', 'Artist', [
    column('artist-id', 'artist-id-col', 'ArtistId', { primaryKey: false }),
    column('artist-id', 'artist-name-col', 'Name', { nativeType: 'TEXT', normalizedType: 'text' }),
  ]);
  const albumTable = table('album-id', 'Album', [
    column('album-id', 'album-id-col', 'AlbumId', { primaryKey: false }),
    column('album-id', 'artist-id-fk-col', 'ArtistId', { primaryKey: false }),
  ]);

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([artistTable, albumTable]));

  // Summarise each candidate as "<from table>.<from column>-><to table>.<to column>".
  const edges = discovered.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  expect(edges).toEqual(['Album.ArtistId->Artist.ArtistId']);

  const link = discovered[0];
  expect(link).toMatchObject({
    source: 'normalized_table_match',
    evidence: {
      sourceColumnBase: 'artist',
      targetTableBase: 'artist',
      targetColumnBase: 'artist_id',
      targetKeyScore: 0.9,
      reasons: expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like']),
    },
  });
  expect(link?.confidence).toBeGreaterThanOrEqual(0.9);
});
|
||||
|
||||
it('uses the locality cap before scanning parent tables', () => {
  const parentTable = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
  const childTable = table('invoices-id', 'invoices', [
    column('invoices-id', 'invoice-id-col', 'id'),
    column('invoices-id', 'account-id-col', 'account_id'),
  ]);

  // A cap of zero parent tables is expected to produce no candidates at all.
  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([parentTable, childTable]), {
    maxCandidateParentTables: 0,
  });

  expect(discovered).toEqual([]);
});
|
||||
|
||||
it('keeps the nearest parent when the locality cap is one', () => {
  const artistTable = table('artist-id', 'Artist', [
    column('artist-id', 'artist-id-col', 'ArtistId', { primaryKey: false }),
    column('artist-id', 'artist-name-col', 'Name', { nativeType: 'TEXT', normalizedType: 'text' }),
  ]);
  const albumTable = table('album-id', 'Album', [
    column('album-id', 'album-id-col', 'AlbumId', { primaryKey: false }),
    column('album-id', 'artist-id-fk-col', 'ArtistId', { primaryKey: false }),
  ]);

  // 25 unrelated tables placed between the child and its parent in schema order.
  const fillerTables: KtxEnrichedTable[] = [];
  for (let index = 0; index < 25; index += 1) {
    fillerTables.push(
      table(`filler-${index}`, `WarehouseFiller${index}`, [
        column(`filler-${index}`, `filler-${index}-id`, 'WarehouseFillerId', { primaryKey: false }),
      ]),
    );
  }

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([albumTable, ...fillerTables, artistTable]), {
    maxCandidateParentTables: 1,
  });

  const edges = discovered.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  expect(edges).toEqual(['Album.ArtistId->Artist.ArtistId']);
});
|
||||
|
||||
it('uses final table tokens from dotted parent table names', () => {
  const customerTable = table('customer-id', 'SalesLT.Customer', [
    column('customer-id', 'customer-id-col', 'CustomerID', { primaryKey: false }),
    column('customer-id', 'customer-name-col', 'CustomerName', { nativeType: 'TEXT', normalizedType: 'text' }),
  ]);
  const orderTable = table('order-id', 'SalesLT.SalesOrderHeader', [
    column('order-id', 'order-id-col', 'SalesOrderID', { primaryKey: false }),
    column('order-id', 'customer-id-fk-col', 'CustomerID', { primaryKey: false }),
  ]);

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([customerTable, orderTable]));

  const edges = discovered.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  expect(edges).toEqual(['SalesLT.SalesOrderHeader.CustomerID->SalesLT.Customer.CustomerID']);

  expect(discovered[0]).toMatchObject({
    evidence: {
      sourceColumnBase: 'customer',
      targetTableBase: 'sales_lt_customer',
      targetColumnBase: 'customer_id',
      targetKeyScore: 0.9,
      reasons: expect.arrayContaining(['foreign_key_suffix', 'inflection', 'target_key_like']),
    },
  });
});
|
||||
|
||||
it('emits lower-confidence parent-table-name candidates when the target key name differs from the table name', () => {
  // The parent's declared key (BusinessEntityID) is not named after its table.
  const parentTable = table('customer-account-id', 'crm.CustomerAccount', [
    column('customer-account-id', 'business-entity-id-col', 'BusinessEntityID', { primaryKey: true }),
    column('customer-account-id', 'account-name-col', 'AccountName', {
      nativeType: 'TEXT',
      normalizedType: 'text',
      dimensionType: 'string',
    }),
  ]);
  const childTable = table('subscriptions-id', 'fct_subscriptions', [
    column('subscriptions-id', 'subscription-id-col', 'SubscriptionID', { primaryKey: false }),
    column('subscriptions-id', 'customer-account-id-col', 'CustomerAccountID', { primaryKey: false }),
  ]);

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([parentTable, childTable]));

  const edges = discovered.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  expect(edges).toEqual(['fct_subscriptions.CustomerAccountID->crm.CustomerAccount.BusinessEntityID']);

  const link = discovered[0];
  expect(link).toMatchObject({
    source: 'parent_table_name_match',
    relationshipType: 'many_to_one',
    status: 'review',
    evidence: {
      sourceColumnBase: 'customer_account',
      targetTableBase: 'crm_customer_account',
      targetColumnBase: 'business_entity_id',
      targetKeyScore: 1,
      nameScore: 0.82,
      reasons: expect.arrayContaining(['foreign_key_suffix', 'parent_table_name_match', 'target_key_like']),
    },
  });
  expect(link?.evidence.signalVector).toMatchObject({
    nameSimilarity: 0.82,
    typeCompatibility: 1,
  });
  expect(link?.evidence.scoreBreakdown?.score).toBe(link?.confidence);
});
|
||||
|
||||
it('does not emit parent-table-name candidates when the target key type is incompatible', () => {
  // Parent key is text while the would-be foreign key is an integer.
  const parentTable = table('customer-account-id', 'crm.CustomerAccount', [
    column('customer-account-id', 'business-entity-id-col', 'BusinessEntityID', {
      primaryKey: true,
      nativeType: 'TEXT',
      normalizedType: 'text',
      dimensionType: 'string',
    }),
  ]);
  const childTable = table('subscriptions-id', 'fct_subscriptions', [
    column('subscriptions-id', 'customer-account-id-col', 'CustomerAccountID', {
      primaryKey: false,
      nativeType: 'INTEGER',
      normalizedType: 'integer',
      dimensionType: 'number',
    }),
  ]);

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([parentTable, childTable]));

  const edges = discovered.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  expect(edges).not.toContain('fct_subscriptions.CustomerAccountID->crm.CustomerAccount.BusinessEntityID');
});
|
||||
|
||||
it('does not use parent-table-name matching to create same-table same-column self-links', () => {
  // Single-table schema whose id-like column is named after the table itself.
  const onlyTable = table('customer-account-id', 'crm.CustomerAccount', [
    column('customer-account-id', 'customer-account-id-col', 'CustomerAccountID', { primaryKey: false }),
    column('customer-account-id', 'account-name-col', 'AccountName', {
      nativeType: 'TEXT',
      normalizedType: 'text',
      dimensionType: 'string',
    }),
  ]);

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([onlyTable]));

  const edges = discovered.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  expect(edges).not.toContain('crm.CustomerAccount.CustomerAccountID->crm.CustomerAccount.CustomerAccountID');
});
|
||||
|
||||
it('uses profile evidence to generate natural-key candidates without id-like target names', () => {
  // Local shorthands for the fixtures repeated below.
  const textColumn = (tableId: string, id: string, name: string) =>
    column(tableId, id, name, { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' });
  const publicRef = (name: string) => ({ catalog: null, db: 'public', name });

  const countriesTable = table('countries-id', 'dim_countries', [
    textColumn('countries-id', 'countries-code-col', 'iso_code'),
    textColumn('countries-id', 'countries-name-col', 'name'),
  ]);
  const accountsTable = table('accounts-id', 'fct_accounts', [
    column('accounts-id', 'account-id-col', 'id', { primaryKey: false }),
    textColumn('accounts-id', 'country-code-col', 'country_code'),
  ]);

  const profiles = {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [],
    warnings: [],
    columns: {
      // Target side: every row has a distinct, non-null code.
      'dim_countries.iso_code': {
        table: publicRef('dim_countries'),
        column: 'iso_code',
        nativeType: 'TEXT',
        normalizedType: 'text',
        rowCount: 3,
        nullCount: 0,
        distinctCount: 3,
        uniquenessRatio: 1,
        nullRate: 0,
        sampleValues: ['DE', 'FR', 'US'],
        minTextLength: 2,
        maxTextLength: 2,
      },
      // Source side: repeated codes whose samples all appear in the target samples.
      'fct_accounts.country_code': {
        table: publicRef('fct_accounts'),
        column: 'country_code',
        nativeType: 'TEXT',
        normalizedType: 'text',
        rowCount: 4,
        nullCount: 0,
        distinctCount: 3,
        uniquenessRatio: 0.75,
        nullRate: 0,
        sampleValues: ['FR', 'US'],
        minTextLength: 2,
        maxTextLength: 2,
      },
    },
  } satisfies KtxRelationshipProfileArtifact;

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([countriesTable, accountsTable]), {
    profiles,
  });
  expect(discovered).toHaveLength(1);

  const link = discovered[0];
  expect(link).toMatchObject({
    source: 'profile_match',
    from: { tableId: 'accounts-id', columnIds: ['country-code-col'], columns: ['country_code'] },
    to: { tableId: 'countries-id', columnIds: ['countries-code-col'], columns: ['iso_code'] },
    evidence: {
      sourceColumnBase: 'country',
      targetTableBase: 'country',
      targetColumnBase: 'iso_code',
      targetKeyScore: 0.86,
    },
  });
  expect(link?.confidence).toBeGreaterThanOrEqual(0.78);
  expect(link?.evidence.reasons).toEqual(
    expect.arrayContaining([
      'foreign_key_code_suffix',
      'normalized_table_name',
      'profile_unique_target',
      'profile_sample_overlap',
    ]),
  );
});
|
||||
|
||||
it('drops same-table same-column self-links using ordered endpoint equality', () => {
  const onlyTable = table('accounts-id', 'stg_accounts', [
    column('accounts-id', 'accounts-account-id-col', 'account_id', { primaryKey: false }),
    column('accounts-id', 'accounts-name-col', 'account_name', {
      nativeType: 'TEXT',
      normalizedType: 'text',
      dimensionType: 'string',
    }),
  ]);

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([onlyTable]));

  const edges = discovered.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  // A column must never be proposed as a foreign key to itself.
  expect(edges).not.toContain('stg_accounts.account_id->stg_accounts.account_id');
});
|
||||
|
||||
it('keeps legitimate same-table different-column self-references', () => {
  const employeesTable = table('employees-id', 'employees', [
    column('employees-id', 'employees-id-col', 'id', { primaryKey: false }),
    column('employees-id', 'employees-parent-id-col', 'parent_id', { primaryKey: false }),
  ]);

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([employeesTable]));

  const edges = discovered.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  expect(edges).toContain('employees.parent_id->employees.id');

  // The first candidate is the one checked for the self-reference markers.
  expect(discovered[0]).toMatchObject({
    source: 'self_reference',
    evidence: {
      reasons: expect.arrayContaining(['self_reference']),
    },
  });
});
|
||||
|
||||
it('emits column_suffix_match candidates for relationship-key-shaped trailing target columns', () => {
  // Every column in this scenario is a text column, so build them through one shorthand.
  const textColumn = (tableId: string, id: string, name: string) =>
    column(tableId, id, name, { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' });

  const plansTable = table('plans-id', 'stg_plans', [
    textColumn('plans-id', 'plans-plan-code-col', 'plan_code'),
    textColumn('plans-id', 'plans-canonical-plan-code-col', 'canonical_plan_code'),
    textColumn('plans-id', 'plans-created-at-col', 'created_at'),
    textColumn('plans-id', 'plans-email-col', 'email'),
    textColumn('plans-id', 'plans-is-deleted-col', 'is_deleted'),
  ]);
  const accountSegmentsTable = table('account-segments-id', 'mart_account_segments', [
    textColumn('account-segments-id', 'current-plan-code-col', 'current_plan_code'),
    textColumn('account-segments-id', 'normalized-plan-code-col', 'normalized_plan_code'),
    textColumn('account-segments-id', 'source-created-at-col', 'source_created_at'),
    textColumn('account-segments-id', 'billing-email-col', 'billing_email'),
    textColumn('account-segments-id', 'source-is-deleted-col', 'source_is_deleted'),
  ]);
  const mappingTable = table('mapping-id', 'stg_plan_segment_mapping', [
    textColumn('mapping-id', 'mapping-canonical-plan-code-col', 'canonical_plan_code'),
  ]);

  const discovered = generateKtxRelationshipDiscoveryCandidates(
    schema([plansTable, accountSegmentsTable, mappingTable]),
    { profiles: planCodeProfiles() },
  );
  const edges = discovered.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );

  expect(edges).toEqual([
    'mart_account_segments.current_plan_code->stg_plans.plan_code',
    'mart_account_segments.normalized_plan_code->stg_plans.plan_code',
    'stg_plan_segment_mapping.canonical_plan_code->stg_plans.plan_code',
    'stg_plans.canonical_plan_code->stg_plans.plan_code',
  ]);
  expect(discovered).toEqual(
    expect.arrayContaining([
      expect.objectContaining({
        source: 'column_suffix_match',
        confidence: expect.any(Number),
        evidence: expect.objectContaining({
          nameScore: 0.78,
          targetKeyScore: 0.86,
          reasons: expect.arrayContaining(['column_suffix_match', 'profile_unique_target']),
        }),
      }),
    ]),
  );

  // Timestamp, email and boolean-flag suffixes must not be treated as relationship keys.
  const rejectedEdges = [
    'mart_account_segments.source_created_at->stg_plans.created_at',
    'mart_account_segments.billing_email->stg_plans.email',
    'mart_account_segments.source_is_deleted->stg_plans.is_deleted',
  ];
  for (const rejected of rejectedEdges) {
    expect(edges).not.toContain(rejected);
  }

  const suffixCandidate = discovered.find(
    ({ from }) => from.table.name === 'mart_account_segments' && from.columns[0] === 'current_plan_code',
  );
  expect(suffixCandidate?.confidence).toBe(suffixCandidate?.evidence.scoreBreakdown?.score);
  expect(suffixCandidate?.evidence.signalVector).toMatchObject({
    nameSimilarity: 0.78,
    typeCompatibility: 1,
    valueOverlap: 1,
    profileUniqueness: 1,
    profileNullRate: 1,
  });
});
|
||||
|
||||
it('does not suffix-match bare single-token targets or incompatible target types', () => {
  const usersTable = table('users-id', 'users', [
    column('users-id', 'users-id-col', 'id', { primaryKey: false }),
    column('users-id', 'users-account-id-col', 'account_id', { primaryKey: false }),
  ]);
  // Integer plan_code target versus a text current_plan_code source.
  const plansTable = table('plans-id', 'plans', [
    column('plans-id', 'plans-plan-code-col', 'plan_code', {
      nativeType: 'INTEGER',
      normalizedType: 'integer',
      dimensionType: 'number',
    }),
  ]);
  const accountsTable = table('accounts-id', 'accounts', [
    column('accounts-id', 'current-plan-code-col', 'current_plan_code', {
      nativeType: 'TEXT',
      normalizedType: 'text',
      dimensionType: 'string',
    }),
  ]);

  // Two-row, fully unique integer profile; only the table and column names vary.
  const integerProfile = (
    tableName: string,
    columnName: string,
  ): KtxRelationshipProfileArtifact['columns'][string] => ({
    table: { catalog: null, db: 'public', name: tableName },
    column: columnName,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 2,
    nullCount: 0,
    distinctCount: 2,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['1', '2'],
    minTextLength: 1,
    maxTextLength: 1,
  });

  const baseProfiles = planCodeProfiles();
  const profiles = {
    ...baseProfiles,
    columns: {
      ...baseProfiles.columns,
      'users.id': integerProfile('users', 'id'),
      'plans.plan_code': integerProfile('plans', 'plan_code'),
    },
  } satisfies KtxRelationshipProfileArtifact;

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([usersTable, plansTable, accountsTable]), {
    profiles,
  });
  const edges = discovered.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );

  expect(edges).not.toContain('users.account_id->users.id');
  expect(edges).not.toContain('accounts.current_plan_code->plans.plan_code');
});
|
||||
|
||||
it('uses column embeddings as a recall source for non-standard source names', () => {
  // orders.buyer_ref points almost exactly along customers.id in embedding space.
  const customersTable = table('customers-id', 'customers', [
    column('customers-id', 'customers-id-col', 'id', { primaryKey: false, embedding: [1, 0, 0] }),
    column('customers-id', 'customers-name-col', 'name', {
      nativeType: 'TEXT',
      normalizedType: 'text',
      dimensionType: 'string',
      embedding: [0, 1, 0],
    }),
  ]);
  const ordersTable = table('orders-id', 'orders', [
    column('orders-id', 'orders-id-col', 'id', { primaryKey: false, embedding: [0, 0, 1] }),
    column('orders-id', 'buyer-ref-col', 'buyer_ref', { primaryKey: false, embedding: [0.995, 0.005, 0] }),
  ]);

  const discovered = generateKtxRelationshipDiscoveryCandidates(schema([customersTable, ordersTable]), {
    embeddingSimilarityThreshold: 0.95,
  });
  expect(discovered).toHaveLength(1);

  const link = discovered[0];
  expect(link).toMatchObject({
    source: 'embedding_similarity',
    from: { tableId: 'orders-id', columnIds: ['buyer-ref-col'], columns: ['buyer_ref'] },
    to: { tableId: 'customers-id', columnIds: ['customers-id-col'], columns: ['id'] },
    relationshipType: 'many_to_one',
    status: 'review',
    evidence: {
      sourceColumnBase: 'buyer_ref',
      targetTableBase: 'customer',
      targetColumnBase: 'id',
      targetKeyScore: 0.92,
      embeddingSimilarity: expect.any(Number),
    },
  });
  expect(link?.confidence).toBeGreaterThanOrEqual(0.9);
  expect(link?.evidence.reasons).toEqual(expect.arrayContaining(['embedding_similarity', 'target_key_like']));
});
|
||||
|
||||
it('singularizes names and caps candidates per source column deterministically', () => {
  const accountsTable = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
  const archivedAccountsTable = table('archived-accounts-id', 'accounts_archive', [
    column('archived-accounts-id', 'archived-accounts-id-col', 'id'),
  ]);
  const eventsTable = table('events-id', 'product_events', [
    column('events-id', 'event-id-col', 'id'),
    column('events-id', 'account-id-col', 'account_id'),
  ]);

  // `accounts` is listed last in the schema yet is the one expected to survive the cap of one.
  const discovered = generateKtxRelationshipDiscoveryCandidates(
    schema([eventsTable, archivedAccountsTable, accountsTable]),
    { maxCandidatesPerColumn: 1 },
  );

  const edges = discovered.map(
    ({ from, to }) => `${from.table.name}.${from.columns[0]}->${to.table.name}.${to.columns[0]}`,
  );
  expect(edges).toEqual(['product_events.account_id->accounts.id']);
});
|
||||
|
||||
// Feeds generated candidates into the target-PK inference and checks that both referenced
// tables are reported with `id` as a review-status key scoring at least 0.8.
it('infers target primary-key candidates from incoming review links', () => {
  const accountsTable = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
  const usersTable = table('users-id', 'users', [column('users-id', 'users-id-col', 'id')]);
  const eventsTable = table('events-id', 'product_events', [
    column('events-id', 'event-id-col', 'id'),
    column('events-id', 'account-id-col', 'account_id'),
    column('events-id', 'user-id-col', 'user_id'),
  ]);

  // Both expected entries share one shape; only the table name differs.
  const reviewPk = (tableName: string) => ({
    table: tableName,
    columns: ['id'],
    score: expect.any(Number),
    status: 'review',
    incomingCandidateCount: 1,
  });

  const inferredPks = inferKtxRelationshipTargetPks(
    generateKtxRelationshipDiscoveryCandidates(schema([accountsTable, usersTable, eventsTable])),
  );

  expect(inferredPks).toEqual([reviewPk('accounts'), reviewPk('users')]);
  const everyScoreMeetsFloor = inferredPks.every(({ score }) => score >= 0.8);
  expect(everyScoreMeetsFloor).toBe(true);
});
|
||||
|
||||
// The only would-be link pairs an INTEGER `account_id` with a TEXT `accounts.id`, and the
// other source column is a declared primary key; the generator must return nothing.
it('does not generate candidates from primary-key source columns or incompatible target types', () => {
  const textKeyedAccounts = table('accounts-id', 'accounts', [
    column('accounts-id', 'accounts-id-col', 'id', { nativeType: 'TEXT', normalizedType: 'text' }),
  ]);
  const invoicesTable = table('invoices-id', 'invoices', [
    column('invoices-id', 'invoice-id-col', 'id', { primaryKey: true }),
    column('invoices-id', 'account-id-col', 'account_id', { nativeType: 'INTEGER', normalizedType: 'integer' }),
  ]);

  const candidates = generateKtxRelationshipDiscoveryCandidates(schema([textKeyedAccounts, invoicesTable]));

  expect(candidates).toEqual([]);
});
|
||||
|
||||
// Table-driven check of name normalization: each raw name is paired with the subset of the
// normalized result it must match. Cases run in the listed order.
it('normalizes layer prefixes, punctuation, plural forms, and non-plural trailing s words', () => {
  const cases: Array<[string, Record<string, unknown>]> = [
    [
      'mart__Sales_Accounts',
      {
        normalized: 'sales_accounts',
        singular: 'sales_account',
        tokens: ['sales', 'accounts'],
      },
    ],
    [
      'dim_users',
      {
        normalized: 'users',
        singular: 'user',
        tokens: ['users'],
      },
    ],
    [
      // Ends in "s" but the expected singular form is unchanged.
      'Address',
      {
        normalized: 'address',
        singular: 'address',
        plural: 'addresses',
        tokens: ['address'],
      },
    ],
  ];

  for (const [rawName, expected] of cases) {
    expect(normalizeKtxRelationshipName(rawName)).toMatchObject(expected);
  }
});
|
||||
|
||||
// Clones a deterministic candidate as an LLM proposal for the same edge, merges the pair, and
// checks the single survivor keeps the deterministic id/source while carrying the LLM fields.
it('merges duplicate deterministic and LLM proposal candidates without losing LLM rationale', () => {
  const accountsTable = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
  const invoicesTable = table('invoices-id', 'invoices', [column('invoices-id', 'account-id-col', 'account_id')]);
  const generated = generateKtxRelationshipDiscoveryCandidates(schema([accountsTable, invoicesTable]));
  const deterministic = generated[0];
  if (!deterministic) {
    throw new Error('Expected deterministic relationship candidate');
  }

  const llmConfidence = 0.89;
  const llmRationale = 'Invoices point at the owning account dimension.';
  // Same edge as the deterministic candidate, re-labelled as an LLM proposal with higher confidence.
  const llmCandidate = {
    ...deterministic,
    confidence: 0.99,
    source: 'llm_proposal' as const,
    evidence: {
      ...deterministic.evidence,
      reasons: ['llm_proposal', 'llm_pk_proposal'],
      llmConfidence,
      llmRationale,
    },
  };

  const merged = mergeKtxRelationshipDiscoveryCandidates([deterministic, llmCandidate]);

  expect(merged).toHaveLength(1);
  const [survivor] = merged;
  expect(survivor).toMatchObject({
    id: deterministic.id,
    source: 'normalized_table_match',
    confidence: 0.99,
    evidence: { llmConfidence, llmRationale },
  });
  expect(survivor?.evidence.reasons).toEqual(
    expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like', 'llm_proposal']),
  );
});
|
||||
});
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
import Database from 'better-sqlite3';
import { join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { describe, expect, it } from 'vitest';
import { getDialectForDriver } from '../../../src/context/connections/dialects.js';
import { snapshotToKtxEnrichedSchema } from '../../../src/context/scan/local-enrichment.js';
import { loadKtxRelationshipBenchmarkFixture, maskKtxRelationshipBenchmarkSnapshot } from '../../../src/context/scan/relationship-benchmarks.js';
import { discoverKtxCompositeRelationships } from '../../../src/context/scan/relationship-composite-candidates.js';
import { profileKtxRelationshipSchema, type KtxRelationshipReadOnlyExecutor } from '../../../src/context/scan/relationship-profiling.js';
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanContext } from '../../../src/context/scan/types.js';
|
||||
|
||||
/**
 * Test-only read-only executor backed by an on-disk SQLite file.
 *
 * Runs the SQL it is handed verbatim through better-sqlite3 and reshapes the
 * row objects into the header/row-matrix form of `KtxQueryResult`.
 */
class TestSqliteExecutor implements KtxRelationshipReadOnlyExecutor {
  private readonly db: Database.Database;

  /** @param dataPath Path of the SQLite database file to open. */
  constructor(dataPath: string) {
    // Opened read-only with `fileMustExist` set (see better-sqlite3 options).
    this.db = new Database(dataPath, { readonly: true, fileMustExist: true });
  }

  /**
   * Executes `input.sql` and returns every row.
   *
   * @param input Query description; only `sql` is read.
   * @param _ctx Scan context; unused.
   * @returns Headers taken from the first row's keys plus one value array per row.
   */
  async executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    // Column names come from the first row, so an empty result reports no headers.
    const columnNames = Object.keys(records[0] ?? {});
    const matrix = records.map((record) => columnNames.map((name) => record[name]));
    return {
      headers: columnNames,
      rows: matrix,
      totalRows: records.length,
      rowCount: records.length,
    };
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
describe('composite relationship discovery detector', () => {
  // Loads the `composite_keys_no_declared_constraints` benchmark fixture, strips declared PKs
  // and FKs from its snapshot, profiles the schema against the fixture's SQLite data, and then
  // checks which composite keys and relationships the detector reports.
  it('infers composite primary keys and validates composite foreign keys from row evidence', async () => {
    // `URL#pathname` stays percent-encoded (and has a leading slash before Windows drive
    // letters), so convert the file URL into a real filesystem path before joining.
    const fixtureRoot = fileURLToPath(new URL('../../fixtures/relationship-benchmarks', import.meta.url));
    const fixture = await loadKtxRelationshipBenchmarkFixture(
      join(fixtureRoot, 'composite_keys_no_declared_constraints'),
    );
    const snapshot = maskKtxRelationshipBenchmarkSnapshot(fixture.snapshot, 'declared_pks_and_declared_fks_removed');
    const schema = snapshotToKtxEnrichedSchema(snapshot, new Map());
    const executor = new TestSqliteExecutor(fixture.dataPath ?? '');

    // Close the SQLite handle even when profiling, discovery, or an assertion throws;
    // otherwise a failing run leaks the open database.
    try {
      const profiles = await profileKtxRelationshipSchema({
        connectionId: snapshot.connectionId,
        dialect: getDialectForDriver(snapshot.driver),
        schema,
        executor,
        ctx: { runId: 'test:composite-profile' },
      });

      const result = await discoverKtxCompositeRelationships({
        connectionId: snapshot.connectionId,
        dialect: getDialectForDriver(snapshot.driver),
        schema,
        profiles,
        executor,
        ctx: { runId: 'test:composite-detect' },
      });

      expect(result.primaryKeys.map((item) => `${item.table.name}.(${item.columns.join(',')})`)).toEqual([
        'order_line_allocations.(order_id,line_number,warehouse_code)',
        'order_lines.(order_id,line_number)',
      ]);
      expect(
        result.relationships.map(
          (item) =>
            `${item.from.table.name}.(${item.from.columns.join(',')})->${item.to.table.name}.(${item.to.columns.join(',')})`,
        ),
      ).toEqual(['order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)']);
      expect(result.relationships[0]).toMatchObject({
        relationshipType: 'many_to_one',
        status: 'accepted',
        confidence: 0.95,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          reasons: ['composite_validation_passed'],
        },
      });
      expect(result.queryCount).toBeGreaterThan(0);
    } finally {
      executor.close();
    }
  });
});
|
||||
373
packages/cli/test/context/scan/relationship-diagnostics.test.ts
Normal file
373
packages/cli/test/context/scan/relationship-diagnostics.test.ts
Normal file
|
|
@ -0,0 +1,373 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KtxEnrichedRelationship, KtxRelationshipEndpoint } from '../../../src/context/scan/enrichment-types.js';
|
||||
import type { KtxResolvedRelationshipDiscoveryCandidate } from '../../../src/context/scan/relationship-graph-resolver.js';
|
||||
import {
|
||||
buildKtxRelationshipArtifacts,
|
||||
buildKtxRelationshipDiagnostics,
|
||||
emptyKtxRelationshipProfileArtifact,
|
||||
} from '../../../src/context/scan/relationship-diagnostics.js';
|
||||
|
||||
/**
 * Builds a single-column relationship endpoint for tests.
 *
 * The table name doubles as the table id, and the column id is `<table>.<column>`.
 * Catalog and db are always null.
 */
function endpoint(table: string, column: string): KtxRelationshipEndpoint {
  const qualifiedColumnId = `${table}.${column}`;
  const tableRef = { catalog: null, db: null, name: table };
  return {
    tableId: table,
    columnIds: [qualifiedColumnId],
    table: tableRef,
    columns: [column],
  };
}
|
||||
|
||||
/**
 * Builds an inferred many-to-one `KtxEnrichedRelationship` between two single-column
 * endpoints. `confidence` falls back to 0.92 when omitted; the relationship is always
 * flagged as a primary-key reference.
 */
function enrichedRelationship(input: {
  id: string;
  fromTable: string;
  fromColumn: string;
  toTable: string;
  toColumn: string;
  confidence?: number;
}): KtxEnrichedRelationship {
  const { id, fromTable, fromColumn, toTable, toColumn } = input;
  const sourceEndpoint = endpoint(fromTable, fromColumn);
  const targetEndpoint = endpoint(toTable, toColumn);
  return {
    id,
    source: 'inferred',
    from: sourceEndpoint,
    to: targetEndpoint,
    relationshipType: 'many_to_one',
    confidence: input.confidence ?? 0.92,
    isPrimaryKeyReference: true,
  };
}
|
||||
|
||||
/**
 * Builds a graph-resolved `orders.customer_id -> customers.id` candidate for tests.
 *
 * - `source` defaults to `'normalized_table_match'`; passing `'llm_proposal'` swaps in
 *   evidence that carries LLM confidence and rationale.
 * - `status === 'rejected'` switches the validation numbers to a failing profile
 *   (low coverage, 8 violations, overlap of 2); any other status gets a clean profile.
 * - `pkScore` / `fkScore` default to 0.97 / 0.94, and `pkScore` is mirrored into
 *   `graph.targetPkScore`.
 * - `validationReasons` / `graphReasons` override the default reason lists.
 */
function resolvedRelationship(input: {
  id: string;
  status: 'accepted' | 'review' | 'rejected';
  source?: 'normalized_table_match' | 'exact_column_match' | 'inflection' | 'self_reference' | 'llm_proposal';
  fkScore?: number;
  pkScore?: number;
  validationReasons?: string[];
  graphReasons?: string[];
}): KtxResolvedRelationshipDiscoveryCandidate {
  const isRejected = input.status === 'rejected';
  const pkScore = input.pkScore ?? 0.97;
  const fkScore = input.fkScore ?? 0.94;

  let evidence: KtxResolvedRelationshipDiscoveryCandidate['evidence'];
  if (input.source === 'llm_proposal') {
    evidence = {
      sourceColumnBase: 'buyer',
      targetTableBase: 'customer',
      targetColumnBase: 'id',
      targetKeyScore: 0.88,
      nameScore: 0.45,
      reasons: ['llm_proposal', 'llm_pk_proposal'],
      llmConfidence: 0.89,
      llmRationale: 'Buyer reference values align with customer identifiers.',
    };
  } else {
    evidence = {
      sourceColumnBase: 'customer',
      targetTableBase: 'customer',
      targetColumnBase: 'id',
      targetKeyScore: 0.9,
      nameScore: 0.85,
      reasons: ['table_name_matches_source_column'],
    };
  }

  return {
    id: input.id,
    from: endpoint('orders', 'customer_id'),
    to: endpoint('customers', 'id'),
    relationshipType: 'many_to_one',
    confidence: 0.88,
    source: input.source ?? 'normalized_table_match',
    status: input.status,
    evidence,
    score: 0.91,
    validation: {
      targetUniqueness: 1,
      sourceCoverage: isRejected ? 0.2 : 1,
      violationCount: isRejected ? 8 : 0,
      violationRatio: isRejected ? 0.8 : 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 10,
      parentDistinct: 10,
      overlap: isRejected ? 2 : 10,
      checkedValues: 10,
      reasons: input.validationReasons ?? ['validation_passed'],
    },
    pkScore,
    fkScore,
    graph: {
      targetPkScore: pkScore,
      incomingCandidateCount: 1,
      conflictRank: 1,
      reasons: input.graphReasons ?? ['target_pk_score_passed', 'fk_score_passed'],
    },
  };
}
|
||||
|
||||
describe('relationship diagnostics artifacts', () => {
|
||||
// One resolved relationship per status goes in; each must land in the matching bucket with
// its evidence, validation, and graph reasons merged into `reasons`.
it('groups graph-resolved relationships and preserves evidence reasons', () => {
  const acceptedEdge = resolvedRelationship({ id: 'accepted-edge', status: 'accepted', source: 'llm_proposal' });
  const reviewEdge = resolvedRelationship({
    id: 'review-edge',
    status: 'review',
    validationReasons: ['validation_unavailable'],
    graphReasons: ['validation_unavailable_review_only', 'fk_score_review'],
  });
  const rejectedEdge = resolvedRelationship({
    id: 'rejected-edge',
    status: 'rejected',
    validationReasons: ['low_source_coverage'],
    graphReasons: ['fk_score_rejected'],
  });

  const { accepted, review, rejected } = buildKtxRelationshipArtifacts({
    connectionId: 'warehouse',
    resolvedRelationships: [acceptedEdge, reviewEdge, rejectedEdge],
  });

  expect(accepted).toHaveLength(1);
  expect(accepted[0]).toMatchObject({
    source: 'llm_proposal',
    evidence: {
      llmConfidence: 0.89,
      llmRationale: 'Buyer reference values align with customer identifiers.',
    },
    reasons: expect.arrayContaining(['llm_proposal', 'llm_pk_proposal']),
  });
  expect(review).toHaveLength(1);
  expect(rejected).toHaveLength(1);
  expect(review[0]).toMatchObject({
    id: 'review-edge',
    status: 'review',
    source: 'normalized_table_match',
    fkScore: 0.94,
    reasons: expect.arrayContaining(['validation_unavailable', 'validation_unavailable_review_only']),
  });
  expect(rejected[0]?.reasons).toEqual(
    expect.arrayContaining(['table_name_matches_source_column', 'low_source_coverage', 'fk_score_rejected']),
  );
});
|
||||
|
||||
// Only a `relationshipUpdate` is supplied (no resolved relationships); accepted and rejected
// entries must be tagged with the matching status and reason, and `skipped` passes through.
it('adapts relationship updates into the artifact shape', () => {
  const acceptedUpdate = enrichedRelationship({
    id: 'orders-customer',
    fromTable: 'orders',
    fromColumn: 'customer_id',
    toTable: 'customers',
    toColumn: 'id',
  });
  const rejectedUpdate = enrichedRelationship({
    id: 'orders-account',
    fromTable: 'orders',
    fromColumn: 'account_id',
    toTable: 'accounts',
    toColumn: 'id',
    confidence: 0.4,
  });

  const { accepted, rejected, skipped } = buildKtxRelationshipArtifacts({
    connectionId: 'warehouse',
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [acceptedUpdate],
      rejected: [rejectedUpdate],
      skipped: [{ relationshipId: 'orders-region', reason: 'validation_port_unavailable' }],
    },
  });

  expect(accepted[0]).toMatchObject({
    id: 'orders-customer',
    status: 'accepted',
    source: 'inferred',
    reasons: ['accepted_relationship_update'],
  });
  expect(rejected[0]).toMatchObject({
    id: 'orders-account',
    status: 'rejected',
    reasons: ['rejected_relationship_update'],
  });
  expect(skipped).toEqual([{ relationshipId: 'orders-region', reason: 'validation_port_unavailable' }]);
});
|
||||
|
||||
// The same edge id arrives twice: once as a graph-resolved candidate and once as a formal
// accepted update. The test expects one accepted artifact that keeps the resolved
// candidate's source and reasons.
it('deduplicates resolved and formal relationship update artifacts by edge id', () => {
  const edgeId = 'orders:orders.account_id->accounts:accounts.id';

  const resolvedEdge: KtxResolvedRelationshipDiscoveryCandidate = {
    id: edgeId,
    from: endpoint('orders', 'account_id'),
    to: endpoint('accounts', 'id'),
    relationshipType: 'many_to_one',
    source: 'normalized_table_match',
    status: 'accepted',
    confidence: 0.92,
    score: 0.9,
    pkScore: 0.92,
    fkScore: 0.9,
    evidence: {
      sourceColumnBase: 'account',
      targetTableBase: 'account',
      targetColumnBase: 'id',
      targetKeyScore: 0.92,
      nameScore: 0.92,
      reasons: ['foreign_key_suffix'],
    },
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationCount: 0,
      violationRatio: 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 2,
      parentDistinct: 2,
      overlap: 2,
      checkedValues: 2,
      reasons: ['validation_passed'],
    },
    graph: {
      targetPkScore: 0.92,
      incomingCandidateCount: 1,
      conflictRank: 1,
      reasons: ['fk_score_passed'],
    },
  };

  const formalEdge: KtxEnrichedRelationship = {
    id: edgeId,
    source: 'formal',
    from: endpoint('orders', 'account_id'),
    to: endpoint('accounts', 'id'),
    relationshipType: 'many_to_one',
    confidence: 1,
    isPrimaryKeyReference: true,
  };

  const { accepted } = buildKtxRelationshipArtifacts({
    connectionId: 'warehouse',
    resolvedRelationships: [resolvedEdge],
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [formalEdge],
      rejected: [],
      skipped: [],
    },
  });

  expect(accepted).toHaveLength(1);
  expect(accepted[0]).toMatchObject({
    id: edgeId,
    source: 'normalized_table_match',
    reasons: expect.arrayContaining(['foreign_key_suffix', 'validation_passed', 'fk_score_passed']),
  });
});
|
||||
|
||||
// A single review-only candidate plus an empty profile (read-only SQL unavailable) must yield
// diagnostics that name validation unavailability as the reason nothing was accepted.
it('explains validation-unavailable review candidates', () => {
  const connectionId = 'warehouse';
  const reviewOnlyEdge = resolvedRelationship({
    id: 'review-edge',
    status: 'review',
    validationReasons: ['validation_unavailable'],
    graphReasons: ['validation_unavailable_review_only'],
  });
  const artifacts = buildKtxRelationshipArtifacts({
    connectionId,
    resolvedRelationships: [reviewOnlyEdge],
  });
  const profile = emptyKtxRelationshipProfileArtifact({
    connectionId,
    driver: 'sqlite',
    reason: 'read_only_sql_unavailable',
  });

  const { summary, noAcceptedReason, candidateCountsBySource, validation, profileWarnings, warnings } =
    buildKtxRelationshipDiagnostics({
      connectionId,
      generatedAt: '2026-05-07T12:00:00.000Z',
      artifacts,
      profile,
      warnings: [
        {
          code: 'connector_capability_missing',
          message: 'KTX scan connector cannot run standalone statistical relationship validation',
          recoverable: true,
          metadata: { capability: 'readOnlySql' },
        },
      ],
      thresholds: { acceptThreshold: 0.85, reviewThreshold: 0.55 },
    });

  expect(summary).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
  expect(noAcceptedReason).toBe('validation unavailable; review candidates written');
  expect(candidateCountsBySource).toEqual({ normalized_table_match: 1 });
  expect(validation).toEqual({
    available: false,
    sqlAvailable: false,
    queryCount: 0,
  });
  expect(profileWarnings).toEqual(['read_only_sql_unavailable']);
  expect(warnings[0]).toMatchObject({ code: 'connector_capability_missing' });
});
|
||||
|
||||
// With no relationships of any kind supplied, the diagnostics must report all-zero counts and
// the "no candidate pairs" explanation.
it('explains empty relationship output as a no-candidate outcome', () => {
  const connectionId = 'warehouse';
  const emptyArtifacts = buildKtxRelationshipArtifacts({ connectionId });
  const notRunProfile = emptyKtxRelationshipProfileArtifact({
    connectionId,
    driver: 'sqlite',
    reason: 'relationship_profiling_not_run',
  });

  const { summary, noAcceptedReason, candidateCountsBySource } = buildKtxRelationshipDiagnostics({
    connectionId,
    generatedAt: '2026-05-07T12:00:00.000Z',
    artifacts: emptyArtifacts,
    profile: notRunProfile,
  });

  expect(summary).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
  expect(noAcceptedReason).toBe('no candidate pairs passed type compatibility');
  expect(candidateCountsBySource).toEqual({});
});
|
||||
|
||||
// An accepted two-column composite relationship must appear in `accepted` with both endpoints'
// column ids and names intact, plus its validation reasons.
it('records composite relationship endpoints in relationship artifacts', () => {
  const edgeId = 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)';

  const { accepted } = buildKtxRelationshipArtifacts({
    connectionId: 'warehouse',
    compositeRelationships: [
      {
        id: edgeId,
        source: 'composite_profile_match',
        status: 'accepted',
        from: {
          tableId: 'order_line_allocations',
          columnIds: ['order_line_allocations.order_id', 'order_line_allocations.line_number'],
          table: { catalog: null, db: null, name: 'order_line_allocations' },
          columns: ['order_id', 'line_number'],
        },
        to: {
          tableId: 'order_lines',
          columnIds: ['order_lines.order_id', 'order_lines.line_number'],
          table: { catalog: null, db: null, name: 'order_lines' },
          columns: ['order_id', 'line_number'],
        },
        relationshipType: 'many_to_one',
        confidence: 0.95,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          childDistinct: 2,
          parentDistinct: 2,
          overlap: 2,
          reasons: ['composite_validation_passed'],
        },
      },
    ],
  });

  const expectedFrom = expect.objectContaining({
    columnIds: ['order_line_allocations.order_id', 'order_line_allocations.line_number'],
    columns: ['order_id', 'line_number'],
  });
  const expectedTo = expect.objectContaining({
    columnIds: ['order_lines.order_id', 'order_lines.line_number'],
    columns: ['order_id', 'line_number'],
  });

  expect(accepted).toEqual([
    expect.objectContaining({
      id: edgeId,
      source: 'composite_profile_match',
      from: expectedFrom,
      to: expectedTo,
      reasons: ['composite_validation_passed'],
      validation: expect.objectContaining({ sourceCoverage: 1 }),
    }),
  ]);
});
|
||||
});
|
||||
683
packages/cli/test/context/scan/relationship-discovery.test.ts
Normal file
683
packages/cli/test/context/scan/relationship-discovery.test.ts
Normal file
|
|
@ -0,0 +1,683 @@
|
|||
import Database from 'better-sqlite3';
|
||||
import { afterEach, describe, expect, it, vi } from 'vitest';
|
||||
import type { KtxLlmRuntimePort } from '../../../src/context/llm/runtime-port.js';
|
||||
import { getDialectForDriver } from '../../../src/context/connections/dialects.js';
|
||||
import { buildDefaultKtxProjectConfig } from '../../../src/context/project/config.js';
|
||||
import { snapshotToKtxEnrichedSchema } from '../../../src/context/scan/local-enrichment.js';
|
||||
import {
|
||||
loadKtxRelationshipBenchmarkFixture,
|
||||
maskKtxRelationshipBenchmarkSnapshot,
|
||||
} from '../../../src/context/scan/relationship-benchmarks.js';
|
||||
import { discoverKtxRelationships } from '../../../src/context/scan/relationship-discovery.js';
|
||||
import { createKtxConnectorCapabilities } from '../../../src/context/scan/types.js';
|
||||
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanConnector, KtxScanContext, KtxSchemaSnapshot } from '../../../src/context/scan/types.js';
|
||||
|
||||
/**
 * Test-only executor over an in-memory SQLite database.
 *
 * Tests seed `db` directly, then pass `executeReadOnly` to the code under test.
 * `queryCount` is incremented on every `executeReadOnly` call.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  queryCount = 0;

  /**
   * Runs `input.sql` as given and returns all rows in header/row-matrix form.
   * The query runs synchronously; only the result is wrapped in a resolved promise.
   *
   * @param input Query description; only `sql` is read.
   * @param _ctx Scan context; unused.
   */
  executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    this.queryCount += 1;
    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    // Header names come from the first row; an empty result yields no headers.
    const columnNames = Object.keys(records[0] ?? {});
    const result = {
      headers: columnNames,
      rows: records.map((record) => columnNames.map((name) => record[name])),
      totalRows: records.length,
      rowCount: records.length,
    };
    return Promise.resolve(result);
  }

  /** Closes the in-memory database. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Returns a fresh two-table SQLite snapshot (`accounts`, `orders`) with no declared primary
 * keys and no foreign keys. `orders.account_id` is the only column that could reference
 * `accounts.id`.
 */
function snapshot(): KtxSchemaSnapshot {
  type Table = KtxSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  // Every column is non-nullable, not a primary key, and uncommented; only name and type vary.
  const column = (name: string, kind: 'integer' | 'text'): Column =>
    kind === 'integer'
      ? {
          name,
          nativeType: 'INTEGER',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: false,
          comment: null,
        }
      : {
          name,
          nativeType: 'TEXT',
          normalizedType: 'text',
          dimensionType: 'string',
          nullable: false,
          primaryKey: false,
          comment: null,
        };

  // Every table is a plain table with no catalog, no db, and no foreign keys.
  const plainTable = (name: string, estimatedRows: number, columns: Column[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      plainTable('accounts', 2, [column('id', 'integer'), column('name', 'text')]),
      plainTable('orders', 3, [column('id', 'integer'), column('account_id', 'integer')]),
    ],
  };
}
|
||||
|
||||
/**
 * Variant of `snapshot()` with declared constraints: `accounts.id` is marked as a primary key
 * and `orders` gains a foreign key from `account_id` to `accounts.id`. Other tables pass
 * through unchanged.
 */
function declaredForeignKeySnapshot(): KtxSchemaSnapshot {
  const base = snapshot();
  return {
    ...base,
    tables: base.tables.map((table) => {
      switch (table.name) {
        case 'accounts':
          // Flip only the `id` column to a declared primary key.
          return {
            ...table,
            columns: table.columns.map((column) => (column.name === 'id' ? { ...column, primaryKey: true } : column)),
          };
        case 'orders':
          return {
            ...table,
            foreignKeys: [
              {
                fromColumn: 'account_id',
                toCatalog: null,
                toDb: null,
                toTable: 'accounts',
                toColumn: 'id',
                constraintName: 'orders_account_id_fkey',
              },
            ],
          };
        default:
          return table;
      }
    }),
  };
}
|
||||
|
||||
/**
 * Returns a fresh snapshot with `dim_countries` (keyed by text `iso_code`) and `fct_accounts`
 * (which carries a text `country_code`). No primary keys or foreign keys are declared.
 */
function naturalKeySnapshot(): KtxSchemaSnapshot {
  type Table = KtxSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  // Fields shared by every column in this fixture.
  const textColumn = (name: string): Column => ({
    name,
    nativeType: 'TEXT',
    normalizedType: 'text',
    dimensionType: 'string',
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  const plainTable = (name: string, estimatedRows: number, columns: Column[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns,
  });

  const countries = plainTable('dim_countries', 3, [textColumn('iso_code'), textColumn('name')]);
  const accounts = plainTable('fct_accounts', 4, [integerColumn('id'), textColumn('country_code')]);

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [countries, accounts],
  };
}
|
||||
|
||||
/**
 * Builds a minimal SQLite scan connector for tests.
 *
 * With an executor, read-only SQL and column stats are advertised and `executeReadOnly` is
 * bound to that executor. With `null`, both capabilities are off and `executeReadOnly` is
 * undefined. Sampling capabilities are always off.
 */
function connector(executor: InMemorySqliteExecutor | null): KtxScanConnector {
  const sqlEnabled = executor !== null;
  const capabilities = createKtxConnectorCapabilities({
    readOnlySql: sqlEnabled,
    columnStats: sqlEnabled,
    tableSampling: false,
    columnSampling: false,
  });

  return {
    id: 'sqlite:test',
    driver: 'sqlite',
    capabilities,
    introspect: async () => snapshot(),
    listSchemas: async () => [],
    listTables: async () => [],
    // Bind so the method keeps its `this` when the connector calls it detached.
    executeReadOnly: executor === null ? undefined : executor.executeReadOnly.bind(executor),
  };
}
|
||||
|
||||
/**
 * Builds a stub LLM runtime whose `generateObject` always resolves to `output`.
 * `generateText` and `runAgentLoop` are bare mocks with no implementation.
 */
function llmRuntime(output: unknown): KtxLlmRuntimePort {
  const generateObject = vi.fn(async () => output) as KtxLlmRuntimePort['generateObject'];
  return {
    generateText: vi.fn(),
    generateObject,
    runAgentLoop: vi.fn(),
  };
}
|
||||
|
||||
/** Returns the relationship-scan settings from a freshly built default project config. */
function relationshipSettings() {
  const { scan } = buildDefaultKtxProjectConfig();
  return scan.relationships;
}
|
||||
|
||||
/**
 * Returns a fresh snapshot with `customers(id)` and `orders(id, buyer_ref)`. All columns are
 * integers, and no primary keys or foreign keys are declared.
 */
function llmOnlyRelationshipSnapshot(): KtxSchemaSnapshot {
  type Table = KtxSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  // Every column in this fixture is a non-nullable, non-key integer.
  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  // Both tables report an estimated two rows.
  const twoRowTable = (name: string, columnNames: string[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows: 2,
    foreignKeys: [],
    columns: columnNames.map(integerColumn),
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [twoRowTable('customers', ['id']), twoRowTable('orders', ['id', 'buyer_ref'])],
  };
}
|
||||
|
||||
describe('production relationship discovery', () => {
|
||||
let executor: InMemorySqliteExecutor | null = null;
|
||||
|
||||
afterEach(() => {
|
||||
executor?.close();
|
||||
executor = null;
|
||||
});
|
||||
|
||||
it('accepts a validated relationship without declared PK or FK metadata', async () => {
|
||||
executor = new InMemorySqliteExecutor();
|
||||
executor.db.exec(`
|
||||
CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
|
||||
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
|
||||
INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
|
||||
INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
|
||||
`);
|
||||
|
||||
const result = await discoverKtxRelationships({
|
||||
connectionId: 'warehouse',
|
||||
dialect: getDialectForDriver('sqlite'),
|
||||
connector: connector(executor),
|
||||
schema: snapshotToKtxEnrichedSchema(snapshot()),
|
||||
context: { runId: 'relationship-run-1' },
|
||||
settings: relationshipSettings(),
|
||||
});
|
||||
|
||||
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
|
||||
expect(result.statisticalValidation).toBe('completed');
|
||||
expect(result.profile.sqlAvailable).toBe(true);
|
||||
expect(result.profile.queryCount).toBeGreaterThan(0);
|
||||
expect(result.relationshipUpdate.accepted).toEqual([
|
||||
expect.objectContaining({
|
||||
from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
|
||||
to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
|
||||
relationshipType: 'many_to_one',
|
||||
source: 'inferred',
|
||||
isPrimaryKeyReference: true,
|
||||
}),
|
||||
]);
|
||||
expect(result.resolvedRelationships[0]).toMatchObject({
|
||||
status: 'accepted',
|
||||
validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
|
||||
graph: expect.objectContaining({ reasons: expect.arrayContaining(['fk_score_passed']) }),
|
||||
});
|
||||
});
|
||||
|
||||
// Verifies that a natural-key join (fct_accounts.country_code -> dim_countries.iso_code)
// is accepted with source 'profile_match' once SQL validation and graph scoring pass.
it('accepts a profile-driven natural-key relationship without declared metadata', async () => {
  // `executor` is not declared here; this assigns the binding owned by the enclosing suite.
  executor = new InMemorySqliteExecutor();
  executor.db.exec(`
    CREATE TABLE dim_countries (iso_code TEXT NOT NULL, name TEXT NOT NULL);
    CREATE TABLE fct_accounts (id INTEGER NOT NULL, country_code TEXT NOT NULL);
    INSERT INTO dim_countries (iso_code, name) VALUES ('US', 'United States'), ('FR', 'France'), ('DE', 'Germany');
    INSERT INTO fct_accounts (id, country_code) VALUES (1, 'US'), (2, 'FR'), (3, 'US'), (4, 'DE');
  `);

  const naturalKeySchema = naturalKeySnapshot();
  const scanConnector = {
    ...connector(executor),
    // Introspection returns the same snapshot the enriched schema is built from.
    introspect: async () => naturalKeySchema,
  };

  const { relationships, relationshipUpdate, resolvedRelationships } = await discoverKtxRelationships({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    connector: scanConnector,
    schema: snapshotToKtxEnrichedSchema(naturalKeySchema),
    context: { runId: 'natural-key-relationship-run' },
    settings: relationshipSettings(),
  });

  const expectedFrom = expect.objectContaining({
    table: expect.objectContaining({ name: 'fct_accounts' }),
    columns: ['country_code'],
  });
  const expectedTo = expect.objectContaining({
    table: expect.objectContaining({ name: 'dim_countries' }),
    columns: ['iso_code'],
  });

  expect(relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
  expect(relationshipUpdate.accepted).toEqual([
    expect.objectContaining({
      from: expectedFrom,
      to: expectedTo,
      relationshipType: 'many_to_one',
      source: 'inferred',
      isPrimaryKeyReference: true,
    }),
  ]);
  expect(resolvedRelationships[0]).toMatchObject({
    source: 'profile_match',
    status: 'accepted',
    validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
    graph: expect.objectContaining({ reasons: expect.arrayContaining(['fk_score_passed']) }),
  });
});
|
||||
|
||||
// Verifies that orders.buyer_ref -> customers.id is accepted from embedding
// similarity alone when LLM proposals are switched off in the settings.
it('accepts an embedding-driven relationship without declared metadata or LLM proposals', async () => {
  // `executor` is not declared here; this assigns the binding owned by the enclosing suite.
  executor = new InMemorySqliteExecutor();
  executor.db.exec(`
    CREATE TABLE customers (id INTEGER NOT NULL, name TEXT NOT NULL);
    CREATE TABLE orders (id INTEGER NOT NULL, buyer_ref INTEGER NOT NULL);
    INSERT INTO customers (id, name) VALUES (1, 'Acme'), (2, 'Orbit'), (3, 'Globex');
    INSERT INTO orders (id, buyer_ref) VALUES (10, 1), (11, 2), (12, 2), (13, 3);
  `);

  const sourceSnapshot = llmOnlyRelationshipSnapshot();
  // The orders.buyer_ref vector is almost identical to the customers.id vector,
  // while the remaining columns sit on different axes.
  const columnEmbeddings = new Map([
    ['customers.id', [1, 0, 0]],
    ['customers.name', [0, 1, 0]],
    ['orders.id', [0, 0, 1]],
    ['orders.buyer_ref', [0.995, 0.005, 0]],
  ]);
  const schema = snapshotToKtxEnrichedSchema(sourceSnapshot, columnEmbeddings);

  const settingsWithoutLlm = {
    ...relationshipSettings(),
    llmProposals: false,
  };

  const discovery = await discoverKtxRelationships({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    connector: {
      ...connector(executor),
      introspect: async () => sourceSnapshot,
    },
    schema,
    context: { runId: 'embedding-relationship-run' },
    settings: settingsWithoutLlm,
  });

  expect(discovery.llmRelationshipValidation).toBe('skipped');
  expect(discovery.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });

  const [firstAccepted] = discovery.relationshipUpdate.accepted;
  expect(firstAccepted).toMatchObject({
    from: { table: { name: 'orders' }, columns: ['buyer_ref'] },
    to: { table: { name: 'customers' }, columns: ['id'] },
  });

  const [firstResolved] = discovery.resolvedRelationships;
  expect(firstResolved).toMatchObject({
    source: 'embedding_similarity',
    status: 'accepted',
    validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
    evidence: expect.objectContaining({
      reasons: expect.arrayContaining(['embedding_similarity', 'target_key_like']),
      embeddingSimilarity: expect.any(Number),
    }),
  });
});
|
||||
|
||||
// Verifies the degraded path: the connector is built from `null` instead of an
// executor, and the single candidate must stay in review with a recoverable warning.
it('keeps candidates review-only when read-only SQL is unavailable', async () => {
  const { relationships, statisticalValidation, relationshipUpdate, resolvedRelationships, warnings } =
    await discoverKtxRelationships({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      connector: connector(null),
      schema: snapshotToKtxEnrichedSchema(snapshot()),
      context: { runId: 'relationship-run-no-sql' },
      settings: relationshipSettings(),
    });

  const expectedWarning = {
    code: 'connector_capability_missing',
    message: 'KTX scan connector cannot run read-only SQL relationship validation',
    recoverable: true,
    metadata: { capability: 'readOnlySql' },
  };

  expect(relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
  expect(statisticalValidation).toBe('skipped');
  expect(relationshipUpdate.accepted).toEqual([]);
  expect(resolvedRelationships[0]).toMatchObject({
    status: 'review',
    validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_unavailable']) }),
  });
  expect(warnings).toContainEqual(expectedWarning);
});
|
||||
|
||||
// Verifies that a declared foreign key is accepted with confidence 1 even though
// statistical validation is skipped (the connector is built from `null`).
it('accepts formal metadata relationships when read-only SQL is unavailable', async () => {
  const declaredSchema = snapshotToKtxEnrichedSchema(declaredForeignKeySnapshot());

  const discovery = await discoverKtxRelationships({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    connector: connector(null),
    schema: declaredSchema,
    context: { runId: 'formal-metadata-no-sql' },
    settings: relationshipSettings(),
  });

  const { relationshipUpdate } = discovery;
  const expectedFormalEdge = expect.objectContaining({
    id: 'orders:(orders.account_id)->accounts:(accounts.id)',
    source: 'formal',
    confidence: 1,
    from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
    to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
  });

  expect(discovery.statisticalValidation).toBe('skipped');
  expect(discovery.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
  // The formal edge appears in the update but not in the resolved list.
  expect(discovery.resolvedRelationships).toEqual([]);
  expect(relationshipUpdate.accepted).toEqual([expectedFormalEdge]);
  expect(relationshipUpdate.rejected).toEqual([]);
  expect(relationshipUpdate.skipped).toEqual([]);
});
|
||||
|
||||
// Verifies that a relationship proposed only by the stubbed LLM runtime is
// accepted, and that the LLM rationale is carried into the resolved evidence.
it('accepts LLM-only relationship proposals only after SQL validation and graph resolution pass', async () => {
  // `executor` is not declared here; this assigns the binding owned by the enclosing suite.
  executor = new InMemorySqliteExecutor();
  executor.db.exec(`
    CREATE TABLE customers (id INTEGER);
    CREATE TABLE orders (id INTEGER, buyer_ref INTEGER);
    INSERT INTO customers (id) VALUES (1), (2);
    INSERT INTO orders (id, buyer_ref) VALUES (10, 1), (11, 2);
  `);

  const proposedPrimaryKey = {
    table: 'customers',
    column: 'id',
    confidence: 0.91,
    rationale: 'Unique customer key.',
  };
  const proposedForeignKey = {
    fromTable: 'orders',
    fromColumn: 'buyer_ref',
    toTable: 'customers',
    toColumn: 'id',
    confidence: 0.89,
    rationale: 'Buyer reference values align with customer identifiers.',
  };
  const llmOutput = {
    pkCandidates: [proposedPrimaryKey],
    fkCandidates: [proposedForeignKey],
  };

  const discovery = await discoverKtxRelationships({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    connector: connector(executor),
    schema: snapshotToKtxEnrichedSchema(llmOnlyRelationshipSnapshot()),
    context: { runId: 'llm-relationship-orchestrator' },
    settings: relationshipSettings(),
    llmRuntime: llmRuntime(llmOutput),
  });

  expect(discovery.llmRelationshipValidation).toBe('completed');
  expect(discovery.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });

  const [firstResolved] = discovery.resolvedRelationships;
  expect(firstResolved).toMatchObject({
    source: 'llm_proposal',
    status: 'accepted',
    evidence: {
      llmRationale: 'Buyer reference values align with customer identifiers.',
    },
  });

  const [firstAccepted] = discovery.relationshipUpdate.accepted;
  expect(firstAccepted).toMatchObject({
    from: { table: { name: 'orders' }, columns: ['buyer_ref'] },
    to: { table: { name: 'customers' }, columns: ['id'] },
  });
});
|
||||
|
||||
// Verifies that a raised acceptThreshold (0.99) keeps an otherwise valid
// relationship in review with the 'fk_score_review' graph reason.
it('uses configured acceptance thresholds when resolving graph relationships', async () => {
  // Test-local executor, named so it does not shadow the suite-level `executor`
  // binding that other tests assign.
  const sqliteExecutor = new InMemorySqliteExecutor();

  // try/finally guarantees the executor is closed even when the discovery call
  // throws or an assertion fails.
  try {
    sqliteExecutor.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts VALUES (1, 'Acme'), (2, 'Orbit');
      INSERT INTO orders VALUES (10, 1), (11, 1), (12, 2);
    `);

    const settings = {
      ...buildDefaultKtxProjectConfig().scan.relationships,
      acceptThreshold: 0.99,
      reviewThreshold: 0.55,
    };

    const result = await discoverKtxRelationships({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      connector: connector(sqliteExecutor),
      schema: snapshotToKtxEnrichedSchema(snapshot()),
      context: { runId: 'configured-thresholds' },
      settings,
    });

    expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
    expect(result.relationshipUpdate.accepted).toEqual([]);
    expect(result.resolvedRelationships[0]).toMatchObject({
      status: 'review',
      graph: { reasons: expect.arrayContaining(['fk_score_review']) },
    });
  } finally {
    sqliteExecutor.close();
  }
});
|
||||
|
||||
// Verifies that maxCandidatesPerColumn: 1 leaves orders.account_id with a single
// resolved target (accounts.id) although account_archive.id is also present.
it('passes maxCandidatesPerColumn into broad deterministic candidate generation', async () => {
  // Test-local executor, named so it does not shadow the suite-level `executor`
  // binding that other tests assign.
  const sqliteExecutor = new InMemorySqliteExecutor();

  // try/finally guarantees the executor is closed even when the discovery call
  // throws or an assertion fails.
  try {
    sqliteExecutor.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
      CREATE TABLE account_archive (id INTEGER NOT NULL, name TEXT NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts VALUES (1, 'Acme'), (2, 'Orbit');
      INSERT INTO account_archive VALUES (99, 'Archive');
      INSERT INTO orders VALUES (10, 1), (11, 1), (12, 2);
    `);

    // Insert the competing table at index 1 of the base snapshot's table list.
    const richSnapshot = snapshot();
    richSnapshot.tables.splice(1, 0, {
      catalog: null,
      db: null,
      name: 'account_archive',
      kind: 'table',
      comment: null,
      estimatedRows: 1,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'INTEGER',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: false,
          comment: null,
        },
        {
          name: 'name',
          nativeType: 'TEXT',
          normalizedType: 'text',
          dimensionType: 'string',
          nullable: false,
          primaryKey: false,
          comment: null,
        },
      ],
    });

    const result = await discoverKtxRelationships({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      connector: {
        ...connector(sqliteExecutor),
        introspect: async () => richSnapshot,
      },
      schema: snapshotToKtxEnrichedSchema(richSnapshot),
      context: { runId: 'candidate-cap' },
      settings: {
        ...buildDefaultKtxProjectConfig().scan.relationships,
        maxCandidatesPerColumn: 1,
      },
    });

    // Collect "table.column" targets for every relationship sourced from account_id.
    const sourceTargets = result.resolvedRelationships
      .filter((relationship) => relationship.from.columns[0] === 'account_id')
      .map((relationship) => `${relationship.to.table.name}.${relationship.to.columns[0]}`);
    expect(sourceTargets).toHaveLength(1);
    expect(sourceTargets).toEqual(['accounts.id']);
  } finally {
    sqliteExecutor.close();
  }
});
|
||||
|
||||
// Runs discovery against the composite-key benchmark fixture, after masking the
// snapshot with 'declared_pks_and_declared_fks_removed', and expects the
// two-column order_line_allocations -> order_lines relationship to be accepted.
it('accepts SQL-validated composite relationships in production relationship-discovery detection', async () => {
  // `URL.pathname` stays percent-encoded and is not a valid filesystem path on
  // Windows, so convert the file URL with fileURLToPath instead. It is loaded
  // locally so this test does not depend on the file's import block.
  const { fileURLToPath } = await import('node:url');
  const fixtureRoot = new URL(
    '../../fixtures/relationship-benchmarks/composite_keys_no_declared_constraints',
    import.meta.url,
  );
  const fixture = await loadKtxRelationshipBenchmarkFixture(fileURLToPath(fixtureRoot));
  const maskedSnapshot = maskKtxRelationshipBenchmarkSnapshot(fixture.snapshot, 'declared_pks_and_declared_fks_removed');
  const database = new Database(fixture.dataPath ?? '', { readonly: true, fileMustExist: true });

  // try/finally guarantees the database handle is released even when discovery
  // throws or an assertion fails.
  try {
    const testConnector: KtxScanConnector = {
      id: 'sqlite:composite',
      driver: 'sqlite',
      capabilities: createKtxConnectorCapabilities({
        readOnlySql: true,
        columnStats: true,
        tableSampling: false,
        columnSampling: false,
      }),
      introspect: async () => maskedSnapshot,
      listSchemas: async () => [],
      listTables: async () => [],
      // Runs the SQL against the fixture database and reshapes the row objects
      // into a headers array plus positional row arrays.
      executeReadOnly: async (input) => {
        const rows = database.prepare(input.sql).all() as Record<string, unknown>[];
        // Headers come from the first row; an empty result yields no headers.
        const headers = Object.keys(rows[0] ?? {});
        return {
          headers,
          rows: rows.map((row) => headers.map((header) => row[header])),
          totalRows: rows.length,
          rowCount: rows.length,
        };
      },
    };

    const result = await discoverKtxRelationships({
      connectionId: maskedSnapshot.connectionId,
      dialect: getDialectForDriver(maskedSnapshot.driver),
      connector: testConnector,
      schema: snapshotToKtxEnrichedSchema(maskedSnapshot, new Map()),
      context: { runId: 'test:production-composite' },
      settings: relationshipSettings(),
    });

    expect(
      result.relationshipUpdate.accepted.map(
        (relationship) =>
          `${relationship.from.table.name}.(${relationship.from.columns.join(',')})->${relationship.to.table.name}.(${relationship.to.columns.join(',')})`,
      ),
    ).toContain('order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)');
    expect(result.relationships.accepted).toBeGreaterThanOrEqual(1);
    expect(result.compositeRelationships.map((relationship) => relationship.status)).toContain('accepted');
  } finally {
    database.close();
  }
});
|
||||
});
|
||||
|
|
@ -0,0 +1,134 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KtxEnrichedRelationship, KtxEnrichedSchema } from '../../../src/context/scan/enrichment-types.js';
|
||||
import { collectKtxFormalMetadataRelationships } from '../../../src/context/scan/relationship-formal-metadata.js';
|
||||
|
||||
/**
 * Builds a two-table enriched schema fixture ('accounts' and 'orders') for the
 * 'warehouse' connection.
 *
 * Each table has a single non-nullable INTEGER column: `accounts.id` (flagged
 * as primary key) and `orders.account_id` (not a primary key).
 *
 * @param relationships Relationship list stored on the schema as-is (same array reference).
 * @returns The enriched schema fixture.
 */
function schema(relationships: KtxEnrichedRelationship[]): KtxEnrichedSchema {
  type FixtureTable = KtxEnrichedSchema['tables'][number];

  // Builds a table that holds exactly one integer column.
  const singleColumnTable = (tableName: string, columnName: string, primaryKey: boolean): FixtureTable => ({
    id: tableName,
    ref: { catalog: null, db: null, name: tableName },
    enabled: true,
    descriptions: {},
    columns: [
      {
        id: `${tableName}.${columnName}`,
        tableId: tableName,
        tableRef: { catalog: null, db: null, name: tableName },
        name: columnName,
        nativeType: 'INTEGER',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey,
        parentColumnId: null,
        descriptions: {},
        embedding: null,
        sampleValues: null,
        cardinality: null,
      },
    ],
  });

  return {
    connectionId: 'warehouse',
    tables: [singleColumnTable('accounts', 'id', true), singleColumnTable('orders', 'account_id', false)],
    relationships,
  };
}
|
||||
|
||||
/**
 * Creates a 'formal' many-to-one relationship fixture from `orders.account_id`
 * to `accounts.id`, with confidence 0.6 and `isPrimaryKeyReference` false.
 *
 * @param overrides Top-level fields that replace the defaults (shallow merge).
 * @returns The relationship fixture with overrides applied last.
 */
function formalRelationship(overrides: Partial<KtxEnrichedRelationship> = {}): KtxEnrichedRelationship {
  const baseline: KtxEnrichedRelationship = {
    id: 'orders:orders.account_id->accounts:accounts.id',
    source: 'formal',
    from: {
      tableId: 'orders',
      columnIds: ['orders.account_id'],
      table: { catalog: null, db: null, name: 'orders' },
      columns: ['account_id'],
    },
    to: {
      tableId: 'accounts',
      columnIds: ['accounts.id'],
      table: { catalog: null, db: null, name: 'accounts' },
      columns: ['id'],
    },
    relationshipType: 'many_to_one',
    confidence: 0.6,
    isPrimaryKeyReference: false,
  };

  return { ...baseline, ...overrides };
}
|
||||
|
||||
// Exercises collectKtxFormalMetadataRelationships on the small accounts/orders fixture schema.
describe('formal metadata relationship collection', () => {
  it('accepts valid formal relationships with ground-truth confidence', () => {
    const validId = 'orders:orders.account_id->accounts:accounts.id';

    const { accepted, skipped, acceptedIds } = collectKtxFormalMetadataRelationships(schema([formalRelationship()]));

    // The fixture goes in with confidence 0.6 and isPrimaryKeyReference false;
    // the collector is expected to report confidence 1 and a primary-key reference.
    expect(accepted).toEqual([
      expect.objectContaining({
        id: validId,
        source: 'formal',
        confidence: 1,
        isPrimaryKeyReference: true,
      }),
    ]);
    expect(skipped).toEqual([]);
    expect(acceptedIds).toEqual(new Set(['orders:orders.account_id->accounts:accounts.id']));
  });

  it('skips duplicate and invalid formal relationships with reasons', () => {
    const duplicate = formalRelationship();
    // Points at a column id that the fixture schema does not define.
    const missingEndpoint = formalRelationship({
      id: 'orders:orders.missing_account_id->accounts:accounts.id',
      from: {
        tableId: 'orders',
        columnIds: ['orders.missing_account_id'],
        table: { catalog: null, db: null, name: 'orders' },
        columns: ['missing_account_id'],
      },
    });
    // Not a 'formal' source; the expected skip list below does not mention it.
    const manualEdge = formalRelationship({
      id: 'manual-edge',
      source: 'manual',
    });

    const outcome = collectKtxFormalMetadataRelationships(
      schema([formalRelationship(), duplicate, missingEndpoint, manualEdge]),
    );

    expect(outcome.accepted).toHaveLength(1);
    expect(outcome.skipped).toEqual([
      {
        relationshipId: 'orders:orders.account_id->accounts:accounts.id',
        reason: 'formal_metadata_duplicate',
      },
      {
        relationshipId: 'orders:orders.missing_account_id->accounts:accounts.id',
        reason: 'formal_metadata_endpoint_not_found',
      },
    ]);
  });
});
|
||||
|
|
@ -0,0 +1,649 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type {
|
||||
KtxEnrichedColumn,
|
||||
KtxEnrichedSchema,
|
||||
KtxEnrichedTable,
|
||||
KtxRelationshipEndpoint,
|
||||
} from '../../../src/context/scan/enrichment-types.js';
|
||||
import type { KtxRelationshipProfileArtifact } from '../../../src/context/scan/relationship-profiling.js';
|
||||
import type { KtxValidatedRelationshipDiscoveryCandidate } from '../../../src/context/scan/relationship-validation.js';
|
||||
import { resolveKtxRelationshipGraph } from '../../../src/context/scan/relationship-graph-resolver.js';
|
||||
|
||||
/**
 * Creates an enriched column fixture.
 *
 * Defaults describe a nullable, non-primary-key INTEGER column whose id is
 * `<tableId>.<name>` and whose table ref is named after `tableId`.
 *
 * @param tableId Owning table id; also used as the default table ref name.
 * @param name Column name.
 * @param overrides Fields that replace the defaults (shallow merge, applied last).
 * @returns The column fixture.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  const defaults: KtxEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };

  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Creates an enabled enriched table fixture.
 *
 * Every column is copied and re-pointed at this table: its `tableId` becomes
 * `name` and its `tableRef` becomes the table's own ref object.
 *
 * @param name Table name, also used as the table id.
 * @param columns Column fixtures to attach; the inputs are not mutated.
 * @returns The table fixture.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const tableRef = { catalog: null, db: null, name };
  const ownedColumns = columns.map((source) => ({ ...source, tableId: name, tableRef }));

  return {
    id: name,
    ref: tableRef,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/**
 * Builds the resolver test schema for the 'warehouse' connection with three
 * tables: `accounts` (id, name), `account_archive` (id) and `users`
 * (id, account_id). No relationships are declared.
 *
 * @param overrides.accountsPrimaryKey Whether `accounts.id` is flagged as a
 *   declared primary key; treated as false when absent.
 * @returns The enriched schema fixture.
 */
function schema(overrides: { accountsPrimaryKey?: boolean } = {}): KtxEnrichedSchema {
  const accountsPrimaryKey = overrides.accountsPrimaryKey ?? false;

  const accounts = table('accounts', [
    column('accounts', 'id', { nullable: false, primaryKey: accountsPrimaryKey }),
    column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  const accountArchive = table('account_archive', [column('account_archive', 'id', { nullable: false })]);
  const users = table('users', [
    column('users', 'id', { nullable: false }),
    column('users', 'account_id', { nullable: false }),
  ]);

  return {
    connectionId: 'warehouse',
    tables: [accounts, accountArchive, users],
    relationships: [],
  };
}
|
||||
|
||||
/**
 * Creates a single-column relationship endpoint fixture.
 *
 * @param tableName Table name, used as both the table id and the ref name.
 * @param columnName Column name; the column id is `<tableName>.<columnName>`.
 * @returns The endpoint fixture.
 */
function endpoint(tableName: string, columnName: string): KtxRelationshipEndpoint {
  const columnId = `${tableName}.${columnName}`;
  const tableRef = { catalog: null, db: null, name: tableName };

  return {
    tableId: tableName,
    columnIds: [columnId],
    table: tableRef,
    columns: [columnName],
  };
}
|
||||
|
||||
/**
 * Builds the relationship profile artifact used by the resolver tests.
 *
 * It reports SQL as available for a 'sqlite' connection, three tables of three
 * rows each, and identical fully-unique, null-free INTEGER column profiles for
 * `accounts.id`, `account_archive.id` and `users.account_id`.
 *
 * @returns A freshly built profile artifact on every call.
 */
function profiles(): KtxRelationshipProfileArtifact {
  type ColumnProfile = KtxRelationshipProfileArtifact['columns'][string];

  const refFor = (name: string) => ({ catalog: null, db: null, name });

  // All three profiled columns share the same statistics; only the table and
  // column names differ.
  const uniqueIntegerProfile = (tableName: string, columnName: string): ColumnProfile => ({
    table: refFor(tableName),
    column: columnName,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 3,
    nullCount: 0,
    distinctCount: 3,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['1', '2', '3'],
    minTextLength: 1,
    maxTextLength: 1,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [
      { table: refFor('accounts'), rowCount: 3 },
      { table: refFor('account_archive'), rowCount: 3 },
      { table: refFor('users'), rowCount: 3 },
    ],
    columns: {
      'accounts.id': uniqueIntegerProfile('accounts', 'id'),
      'account_archive.id': uniqueIntegerProfile('account_archive', 'id'),
      'users.account_id': uniqueIntegerProfile('users', 'account_id'),
    },
    warnings: [],
  };
}
|
||||
|
||||
/**
 * Creates a validated relationship-discovery candidate fixture.
 *
 * Defaults describe an accepted many-to-one candidate from `users.account_id`
 * to `accounts.id` with passing validation statistics. The id is derived from
 * the effective endpoints unless `overrides.id` is given.
 *
 * `evidence` and `validation` overrides are merged field-by-field over the
 * defaults, so a caller may override a single nested field. Previously the
 * trailing top-level spread replaced both merged objects wholesale, which
 * silently dropped every default that a partial override did not repeat.
 *
 * @param overrides Top-level fields replace the defaults; `evidence` and
 *   `validation` are shallow-merged into their default objects.
 * @returns The candidate fixture.
 */
function validatedCandidate(
  overrides: Partial<KtxValidatedRelationshipDiscoveryCandidate> = {},
): KtxValidatedRelationshipDiscoveryCandidate {
  // Split the nested objects out so the top-level spread below cannot clobber
  // the merged evidence/validation.
  const { evidence: evidenceOverrides, validation: validationOverrides, ...topLevelOverrides } = overrides;
  const from = overrides.from ?? endpoint('users', 'account_id');
  const to = overrides.to ?? endpoint('accounts', 'id');

  return {
    id: `${from.tableId}:(${from.columnIds.join(',')})->${to.tableId}:(${to.columnIds.join(',')})`,
    from,
    to,
    relationshipType: 'many_to_one',
    confidence: 0.95,
    source: 'normalized_table_match',
    status: 'accepted',
    score: 0.96,
    ...topLevelOverrides,
    evidence: {
      sourceColumnBase: 'account',
      // Target-derived defaults follow the effective `to` endpoint.
      targetTableBase: to.table.name,
      targetColumnBase: to.columns[0] ?? '',
      targetKeyScore: 0.92,
      nameScore: 0.92,
      reasons: ['foreign_key_suffix', 'normalized_table_name', 'target_key_like'],
      ...evidenceOverrides,
    },
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationCount: 0,
      violationRatio: 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 3,
      parentDistinct: 3,
      overlap: 3,
      checkedValues: 3,
      reasons: ['validation_passed'],
      ...validationOverrides,
    },
  };
}
|
||||
|
||||
describe('relationship graph resolver', () => {
|
||||
it('promotes validated relationship discovery references to accepted relationships and inferred PKs', () => {
|
||||
const result = resolveKtxRelationshipGraph({
|
||||
schema: schema(),
|
||||
profiles: profiles(),
|
||||
candidates: [validatedCandidate()],
|
||||
});
|
||||
|
||||
expect(result.pks).toContainEqual({
|
||||
table: 'accounts',
|
||||
columns: ['id'],
|
||||
pkScore: expect.any(Number),
|
||||
status: 'accepted',
|
||||
incomingCandidateCount: 1,
|
||||
evidence: {
|
||||
declaredPrimaryKey: false,
|
||||
targetUniqueness: 1,
|
||||
incomingAcceptedCount: 1,
|
||||
incomingReviewCount: 0,
|
||||
reasons: expect.arrayContaining(['unique_target_column', 'incoming_validated_reference']),
|
||||
},
|
||||
});
|
||||
expect(result.pks.find((pk) => pk.table === 'accounts')?.pkScore).toBeGreaterThanOrEqual(0.85);
|
||||
expect(result.relationships).toHaveLength(1);
|
||||
expect(result.relationships[0]).toMatchObject({
|
||||
from: { table: { name: 'users' }, columns: ['account_id'] },
|
||||
to: { table: { name: 'accounts' }, columns: ['id'] },
|
||||
status: 'accepted',
|
||||
pkScore: expect.any(Number),
|
||||
fkScore: expect.any(Number),
|
||||
graph: {
|
||||
reasons: expect.arrayContaining(['target_pk_score_passed', 'fk_score_passed']),
|
||||
},
|
||||
});
|
||||
expect(result.relationships[0]?.fkScore).toBeGreaterThanOrEqual(0.85);
|
||||
});
|
||||
|
||||
it('keeps validation-unavailable candidates in review even when name evidence is strong', () => {
|
||||
const result = resolveKtxRelationshipGraph({
|
||||
schema: schema(),
|
||||
profiles: { ...profiles(), sqlAvailable: false, columns: {}, warnings: ['read_only_sql_unavailable'] },
|
||||
candidates: [
|
||||
validatedCandidate({
|
||||
status: 'review',
|
||||
score: 0.57,
|
||||
validation: {
|
||||
targetUniqueness: 0,
|
||||
sourceCoverage: 0,
|
||||
violationCount: 0,
|
||||
violationRatio: 1,
|
||||
sourceNullRate: 0,
|
||||
targetNullRate: 0,
|
||||
childDistinct: 0,
|
||||
parentDistinct: 0,
|
||||
overlap: 0,
|
||||
checkedValues: 0,
|
||||
reasons: ['validation_unavailable'],
|
||||
},
|
||||
}),
|
||||
],
|
||||
});
|
||||
|
||||
expect(result.relationships).toHaveLength(1);
|
||||
expect(result.relationships[0]).toMatchObject({
|
||||
status: 'review',
|
||||
graph: {
|
||||
reasons: expect.arrayContaining(['validation_unavailable_review_only']),
|
||||
},
|
||||
});
|
||||
expect(result.relationships[0]?.fkScore).toBeGreaterThanOrEqual(0.55);
|
||||
});
|
||||
|
||||
it('accepts at most one target per source column and rejects the lower-scored conflict loser', () => {
|
||||
const winner = validatedCandidate({ confidence: 0.95, score: 0.96 });
|
||||
const loser = validatedCandidate({
|
||||
from: endpoint('users', 'account_id'),
|
||||
to: endpoint('account_archive', 'id'),
|
||||
confidence: 0.85,
|
||||
score: 0.9,
|
||||
evidence: {
|
||||
sourceColumnBase: 'account',
|
||||
targetTableBase: 'account_archive',
|
||||
targetColumnBase: 'id',
|
||||
targetKeyScore: 0.92,
|
||||
nameScore: 0.78,
|
||||
reasons: ['foreign_key_suffix', 'inflection', 'target_key_like'],
|
||||
},
|
||||
});
|
||||
|
||||
const result = resolveKtxRelationshipGraph({
|
||||
schema: schema(),
|
||||
profiles: profiles(),
|
||||
candidates: [loser, winner],
|
||||
});
|
||||
|
||||
expect(result.relationships.map((relationship) => relationship.status)).toEqual(['accepted', 'rejected']);
|
||||
expect(result.relationships[0]?.to.table.name).toBe('accounts');
|
||||
expect(result.relationships[1]).toMatchObject({
|
||||
to: { table: { name: 'account_archive' }, columns: ['id'] },
|
||||
status: 'rejected',
|
||||
graph: {
|
||||
reasons: expect.arrayContaining(['conflict_lost']),
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('preserves declared primary keys as accepted even without incoming candidates', () => {
|
||||
const result = resolveKtxRelationshipGraph({
|
||||
schema: schema({ accountsPrimaryKey: true }),
|
||||
profiles: profiles(),
|
||||
candidates: [],
|
||||
});
|
||||
|
||||
expect(result.relationships).toEqual([]);
|
||||
expect(result.pks).toContainEqual({
|
||||
table: 'accounts',
|
||||
columns: ['id'],
|
||||
pkScore: 1,
|
||||
status: 'accepted',
|
||||
incomingCandidateCount: 0,
|
||||
evidence: {
|
||||
declaredPrimaryKey: true,
|
||||
targetUniqueness: 1,
|
||||
incomingAcceptedCount: 0,
|
||||
incomingReviewCount: 0,
|
||||
reasons: ['declared_primary_key'],
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('infers profile-only key-like columns without incoming relationship candidates', () => {
|
||||
const baseSchema = schema();
|
||||
const invoices = table('invoices', [
|
||||
column('invoices', 'id', { nullable: false }),
|
||||
column('invoices', 'invoice_number', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
nullable: false,
|
||||
}),
|
||||
column('invoices', 'amount', {
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
}),
|
||||
]);
|
||||
const baseProfiles = profiles();
|
||||
const result = resolveKtxRelationshipGraph({
|
||||
schema: { ...baseSchema, tables: [...baseSchema.tables, invoices] },
|
||||
profiles: {
|
||||
...baseProfiles,
|
||||
tables: [...baseProfiles.tables, { table: invoices.ref, rowCount: 3 }],
|
||||
columns: {
|
||||
...baseProfiles.columns,
|
||||
'invoices.id': {
|
||||
table: invoices.ref,
|
||||
column: 'id',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['1', '2', '3'],
|
||||
minTextLength: 1,
|
||||
maxTextLength: 1,
|
||||
},
|
||||
'invoices.invoice_number': {
|
||||
table: invoices.ref,
|
||||
column: 'invoice_number',
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['INV-1', 'INV-2', 'INV-3'],
|
||||
minTextLength: 5,
|
||||
maxTextLength: 5,
|
||||
},
|
||||
'invoices.amount': {
|
||||
table: invoices.ref,
|
||||
column: 'amount',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 2 / 3,
|
||||
nullRate: 0,
|
||||
sampleValues: ['100', '200'],
|
||||
minTextLength: 3,
|
||||
maxTextLength: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
candidates: [],
|
||||
});
|
||||
|
||||
expect(result.relationships).toEqual([]);
|
||||
expect(result.pks).toContainEqual({
|
||||
table: 'invoices',
|
||||
columns: ['id'],
|
||||
pkScore: 1,
|
||||
status: 'accepted',
|
||||
incomingCandidateCount: 0,
|
||||
evidence: {
|
||||
declaredPrimaryKey: false,
|
||||
targetUniqueness: 1,
|
||||
incomingAcceptedCount: 0,
|
||||
incomingReviewCount: 0,
|
||||
reasons: expect.arrayContaining([
|
||||
'unique_target_column',
|
||||
'profile_key_name',
|
||||
'not_null_profile',
|
||||
'profile_only_primary_key',
|
||||
'no_incoming_references',
|
||||
]),
|
||||
},
|
||||
});
|
||||
expect(result.pks).toContainEqual(
|
||||
expect.objectContaining({
|
||||
table: 'invoices',
|
||||
columns: ['invoice_number'],
|
||||
status: 'review',
|
||||
evidence: expect.objectContaining({
|
||||
reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(result.pks.some((pk) => pk.table === 'invoices' && pk.columns[0] === 'amount')).toBe(false);
|
||||
});
|
||||
|
||||
it('pins single-incoming column_suffix_match resolver scores', () => {
  // Each call yields a fresh ref object, so no two fixtures share identity.
  const plansRef = () => ({ catalog: null, db: null, name: 'stg_plans' });
  const segmentsRef = () => ({ catalog: null, db: null, name: 'mart_account_segments' });

  // Two single-column text tables; neither column is a declared primary key.
  const enrichedSchema = {
    connectionId: 'warehouse',
    relationships: [],
    tables: [
      {
        id: 'plans-id',
        ref: plansRef(),
        enabled: true,
        descriptions: {},
        columns: [
          {
            id: 'plan-code-col',
            tableId: 'plans-id',
            tableRef: plansRef(),
            name: 'plan_code',
            nativeType: 'TEXT',
            normalizedType: 'text',
            dimensionType: 'string',
            nullable: false,
            primaryKey: false,
            parentColumnId: null,
            descriptions: {},
            embedding: null,
            sampleValues: null,
            cardinality: null,
          },
        ],
      },
      {
        id: 'segments-id',
        ref: segmentsRef(),
        enabled: true,
        descriptions: {},
        columns: [
          {
            id: 'current-plan-code-col',
            tableId: 'segments-id',
            tableRef: segmentsRef(),
            name: 'current_plan_code',
            nativeType: 'TEXT',
            normalizedType: 'text',
            dimensionType: 'string',
            nullable: false,
            primaryKey: false,
            parentColumnId: null,
            descriptions: {},
            embedding: null,
            sampleValues: null,
            cardinality: null,
          },
        ],
      },
    ],
  } satisfies KtxEnrichedSchema;

  // Only the target column is profiled: four rows, four distinct non-null values.
  const profileArtifact = {
    connectionId: 'warehouse',
    driver: 'sqlite' as const,
    sqlAvailable: true,
    queryCount: 0,
    tables: [],
    warnings: [],
    columns: {
      'stg_plans.plan_code': {
        table: plansRef(),
        column: 'plan_code',
        nativeType: 'TEXT',
        normalizedType: 'text',
        rowCount: 4,
        nullCount: 0,
        distinctCount: 4,
        uniquenessRatio: 1,
        nullRate: 0,
        sampleValues: ['basic', 'enterprise', 'free', 'pro'],
        minTextLength: 4,
        maxTextLength: 10,
      },
    },
  };

  // A single, already-accepted incoming candidate pointing at stg_plans.plan_code.
  const { pks, relationships } = resolveKtxRelationshipGraph({
    schema: enrichedSchema,
    profiles: profileArtifact,
    candidates: [
      {
        id: 'segments:(current_plan_code)->plans:(plan_code)',
        from: {
          tableId: 'segments-id',
          columnIds: ['current-plan-code-col'],
          table: segmentsRef(),
          columns: ['current_plan_code'],
        },
        to: {
          tableId: 'plans-id',
          columnIds: ['plan-code-col'],
          table: plansRef(),
          columns: ['plan_code'],
        },
        relationshipType: 'many_to_one',
        confidence: 0.902,
        source: 'column_suffix_match',
        evidence: {
          sourceColumnBase: 'current_plan',
          targetTableBase: 'plan',
          targetColumnBase: 'plan_code',
          targetKeyScore: 0.86,
          nameScore: 0.78,
          reasons: ['column_suffix_match', 'profile_unique_target'],
        },
        status: 'accepted',
        score: 0.98,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          sourceNullRate: 0,
          targetNullRate: 0,
          childDistinct: 4,
          parentDistinct: 4,
          overlap: 4,
          checkedValues: 4,
          reasons: ['validation_passed'],
        },
      },
    ],
  });

  // The exact scores are pinned on purpose: this test guards the resolver's numeric output.
  expect(pks).toEqual([
    expect.objectContaining({
      table: 'stg_plans',
      columns: ['plan_code'],
      pkScore: 0.922,
      status: 'accepted',
    }),
  ]);
  expect(relationships).toEqual([
    expect.objectContaining({
      source: 'column_suffix_match',
      status: 'accepted',
      pkScore: 0.922,
      fkScore: 0.953,
    }),
  ]);
});
|
||||
|
||||
it('keeps strong profile-only primary key evidence when name evidence is weak', () => {
  // Fresh ref per use, mirroring the separate literals each fixture carried.
  const eventsRef = () => ({ catalog: null, db: null, name: 'events' });

  // Extend the shared schema with an `events` table whose only column is a
  // non-null integer that is not declared as a primary key.
  const enriched = schema();
  enriched.tables.push(
    table('events', [
      column('events', 'warehouse_key', {
        nullable: false,
        primaryKey: false,
        nativeType: 'INTEGER',
        normalizedType: 'integer',
      }),
    ]),
  );

  // Profile: three rows, three distinct values, no nulls.
  const profiled = profiles();
  profiled.tables.push({ table: eventsRef(), rowCount: 3 });
  profiled.columns['events.warehouse_key'] = {
    table: eventsRef(),
    column: 'warehouse_key',
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 3,
    nullCount: 0,
    distinctCount: 3,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['100', '101', '102'],
    minTextLength: 3,
    maxTextLength: 3,
  };

  const { pks } = resolveKtxRelationshipGraph({
    schema: enriched,
    profiles: profiled,
    candidates: [],
  });

  // The candidate must be present, but only at `review` status with the weak-name reason attached.
  const expectedPk = expect.objectContaining({
    table: 'events',
    columns: ['warehouse_key'],
    status: 'review',
    evidence: expect.objectContaining({
      reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
    }),
  });
  expect(pks).toEqual(expect.arrayContaining([expectedPk]));
});
|
||||
|
||||
it('keeps strong profile-only primary key evidence when the column is not key-shaped', () => {
  // Fresh ref per use, mirroring the separate literals each fixture carried.
  const eventsRef = () => ({ catalog: null, db: null, name: 'events' });

  // `opaque_reference` has no id/key-like suffix; only the profile hints at a key.
  const enriched = schema();
  enriched.tables.push(
    table('events', [
      column('events', 'opaque_reference', {
        nullable: false,
        primaryKey: false,
        nativeType: 'INTEGER',
        normalizedType: 'integer',
      }),
    ]),
  );

  // Profile: three rows, three distinct values, no nulls.
  const profiled = profiles();
  profiled.tables.push({ table: eventsRef(), rowCount: 3 });
  profiled.columns['events.opaque_reference'] = {
    table: eventsRef(),
    column: 'opaque_reference',
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 3,
    nullCount: 0,
    distinctCount: 3,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['100', '101', '102'],
    minTextLength: 3,
    maxTextLength: 3,
  };

  const { pks } = resolveKtxRelationshipGraph({
    schema: enriched,
    profiles: profiled,
    candidates: [],
  });

  const eventsPk = pks.find((candidate) => candidate.table === 'events');
  expect(eventsPk).toMatchObject({
    table: 'events',
    columns: ['opaque_reference'],
    status: 'review',
    evidence: expect.objectContaining({
      reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
    }),
  });
  // Lower bound on the score the profile-only candidate must still reach.
  expect(eventsPk?.pkScore).toBeGreaterThanOrEqual(0.55);
});
|
||||
});
|
||||
214
packages/cli/test/context/scan/relationship-llm-proposal.test.ts
Normal file
214
packages/cli/test/context/scan/relationship-llm-proposal.test.ts
Normal file
|
|
@ -0,0 +1,214 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { KtxLlmRuntimePort } from '../../../src/context/llm/runtime-port.js';
|
||||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from '../../../src/context/scan/enrichment-types.js';
|
||||
import type { KtxRelationshipProfileArtifact } from '../../../src/context/scan/relationship-profiling.js';
|
||||
import { proposeKtxRelationshipCandidatesWithLlm } from '../../../src/context/scan/relationship-llm-proposal.js';
|
||||
|
||||
/**
 * Builds a mocked LLM runtime port for tests.
 *
 * `generateObject` resolves to `output` on every call; `generateText` and
 * `runAgentLoop` are bare mocks with no implementation.
 */
function llmRuntime(output?: unknown): KtxLlmRuntimePort {
  const generateText = vi.fn();
  const generateObject = vi.fn(async () => output) as KtxLlmRuntimePort['generateObject'];
  const runAgentLoop = vi.fn();
  return { generateText, generateObject, runAgentLoop };
}
|
||||
|
||||
/**
 * Builds an enriched column fixture.
 *
 * Defaults describe a nullable, non-primary-key INTEGER column whose id is
 * `<tableId>.<name>` and whose table ref is `{ catalog: null, db: null, name: tableId }`.
 *
 * @param tableId   Owning table id; also used as the default table ref name.
 * @param name      Column name.
 * @param overrides Any column fields to override. Fields that are absent or
 *                  nullish fall back to the defaults.
 * @returns A complete column object.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  return {
    // Spread first so fields not listed below still pass through, while every
    // listed field resolves through `??`. Spreading last (as before) let an
    // explicit `undefined` override wipe out the default.
    ...overrides,
    id: overrides.id ?? `${tableId}.${name}`,
    tableId: overrides.tableId ?? tableId,
    tableRef: overrides.tableRef ?? { catalog: null, db: null, name: tableId },
    name: overrides.name ?? name,
    nativeType: overrides.nativeType ?? 'INTEGER',
    normalizedType: overrides.normalizedType ?? 'integer',
    dimensionType: overrides.dimensionType ?? 'number',
    nullable: overrides.nullable ?? true,
    primaryKey: overrides.primaryKey ?? false,
    parentColumnId: overrides.parentColumnId ?? null,
    descriptions: overrides.descriptions ?? {},
    embedding: overrides.embedding ?? null,
    sampleValues: overrides.sampleValues ?? null,
    cardinality: overrides.cardinality ?? null,
  };
}
|
||||
|
||||
/**
 * Builds an enabled table fixture whose id and ref name are both `name`.
 *
 * Every supplied column is shallow-copied and re-pointed at this table
 * (`tableId` and `tableRef` are overwritten), and all copies share one ref object.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns = columns.map((item) => ({ ...item, tableId: name, tableRef: ref }));
  return { id: name, ref, enabled: true, descriptions: {}, columns: ownedColumns };
}
|
||||
|
||||
/**
 * Builds the two-table schema fixture used by the LLM proposal tests:
 * `customers(id, email)` and `orders(id, buyer_ref)`, with no declared relationships.
 */
function schema(): KtxEnrichedSchema {
  const customers = table('customers', [
    column('customers', 'id', { nullable: false }),
    column('customers', 'email', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  const orders = table('orders', [
    column('orders', 'id', { nullable: false }),
    // Deliberately not named `customer_id`: the name alone does not reveal the target.
    column('orders', 'buyer_ref'),
  ]);

  return {
    connectionId: 'warehouse',
    relationships: [],
    tables: [customers, orders],
  };
}
|
||||
|
||||
/**
 * Builds the profiling artifact that accompanies the schema fixture: two rows
 * per table, plus column profiles for `customers.id` and `orders.buyer_ref`.
 */
function profile(): KtxRelationshipProfileArtifact {
  const ref = (name: string) => ({ catalog: null, db: null, name });

  // Both profiled columns have the same statistics (two rows, values '1' and '2',
  // fully unique, no nulls); only the owning table and column name differ.
  const integerKeyProfile = (
    tableName: string,
    columnName: string,
  ): KtxRelationshipProfileArtifact['columns'][string] => ({
    table: ref(tableName),
    column: columnName,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 2,
    nullCount: 0,
    distinctCount: 2,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['1', '2'],
    minTextLength: 1,
    maxTextLength: 1,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 4,
    warnings: [],
    tables: [
      { table: ref('customers'), rowCount: 2 },
      { table: ref('orders'), rowCount: 2 },
    ],
    columns: {
      'customers.id': integerKeyProfile('customers', 'id'),
      'orders.buyer_ref': integerKeyProfile('orders', 'buyer_ref'),
    },
  };
}
|
||||
|
||||
describe('relationship LLM proposals', () => {
  it('maps valid structured FK proposals into review candidates with rationale evidence', async () => {
    const systemPreamble = 'You are helping KTX review possible SQL relationships';
    const fkRationale = 'Buyer reference values match customer identifiers.';

    // The mocked model proposes one primary key and one foreign key that targets it.
    const runtime = llmRuntime({
      pkCandidates: [{ table: 'customers', column: 'id', confidence: 0.94, rationale: 'Unique customer identifier.' }],
      fkCandidates: [
        {
          fromTable: 'orders',
          fromColumn: 'buyer_ref',
          toTable: 'customers',
          toColumn: 'id',
          confidence: 0.88,
          rationale: fkRationale,
        },
      ],
    });

    const outcome = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: runtime,
    });

    expect(outcome.summary).toBe('completed');
    expect(outcome.llmCalls).toBe(1);
    expect(outcome.warnings).toEqual([]);
    expect(outcome.candidates).toHaveLength(1);
    expect(outcome.candidates[0]).toMatchObject({
      from: { tableId: 'orders', columnIds: ['orders.buyer_ref'], columns: ['buyer_ref'] },
      to: { tableId: 'customers', columnIds: ['customers.id'], columns: ['id'] },
      source: 'llm_proposal',
      status: 'review',
      evidence: {
        llmConfidence: 0.88,
        llmRationale: fkRationale,
        reasons: ['llm_proposal', 'llm_pk_proposal'],
      },
    });

    // The instructions must travel in `system`, never duplicated into `prompt`.
    expect(runtime.generateObject).toHaveBeenCalledWith(
      expect.objectContaining({
        role: 'candidateExtraction',
        system: expect.stringContaining(systemPreamble),
        prompt: expect.stringContaining('"tables"'),
      }),
    );
    const firstCallArgs = vi.mocked(runtime.generateObject).mock.calls[0]?.[0];
    expect(firstCallArgs?.prompt).not.toContain(systemPreamble);
  });

  it('skips when no runtime is configured', async () => {
    const outcome = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: null,
    });

    expect(outcome).toMatchObject({ candidates: [], llmCalls: 0, summary: 'skipped' });
    expect(outcome.warnings).toEqual([]);
  });

  it('returns recoverable warnings for invalid references and generation failures', async () => {
    // Scenario 1: the model references a column that does not exist in the schema.
    const unknownColumnRuntime = llmRuntime({
      pkCandidates: [],
      fkCandidates: [
        {
          fromTable: 'orders',
          fromColumn: 'missing_column',
          toTable: 'customers',
          toColumn: 'id',
          confidence: 0.7,
          rationale: 'Invalid source column.',
        },
      ],
    });
    const invalidReference = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: unknownColumnRuntime,
    });

    expect(invalidReference.candidates).toEqual([]);
    expect(invalidReference.summary).toBe('completed');
    expect(invalidReference.warnings[0]).toMatchObject({
      code: 'relationship_llm_invalid_reference',
      recoverable: true,
    });

    // Scenario 2: the model call itself rejects.
    const failed = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: {
        generateText: vi.fn(),
        generateObject: vi.fn(async () => {
          throw new Error('model unavailable');
        }),
        runAgentLoop: vi.fn(),
      },
    });

    expect(failed).toMatchObject({ candidates: [], llmCalls: 1, summary: 'failed' });
    expect(failed.warnings[0]).toMatchObject({
      code: 'relationship_llm_proposal_failed',
      message: 'KTX relationship LLM proposal failed: model unavailable',
      recoverable: true,
    });
  });
});
|
||||
151
packages/cli/test/context/scan/relationship-locality.test.ts
Normal file
151
packages/cli/test/context/scan/relationship-locality.test.ts
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KtxEnrichedColumn, KtxEnrichedTable } from '../../../src/context/scan/enrichment-types.js';
|
||||
import { localCandidateTables } from '../../../src/context/scan/relationship-locality.js';
|
||||
|
||||
/**
 * Builds an enriched column fixture with an explicit column id.
 *
 * Unspecified or nullish options fall back to a nullable, non-primary-key
 * INTEGER column whose table ref is `{ catalog: null, db: 'public', name: tableId }`.
 *
 * @param tableId Owning table id; also the default table ref name.
 * @param id      Column id.
 * @param name    Column name.
 * @param options Per-field overrides.
 */
function column(tableId: string, id: string, name: string, options: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  const built: KtxEnrichedColumn = {
    id,
    tableId,
    tableRef: options.tableRef ?? { catalog: null, db: 'public', name: tableId },
    name,
    nativeType: options.nativeType ?? 'INTEGER',
    normalizedType: options.normalizedType ?? 'integer',
    dimensionType: options.dimensionType ?? 'number',
    nullable: options.nullable ?? true,
    primaryKey: options.primaryKey ?? false,
    parentColumnId: options.parentColumnId ?? null,
    descriptions: options.descriptions ?? {},
    embedding: options.embedding ?? null,
    sampleValues: options.sampleValues ?? null,
    cardinality: options.cardinality ?? null,
  };
  return built;
}
|
||||
|
||||
/**
 * Builds an enabled table fixture in the `public` db.
 *
 * Every supplied column is shallow-copied and re-pointed at this table
 * (`tableId` becomes `id`, `tableRef` becomes the table's own ref object).
 */
function table(id: string, name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: 'public', name };
  const ownedColumns = columns.map((item) => ({ ...item, tableId: id, tableRef: ref }));
  return { id, ref, enabled: true, descriptions: {}, columns: ownedColumns };
}
|
||||
|
||||
describe('relationship locality', () => {
  it('ranks the referenced parent table ahead of the child table for id-like source columns', () => {
    const artistTable = table('artist-id', 'Artist', [column('artist-id', 'artist-pk', 'ArtistId')]);
    const albumTable = table('album-id', 'Album', [
      column('album-id', 'album-pk', 'AlbumId'),
      column('album-id', 'artist-fk', 'ArtistId'),
    ]);
    const invoiceTable = table('invoice-id', 'Invoice', [column('invoice-id', 'invoice-pk', 'InvoiceId')]);

    // Album.ArtistId is the child column; with a cap of one, only Artist may survive.
    const ranked = localCandidateTables({
      childTable: albumTable,
      childColumn: albumTable.columns[1]!,
      parentTables: [albumTable, invoiceTable, artistTable],
      maxParentTables: 1,
    });
    const rankedNames = ranked.map((item) => item.table.ref.name);

    expect(rankedNames).toEqual(['Artist']);
    expect(ranked[0]).toMatchObject({
      score: expect.any(Number),
      tokenScore: expect.any(Number),
      embeddingScore: 0,
      reasons: expect.arrayContaining(['column_table_token_overlap']),
    });
  });

  it('uses singular and plural variants so plan_code can rank stg_plans', () => {
    const planTable = table('plans-id', 'stg_plans', [column('plans-id', 'plan-code', 'plan_code')]);
    const segmentTable = table('segments-id', 'mart_account_segments', [
      column('segments-id', 'current-plan-code', 'current_plan_code', {
        nativeType: 'TEXT',
        normalizedType: 'text',
        dimensionType: 'string',
      }),
    ]);
    const accountTable = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);

    const ranked = localCandidateTables({
      childTable: segmentTable,
      childColumn: segmentTable.columns[0]!,
      parentTables: [accountTable, segmentTable, planTable],
      maxParentTables: 1,
    });
    const rankedNames = ranked.map((item) => item.table.ref.name);

    expect(rankedNames).toEqual(['stg_plans']);
    expect(ranked[0]?.tokenScore).toBeGreaterThan(0);
  });

  it('returns all tables when the schema is smaller than the default locality cap', () => {
    const accountTable = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);
    const invoiceTable = table('invoices-id', 'invoices', [
      column('invoices-id', 'invoice-id', 'id'),
      column('invoices-id', 'account-id', 'account_id'),
    ]);

    // No `maxParentTables` here: the default cap applies.
    const ranked = localCandidateTables({
      childTable: invoiceTable,
      childColumn: invoiceTable.columns[1]!,
      parentTables: [invoiceTable, accountTable],
    });

    // Order is not asserted, only membership.
    const sortedNames = ranked.map((item) => item.table.ref.name).sort();
    expect(sortedNames).toEqual(['accounts', 'invoices']);
  });

  it('supports an explicit zero cap for deterministic tests', () => {
    const accountTable = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);
    const invoiceTable = table('invoices-id', 'invoices', [
      column('invoices-id', 'invoice-id', 'id'),
      column('invoices-id', 'account-id', 'account_id'),
    ]);

    const ranked = localCandidateTables({
      childTable: invoiceTable,
      childColumn: invoiceTable.columns[1]!,
      parentTables: [invoiceTable, accountTable],
      maxParentTables: 0,
    });

    expect(ranked).toEqual([]);
  });

  it('uses parent-column embeddings when token locality is weak', () => {
    const textColumn = { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' } as const;

    const customerTable = table('customers-id', 'customers', [
      column('customers-id', 'customers-id-col', 'id', { embedding: [1, 0, 0] }),
      column('customers-id', 'customers-name-col', 'name', { ...textColumn, embedding: [0, 1, 0] }),
    ]);
    const orderTable = table('orders-id', 'orders', [
      column('orders-id', 'orders-id-col', 'id', { embedding: [0, 0, 1] }),
      // Nearly parallel to customers.id's embedding; the name shares no tokens with `customers`.
      column('orders-id', 'buyer-ref-col', 'buyer_ref', { embedding: [0.995, 0.005, 0] }),
    ]);
    const invoiceTable = table('invoices-id', 'invoices', [column('invoices-id', 'invoice-id', 'id')]);

    const ranked = localCandidateTables({
      childTable: orderTable,
      childColumn: orderTable.columns[1]!,
      parentTables: [invoiceTable, customerTable],
      maxParentTables: 1,
    });
    const rankedNames = ranked.map((item) => item.table.ref.name);

    expect(rankedNames).toEqual(['customers']);
    expect(ranked[0]).toMatchObject({
      embeddingScore: expect.any(Number),
      reasons: expect.arrayContaining(['embedding_similarity']),
    });
    expect(ranked[0]!.embeddingScore).toBeGreaterThan(0.99);
  });
});
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
normalizeKtxRelationshipName,
|
||||
pluralizeKtxRelationshipToken,
|
||||
singularizeKtxRelationshipToken,
|
||||
tokenSimilarity,
|
||||
tokenizeKtxRelationshipName,
|
||||
} from '../../../src/context/scan/relationship-name-similarity.js';
|
||||
|
||||
describe('relationship name similarity', () => {
  // Pairs of raw name and the subset of the normalized result it must match.
  type NormalizationCase = [input: string, expected: Record<string, unknown>];

  // Runs the cases in order and stops at the first mismatch, like sequential expects.
  const expectNormalized = (cases: NormalizationCase[]): void => {
    for (const [input, expected] of cases) {
      expect(normalizeKtxRelationshipName(input)).toMatchObject(expected);
    }
  };

  it('tokenizes common warehouse naming styles', () => {
    expectNormalized([
      ['AlbumId', { normalized: 'album_id', singular: 'album_id', plural: 'album_ids', tokens: ['album', 'id'] }],
      ['artistID', { normalized: 'artist_id', tokens: ['artist', 'id'] }],
      [
        'SalesLT.CustomerID',
        {
          normalized: 'sales_lt_customer_id',
          singular: 'sales_lt_customer_id',
          tokens: ['sales', 'lt', 'customer', 'id'],
        },
      ],
      ['SCREAMING_CUSTOMER_UUID', { normalized: 'screaming_customer_uuid', tokens: ['screaming', 'customer', 'uuid'] }],
      ['billing-account-key', { normalized: 'billing_account_key', tokens: ['billing', 'account', 'key'] }],
    ]);
  });

  it('removes only leading warehouse layer prefixes', () => {
    expectNormalized([
      [
        'mart__Sales_Accounts',
        { normalized: 'sales_accounts', singular: 'sales_account', plural: 'sales_accounts', tokens: ['sales', 'accounts'] },
      ],
      ['dim_users', { normalized: 'users', singular: 'user', plural: 'users', tokens: ['users'] }],
      // `dim` in the middle of a name is an ordinary token and must be kept.
      ['customer_dim_id', { normalized: 'customer_dim_id', tokens: ['customer', 'dim', 'id'] }],
    ]);
  });

  it('folds accents and preserves non-suffix trailing s words', () => {
    expectNormalized([['KundénID', { normalized: 'kunden_id', tokens: ['kunden', 'id'] }]]);

    const singularCases: Array<[string, string]> = [
      ['address', 'address'],
      ['addresses', 'address'],
      ['status', 'status'],
    ];
    for (const [word, expected] of singularCases) {
      expect(singularizeKtxRelationshipToken(word)).toBe(expected);
    }

    const pluralCases: Array<[string, string]> = [
      ['address', 'addresses'],
      ['company', 'companies'],
    ];
    for (const [word, expected] of pluralCases) {
      expect(pluralizeKtxRelationshipToken(word)).toBe(expected);
    }
  });

  it('returns deterministic tokens for direct tokenization calls', () => {
    const tokenCases: Array<[string, string[]]> = [
      ['HTTPResponseCode', ['http', 'response', 'code']],
      ['customer2AddressID', ['customer', '2', 'address', 'id']],
    ];
    for (const [input, expected] of tokenCases) {
      expect(tokenizeKtxRelationshipName(input)).toEqual(expected);
    }
  });

  it('scores token overlap and ordered suffix similarity', () => {
    expect(tokenSimilarity('artist_id', 'artist_id')).toBe(1);
    expect(tokenSimilarity('Album.ArtistId', 'ArtistID')).toBeGreaterThanOrEqual(0.74);

    // A shared trailing `account_id` must outrank a name that only shares `id`.
    const unrelatedScore = tokenSimilarity('customer_account_id', 'invoice_id');
    expect(tokenSimilarity('customer_account_id', 'account_id')).toBeGreaterThan(unrelatedScore);

    expect(tokenSimilarity('', 'artist')).toBe(0);
  });
});
|
||||
430
packages/cli/test/context/scan/relationship-profiling.test.ts
Normal file
430
packages/cli/test/context/scan/relationship-profiling.test.ts
Normal file
|
|
@ -0,0 +1,430 @@
|
|||
import { readFile } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import Database from 'better-sqlite3';
|
||||
import { afterEach, describe, expect, it, vi } from 'vitest';
|
||||
import { getDialectForDriver } from '../../../src/context/connections/dialects.js';
|
||||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from '../../../src/context/scan/enrichment-types.js';
|
||||
import { snapshotToKtxEnrichedSchema } from '../../../src/context/scan/local-enrichment.js';
|
||||
import { loadKtxRelationshipBenchmarkFixture, maskKtxRelationshipBenchmarkSnapshot } from '../../../src/context/scan/relationship-benchmarks.js';
|
||||
import { createKtxRelationshipProfileCache, profileKtxRelationshipSchema } from '../../../src/context/scan/relationship-profiling.js';
|
||||
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanContext } from '../../../src/context/scan/types.js';
|
||||
|
||||
/**
 * Test executor backed by a private in-memory SQLite database.
 *
 * Tests seed data through `db` directly and read `queryCount` to assert how
 * many statements the code under test issued.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  queryCount = 0;

  /**
   * Runs `input.sql` and returns the rows in positional form.
   *
   * Declared `async` so that a statement that fails to prepare or run surfaces
   * as a rejected promise instead of a synchronous throw from a
   * Promise-returning method.
   *
   * Headers are taken from the keys of the first row, so an empty result has
   * no headers.
   */
  async executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    // Counted before execution, so failing statements are counted too.
    this.queryCount += 1;
    const rows = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const headers = Object.keys(rows[0] ?? {});
    return {
      headers,
      rows: rows.map((row) => headers.map((header) => row[header])),
      totalRows: rows.length,
      rowCount: rows.length,
    };
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Test executor backed by an existing SQLite file, opened read-only.
 *
 * `queryCount` records how many statements were issued through `executeReadOnly`.
 */
class FileSqliteExecutor {
  readonly db: Database.Database;
  queryCount = 0;

  /**
   * @param dataPath Path to the SQLite file. Opened with `fileMustExist`, so a
   *                 missing file is an error rather than a new empty database.
   */
  constructor(dataPath: string) {
    this.db = new Database(dataPath, { readonly: true, fileMustExist: true });
  }

  /**
   * Runs `input.sql` and returns the rows in positional form.
   *
   * Declared `async` so that a statement that fails to prepare or run surfaces
   * as a rejected promise instead of a synchronous throw from a
   * Promise-returning method.
   *
   * Headers are taken from the keys of the first row, so an empty result has
   * no headers.
   */
  async executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    // Counted before execution, so failing statements are counted too.
    this.queryCount += 1;
    const rows = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const headers = Object.keys(rows[0] ?? {});
    return {
      headers,
      rows: rows.map((row) => headers.map((header) => row[header])),
      totalRows: rows.length,
      rowCount: rows.length,
    };
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Builds an enriched column fixture.
 *
 * Defaults describe a nullable, non-primary-key INTEGER column whose id is
 * `<tableId>.<name>` and whose table ref is `{ catalog: null, db: null, name: tableId }`.
 *
 * @param tableId   Owning table id; also used as the default table ref name.
 * @param name      Column name.
 * @param overrides Any column fields to override. Fields that are absent or
 *                  nullish fall back to the defaults.
 * @returns A complete column object.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  return {
    // Spread first so fields not listed below still pass through, while every
    // listed field resolves through `??`. Spreading last (as before) let an
    // explicit `undefined` override wipe out the default.
    ...overrides,
    id: overrides.id ?? `${tableId}.${name}`,
    tableId: overrides.tableId ?? tableId,
    tableRef: overrides.tableRef ?? { catalog: null, db: null, name: tableId },
    name: overrides.name ?? name,
    nativeType: overrides.nativeType ?? 'INTEGER',
    normalizedType: overrides.normalizedType ?? 'integer',
    dimensionType: overrides.dimensionType ?? 'number',
    nullable: overrides.nullable ?? true,
    primaryKey: overrides.primaryKey ?? false,
    parentColumnId: overrides.parentColumnId ?? null,
    descriptions: overrides.descriptions ?? {},
    embedding: overrides.embedding ?? null,
    sampleValues: overrides.sampleValues ?? null,
    cardinality: overrides.cardinality ?? null,
  };
}
|
||||
|
||||
/**
 * Builds an enabled table fixture whose id and ref name are both `name`.
 *
 * Every supplied column is shallow-copied and re-pointed at this table
 * (`tableId` and `tableRef` are overwritten), and all copies share one ref object.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns = columns.map((item) => ({ ...item, tableId: name, tableRef: ref }));
  return { id: name, ref, enabled: true, descriptions: {}, columns: ownedColumns };
}
|
||||
|
||||
/** Wraps the given tables in a `warehouse` schema with no declared relationships. */
function schema(tables: KtxEnrichedTable[]): KtxEnrichedSchema {
  const wrapped: KtxEnrichedSchema = {
    connectionId: 'warehouse',
    tables,
    relationships: [],
  };
  return wrapped;
}
|
||||
|
||||
describe('relationship profiling', () => {
|
||||
let executor: InMemorySqliteExecutor | null = null;
|
||||
|
||||
afterEach(() => {
|
||||
executor?.close();
|
||||
executor = null;
|
||||
});
|
||||
|
||||
it('keeps profiling on the batched table path', async () => {
|
||||
const source = await readFile(new URL('../../../src/context/scan/relationship-profiling.ts', import.meta.url), 'utf-8');
|
||||
|
||||
expect(source).not.toMatch(new RegExp('queryColumn' + 'Profile'));
|
||||
expect(source).not.toMatch(/for \(const column of table\.columns\)[\s\S]*executeReadOnly/);
|
||||
expect(source).toMatch(/queryTableProfile/);
|
||||
expect(source).toMatch(/UNION ALL/);
|
||||
});
|
||||
|
||||
it('profiles row count, null rate, uniqueness, sample values, and text lengths', async () => {
|
||||
executor = new InMemorySqliteExecutor();
|
||||
executor.db.exec(`
|
||||
CREATE TABLE accounts (id INTEGER, code TEXT, parent_id INTEGER);
|
||||
INSERT INTO accounts (id, code, parent_id) VALUES
|
||||
(1, 'A-1', NULL),
|
||||
(2, 'B-2', 1),
|
||||
(3, 'C-3', 1),
|
||||
(4, 'C-3', 2);
|
||||
`);
|
||||
|
||||
const result = await profileKtxRelationshipSchema({
|
||||
connectionId: 'warehouse',
|
||||
dialect: getDialectForDriver('sqlite'),
|
||||
schema: schema([
|
||||
table('accounts', [
|
||||
column('accounts', 'id', { primaryKey: false, nullable: false }),
|
||||
column('accounts', 'code', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
|
||||
column('accounts', 'parent_id'),
|
||||
]),
|
||||
]),
|
||||
executor,
|
||||
ctx: { runId: 'profile-test' },
|
||||
sampleValuesPerColumn: 3,
|
||||
});
|
||||
|
||||
expect(result.sqlAvailable).toBe(true);
|
||||
expect(result.queryCount).toBe(1);
|
||||
expect(executor.queryCount).toBe(1);
|
||||
expect(result.tables).toHaveLength(1);
|
||||
expect(result.tables[0]).toMatchObject({ table: { name: 'accounts' }, rowCount: 4 });
|
||||
expect(result.columns['accounts.id']).toMatchObject({
|
||||
table: { name: 'accounts' },
|
||||
column: 'id',
|
||||
rowCount: 4,
|
||||
nullCount: 0,
|
||||
distinctCount: 4,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
minTextLength: 1,
|
||||
maxTextLength: 1,
|
||||
});
|
||||
expect(result.columns['accounts.code']).toMatchObject({
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 0.75,
|
||||
sampleValues: ['C-3', 'A-1', 'B-2'],
|
||||
minTextLength: 3,
|
||||
maxTextLength: 3,
|
||||
});
|
||||
expect(result.columns['accounts.parent_id']).toMatchObject({
|
||||
nullCount: 1,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 0.5,
|
||||
nullRate: 0.25,
|
||||
});
|
||||
});
|
||||
|
||||
it('profiles each enabled table with one read-only SQL query', async () => {
  // Two tables with five columns in total: the query-count assertions below
  // only hold if profiling issues one statement per table, not per column.
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER, code TEXT, parent_id INTEGER);
    CREATE TABLE users (id INTEGER, account_id INTEGER);
    INSERT INTO accounts (id, code, parent_id) VALUES
    (1, 'A-1', NULL),
    (2, 'B-2', 1),
    (3, 'C-3', 1),
    (4, 'C-3', 2);
    INSERT INTO users (id, account_id) VALUES
    (10, 1),
    (11, 1),
    (12, 2);
  `);

  const accountsTable = table('accounts', [
    column('accounts', 'id', { nullable: false }),
    column('accounts', 'code', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
    column('accounts', 'parent_id'),
  ]);
  const usersTable = table('users', [column('users', 'id', { nullable: false }), column('users', 'account_id')]);

  const profiled = await profileKtxRelationshipSchema({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    schema: schema([accountsTable, usersTable]),
    executor: sqlite,
    ctx: { runId: 'profile-batched-query-count' },
    sampleValuesPerColumn: 3,
  });

  // Both the reported count and the executor's own counter must agree on two queries.
  expect(profiled.sqlAvailable).toBe(true);
  expect(profiled.queryCount).toBe(2);
  expect(sqlite.queryCount).toBe(2);

  expect(profiled.tables).toEqual([
    { table: { catalog: null, db: null, name: 'accounts' }, rowCount: 4 },
    { table: { catalog: null, db: null, name: 'users' }, rowCount: 3 },
  ]);

  // accounts.code holds 'C-3' twice: 3 distinct values over 4 rows.
  expect(profiled.columns['accounts.code']).toMatchObject({
    distinctCount: 3,
    uniquenessRatio: 0.75,
    sampleValues: ['C-3', 'A-1', 'B-2'],
  });
  // users.account_id holds 1 twice: 2 distinct values over 3 rows.
  expect(profiled.columns['users.account_id']).toMatchObject({
    rowCount: 3,
    nullCount: 0,
    distinctCount: 2,
    uniquenessRatio: 2 / 3,
  });
});
|
||||
|
||||
it('bounds column profile statistics with profileSampleRows', async () => {
  // This test owns a private database handle, so it must release it itself.
  // The try/finally guarantees the handle is closed even when an assertion
  // throws (previously the trailing close() was skipped on failure).
  const executor = new InMemorySqliteExecutor();
  try {
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL, account_code TEXT NOT NULL);
      INSERT INTO accounts VALUES (1, 'a1'), (2, 'a2'), (3, 'a3'), (4, 'a4');
    `);

    const profiles = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: schema([
        table('accounts', [
          column('accounts', 'id', { nullable: false }),
          column('accounts', 'account_code', {
            nativeType: 'TEXT',
            normalizedType: 'text',
            dimensionType: 'string',
            nullable: false,
          }),
        ]),
      ]),
      executor,
      ctx: { runId: 'profile-sample-rows' },
      profileSampleRows: 2,
    });

    expect(profiles.queryCount).toBe(1);
    expect(executor.queryCount).toBe(1);
    // The table-level row count still reports all four rows...
    expect(profiles.tables).toEqual([{ table: { catalog: null, db: null, name: 'accounts' }, rowCount: 4 }]);
    // ...while column statistics are computed over the two sampled rows only.
    expect(profiles.columns['accounts.id']).toMatchObject({
      rowCount: 2,
      distinctCount: 2,
      uniquenessRatio: 1,
    });
    expect(profiles.columns['accounts.account_code']?.sampleValues).toEqual(['a1', 'a2']);
  } finally {
    executor.close();
  }
});
|
||||
|
||||
it('reuses a profile cache inside one scan run but re-queries with a fresh cache', async () => {
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER NOT NULL, account_code TEXT NOT NULL);
    INSERT INTO accounts VALUES (1, 'a1'), (2, 'a2'), (3, 'a2');
  `);

  const relationshipSchema = schema([
    table('accounts', [
      column('accounts', 'id', { nullable: false }),
      column('accounts', 'account_code', {
        nativeType: 'TEXT',
        normalizedType: 'text',
        dimensionType: 'string',
        nullable: false,
      }),
    ]),
  ]);
  const sharedCache = createKtxRelationshipProfileCache();

  // Every profiling pass is identical except for the run id and the cache it is handed.
  const profileWith = (runId: string, cache: ReturnType<typeof createKtxRelationshipProfileCache>) =>
    profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      dialect: getDialectForDriver('sqlite'),
      schema: relationshipSchema,
      executor: sqlite,
      ctx: { runId },
      cache,
    });

  const first = await profileWith('profile-cache-run', sharedCache);
  const second = await profileWith('profile-cache-run', sharedCache);
  const third = await profileWith('profile-cache-fresh-run', createKtxRelationshipProfileCache());

  // Only the passes that start with an empty cache reach the database.
  expect(first.queryCount).toBe(1);
  expect(second.queryCount).toBe(0);
  expect(third.queryCount).toBe(1);
  expect(sqlite.queryCount).toBe(2);

  // The cached pass must reproduce the first pass's artifact exactly.
  expect(second.tables).toEqual(first.tables);
  expect(second.columns).toEqual(first.columns);
});
|
||||
|
||||
it('profiles the checked-in scale stress fixture with one query per table', async () => {
  // Convert the file URL with fileURLToPath rather than reading URL.pathname:
  // pathname stays percent-encoded (a checkout under "my repo/" would become
  // "my%20repo/") and is not a valid path on Windows. Imported locally so this
  // test does not depend on the file's top-level import list.
  const { fileURLToPath } = await import('node:url');
  const fixtureRoot = fileURLToPath(new URL('../../fixtures/relationship-benchmarks', import.meta.url));
  const fixture = await loadKtxRelationshipBenchmarkFixture(join(fixtureRoot, 'scale_stress_no_declared_constraints'));
  if (!fixture.dataPath) {
    throw new Error('scale_stress_no_declared_constraints is missing data.sqlite');
  }
  const maskedSnapshot = maskKtxRelationshipBenchmarkSnapshot(
    fixture.snapshot,
    'declared_pks_and_declared_fks_removed',
  );
  const scaleExecutor = new FileSqliteExecutor(fixture.dataPath);
  try {
    const result = await profileKtxRelationshipSchema({
      connectionId: fixture.snapshot.connectionId,
      dialect: getDialectForDriver(fixture.snapshot.driver),
      schema: snapshotToKtxEnrichedSchema(maskedSnapshot, new Map()),
      executor: scaleExecutor,
      ctx: { runId: 'scale-stress-profile-query-count' },
      profileSampleRows: 3,
    });

    // 400 tables in the fixture must translate into exactly 400 profile queries.
    expect(fixture.snapshot.tables).toHaveLength(400);
    expect(result.queryCount).toBe(400);
    expect(result.queryCount).toBeLessThanOrEqual(2 * fixture.snapshot.tables.length);
    expect(scaleExecutor.queryCount).toBe(400);
  } finally {
    // Always release the on-disk database handle, even when an assertion fails.
    scaleExecutor.close();
  }
});
|
||||
|
||||
it('profiles tables concurrently up to profileConcurrency', async () => {
  const profileHeaders = [
    'column_name',
    'table_row_count',
    'row_count',
    'null_count',
    'distinct_count',
    'min_text_length',
    'max_text_length',
    'sample_values',
  ];

  let running = 0;
  let peak = 0;
  const executor = {
    executeReadOnly: vi.fn(async (input: KtxReadOnlyQueryInput) => {
      running += 1;
      peak = Math.max(peak, running);
      // Hold the slot briefly so overlapping calls become observable in `peak`.
      await new Promise((resolve) => setTimeout(resolve, 10));
      running -= 1;

      const columnName = input.sql.includes('accounts') ? 'id' : 'account_id';
      return {
        headers: [...profileHeaders],
        rows: [[columnName, 2, 2, 0, 2, 1, 2, '1\u001f2']],
        totalRows: 1,
        rowCount: 1,
      };
    }),
  };

  await profileKtxRelationshipSchema({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    schema: schemaWithTables(['accounts', 'orders', 'payments', 'refunds']),
    executor,
    ctx: { runId: 'profile-concurrency' },
    profileConcurrency: 4,
  });

  // Four tables with a concurrency of four: all queries should have been in flight at once.
  expect(peak).toBe(4);
});
|
||||
|
||||
it('keeps profiling other tables when one table profile fails', async () => {
  // Canned single-row profile returned for every table except "orders".
  const cannedProfile = () => ({
    headers: [
      'column_name',
      'table_row_count',
      'row_count',
      'null_count',
      'distinct_count',
      'min_text_length',
      'max_text_length',
      'sample_values',
    ],
    rows: [['id', 2, 2, 0, 2, 1, 2, '1\u001f2']],
    totalRows: 1,
    rowCount: 1,
  });
  const executor = {
    executeReadOnly: vi.fn(async (input: KtxReadOnlyQueryInput) => {
      const targetsOrders = input.sql.includes('"orders"');
      if (!targetsOrders) {
        return cannedProfile();
      }
      throw new Error('orders unavailable');
    }),
  };

  const result = await profileKtxRelationshipSchema({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    schema: schemaWithTables(['accounts', 'orders']),
    executor,
    ctx: { runId: 'profile-error-isolated' },
    profileConcurrency: 2,
  });

  // The failure is reported as a warning while both tables remain in the
  // result and the healthy table still contributes its column profile.
  expect(result.warnings).toContain('profile_failed:orders:orders unavailable');
  expect(result.tables).toHaveLength(2);
  expect(Object.keys(result.columns)).toContain('accounts.id');
});
|
||||
});
|
||||
|
||||
/**
 * Builds a schema fixture holding one single-column table per given name.
 *
 * The table named "orders" gets a non-key `account_id` column; every other
 * table gets an `id` column flagged as primary key. All columns are non-nullable.
 *
 * @param names - Table names, in the order the tables should appear.
 * @returns The schema produced by the file's `schema` helper for those tables.
 */
function schemaWithTables(names: string[]): KtxEnrichedSchema {
  const tables = names.map((name) => {
    const isOrders = name === 'orders';
    const onlyColumn = column(name, isOrders ? 'account_id' : 'id', {
      nullable: false,
      primaryKey: !isOrders,
    });
    return table(name, [onlyColumn]);
  });
  return schema(tables);
}
|
||||
108
packages/cli/test/context/scan/relationship-scoring.test.ts
Normal file
108
packages/cli/test/context/scan/relationship-scoring.test.ts
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
calibrateWeightsFromSyntheticFixtures,
|
||||
defaultKtxRelationshipScoreWeights,
|
||||
normalizeKtxRelationshipScoreWeights,
|
||||
scoreKtxRelationshipCandidate,
|
||||
type KtxRelationshipSignalVector,
|
||||
} from '../../../src/context/scan/relationship-scoring.js';
|
||||
|
||||
/**
 * Builds a signal vector fixture from a fixed baseline.
 *
 * @param overrides - Signal values that replace the baseline entries.
 * @returns The baseline vector with `overrides` applied on top.
 */
function signals(overrides: Partial<KtxRelationshipSignalVector> = {}): KtxRelationshipSignalVector {
  const baseline: KtxRelationshipSignalVector = {
    nameSimilarity: 0.5,
    typeCompatibility: 1,
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: 0.5,
    profileNullRate: 0.5,
    structuralPrior: 0.5,
  };
  return { ...baseline, ...overrides };
}
|
||||
|
||||
describe('relationship scoring', () => {
  // Adds up the entries of a weight vector; used to check normalization to 1.
  const sumOf = (values: number[]): number => values.reduce((sum, value) => sum + value, 0);

  it('scores stronger evidence higher without hard-gating on names', () => {
    // Nearly no name match, but strong overlap and profile signals.
    const weakNameSignals = signals({
      nameSimilarity: 0.05,
      typeCompatibility: 1,
      valueOverlap: 0.7,
      profileUniqueness: 1,
      profileNullRate: 1,
      structuralPrior: 0.7,
    });
    // Near-perfect name match, but no overlap and weak profile signals.
    const strongNameSignals = signals({
      nameSimilarity: 0.95,
      typeCompatibility: 1,
      valueOverlap: 0,
      profileUniqueness: 0.3,
      profileNullRate: 0.4,
      structuralPrior: 0.5,
    });

    const weakNameStrongProfile = scoreKtxRelationshipCandidate(weakNameSignals);
    const strongNameWeakProfile = scoreKtxRelationshipCandidate(strongNameSignals);

    expect(weakNameStrongProfile.score).toBeGreaterThan(strongNameWeakProfile.score);
    expect(weakNameStrongProfile.contributions.profileUniqueness).toBeGreaterThan(0);
    expect(weakNameStrongProfile.contributions.nameSimilarity).toBeLessThan(0.02);
  });

  it('normalizes partial and invalid weights into a usable vector', () => {
    const normalized = normalizeKtxRelationshipScoreWeights({
      nameSimilarity: 3,
      typeCompatibility: -1,
      valueOverlap: Number.POSITIVE_INFINITY,
      profileUniqueness: 1,
    });

    expect(sumOf(Object.values(normalized))).toBeCloseTo(1, 6);
    expect(normalized.nameSimilarity).toBeGreaterThan(normalized.profileUniqueness);
    // Negative and non-finite inputs are expected to collapse to zero weight.
    expect(normalized.typeCompatibility).toBe(0);
    expect(normalized.valueOverlap).toBe(0);
  });

  it('returns deterministic defaults as a defensive copy', () => {
    const firstCall = defaultKtxRelationshipScoreWeights();
    const secondCall = defaultKtxRelationshipScoreWeights();

    // Equal contents, distinct object identities.
    expect(firstCall).toEqual(secondCall);
    expect(firstCall).not.toBe(secondCall);
    expect(sumOf(Object.values(firstCall))).toBeCloseTo(1, 6);
  });

  it('calibrates only from synthetic observations', () => {
    const publicObservation = {
      fixtureId: 'chinook_with_declared_metadata',
      origin: 'public',
      expectedRelationship: true,
      signals: signals({ nameSimilarity: 1 }),
    };

    expect(() => calibrateWeightsFromSyntheticFixtures([publicObservation])).toThrow(/synthetic/i);
  });

  it('calibrates deterministic weights from positive and negative synthetic observations', () => {
    const calibrated = calibrateWeightsFromSyntheticFixtures([
      {
        fixtureId: 'synthetic_positive',
        origin: 'synthetic',
        expectedRelationship: true,
        signals: signals({ nameSimilarity: 0.8, valueOverlap: 0.9, profileUniqueness: 1, profileNullRate: 1 }),
      },
      {
        fixtureId: 'synthetic_negative',
        origin: 'synthetic',
        expectedRelationship: false,
        signals: signals({ nameSimilarity: 0.2, valueOverlap: 0.1, profileUniqueness: 0.4, profileNullRate: 0.5 }),
      },
    ]);

    expect(sumOf(Object.values(calibrated))).toBeCloseTo(1, 6);
    expect(calibrated.valueOverlap).toBeGreaterThan(calibrated.structuralPrior);
    expect(calibrated.profileUniqueness).toBeGreaterThan(calibrated.embeddingSimilarity);
  });
});
|
||||
498
packages/cli/test/context/scan/relationship-validation.test.ts
Normal file
498
packages/cli/test/context/scan/relationship-validation.test.ts
Normal file
|
|
@ -0,0 +1,498 @@
|
|||
import Database from 'better-sqlite3';
|
||||
import { afterEach, describe, expect, it } from 'vitest';
|
||||
import { getDialectForDriver } from '../../../src/context/connections/dialects.js';
|
||||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from '../../../src/context/scan/enrichment-types.js';
|
||||
import { generateKtxRelationshipDiscoveryCandidates } from '../../../src/context/scan/relationship-candidates.js';
|
||||
import type { KtxRelationshipProfileArtifact } from '../../../src/context/scan/relationship-profiling.js';
|
||||
import { profileKtxRelationshipSchema } from '../../../src/context/scan/relationship-profiling.js';
|
||||
import { validateKtxRelationshipDiscoveryCandidates } from '../../../src/context/scan/relationship-validation.js';
|
||||
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanContext } from '../../../src/context/scan/types.js';
|
||||
|
||||
/**
 * Test executor that runs read-only queries against a private in-memory
 * better-sqlite3 database and counts how many queries it has served.
 */
class InMemorySqliteExecutor {
  /** Backing database; tests seed it directly through `db.exec`. */
  readonly db = new Database(':memory:');
  /** Number of `executeReadOnly` calls so far; tests may reset it. */
  queryCount = 0;

  /**
   * Runs `input.sql` and converts the result rows into header/row arrays.
   *
   * Headers are taken from the keys of the first row, so a query that
   * returns no rows yields an empty header list.
   */
  executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    this.queryCount += 1;
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    const [firstRecord = {}] = records;
    const headers = Object.keys(firstRecord);
    const total = records.length;
    const result = {
      headers,
      rows: records.map((record) => headers.map((header) => record[header])),
      totalRows: total,
      rowCount: total,
    };
    return Promise.resolve(result);
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Builds an enriched-column fixture.
 *
 * Defaults describe a nullable, non-key INTEGER column on a table referenced
 * by bare name (no catalog or db). Any key present in `overrides` replaces
 * the corresponding default.
 *
 * @param tableId - Owning table id; also used as the default table ref name.
 * @param name - Column name; the column id is `${tableId}.${name}`.
 * @param overrides - Field values that take precedence over the defaults.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  const defaults: KtxEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  // Spreading overrides last lets every supplied key win over its default.
  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Builds an enabled enriched-table fixture referenced by bare name.
 *
 * Each supplied column is copied and re-pointed at this table, so its
 * `tableId` and `tableRef` always match the table regardless of how the
 * column was created.
 *
 * @param name - Table name, also used as the table id.
 * @param columns - Columns to attach; the inputs are not mutated.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns = columns.map((item) => ({ ...item, tableId: name, tableRef: ref }));
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/**
 * Builds an enriched-schema fixture for the 'warehouse' connection with no
 * declared relationships.
 *
 * @param tables - Tables to include. When omitted, a default accounts /
 *   users / invoices layout is used, where users and invoices each carry a
 *   non-nullable `account_id` column.
 */
function schema(tables?: KtxEnrichedTable[]): KtxEnrichedSchema {
  // Built lazily so the default layout is only constructed when it is needed.
  const buildDefaultTables = (): KtxEnrichedTable[] => [
    table('accounts', [
      column('accounts', 'id', { nullable: false }),
      column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
    ]),
    table('users', [column('users', 'id', { nullable: false }), column('users', 'account_id', { nullable: false })]),
    table('invoices', [
      column('invoices', 'id', { nullable: false }),
      column('invoices', 'account_id', { nullable: false }),
    ]),
  ];

  return {
    connectionId: 'warehouse',
    tables: tables ?? buildDefaultTables(),
    relationships: [],
  };
}
|
||||
|
||||
describe('relationship validation', () => {
|
||||
let executor: InMemorySqliteExecutor | null = null;
|
||||
|
||||
afterEach(() => {
|
||||
executor?.close();
|
||||
executor = null;
|
||||
});
|
||||
|
||||
it('accepts a relationship-discovery candidate with unique parent values and full source coverage', async () => {
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  // Every users.account_id value exists in accounts.id. invoices carries an
  // orphan (999), but invoices candidates are filtered out below.
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER, name TEXT);
    CREATE TABLE users (id INTEGER, account_id INTEGER);
    CREATE TABLE invoices (id INTEGER, account_id INTEGER);
    INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex'), (3, 'Initech');
    INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2), (12, 3);
    INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 2), (22, 999);
  `);

  const testSchema = schema();
  const profiles = await profileKtxRelationshipSchema({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'validate-test' },
  });
  const usersCandidates = generateKtxRelationshipDiscoveryCandidates(testSchema).filter(
    ({ from }) => from.table.name === 'users',
  );

  const validated = await validateKtxRelationshipDiscoveryCandidates({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    candidates: usersCandidates,
    profiles,
    executor: sqlite,
    ctx: { runId: 'validate-test' },
    tableCount: testSchema.tables.length,
  });

  expect(validated).toHaveLength(1);
  const [accepted] = validated;
  expect(accepted).toMatchObject({
    from: { table: { name: 'users' }, columns: ['account_id'] },
    to: { table: { name: 'accounts' }, columns: ['id'] },
    status: 'accepted',
    score: expect.any(Number),
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationCount: 0,
      violationRatio: 0,
      reasons: expect.arrayContaining(['validation_passed']),
    },
  });
  expect(accepted?.score).toBeGreaterThanOrEqual(0.85);
});
|
||||
|
||||
it('rejects a candidate with missing parent values and records the deterministic reason', async () => {
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  // Only one of the three invoices.account_id values (1) exists in
  // accounts.id; 999 and 1000 have no parent row.
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER, name TEXT);
    CREATE TABLE users (id INTEGER, account_id INTEGER);
    CREATE TABLE invoices (id INTEGER, account_id INTEGER);
    INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
    INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2);
    INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 999), (22, 1000);
  `);

  const testSchema = schema();
  const profiles = await profileKtxRelationshipSchema({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'validate-test' },
  });
  const invoiceCandidates = generateKtxRelationshipDiscoveryCandidates(testSchema).filter(
    ({ from }) => from.table.name === 'invoices',
  );

  const validated = await validateKtxRelationshipDiscoveryCandidates({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    candidates: invoiceCandidates,
    profiles,
    executor: sqlite,
    ctx: { runId: 'validate-test' },
    tableCount: testSchema.tables.length,
    settings: {
      minSourceCoverage: 0.9,
      maxViolationRatio: 0.01,
    },
  });

  expect(validated).toHaveLength(1);
  const [rejected] = validated;
  // One matching value out of three: coverage 1/3, two violations (2/3).
  expect(rejected).toMatchObject({
    from: { table: { name: 'invoices' }, columns: ['account_id'] },
    to: { table: { name: 'accounts' }, columns: ['id'] },
    status: 'rejected',
    validation: {
      sourceCoverage: 1 / 3,
      violationCount: 2,
      violationRatio: 2 / 3,
      reasons: expect.arrayContaining(['low_source_coverage', 'excessive_violations']),
    },
  });
});
|
||||
|
||||
it('keeps over-budget candidates review-only without executing coverage SQL for them', async () => {
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER, name TEXT);
    CREATE TABLE users (id INTEGER, account_id INTEGER);
    CREATE TABLE invoices (id INTEGER, account_id INTEGER);
    INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex'), (3, 'Initech');
    INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2), (12, 3);
    INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 2), (22, 3);
  `);

  const testSchema = schema();
  const profiles = await profileKtxRelationshipSchema({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'validate-budget-profile' },
  });
  // Discard the profiling queries so only validation queries are counted below.
  sqlite.queryCount = 0;

  // Give the users candidate the higher confidence so it is the one the
  // single-slot budget is spent on.
  const rankedCandidates = generateKtxRelationshipDiscoveryCandidates(testSchema).map((candidate) => {
    const confidence = candidate.from.table.name === 'users' ? 0.99 : 0.5;
    return { ...candidate, confidence };
  });

  const validated = await validateKtxRelationshipDiscoveryCandidates({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    candidates: rankedCandidates,
    profiles,
    executor: sqlite,
    ctx: { runId: 'validate-budget' },
    tableCount: testSchema.tables.length,
    settings: {
      validationBudget: 1,
    },
  });
  const resultFrom = (tableName: string) => validated.find((candidate) => candidate.from.table.name === tableName);

  expect(sqlite.queryCount).toBe(1);
  expect(validated).toHaveLength(2);
  expect(resultFrom('users')).toMatchObject({
    status: 'accepted',
    validation: { reasons: expect.arrayContaining(['validation_passed']) },
  });
  expect(resultFrom('invoices')).toMatchObject({
    status: 'review',
    validation: {
      reasons: ['validation_unattempted'],
    },
  });
});
|
||||
|
||||
it('treats validation budget zero as review-only validation without coverage SQL', async () => {
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER, name TEXT);
    CREATE TABLE users (id INTEGER, account_id INTEGER);
    INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
    INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2);
  `);

  const accountsTable = table('accounts', [
    column('accounts', 'id', { nullable: false }),
    column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  const usersTable = table('users', [
    column('users', 'id', { nullable: false }),
    column('users', 'account_id', { nullable: false }),
  ]);
  const testSchema = schema([accountsTable, usersTable]);

  const profiles = await profileKtxRelationshipSchema({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'validate-zero-budget-profile' },
  });
  // Discard the profiling queries so only validation queries are counted below.
  sqlite.queryCount = 0;
  const candidates = generateKtxRelationshipDiscoveryCandidates(testSchema);

  const validated = await validateKtxRelationshipDiscoveryCandidates({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    candidates,
    profiles,
    executor: sqlite,
    ctx: { runId: 'validate-zero-budget' },
    tableCount: testSchema.tables.length,
    settings: {
      validationBudget: 0,
    },
  });

  // With no budget, no SQL may run and the candidate stays in review.
  expect(sqlite.queryCount).toBe(0);
  expect(validated).toHaveLength(1);
  const [unattempted] = validated;
  expect(unattempted).toMatchObject({
    status: 'review',
    score: expect.any(Number),
    validation: {
      checkedValues: 0,
      reasons: ['validation_unattempted'],
    },
  });
});
|
||||
|
||||
it('marks rejected LLM proposals with the spec rejection reason', async () => {
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  // orders.buyer_ref (98, 99) shares no value with customers.id (1, 2).
  sqlite.db.exec(`
    CREATE TABLE customers (id INTEGER);
    CREATE TABLE orders (buyer_ref INTEGER);
    INSERT INTO customers (id) VALUES (1), (2);
    INSERT INTO orders (buyer_ref) VALUES (98), (99);
  `);

  const testSchema = schema([
    table('customers', [column('customers', 'id', { nullable: false })]),
    table('orders', [column('orders', 'buyer_ref')]),
  ]);
  const profiles = await profileKtxRelationshipSchema({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'llm-rejected-validation' },
  });

  // Generate a base candidate from a schema variant that uses `customer_id`,
  // then re-point its source at `buyer_ref` to mimic an LLM proposal.
  const namedSchema = schema([
    table('customers', [column('customers', 'id', { nullable: false })]),
    table('orders', [column('orders', 'customer_id')]),
  ]);
  const [baseCandidate] = generateKtxRelationshipDiscoveryCandidates(namedSchema);
  if (!baseCandidate) {
    throw new Error('Expected base candidate');
  }
  const llmCandidate = {
    ...baseCandidate,
    id: 'orders:(orders.buyer_ref)->customers:(customers.id)',
    from: { ...baseCandidate.from, columnIds: ['orders.buyer_ref'], columns: ['buyer_ref'] },
    source: 'llm_proposal' as const,
    evidence: {
      ...baseCandidate.evidence,
      reasons: ['llm_proposal'],
      llmConfidence: 0.84,
      llmRationale: 'Buyer references should map to customers.',
    },
  };

  const [validated] = await validateKtxRelationshipDiscoveryCandidates({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    candidates: [llmCandidate],
    profiles,
    executor: sqlite,
    ctx: { runId: 'llm-rejected-validation' },
    tableCount: testSchema.tables.length,
  });

  expect(validated?.status).toBe('rejected');
  expect(validated?.validation.reasons).toEqual(
    expect.arrayContaining(['low_source_coverage', 'llm_proposed_but_validation_failed']),
  );
});
|
||||
|
||||
it('limits validation query concurrency', async () => {
  // Register the database on the suite-level `executor` binding so the
  // `afterEach` hook closes it even when an assertion below throws. The
  // previous version shadowed that binding with a local and only closed the
  // handle on the success path.
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER NOT NULL);
    CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
    CREATE TABLE invoices (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
    INSERT INTO accounts VALUES (1), (2);
    INSERT INTO orders VALUES (10, 1), (11, 2);
    INSERT INTO invoices VALUES (20, 1), (21, 2);
  `);

  let active = 0;
  let maxActive = 0;
  // Wraps the real executor and records how many calls overlap in time.
  const throttled = {
    executeReadOnly: async (input: KtxReadOnlyQueryInput, ctx: KtxScanContext) => {
      active += 1;
      maxActive = Math.max(maxActive, active);
      try {
        // Delay coverage queries so any overlapping call would be observed.
        await new Promise((resolve) => setTimeout(resolve, input.sql.includes('WITH child_values') ? 10 : 0));
        return await sqlite.executeReadOnly(input, ctx);
      } finally {
        // Release the slot even if the query throws, keeping the counter accurate.
        active -= 1;
      }
    },
  };

  const testSchema = schema([
    table('accounts', [column('accounts', 'id', { nullable: false })]),
    table('orders', [column('orders', 'id', { nullable: false }), column('orders', 'account_id')]),
    table('invoices', [column('invoices', 'id', { nullable: false }), column('invoices', 'account_id')]),
  ]);
  // Profiling goes through the unthrottled executor; only validation is measured.
  const profiles = await profileKtxRelationshipSchema({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'validation-concurrency-profile' },
  });
  const candidates = generateKtxRelationshipDiscoveryCandidates(testSchema);

  await validateKtxRelationshipDiscoveryCandidates({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    candidates,
    profiles,
    executor: throttled,
    ctx: { runId: 'validation-concurrency' },
    tableCount: testSchema.tables.length,
    settings: { concurrency: 1 },
  });

  expect(maxActive).toBe(1);
});
|
||||
|
||||
it('pins column_suffix_match validation scoring for plan-code suffix candidates', async () => {
  // Hand-built candidate: mart_account_segments.current_plan_code -> stg_plans.plan_code.
  const sourceEndpoint = {
    tableId: 'mart-account-segments-id',
    columnIds: ['current-plan-code-col'],
    table: { catalog: null, db: null, name: 'mart_account_segments' },
    columns: ['current_plan_code'],
  };
  const targetEndpoint = {
    tableId: 'plans-id',
    columnIds: ['plan-code-col'],
    table: { catalog: null, db: null, name: 'stg_plans' },
    columns: ['plan_code'],
  };
  const candidate = {
    id: 'mart:(current_plan_code)->plans:(plan_code)',
    from: sourceEndpoint,
    to: targetEndpoint,
    relationshipType: 'many_to_one' as const,
    confidence: 0.902,
    source: 'column_suffix_match' as const,
    status: 'review' as const,
    evidence: {
      sourceColumnBase: 'current_plan',
      targetTableBase: 'plan',
      targetColumnBase: 'plan_code',
      targetKeyScore: 0.86,
      nameScore: 0.78,
      reasons: ['column_suffix_match', 'profile_unique_target'],
    },
  };

  // Stub that answers every query with a full-coverage result:
  // 4 child values, 4 parent values, 4 overlapping, 0 violations.
  const cannedCoverageExecutor = {
    executeReadOnly: async () => ({
      headers: ['child_distinct', 'parent_distinct', 'overlap', 'violation_count'],
      rows: [[4, 4, 4, 0]],
      rowCount: 1,
      totalRows: 1,
    }),
  };

  // Pre-computed profile artifact: both columns are fully unique, never null,
  // and share the same four sample values.
  const profiles = {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [],
    warnings: [],
    columns: {
      'mart_account_segments.current_plan_code': {
        table: { catalog: null, db: null, name: 'mart_account_segments' },
        column: 'current_plan_code',
        nativeType: 'TEXT',
        normalizedType: 'text',
        rowCount: 4,
        nullCount: 0,
        distinctCount: 4,
        uniquenessRatio: 1,
        nullRate: 0,
        sampleValues: ['basic', 'enterprise', 'free', 'pro'],
        minTextLength: 4,
        maxTextLength: 10,
      },
      'stg_plans.plan_code': {
        table: { catalog: null, db: null, name: 'stg_plans' },
        column: 'plan_code',
        nativeType: 'TEXT',
        normalizedType: 'text',
        rowCount: 4,
        nullCount: 0,
        distinctCount: 4,
        uniquenessRatio: 1,
        nullRate: 0,
        sampleValues: ['basic', 'enterprise', 'free', 'pro'],
        minTextLength: 4,
        maxTextLength: 10,
      },
    },
  } satisfies KtxRelationshipProfileArtifact;

  const [validated] = await validateKtxRelationshipDiscoveryCandidates({
    connectionId: 'warehouse',
    dialect: getDialectForDriver('sqlite'),
    candidates: [candidate],
    profiles,
    executor: cannedCoverageExecutor,
    ctx: { runId: 'rule-b-validation-score' },
    tableCount: 2,
  });

  // The exact score is asserted on purpose: this test pins the scoring output.
  expect(validated).toMatchObject({
    status: 'accepted',
    score: 0.98,
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationRatio: 0,
      reasons: ['validation_passed'],
    },
  });
});
|
||||
});
|
||||
67
packages/cli/test/context/scan/table-ref.test.ts
Normal file
67
packages/cli/test/context/scan/table-ref.test.ts
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
scopedTableNames,
|
||||
tableRefFromKey,
|
||||
tableRefKey,
|
||||
tableRefSet,
|
||||
type KtxTableRefKey,
|
||||
} from '../../../src/context/scan/table-ref.js';
|
||||
|
||||
// Round-trip checks: a ref encoded with tableRefKey and decoded with
// tableRefFromKey must compare equal to the ref that went in.
describe('tableRefKey roundtrip', () => {
  it('encodes and decodes a three-part ref', () => {
    const original = { catalog: 'ANALYTICS', db: 'MARTS', name: 'LISTINGS' };
    const key = tableRefKey(original);
    const decoded = tableRefFromKey(key);
    expect(decoded).toEqual(original);
  });

  it('treats null catalog/db as the empty segment', () => {
    // Only the catalog is null here; db and name are populated.
    const original = { catalog: null, db: 'public', name: 'users' };
    const key = tableRefKey(original);
    const decoded = tableRefFromKey(key);
    expect(decoded).toEqual(original);
  });

  it('roundtrips a bare-name ref', () => {
    // Both namespace segments are null; only the table name is set.
    const original = { catalog: null, db: null, name: 'orders' };
    const key = tableRefKey(original);
    const decoded = tableRefFromKey(key);
    expect(decoded).toEqual(original);
  });
});
|
||||
|
||||
// tableRefSet membership is probed through keys produced by tableRefKey.
describe('tableRefSet', () => {
  it('produces a set with member-equality on canonical keys', () => {
    const listings = { catalog: 'ANALYTICS', db: 'MARTS', name: 'LISTINGS' };
    const items = { catalog: 'ANALYTICS', db: 'MARTS', name: 'ITEMS' };
    const neverAdded = { catalog: 'ANALYTICS', db: 'MARTS', name: 'OTHER' };

    const scope = tableRefSet([listings, items]);
    expect(scope.size).toBe(2);

    // Probe with a fresh, structurally equal object so the lookup cannot
    // succeed through object identity.
    const presentKey = tableRefKey({ ...listings });
    expect(scope.has(presentKey)).toBe(true);

    const absentKey = tableRefKey(neverAdded);
    expect(scope.has(absentKey)).toBe(false);
  });
});
|
||||
|
||||
// scopedTableNames(scope, namespace) is expected to return the table names
// in `scope` whose (catalog, db) pair matches `namespace`.
describe('scopedTableNames', () => {
  it('projects to the requested (catalog, db) namespace', () => {
    const marts = { catalog: 'ANALYTICS', db: 'MARTS' };
    const staging = { catalog: 'ANALYTICS', db: 'STAGING' };
    const scope = tableRefSet([
      { ...marts, name: 'LISTINGS' },
      { ...marts, name: 'ITEMS' },
      { ...staging, name: 'LISTINGS' },
    ]);

    // Sorted before comparing, so the order of the two names is not asserted.
    const martsNames = scopedTableNames(scope, marts);
    expect(martsNames.sort()).toEqual(['ITEMS', 'LISTINGS']);

    const stagingNames = scopedTableNames(scope, staging);
    expect(stagingNames).toEqual(['LISTINGS']);
  });

  it('requires non-null scope segments to match the namespace', () => {
    // The scope entry has a null catalog; a lookup with a concrete catalog
    // must not match it.
    const scope = tableRefSet([{ catalog: null, db: 'public', name: 'users' }]);
    const names = scopedTableNames(scope, { catalog: 'any-catalog', db: 'public' });
    expect(names).toEqual([]);
  });

  it('returns empty when no scope entry matches the namespace', () => {
    const scope = tableRefSet([{ catalog: 'A', db: 'B', name: 'C' }]);
    const names = scopedTableNames(scope, { catalog: 'X', db: 'Y' });
    expect(names).toEqual([]);
  });

  it('dedupes exact namespace matches only', () => {
    // Same db and name under a null catalog and under catalog 'A'; only one
    // 'users' entry is expected for the ('A', 'public') namespace.
    const nullCatalogUsers = { catalog: null, db: 'public', name: 'users' };
    const catalogAUsers = { catalog: 'A', db: 'public', name: 'users' };
    const scope: ReadonlySet<KtxTableRefKey> = tableRefSet([nullCatalogUsers, catalogAUsers]);

    const names = scopedTableNames(scope, { catalog: 'A', db: 'public' });
    expect(names).toEqual(['users']);
  });
});
|
||||
24
packages/cli/test/context/scan/type-normalization.test.ts
Normal file
24
packages/cli/test/context/scan/type-normalization.test.ts
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { inferKtxDimensionType, ktxColumnTypeMappingFromNative, normalizeKtxNativeType } from '../../../src/context/scan/type-normalization.js';
|
||||
|
||||
// Table-driven checks for the native-type normalization helpers.
describe('KTX scan type normalization', () => {
  it('normalizes native database type strings', () => {
    // [raw native type, expected normalized type]
    const cases: [string, string][] = [
      [' NUMERIC(12, 2) ', 'numeric'],
      ['TIMESTAMP WITH TIME ZONE', 'timestamp with time zone'],
      ['', 'unknown'],
    ];
    for (const [nativeType, normalized] of cases) {
      expect(normalizeKtxNativeType(nativeType)).toBe(normalized);
    }
  });

  it('infers dimension types from native types', () => {
    // [native type, expected dimension type]
    const cases: [string, string][] = [
      ['BOOLEAN', 'boolean'],
      ['timestamp with time zone', 'time'],
      ['decimal(10,2)', 'number'],
      ['varchar(255)', 'string'],
    ];
    for (const [nativeType, dimensionType] of cases) {
      expect(inferKtxDimensionType(nativeType)).toBe(dimensionType);
    }
  });

  it('builds a complete column type mapping', () => {
    const mapping = ktxColumnTypeMappingFromNative('BIGINT');
    expect(mapping).toEqual({
      normalizedType: 'bigint',
      dimensionType: 'number',
    });
  });
});
|
||||
262
packages/cli/test/context/scan/types.test.ts
Normal file
262
packages/cli/test/context/scan/types.test.ts
Normal file
|
|
@ -0,0 +1,262 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
createKtxConnectorCapabilities,
|
||||
type KtxEventPropertyDiscovery,
|
||||
type KtxEventPropertyDiscoveryInput,
|
||||
type KtxEventPropertyValuesInput,
|
||||
type KtxEventPropertyValuesResult,
|
||||
type KtxEventStreamDiscoveryPort,
|
||||
type KtxEventTypeDiscovery,
|
||||
type KtxEventTypeDiscoveryInput,
|
||||
type KtxNetworkEndpoint,
|
||||
type KtxNetworkTunnelPort,
|
||||
type KtxQueryResult,
|
||||
type KtxScanConnector,
|
||||
type KtxScanContext,
|
||||
type KtxScanInput,
|
||||
type KtxSchemaSnapshot,
|
||||
} from '../../../src/context/scan/types.js';
|
||||
|
||||
// Contract tests for the scan types module: capability defaults, plus
// shape checks that exercise the connector, event-stream discovery,
// query-result, and network-tunnel types through in-memory stubs.
describe('KTX scan contract types', () => {
  // Flags expected from createKtxConnectorCapabilities() with no overrides.
  const STRUCTURAL_ONLY = {
    structuralIntrospection: true,
    tableSampling: false,
    columnSampling: false,
    columnStats: false,
    readOnlySql: false,
    nestedAnalysis: false,
    eventStreamDiscovery: false,
    formalForeignKeys: false,
    estimatedRowCounts: false,
  };

  it('defaults to structural-only connector capabilities', () => {
    const capabilities = createKtxConnectorCapabilities();
    expect(capabilities).toEqual(STRUCTURAL_ONLY);
  });

  it('keeps structural introspection mandatory when optional capabilities are enabled', () => {
    const enabled = {
      tableSampling: true,
      readOnlySql: true,
      eventStreamDiscovery: true,
      estimatedRowCounts: true,
    };
    const capabilities = createKtxConnectorCapabilities(enabled);
    // Everything not explicitly enabled keeps its baseline value.
    expect(capabilities).toEqual({ ...STRUCTURAL_ONLY, ...enabled });
  });

  it('describes the connector surface without requiring enrichment methods', async () => {
    const snapshot: KtxSchemaSnapshot = {
      connectionId: 'warehouse',
      driver: 'postgres',
      extractedAt: '2026-04-29T00:00:00.000Z',
      scope: { schemas: ['public'] },
      metadata: { source: 'unit-test' },
      tables: [
        {
          catalog: null,
          db: 'public',
          name: 'orders',
          kind: 'table',
          comment: 'Customer orders',
          estimatedRows: 42,
          columns: [
            {
              name: 'id',
              nativeType: 'integer',
              normalizedType: 'integer',
              dimensionType: 'number',
              nullable: false,
              primaryKey: true,
              comment: 'Primary key',
            },
          ],
          foreignKeys: [],
        },
      ],
    };

    // The connector defines only introspect/listSchemas/listTables; no
    // sampling, stats, or event-stream members are supplied.
    const connector: KtxScanConnector = {
      id: 'test-postgres',
      driver: 'postgres',
      capabilities: createKtxConnectorCapabilities({ estimatedRowCounts: true }),
      introspect: async (input: KtxScanInput, ctx: KtxScanContext) => {
        expect(input.connectionId).toBe('warehouse');
        expect(ctx.runId).toBe('scan-run-1');
        return snapshot;
      },
      async listSchemas() {
        return [];
      },
      async listTables() {
        return [];
      },
    };

    const scanInput: KtxScanInput = {
      connectionId: 'warehouse',
      driver: 'postgres',
      scope: { schemas: ['public'] },
      mode: 'structural',
    };
    const scanContext: KtxScanContext = { runId: 'scan-run-1' };

    const introspection = connector.introspect(scanInput, scanContext);
    await expect(introspection).resolves.toEqual(snapshot);
  });

  it('models optional event-stream discovery as a connector capability and port', async () => {
    // Inputs each port method is expected to receive. The calls below pass
    // shallow copies, so the equality checks inside the stubs compare
    // distinct objects.
    const eventTypesInput: KtxEventTypeDiscoveryInput = {
      connectionId: 'product',
      table: { catalog: '157881', db: null, name: 'events' },
      eventColumn: 'event',
      limit: 2,
      minCount: 30,
      lookbackDays: 14,
    };
    const propertyKeysInput: KtxEventPropertyDiscoveryInput = {
      connectionId: 'product',
      table: { catalog: '157881', db: null, name: 'events' },
      jsonColumn: 'properties',
      sampleSize: 1000,
      limit: 5,
      lookbackDays: 7,
    };
    const propertyValuesInput: KtxEventPropertyValuesInput = {
      connectionId: 'product',
      table: { catalog: '157881', db: null, name: 'events' },
      jsonColumn: 'properties',
      propertyKey: '$browser',
      limit: 3,
      maxCardinality: 1000,
      lookbackDays: 30,
    };

    // Canned results returned by the stub port.
    const eventTypes: KtxEventTypeDiscovery[] = [{ value: '$pageview', count: 42 }];
    const propertyKeys: KtxEventPropertyDiscovery[] = [{ key: '$browser', count: 31 }];
    const propertyValues: KtxEventPropertyValuesResult = { values: ['Chrome', 'Safari'], cardinality: 2 };

    const discovery: KtxEventStreamDiscoveryPort = {
      listEventTypes: async (input: KtxEventTypeDiscoveryInput) => {
        expect(input).toEqual(eventTypesInput);
        return eventTypes;
      },
      listPropertyKeys: async (input: KtxEventPropertyDiscoveryInput) => {
        expect(input).toEqual(propertyKeysInput);
        return propertyKeys;
      },
      listPropertyValues: async (input: KtxEventPropertyValuesInput) => {
        expect(input).toEqual(propertyValuesInput);
        return propertyValues;
      },
    };

    const connector: KtxScanConnector = {
      id: 'clickhouse:product',
      driver: 'clickhouse',
      capabilities: createKtxConnectorCapabilities({ eventStreamDiscovery: true }),
      eventStreamDiscovery: discovery,
      introspect: async () => ({
        connectionId: 'product',
        driver: 'clickhouse',
        extractedAt: '2026-04-29T00:00:00.000Z',
        scope: { catalogs: ['157881'] },
        metadata: {},
        tables: [],
      }),
      async listSchemas() {
        return [];
      },
      async listTables() {
        return [];
      },
    };

    const scanContext = { runId: 'scan-run-1' };

    // The port is optional on the connector type, hence the `?.` on each call.
    const listedTypes = connector.eventStreamDiscovery?.listEventTypes({ ...eventTypesInput }, scanContext);
    await expect(listedTypes).resolves.toEqual([{ value: '$pageview', count: 42 }]);

    const listedKeys = connector.eventStreamDiscovery?.listPropertyKeys({ ...propertyKeysInput }, scanContext);
    await expect(listedKeys).resolves.toEqual([{ key: '$browser', count: 31 }]);

    const listedValues = connector.eventStreamDiscovery?.listPropertyValues({ ...propertyValuesInput }, scanContext);
    await expect(listedValues).resolves.toEqual({ values: ['Chrome', 'Safari'], cardinality: 2 });
  });

  it('keeps read-only query results separate from schema snapshots', () => {
    // The annotation is the real check: this literal must type-check as a
    // KtxQueryResult.
    const queryResult: KtxQueryResult = {
      headers: ['id', 'amount'],
      headerTypes: ['integer', 'numeric'],
      rows: [[1, 10.5]],
      totalRows: 1,
      rowCount: 1,
    };

    expect(queryResult).toEqual({
      headers: ['id', 'amount'],
      headerTypes: ['integer', 'numeric'],
      rows: [[1, 10.5]],
      totalRows: 1,
      rowCount: 1,
    });
  });

  it('models host-provided network tunnel endpoint resolution without app imports', async () => {
    // Connection shape used to parameterize the tunnel port in this test.
    type ProxyConnection = { networkProxy?: { type: 'ssh_tunnel' } };

    const endpoint: KtxNetworkEndpoint = {
      host: '127.0.0.1',
      port: 15432,
      close: async () => undefined,
    };

    const tunnelPort: KtxNetworkTunnelPort<ProxyConnection> = {
      resolveEndpoint: async (input) => {
        expect(input).toEqual({
          connectionId: 'warehouse',
          driver: 'postgres',
          host: 'db.internal',
          port: 5432,
          connection: { networkProxy: { type: 'ssh_tunnel' } },
        });
        return endpoint;
      },
    };

    const resolution = tunnelPort.resolveEndpoint({
      connectionId: 'warehouse',
      driver: 'postgres',
      host: 'db.internal',
      port: 5432,
      connection: { networkProxy: { type: 'ssh_tunnel' } },
    });
    // toBe: the stub must hand back the very same endpoint object.
    await expect(resolution).resolves.toBe(endpoint);
  });
});
|
||||
216
packages/cli/test/context/scan/warehouse-catalog.test.ts
Normal file
216
packages/cli/test/context/scan/warehouse-catalog.test.ts
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js';
|
||||
import { WarehouseCatalogService } from '../../../src/context/scan/warehouse-catalog.js';
|
||||
|
||||
// Tests for WarehouseCatalogService against a project file store seeded with
// a live-database scan (connection.json, one table file, and a relationship
// profile). Each test runs in a fresh temp directory.
describe('WarehouseCatalogService', () => {
  let scratchDir: string;
  let project: KtxLocalProject;

  beforeEach(async () => {
    scratchDir = await mkdtemp(join(tmpdir(), 'ktx-warehouse-catalog-'));
    const projectDir = join(scratchDir, 'project');
    project = await initKtxProject({ projectDir });
  });

  afterEach(async () => {
    await rm(scratchDir, { recursive: true, force: true });
  });

  // Builds the service under test on top of the current test's file store.
  const newCatalog = () => new WarehouseCatalogService({ fileStore: project.fileStore });

  /**
   * Seeds one scan for `connectionId` under
   * `raw-sources/<connectionId>/live-database/<syncId>`: connection metadata,
   * a single `orders` table, and a relationship profile for `orders.status`.
   *
   * The table's catalog is 'analytics' only for the 'bigquery' driver, and its
   * db is null only for the 'sqlite' driver; otherwise catalog is null and db
   * is 'public'.
   */
  async function seedLiveDatabaseScan(connectionId = 'warehouse', syncId = 'sync-2', driver = 'postgres') {
    const root = `raw-sources/${connectionId}/live-database/${syncId}`;
    const tableRef = {
      catalog: driver === 'bigquery' ? 'analytics' : null,
      db: driver === 'sqlite' ? null : 'public',
      name: 'orders',
    };

    // Writes `payload` as pretty-printed JSON below the scan root.
    // NOTE(review): the fixed 'ktx' / 'ktx@example.com' arguments and `message`
    // are presumably author name, author email, and commit message — confirm
    // against the file store API.
    const writeJson = (relativePath: string, payload: unknown, message: string) =>
      project.fileStore.writeFile(
        `${root}/${relativePath}`,
        JSON.stringify(payload, null, 2),
        'ktx',
        'ktx@example.com',
        message,
      );

    // Writes stay sequential, one awaited call after another.
    await writeJson(
      'connection.json',
      { connectionId, driver, extractedAt: '2026-05-12T00:00:00.000Z' },
      'seed connection',
    );

    await writeJson(
      'tables/orders.json',
      {
        ...tableRef,
        kind: 'table',
        comment: 'Customer orders',
        estimatedRows: 12,
        columns: [
          {
            name: 'id',
            nativeType: 'integer',
            normalizedType: 'integer',
            dimensionType: 'number',
            nullable: false,
            primaryKey: true,
            comment: 'Order id',
          },
          {
            name: 'status',
            nativeType: 'text',
            normalizedType: 'text',
            dimensionType: 'string',
            nullable: false,
            primaryKey: false,
            comment: 'Order status',
          },
        ],
        foreignKeys: [],
      },
      'seed orders',
    );

    await writeJson(
      'enrichment/relationship-profile.json',
      {
        connectionId,
        driver,
        sqlAvailable: true,
        queryCount: 3,
        tables: [{ table: { ...tableRef }, rowCount: 12 }],
        columns: {
          'orders.status': {
            table: { ...tableRef },
            column: 'status',
            nativeType: 'text',
            normalizedType: 'text',
            rowCount: 12,
            nullCount: 0,
            distinctCount: 2,
            uniquenessRatio: 0.1667,
            nullRate: 0,
            sampleValues: ['paid', 'refunded'],
            minTextLength: 4,
            maxTextLength: 8,
          },
        },
        warnings: [],
      },
      'seed profile',
    );
  }

  it('finds the latest sync and merges table schema with relationship profile values', async () => {
    await seedLiveDatabaseScan('warehouse', 'sync-1');
    await seedLiveDatabaseScan('warehouse', 'sync-2');
    const service = newCatalog();

    // NOTE(review): the key is assembled at runtime, presumably so the joined
    // identifier never appears literally in source — confirm the reason.
    const absentKey = ['connection', 'Name'].join('');

    const latestSyncId = service.getLatestSyncId('warehouse');
    await expect(latestSyncId).resolves.toBe('sync-2');

    const detail = await service.getTable({
      connectionId: 'warehouse',
      catalog: null,
      db: 'public',
      name: 'orders',
    });

    // `status` carries sampleValues/distinctCount from the seeded profile;
    // `id` has no profile entry in the seed.
    expect(detail).toMatchObject({
      connectionId: 'warehouse',
      display: 'public.orders',
      rowCount: 12,
      columns: [
        { name: 'id', nativeType: 'integer', primaryKey: true },
        { name: 'status', nativeType: 'text', sampleValues: ['paid', 'refunded'], distinctCount: 2 },
      ],
    });
    expect(detail).not.toHaveProperty(absentKey);

    const hits = await service.searchByName('warehouse', 'orders', 5);
    const [topHit] = hits;
    expect(topHit).toMatchObject({
      kind: 'table',
      connectionId: 'warehouse',
      display: 'public.orders',
    });
    expect(topHit).not.toHaveProperty(absentKey);
  });

  it('returns scanAvailable=false when no live-database scan exists', async () => {
    // Nothing is seeded for the 'missing' connection.
    const service = newCatalog();

    const table = service.getTable({ connectionId: 'missing', catalog: null, db: 'public', name: 'orders' });
    await expect(table).resolves.toBeNull();

    const hasScan = service.hasScan('missing');
    await expect(hasScan).resolves.toBe(false);
  });

  it('resolves postgres display strings and returns closest candidates for missing tables', async () => {
    await seedLiveDatabaseScan();
    const service = newCatalog();

    const exact = service.resolveDisplay('warehouse', 'public.orders');
    await expect(exact).resolves.toMatchObject({
      resolved: { catalog: null, db: 'public', name: 'orders' },
      candidates: [],
      dialect: 'postgres',
    });

    // 'orderz' does not exist; the seeded 'orders' table is expected as a candidate.
    const misspelled = service.resolveDisplay('warehouse', 'public.orderz');
    await expect(misspelled).resolves.toMatchObject({
      resolved: null,
      candidates: [{ name: 'orders' }],
    });
  });

  it('keeps one-part table display fallback for loose catalog resolution', async () => {
    await seedLiveDatabaseScan();
    const service = newCatalog();

    const bareName = service.resolveDisplay('warehouse', 'orders');
    await expect(bareName).resolves.toMatchObject({
      resolved: { catalog: null, db: 'public', name: 'orders' },
      candidates: [],
      dialect: 'postgres',
    });
  });

  it('treats two-part BigQuery identifiers as ambiguous instead of guessing', async () => {
    await seedLiveDatabaseScan('warehouse', 'sync-bigquery', 'bigquery');
    const service = newCatalog();

    const twoPart = service.resolveDisplay('warehouse', 'public.orders');
    await expect(twoPart).resolves.toMatchObject({
      resolved: null,
      dialect: 'bigquery',
    });
  });

  it('resolves postgres column display strings without treating the column as a table', async () => {
    await seedLiveDatabaseScan();
    const service = newCatalog();

    const target = service.resolveDisplayTarget('warehouse', 'public.orders.status');
    await expect(target).resolves.toMatchObject({
      resolved: { catalog: null, db: 'public', name: 'orders', column: 'status' },
      candidates: [],
      dialect: 'postgres',
    });
  });

  it('resolves BigQuery column display strings with four parts', async () => {
    await seedLiveDatabaseScan('warehouse', 'sync-bigquery', 'bigquery');
    const service = newCatalog();

    const target = service.resolveDisplayTarget('warehouse', 'analytics.public.orders.status');
    await expect(target).resolves.toMatchObject({
      resolved: { catalog: 'analytics', db: 'public', name: 'orders', column: 'status' },
      candidates: [],
      dialect: 'bigquery',
    });
  });

  it('searches table names, column names, comments, and descriptions', async () => {
    await seedLiveDatabaseScan();
    const service = newCatalog();

    // At least one hit must be the `status` column of public.orders, matched by name.
    const statusColumnHit = expect.objectContaining({
      kind: 'column',
      ref: expect.objectContaining({ db: 'public', name: 'orders', column: 'status' }),
      matchedOn: 'name',
    });

    const hits = service.searchByName('warehouse', 'status', 10);
    await expect(hits).resolves.toEqual(expect.arrayContaining([statusColumnHit]));
  });
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue